xref: /trunk/main/sal/inc/rtl/tencinfo.h (revision 86e1cf34)
1514f4c20SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3514f4c20SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4514f4c20SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5514f4c20SAndrew Rist  * distributed with this work for additional information
6514f4c20SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7514f4c20SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8514f4c20SAndrew Rist  * "License"); you may not use this file except in compliance
9514f4c20SAndrew Rist  * with the License.  You may obtain a copy of the License at
10514f4c20SAndrew Rist  *
11514f4c20SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12514f4c20SAndrew Rist  *
13514f4c20SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14514f4c20SAndrew Rist  * software distributed under the License is distributed on an
15514f4c20SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16514f4c20SAndrew Rist  * KIND, either express or implied.  See the License for the
17514f4c20SAndrew Rist  * specific language governing permissions and limitations
18514f4c20SAndrew Rist  * under the License.
19514f4c20SAndrew Rist  *
20514f4c20SAndrew Rist  *************************************************************/
21514f4c20SAndrew Rist 
22514f4c20SAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir #ifndef _RTL_TENCINFO_H
25cdf0e10cSrcweir #define _RTL_TENCINFO_H
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #ifndef _SAL_TYPES_H
28cdf0e10cSrcweir #include <sal/types.h>
29cdf0e10cSrcweir #endif
30cdf0e10cSrcweir #include <rtl/textenc.h>
31cdf0e10cSrcweir 
32cdf0e10cSrcweir #ifdef __cplusplus
33cdf0e10cSrcweir extern "C" {
34cdf0e10cSrcweir #endif
35cdf0e10cSrcweir 
36cdf0e10cSrcweir /* Flag bits for rtl_TextEncodingInfo.Flags (any combination may be set); see that member below for what each one means. */
37cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_CONTEXT   ((sal_uInt32)0x00000001)
38cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_ASCII     ((sal_uInt32)0x00000002)
39cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_UNICODE   ((sal_uInt32)0x00000004)
40cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_MULTIBYTE ((sal_uInt32)0x00000008)
41cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_R2L       ((sal_uInt32)0x00000010)
42cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_7BIT      ((sal_uInt32)0x00000020)
43cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_SYMBOL    ((sal_uInt32)0x00000040)
44cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_MIME      ((sal_uInt32)0x00000080)
45cdf0e10cSrcweir 
46cdf0e10cSrcweir /** Information about a text encoding, as reported through rtl_getTextEncodingInfo.
47cdf0e10cSrcweir  */
48cdf0e10cSrcweir typedef struct _rtl_TextEncodingInfo
49cdf0e10cSrcweir {
50cdf0e10cSrcweir     /** The size (in bytes) of this structure.  Should be 12.  The caller must set it before passing the structure to rtl_getTextEncodingInfo.
51cdf0e10cSrcweir      */
52cdf0e10cSrcweir     sal_uInt32          StructSize;
53cdf0e10cSrcweir 
54cdf0e10cSrcweir     /** The minimum number of bytes needed to encode any character in the
55cdf0e10cSrcweir         given encoding.
56cdf0e10cSrcweir 
57cdf0e10cSrcweir         Can be rather meaningless for encodings that encode global state along
58cdf0e10cSrcweir         with the characters (e.g., ISO-2022 encodings).
59cdf0e10cSrcweir      */
60cdf0e10cSrcweir     sal_uInt8           MinimumCharSize;
61cdf0e10cSrcweir 
62cdf0e10cSrcweir     /** The maximum number of bytes needed to encode any character in the
63cdf0e10cSrcweir         given encoding.
64cdf0e10cSrcweir 
65cdf0e10cSrcweir         Can be rather meaningless for encodings that encode global state along
66cdf0e10cSrcweir         with the characters (e.g., ISO-2022 encodings).
67cdf0e10cSrcweir      */
68cdf0e10cSrcweir     sal_uInt8           MaximumCharSize;
69cdf0e10cSrcweir 
70cdf0e10cSrcweir     /** The average number of bytes needed to encode a character in the given
71cdf0e10cSrcweir         encoding.
72cdf0e10cSrcweir      */
73cdf0e10cSrcweir     sal_uInt8           AverageCharSize;
74cdf0e10cSrcweir 
75cdf0e10cSrcweir     /** An unused byte, for padding.
76cdf0e10cSrcweir      */
77cdf0e10cSrcweir     sal_uInt8           Reserved;
78cdf0e10cSrcweir 
79cdf0e10cSrcweir     /** Any combination of the RTL_TEXTENCODING_INFO flags.
80cdf0e10cSrcweir 
81cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_CONTEXT:  The encoding uses some mechanism (like
82cdf0e10cSrcweir         state-changing byte sequences) to switch between different modes (e.g.,
83cdf0e10cSrcweir         to encode multiple character repertoires within the same byte ranges).
84cdf0e10cSrcweir 
85cdf0e10cSrcweir         Even if an encoding does not have the CONTEXT property, interpretation
86cdf0e10cSrcweir         of certain byte values within that encoding can depend on context (e.g.,
87cdf0e10cSrcweir         a certain byte value could be either a single-byte character or a
88cdf0e10cSrcweir         subsequent byte of a multi-byte character).  Likewise, the single shift
89cdf0e10cSrcweir         characters (SS2 and SS3) used by some of the EUC encodings (to denote
90cdf0e10cSrcweir         that the following bytes constitute a character from another character
91cdf0e10cSrcweir         repertoire) do not imply that encodings making use of these characters
92cdf0e10cSrcweir         have the CONTEXT property.  Examples of encodings that do have the
93cdf0e10cSrcweir         CONTEXT property are the ISO-2022 encodings and UTF-7.
94cdf0e10cSrcweir 
95cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_ASCII:  The encoding is a superset of ASCII.  More
96cdf0e10cSrcweir         specifically, any appearance of a byte in the range 0x20--7F denotes the
97cdf0e10cSrcweir         corresponding ASCII character (from SPACE to DELETE); in particular,
98cdf0e10cSrcweir         such a byte cannot be part of a multi-byte character.  Note that the
99cdf0e10cSrcweir         ASCII control codes 0x00--1F are not included here, as they are used for
100cdf0e10cSrcweir         special purposes in some encodings.
101cdf0e10cSrcweir 
102*86e1cf34SPedro Giffuni         If an encoding has this property, it is easy to search for occurrences of
103cdf0e10cSrcweir         ASCII characters within strings of this encoding---you do not need to
104cdf0e10cSrcweir         keep track of whether a byte in the range 0x20--7F really represents an
105cdf0e10cSrcweir         ASCII character or rather is part of some multi-byte character.
106cdf0e10cSrcweir 
107cdf0e10cSrcweir         The guarantees when mapping between Unicode and a given encoding with
108cdf0e10cSrcweir         the ASCII property are as follows:  When mapping from Unicode to the
109cdf0e10cSrcweir         given encoding, U+0020--007F map to 0x20--7F (but there can also be
110cdf0e10cSrcweir         other Unicode characters mapping into the range 0x20--7F), and when
111cdf0e10cSrcweir         mapping from the given encoding to Unicode, 0x20--7F map to U+0020--007F
112cdf0e10cSrcweir         (again, there can also be other characters mapping into the range
113cdf0e10cSrcweir         U+0020--007F).  In particular, this ensures round-trip conversion for
114cdf0e10cSrcweir         the ASCII range.
115cdf0e10cSrcweir 
116cdf0e10cSrcweir         In principle, the ASCII property is orthogonal to the CONTEXT property.
117cdf0e10cSrcweir         In practice, however, an encoding that has the ASCII property will most
118cdf0e10cSrcweir         likely not also have the CONTEXT property.
119cdf0e10cSrcweir 
120cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_UNICODE:  The encoding is based on the Unicode
121cdf0e10cSrcweir         character repertoire.
122cdf0e10cSrcweir 
123cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_MULTIBYTE:  A multi-byte encoding.
124cdf0e10cSrcweir 
125cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_R2L:  An encoding used mainly or exclusively for
126cdf0e10cSrcweir         languages written from right to left.
127cdf0e10cSrcweir 
128cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_7BIT:  A 7-bit instead of an 8-bit encoding.
129cdf0e10cSrcweir 
130cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_SYMBOL:  A (generic) encoding for symbol character
131cdf0e10cSrcweir         sets.
132cdf0e10cSrcweir 
133cdf0e10cSrcweir         RTL_TEXTENCODING_INFO_MIME:  The encoding is registered as a MIME
134cdf0e10cSrcweir         charset.
135cdf0e10cSrcweir      */
136cdf0e10cSrcweir     sal_uInt32          Flags;
137cdf0e10cSrcweir } rtl_TextEncodingInfo;
138cdf0e10cSrcweir 
139cdf0e10cSrcweir /** Determine whether a text encoding uses single octets as basic units of
140cdf0e10cSrcweir     information (and can thus be used with the conversion routines in
141cdf0e10cSrcweir     rtl/textcvt.h).
142cdf0e10cSrcweir 
143cdf0e10cSrcweir     @param nEncoding
144cdf0e10cSrcweir     The text encoding to test; any rtl_TextEncoding value may be passed.
145cdf0e10cSrcweir 
146cdf0e10cSrcweir     @return
147cdf0e10cSrcweir     True if the given encoding uses single octets as basic units of
148cdf0e10cSrcweir     information (i.e., it is an octet text encoding), false otherwise.
149cdf0e10cSrcweir  */
150cdf0e10cSrcweir sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding);
151cdf0e10cSrcweir 
152cdf0e10cSrcweir /** Return information about a text encoding in a caller-supplied structure.
153cdf0e10cSrcweir 
154cdf0e10cSrcweir     @param eTextEncoding
155cdf0e10cSrcweir     The text encoding to describe; any rtl_TextEncoding value.
156cdf0e10cSrcweir 
157cdf0e10cSrcweir     @param pEncInfo
158cdf0e10cSrcweir     Receives the information about the given encoding.  Must not be null, and the
159cdf0e10cSrcweir     StructSize member must be set correctly by the caller (see rtl_TextEncodingInfo).
160cdf0e10cSrcweir 
161cdf0e10cSrcweir     @return
162cdf0e10cSrcweir     True if information about the given encoding is available, false
163cdf0e10cSrcweir     otherwise.
164cdf0e10cSrcweir  */
165cdf0e10cSrcweir sal_Bool SAL_CALL rtl_getTextEncodingInfo( rtl_TextEncoding eTextEncoding, rtl_TextEncodingInfo* pEncInfo );
166cdf0e10cSrcweir 
167cdf0e10cSrcweir /** Map from a numeric Windows charset to a text encoding (reverse direction: rtl_getBestWindowsCharsetFromTextEncoding).
168cdf0e10cSrcweir 
169cdf0e10cSrcweir     @param nWinCharset
170cdf0e10cSrcweir     Any numeric Windows charset, passed as a single byte value.
171cdf0e10cSrcweir 
172cdf0e10cSrcweir     @return
173cdf0e10cSrcweir     The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if
174cdf0e10cSrcweir     no mapping is applicable.
175cdf0e10cSrcweir  */
176cdf0e10cSrcweir rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromWindowsCharset( sal_uInt8 nWinCharset );
177cdf0e10cSrcweir 
178cdf0e10cSrcweir /** Map from a MIME charset name to a text encoding (reverse direction: rtl_getMimeCharsetFromTextEncoding and rtl_getBestMimeCharsetFromTextEncoding).
179cdf0e10cSrcweir 
180cdf0e10cSrcweir     @param pMimeCharset
181cdf0e10cSrcweir     Any MIME charset string.  Must not be null.
182cdf0e10cSrcweir 
183cdf0e10cSrcweir     @return
184cdf0e10cSrcweir     The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if
185cdf0e10cSrcweir     no mapping is applicable.
186cdf0e10cSrcweir  */
187cdf0e10cSrcweir rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMimeCharset( const sal_Char* pMimeCharset );
188cdf0e10cSrcweir 
189cdf0e10cSrcweir /** Map from a Unix charset name to a text encoding (reverse direction: rtl_getBestUnixCharsetFromTextEncoding).
190cdf0e10cSrcweir 
191cdf0e10cSrcweir     @param pUnixCharset
192cdf0e10cSrcweir     Any Unix charset string.  Must not be null.
193cdf0e10cSrcweir 
194cdf0e10cSrcweir     @return
195cdf0e10cSrcweir     The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if
196cdf0e10cSrcweir     no mapping is applicable.
197cdf0e10cSrcweir  */
198cdf0e10cSrcweir rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromUnixCharset( const sal_Char* pUnixCharset );
199cdf0e10cSrcweir 
200cdf0e10cSrcweir /** Map from a text encoding to the best matching numeric Windows charset.
201cdf0e10cSrcweir 
202cdf0e10cSrcweir     @param eTextEncoding
203cdf0e10cSrcweir     Any rtl_TextEncoding value.
204cdf0e10cSrcweir 
205cdf0e10cSrcweir     @return
206cdf0e10cSrcweir     The best matching numeric Windows charset, or 1 if none matches.  NOTE(review): 1 is presumably the Windows DEFAULT_CHARSET value -- confirm.
207cdf0e10cSrcweir  */
208cdf0e10cSrcweir sal_uInt8       SAL_CALL rtl_getBestWindowsCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding );
209cdf0e10cSrcweir 
210cdf0e10cSrcweir /** Map from a text encoding to a corresponding MIME charset name, if
211cdf0e10cSrcweir     available (see <http://www.iana.org/assignments/character-sets>).
212cdf0e10cSrcweir 
213cdf0e10cSrcweir     @param nEncoding
214cdf0e10cSrcweir     Any rtl_TextEncoding value.
215cdf0e10cSrcweir 
216cdf0e10cSrcweir     @return
217cdf0e10cSrcweir     The (preferred) MIME charset name corresponding to the given encoding, or
218cdf0e10cSrcweir     NULL if none is available.  NOTE(review): ownership of the returned string is not stated here; presumably the caller must not free it -- confirm.
219cdf0e10cSrcweir  */
220cdf0e10cSrcweir char const * SAL_CALL rtl_getMimeCharsetFromTextEncoding(rtl_TextEncoding
221cdf0e10cSrcweir                                                              nEncoding);
222cdf0e10cSrcweir 
223cdf0e10cSrcweir /** Map from a text encoding to the best matching MIME charset.  NOTE(review): how this differs from rtl_getMimeCharsetFromTextEncoding is not documented here -- confirm before choosing between them.
224cdf0e10cSrcweir 
225cdf0e10cSrcweir     @param eTextEncoding
226cdf0e10cSrcweir     Any rtl_TextEncoding value.
227cdf0e10cSrcweir 
228cdf0e10cSrcweir     @return
229cdf0e10cSrcweir     The best matching MIME charset string, or null if none matches.
230cdf0e10cSrcweir  */
231cdf0e10cSrcweir const sal_Char* SAL_CALL rtl_getBestMimeCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding );
232cdf0e10cSrcweir 
233cdf0e10cSrcweir /** Map from a text encoding to the best matching Unix charset (reverse direction: rtl_getTextEncodingFromUnixCharset).
234cdf0e10cSrcweir 
235cdf0e10cSrcweir     @param eTextEncoding
236cdf0e10cSrcweir     Any rtl_TextEncoding value.
237cdf0e10cSrcweir 
238cdf0e10cSrcweir     @return
239cdf0e10cSrcweir     The best matching Unix charset string, or null if none matches.  NOTE(review): ownership of the returned string is not stated here; presumably the caller must not free it -- confirm.
240cdf0e10cSrcweir  */
241cdf0e10cSrcweir const sal_Char* SAL_CALL rtl_getBestUnixCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding  );
242cdf0e10cSrcweir 
243cdf0e10cSrcweir /** Map from a Windows code page to a text encoding (reverse direction: rtl_getWindowsCodePageFromTextEncoding).
244cdf0e10cSrcweir 
245cdf0e10cSrcweir     @param nCodePage
246cdf0e10cSrcweir     Any Windows code page number.
247cdf0e10cSrcweir 
248cdf0e10cSrcweir     @return
249cdf0e10cSrcweir     The corresponding rtl_TextEncoding value (which will be an octet text
250cdf0e10cSrcweir     encoding, see rtl_isOctetTextEncoding), or RTL_TEXTENCODING_DONTKNOW if no
251cdf0e10cSrcweir     mapping is applicable.
252cdf0e10cSrcweir  */
253cdf0e10cSrcweir rtl_TextEncoding SAL_CALL
254cdf0e10cSrcweir rtl_getTextEncodingFromWindowsCodePage(sal_uInt32 nCodePage);
255cdf0e10cSrcweir 
256cdf0e10cSrcweir /** Map from a text encoding to a Windows code page (reverse direction: rtl_getTextEncodingFromWindowsCodePage).
257cdf0e10cSrcweir 
258cdf0e10cSrcweir     @param nEncoding
259cdf0e10cSrcweir     Any rtl_TextEncoding value.
260cdf0e10cSrcweir 
261cdf0e10cSrcweir     @return
262cdf0e10cSrcweir     The corresponding Windows code page number, or 0 if no mapping is
263cdf0e10cSrcweir     applicable.
264cdf0e10cSrcweir  */
265cdf0e10cSrcweir sal_uInt32 SAL_CALL
266cdf0e10cSrcweir rtl_getWindowsCodePageFromTextEncoding(rtl_TextEncoding nEncoding);
267cdf0e10cSrcweir 
268cdf0e10cSrcweir #ifdef __cplusplus
269cdf0e10cSrcweir }
270cdf0e10cSrcweir #endif
271cdf0e10cSrcweir 
272cdf0e10cSrcweir #endif /* _RTL_TENCINFO_H */
273