1*514f4c20SAndrew Rist /************************************************************** 2cdf0e10cSrcweir * 3*514f4c20SAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one 4*514f4c20SAndrew Rist * or more contributor license agreements. See the NOTICE file 5*514f4c20SAndrew Rist * distributed with this work for additional information 6*514f4c20SAndrew Rist * regarding copyright ownership. The ASF licenses this file 7*514f4c20SAndrew Rist * to you under the Apache License, Version 2.0 (the 8*514f4c20SAndrew Rist * "License"); you may not use this file except in compliance 9*514f4c20SAndrew Rist * with the License. You may obtain a copy of the License at 10*514f4c20SAndrew Rist * 11*514f4c20SAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0 12*514f4c20SAndrew Rist * 13*514f4c20SAndrew Rist * Unless required by applicable law or agreed to in writing, 14*514f4c20SAndrew Rist * software distributed under the License is distributed on an 15*514f4c20SAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 16*514f4c20SAndrew Rist * KIND, either express or implied. See the License for the 17*514f4c20SAndrew Rist * specific language governing permissions and limitations 18*514f4c20SAndrew Rist * under the License. 19*514f4c20SAndrew Rist * 20*514f4c20SAndrew Rist *************************************************************/ 21*514f4c20SAndrew Rist 22*514f4c20SAndrew Rist 23cdf0e10cSrcweir 24cdf0e10cSrcweir #ifndef _RTL_TENCINFO_H 25cdf0e10cSrcweir #define _RTL_TENCINFO_H 26cdf0e10cSrcweir 27cdf0e10cSrcweir #ifndef _SAL_TYPES_H 28cdf0e10cSrcweir #include <sal/types.h> 29cdf0e10cSrcweir #endif 30cdf0e10cSrcweir #include <rtl/textenc.h> 31cdf0e10cSrcweir 32cdf0e10cSrcweir #ifdef __cplusplus 33cdf0e10cSrcweir extern "C" { 34cdf0e10cSrcweir #endif 35cdf0e10cSrcweir 36cdf0e10cSrcweir // See rtl_TextEncodingInfo.Flags below for documentation on these values: 37cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_CONTEXT ((sal_uInt32)0x00000001) 38cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_ASCII ((sal_uInt32)0x00000002) 39cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_UNICODE ((sal_uInt32)0x00000004) 40cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_MULTIBYTE ((sal_uInt32)0x00000008) 41cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_R2L ((sal_uInt32)0x00000010) 42cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_7BIT ((sal_uInt32)0x00000020) 43cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_SYMBOL ((sal_uInt32)0x00000040) 44cdf0e10cSrcweir #define RTL_TEXTENCODING_INFO_MIME ((sal_uInt32)0x00000080) 45cdf0e10cSrcweir 46cdf0e10cSrcweir /** Information about a text encoding. 47cdf0e10cSrcweir */ 48cdf0e10cSrcweir typedef struct _rtl_TextEncodingInfo 49cdf0e10cSrcweir { 50cdf0e10cSrcweir /** The size (in bytes) of this structure. Should be 12. 51cdf0e10cSrcweir */ 52cdf0e10cSrcweir sal_uInt32 StructSize; 53cdf0e10cSrcweir 54cdf0e10cSrcweir /** The minimum number of bytes needed to encode any character in the 55cdf0e10cSrcweir given encoding. 56cdf0e10cSrcweir 57cdf0e10cSrcweir Can be rather meaningless for encodings that encode global state along 58cdf0e10cSrcweir with the characters (e.g., ISO-2022 encodings). 59cdf0e10cSrcweir */ 60cdf0e10cSrcweir sal_uInt8 MinimumCharSize; 61cdf0e10cSrcweir 62cdf0e10cSrcweir /** The maximum number of bytes needed to encode any character in the 63cdf0e10cSrcweir given encoding. 64cdf0e10cSrcweir 65cdf0e10cSrcweir Can be rather meaningless for encodings that encode global state along 66cdf0e10cSrcweir with the characters (e.g., ISO-2022 encodings). 67cdf0e10cSrcweir */ 68cdf0e10cSrcweir sal_uInt8 MaximumCharSize; 69cdf0e10cSrcweir 70cdf0e10cSrcweir /** The average number of bytes needed to encode a character in the given 71cdf0e10cSrcweir encoding. 72cdf0e10cSrcweir */ 73cdf0e10cSrcweir sal_uInt8 AverageCharSize; 74cdf0e10cSrcweir 75cdf0e10cSrcweir /** An unused byte, for padding. 76cdf0e10cSrcweir */ 77cdf0e10cSrcweir sal_uInt8 Reserved; 78cdf0e10cSrcweir 79cdf0e10cSrcweir /** Any combination of the RTL_TEXTENCODING_INFO flags. 80cdf0e10cSrcweir 81cdf0e10cSrcweir RTL_TEXTENCODING_INFO_CONTEXT: The encoding uses some mechanism (like 82cdf0e10cSrcweir state-changing byte sequences) to switch between different modes (e.g., 83cdf0e10cSrcweir to encode multiple character repertoires within the same byte ranges). 84cdf0e10cSrcweir 85cdf0e10cSrcweir Even if an encoding does not have the CONTEXT property, interpretation 86cdf0e10cSrcweir of certain byte values within that encoding can depend on context (e.g., 87cdf0e10cSrcweir a certain byte value could be either a single-byte character or a 88cdf0e10cSrcweir subsequent byte of a multi-byte character). Likewise, the single shift 89cdf0e10cSrcweir characters (SS2 and SS3) used by some of the EUC encodings (to denote 90cdf0e10cSrcweir that the following bytes constitute a character from another character 91cdf0e10cSrcweir repertoire) do not imply that encodings making use of these characters 92cdf0e10cSrcweir have the CONTEXT property. Examples of encodings that do have the 93cdf0e10cSrcweir CONTEXT property are the ISO-2022 encodings and UTF-7. 94cdf0e10cSrcweir 95cdf0e10cSrcweir RTL_TEXTENCODING_INFO_ASCII: The encoding is a superset of ASCII. More 96cdf0e10cSrcweir specifically, any appearance of a byte in the range 0x20--7F denotes the 97cdf0e10cSrcweir corresponding ASCII character (from SPACE to DELETE); in particular, 98cdf0e10cSrcweir such a byte cannot be part of a multi-byte character. Note that the 99cdf0e10cSrcweir ASCII control codes 0x00--1F are not included here, as they are used for 100cdf0e10cSrcweir special purposes in some encodings. 101cdf0e10cSrcweir 102cdf0e10cSrcweir If an encoding has this property, it is easy to search for occurences of 103cdf0e10cSrcweir ASCII characters within strings of this encoding---you do not need to 104cdf0e10cSrcweir keep track whether a byte in the range 0x20--7F really represents an 105cdf0e10cSrcweir ASCII character or rather is part of some multi-byte character. 106cdf0e10cSrcweir 107cdf0e10cSrcweir The guarantees when mapping between Unicode and a given encoding with 108cdf0e10cSrcweir the ASCII property are as follows: When mapping from Unicode to the 109cdf0e10cSrcweir given encoding, U+0020--007F map to 0x20--7F (but there can also be 110cdf0e10cSrcweir other Unicode characters mapping into the range 0x20--7F), and when 111cdf0e10cSrcweir mapping from the given encoding to Unicode, 0x20--7F map to U+0020--007F 112cdf0e10cSrcweir (again, there can also be other characters mapping into the range 113cdf0e10cSrcweir U+0020--007F). In particular, this ensures round-trip conversion for 114cdf0e10cSrcweir the ASCII range. 115cdf0e10cSrcweir 116cdf0e10cSrcweir In principle, the ASCII property is orthogonal to the CONTEXT property. 117cdf0e10cSrcweir In practice, however, an encoding that has the ASCII property will most 118cdf0e10cSrcweir likely not also have the CONTEXT property. 119cdf0e10cSrcweir 120cdf0e10cSrcweir RTL_TEXTENCODING_INFO_UNICODE: The encoding is based on the Unicode 121cdf0e10cSrcweir character repertoire. 122cdf0e10cSrcweir 123cdf0e10cSrcweir RTL_TEXTENCODING_INFO_MULTIBYTE: A multi-byte encoding. 124cdf0e10cSrcweir 125cdf0e10cSrcweir RTL_TEXTENCODING_INFO_R2L: An encoding used mainly or exclusively for 126cdf0e10cSrcweir languages written from right to left. 127cdf0e10cSrcweir 128cdf0e10cSrcweir RTL_TEXTENCODING_INFO_7BIT: A 7-bit instead of an 8-bit encoding. 129cdf0e10cSrcweir 130cdf0e10cSrcweir RTL_TEXTENCODING_INFO_SYMBOL: A (generic) encoding for symbol character 131cdf0e10cSrcweir sets. 132cdf0e10cSrcweir 133cdf0e10cSrcweir RTL_TEXTENCODING_INFO_MIME: The encoding is registered as a MIME 134cdf0e10cSrcweir charset. 135cdf0e10cSrcweir */ 136cdf0e10cSrcweir sal_uInt32 Flags; 137cdf0e10cSrcweir } rtl_TextEncodingInfo; 138cdf0e10cSrcweir 139cdf0e10cSrcweir /** Determine whether a text encoding uses single octets as basic units of 140cdf0e10cSrcweir information (and can thus be used with the conversion routines in 141cdf0e10cSrcweir rtl/textcvt.h). 142cdf0e10cSrcweir 143cdf0e10cSrcweir @param nEncoding 144cdf0e10cSrcweir Any rtl_TextEncoding value. 145cdf0e10cSrcweir 146cdf0e10cSrcweir @return 147cdf0e10cSrcweir True if the given encoding uses single octets as basic units of 148cdf0e10cSrcweir information, false otherwise. 149cdf0e10cSrcweir */ 150cdf0e10cSrcweir sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding); 151cdf0e10cSrcweir 152cdf0e10cSrcweir /** Return information about a text encoding. 153cdf0e10cSrcweir 154cdf0e10cSrcweir @param eTextEncoding 155cdf0e10cSrcweir Any rtl_TextEncoding value. 156cdf0e10cSrcweir 157cdf0e10cSrcweir @param pEncInfo 158cdf0e10cSrcweir Returns information about the given encoding. Must not be null, and the 159cdf0e10cSrcweir StructSize member must be set correctly. 160cdf0e10cSrcweir 161cdf0e10cSrcweir @return 162cdf0e10cSrcweir True if information about the given encoding is available, false 163cdf0e10cSrcweir otherwise. 164cdf0e10cSrcweir */ 165cdf0e10cSrcweir sal_Bool SAL_CALL rtl_getTextEncodingInfo( rtl_TextEncoding eTextEncoding, rtl_TextEncodingInfo* pEncInfo ); 166cdf0e10cSrcweir 167cdf0e10cSrcweir /** Map from a numeric Windows charset to a text encoding. 168cdf0e10cSrcweir 169cdf0e10cSrcweir @param nWinCharset 170cdf0e10cSrcweir Any numeric Windows charset. 171cdf0e10cSrcweir 172cdf0e10cSrcweir @return 173cdf0e10cSrcweir The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if 174cdf0e10cSrcweir no mapping is applicable. 175cdf0e10cSrcweir */ 176cdf0e10cSrcweir rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromWindowsCharset( sal_uInt8 nWinCharset ); 177cdf0e10cSrcweir 178cdf0e10cSrcweir /** Map from a MIME charset to a text encoding. 179cdf0e10cSrcweir 180cdf0e10cSrcweir @param pMimeCharset 181cdf0e10cSrcweir Any MIME charset string. Must not be null. 182cdf0e10cSrcweir 183cdf0e10cSrcweir @return 184cdf0e10cSrcweir The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if 185cdf0e10cSrcweir no mapping is applicable. 186cdf0e10cSrcweir */ 187cdf0e10cSrcweir rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMimeCharset( const sal_Char* pMimeCharset ); 188cdf0e10cSrcweir 189cdf0e10cSrcweir /** Map from a Unix charset to a text encoding. 190cdf0e10cSrcweir 191cdf0e10cSrcweir @param pMimeCharset 192cdf0e10cSrcweir Any Unix charset string. Must not be null. 193cdf0e10cSrcweir 194cdf0e10cSrcweir @return 195cdf0e10cSrcweir The corresponding rtl_TextEncoding value, or RTL_TEXTENCODING_DONTKNOW if 196cdf0e10cSrcweir no mapping is applicable. 197cdf0e10cSrcweir */ 198cdf0e10cSrcweir rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromUnixCharset( const sal_Char* pUnixCharset ); 199cdf0e10cSrcweir 200cdf0e10cSrcweir /** Map from a text encoding to the best matching numeric Windows charset. 201cdf0e10cSrcweir 202cdf0e10cSrcweir @param eTextEncoding 203cdf0e10cSrcweir Any rtl_TextEncoding value. 204cdf0e10cSrcweir 205cdf0e10cSrcweir @return 206cdf0e10cSrcweir The best matching numeric Windows charset, or 1 if none matches. 207cdf0e10cSrcweir */ 208cdf0e10cSrcweir sal_uInt8 SAL_CALL rtl_getBestWindowsCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding ); 209cdf0e10cSrcweir 210cdf0e10cSrcweir /** Map from a text encoding to a corresponding MIME charset name, if 211cdf0e10cSrcweir available (see <http://www.iana.org/assignments/character-sets>). 212cdf0e10cSrcweir 213cdf0e10cSrcweir @param nEncoding 214cdf0e10cSrcweir Any rtl_TextEncoding value. 215cdf0e10cSrcweir 216cdf0e10cSrcweir @return 217cdf0e10cSrcweir The (preferred) MIME charset name corresponding to the given encoding, or 218cdf0e10cSrcweir NULL if none is available. 219cdf0e10cSrcweir */ 220cdf0e10cSrcweir char const * SAL_CALL rtl_getMimeCharsetFromTextEncoding(rtl_TextEncoding 221cdf0e10cSrcweir nEncoding); 222cdf0e10cSrcweir 223cdf0e10cSrcweir /** Map from a text encoding to the best matching MIME charset. 224cdf0e10cSrcweir 225cdf0e10cSrcweir @param eTextEncoding 226cdf0e10cSrcweir Any rtl_TextEncoding value. 227cdf0e10cSrcweir 228cdf0e10cSrcweir @return 229cdf0e10cSrcweir The best matching MIME charset string, or null if none matches. 230cdf0e10cSrcweir */ 231cdf0e10cSrcweir const sal_Char* SAL_CALL rtl_getBestMimeCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding ); 232cdf0e10cSrcweir 233cdf0e10cSrcweir /** Map from a text encoding to the best matching Unix charset. 234cdf0e10cSrcweir 235cdf0e10cSrcweir @param eTextEncoding 236cdf0e10cSrcweir Any rtl_TextEncoding value. 237cdf0e10cSrcweir 238cdf0e10cSrcweir @return 239cdf0e10cSrcweir The best matching Unix charset string, or null if none matches. 240cdf0e10cSrcweir */ 241cdf0e10cSrcweir const sal_Char* SAL_CALL rtl_getBestUnixCharsetFromTextEncoding( rtl_TextEncoding eTextEncoding ); 242cdf0e10cSrcweir 243cdf0e10cSrcweir /** Map from a Windows code page to a text encoding. 244cdf0e10cSrcweir 245cdf0e10cSrcweir @param nCodePage 246cdf0e10cSrcweir Any Windows code page number. 247cdf0e10cSrcweir 248cdf0e10cSrcweir @return 249cdf0e10cSrcweir The corresponding rtl_TextEncoding value (which will be an octet text 250cdf0e10cSrcweir encoding, see rtl_isOctetTextEncoding), or RTL_TEXTENCODING_DONTKNOW if no 251cdf0e10cSrcweir mapping is applicable. 252cdf0e10cSrcweir */ 253cdf0e10cSrcweir rtl_TextEncoding SAL_CALL 254cdf0e10cSrcweir rtl_getTextEncodingFromWindowsCodePage(sal_uInt32 nCodePage); 255cdf0e10cSrcweir 256cdf0e10cSrcweir /** Map from a text encoding to a Windows code page. 257cdf0e10cSrcweir 258cdf0e10cSrcweir @param nEncoding 259cdf0e10cSrcweir Any rtl_TextEncoding value. 260cdf0e10cSrcweir 261cdf0e10cSrcweir @return 262cdf0e10cSrcweir The corresponding Windows code page number, or 0 if no mapping is 263cdf0e10cSrcweir applicable. 264cdf0e10cSrcweir */ 265cdf0e10cSrcweir sal_uInt32 SAL_CALL 266cdf0e10cSrcweir rtl_getWindowsCodePageFromTextEncoding(rtl_TextEncoding nEncoding); 267cdf0e10cSrcweir 268cdf0e10cSrcweir #ifdef __cplusplus 269cdf0e10cSrcweir } 270cdf0e10cSrcweir #endif 271cdf0e10cSrcweir 272cdf0e10cSrcweir #endif /* _RTL_TENCINFO_H */ 273