xref: /aoo41x/main/sal/textenc/tcvtutf8.c (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include "sal/types.h"
29 #include "rtl/alloc.h"
30 #include "rtl/textcvt.h"
31 
32 #include "converter.h"
33 #include "tenchelp.h"
34 #include "unichars.h"
35 
36 struct ImplUtf8ToUnicodeContext
37 {
38     sal_uInt32 nUtf32;
39     int nShift;
40     sal_Bool bCheckBom;
41 };
42 
43 struct ImplUnicodeToUtf8Context
44 {
45     sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */
46 };
47 
48 void * ImplCreateUtf8ToUnicodeContext(void)
49 {
50     void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext));
51     ImplResetUtf8ToUnicodeContext(p);
52     return p;
53 }
54 
55 void ImplResetUtf8ToUnicodeContext(void * pContext)
56 {
57     if (pContext != NULL)
58     {
59         ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1;
60         ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True;
61     }
62 }
63 
64 sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData,
65                                   void * pContext, sal_Char const * pSrcBuf,
66                                   sal_Size nSrcBytes, sal_Unicode * pDestBuf,
67                                   sal_Size nDestChars, sal_uInt32 nFlags,
68                                   sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes)
69 {
70 	/*
71        This function is very liberal with the UTF-8 input.  Accepted are:
72        - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041)
73        - surrogates (e.g., ED A0 80 to represent U+D800)
74        - encodings with up to six bytes (everything outside the range
75          U+0000..10FFFF is considered "undefined")
76        The first two of these points allow this routine to translate from both
77        RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8.
78 	  */
79 
80     int bJavaUtf8 = pData != NULL;
81     sal_uInt32 nUtf32 = 0;
82     int nShift = -1;
83     sal_Bool bCheckBom = sal_True;
84     sal_uInt32 nInfo = 0;
85     sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf;
86     sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes;
87     sal_Unicode * pDestBufPtr = pDestBuf;
88     sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars;
89 
90     if (pContext != NULL)
91     {
92         nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32;
93         nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift;
94         bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom;
95     }
96 
97     while (pSrcBufPtr < pSrcBufEnd)
98     {
99         sal_Bool bUndefined = sal_False;
100         int bConsume = sal_True;
101         sal_uInt32 nChar = *pSrcBufPtr++;
102         if (nShift < 0)
103             if (nChar <= 0x7F)
104             {
105                 nUtf32 = nChar;
106                 goto transform;
107             }
108             else if (nChar <= 0xBF)
109                 goto bad_input;
110             else if (nChar <= 0xDF)
111             {
112                 nUtf32 = (nChar & 0x1F) << 6;
113                 nShift = 0;
114             }
115             else if (nChar <= 0xEF)
116             {
117                 nUtf32 = (nChar & 0x0F) << 12;
118                 nShift = 6;
119             }
120             else if (nChar <= 0xF7)
121             {
122                 nUtf32 = (nChar & 0x07) << 18;
123                 nShift = 12;
124             }
125             else if (nChar <= 0xFB)
126             {
127                 nUtf32 = (nChar & 0x03) << 24;
128                 nShift = 18;
129             }
130             else if (nChar <= 0xFD)
131             {
132                 nUtf32 = (nChar & 0x01) << 30;
133                 nShift = 24;
134             }
135             else
136                 goto bad_input;
137         else if ((nChar & 0xC0) == 0x80)
138         {
139             nUtf32 |= (nChar & 0x3F) << nShift;
140             if (nShift == 0)
141                 goto transform;
142             else
143                 nShift -= 6;
144         }
145         else
146         {
147 			/*
148              This byte is preceeded by a broken UTF-8 sequence; if this byte
149              is neither in the range [0x80..0xBF] nor in the range
150              [0xFE..0xFF], assume that this byte does not belong to that
151              broken sequence, but instead starts a new, legal UTF-8 sequence:
152 			 */
153             bConsume = nChar >= 0xFE;
154             goto bad_input;
155         }
156         continue;
157 
158     transform:
159         if (!bCheckBom || nUtf32 != 0xFEFF
160             || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0
161             || bJavaUtf8)
162         {
163             if (nUtf32 <= 0xFFFF)
164                 if (pDestBufPtr != pDestBufEnd)
165                     *pDestBufPtr++ = (sal_Unicode) nUtf32;
166                 else
167                     goto no_output;
168             else if (nUtf32 <= 0x10FFFF)
169                 if (pDestBufEnd - pDestBufPtr >= 2)
170                 {
171                     *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32);
172                     *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32);
173                 }
174                 else
175                     goto no_output;
176             else
177             {
178                 bUndefined = sal_True;
179                 goto bad_input;
180             }
181         }
182         nShift = -1;
183         bCheckBom = sal_False;
184         continue;
185 
186     bad_input:
187         switch (ImplHandleBadInputTextToUnicodeConversion(
188                     bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
189                     &nInfo))
190         {
191         case IMPL_BAD_INPUT_STOP:
192             nShift = -1;
193             bCheckBom = sal_False;
194             if (!bConsume)
195                 --pSrcBufPtr;
196             break;
197 
198         case IMPL_BAD_INPUT_CONTINUE:
199             nShift = -1;
200             bCheckBom = sal_False;
201             if (!bConsume)
202                 --pSrcBufPtr;
203             continue;
204 
205         case IMPL_BAD_INPUT_NO_OUTPUT:
206             goto no_output;
207         }
208         break;
209 
210     no_output:
211         --pSrcBufPtr;
212         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
213         break;
214     }
215 
216     if (nShift >= 0
217         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
218                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
219                == 0)
220     {
221         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
222             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
223         else
224             switch (ImplHandleBadInputTextToUnicodeConversion(
225                         sal_False, sal_True, 0, nFlags, &pDestBufPtr,
226                         pDestBufEnd, &nInfo))
227             {
228             case IMPL_BAD_INPUT_STOP:
229             case IMPL_BAD_INPUT_CONTINUE:
230                 nShift = -1;
231                 bCheckBom = sal_False;
232                 break;
233 
234             case IMPL_BAD_INPUT_NO_OUTPUT:
235                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
236                 break;
237             }
238     }
239 
240     if (pContext != NULL)
241     {
242         ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32;
243         ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift;
244         ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom;
245     }
246     if (pInfo != NULL)
247         *pInfo = nInfo;
248     if (pSrcCvtBytes != NULL)
249         *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf;
250     return pDestBufPtr - pDestBuf;
251 }
252 
253 void * ImplCreateUnicodeToUtf8Context(void)
254 {
255     void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context));
256     ImplResetUnicodeToUtf8Context(p);
257     return p;
258 }
259 
260 void ImplResetUnicodeToUtf8Context(void * pContext)
261 {
262     if (pContext != NULL)
263         ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF;
264 }
265 
266 sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData,
267                                   void * pContext, sal_Unicode const * pSrcBuf,
268                                   sal_Size nSrcChars, sal_Char * pDestBuf,
269                                   sal_Size nDestBytes, sal_uInt32 nFlags,
270                                   sal_uInt32 * pInfo, sal_Size* pSrcCvtChars)
271 {
272     int bJavaUtf8 = pData != NULL;
273     sal_Unicode nHighSurrogate = 0xFFFF;
274     sal_uInt32 nInfo = 0;
275     sal_Unicode const * pSrcBufPtr = pSrcBuf;
276     sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars;
277     sal_Char * pDestBufPtr = pDestBuf;
278     sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes;
279 
280     if (pContext != NULL)
281         nHighSurrogate
282             = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate;
283 
284     if (nHighSurrogate == 0xFFFF)
285     {
286         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0
287             && !bJavaUtf8)
288         {
289             if (pDestBufEnd - pDestBufPtr >= 3)
290             {
291                 /* Write BOM (U+FEFF) as UTF-8: */
292                 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF;
293                 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB;
294                 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF;
295             }
296             else
297             {
298                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
299                 goto done;
300             }
301         }
302         nHighSurrogate = 0;
303     }
304 
305     while (pSrcBufPtr < pSrcBufEnd)
306     {
307         sal_uInt32 nChar = *pSrcBufPtr++;
308         if (nHighSurrogate == 0)
309         {
310             if (ImplIsHighSurrogate(nChar) && !bJavaUtf8)
311             {
312                 nHighSurrogate = (sal_Unicode) nChar;
313                 continue;
314             }
315         }
316         else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8)
317             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
318         else
319             goto bad_input;
320 
321         if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8)
322             || ImplIsNoncharacter(nChar))
323             goto bad_input;
324 
325         if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0))
326             if (pDestBufPtr != pDestBufEnd)
327                 *pDestBufPtr++ = (sal_Char) nChar;
328             else
329                 goto no_output;
330         else if (nChar <= 0x7FF)
331             if (pDestBufEnd - pDestBufPtr >= 2)
332             {
333                 *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6));
334                 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
335             }
336             else
337                 goto no_output;
338         else if (nChar <= 0xFFFF)
339             if (pDestBufEnd - pDestBufPtr >= 3)
340             {
341                 *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12));
342                 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
343                 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
344             }
345             else
346                 goto no_output;
347         else if (pDestBufEnd - pDestBufPtr >= 4)
348         {
349             *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18));
350             *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F));
351             *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F));
352             *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F));
353         }
354         else
355             goto no_output;
356         nHighSurrogate = 0;
357         continue;
358 
359     bad_input:
360         switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags,
361                                                           &pDestBufPtr,
362                                                           pDestBufEnd, &nInfo,
363                                                           NULL, 0, NULL))
364         {
365         case IMPL_BAD_INPUT_STOP:
366             nHighSurrogate = 0;
367             break;
368 
369         case IMPL_BAD_INPUT_CONTINUE:
370             nHighSurrogate = 0;
371             continue;
372 
373         case IMPL_BAD_INPUT_NO_OUTPUT:
374             goto no_output;
375         }
376         break;
377 
378     no_output:
379         --pSrcBufPtr;
380         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
381         break;
382     }
383 
384     if (nHighSurrogate != 0
385         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
386                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
387                == 0)
388     {
389         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
390             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
391         else
392             switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0,
393                                                               nFlags,
394                                                               &pDestBufPtr,
395                                                               pDestBufEnd,
396                                                               &nInfo, NULL, 0,
397                                                               NULL))
398             {
399             case IMPL_BAD_INPUT_STOP:
400             case IMPL_BAD_INPUT_CONTINUE:
401                 nHighSurrogate = 0;
402                 break;
403 
404             case IMPL_BAD_INPUT_NO_OUTPUT:
405                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
406                 break;
407             }
408     }
409 
410  done:
411     if (pContext != NULL)
412         ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate
413             = nHighSurrogate;
414     if (pInfo != NULL)
415         *pInfo = nInfo;
416     if (pSrcCvtChars != NULL)
417         *pSrcCvtChars = pSrcBufPtr - pSrcBuf;
418     return pDestBufPtr - pDestBuf;
419 }
420