xref: /aoo41x/main/sal/textenc/converteuctw.c (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include "converteuctw.h"
29 #include "context.h"
30 #include "converter.h"
31 #include "tenchelp.h"
32 #include "unichars.h"
33 #include "rtl/alloc.h"
34 #include "rtl/textcvt.h"
35 #include "sal/types.h"
36 
/*
 * Decoder states of the EUC-TW to Unicode converter.
 *
 * The values are consecutive and listed in the order in which the bytes of
 * a sequence are consumed; keep that order when adding or moving entries.
 */
typedef enum
{
    /* No bytes of a multi-byte sequence are pending. */
    IMPL_EUC_TW_TO_UNICODE_STATE_0 = 0,

    /* The first byte (0xA1--0xFE) of a two-byte sequence has been read. */
    IMPL_EUC_TW_TO_UNICODE_STATE_1 = 1,

    /* The 0x8E introducer has been read; the plane byte comes next. */
    IMPL_EUC_TW_TO_UNICODE_STATE_2_1 = 2,

    /* Introducer and plane byte have been read; the row byte comes next. */
    IMPL_EUC_TW_TO_UNICODE_STATE_2_2 = 3,

    /* Introducer, plane and row bytes have been read; one byte remains. */
    IMPL_EUC_TW_TO_UNICODE_STATE_2_3 = 4
} ImplEucTwToUnicodeState;
45 
46 typedef struct
47 {
48     ImplEucTwToUnicodeState m_eState;
49     sal_Int32 m_nPlane; /* 0--15 */
50     sal_Int32 m_nRow; /* 0--93 */
51 } ImplEucTwToUnicodeContext;
52 
53 void * ImplCreateEucTwToUnicodeContext(void)
54 {
55     void * pContext = rtl_allocateMemory(sizeof (ImplEucTwToUnicodeContext));
56     ((ImplEucTwToUnicodeContext *) pContext)->m_eState
57         = IMPL_EUC_TW_TO_UNICODE_STATE_0;
58     return pContext;
59 }
60 
61 void ImplResetEucTwToUnicodeContext(void * pContext)
62 {
63     if (pContext)
64         ((ImplEucTwToUnicodeContext *) pContext)->m_eState
65             = IMPL_EUC_TW_TO_UNICODE_STATE_0;
66 }
67 
68 sal_Size ImplConvertEucTwToUnicode(ImplTextConverterData const * pData,
69                                    void * pContext,
70                                    sal_Char const * pSrcBuf,
71                                    sal_Size nSrcBytes,
72                                    sal_Unicode * pDestBuf,
73                                    sal_Size nDestChars,
74                                    sal_uInt32 nFlags,
75                                    sal_uInt32 * pInfo,
76                                    sal_Size * pSrcCvtBytes)
77 {
78     sal_uInt16 const * pCns116431992Data
79         = ((ImplEucTwConverterData const *) pData)->
80               m_pCns116431992ToUnicodeData;
81     sal_Int32 const * pCns116431992RowOffsets
82         = ((ImplEucTwConverterData const *) pData)->
83               m_pCns116431992ToUnicodeRowOffsets;
84     sal_Int32 const * pCns116431992PlaneOffsets
85         = ((ImplEucTwConverterData const *) pData)->
86               m_pCns116431992ToUnicodePlaneOffsets;
87     ImplEucTwToUnicodeState eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
88     sal_Int32 nPlane = 0;
89     sal_Int32 nRow = 0;
90     sal_uInt32 nInfo = 0;
91     sal_Size nConverted = 0;
92     sal_Unicode * pDestBufPtr = pDestBuf;
93     sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
94 
95     if (pContext)
96     {
97         eState = ((ImplEucTwToUnicodeContext *) pContext)->m_eState;
98         nPlane = ((ImplEucTwToUnicodeContext *) pContext)->m_nPlane;
99         nRow = ((ImplEucTwToUnicodeContext *) pContext)->m_nRow;
100     }
101 
102     for (; nConverted < nSrcBytes; ++nConverted)
103     {
104         sal_Bool bUndefined = sal_True;
105         sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
106         switch (eState)
107         {
108         case IMPL_EUC_TW_TO_UNICODE_STATE_0:
109             if (nChar < 0x80)
110                 if (pDestBufPtr != pDestBufEnd)
111                     *pDestBufPtr++ = (sal_Unicode) nChar;
112                 else
113                     goto no_output;
114             else if (nChar >= 0xA1 && nChar <= 0xFE)
115             {
116                 nRow = nChar - 0xA1;
117                 eState = IMPL_EUC_TW_TO_UNICODE_STATE_1;
118             }
119             else if (nChar == 0x8E)
120                 eState = IMPL_EUC_TW_TO_UNICODE_STATE_2_1;
121             else
122             {
123                 bUndefined = sal_False;
124                 goto bad_input;
125             }
126             break;
127 
128         case IMPL_EUC_TW_TO_UNICODE_STATE_1:
129             if (nChar >= 0xA1 && nChar <= 0xFE)
130             {
131                 nPlane = 0;
132                 goto transform;
133             }
134             else
135             {
136                 bUndefined = sal_False;
137                 goto bad_input;
138             }
139             break;
140 
141         case IMPL_EUC_TW_TO_UNICODE_STATE_2_1:
142             if (nChar >= 0xA1 && nChar <= 0xB0)
143             {
144                 nPlane = nChar - 0xA1;
145                 ++eState;
146             }
147             else
148             {
149                 bUndefined = sal_False;
150                 goto bad_input;
151             }
152             break;
153 
154         case IMPL_EUC_TW_TO_UNICODE_STATE_2_2:
155             if (nChar >= 0xA1 && nChar <= 0xFE)
156             {
157                 nRow = nChar - 0xA1;
158                 ++eState;
159             }
160             else
161             {
162                 bUndefined = sal_False;
163                 goto bad_input;
164             }
165             break;
166 
167         case IMPL_EUC_TW_TO_UNICODE_STATE_2_3:
168             if (nChar >= 0xA1 && nChar <= 0xFE)
169                 goto transform;
170             else
171             {
172                 bUndefined = sal_False;
173                 goto bad_input;
174             }
175             break;
176         }
177         continue;
178 
179     transform:
180         {
181             sal_Int32 nPlaneOffset = pCns116431992PlaneOffsets[nPlane];
182             if (nPlaneOffset == -1)
183                 goto bad_input;
184             else
185             {
186                 sal_Int32 nOffset
187                     = pCns116431992RowOffsets[nPlaneOffset + nRow];
188                 if (nOffset == -1)
189                     goto bad_input;
190                 else
191                 {
192                     sal_uInt32 nFirstLast = pCns116431992Data[nOffset++];
193                     sal_uInt32 nFirst = nFirstLast & 0xFF;
194                     sal_uInt32 nLast = nFirstLast >> 8;
195                     nChar -= 0xA0;
196                     if (nChar >= nFirst && nChar <= nLast)
197                     {
198                         sal_uInt32 nUnicode
199                             = pCns116431992Data[nOffset + (nChar - nFirst)];
200                         if (nUnicode == 0xFFFF)
201                             goto bad_input;
202                         else if (ImplIsHighSurrogate(nUnicode))
203                             if (pDestBufEnd - pDestBufPtr >= 2)
204                             {
205                                 nOffset += nLast - nFirst + 1;
206                                 nFirst = pCns116431992Data[nOffset++];
207                                 *pDestBufPtr++ = (sal_Unicode) nUnicode;
208                                 *pDestBufPtr++
209                                     = (sal_Unicode)
210                                           pCns116431992Data[
211                                               nOffset + (nChar - nFirst)];
212                             }
213                             else
214                                 goto no_output;
215                         else
216                             if (pDestBufPtr != pDestBufEnd)
217                                 *pDestBufPtr++ = (sal_Unicode) nUnicode;
218                             else
219                                 goto no_output;
220                     }
221                     else
222                         goto bad_input;
223                     eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
224                 }
225             }
226             continue;
227         }
228 
229     bad_input:
230         switch (ImplHandleBadInputTextToUnicodeConversion(
231                     bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
232                     &nInfo))
233         {
234         case IMPL_BAD_INPUT_STOP:
235             eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
236             break;
237 
238         case IMPL_BAD_INPUT_CONTINUE:
239             eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
240             continue;
241 
242         case IMPL_BAD_INPUT_NO_OUTPUT:
243             goto no_output;
244         }
245         break;
246 
247     no_output:
248         --pSrcBuf;
249         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
250         break;
251     }
252 
253     if (eState != IMPL_EUC_TW_TO_UNICODE_STATE_0
254         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
255                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
256                == 0)
257     {
258         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
259             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
260         else
261             switch (ImplHandleBadInputTextToUnicodeConversion(
262                         sal_False, sal_True, 0, nFlags, &pDestBufPtr,
263                         pDestBufEnd, &nInfo))
264             {
265             case IMPL_BAD_INPUT_STOP:
266             case IMPL_BAD_INPUT_CONTINUE:
267                 eState = IMPL_EUC_TW_TO_UNICODE_STATE_0;
268                 break;
269 
270             case IMPL_BAD_INPUT_NO_OUTPUT:
271                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
272                 break;
273             }
274     }
275 
276     if (pContext)
277     {
278         ((ImplEucTwToUnicodeContext *) pContext)->m_eState = eState;
279         ((ImplEucTwToUnicodeContext *) pContext)->m_nPlane = nPlane;
280         ((ImplEucTwToUnicodeContext *) pContext)->m_nRow = nRow;
281     }
282     if (pInfo)
283         *pInfo = nInfo;
284     if (pSrcCvtBytes)
285         *pSrcCvtBytes = nConverted;
286 
287     return pDestBufPtr - pDestBuf;
288 }
289 
290 sal_Size ImplConvertUnicodeToEucTw(ImplTextConverterData const * pData,
291                                    void * pContext,
292                                    sal_Unicode const * pSrcBuf,
293                                    sal_Size nSrcChars,
294                                    sal_Char * pDestBuf,
295                                    sal_Size nDestBytes,
296                                    sal_uInt32 nFlags,
297                                    sal_uInt32 * pInfo,
298                                    sal_Size * pSrcCvtChars)
299 {
300     sal_uInt8 const * pCns116431992Data
301         = ((ImplEucTwConverterData const *) pData)->
302               m_pUnicodeToCns116431992Data;
303     sal_Int32 const * pCns116431992PageOffsets
304         = ((ImplEucTwConverterData const *) pData)->
305               m_pUnicodeToCns116431992PageOffsets;
306     sal_Int32 const * pCns116431992PlaneOffsets
307         = ((ImplEucTwConverterData const *) pData)->
308               m_pUnicodeToCns116431992PlaneOffsets;
309     sal_Unicode nHighSurrogate = 0;
310     sal_uInt32 nInfo = 0;
311     sal_Size nConverted = 0;
312     sal_Char * pDestBufPtr = pDestBuf;
313     sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
314 
315     if (pContext)
316         nHighSurrogate
317             = ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate;
318 
319     for (; nConverted < nSrcChars; ++nConverted)
320     {
321         sal_Bool bUndefined = sal_True;
322         sal_uInt32 nChar = *pSrcBuf++;
323         if (nHighSurrogate == 0)
324         {
325             if (ImplIsHighSurrogate(nChar))
326             {
327                 nHighSurrogate = (sal_Unicode) nChar;
328                 continue;
329             }
330         }
331         else if (ImplIsLowSurrogate(nChar))
332             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
333         else
334         {
335             bUndefined = sal_False;
336             goto bad_input;
337         }
338 
339         if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
340         {
341             bUndefined = sal_False;
342             goto bad_input;
343         }
344 
345         if (nChar < 0x80)
346             if (pDestBufPtr != pDestBufEnd)
347                 *pDestBufPtr++ = (sal_Char) nChar;
348             else
349                 goto no_output;
350         else
351         {
352             sal_Int32 nOffset = pCns116431992PlaneOffsets[nChar >> 16];
353             sal_uInt32 nFirst;
354             sal_uInt32 nLast;
355             sal_uInt32 nPlane;
356             if (nOffset == -1)
357                 goto bad_input;
358             nOffset
359                 = pCns116431992PageOffsets[nOffset + ((nChar & 0xFF00) >> 8)];
360             if (nOffset == -1)
361                 goto bad_input;
362             nFirst = pCns116431992Data[nOffset++];
363             nLast = pCns116431992Data[nOffset++];
364             nChar &= 0xFF;
365             if (nChar < nFirst || nChar > nLast)
366                 goto bad_input;
367             nOffset += 3 * (nChar - nFirst);
368             nPlane = pCns116431992Data[nOffset++];
369             if (nPlane == 0)
370                 goto bad_input;
371             if (pDestBufEnd - pDestBufPtr < (nPlane == 1 ? 2 : 4))
372                 goto no_output;
373             if (nPlane != 1)
374             {
375                 *pDestBufPtr++ = (sal_Char) (unsigned char) 0x8E;
376                 *pDestBufPtr++ = (sal_Char) (0xA0 + nPlane);
377             }
378             *pDestBufPtr++ = (sal_Char) (0xA0 + pCns116431992Data[nOffset++]);
379             *pDestBufPtr++ = (sal_Char) (0xA0 + pCns116431992Data[nOffset]);
380         }
381         nHighSurrogate = 0;
382         continue;
383 
384     bad_input:
385         switch (ImplHandleBadInputUnicodeToTextConversion(bUndefined,
386                                                           nChar,
387                                                           nFlags,
388                                                           &pDestBufPtr,
389                                                           pDestBufEnd,
390                                                           &nInfo,
391                                                           NULL,
392                                                           0,
393                                                           NULL))
394         {
395         case IMPL_BAD_INPUT_STOP:
396             nHighSurrogate = 0;
397             break;
398 
399         case IMPL_BAD_INPUT_CONTINUE:
400             nHighSurrogate = 0;
401             continue;
402 
403         case IMPL_BAD_INPUT_NO_OUTPUT:
404             goto no_output;
405         }
406         break;
407 
408     no_output:
409         --pSrcBuf;
410         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
411         break;
412     }
413 
414     if (nHighSurrogate != 0
415         && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
416                          | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
417                == 0)
418     {
419         if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
420             nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
421         else
422             switch (ImplHandleBadInputUnicodeToTextConversion(sal_False,
423                                                               0,
424                                                               nFlags,
425                                                               &pDestBufPtr,
426                                                               pDestBufEnd,
427                                                               &nInfo,
428                                                               NULL,
429                                                               0,
430                                                               NULL))
431             {
432             case IMPL_BAD_INPUT_STOP:
433             case IMPL_BAD_INPUT_CONTINUE:
434                 nHighSurrogate = 0;
435                 break;
436 
437             case IMPL_BAD_INPUT_NO_OUTPUT:
438                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
439                 break;
440             }
441     }
442 
443     if (pContext)
444         ((ImplUnicodeToTextContext *) pContext)->m_nHighSurrogate
445             = nHighSurrogate;
446     if (pInfo)
447         *pInfo = nInfo;
448     if (pSrcCvtChars)
449         *pSrcCvtChars = nConverted;
450 
451     return pDestBufPtr - pDestBuf;
452 }
453