xref: /aoo41x/main/sal/textenc/convertiso2022jp.c (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include "convertiso2022jp.h"
29 #include "context.h"
30 #include "converter.h"
31 #include "tenchelp.h"
32 #include "unichars.h"
33 #include "rtl/alloc.h"
34 #include "rtl/textcvt.h"
35 #include "sal/types.h"
36 
/*
 * Decoder states for the ISO-2022-JP to Unicode direction.
 *
 * The numeric order is significant: ImplConvertIso2022JpToUnicode treats
 * every state greater than ..._STATE_0208 as "input ended in the middle of
 * a two-byte character or an escape sequence".  The values are therefore
 * spelled out explicitly.
 */
typedef enum
{
    /* Complete states: no partial character or escape sequence pending. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII = 0,
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN = 1,
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208 = 2,

    /* Incomplete states: more input is needed to finish the current item. */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2 = 3,     /* lead byte seen */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC = 4,        /* ESC seen */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN = 5, /* ESC ( seen */
    IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR = 6  /* ESC $ seen */
} ImplIso2022JpToUnicodeState;
47 
/*
 * State that ImplConvertIso2022JpToUnicode loads at the start of a call and
 * stores back at the end, so that input can be decoded in several chunks.
 */
typedef struct
{
    /* Decoder state reached at the end of the previous call. */
    ImplIso2022JpToUnicodeState m_eState;
    /* First byte of a two-byte JIS X 0208 character; only meaningful while
       m_eState is IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2. */
    sal_uInt32 m_nRow;
} ImplIso2022JpToUnicodeContext;
53 
/*
 * State that ImplConvertUnicodeToIso2022Jp loads at the start of a call and
 * stores back at the end, so that input can be encoded in several chunks.
 */
typedef struct
{
    /* High surrogate still waiting for its low surrogate, or 0 if none. */
    sal_Unicode m_nHighSurrogate;
    /* sal_True while the output is switched to JIS X 0208 (ESC $ B has been
       written and not yet undone by ESC ( B). */
    sal_Bool m_b0208;
} ImplUnicodeToIso2022JpContext;
59 
60 void * ImplCreateIso2022JpToUnicodeContext(void)
61 {
62     void * pContext
63         = rtl_allocateMemory(sizeof (ImplIso2022JpToUnicodeContext));
64     ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState
65         = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
66     return pContext;
67 }
68 
69 void ImplResetIso2022JpToUnicodeContext(void * pContext)
70 {
71     if (pContext)
72         ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState
73             = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
74 }
75 
76 sal_Size ImplConvertIso2022JpToUnicode(ImplTextConverterData const * pData,
77                                        void * pContext,
78                                        sal_Char const * pSrcBuf,
79                                        sal_Size nSrcBytes,
80                                        sal_Unicode * pDestBuf,
81                                        sal_Size nDestChars,
82                                        sal_uInt32 nFlags,
83                                        sal_uInt32 * pInfo,
84                                        sal_Size * pSrcCvtBytes)
85 {
86     ImplDBCSToUniLeadTab const * pJisX0208Data
87         = ((ImplIso2022JpConverterData const *) pData)->
88               m_pJisX0208ToUnicodeData;
89     ImplIso2022JpToUnicodeState eState
90         = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
91     sal_uInt32 nRow = 0;
92     sal_uInt32 nInfo = 0;
93     sal_Size nConverted = 0;
94     sal_Unicode * pDestBufPtr = pDestBuf;
95     sal_Unicode * pDestBufEnd = pDestBuf + nDestChars;
96 
97     if (pContext)
98     {
99         eState = ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState;
100         nRow = ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow;
101     }
102 
103     for (; nConverted < nSrcBytes; ++nConverted)
104     {
105         sal_Bool bUndefined = sal_True;
106         sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++;
107         switch (eState)
108         {
109         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII:
110             if (nChar == 0x1B) /* ESC */
111                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
112             else if (nChar < 0x80)
113                 if (pDestBufPtr != pDestBufEnd)
114                     *pDestBufPtr++ = (sal_Unicode) nChar;
115                 else
116                     goto no_output;
117             else
118             {
119                 bUndefined = sal_False;
120                 goto bad_input;
121             }
122             break;
123 
124         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN:
125             if (nChar == 0x1B) /* ESC */
126                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
127             else if (nChar < 0x80)
128                 if (pDestBufPtr != pDestBufEnd)
129                 {
130                     switch (nChar)
131                     {
132                     case 0x5C: /* \ */
133                         nChar = 0xA5; /* YEN SIGN */
134                         break;
135 
136                     case 0x7E: /* ~ */
137                         nChar = 0xAF; /* MACRON */
138                         break;
139                     }
140                     *pDestBufPtr++ = (sal_Unicode) nChar;
141                 }
142                 else
143                     goto no_output;
144             else
145             {
146                 bUndefined = sal_False;
147                 goto bad_input;
148             }
149             break;
150 
151         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208:
152             if (nChar == 0x1B) /* ESC */
153                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC;
154             else if (nChar >= 0x21 && nChar <= 0x7E)
155             {
156                 nRow = nChar;
157                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2;
158             }
159             else
160             {
161                 bUndefined = sal_False;
162                 goto bad_input;
163             }
164             break;
165 
166         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2:
167             if (nChar >= 0x21 && nChar <= 0x7E)
168             {
169                 sal_uInt16 nUnicode = 0;
170                 sal_uInt32 nFirst = pJisX0208Data[nRow].mnTrailStart;
171                 if (nChar >= nFirst
172                     && nChar <= pJisX0208Data[nRow].mnTrailEnd)
173                     nUnicode = pJisX0208Data[nRow].
174                                    mpToUniTrailTab[nChar - nFirst];
175                 if (nUnicode != 0)
176                     if (pDestBufPtr != pDestBufEnd)
177                     {
178                         *pDestBufPtr++ = (sal_Unicode) nUnicode;
179                         eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
180                     }
181                     else
182                         goto no_output;
183                 else
184                     goto bad_input;
185             }
186             else
187             {
188                 bUndefined = sal_False;
189                 goto bad_input;
190             }
191             break;
192 
193         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC:
194             switch (nChar)
195             {
196             case 0x24: /* $ */
197                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR;
198                 break;
199 
200             case 0x28: /* ( */
201                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN;
202                 break;
203 
204             default:
205                 bUndefined = sal_False;
206                 goto bad_input;
207             }
208             break;
209 
210         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN:
211             switch (nChar)
212             {
213             case 0x42: /* A */
214                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
215                 break;
216 
217             case 0x4A: /* J */
218                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN;
219                 break;
220 
221             default:
222                 bUndefined = sal_False;
223                 goto bad_input;
224             }
225             break;
226 
227         case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR:
228             switch (nChar)
229             {
230             case 0x40: /* @ */
231             case 0x42: /* B */
232                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208;
233                 break;
234 
235             default:
236                 bUndefined = sal_False;
237                 goto bad_input;
238             }
239             break;
240         }
241         continue;
242 
243     bad_input:
244         switch (ImplHandleBadInputTextToUnicodeConversion(
245                     bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
246                     &nInfo))
247         {
248         case IMPL_BAD_INPUT_STOP:
249             eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
250             break;
251 
252         case IMPL_BAD_INPUT_CONTINUE:
253             eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
254             continue;
255 
256         case IMPL_BAD_INPUT_NO_OUTPUT:
257             goto no_output;
258         }
259         break;
260 
261     no_output:
262         --pSrcBuf;
263         nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
264         break;
265     }
266 
267     if (eState > IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208
268         && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR
269                          | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL))
270                == 0)
271     {
272         if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0)
273             nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL;
274         else
275             switch (ImplHandleBadInputTextToUnicodeConversion(
276                         sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd,
277                         &nInfo))
278             {
279             case IMPL_BAD_INPUT_STOP:
280             case IMPL_BAD_INPUT_CONTINUE:
281                 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII;
282                 break;
283 
284             case IMPL_BAD_INPUT_NO_OUTPUT:
285                 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL;
286                 break;
287             }
288     }
289 
290     if (pContext)
291     {
292         ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState = eState;
293         ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow = nRow;
294     }
295     if (pInfo)
296         *pInfo = nInfo;
297     if (pSrcCvtBytes)
298         *pSrcCvtBytes = nConverted;
299 
300     return pDestBufPtr - pDestBuf;
301 }
302 
303 void * ImplCreateUnicodeToIso2022JpContext(void)
304 {
305     void * pContext
306         = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022JpContext));
307     ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0;
308     ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False;
309     return pContext;
310 }
311 
312 void ImplResetUnicodeToIso2022JpContext(void * pContext)
313 {
314     if (pContext)
315     {
316         ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0;
317         ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False;
318     }
319 }
320 
321 sal_Size ImplConvertUnicodeToIso2022Jp(ImplTextConverterData const * pData,
322                                        void * pContext,
323                                        sal_Unicode const * pSrcBuf,
324                                        sal_Size nSrcChars,
325                                        sal_Char * pDestBuf,
326                                        sal_Size nDestBytes,
327                                        sal_uInt32 nFlags,
328                                        sal_uInt32 * pInfo,
329                                        sal_Size * pSrcCvtChars)
330 {
331     ImplUniToDBCSHighTab const * pJisX0208Data
332         = ((ImplIso2022JpConverterData const *) pData)->
333               m_pUnicodeToJisX0208Data;
334     sal_Unicode nHighSurrogate = 0;
335     sal_Bool b0208 = sal_False;
336     sal_uInt32 nInfo = 0;
337     sal_Size nConverted = 0;
338     sal_Char * pDestBufPtr = pDestBuf;
339     sal_Char * pDestBufEnd = pDestBuf + nDestBytes;
340     sal_Bool bWritten;
341 
342     if (pContext)
343     {
344         nHighSurrogate
345             = ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate;
346         b0208 = ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208;
347     }
348 
349     for (; nConverted < nSrcChars; ++nConverted)
350     {
351         sal_Bool bUndefined = sal_True;
352         sal_uInt32 nChar = *pSrcBuf++;
353         if (nHighSurrogate == 0)
354         {
355             if (ImplIsHighSurrogate(nChar))
356             {
357                 nHighSurrogate = (sal_Unicode) nChar;
358                 continue;
359             }
360         }
361         else if (ImplIsLowSurrogate(nChar))
362             nChar = ImplCombineSurrogates(nHighSurrogate, nChar);
363         else
364         {
365             bUndefined = sal_False;
366             goto bad_input;
367         }
368 
369         if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar))
370         {
371             bUndefined = sal_False;
372             goto bad_input;
373         }
374 
375         if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */
376         {
377             if (b0208)
378             {
379                 if (pDestBufEnd - pDestBufPtr >= 3)
380                 {
381                     *pDestBufPtr++ = 0x1B; /* ESC */
382                     *pDestBufPtr++ = 0x28; /* ( */
383                     *pDestBufPtr++ = 0x42; /* B */
384                     b0208 = sal_False;
385                 }
386                 else
387                     goto no_output;
388             }
389             if (pDestBufPtr != pDestBufEnd)
390                 *pDestBufPtr++ = (sal_Char) nChar;
391             else
392                 goto no_output;
393         }
394         else if (nChar == 0x1B)
395             goto bad_input;
396         else if (nChar < 0x80)
397         {
398             if (b0208)
399             {
400                 if (pDestBufEnd - pDestBufPtr >= 3)
401                 {
402                     *pDestBufPtr++ = 0x1B; /* ESC */
403                     *pDestBufPtr++ = 0x28; /* ( */
404                     *pDestBufPtr++ = 0x42; /* B */
405                     b0208 = sal_False;
406                 }
407                 else
408                     goto no_output;
409             }
410             if (pDestBufPtr != pDestBufEnd)
411                 *pDestBufPtr++ = (sal_Char) nChar;
412             else
413                 goto no_output;
414         }
415         else
416         {
417             sal_uInt16 nBytes = 0;
418             sal_uInt32 nIndex1 = nChar >> 8;
419             if (nIndex1 < 0x100)
420             {
421                 sal_uInt32 nIndex2 = nChar & 0xFF;
422                 sal_uInt32 nFirst = pJisX0208Data[nIndex1].mnLowStart;
423                 if (nIndex2 >= nFirst
424                     && nIndex2 <= pJisX0208Data[nIndex1].mnLowEnd)
425                 {
426                     nBytes = pJisX0208Data[nIndex1].
427                                  mpToUniTrailTab[nIndex2 - nFirst];
428                     if (nBytes == 0)
429                         /* For some reason, the tables in tcvtjp4.tab do not
430                            include these two conversions: */
431                         switch (nChar)
432                         {
433                         case 0xA5: /* YEN SIGN */
434                             nBytes = 0x216F;
435                             break;
436 
437                         case 0xAF: /* MACRON */
438                             nBytes = 0x2131;
439                             break;
440                         }
441                 }
442             }
443             if (nBytes != 0)
444             {
445                 if (!b0208)
446                 {
447                     if (pDestBufEnd - pDestBufPtr >= 3)
448                     {
449                         *pDestBufPtr++ = 0x1B; /* ESC */
450                         *pDestBufPtr++ = 0x24; /* $ */
451                         *pDestBufPtr++ = 0x42; /* B */
452                         b0208 = sal_True;
453                     }
454                     else
455                         goto no_output;
456                 }
457                 if (pDestBufEnd - pDestBufPtr >= 2)
458                 {
459                     *pDestBufPtr++ = (sal_Char) (nBytes >> 8);
460                     *pDestBufPtr++ = (sal_Char) (nBytes & 0xFF);
461                 }
462                 else
463                     goto no_output;
464             }
465             else
466                 goto bad_input;
467         }
468         nHighSurrogate = 0;
469         continue;
470 
471     bad_input:
472         switch (ImplHandleBadInputUnicodeToTextConversion(
473                     bUndefined,
474                     nChar,
475                     nFlags,
476                     &pDestBufPtr,
477                     pDestBufEnd,
478                     &nInfo,
479                     "\x1B(B",
480                     b0208 ? 3 : 0,
481                     &bWritten))
482         {
483         case IMPL_BAD_INPUT_STOP:
484             nHighSurrogate = 0;
485             break;
486 
487         case IMPL_BAD_INPUT_CONTINUE:
488             if (bWritten)
489                 b0208 = sal_False;
490             nHighSurrogate = 0;
491             continue;
492 
493         case IMPL_BAD_INPUT_NO_OUTPUT:
494             goto no_output;
495         }
496         break;
497 
498     no_output:
499         --pSrcBuf;
500         nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
501         break;
502     }
503 
504     if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR
505                       | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL))
506             == 0)
507     {
508         sal_Bool bFlush = sal_True;
509         if (nHighSurrogate != 0)
510         {
511             if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
512                 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL;
513             else
514                 switch (ImplHandleBadInputUnicodeToTextConversion(
515                             sal_False,
516                             0,
517                             nFlags,
518                             &pDestBufPtr,
519                             pDestBufEnd,
520                             &nInfo,
521                             "\x1B(B",
522                             b0208 ? 3 : 0,
523                             &bWritten))
524                 {
525                 case IMPL_BAD_INPUT_STOP:
526                     nHighSurrogate = 0;
527                     bFlush = sal_False;
528                     break;
529 
530                 case IMPL_BAD_INPUT_CONTINUE:
531                     if (bWritten)
532                         b0208 = sal_False;
533                     nHighSurrogate = 0;
534                     break;
535 
536                 case IMPL_BAD_INPUT_NO_OUTPUT:
537                     nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
538                     break;
539                 }
540         }
541         if (bFlush
542             && b0208
543             && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0)
544         {
545             if (pDestBufEnd - pDestBufPtr >= 3)
546             {
547                 *pDestBufPtr++ = 0x1B; /* ESC */
548                 *pDestBufPtr++ = 0x28; /* ( */
549                 *pDestBufPtr++ = 0x42; /* B */
550                 b0208 = sal_False;
551             }
552             else
553                 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL;
554         }
555     }
556 
557     if (pContext)
558     {
559         ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate
560             = nHighSurrogate;
561         ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = b0208;
562     }
563     if (pInfo)
564         *pInfo = nInfo;
565     if (pSrcCvtChars)
566         *pSrcCvtChars = nConverted;
567 
568     return pDestBufPtr - pDestBuf;
569 }
570