1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #include "convertiso2022jp.h" 29 #include "context.h" 30 #include "converter.h" 31 #include "tenchelp.h" 32 #include "unichars.h" 33 #include "rtl/alloc.h" 34 #include "rtl/textcvt.h" 35 #include "sal/types.h" 36 37 typedef enum /* order is important: */ 38 { 39 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII, 40 IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN, 41 IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208, 42 IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2, 43 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC, 44 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN, 45 IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR 46 } ImplIso2022JpToUnicodeState; 47 48 typedef struct 49 { 50 ImplIso2022JpToUnicodeState m_eState; 51 sal_uInt32 m_nRow; 52 } ImplIso2022JpToUnicodeContext; 53 54 typedef struct 55 { 56 sal_Unicode m_nHighSurrogate; 57 sal_Bool m_b0208; 58 } ImplUnicodeToIso2022JpContext; 59 60 void * ImplCreateIso2022JpToUnicodeContext(void) 61 { 62 void * pContext 63 = rtl_allocateMemory(sizeof (ImplIso2022JpToUnicodeContext)); 64 ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState 65 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 66 return pContext; 67 } 68 69 void ImplResetIso2022JpToUnicodeContext(void * pContext) 70 { 71 if (pContext) 72 ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState 73 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 74 } 75 76 sal_Size ImplConvertIso2022JpToUnicode(ImplTextConverterData const * pData, 77 void * pContext, 78 sal_Char const * pSrcBuf, 79 sal_Size nSrcBytes, 80 sal_Unicode * pDestBuf, 81 sal_Size nDestChars, 82 sal_uInt32 nFlags, 83 sal_uInt32 * pInfo, 84 sal_Size * pSrcCvtBytes) 85 { 86 ImplDBCSToUniLeadTab const * pJisX0208Data 87 = ((ImplIso2022JpConverterData const *) pData)-> 88 m_pJisX0208ToUnicodeData; 89 ImplIso2022JpToUnicodeState eState 90 = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 91 sal_uInt32 nRow = 0; 92 sal_uInt32 nInfo = 0; 93 sal_Size nConverted = 0; 94 sal_Unicode * pDestBufPtr = pDestBuf; 95 sal_Unicode * pDestBufEnd = pDestBuf + nDestChars; 96 97 if (pContext) 98 { 99 eState = ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState; 100 nRow = ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow; 101 } 102 103 for (; nConverted < nSrcBytes; ++nConverted) 104 { 105 sal_Bool bUndefined = sal_True; 106 sal_uInt32 nChar = *(sal_uChar const *) pSrcBuf++; 107 switch (eState) 108 { 109 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII: 110 if (nChar == 0x1B) /* ESC */ 111 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; 112 else if (nChar < 0x80) 113 if (pDestBufPtr != pDestBufEnd) 114 *pDestBufPtr++ = (sal_Unicode) nChar; 115 else 116 goto no_output; 117 else 118 { 119 bUndefined = sal_False; 120 goto bad_input; 121 } 122 break; 123 124 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN: 125 if (nChar == 0x1B) /* ESC */ 126 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; 127 else if (nChar < 0x80) 128 if (pDestBufPtr != pDestBufEnd) 129 { 130 switch (nChar) 131 { 132 case 0x5C: /* \ */ 133 nChar = 0xA5; /* YEN SIGN */ 134 break; 135 136 case 0x7E: /* ~ */ 137 nChar = 0xAF; /* MACRON */ 138 break; 139 } 140 *pDestBufPtr++ = (sal_Unicode) nChar; 141 } 142 else 143 goto no_output; 144 else 145 { 146 bUndefined = sal_False; 147 goto bad_input; 148 } 149 break; 150 151 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208: 152 if (nChar == 0x1B) /* ESC */ 153 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC; 154 else if (nChar >= 0x21 && nChar <= 0x7E) 155 { 156 nRow = nChar; 157 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2; 158 } 159 else 160 { 161 bUndefined = sal_False; 162 goto bad_input; 163 } 164 break; 165 166 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208_2: 167 if (nChar >= 0x21 && nChar <= 0x7E) 168 { 169 sal_uInt16 nUnicode = 0; 170 sal_uInt32 nFirst = pJisX0208Data[nRow].mnTrailStart; 171 if (nChar >= nFirst 172 && nChar <= pJisX0208Data[nRow].mnTrailEnd) 173 nUnicode = pJisX0208Data[nRow]. 174 mpToUniTrailTab[nChar - nFirst]; 175 if (nUnicode != 0) 176 if (pDestBufPtr != pDestBufEnd) 177 { 178 *pDestBufPtr++ = (sal_Unicode) nUnicode; 179 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208; 180 } 181 else 182 goto no_output; 183 else 184 goto bad_input; 185 } 186 else 187 { 188 bUndefined = sal_False; 189 goto bad_input; 190 } 191 break; 192 193 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC: 194 switch (nChar) 195 { 196 case 0x24: /* $ */ 197 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR; 198 break; 199 200 case 0x28: /* ( */ 201 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN; 202 break; 203 204 default: 205 bUndefined = sal_False; 206 goto bad_input; 207 } 208 break; 209 210 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_LPAREN: 211 switch (nChar) 212 { 213 case 0x42: /* A */ 214 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 215 break; 216 217 case 0x4A: /* J */ 218 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_JIS_ROMAN; 219 break; 220 221 default: 222 bUndefined = sal_False; 223 goto bad_input; 224 } 225 break; 226 227 case IMPL_ISO_2022_JP_TO_UNICODE_STATE_ESC_DOLLAR: 228 switch (nChar) 229 { 230 case 0x40: /* @ */ 231 case 0x42: /* B */ 232 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208; 233 break; 234 235 default: 236 bUndefined = sal_False; 237 goto bad_input; 238 } 239 break; 240 } 241 continue; 242 243 bad_input: 244 switch (ImplHandleBadInputTextToUnicodeConversion( 245 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 246 &nInfo)) 247 { 248 case IMPL_BAD_INPUT_STOP: 249 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 250 break; 251 252 case IMPL_BAD_INPUT_CONTINUE: 253 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 254 continue; 255 256 case IMPL_BAD_INPUT_NO_OUTPUT: 257 goto no_output; 258 } 259 break; 260 261 no_output: 262 --pSrcBuf; 263 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 264 break; 265 } 266 267 if (eState > IMPL_ISO_2022_JP_TO_UNICODE_STATE_0208 268 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR 269 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) 270 == 0) 271 { 272 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) 273 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; 274 else 275 switch (ImplHandleBadInputTextToUnicodeConversion( 276 sal_False, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 277 &nInfo)) 278 { 279 case IMPL_BAD_INPUT_STOP: 280 case IMPL_BAD_INPUT_CONTINUE: 281 eState = IMPL_ISO_2022_JP_TO_UNICODE_STATE_ASCII; 282 break; 283 284 case IMPL_BAD_INPUT_NO_OUTPUT: 285 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 286 break; 287 } 288 } 289 290 if (pContext) 291 { 292 ((ImplIso2022JpToUnicodeContext *) pContext)->m_eState = eState; 293 ((ImplIso2022JpToUnicodeContext *) pContext)->m_nRow = nRow; 294 } 295 if (pInfo) 296 *pInfo = nInfo; 297 if (pSrcCvtBytes) 298 *pSrcCvtBytes = nConverted; 299 300 return pDestBufPtr - pDestBuf; 301 } 302 303 void * ImplCreateUnicodeToIso2022JpContext(void) 304 { 305 void * pContext 306 = rtl_allocateMemory(sizeof (ImplUnicodeToIso2022JpContext)); 307 ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0; 308 ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False; 309 return pContext; 310 } 311 312 void ImplResetUnicodeToIso2022JpContext(void * pContext) 313 { 314 if (pContext) 315 { 316 ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate = 0; 317 ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = sal_False; 318 } 319 } 320 321 sal_Size ImplConvertUnicodeToIso2022Jp(ImplTextConverterData const * pData, 322 void * pContext, 323 sal_Unicode const * pSrcBuf, 324 sal_Size nSrcChars, 325 sal_Char * pDestBuf, 326 sal_Size nDestBytes, 327 sal_uInt32 nFlags, 328 sal_uInt32 * pInfo, 329 sal_Size * pSrcCvtChars) 330 { 331 ImplUniToDBCSHighTab const * pJisX0208Data 332 = ((ImplIso2022JpConverterData const *) pData)-> 333 m_pUnicodeToJisX0208Data; 334 sal_Unicode nHighSurrogate = 0; 335 sal_Bool b0208 = sal_False; 336 sal_uInt32 nInfo = 0; 337 sal_Size nConverted = 0; 338 sal_Char * pDestBufPtr = pDestBuf; 339 sal_Char * pDestBufEnd = pDestBuf + nDestBytes; 340 sal_Bool bWritten; 341 342 if (pContext) 343 { 344 nHighSurrogate 345 = ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate; 346 b0208 = ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208; 347 } 348 349 for (; nConverted < nSrcChars; ++nConverted) 350 { 351 sal_Bool bUndefined = sal_True; 352 sal_uInt32 nChar = *pSrcBuf++; 353 if (nHighSurrogate == 0) 354 { 355 if (ImplIsHighSurrogate(nChar)) 356 { 357 nHighSurrogate = (sal_Unicode) nChar; 358 continue; 359 } 360 } 361 else if (ImplIsLowSurrogate(nChar)) 362 nChar = ImplCombineSurrogates(nHighSurrogate, nChar); 363 else 364 { 365 bUndefined = sal_False; 366 goto bad_input; 367 } 368 369 if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar)) 370 { 371 bUndefined = sal_False; 372 goto bad_input; 373 } 374 375 if (nChar == 0x0A || nChar == 0x0D) /* LF, CR */ 376 { 377 if (b0208) 378 { 379 if (pDestBufEnd - pDestBufPtr >= 3) 380 { 381 *pDestBufPtr++ = 0x1B; /* ESC */ 382 *pDestBufPtr++ = 0x28; /* ( */ 383 *pDestBufPtr++ = 0x42; /* B */ 384 b0208 = sal_False; 385 } 386 else 387 goto no_output; 388 } 389 if (pDestBufPtr != pDestBufEnd) 390 *pDestBufPtr++ = (sal_Char) nChar; 391 else 392 goto no_output; 393 } 394 else if (nChar == 0x1B) 395 goto bad_input; 396 else if (nChar < 0x80) 397 { 398 if (b0208) 399 { 400 if (pDestBufEnd - pDestBufPtr >= 3) 401 { 402 *pDestBufPtr++ = 0x1B; /* ESC */ 403 *pDestBufPtr++ = 0x28; /* ( */ 404 *pDestBufPtr++ = 0x42; /* B */ 405 b0208 = sal_False; 406 } 407 else 408 goto no_output; 409 } 410 if (pDestBufPtr != pDestBufEnd) 411 *pDestBufPtr++ = (sal_Char) nChar; 412 else 413 goto no_output; 414 } 415 else 416 { 417 sal_uInt16 nBytes = 0; 418 sal_uInt32 nIndex1 = nChar >> 8; 419 if (nIndex1 < 0x100) 420 { 421 sal_uInt32 nIndex2 = nChar & 0xFF; 422 sal_uInt32 nFirst = pJisX0208Data[nIndex1].mnLowStart; 423 if (nIndex2 >= nFirst 424 && nIndex2 <= pJisX0208Data[nIndex1].mnLowEnd) 425 { 426 nBytes = pJisX0208Data[nIndex1]. 427 mpToUniTrailTab[nIndex2 - nFirst]; 428 if (nBytes == 0) 429 /* For some reason, the tables in tcvtjp4.tab do not 430 include these two conversions: */ 431 switch (nChar) 432 { 433 case 0xA5: /* YEN SIGN */ 434 nBytes = 0x216F; 435 break; 436 437 case 0xAF: /* MACRON */ 438 nBytes = 0x2131; 439 break; 440 } 441 } 442 } 443 if (nBytes != 0) 444 { 445 if (!b0208) 446 { 447 if (pDestBufEnd - pDestBufPtr >= 3) 448 { 449 *pDestBufPtr++ = 0x1B; /* ESC */ 450 *pDestBufPtr++ = 0x24; /* $ */ 451 *pDestBufPtr++ = 0x42; /* B */ 452 b0208 = sal_True; 453 } 454 else 455 goto no_output; 456 } 457 if (pDestBufEnd - pDestBufPtr >= 2) 458 { 459 *pDestBufPtr++ = (sal_Char) (nBytes >> 8); 460 *pDestBufPtr++ = (sal_Char) (nBytes & 0xFF); 461 } 462 else 463 goto no_output; 464 } 465 else 466 goto bad_input; 467 } 468 nHighSurrogate = 0; 469 continue; 470 471 bad_input: 472 switch (ImplHandleBadInputUnicodeToTextConversion( 473 bUndefined, 474 nChar, 475 nFlags, 476 &pDestBufPtr, 477 pDestBufEnd, 478 &nInfo, 479 "\x1B(B", 480 b0208 ? 3 : 0, 481 &bWritten)) 482 { 483 case IMPL_BAD_INPUT_STOP: 484 nHighSurrogate = 0; 485 break; 486 487 case IMPL_BAD_INPUT_CONTINUE: 488 if (bWritten) 489 b0208 = sal_False; 490 nHighSurrogate = 0; 491 continue; 492 493 case IMPL_BAD_INPUT_NO_OUTPUT: 494 goto no_output; 495 } 496 break; 497 498 no_output: 499 --pSrcBuf; 500 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 501 break; 502 } 503 504 if ((nInfo & (RTL_UNICODETOTEXT_INFO_ERROR 505 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) 506 == 0) 507 { 508 sal_Bool bFlush = sal_True; 509 if (nHighSurrogate != 0) 510 { 511 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 512 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; 513 else 514 switch (ImplHandleBadInputUnicodeToTextConversion( 515 sal_False, 516 0, 517 nFlags, 518 &pDestBufPtr, 519 pDestBufEnd, 520 &nInfo, 521 "\x1B(B", 522 b0208 ? 3 : 0, 523 &bWritten)) 524 { 525 case IMPL_BAD_INPUT_STOP: 526 nHighSurrogate = 0; 527 bFlush = sal_False; 528 break; 529 530 case IMPL_BAD_INPUT_CONTINUE: 531 if (bWritten) 532 b0208 = sal_False; 533 nHighSurrogate = 0; 534 break; 535 536 case IMPL_BAD_INPUT_NO_OUTPUT: 537 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 538 break; 539 } 540 } 541 if (bFlush 542 && b0208 543 && (nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 544 { 545 if (pDestBufEnd - pDestBufPtr >= 3) 546 { 547 *pDestBufPtr++ = 0x1B; /* ESC */ 548 *pDestBufPtr++ = 0x28; /* ( */ 549 *pDestBufPtr++ = 0x42; /* B */ 550 b0208 = sal_False; 551 } 552 else 553 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 554 } 555 } 556 557 if (pContext) 558 { 559 ((ImplUnicodeToIso2022JpContext *) pContext)->m_nHighSurrogate 560 = nHighSurrogate; 561 ((ImplUnicodeToIso2022JpContext *) pContext)->m_b0208 = b0208; 562 } 563 if (pInfo) 564 *pInfo = nInfo; 565 if (pSrcCvtChars) 566 *pSrcCvtChars = nConverted; 567 568 return pDestBufPtr - pDestBuf; 569 } 570