1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #include "sal/types.h" 29 #include "rtl/alloc.h" 30 #include "rtl/textcvt.h" 31 32 #include "converter.h" 33 #include "tenchelp.h" 34 #include "unichars.h" 35 36 struct ImplUtf8ToUnicodeContext 37 { 38 sal_uInt32 nUtf32; 39 int nShift; 40 sal_Bool bCheckBom; 41 }; 42 43 struct ImplUnicodeToUtf8Context 44 { 45 sal_Unicode nHighSurrogate; /* 0xFFFF: write BOM */ 46 }; 47 48 void * ImplCreateUtf8ToUnicodeContext(void) 49 { 50 void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext)); 51 ImplResetUtf8ToUnicodeContext(p); 52 return p; 53 } 54 55 void ImplResetUtf8ToUnicodeContext(void * pContext) 56 { 57 if (pContext != NULL) 58 { 59 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1; 60 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True; 61 } 62 } 63 64 sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData, 65 void * pContext, sal_Char const * pSrcBuf, 66 sal_Size nSrcBytes, sal_Unicode * pDestBuf, 67 sal_Size nDestChars, sal_uInt32 nFlags, 68 sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) 69 { 70 /* 71 This function is very liberal with the UTF-8 input. Accepted are: 72 - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041) 73 - surrogates (e.g., ED A0 80 to represent U+D800) 74 - encodings with up to six bytes (everything outside the range 75 U+0000..10FFFF is considered "undefined") 76 The first two of these points allow this routine to translate from both 77 RTL_TEXTENCODING_UTF8 and RTL_TEXTENCODING_JAVA_UTF8. 78 */ 79 80 int bJavaUtf8 = pData != NULL; 81 sal_uInt32 nUtf32 = 0; 82 int nShift = -1; 83 sal_Bool bCheckBom = sal_True; 84 sal_uInt32 nInfo = 0; 85 sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf; 86 sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes; 87 sal_Unicode * pDestBufPtr = pDestBuf; 88 sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars; 89 90 if (pContext != NULL) 91 { 92 nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32; 93 nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift; 94 bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom; 95 } 96 97 while (pSrcBufPtr < pSrcBufEnd) 98 { 99 sal_Bool bUndefined = sal_False; 100 int bConsume = sal_True; 101 sal_uInt32 nChar = *pSrcBufPtr++; 102 if (nShift < 0) 103 if (nChar <= 0x7F) 104 { 105 nUtf32 = nChar; 106 goto transform; 107 } 108 else if (nChar <= 0xBF) 109 goto bad_input; 110 else if (nChar <= 0xDF) 111 { 112 nUtf32 = (nChar & 0x1F) << 6; 113 nShift = 0; 114 } 115 else if (nChar <= 0xEF) 116 { 117 nUtf32 = (nChar & 0x0F) << 12; 118 nShift = 6; 119 } 120 else if (nChar <= 0xF7) 121 { 122 nUtf32 = (nChar & 0x07) << 18; 123 nShift = 12; 124 } 125 else if (nChar <= 0xFB) 126 { 127 nUtf32 = (nChar & 0x03) << 24; 128 nShift = 18; 129 } 130 else if (nChar <= 0xFD) 131 { 132 nUtf32 = (nChar & 0x01) << 30; 133 nShift = 24; 134 } 135 else 136 goto bad_input; 137 else if ((nChar & 0xC0) == 0x80) 138 { 139 nUtf32 |= (nChar & 0x3F) << nShift; 140 if (nShift == 0) 141 goto transform; 142 else 143 nShift -= 6; 144 } 145 else 146 { 147 /* 148 This byte is preceeded by a broken UTF-8 sequence; if this byte 149 is neither in the range [0x80..0xBF] nor in the range 150 [0xFE..0xFF], assume that this byte does not belong to that 151 broken sequence, but instead starts a new, legal UTF-8 sequence: 152 */ 153 bConsume = nChar >= 0xFE; 154 goto bad_input; 155 } 156 continue; 157 158 transform: 159 if (!bCheckBom || nUtf32 != 0xFEFF 160 || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0 161 || bJavaUtf8) 162 { 163 if (nUtf32 <= 0xFFFF) 164 if (pDestBufPtr != pDestBufEnd) 165 *pDestBufPtr++ = (sal_Unicode) nUtf32; 166 else 167 goto no_output; 168 else if (nUtf32 <= 0x10FFFF) 169 if (pDestBufEnd - pDestBufPtr >= 2) 170 { 171 *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32); 172 *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32); 173 } 174 else 175 goto no_output; 176 else 177 { 178 bUndefined = sal_True; 179 goto bad_input; 180 } 181 } 182 nShift = -1; 183 bCheckBom = sal_False; 184 continue; 185 186 bad_input: 187 switch (ImplHandleBadInputTextToUnicodeConversion( 188 bUndefined, sal_True, 0, nFlags, &pDestBufPtr, pDestBufEnd, 189 &nInfo)) 190 { 191 case IMPL_BAD_INPUT_STOP: 192 nShift = -1; 193 bCheckBom = sal_False; 194 if (!bConsume) 195 --pSrcBufPtr; 196 break; 197 198 case IMPL_BAD_INPUT_CONTINUE: 199 nShift = -1; 200 bCheckBom = sal_False; 201 if (!bConsume) 202 --pSrcBufPtr; 203 continue; 204 205 case IMPL_BAD_INPUT_NO_OUTPUT: 206 goto no_output; 207 } 208 break; 209 210 no_output: 211 --pSrcBufPtr; 212 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 213 break; 214 } 215 216 if (nShift >= 0 217 && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR 218 | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) 219 == 0) 220 { 221 if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) 222 nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; 223 else 224 switch (ImplHandleBadInputTextToUnicodeConversion( 225 sal_False, sal_True, 0, nFlags, &pDestBufPtr, 226 pDestBufEnd, &nInfo)) 227 { 228 case IMPL_BAD_INPUT_STOP: 229 case IMPL_BAD_INPUT_CONTINUE: 230 nShift = -1; 231 bCheckBom = sal_False; 232 break; 233 234 case IMPL_BAD_INPUT_NO_OUTPUT: 235 nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; 236 break; 237 } 238 } 239 240 if (pContext != NULL) 241 { 242 ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32; 243 ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift; 244 ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom; 245 } 246 if (pInfo != NULL) 247 *pInfo = nInfo; 248 if (pSrcCvtBytes != NULL) 249 *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf; 250 return pDestBufPtr - pDestBuf; 251 } 252 253 void * ImplCreateUnicodeToUtf8Context(void) 254 { 255 void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context)); 256 ImplResetUnicodeToUtf8Context(p); 257 return p; 258 } 259 260 void ImplResetUnicodeToUtf8Context(void * pContext) 261 { 262 if (pContext != NULL) 263 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF; 264 } 265 266 sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData, 267 void * pContext, sal_Unicode const * pSrcBuf, 268 sal_Size nSrcChars, sal_Char * pDestBuf, 269 sal_Size nDestBytes, sal_uInt32 nFlags, 270 sal_uInt32 * pInfo, sal_Size* pSrcCvtChars) 271 { 272 int bJavaUtf8 = pData != NULL; 273 sal_Unicode nHighSurrogate = 0xFFFF; 274 sal_uInt32 nInfo = 0; 275 sal_Unicode const * pSrcBufPtr = pSrcBuf; 276 sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars; 277 sal_Char * pDestBufPtr = pDestBuf; 278 sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes; 279 280 if (pContext != NULL) 281 nHighSurrogate 282 = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate; 283 284 if (nHighSurrogate == 0xFFFF) 285 { 286 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0 287 && !bJavaUtf8) 288 { 289 if (pDestBufEnd - pDestBufPtr >= 3) 290 { 291 /* Write BOM (U+FEFF) as UTF-8: */ 292 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xEF; 293 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBB; 294 *pDestBufPtr++ = (sal_Char) (unsigned char) 0xBF; 295 } 296 else 297 { 298 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 299 goto done; 300 } 301 } 302 nHighSurrogate = 0; 303 } 304 305 while (pSrcBufPtr < pSrcBufEnd) 306 { 307 sal_uInt32 nChar = *pSrcBufPtr++; 308 if (nHighSurrogate == 0) 309 { 310 if (ImplIsHighSurrogate(nChar) && !bJavaUtf8) 311 { 312 nHighSurrogate = (sal_Unicode) nChar; 313 continue; 314 } 315 } 316 else if (ImplIsLowSurrogate(nChar) && !bJavaUtf8) 317 nChar = ImplCombineSurrogates(nHighSurrogate, nChar); 318 else 319 goto bad_input; 320 321 if ((ImplIsLowSurrogate(nChar) && !bJavaUtf8) 322 || ImplIsNoncharacter(nChar)) 323 goto bad_input; 324 325 if (nChar <= 0x7F && (!bJavaUtf8 || nChar != 0)) 326 if (pDestBufPtr != pDestBufEnd) 327 *pDestBufPtr++ = (sal_Char) nChar; 328 else 329 goto no_output; 330 else if (nChar <= 0x7FF) 331 if (pDestBufEnd - pDestBufPtr >= 2) 332 { 333 *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6)); 334 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); 335 } 336 else 337 goto no_output; 338 else if (nChar <= 0xFFFF) 339 if (pDestBufEnd - pDestBufPtr >= 3) 340 { 341 *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12)); 342 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); 343 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); 344 } 345 else 346 goto no_output; 347 else if (pDestBufEnd - pDestBufPtr >= 4) 348 { 349 *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18)); 350 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F)); 351 *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); 352 *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); 353 } 354 else 355 goto no_output; 356 nHighSurrogate = 0; 357 continue; 358 359 bad_input: 360 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags, 361 &pDestBufPtr, 362 pDestBufEnd, &nInfo, 363 NULL, 0, NULL)) 364 { 365 case IMPL_BAD_INPUT_STOP: 366 nHighSurrogate = 0; 367 break; 368 369 case IMPL_BAD_INPUT_CONTINUE: 370 nHighSurrogate = 0; 371 continue; 372 373 case IMPL_BAD_INPUT_NO_OUTPUT: 374 goto no_output; 375 } 376 break; 377 378 no_output: 379 --pSrcBufPtr; 380 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 381 break; 382 } 383 384 if (nHighSurrogate != 0 385 && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR 386 | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) 387 == 0) 388 { 389 if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) 390 nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; 391 else 392 switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, 393 nFlags, 394 &pDestBufPtr, 395 pDestBufEnd, 396 &nInfo, NULL, 0, 397 NULL)) 398 { 399 case IMPL_BAD_INPUT_STOP: 400 case IMPL_BAD_INPUT_CONTINUE: 401 nHighSurrogate = 0; 402 break; 403 404 case IMPL_BAD_INPUT_NO_OUTPUT: 405 nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; 406 break; 407 } 408 } 409 410 done: 411 if (pContext != NULL) 412 ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate 413 = nHighSurrogate; 414 if (pInfo != NULL) 415 *pInfo = nInfo; 416 if (pSrcCvtChars != NULL) 417 *pSrcCvtChars = pSrcBufPtr - pSrcBuf; 418 return pDestBufPtr - pDestBuf; 419 } 420