1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 #include <string.h> 28 29 #include <sal/types.h> 30 31 #include <rtl/textenc.h> 32 #include <rtl/tencinfo.h> 33 34 35 #include <com/sun/star/io/XInputStream.hpp> 36 37 using namespace rtl; 38 using namespace ::com::sun::star::uno; 39 using namespace ::com::sun::star::io; 40 41 #include "xml2utf.hxx" 42 43 namespace sax_expatwrap { 44 45 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead ) 46 throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException ) 47 { 48 49 Sequence<sal_Int8> seqIn; 50 51 if( ! m_in.is() ) { 52 throw NotConnectedException(); 53 } 54 if( ! m_bStarted ) { 55 nMaxToRead = Max( 512 , nMaxToRead ); // it should be possible to find the encoding attribute 56 // within the first 512 bytes == 128 chars in UCS-4 57 } 58 59 sal_Int32 nRead; 60 Sequence< sal_Int8 > seqStart; 61 while( sal_True ) 62 { 63 nRead = m_in->readSomeBytes( seq , nMaxToRead ); 64 65 if( nRead + seqStart.getLength()) 66 { 67 // if nRead is 0, the file is already eof. 68 if( ! m_bStarted && nRead ) 69 { 70 // ensure that enough data is available to parse encoding 71 if( seqStart.getLength() ) 72 { 73 // prefix with what we had so far. 74 sal_Int32 nLength = seq.getLength(); 75 seq.realloc( seqStart.getLength() + nLength ); 76 77 memmove (seq.getArray() + seqStart.getLength(), 78 seq.getConstArray(), 79 nLength); 80 memcpy (seq.getArray(), 81 seqStart.getConstArray(), 82 seqStart.getLength()); 83 } 84 85 // autodetection with the first bytes 86 if( ! isEncodingRecognizable( seq ) ) 87 { 88 // remember what we have so far. 89 seqStart = seq; 90 91 // read more ! 92 continue; 93 } 94 if( scanForEncoding( seq ) || m_sEncoding.getLength() ) { 95 // initialize decoding 96 initializeDecoding(); 97 } 98 nRead = seq.getLength(); 99 seqStart = Sequence < sal_Int8 > (); 100 } 101 102 // do the encoding 103 if( m_pText2Unicode && m_pUnicode2Text && 104 m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) { 105 106 Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq ); 107 seq = m_pUnicode2Text->convert( seqUnicode.getConstArray(), seqUnicode.getLength() ); 108 } 109 110 if( ! m_bStarted ) 111 { 112 // it must now be ensured, that no encoding attribute exist anymore 113 // ( otherwise the expat-Parser will crash ) 114 // This must be done after decoding ! 115 // ( e.g. Files decoded in ucs-4 cannot be read properly ) 116 m_bStarted = sal_True; 117 removeEncoding( seq ); 118 } 119 nRead = seq.getLength(); 120 } 121 122 break; 123 } 124 return nRead; 125 } 126 127 128 XMLFile2UTFConverter::~XMLFile2UTFConverter() 129 { 130 if( m_pText2Unicode ) 131 delete m_pText2Unicode; 132 if( m_pUnicode2Text ) 133 delete m_pUnicode2Text; 134 } 135 136 137 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq ) 138 { 139 const sal_Int8 *pSource = seq.getArray(); 140 if( ! strncmp( (const char * ) pSource , "<?xml" , 4) ) 141 { 142 143 // scan for encoding 144 OString str( (sal_Char * ) pSource , seq.getLength() ); 145 146 // cut sequence to first line break 147 // find first line break; 148 int nMax = str.indexOf( 10 ); 149 if( nMax >= 0 ) 150 { 151 str = str.copy( 0 , nMax ); 152 } 153 154 int nFound = str.indexOf( " encoding" ); 155 if( nFound >= 0 ) { 156 int nStop; 157 int nStart = str.indexOf( "\"" , nFound ); 158 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) 159 { 160 nStart = str.indexOf( "'" , nFound ); 161 nStop = str.indexOf( "'" , nStart +1 ); 162 } 163 else 164 { 165 nStop = str.indexOf( "\"" , nStart +1); 166 } 167 168 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) 169 { 170 // remove encoding tag from file 171 memmove( &( seq.getArray()[nFound] ) , 172 &( seq.getArray()[nStop+1]) , 173 seq.getLength() - nStop -1); 174 seq.realloc( seq.getLength() - ( nStop+1 - nFound ) ); 175 // str = String( (char * ) seq.getArray() , seq.getLen() ); 176 } 177 } 178 } 179 } 180 181 // Checks, if enough data has been accumulated to recognize the encoding 182 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq) 183 { 184 const sal_Int8 *pSource = seq.getConstArray(); 185 sal_Bool bCheckIfFirstClosingBracketExsists = sal_False; 186 187 if( seq.getLength() < 8 ) { 188 // no recognition possible, when less than 8 bytes are available 189 return sal_False; 190 } 191 192 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { 193 // scan if the <?xml tag finishes within this buffer 194 bCheckIfFirstClosingBracketExsists = sal_True; 195 } 196 else if( ('<' == pSource[0] || '<' == pSource[2] ) && 197 ( ('?' == pSource[4] || '?' == pSource[6] ) ) ) 198 { 199 // check for utf-16 200 bCheckIfFirstClosingBracketExsists = sal_True; 201 } 202 else if( ( '<' == pSource[1] || '<' == pSource[3] ) && 203 ( '?' == pSource[5] || '?' == pSource[7] ) ) 204 { 205 // check for 206 bCheckIfFirstClosingBracketExsists = sal_True; 207 } 208 209 if( bCheckIfFirstClosingBracketExsists ) 210 { 211 for( sal_Int32 i = 0; i < seq.getLength() ; i ++ ) 212 { 213 // whole <?xml tag is valid 214 if( '>' == pSource[ i ] ) 215 { 216 return sal_True; 217 } 218 } 219 return sal_False; 220 } 221 222 // No <? tag in front, no need for a bigger buffer 223 return sal_True; 224 } 225 226 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq ) 227 { 228 const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() ); 229 sal_Bool bReturn = sal_True; 230 231 if( seq.getLength() < 4 ) { 232 // no recognition possible, when less than 4 bytes are available 233 return sal_False; 234 } 235 236 // first level : detect possible file formats 237 if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) { 238 239 // scan for encoding 240 OString str( (const sal_Char *) pSource , seq.getLength() ); 241 242 // cut sequence to first line break 243 //find first line break; 244 int nMax = str.indexOf( 10 ); 245 if( nMax >= 0 ) 246 { 247 str = str.copy( 0 , nMax ); 248 } 249 250 int nFound = str.indexOf( " encoding" ); 251 if( nFound < str.getLength() ) { 252 int nStop; 253 int nStart = str.indexOf( "\"" , nFound ); 254 if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart ) 255 { 256 nStart = str.indexOf( "'" , nFound ); 257 nStop = str.indexOf( "'" , nStart +1 ); 258 } 259 else 260 { 261 nStop = str.indexOf( "\"" , nStart +1); 262 } 263 if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop ) 264 { 265 // encoding found finally 266 m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 ); 267 } 268 } 269 } 270 else if( 0xFE == pSource[0] && 271 0xFF == pSource[1] ) { 272 // UTF-16 big endian 273 // conversion is done so that encoding information can be easily extracted 274 m_sEncoding = "utf-16"; 275 } 276 else if( 0xFF == pSource[0] && 277 0xFE == pSource[1] ) { 278 // UTF-16 little endian 279 // conversion is done so that encoding information can be easily extracted 280 m_sEncoding = "utf-16"; 281 } 282 else if( 0x00 == pSource[0] && 0x3c == pSource[1] && 0x00 == pSource[2] && 0x3f == pSource[3] ) { 283 // UTF-16 big endian without byte order mark (this is (strictly speaking) an error.) 284 // The byte order mark is simply added 285 286 // simply add the byte order mark ! 287 seq.realloc( seq.getLength() + 2 ); 288 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); 289 ((sal_uInt8*)seq.getArray())[0] = 0xFE; 290 ((sal_uInt8*)seq.getArray())[1] = 0xFF; 291 292 m_sEncoding = "utf-16"; 293 } 294 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x3f == pSource[2] && 0x00 == pSource[3] ) { 295 // UTF-16 little endian without byte order mark (this is (strictly speaking) an error.) 296 // The byte order mark is simply added 297 298 seq.realloc( seq.getLength() + 2 ); 299 memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 ); 300 ((sal_uInt8*)seq.getArray())[0] = 0xFF; 301 ((sal_uInt8*)seq.getArray())[1] = 0xFE; 302 303 m_sEncoding = "utf-16"; 304 } 305 else if( 0xEF == pSource[0] && 306 0xBB == pSource[1] && 307 0xBF == pSource[2] ) 308 { 309 // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order 310 // The BOM is removed. 311 memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 ); 312 seq.realloc( seq.getLength() - 3 ); 313 m_sEncoding = "utf-8"; 314 } 315 else if( 0x00 == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x3c == pSource[3] ) { 316 // UCS-4 big endian 317 m_sEncoding = "ucs-4"; 318 } 319 else if( 0x3c == pSource[0] && 0x00 == pSource[1] && 0x00 == pSource[2] && 0x00 == pSource[3] ) { 320 // UCS-4 little endian 321 m_sEncoding = "ucs-4"; 322 } 323 else if( 0x4c == pSource[0] && 0x6f == pSource[1] && 324 0xa7 == static_cast<unsigned char> (pSource[2]) && 325 0x94 == static_cast<unsigned char> (pSource[3]) ) { 326 // EBCDIC 327 bReturn = sal_False; // must be extended 328 } 329 else { 330 // other 331 // UTF8 is directly recognized by the parser. 332 bReturn = sal_False; 333 } 334 335 return bReturn; 336 } 337 338 void XMLFile2UTFConverter::initializeDecoding() 339 { 340 341 if( m_sEncoding.getLength() ) 342 { 343 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() ); 344 if( encoding != RTL_TEXTENCODING_UTF8 ) 345 { 346 m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding ); 347 m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 ); 348 } 349 } 350 } 351 352 353 //---------------------------------------------- 354 // 355 // Text2UnicodeConverter 356 // 357 //---------------------------------------------- 358 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding ) 359 { 360 rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() ); 361 if( RTL_TEXTENCODING_DONTKNOW == encoding ) 362 { 363 m_bCanContinue = sal_False; 364 m_bInitialized = sal_False; 365 } 366 else 367 { 368 init( encoding ); 369 } 370 } 371 372 Text2UnicodeConverter::~Text2UnicodeConverter() 373 { 374 if( m_bInitialized ) 375 { 376 rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode ); 377 rtl_destroyUnicodeToTextConverter( m_convText2Unicode ); 378 } 379 } 380 381 void Text2UnicodeConverter::init( rtl_TextEncoding encoding ) 382 { 383 m_bCanContinue = sal_True; 384 m_bInitialized = sal_True; 385 386 m_convText2Unicode = rtl_createTextToUnicodeConverter(encoding); 387 m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode ); 388 m_rtlEncoding = encoding; 389 } 390 391 392 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText ) 393 { 394 sal_uInt32 uiInfo; 395 sal_Size nSrcCvtBytes = 0; 396 sal_Size nTargetCount = 0; 397 sal_Size nSourceCount = 0; 398 399 // the whole source size 400 sal_Int32 nSourceSize = seqText.getLength() + m_seqSource.getLength(); 401 Sequence<sal_Unicode> seqUnicode ( nSourceSize ); 402 403 const sal_Int8 *pbSource = seqText.getConstArray(); 404 sal_Int8 *pbTempMem = 0; 405 406 if( m_seqSource.getLength() ) { 407 // put old rest and new byte sequence into one array 408 pbTempMem = new sal_Int8[ nSourceSize ]; 409 memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() ); 410 memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() ); 411 pbSource = pbTempMem; 412 413 // set to zero again 414 m_seqSource = Sequence< sal_Int8 >(); 415 } 416 417 while( sal_True ) { 418 419 /* All invalid characters are transformed to the unicode undefined char */ 420 nTargetCount += rtl_convertTextToUnicode( 421 m_convText2Unicode, 422 m_contextText2Unicode, 423 ( const sal_Char * ) &( pbSource[nSourceCount] ), 424 nSourceSize - nSourceCount , 425 &( seqUnicode.getArray()[ nTargetCount ] ), 426 seqUnicode.getLength() - nTargetCount, 427 RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT | 428 RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT | 429 RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT, 430 &uiInfo, 431 &nSrcCvtBytes ); 432 nSourceCount += nSrcCvtBytes; 433 434 if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) { 435 // save necessary bytes for next conversion 436 seqUnicode.realloc( seqUnicode.getLength() * 2 ); 437 continue; 438 } 439 break; 440 } 441 if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) { 442 m_seqSource.realloc( nSourceSize - nSourceCount ); 443 memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount ); 444 } 445 446 447 if( pbTempMem ) { 448 delete [] pbTempMem; 449 } 450 451 // set to correct unicode size 452 seqUnicode.realloc( nTargetCount ); 453 454 return seqUnicode; 455 } 456 457 458 459 //---------------------------------------------- 460 // 461 // Unicode2TextConverter 462 // 463 //---------------------------------------------- 464 Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding ) 465 { 466 init( encoding ); 467 } 468 469 470 Unicode2TextConverter::~Unicode2TextConverter() 471 { 472 if( m_bInitialized ) { 473 rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text ); 474 rtl_destroyUnicodeToTextConverter( m_convUnicode2Text ); 475 } 476 } 477 478 479 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize) 480 { 481 sal_Unicode *puTempMem = 0; 482 483 if( m_seqSource.getLength() ) { 484 // For surrogates ! 485 // put old rest and new byte sequence into one array 486 // In general when surrogates are used, they should be rarely 487 // cut off between two convert()-calls. So this code is used 488 // rarely and the extra copy is acceptable. 489 puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()]; 490 memcpy( puTempMem , 491 m_seqSource.getConstArray() , 492 m_seqSource.getLength() * sizeof( sal_Unicode ) ); 493 memcpy( 494 &(puTempMem[ m_seqSource.getLength() ]) , 495 puSource , 496 nSourceSize*sizeof( sal_Unicode ) ); 497 puSource = puTempMem; 498 nSourceSize += m_seqSource.getLength(); 499 500 m_seqSource = Sequence< sal_Unicode > (); 501 } 502 503 504 sal_Size nTargetCount = 0; 505 sal_Size nSourceCount = 0; 506 507 sal_uInt32 uiInfo; 508 sal_Size nSrcCvtChars; 509 510 // take nSourceSize * 3 as preference 511 // this is an upper boundary for converting to utf8, 512 // which most often used as the target. 513 sal_Int32 nSeqSize = nSourceSize * 3; 514 515 Sequence<sal_Int8> seqText( nSeqSize ); 516 sal_Char *pTarget = (sal_Char *) seqText.getArray(); 517 while( sal_True ) { 518 519 nTargetCount += rtl_convertUnicodeToText( 520 m_convUnicode2Text, 521 m_contextUnicode2Text, 522 &( puSource[nSourceCount] ), 523 nSourceSize - nSourceCount , 524 &( pTarget[nTargetCount] ), 525 nSeqSize - nTargetCount, 526 RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT | 527 RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT , 528 &uiInfo, 529 &nSrcCvtChars); 530 nSourceCount += nSrcCvtChars; 531 532 if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) { 533 nSeqSize = nSeqSize *2; 534 seqText.realloc( nSeqSize ); // double array size 535 pTarget = ( sal_Char * ) seqText.getArray(); 536 continue; 537 } 538 break; 539 } 540 541 // for surrogates 542 if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) { 543 m_seqSource.realloc( nSourceSize - nSourceCount ); 544 memcpy( m_seqSource.getArray() , 545 &(puSource[nSourceCount]), 546 (nSourceSize - nSourceCount) * sizeof( sal_Unicode ) ); 547 } 548 549 if( puTempMem ) { 550 delete [] puTempMem; 551 } 552 553 // reduce the size of the buffer (fast, no copy necessary) 554 seqText.realloc( nTargetCount ); 555 556 return seqText; 557 } 558 559 void Unicode2TextConverter::init( rtl_TextEncoding encoding ) 560 { 561 m_bCanContinue = sal_True; 562 m_bInitialized = sal_True; 563 564 m_convUnicode2Text = rtl_createUnicodeToTextConverter( encoding ); 565 m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text ); 566 m_rtlEncoding = encoding; 567 }; 568 569 570 } 571