xref: /aoo41x/main/sax/source/expatwrap/xml2utf.cxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 #include <string.h>
28 
29 #include <sal/types.h>
30 
31 #include <rtl/textenc.h>
32 #include <rtl/tencinfo.h>
33 
34 
35 #include <com/sun/star/io/XInputStream.hpp>
36 
37 using namespace rtl;
38 using namespace ::com::sun::star::uno;
39 using namespace ::com::sun::star::io;
40 
41 #include "xml2utf.hxx"
42 
43 namespace sax_expatwrap {
44 
45 sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
46 	throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
47 {
48 
49 	Sequence<sal_Int8> seqIn;
50 
51 	if( ! m_in.is() ) {
52 		throw NotConnectedException();
53 	}
54 	if( ! m_bStarted ) {
55 		nMaxToRead = Max( 512 , nMaxToRead );  	// it should be possible to find the encoding attribute
56 						     					// within the first 512 bytes == 128 chars in UCS-4
57 	}
58 
59 	sal_Int32 nRead;
60 	Sequence< sal_Int8 > seqStart;
61 	while( sal_True )
62 	{
63 		nRead = m_in->readSomeBytes( seq , nMaxToRead );
64 
65 		if( nRead + seqStart.getLength())
66 		{
67 			// if nRead is 0, the file is already eof.
68 			if( ! m_bStarted && nRead )
69 			{
70 				// ensure that enough data is available to parse encoding
71 				if( seqStart.getLength() )
72 				{
73 				  // prefix with what we had so far.
74 				  sal_Int32 nLength = seq.getLength();
75 				  seq.realloc( seqStart.getLength() + nLength );
76 
77 				  memmove (seq.getArray() + seqStart.getLength(),
78 					   seq.getConstArray(),
79 					   nLength);
80 				  memcpy  (seq.getArray(),
81 					   seqStart.getConstArray(),
82 					   seqStart.getLength());
83 				}
84 
85 				// autodetection with the first bytes
86 				if( ! isEncodingRecognizable( seq ) )
87 				{
88 				  // remember what we have so far.
89 				  seqStart = seq;
90 
91 				  // read more !
92 				  continue;
93 				}
94 				if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
95 					// initialize decoding
96 					initializeDecoding();
97 				}
98 				nRead = seq.getLength();
99 				seqStart = Sequence < sal_Int8 > ();
100 			}
101 
102 			// do the encoding
103 			if( m_pText2Unicode && m_pUnicode2Text &&
104 				m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
105 
106 				Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
107 				seq = m_pUnicode2Text->convert(	seqUnicode.getConstArray(),	seqUnicode.getLength() );
108 			}
109 
110 			if( ! m_bStarted )
111 			{
112 				// it must now be ensured, that no encoding attribute exist anymore
113 				// ( otherwise the expat-Parser will crash )
114 				// This must be done after decoding !
115 				// ( e.g. Files decoded in ucs-4 cannot be read properly )
116 				m_bStarted = sal_True;
117 				removeEncoding( seq );
118 			}
119 			nRead = seq.getLength();
120 		}
121 
122 		break;
123 	}
124 	return nRead;
125 }
126 
127 
128 XMLFile2UTFConverter::~XMLFile2UTFConverter()
129 {
130 	if( m_pText2Unicode )
131 		delete m_pText2Unicode;
132 	if( m_pUnicode2Text )
133 		delete m_pUnicode2Text;
134 }
135 
136 
137 void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
138 {
139 	const sal_Int8 *pSource = seq.getArray();
140 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
141 	{
142 
143 		// scan for encoding
144 		OString str( (sal_Char * ) pSource , seq.getLength() );
145 
146 		// cut sequence to first line break
147 		// find first line break;
148 		int nMax = str.indexOf( 10 );
149 		if( nMax >= 0 )
150 		{
151 			str = str.copy( 0 , nMax );
152 		}
153 
154 		int nFound = str.indexOf( " encoding" );
155 		if( nFound >= 0 ) {
156 			int nStop;
157 			int nStart = str.indexOf( "\"" , nFound );
158 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
159 			{
160 				nStart = str.indexOf( "'" , nFound );
161 				nStop  = str.indexOf( "'" , nStart +1 );
162 			}
163 			else
164 			{
165 				nStop  = str.indexOf( "\"" , nStart +1);
166 			}
167 
168 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
169 			{
170 				// remove encoding tag from file
171 				memmove(        &( seq.getArray()[nFound] ) ,
172 								&( seq.getArray()[nStop+1]) ,
173 								seq.getLength() - nStop -1);
174 				seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
175 //				str = String( (char * ) seq.getArray() , seq.getLen() );
176 			}
177 		}
178 	}
179 }
180 
181 // Checks, if enough data has been accumulated to recognize the encoding
182 sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
183 {
184 	const sal_Int8 *pSource = seq.getConstArray();
185 	sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
186 
187 	if( seq.getLength() < 8 ) {
188 		// no recognition possible, when less than 8 bytes are available
189 		return sal_False;
190 	}
191 
192 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
193 		// scan if the <?xml tag finishes within this buffer
194 		bCheckIfFirstClosingBracketExsists = sal_True;
195 	}
196 	else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
197 			 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
198 	{
199 		// check for utf-16
200 		bCheckIfFirstClosingBracketExsists = sal_True;
201 	}
202 	else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
203 		     ( '?' == pSource[5] || '?' == pSource[7] ) )
204 	{
205 		// check for
206 		bCheckIfFirstClosingBracketExsists = sal_True;
207 	}
208 
209 	if( bCheckIfFirstClosingBracketExsists )
210 	{
211 		for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
212 		{
213 			// whole <?xml tag is valid
214 			if( '>' == pSource[ i ] )
215 			{
216 				return sal_True;
217 			}
218 		}
219 		return sal_False;
220 	}
221 
222 	// No <? tag in front, no need for a bigger buffer
223 	return sal_True;
224 }
225 
226 sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
227 {
228 	const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
229 	sal_Bool bReturn = sal_True;
230 
231 	if( seq.getLength() < 4 ) {
232 		// no recognition possible, when less than 4 bytes are available
233 		return sal_False;
234 	}
235 
236 	// first level : detect possible file formats
237 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
238 
239 		// scan for encoding
240 		OString str( (const sal_Char *) pSource , seq.getLength() );
241 
242 		// cut sequence to first line break
243 		//find first line break;
244 		int nMax = str.indexOf( 10 );
245 		if( nMax >= 0 )
246 		{
247 			str = str.copy( 0 , nMax );
248 		}
249 
250 		int nFound = str.indexOf( " encoding" );
251 		if( nFound < str.getLength() ) {
252 			int nStop;
253 			int nStart = str.indexOf( "\"" , nFound );
254 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
255 			{
256 				nStart = str.indexOf( "'" , nFound );
257 				nStop  = str.indexOf( "'" , nStart +1 );
258 			}
259 			else
260 			{
261 				nStop  = str.indexOf( "\"" , nStart +1);
262 			}
263 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
264 			{
265 				// encoding found finally
266 				m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
267 			}
268 		}
269 	}
270 	else if( 0xFE == pSource[0] &&
271 	         0xFF == pSource[1] ) {
272 		// UTF-16 big endian
273 		// conversion is done so that encoding information can be easily extracted
274 		m_sEncoding = "utf-16";
275 	}
276 	else if( 0xFF == pSource[0] &&
277 	         0xFE == pSource[1] ) {
278 		// UTF-16 little endian
279 		// conversion is done so that encoding information can be easily extracted
280 		m_sEncoding = "utf-16";
281 	}
282 	else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
283 		// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
284 		// The byte order mark is simply added
285 
286 		// simply add the byte order mark !
287 		seq.realloc( seq.getLength() + 2 );
288 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
289 		((sal_uInt8*)seq.getArray())[0] = 0xFE;
290 		((sal_uInt8*)seq.getArray())[1] = 0xFF;
291 
292 		m_sEncoding = "utf-16";
293 	}
294 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
295 		// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
296 		// The byte order mark is simply added
297 
298 		seq.realloc( seq.getLength() + 2 );
299 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
300 		((sal_uInt8*)seq.getArray())[0] = 0xFF;
301 		((sal_uInt8*)seq.getArray())[1] = 0xFE;
302 
303 		m_sEncoding = "utf-16";
304 	}
305     else if( 0xEF == pSource[0] &&
306              0xBB == pSource[1] &&
307              0xBF == pSource[2] )
308     {
309         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
310         // The BOM is removed.
311         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
312         seq.realloc( seq.getLength() - 3 );
313         m_sEncoding = "utf-8";
314     }
315 	else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
316 		// UCS-4 big endian
317 		m_sEncoding = "ucs-4";
318 	}
319 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
320 		// UCS-4 little endian
321 		m_sEncoding = "ucs-4";
322 	}
323 	else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
324 	         0xa7 == static_cast<unsigned char> (pSource[2]) &&
325 	         0x94 == static_cast<unsigned char> (pSource[3]) ) {
326 		// EBCDIC
327 		bReturn = sal_False;   // must be extended
328 	}
329 	else {
330 		// other
331 		// UTF8 is directly recognized by the parser.
332 		bReturn = sal_False;
333 	}
334 
335 	return bReturn;
336 }
337 
338 void XMLFile2UTFConverter::initializeDecoding()
339 {
340 
341 	if( m_sEncoding.getLength() )
342 	{
343 		rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
344 		if( encoding != RTL_TEXTENCODING_UTF8 )
345 		{
346 			m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
347 			m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
348 		}
349 	}
350 }
351 
352 
353 //----------------------------------------------
354 //
355 // Text2UnicodeConverter
356 //
357 //----------------------------------------------
358 Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
359 {
360 	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
361 	if( RTL_TEXTENCODING_DONTKNOW == encoding )
362 	{
363 		m_bCanContinue = sal_False;
364 		m_bInitialized = sal_False;
365 	}
366 	else
367 	{
368 		init( encoding );
369 	}
370 }
371 
372 Text2UnicodeConverter::~Text2UnicodeConverter()
373 {
374 	if( m_bInitialized )
375 	{
376 		rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
377 		rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
378 	}
379 }
380 
381 void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
382 {
383 	m_bCanContinue = sal_True;
384 	m_bInitialized = sal_True;
385 
386 	m_convText2Unicode 	= rtl_createTextToUnicodeConverter(encoding);
387 	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
388 	m_rtlEncoding = encoding;
389 }
390 
391 
392 Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
393 {
394 	sal_uInt32 uiInfo;
395 	sal_Size nSrcCvtBytes 	= 0;
396 	sal_Size nTargetCount 	= 0;
397 	sal_Size nSourceCount   = 0;
398 
399 	// the whole source size
400 	sal_Int32 	nSourceSize = seqText.getLength() + m_seqSource.getLength();
401 	Sequence<sal_Unicode> 	seqUnicode ( nSourceSize );
402 
403 	const sal_Int8 *pbSource = seqText.getConstArray();
404 	sal_Int8 *pbTempMem = 0;
405 
406 	if( m_seqSource.getLength() ) {
407 		// put old rest and new byte sequence into one array
408 		pbTempMem = new sal_Int8[ nSourceSize ];
409 		memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
410 		memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
411 		pbSource = pbTempMem;
412 
413 		// set to zero again
414 		m_seqSource = Sequence< sal_Int8 >();
415 	}
416 
417 	while( sal_True ) {
418 
419 		/* All invalid characters are transformed to the unicode undefined char */
420 		nTargetCount += 	rtl_convertTextToUnicode(
421 									m_convText2Unicode,
422 									m_contextText2Unicode,
423 									( const sal_Char * ) &( pbSource[nSourceCount] ),
424 									nSourceSize - nSourceCount ,
425 									&( seqUnicode.getArray()[ nTargetCount ] ),
426 									seqUnicode.getLength() - nTargetCount,
427 									RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
428 									RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
429 									RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
430 									&uiInfo,
431 									&nSrcCvtBytes );
432 		nSourceCount += nSrcCvtBytes;
433 
434 		if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
435 			// save necessary bytes for next conversion
436 			seqUnicode.realloc( seqUnicode.getLength() * 2 );
437 			continue;
438 		}
439 		break;
440 	}
441 	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
442 		m_seqSource.realloc( nSourceSize - nSourceCount );
443 		memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
444 	}
445 
446 
447 	if( pbTempMem ) {
448 		delete [] pbTempMem;
449 	}
450 
451 	// set to correct unicode size
452 	seqUnicode.realloc( nTargetCount );
453 
454 	return seqUnicode;
455 }
456 
457 
458 
459 //----------------------------------------------
460 //
461 // Unicode2TextConverter
462 //
463 //----------------------------------------------
// Constructs a converter from unicode to the given target encoding;
// all set-up work is delegated to init().
Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
{
	init( encoding );
}
468 
469 
470 Unicode2TextConverter::~Unicode2TextConverter()
471 {
472 	if( m_bInitialized ) {
473 		rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
474 		rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
475 	}
476 }
477 
478 
479 Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
480 {
481 	sal_Unicode *puTempMem = 0;
482 
483 	if( m_seqSource.getLength() ) {
484 		// For surrogates !
485 		// put old rest and new byte sequence into one array
486 		// In general when surrogates are used, they should be rarely
487 		// cut off between two convert()-calls. So this code is used
488 		// rarely and the extra copy is acceptable.
489 		puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
490 		memcpy( puTempMem ,
491 				m_seqSource.getConstArray() ,
492 				m_seqSource.getLength() * sizeof( sal_Unicode ) );
493 		memcpy(
494 			&(puTempMem[ m_seqSource.getLength() ]) ,
495 			puSource ,
496 			nSourceSize*sizeof( sal_Unicode ) );
497 		puSource = puTempMem;
498 		nSourceSize += m_seqSource.getLength();
499 
500 		m_seqSource = Sequence< sal_Unicode > ();
501 	}
502 
503 
504 	sal_Size nTargetCount = 0;
505 	sal_Size nSourceCount = 0;
506 
507 	sal_uInt32 uiInfo;
508 	sal_Size nSrcCvtChars;
509 
510 	// take nSourceSize * 3 as preference
511 	// this is an upper boundary for converting to utf8,
512 	// which most often used as the target.
513 	sal_Int32 nSeqSize =  nSourceSize * 3;
514 
515 	Sequence<sal_Int8> 	seqText( nSeqSize );
516 	sal_Char *pTarget = (sal_Char *) seqText.getArray();
517 	while( sal_True ) {
518 
519 		nTargetCount += rtl_convertUnicodeToText(
520 									m_convUnicode2Text,
521 									m_contextUnicode2Text,
522 									&( puSource[nSourceCount] ),
523 									nSourceSize - nSourceCount ,
524 									&( pTarget[nTargetCount] ),
525 									nSeqSize - nTargetCount,
526 									RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
527 									RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
528 									&uiInfo,
529 									&nSrcCvtChars);
530 		nSourceCount += nSrcCvtChars;
531 
532 		if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
533 			nSeqSize = nSeqSize *2;
534 			seqText.realloc( nSeqSize );  // double array size
535 			pTarget = ( sal_Char * ) seqText.getArray();
536 			continue;
537 		}
538 		break;
539 	}
540 
541 	// for surrogates
542 	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
543 		m_seqSource.realloc( nSourceSize - nSourceCount );
544 		memcpy( m_seqSource.getArray() ,
545 				&(puSource[nSourceCount]),
546 				(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
547 	}
548 
549 	if( puTempMem ) {
550 		delete [] puTempMem;
551 	}
552 
553 	// reduce the size of the buffer (fast, no copy necessary)
554 	seqText.realloc( nTargetCount );
555 
556 	return seqText;
557 }
558 
559 void Unicode2TextConverter::init( rtl_TextEncoding encoding )
560 {
561 	m_bCanContinue = sal_True;
562 	m_bInitialized = sal_True;
563 
564 	m_convUnicode2Text 	= rtl_createUnicodeToTextConverter( encoding );
565 	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
566 	m_rtlEncoding = encoding;
567 };
568 
569 
570 }
571