xref: /aoo41x/main/sax/source/expatwrap/xml2utf.cxx (revision cdf0e10c)
1*cdf0e10cSrcweir /*************************************************************************
2*cdf0e10cSrcweir  *
3*cdf0e10cSrcweir  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4*cdf0e10cSrcweir  *
5*cdf0e10cSrcweir  * Copyright 2000, 2010 Oracle and/or its affiliates.
6*cdf0e10cSrcweir  *
7*cdf0e10cSrcweir  * OpenOffice.org - a multi-platform office productivity suite
8*cdf0e10cSrcweir  *
9*cdf0e10cSrcweir  * This file is part of OpenOffice.org.
10*cdf0e10cSrcweir  *
11*cdf0e10cSrcweir  * OpenOffice.org is free software: you can redistribute it and/or modify
12*cdf0e10cSrcweir  * it under the terms of the GNU Lesser General Public License version 3
13*cdf0e10cSrcweir  * only, as published by the Free Software Foundation.
14*cdf0e10cSrcweir  *
15*cdf0e10cSrcweir  * OpenOffice.org is distributed in the hope that it will be useful,
16*cdf0e10cSrcweir  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17*cdf0e10cSrcweir  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18*cdf0e10cSrcweir  * GNU Lesser General Public License version 3 for more details
19*cdf0e10cSrcweir  * (a copy is included in the LICENSE file that accompanied this code).
20*cdf0e10cSrcweir  *
21*cdf0e10cSrcweir  * You should have received a copy of the GNU Lesser General Public License
22*cdf0e10cSrcweir  * version 3 along with OpenOffice.org.  If not, see
23*cdf0e10cSrcweir  * <http://www.openoffice.org/license.html>
24*cdf0e10cSrcweir  * for a copy of the LGPLv3 License.
25*cdf0e10cSrcweir  *
26*cdf0e10cSrcweir  ************************************************************************/
27*cdf0e10cSrcweir #include <string.h>
28*cdf0e10cSrcweir 
29*cdf0e10cSrcweir #include <sal/types.h>
30*cdf0e10cSrcweir 
31*cdf0e10cSrcweir #include <rtl/textenc.h>
32*cdf0e10cSrcweir #include <rtl/tencinfo.h>
33*cdf0e10cSrcweir 
34*cdf0e10cSrcweir 
35*cdf0e10cSrcweir #include <com/sun/star/io/XInputStream.hpp>
36*cdf0e10cSrcweir 
37*cdf0e10cSrcweir using namespace rtl;
38*cdf0e10cSrcweir using namespace ::com::sun::star::uno;
39*cdf0e10cSrcweir using namespace ::com::sun::star::io;
40*cdf0e10cSrcweir 
41*cdf0e10cSrcweir #include "xml2utf.hxx"
42*cdf0e10cSrcweir 
43*cdf0e10cSrcweir namespace sax_expatwrap {
44*cdf0e10cSrcweir 
45*cdf0e10cSrcweir sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
46*cdf0e10cSrcweir 	throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
47*cdf0e10cSrcweir {
48*cdf0e10cSrcweir 
49*cdf0e10cSrcweir 	Sequence<sal_Int8> seqIn;
50*cdf0e10cSrcweir 
51*cdf0e10cSrcweir 	if( ! m_in.is() ) {
52*cdf0e10cSrcweir 		throw NotConnectedException();
53*cdf0e10cSrcweir 	}
54*cdf0e10cSrcweir 	if( ! m_bStarted ) {
55*cdf0e10cSrcweir 		nMaxToRead = Max( 512 , nMaxToRead );  	// it should be possible to find the encoding attribute
56*cdf0e10cSrcweir 						     					// within the first 512 bytes == 128 chars in UCS-4
57*cdf0e10cSrcweir 	}
58*cdf0e10cSrcweir 
59*cdf0e10cSrcweir 	sal_Int32 nRead;
60*cdf0e10cSrcweir 	Sequence< sal_Int8 > seqStart;
61*cdf0e10cSrcweir 	while( sal_True )
62*cdf0e10cSrcweir 	{
63*cdf0e10cSrcweir 		nRead = m_in->readSomeBytes( seq , nMaxToRead );
64*cdf0e10cSrcweir 
65*cdf0e10cSrcweir 		if( nRead + seqStart.getLength())
66*cdf0e10cSrcweir 		{
67*cdf0e10cSrcweir 			// if nRead is 0, the file is already eof.
68*cdf0e10cSrcweir 			if( ! m_bStarted && nRead )
69*cdf0e10cSrcweir 			{
70*cdf0e10cSrcweir 				// ensure that enough data is available to parse encoding
71*cdf0e10cSrcweir 				if( seqStart.getLength() )
72*cdf0e10cSrcweir 				{
73*cdf0e10cSrcweir 				  // prefix with what we had so far.
74*cdf0e10cSrcweir 				  sal_Int32 nLength = seq.getLength();
75*cdf0e10cSrcweir 				  seq.realloc( seqStart.getLength() + nLength );
76*cdf0e10cSrcweir 
77*cdf0e10cSrcweir 				  memmove (seq.getArray() + seqStart.getLength(),
78*cdf0e10cSrcweir 					   seq.getConstArray(),
79*cdf0e10cSrcweir 					   nLength);
80*cdf0e10cSrcweir 				  memcpy  (seq.getArray(),
81*cdf0e10cSrcweir 					   seqStart.getConstArray(),
82*cdf0e10cSrcweir 					   seqStart.getLength());
83*cdf0e10cSrcweir 				}
84*cdf0e10cSrcweir 
85*cdf0e10cSrcweir 				// autodetection with the first bytes
86*cdf0e10cSrcweir 				if( ! isEncodingRecognizable( seq ) )
87*cdf0e10cSrcweir 				{
88*cdf0e10cSrcweir 				  // remember what we have so far.
89*cdf0e10cSrcweir 				  seqStart = seq;
90*cdf0e10cSrcweir 
91*cdf0e10cSrcweir 				  // read more !
92*cdf0e10cSrcweir 				  continue;
93*cdf0e10cSrcweir 				}
94*cdf0e10cSrcweir 				if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
95*cdf0e10cSrcweir 					// initialize decoding
96*cdf0e10cSrcweir 					initializeDecoding();
97*cdf0e10cSrcweir 				}
98*cdf0e10cSrcweir 				nRead = seq.getLength();
99*cdf0e10cSrcweir 				seqStart = Sequence < sal_Int8 > ();
100*cdf0e10cSrcweir 			}
101*cdf0e10cSrcweir 
102*cdf0e10cSrcweir 			// do the encoding
103*cdf0e10cSrcweir 			if( m_pText2Unicode && m_pUnicode2Text &&
104*cdf0e10cSrcweir 				m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
105*cdf0e10cSrcweir 
106*cdf0e10cSrcweir 				Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
107*cdf0e10cSrcweir 				seq = m_pUnicode2Text->convert(	seqUnicode.getConstArray(),	seqUnicode.getLength() );
108*cdf0e10cSrcweir 			}
109*cdf0e10cSrcweir 
110*cdf0e10cSrcweir 			if( ! m_bStarted )
111*cdf0e10cSrcweir 			{
112*cdf0e10cSrcweir 				// it must now be ensured, that no encoding attribute exist anymore
113*cdf0e10cSrcweir 				// ( otherwise the expat-Parser will crash )
114*cdf0e10cSrcweir 				// This must be done after decoding !
115*cdf0e10cSrcweir 				// ( e.g. Files decoded in ucs-4 cannot be read properly )
116*cdf0e10cSrcweir 				m_bStarted = sal_True;
117*cdf0e10cSrcweir 				removeEncoding( seq );
118*cdf0e10cSrcweir 			}
119*cdf0e10cSrcweir 			nRead = seq.getLength();
120*cdf0e10cSrcweir 		}
121*cdf0e10cSrcweir 
122*cdf0e10cSrcweir 		break;
123*cdf0e10cSrcweir 	}
124*cdf0e10cSrcweir 	return nRead;
125*cdf0e10cSrcweir }
126*cdf0e10cSrcweir 
127*cdf0e10cSrcweir 
128*cdf0e10cSrcweir XMLFile2UTFConverter::~XMLFile2UTFConverter()
129*cdf0e10cSrcweir {
130*cdf0e10cSrcweir 	if( m_pText2Unicode )
131*cdf0e10cSrcweir 		delete m_pText2Unicode;
132*cdf0e10cSrcweir 	if( m_pUnicode2Text )
133*cdf0e10cSrcweir 		delete m_pUnicode2Text;
134*cdf0e10cSrcweir }
135*cdf0e10cSrcweir 
136*cdf0e10cSrcweir 
137*cdf0e10cSrcweir void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
138*cdf0e10cSrcweir {
139*cdf0e10cSrcweir 	const sal_Int8 *pSource = seq.getArray();
140*cdf0e10cSrcweir 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
141*cdf0e10cSrcweir 	{
142*cdf0e10cSrcweir 
143*cdf0e10cSrcweir 		// scan for encoding
144*cdf0e10cSrcweir 		OString str( (sal_Char * ) pSource , seq.getLength() );
145*cdf0e10cSrcweir 
146*cdf0e10cSrcweir 		// cut sequence to first line break
147*cdf0e10cSrcweir 		// find first line break;
148*cdf0e10cSrcweir 		int nMax = str.indexOf( 10 );
149*cdf0e10cSrcweir 		if( nMax >= 0 )
150*cdf0e10cSrcweir 		{
151*cdf0e10cSrcweir 			str = str.copy( 0 , nMax );
152*cdf0e10cSrcweir 		}
153*cdf0e10cSrcweir 
154*cdf0e10cSrcweir 		int nFound = str.indexOf( " encoding" );
155*cdf0e10cSrcweir 		if( nFound >= 0 ) {
156*cdf0e10cSrcweir 			int nStop;
157*cdf0e10cSrcweir 			int nStart = str.indexOf( "\"" , nFound );
158*cdf0e10cSrcweir 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
159*cdf0e10cSrcweir 			{
160*cdf0e10cSrcweir 				nStart = str.indexOf( "'" , nFound );
161*cdf0e10cSrcweir 				nStop  = str.indexOf( "'" , nStart +1 );
162*cdf0e10cSrcweir 			}
163*cdf0e10cSrcweir 			else
164*cdf0e10cSrcweir 			{
165*cdf0e10cSrcweir 				nStop  = str.indexOf( "\"" , nStart +1);
166*cdf0e10cSrcweir 			}
167*cdf0e10cSrcweir 
168*cdf0e10cSrcweir 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
169*cdf0e10cSrcweir 			{
170*cdf0e10cSrcweir 				// remove encoding tag from file
171*cdf0e10cSrcweir 				memmove(        &( seq.getArray()[nFound] ) ,
172*cdf0e10cSrcweir 								&( seq.getArray()[nStop+1]) ,
173*cdf0e10cSrcweir 								seq.getLength() - nStop -1);
174*cdf0e10cSrcweir 				seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
175*cdf0e10cSrcweir //				str = String( (char * ) seq.getArray() , seq.getLen() );
176*cdf0e10cSrcweir 			}
177*cdf0e10cSrcweir 		}
178*cdf0e10cSrcweir 	}
179*cdf0e10cSrcweir }
180*cdf0e10cSrcweir 
181*cdf0e10cSrcweir // Checks, if enough data has been accumulated to recognize the encoding
182*cdf0e10cSrcweir sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
183*cdf0e10cSrcweir {
184*cdf0e10cSrcweir 	const sal_Int8 *pSource = seq.getConstArray();
185*cdf0e10cSrcweir 	sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
186*cdf0e10cSrcweir 
187*cdf0e10cSrcweir 	if( seq.getLength() < 8 ) {
188*cdf0e10cSrcweir 		// no recognition possible, when less than 8 bytes are available
189*cdf0e10cSrcweir 		return sal_False;
190*cdf0e10cSrcweir 	}
191*cdf0e10cSrcweir 
192*cdf0e10cSrcweir 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
193*cdf0e10cSrcweir 		// scan if the <?xml tag finishes within this buffer
194*cdf0e10cSrcweir 		bCheckIfFirstClosingBracketExsists = sal_True;
195*cdf0e10cSrcweir 	}
196*cdf0e10cSrcweir 	else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
197*cdf0e10cSrcweir 			 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
198*cdf0e10cSrcweir 	{
199*cdf0e10cSrcweir 		// check for utf-16
200*cdf0e10cSrcweir 		bCheckIfFirstClosingBracketExsists = sal_True;
201*cdf0e10cSrcweir 	}
202*cdf0e10cSrcweir 	else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
203*cdf0e10cSrcweir 		     ( '?' == pSource[5] || '?' == pSource[7] ) )
204*cdf0e10cSrcweir 	{
205*cdf0e10cSrcweir 		// check for
206*cdf0e10cSrcweir 		bCheckIfFirstClosingBracketExsists = sal_True;
207*cdf0e10cSrcweir 	}
208*cdf0e10cSrcweir 
209*cdf0e10cSrcweir 	if( bCheckIfFirstClosingBracketExsists )
210*cdf0e10cSrcweir 	{
211*cdf0e10cSrcweir 		for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
212*cdf0e10cSrcweir 		{
213*cdf0e10cSrcweir 			// whole <?xml tag is valid
214*cdf0e10cSrcweir 			if( '>' == pSource[ i ] )
215*cdf0e10cSrcweir 			{
216*cdf0e10cSrcweir 				return sal_True;
217*cdf0e10cSrcweir 			}
218*cdf0e10cSrcweir 		}
219*cdf0e10cSrcweir 		return sal_False;
220*cdf0e10cSrcweir 	}
221*cdf0e10cSrcweir 
222*cdf0e10cSrcweir 	// No <? tag in front, no need for a bigger buffer
223*cdf0e10cSrcweir 	return sal_True;
224*cdf0e10cSrcweir }
225*cdf0e10cSrcweir 
226*cdf0e10cSrcweir sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
227*cdf0e10cSrcweir {
228*cdf0e10cSrcweir 	const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
229*cdf0e10cSrcweir 	sal_Bool bReturn = sal_True;
230*cdf0e10cSrcweir 
231*cdf0e10cSrcweir 	if( seq.getLength() < 4 ) {
232*cdf0e10cSrcweir 		// no recognition possible, when less than 4 bytes are available
233*cdf0e10cSrcweir 		return sal_False;
234*cdf0e10cSrcweir 	}
235*cdf0e10cSrcweir 
236*cdf0e10cSrcweir 	// first level : detect possible file formats
237*cdf0e10cSrcweir 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
238*cdf0e10cSrcweir 
239*cdf0e10cSrcweir 		// scan for encoding
240*cdf0e10cSrcweir 		OString str( (const sal_Char *) pSource , seq.getLength() );
241*cdf0e10cSrcweir 
242*cdf0e10cSrcweir 		// cut sequence to first line break
243*cdf0e10cSrcweir 		//find first line break;
244*cdf0e10cSrcweir 		int nMax = str.indexOf( 10 );
245*cdf0e10cSrcweir 		if( nMax >= 0 )
246*cdf0e10cSrcweir 		{
247*cdf0e10cSrcweir 			str = str.copy( 0 , nMax );
248*cdf0e10cSrcweir 		}
249*cdf0e10cSrcweir 
250*cdf0e10cSrcweir 		int nFound = str.indexOf( " encoding" );
251*cdf0e10cSrcweir 		if( nFound < str.getLength() ) {
252*cdf0e10cSrcweir 			int nStop;
253*cdf0e10cSrcweir 			int nStart = str.indexOf( "\"" , nFound );
254*cdf0e10cSrcweir 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
255*cdf0e10cSrcweir 			{
256*cdf0e10cSrcweir 				nStart = str.indexOf( "'" , nFound );
257*cdf0e10cSrcweir 				nStop  = str.indexOf( "'" , nStart +1 );
258*cdf0e10cSrcweir 			}
259*cdf0e10cSrcweir 			else
260*cdf0e10cSrcweir 			{
261*cdf0e10cSrcweir 				nStop  = str.indexOf( "\"" , nStart +1);
262*cdf0e10cSrcweir 			}
263*cdf0e10cSrcweir 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
264*cdf0e10cSrcweir 			{
265*cdf0e10cSrcweir 				// encoding found finally
266*cdf0e10cSrcweir 				m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
267*cdf0e10cSrcweir 			}
268*cdf0e10cSrcweir 		}
269*cdf0e10cSrcweir 	}
270*cdf0e10cSrcweir 	else if( 0xFE == pSource[0] &&
271*cdf0e10cSrcweir 	         0xFF == pSource[1] ) {
272*cdf0e10cSrcweir 		// UTF-16 big endian
273*cdf0e10cSrcweir 		// conversion is done so that encoding information can be easily extracted
274*cdf0e10cSrcweir 		m_sEncoding = "utf-16";
275*cdf0e10cSrcweir 	}
276*cdf0e10cSrcweir 	else if( 0xFF == pSource[0] &&
277*cdf0e10cSrcweir 	         0xFE == pSource[1] ) {
278*cdf0e10cSrcweir 		// UTF-16 little endian
279*cdf0e10cSrcweir 		// conversion is done so that encoding information can be easily extracted
280*cdf0e10cSrcweir 		m_sEncoding = "utf-16";
281*cdf0e10cSrcweir 	}
282*cdf0e10cSrcweir 	else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
283*cdf0e10cSrcweir 		// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
284*cdf0e10cSrcweir 		// The byte order mark is simply added
285*cdf0e10cSrcweir 
286*cdf0e10cSrcweir 		// simply add the byte order mark !
287*cdf0e10cSrcweir 		seq.realloc( seq.getLength() + 2 );
288*cdf0e10cSrcweir 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
289*cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[0] = 0xFE;
290*cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[1] = 0xFF;
291*cdf0e10cSrcweir 
292*cdf0e10cSrcweir 		m_sEncoding = "utf-16";
293*cdf0e10cSrcweir 	}
294*cdf0e10cSrcweir 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
295*cdf0e10cSrcweir 		// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
296*cdf0e10cSrcweir 		// The byte order mark is simply added
297*cdf0e10cSrcweir 
298*cdf0e10cSrcweir 		seq.realloc( seq.getLength() + 2 );
299*cdf0e10cSrcweir 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
300*cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[0] = 0xFF;
301*cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[1] = 0xFE;
302*cdf0e10cSrcweir 
303*cdf0e10cSrcweir 		m_sEncoding = "utf-16";
304*cdf0e10cSrcweir 	}
305*cdf0e10cSrcweir     else if( 0xEF == pSource[0] &&
306*cdf0e10cSrcweir              0xBB == pSource[1] &&
307*cdf0e10cSrcweir              0xBF == pSource[2] )
308*cdf0e10cSrcweir     {
309*cdf0e10cSrcweir         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
310*cdf0e10cSrcweir         // The BOM is removed.
311*cdf0e10cSrcweir         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
312*cdf0e10cSrcweir         seq.realloc( seq.getLength() - 3 );
313*cdf0e10cSrcweir         m_sEncoding = "utf-8";
314*cdf0e10cSrcweir     }
315*cdf0e10cSrcweir 	else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
316*cdf0e10cSrcweir 		// UCS-4 big endian
317*cdf0e10cSrcweir 		m_sEncoding = "ucs-4";
318*cdf0e10cSrcweir 	}
319*cdf0e10cSrcweir 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
320*cdf0e10cSrcweir 		// UCS-4 little endian
321*cdf0e10cSrcweir 		m_sEncoding = "ucs-4";
322*cdf0e10cSrcweir 	}
323*cdf0e10cSrcweir 	else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
324*cdf0e10cSrcweir 	         0xa7 == static_cast<unsigned char> (pSource[2]) &&
325*cdf0e10cSrcweir 	         0x94 == static_cast<unsigned char> (pSource[3]) ) {
326*cdf0e10cSrcweir 		// EBCDIC
327*cdf0e10cSrcweir 		bReturn = sal_False;   // must be extended
328*cdf0e10cSrcweir 	}
329*cdf0e10cSrcweir 	else {
330*cdf0e10cSrcweir 		// other
331*cdf0e10cSrcweir 		// UTF8 is directly recognized by the parser.
332*cdf0e10cSrcweir 		bReturn = sal_False;
333*cdf0e10cSrcweir 	}
334*cdf0e10cSrcweir 
335*cdf0e10cSrcweir 	return bReturn;
336*cdf0e10cSrcweir }
337*cdf0e10cSrcweir 
338*cdf0e10cSrcweir void XMLFile2UTFConverter::initializeDecoding()
339*cdf0e10cSrcweir {
340*cdf0e10cSrcweir 
341*cdf0e10cSrcweir 	if( m_sEncoding.getLength() )
342*cdf0e10cSrcweir 	{
343*cdf0e10cSrcweir 		rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
344*cdf0e10cSrcweir 		if( encoding != RTL_TEXTENCODING_UTF8 )
345*cdf0e10cSrcweir 		{
346*cdf0e10cSrcweir 			m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
347*cdf0e10cSrcweir 			m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
348*cdf0e10cSrcweir 		}
349*cdf0e10cSrcweir 	}
350*cdf0e10cSrcweir }
351*cdf0e10cSrcweir 
352*cdf0e10cSrcweir 
353*cdf0e10cSrcweir //----------------------------------------------
354*cdf0e10cSrcweir //
355*cdf0e10cSrcweir // Text2UnicodeConverter
356*cdf0e10cSrcweir //
357*cdf0e10cSrcweir //----------------------------------------------
358*cdf0e10cSrcweir Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
359*cdf0e10cSrcweir {
360*cdf0e10cSrcweir 	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
361*cdf0e10cSrcweir 	if( RTL_TEXTENCODING_DONTKNOW == encoding )
362*cdf0e10cSrcweir 	{
363*cdf0e10cSrcweir 		m_bCanContinue = sal_False;
364*cdf0e10cSrcweir 		m_bInitialized = sal_False;
365*cdf0e10cSrcweir 	}
366*cdf0e10cSrcweir 	else
367*cdf0e10cSrcweir 	{
368*cdf0e10cSrcweir 		init( encoding );
369*cdf0e10cSrcweir 	}
370*cdf0e10cSrcweir }
371*cdf0e10cSrcweir 
372*cdf0e10cSrcweir Text2UnicodeConverter::~Text2UnicodeConverter()
373*cdf0e10cSrcweir {
374*cdf0e10cSrcweir 	if( m_bInitialized )
375*cdf0e10cSrcweir 	{
376*cdf0e10cSrcweir 		rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
377*cdf0e10cSrcweir 		rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
378*cdf0e10cSrcweir 	}
379*cdf0e10cSrcweir }
380*cdf0e10cSrcweir 
381*cdf0e10cSrcweir void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
382*cdf0e10cSrcweir {
383*cdf0e10cSrcweir 	m_bCanContinue = sal_True;
384*cdf0e10cSrcweir 	m_bInitialized = sal_True;
385*cdf0e10cSrcweir 
386*cdf0e10cSrcweir 	m_convText2Unicode 	= rtl_createTextToUnicodeConverter(encoding);
387*cdf0e10cSrcweir 	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
388*cdf0e10cSrcweir 	m_rtlEncoding = encoding;
389*cdf0e10cSrcweir }
390*cdf0e10cSrcweir 
391*cdf0e10cSrcweir 
392*cdf0e10cSrcweir Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
393*cdf0e10cSrcweir {
394*cdf0e10cSrcweir 	sal_uInt32 uiInfo;
395*cdf0e10cSrcweir 	sal_Size nSrcCvtBytes 	= 0;
396*cdf0e10cSrcweir 	sal_Size nTargetCount 	= 0;
397*cdf0e10cSrcweir 	sal_Size nSourceCount   = 0;
398*cdf0e10cSrcweir 
399*cdf0e10cSrcweir 	// the whole source size
400*cdf0e10cSrcweir 	sal_Int32 	nSourceSize = seqText.getLength() + m_seqSource.getLength();
401*cdf0e10cSrcweir 	Sequence<sal_Unicode> 	seqUnicode ( nSourceSize );
402*cdf0e10cSrcweir 
403*cdf0e10cSrcweir 	const sal_Int8 *pbSource = seqText.getConstArray();
404*cdf0e10cSrcweir 	sal_Int8 *pbTempMem = 0;
405*cdf0e10cSrcweir 
406*cdf0e10cSrcweir 	if( m_seqSource.getLength() ) {
407*cdf0e10cSrcweir 		// put old rest and new byte sequence into one array
408*cdf0e10cSrcweir 		pbTempMem = new sal_Int8[ nSourceSize ];
409*cdf0e10cSrcweir 		memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
410*cdf0e10cSrcweir 		memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
411*cdf0e10cSrcweir 		pbSource = pbTempMem;
412*cdf0e10cSrcweir 
413*cdf0e10cSrcweir 		// set to zero again
414*cdf0e10cSrcweir 		m_seqSource = Sequence< sal_Int8 >();
415*cdf0e10cSrcweir 	}
416*cdf0e10cSrcweir 
417*cdf0e10cSrcweir 	while( sal_True ) {
418*cdf0e10cSrcweir 
419*cdf0e10cSrcweir 		/* All invalid characters are transformed to the unicode undefined char */
420*cdf0e10cSrcweir 		nTargetCount += 	rtl_convertTextToUnicode(
421*cdf0e10cSrcweir 									m_convText2Unicode,
422*cdf0e10cSrcweir 									m_contextText2Unicode,
423*cdf0e10cSrcweir 									( const sal_Char * ) &( pbSource[nSourceCount] ),
424*cdf0e10cSrcweir 									nSourceSize - nSourceCount ,
425*cdf0e10cSrcweir 									&( seqUnicode.getArray()[ nTargetCount ] ),
426*cdf0e10cSrcweir 									seqUnicode.getLength() - nTargetCount,
427*cdf0e10cSrcweir 									RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
428*cdf0e10cSrcweir 									RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
429*cdf0e10cSrcweir 									RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
430*cdf0e10cSrcweir 									&uiInfo,
431*cdf0e10cSrcweir 									&nSrcCvtBytes );
432*cdf0e10cSrcweir 		nSourceCount += nSrcCvtBytes;
433*cdf0e10cSrcweir 
434*cdf0e10cSrcweir 		if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
435*cdf0e10cSrcweir 			// save necessary bytes for next conversion
436*cdf0e10cSrcweir 			seqUnicode.realloc( seqUnicode.getLength() * 2 );
437*cdf0e10cSrcweir 			continue;
438*cdf0e10cSrcweir 		}
439*cdf0e10cSrcweir 		break;
440*cdf0e10cSrcweir 	}
441*cdf0e10cSrcweir 	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
442*cdf0e10cSrcweir 		m_seqSource.realloc( nSourceSize - nSourceCount );
443*cdf0e10cSrcweir 		memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
444*cdf0e10cSrcweir 	}
445*cdf0e10cSrcweir 
446*cdf0e10cSrcweir 
447*cdf0e10cSrcweir 	if( pbTempMem ) {
448*cdf0e10cSrcweir 		delete [] pbTempMem;
449*cdf0e10cSrcweir 	}
450*cdf0e10cSrcweir 
451*cdf0e10cSrcweir 	// set to correct unicode size
452*cdf0e10cSrcweir 	seqUnicode.realloc( nTargetCount );
453*cdf0e10cSrcweir 
454*cdf0e10cSrcweir 	return seqUnicode;
455*cdf0e10cSrcweir }
456*cdf0e10cSrcweir 
457*cdf0e10cSrcweir 
458*cdf0e10cSrcweir 
459*cdf0e10cSrcweir //----------------------------------------------
460*cdf0e10cSrcweir //
461*cdf0e10cSrcweir // Unicode2TextConverter
462*cdf0e10cSrcweir //
463*cdf0e10cSrcweir //----------------------------------------------
464*cdf0e10cSrcweir Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
465*cdf0e10cSrcweir {
466*cdf0e10cSrcweir 	init( encoding );
467*cdf0e10cSrcweir }
468*cdf0e10cSrcweir 
469*cdf0e10cSrcweir 
470*cdf0e10cSrcweir Unicode2TextConverter::~Unicode2TextConverter()
471*cdf0e10cSrcweir {
472*cdf0e10cSrcweir 	if( m_bInitialized ) {
473*cdf0e10cSrcweir 		rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
474*cdf0e10cSrcweir 		rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
475*cdf0e10cSrcweir 	}
476*cdf0e10cSrcweir }
477*cdf0e10cSrcweir 
478*cdf0e10cSrcweir 
479*cdf0e10cSrcweir Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
480*cdf0e10cSrcweir {
481*cdf0e10cSrcweir 	sal_Unicode *puTempMem = 0;
482*cdf0e10cSrcweir 
483*cdf0e10cSrcweir 	if( m_seqSource.getLength() ) {
484*cdf0e10cSrcweir 		// For surrogates !
485*cdf0e10cSrcweir 		// put old rest and new byte sequence into one array
486*cdf0e10cSrcweir 		// In general when surrogates are used, they should be rarely
487*cdf0e10cSrcweir 		// cut off between two convert()-calls. So this code is used
488*cdf0e10cSrcweir 		// rarely and the extra copy is acceptable.
489*cdf0e10cSrcweir 		puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
490*cdf0e10cSrcweir 		memcpy( puTempMem ,
491*cdf0e10cSrcweir 				m_seqSource.getConstArray() ,
492*cdf0e10cSrcweir 				m_seqSource.getLength() * sizeof( sal_Unicode ) );
493*cdf0e10cSrcweir 		memcpy(
494*cdf0e10cSrcweir 			&(puTempMem[ m_seqSource.getLength() ]) ,
495*cdf0e10cSrcweir 			puSource ,
496*cdf0e10cSrcweir 			nSourceSize*sizeof( sal_Unicode ) );
497*cdf0e10cSrcweir 		puSource = puTempMem;
498*cdf0e10cSrcweir 		nSourceSize += m_seqSource.getLength();
499*cdf0e10cSrcweir 
500*cdf0e10cSrcweir 		m_seqSource = Sequence< sal_Unicode > ();
501*cdf0e10cSrcweir 	}
502*cdf0e10cSrcweir 
503*cdf0e10cSrcweir 
504*cdf0e10cSrcweir 	sal_Size nTargetCount = 0;
505*cdf0e10cSrcweir 	sal_Size nSourceCount = 0;
506*cdf0e10cSrcweir 
507*cdf0e10cSrcweir 	sal_uInt32 uiInfo;
508*cdf0e10cSrcweir 	sal_Size nSrcCvtChars;
509*cdf0e10cSrcweir 
510*cdf0e10cSrcweir 	// take nSourceSize * 3 as preference
511*cdf0e10cSrcweir 	// this is an upper boundary for converting to utf8,
512*cdf0e10cSrcweir 	// which most often used as the target.
513*cdf0e10cSrcweir 	sal_Int32 nSeqSize =  nSourceSize * 3;
514*cdf0e10cSrcweir 
515*cdf0e10cSrcweir 	Sequence<sal_Int8> 	seqText( nSeqSize );
516*cdf0e10cSrcweir 	sal_Char *pTarget = (sal_Char *) seqText.getArray();
517*cdf0e10cSrcweir 	while( sal_True ) {
518*cdf0e10cSrcweir 
519*cdf0e10cSrcweir 		nTargetCount += rtl_convertUnicodeToText(
520*cdf0e10cSrcweir 									m_convUnicode2Text,
521*cdf0e10cSrcweir 									m_contextUnicode2Text,
522*cdf0e10cSrcweir 									&( puSource[nSourceCount] ),
523*cdf0e10cSrcweir 									nSourceSize - nSourceCount ,
524*cdf0e10cSrcweir 									&( pTarget[nTargetCount] ),
525*cdf0e10cSrcweir 									nSeqSize - nTargetCount,
526*cdf0e10cSrcweir 									RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
527*cdf0e10cSrcweir 									RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
528*cdf0e10cSrcweir 									&uiInfo,
529*cdf0e10cSrcweir 									&nSrcCvtChars);
530*cdf0e10cSrcweir 		nSourceCount += nSrcCvtChars;
531*cdf0e10cSrcweir 
532*cdf0e10cSrcweir 		if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
533*cdf0e10cSrcweir 			nSeqSize = nSeqSize *2;
534*cdf0e10cSrcweir 			seqText.realloc( nSeqSize );  // double array size
535*cdf0e10cSrcweir 			pTarget = ( sal_Char * ) seqText.getArray();
536*cdf0e10cSrcweir 			continue;
537*cdf0e10cSrcweir 		}
538*cdf0e10cSrcweir 		break;
539*cdf0e10cSrcweir 	}
540*cdf0e10cSrcweir 
541*cdf0e10cSrcweir 	// for surrogates
542*cdf0e10cSrcweir 	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
543*cdf0e10cSrcweir 		m_seqSource.realloc( nSourceSize - nSourceCount );
544*cdf0e10cSrcweir 		memcpy( m_seqSource.getArray() ,
545*cdf0e10cSrcweir 				&(puSource[nSourceCount]),
546*cdf0e10cSrcweir 				(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
547*cdf0e10cSrcweir 	}
548*cdf0e10cSrcweir 
549*cdf0e10cSrcweir 	if( puTempMem ) {
550*cdf0e10cSrcweir 		delete [] puTempMem;
551*cdf0e10cSrcweir 	}
552*cdf0e10cSrcweir 
553*cdf0e10cSrcweir 	// reduce the size of the buffer (fast, no copy necessary)
554*cdf0e10cSrcweir 	seqText.realloc( nTargetCount );
555*cdf0e10cSrcweir 
556*cdf0e10cSrcweir 	return seqText;
557*cdf0e10cSrcweir }
558*cdf0e10cSrcweir 
559*cdf0e10cSrcweir void Unicode2TextConverter::init( rtl_TextEncoding encoding )
560*cdf0e10cSrcweir {
561*cdf0e10cSrcweir 	m_bCanContinue = sal_True;
562*cdf0e10cSrcweir 	m_bInitialized = sal_True;
563*cdf0e10cSrcweir 
564*cdf0e10cSrcweir 	m_convUnicode2Text 	= rtl_createUnicodeToTextConverter( encoding );
565*cdf0e10cSrcweir 	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
566*cdf0e10cSrcweir 	m_rtlEncoding = encoding;
567*cdf0e10cSrcweir };
568*cdf0e10cSrcweir 
569*cdf0e10cSrcweir 
570*cdf0e10cSrcweir }
571