xref: /aoo41x/main/sax/source/expatwrap/xml2utf.cxx (revision f9b72d11)
1*f9b72d11SAndrew Rist /**************************************************************
2cdf0e10cSrcweir  *
3*f9b72d11SAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*f9b72d11SAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*f9b72d11SAndrew Rist  * distributed with this work for additional information
6*f9b72d11SAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*f9b72d11SAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*f9b72d11SAndrew Rist  * "License"); you may not use this file except in compliance
9*f9b72d11SAndrew Rist  * with the License.  You may obtain a copy of the License at
10*f9b72d11SAndrew Rist  *
11*f9b72d11SAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*f9b72d11SAndrew Rist  *
13*f9b72d11SAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*f9b72d11SAndrew Rist  * software distributed under the License is distributed on an
15*f9b72d11SAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*f9b72d11SAndrew Rist  * KIND, either express or implied.  See the License for the
17*f9b72d11SAndrew Rist  * specific language governing permissions and limitations
18*f9b72d11SAndrew Rist  * under the License.
19*f9b72d11SAndrew Rist  *
20*f9b72d11SAndrew Rist  *************************************************************/
21*f9b72d11SAndrew Rist 
22*f9b72d11SAndrew Rist 
23cdf0e10cSrcweir #include <string.h>
24cdf0e10cSrcweir 
25cdf0e10cSrcweir #include <sal/types.h>
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #include <rtl/textenc.h>
28cdf0e10cSrcweir #include <rtl/tencinfo.h>
29cdf0e10cSrcweir 
30cdf0e10cSrcweir 
31cdf0e10cSrcweir #include <com/sun/star/io/XInputStream.hpp>
32cdf0e10cSrcweir 
33cdf0e10cSrcweir using namespace rtl;
34cdf0e10cSrcweir using namespace ::com::sun::star::uno;
35cdf0e10cSrcweir using namespace ::com::sun::star::io;
36cdf0e10cSrcweir 
37cdf0e10cSrcweir #include "xml2utf.hxx"
38cdf0e10cSrcweir 
39cdf0e10cSrcweir namespace sax_expatwrap {
40cdf0e10cSrcweir 
readAndConvert(Sequence<sal_Int8> & seq,sal_Int32 nMaxToRead)41cdf0e10cSrcweir sal_Int32 XMLFile2UTFConverter::readAndConvert( Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
42cdf0e10cSrcweir 	throw ( IOException, NotConnectedException , BufferSizeExceededException , RuntimeException )
43cdf0e10cSrcweir {
44cdf0e10cSrcweir 
45cdf0e10cSrcweir 	Sequence<sal_Int8> seqIn;
46cdf0e10cSrcweir 
47cdf0e10cSrcweir 	if( ! m_in.is() ) {
48cdf0e10cSrcweir 		throw NotConnectedException();
49cdf0e10cSrcweir 	}
50cdf0e10cSrcweir 	if( ! m_bStarted ) {
51cdf0e10cSrcweir 		nMaxToRead = Max( 512 , nMaxToRead );  	// it should be possible to find the encoding attribute
52cdf0e10cSrcweir 						     					// within the first 512 bytes == 128 chars in UCS-4
53cdf0e10cSrcweir 	}
54cdf0e10cSrcweir 
55cdf0e10cSrcweir 	sal_Int32 nRead;
56cdf0e10cSrcweir 	Sequence< sal_Int8 > seqStart;
57cdf0e10cSrcweir 	while( sal_True )
58cdf0e10cSrcweir 	{
59cdf0e10cSrcweir 		nRead = m_in->readSomeBytes( seq , nMaxToRead );
60cdf0e10cSrcweir 
61cdf0e10cSrcweir 		if( nRead + seqStart.getLength())
62cdf0e10cSrcweir 		{
63cdf0e10cSrcweir 			// if nRead is 0, the file is already eof.
64cdf0e10cSrcweir 			if( ! m_bStarted && nRead )
65cdf0e10cSrcweir 			{
66cdf0e10cSrcweir 				// ensure that enough data is available to parse encoding
67cdf0e10cSrcweir 				if( seqStart.getLength() )
68cdf0e10cSrcweir 				{
69cdf0e10cSrcweir 				  // prefix with what we had so far.
70cdf0e10cSrcweir 				  sal_Int32 nLength = seq.getLength();
71cdf0e10cSrcweir 				  seq.realloc( seqStart.getLength() + nLength );
72cdf0e10cSrcweir 
73cdf0e10cSrcweir 				  memmove (seq.getArray() + seqStart.getLength(),
74cdf0e10cSrcweir 					   seq.getConstArray(),
75cdf0e10cSrcweir 					   nLength);
76cdf0e10cSrcweir 				  memcpy  (seq.getArray(),
77cdf0e10cSrcweir 					   seqStart.getConstArray(),
78cdf0e10cSrcweir 					   seqStart.getLength());
79cdf0e10cSrcweir 				}
80cdf0e10cSrcweir 
81cdf0e10cSrcweir 				// autodetection with the first bytes
82cdf0e10cSrcweir 				if( ! isEncodingRecognizable( seq ) )
83cdf0e10cSrcweir 				{
84cdf0e10cSrcweir 				  // remember what we have so far.
85cdf0e10cSrcweir 				  seqStart = seq;
86cdf0e10cSrcweir 
87cdf0e10cSrcweir 				  // read more !
88cdf0e10cSrcweir 				  continue;
89cdf0e10cSrcweir 				}
90cdf0e10cSrcweir 				if( scanForEncoding( seq ) || m_sEncoding.getLength() ) {
91cdf0e10cSrcweir 					// initialize decoding
92cdf0e10cSrcweir 					initializeDecoding();
93cdf0e10cSrcweir 				}
94cdf0e10cSrcweir 				nRead = seq.getLength();
95cdf0e10cSrcweir 				seqStart = Sequence < sal_Int8 > ();
96cdf0e10cSrcweir 			}
97cdf0e10cSrcweir 
98cdf0e10cSrcweir 			// do the encoding
99cdf0e10cSrcweir 			if( m_pText2Unicode && m_pUnicode2Text &&
100cdf0e10cSrcweir 				m_pText2Unicode->canContinue() && m_pUnicode2Text->canContinue() ) {
101cdf0e10cSrcweir 
102cdf0e10cSrcweir 				Sequence<sal_Unicode> seqUnicode = m_pText2Unicode->convert( seq );
103cdf0e10cSrcweir 				seq = m_pUnicode2Text->convert(	seqUnicode.getConstArray(),	seqUnicode.getLength() );
104cdf0e10cSrcweir 			}
105cdf0e10cSrcweir 
106cdf0e10cSrcweir 			if( ! m_bStarted )
107cdf0e10cSrcweir 			{
108cdf0e10cSrcweir 				// it must now be ensured, that no encoding attribute exist anymore
109cdf0e10cSrcweir 				// ( otherwise the expat-Parser will crash )
110cdf0e10cSrcweir 				// This must be done after decoding !
111cdf0e10cSrcweir 				// ( e.g. Files decoded in ucs-4 cannot be read properly )
112cdf0e10cSrcweir 				m_bStarted = sal_True;
113cdf0e10cSrcweir 				removeEncoding( seq );
114cdf0e10cSrcweir 			}
115cdf0e10cSrcweir 			nRead = seq.getLength();
116cdf0e10cSrcweir 		}
117cdf0e10cSrcweir 
118cdf0e10cSrcweir 		break;
119cdf0e10cSrcweir 	}
120cdf0e10cSrcweir 	return nRead;
121cdf0e10cSrcweir }
122cdf0e10cSrcweir 
123cdf0e10cSrcweir 
~XMLFile2UTFConverter()124cdf0e10cSrcweir XMLFile2UTFConverter::~XMLFile2UTFConverter()
125cdf0e10cSrcweir {
126cdf0e10cSrcweir 	if( m_pText2Unicode )
127cdf0e10cSrcweir 		delete m_pText2Unicode;
128cdf0e10cSrcweir 	if( m_pUnicode2Text )
129cdf0e10cSrcweir 		delete m_pUnicode2Text;
130cdf0e10cSrcweir }
131cdf0e10cSrcweir 
132cdf0e10cSrcweir 
removeEncoding(Sequence<sal_Int8> & seq)133cdf0e10cSrcweir void XMLFile2UTFConverter::removeEncoding( Sequence<sal_Int8> &seq )
134cdf0e10cSrcweir {
135cdf0e10cSrcweir 	const sal_Int8 *pSource = seq.getArray();
136cdf0e10cSrcweir 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4) )
137cdf0e10cSrcweir 	{
138cdf0e10cSrcweir 
139cdf0e10cSrcweir 		// scan for encoding
140cdf0e10cSrcweir 		OString str( (sal_Char * ) pSource , seq.getLength() );
141cdf0e10cSrcweir 
142cdf0e10cSrcweir 		// cut sequence to first line break
143cdf0e10cSrcweir 		// find first line break;
144cdf0e10cSrcweir 		int nMax = str.indexOf( 10 );
145cdf0e10cSrcweir 		if( nMax >= 0 )
146cdf0e10cSrcweir 		{
147cdf0e10cSrcweir 			str = str.copy( 0 , nMax );
148cdf0e10cSrcweir 		}
149cdf0e10cSrcweir 
150cdf0e10cSrcweir 		int nFound = str.indexOf( " encoding" );
151cdf0e10cSrcweir 		if( nFound >= 0 ) {
152cdf0e10cSrcweir 			int nStop;
153cdf0e10cSrcweir 			int nStart = str.indexOf( "\"" , nFound );
154cdf0e10cSrcweir 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
155cdf0e10cSrcweir 			{
156cdf0e10cSrcweir 				nStart = str.indexOf( "'" , nFound );
157cdf0e10cSrcweir 				nStop  = str.indexOf( "'" , nStart +1 );
158cdf0e10cSrcweir 			}
159cdf0e10cSrcweir 			else
160cdf0e10cSrcweir 			{
161cdf0e10cSrcweir 				nStop  = str.indexOf( "\"" , nStart +1);
162cdf0e10cSrcweir 			}
163cdf0e10cSrcweir 
164cdf0e10cSrcweir 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
165cdf0e10cSrcweir 			{
166cdf0e10cSrcweir 				// remove encoding tag from file
167cdf0e10cSrcweir 				memmove(        &( seq.getArray()[nFound] ) ,
168cdf0e10cSrcweir 								&( seq.getArray()[nStop+1]) ,
169cdf0e10cSrcweir 								seq.getLength() - nStop -1);
170cdf0e10cSrcweir 				seq.realloc( seq.getLength() - ( nStop+1 - nFound ) );
171cdf0e10cSrcweir //				str = String( (char * ) seq.getArray() , seq.getLen() );
172cdf0e10cSrcweir 			}
173cdf0e10cSrcweir 		}
174cdf0e10cSrcweir 	}
175cdf0e10cSrcweir }
176cdf0e10cSrcweir 
177cdf0e10cSrcweir // Checks, if enough data has been accumulated to recognize the encoding
isEncodingRecognizable(const Sequence<sal_Int8> & seq)178cdf0e10cSrcweir sal_Bool XMLFile2UTFConverter::isEncodingRecognizable( const Sequence< sal_Int8 > &seq)
179cdf0e10cSrcweir {
180cdf0e10cSrcweir 	const sal_Int8 *pSource = seq.getConstArray();
181cdf0e10cSrcweir 	sal_Bool bCheckIfFirstClosingBracketExsists = sal_False;
182cdf0e10cSrcweir 
183cdf0e10cSrcweir 	if( seq.getLength() < 8 ) {
184cdf0e10cSrcweir 		// no recognition possible, when less than 8 bytes are available
185cdf0e10cSrcweir 		return sal_False;
186cdf0e10cSrcweir 	}
187cdf0e10cSrcweir 
188cdf0e10cSrcweir 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
189cdf0e10cSrcweir 		// scan if the <?xml tag finishes within this buffer
190cdf0e10cSrcweir 		bCheckIfFirstClosingBracketExsists = sal_True;
191cdf0e10cSrcweir 	}
192cdf0e10cSrcweir 	else if( ('<' == pSource[0] || '<' == pSource[2] ) &&
193cdf0e10cSrcweir 			 ( ('?' == pSource[4] || '?' == pSource[6] ) ) )
194cdf0e10cSrcweir 	{
195cdf0e10cSrcweir 		// check for utf-16
196cdf0e10cSrcweir 		bCheckIfFirstClosingBracketExsists = sal_True;
197cdf0e10cSrcweir 	}
198cdf0e10cSrcweir 	else if( ( '<' == pSource[1] || '<' == pSource[3] ) &&
199cdf0e10cSrcweir 		     ( '?' == pSource[5] || '?' == pSource[7] ) )
200cdf0e10cSrcweir 	{
201cdf0e10cSrcweir 		// check for
202cdf0e10cSrcweir 		bCheckIfFirstClosingBracketExsists = sal_True;
203cdf0e10cSrcweir 	}
204cdf0e10cSrcweir 
205cdf0e10cSrcweir 	if( bCheckIfFirstClosingBracketExsists )
206cdf0e10cSrcweir 	{
207cdf0e10cSrcweir 		for( sal_Int32 i = 0; i < seq.getLength() ; i ++ )
208cdf0e10cSrcweir 		{
209cdf0e10cSrcweir 			// whole <?xml tag is valid
210cdf0e10cSrcweir 			if( '>' == pSource[ i ] )
211cdf0e10cSrcweir 			{
212cdf0e10cSrcweir 				return sal_True;
213cdf0e10cSrcweir 			}
214cdf0e10cSrcweir 		}
215cdf0e10cSrcweir 		return sal_False;
216cdf0e10cSrcweir 	}
217cdf0e10cSrcweir 
218cdf0e10cSrcweir 	// No <? tag in front, no need for a bigger buffer
219cdf0e10cSrcweir 	return sal_True;
220cdf0e10cSrcweir }
221cdf0e10cSrcweir 
scanForEncoding(Sequence<sal_Int8> & seq)222cdf0e10cSrcweir sal_Bool XMLFile2UTFConverter::scanForEncoding( Sequence< sal_Int8 > &seq )
223cdf0e10cSrcweir {
224cdf0e10cSrcweir 	const sal_uInt8 *pSource = reinterpret_cast<const sal_uInt8*>( seq.getConstArray() );
225cdf0e10cSrcweir 	sal_Bool bReturn = sal_True;
226cdf0e10cSrcweir 
227cdf0e10cSrcweir 	if( seq.getLength() < 4 ) {
228cdf0e10cSrcweir 		// no recognition possible, when less than 4 bytes are available
229cdf0e10cSrcweir 		return sal_False;
230cdf0e10cSrcweir 	}
231cdf0e10cSrcweir 
232cdf0e10cSrcweir 	// first level : detect possible file formats
233cdf0e10cSrcweir 	if( ! strncmp( (const char * ) pSource , "<?xml" , 4 ) ) {
234cdf0e10cSrcweir 
235cdf0e10cSrcweir 		// scan for encoding
236cdf0e10cSrcweir 		OString str( (const sal_Char *) pSource , seq.getLength() );
237cdf0e10cSrcweir 
238cdf0e10cSrcweir 		// cut sequence to first line break
239cdf0e10cSrcweir 		//find first line break;
240cdf0e10cSrcweir 		int nMax = str.indexOf( 10 );
241cdf0e10cSrcweir 		if( nMax >= 0 )
242cdf0e10cSrcweir 		{
243cdf0e10cSrcweir 			str = str.copy( 0 , nMax );
244cdf0e10cSrcweir 		}
245cdf0e10cSrcweir 
246cdf0e10cSrcweir 		int nFound = str.indexOf( " encoding" );
247cdf0e10cSrcweir 		if( nFound < str.getLength() ) {
248cdf0e10cSrcweir 			int nStop;
249cdf0e10cSrcweir 			int nStart = str.indexOf( "\"" , nFound );
250cdf0e10cSrcweir 			if( nStart < 0 || str.indexOf( "'" , nFound ) < nStart )
251cdf0e10cSrcweir 			{
252cdf0e10cSrcweir 				nStart = str.indexOf( "'" , nFound );
253cdf0e10cSrcweir 				nStop  = str.indexOf( "'" , nStart +1 );
254cdf0e10cSrcweir 			}
255cdf0e10cSrcweir 			else
256cdf0e10cSrcweir 			{
257cdf0e10cSrcweir 				nStop  = str.indexOf( "\"" , nStart +1);
258cdf0e10cSrcweir 			}
259cdf0e10cSrcweir 			if( nStart >= 0 && nStop >= 0 && nStart+1 < nStop )
260cdf0e10cSrcweir 			{
261cdf0e10cSrcweir 				// encoding found finally
262cdf0e10cSrcweir 				m_sEncoding = str.copy( nStart+1 , nStop - nStart - 1 );
263cdf0e10cSrcweir 			}
264cdf0e10cSrcweir 		}
265cdf0e10cSrcweir 	}
266cdf0e10cSrcweir 	else if( 0xFE == pSource[0] &&
267cdf0e10cSrcweir 	         0xFF == pSource[1] ) {
268cdf0e10cSrcweir 		// UTF-16 big endian
269cdf0e10cSrcweir 		// conversion is done so that encoding information can be easily extracted
270cdf0e10cSrcweir 		m_sEncoding = "utf-16";
271cdf0e10cSrcweir 	}
272cdf0e10cSrcweir 	else if( 0xFF == pSource[0] &&
273cdf0e10cSrcweir 	         0xFE == pSource[1] ) {
274cdf0e10cSrcweir 		// UTF-16 little endian
275cdf0e10cSrcweir 		// conversion is done so that encoding information can be easily extracted
276cdf0e10cSrcweir 		m_sEncoding = "utf-16";
277cdf0e10cSrcweir 	}
278cdf0e10cSrcweir 	else if( 0x00 == pSource[0] && 0x3c == pSource[1]  && 0x00 == pSource[2] && 0x3f == pSource[3] ) {
279cdf0e10cSrcweir 		// UTF-16 big endian without byte order mark (this is (strictly speaking) an error.)
280cdf0e10cSrcweir 		// The byte order mark is simply added
281cdf0e10cSrcweir 
282cdf0e10cSrcweir 		// simply add the byte order mark !
283cdf0e10cSrcweir 		seq.realloc( seq.getLength() + 2 );
284cdf0e10cSrcweir 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
285cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[0] = 0xFE;
286cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[1] = 0xFF;
287cdf0e10cSrcweir 
288cdf0e10cSrcweir 		m_sEncoding = "utf-16";
289cdf0e10cSrcweir 	}
290cdf0e10cSrcweir 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x3f == pSource[2] && 0x00 == pSource[3] ) {
291cdf0e10cSrcweir 		// UTF-16 little endian without byte order mark (this is (strictly speaking) an error.)
292cdf0e10cSrcweir 		// The byte order mark is simply added
293cdf0e10cSrcweir 
294cdf0e10cSrcweir 		seq.realloc( seq.getLength() + 2 );
295cdf0e10cSrcweir 		memmove( &( seq.getArray()[2] ) , seq.getArray() , seq.getLength() - 2 );
296cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[0] = 0xFF;
297cdf0e10cSrcweir 		((sal_uInt8*)seq.getArray())[1] = 0xFE;
298cdf0e10cSrcweir 
299cdf0e10cSrcweir 		m_sEncoding = "utf-16";
300cdf0e10cSrcweir 	}
301cdf0e10cSrcweir     else if( 0xEF == pSource[0] &&
302cdf0e10cSrcweir              0xBB == pSource[1] &&
303cdf0e10cSrcweir              0xBF == pSource[2] )
304cdf0e10cSrcweir     {
305cdf0e10cSrcweir         // UTF-8 BOM (byte order mark); signifies utf-8, and not byte order
306cdf0e10cSrcweir         // The BOM is removed.
307cdf0e10cSrcweir         memmove( seq.getArray(), &( seq.getArray()[3] ), seq.getLength()-3 );
308cdf0e10cSrcweir         seq.realloc( seq.getLength() - 3 );
309cdf0e10cSrcweir         m_sEncoding = "utf-8";
310cdf0e10cSrcweir     }
311cdf0e10cSrcweir 	else if( 0x00 == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x3c == pSource[3] ) {
312cdf0e10cSrcweir 		// UCS-4 big endian
313cdf0e10cSrcweir 		m_sEncoding = "ucs-4";
314cdf0e10cSrcweir 	}
315cdf0e10cSrcweir 	else if( 0x3c == pSource[0] && 0x00 == pSource[1]  && 0x00 == pSource[2] && 0x00 == pSource[3] ) {
316cdf0e10cSrcweir 		// UCS-4 little endian
317cdf0e10cSrcweir 		m_sEncoding = "ucs-4";
318cdf0e10cSrcweir 	}
319cdf0e10cSrcweir 	else if( 0x4c == pSource[0] && 0x6f == pSource[1]  &&
320cdf0e10cSrcweir 	         0xa7 == static_cast<unsigned char> (pSource[2]) &&
321cdf0e10cSrcweir 	         0x94 == static_cast<unsigned char> (pSource[3]) ) {
322cdf0e10cSrcweir 		// EBCDIC
323cdf0e10cSrcweir 		bReturn = sal_False;   // must be extended
324cdf0e10cSrcweir 	}
325cdf0e10cSrcweir 	else {
326cdf0e10cSrcweir 		// other
327cdf0e10cSrcweir 		// UTF8 is directly recognized by the parser.
328cdf0e10cSrcweir 		bReturn = sal_False;
329cdf0e10cSrcweir 	}
330cdf0e10cSrcweir 
331cdf0e10cSrcweir 	return bReturn;
332cdf0e10cSrcweir }
333cdf0e10cSrcweir 
initializeDecoding()334cdf0e10cSrcweir void XMLFile2UTFConverter::initializeDecoding()
335cdf0e10cSrcweir {
336cdf0e10cSrcweir 
337cdf0e10cSrcweir 	if( m_sEncoding.getLength() )
338cdf0e10cSrcweir 	{
339cdf0e10cSrcweir 		rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( m_sEncoding.getStr() );
340cdf0e10cSrcweir 		if( encoding != RTL_TEXTENCODING_UTF8 )
341cdf0e10cSrcweir 		{
342cdf0e10cSrcweir 			m_pText2Unicode = new Text2UnicodeConverter( m_sEncoding );
343cdf0e10cSrcweir 			m_pUnicode2Text = new Unicode2TextConverter( RTL_TEXTENCODING_UTF8 );
344cdf0e10cSrcweir 		}
345cdf0e10cSrcweir 	}
346cdf0e10cSrcweir }
347cdf0e10cSrcweir 
348cdf0e10cSrcweir 
349cdf0e10cSrcweir //----------------------------------------------
350cdf0e10cSrcweir //
351cdf0e10cSrcweir // Text2UnicodeConverter
352cdf0e10cSrcweir //
353cdf0e10cSrcweir //----------------------------------------------
Text2UnicodeConverter(const OString & sEncoding)354cdf0e10cSrcweir Text2UnicodeConverter::Text2UnicodeConverter( const OString &sEncoding )
355cdf0e10cSrcweir {
356cdf0e10cSrcweir 	rtl_TextEncoding encoding = rtl_getTextEncodingFromMimeCharset( sEncoding.getStr() );
357cdf0e10cSrcweir 	if( RTL_TEXTENCODING_DONTKNOW == encoding )
358cdf0e10cSrcweir 	{
359cdf0e10cSrcweir 		m_bCanContinue = sal_False;
360cdf0e10cSrcweir 		m_bInitialized = sal_False;
361cdf0e10cSrcweir 	}
362cdf0e10cSrcweir 	else
363cdf0e10cSrcweir 	{
364cdf0e10cSrcweir 		init( encoding );
365cdf0e10cSrcweir 	}
366cdf0e10cSrcweir }
367cdf0e10cSrcweir 
~Text2UnicodeConverter()368cdf0e10cSrcweir Text2UnicodeConverter::~Text2UnicodeConverter()
369cdf0e10cSrcweir {
370cdf0e10cSrcweir 	if( m_bInitialized )
371cdf0e10cSrcweir 	{
372cdf0e10cSrcweir 		rtl_destroyTextToUnicodeContext( m_convText2Unicode , m_contextText2Unicode );
373cdf0e10cSrcweir 		rtl_destroyUnicodeToTextConverter( m_convText2Unicode );
374cdf0e10cSrcweir 	}
375cdf0e10cSrcweir }
376cdf0e10cSrcweir 
init(rtl_TextEncoding encoding)377cdf0e10cSrcweir void Text2UnicodeConverter::init( rtl_TextEncoding encoding )
378cdf0e10cSrcweir {
379cdf0e10cSrcweir 	m_bCanContinue = sal_True;
380cdf0e10cSrcweir 	m_bInitialized = sal_True;
381cdf0e10cSrcweir 
382cdf0e10cSrcweir 	m_convText2Unicode 	= rtl_createTextToUnicodeConverter(encoding);
383cdf0e10cSrcweir 	m_contextText2Unicode = rtl_createTextToUnicodeContext( m_convText2Unicode );
384cdf0e10cSrcweir 	m_rtlEncoding = encoding;
385cdf0e10cSrcweir }
386cdf0e10cSrcweir 
387cdf0e10cSrcweir 
convert(const Sequence<sal_Int8> & seqText)388cdf0e10cSrcweir Sequence<sal_Unicode> Text2UnicodeConverter::convert( const Sequence<sal_Int8> &seqText )
389cdf0e10cSrcweir {
390cdf0e10cSrcweir 	sal_uInt32 uiInfo;
391cdf0e10cSrcweir 	sal_Size nSrcCvtBytes 	= 0;
392cdf0e10cSrcweir 	sal_Size nTargetCount 	= 0;
393cdf0e10cSrcweir 	sal_Size nSourceCount   = 0;
394cdf0e10cSrcweir 
395cdf0e10cSrcweir 	// the whole source size
396cdf0e10cSrcweir 	sal_Int32 	nSourceSize = seqText.getLength() + m_seqSource.getLength();
397cdf0e10cSrcweir 	Sequence<sal_Unicode> 	seqUnicode ( nSourceSize );
398cdf0e10cSrcweir 
399cdf0e10cSrcweir 	const sal_Int8 *pbSource = seqText.getConstArray();
400cdf0e10cSrcweir 	sal_Int8 *pbTempMem = 0;
401cdf0e10cSrcweir 
402cdf0e10cSrcweir 	if( m_seqSource.getLength() ) {
403cdf0e10cSrcweir 		// put old rest and new byte sequence into one array
404cdf0e10cSrcweir 		pbTempMem = new sal_Int8[ nSourceSize ];
405cdf0e10cSrcweir 		memcpy( pbTempMem , m_seqSource.getConstArray() , m_seqSource.getLength() );
406cdf0e10cSrcweir 		memcpy( &(pbTempMem[ m_seqSource.getLength() ]) , seqText.getConstArray() , seqText.getLength() );
407cdf0e10cSrcweir 		pbSource = pbTempMem;
408cdf0e10cSrcweir 
409cdf0e10cSrcweir 		// set to zero again
410cdf0e10cSrcweir 		m_seqSource = Sequence< sal_Int8 >();
411cdf0e10cSrcweir 	}
412cdf0e10cSrcweir 
413cdf0e10cSrcweir 	while( sal_True ) {
414cdf0e10cSrcweir 
415cdf0e10cSrcweir 		/* All invalid characters are transformed to the unicode undefined char */
416cdf0e10cSrcweir 		nTargetCount += 	rtl_convertTextToUnicode(
417cdf0e10cSrcweir 									m_convText2Unicode,
418cdf0e10cSrcweir 									m_contextText2Unicode,
419cdf0e10cSrcweir 									( const sal_Char * ) &( pbSource[nSourceCount] ),
420cdf0e10cSrcweir 									nSourceSize - nSourceCount ,
421cdf0e10cSrcweir 									&( seqUnicode.getArray()[ nTargetCount ] ),
422cdf0e10cSrcweir 									seqUnicode.getLength() - nTargetCount,
423cdf0e10cSrcweir 									RTL_TEXTTOUNICODE_FLAGS_UNDEFINED_DEFAULT   |
424cdf0e10cSrcweir 									RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_DEFAULT |
425cdf0e10cSrcweir 									RTL_TEXTTOUNICODE_FLAGS_INVALID_DEFAULT,
426cdf0e10cSrcweir 									&uiInfo,
427cdf0e10cSrcweir 									&nSrcCvtBytes );
428cdf0e10cSrcweir 		nSourceCount += nSrcCvtBytes;
429cdf0e10cSrcweir 
430cdf0e10cSrcweir 		if( uiInfo & RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL ) {
431cdf0e10cSrcweir 			// save necessary bytes for next conversion
432cdf0e10cSrcweir 			seqUnicode.realloc( seqUnicode.getLength() * 2 );
433cdf0e10cSrcweir 			continue;
434cdf0e10cSrcweir 		}
435cdf0e10cSrcweir 		break;
436cdf0e10cSrcweir 	}
437cdf0e10cSrcweir 	if( uiInfo & RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL ) {
438cdf0e10cSrcweir 		m_seqSource.realloc( nSourceSize - nSourceCount );
439cdf0e10cSrcweir 		memcpy( m_seqSource.getArray() , &(pbSource[nSourceCount]) , nSourceSize-nSourceCount );
440cdf0e10cSrcweir 	}
441cdf0e10cSrcweir 
442cdf0e10cSrcweir 
443cdf0e10cSrcweir 	if( pbTempMem ) {
444cdf0e10cSrcweir 		delete [] pbTempMem;
445cdf0e10cSrcweir 	}
446cdf0e10cSrcweir 
447cdf0e10cSrcweir 	// set to correct unicode size
448cdf0e10cSrcweir 	seqUnicode.realloc( nTargetCount );
449cdf0e10cSrcweir 
450cdf0e10cSrcweir 	return seqUnicode;
451cdf0e10cSrcweir }
452cdf0e10cSrcweir 
453cdf0e10cSrcweir 
454cdf0e10cSrcweir 
455cdf0e10cSrcweir //----------------------------------------------
456cdf0e10cSrcweir //
457cdf0e10cSrcweir // Unicode2TextConverter
458cdf0e10cSrcweir //
459cdf0e10cSrcweir //----------------------------------------------
Unicode2TextConverter(rtl_TextEncoding encoding)460cdf0e10cSrcweir Unicode2TextConverter::Unicode2TextConverter( rtl_TextEncoding encoding )
461cdf0e10cSrcweir {
462cdf0e10cSrcweir 	init( encoding );
463cdf0e10cSrcweir }
464cdf0e10cSrcweir 
465cdf0e10cSrcweir 
~Unicode2TextConverter()466cdf0e10cSrcweir Unicode2TextConverter::~Unicode2TextConverter()
467cdf0e10cSrcweir {
468cdf0e10cSrcweir 	if( m_bInitialized ) {
469cdf0e10cSrcweir 		rtl_destroyUnicodeToTextContext( m_convUnicode2Text , m_contextUnicode2Text );
470cdf0e10cSrcweir 		rtl_destroyUnicodeToTextConverter( m_convUnicode2Text );
471cdf0e10cSrcweir 	}
472cdf0e10cSrcweir }
473cdf0e10cSrcweir 
474cdf0e10cSrcweir 
convert(const sal_Unicode * puSource,sal_Int32 nSourceSize)475cdf0e10cSrcweir Sequence<sal_Int8> Unicode2TextConverter::convert(const sal_Unicode *puSource , sal_Int32 nSourceSize)
476cdf0e10cSrcweir {
477cdf0e10cSrcweir 	sal_Unicode *puTempMem = 0;
478cdf0e10cSrcweir 
479cdf0e10cSrcweir 	if( m_seqSource.getLength() ) {
480cdf0e10cSrcweir 		// For surrogates !
481cdf0e10cSrcweir 		// put old rest and new byte sequence into one array
482cdf0e10cSrcweir 		// In general when surrogates are used, they should be rarely
483cdf0e10cSrcweir 		// cut off between two convert()-calls. So this code is used
484cdf0e10cSrcweir 		// rarely and the extra copy is acceptable.
485cdf0e10cSrcweir 		puTempMem = new sal_Unicode[ nSourceSize + m_seqSource.getLength()];
486cdf0e10cSrcweir 		memcpy( puTempMem ,
487cdf0e10cSrcweir 				m_seqSource.getConstArray() ,
488cdf0e10cSrcweir 				m_seqSource.getLength() * sizeof( sal_Unicode ) );
489cdf0e10cSrcweir 		memcpy(
490cdf0e10cSrcweir 			&(puTempMem[ m_seqSource.getLength() ]) ,
491cdf0e10cSrcweir 			puSource ,
492cdf0e10cSrcweir 			nSourceSize*sizeof( sal_Unicode ) );
493cdf0e10cSrcweir 		puSource = puTempMem;
494cdf0e10cSrcweir 		nSourceSize += m_seqSource.getLength();
495cdf0e10cSrcweir 
496cdf0e10cSrcweir 		m_seqSource = Sequence< sal_Unicode > ();
497cdf0e10cSrcweir 	}
498cdf0e10cSrcweir 
499cdf0e10cSrcweir 
500cdf0e10cSrcweir 	sal_Size nTargetCount = 0;
501cdf0e10cSrcweir 	sal_Size nSourceCount = 0;
502cdf0e10cSrcweir 
503cdf0e10cSrcweir 	sal_uInt32 uiInfo;
504cdf0e10cSrcweir 	sal_Size nSrcCvtChars;
505cdf0e10cSrcweir 
506cdf0e10cSrcweir 	// take nSourceSize * 3 as preference
507cdf0e10cSrcweir 	// this is an upper boundary for converting to utf8,
508cdf0e10cSrcweir 	// which most often used as the target.
509cdf0e10cSrcweir 	sal_Int32 nSeqSize =  nSourceSize * 3;
510cdf0e10cSrcweir 
511cdf0e10cSrcweir 	Sequence<sal_Int8> 	seqText( nSeqSize );
512cdf0e10cSrcweir 	sal_Char *pTarget = (sal_Char *) seqText.getArray();
513cdf0e10cSrcweir 	while( sal_True ) {
514cdf0e10cSrcweir 
515cdf0e10cSrcweir 		nTargetCount += rtl_convertUnicodeToText(
516cdf0e10cSrcweir 									m_convUnicode2Text,
517cdf0e10cSrcweir 									m_contextUnicode2Text,
518cdf0e10cSrcweir 									&( puSource[nSourceCount] ),
519cdf0e10cSrcweir 									nSourceSize - nSourceCount ,
520cdf0e10cSrcweir 									&( pTarget[nTargetCount] ),
521cdf0e10cSrcweir 									nSeqSize - nTargetCount,
522cdf0e10cSrcweir 									RTL_UNICODETOTEXT_FLAGS_UNDEFINED_DEFAULT |
523cdf0e10cSrcweir 									RTL_UNICODETOTEXT_FLAGS_INVALID_DEFAULT ,
524cdf0e10cSrcweir 									&uiInfo,
525cdf0e10cSrcweir 									&nSrcCvtChars);
526cdf0e10cSrcweir 		nSourceCount += nSrcCvtChars;
527cdf0e10cSrcweir 
528cdf0e10cSrcweir 		if( uiInfo & RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL ) {
529cdf0e10cSrcweir 			nSeqSize = nSeqSize *2;
530cdf0e10cSrcweir 			seqText.realloc( nSeqSize );  // double array size
531cdf0e10cSrcweir 			pTarget = ( sal_Char * ) seqText.getArray();
532cdf0e10cSrcweir 			continue;
533cdf0e10cSrcweir 		}
534cdf0e10cSrcweir 		break;
535cdf0e10cSrcweir 	}
536cdf0e10cSrcweir 
537cdf0e10cSrcweir 	// for surrogates
538cdf0e10cSrcweir 	if( uiInfo & RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL ) {
539cdf0e10cSrcweir 		m_seqSource.realloc( nSourceSize - nSourceCount );
540cdf0e10cSrcweir 		memcpy( m_seqSource.getArray() ,
541cdf0e10cSrcweir 				&(puSource[nSourceCount]),
542cdf0e10cSrcweir 				(nSourceSize - nSourceCount) * sizeof( sal_Unicode ) );
543cdf0e10cSrcweir 	}
544cdf0e10cSrcweir 
545cdf0e10cSrcweir 	if( puTempMem ) {
546cdf0e10cSrcweir 		delete [] puTempMem;
547cdf0e10cSrcweir 	}
548cdf0e10cSrcweir 
549cdf0e10cSrcweir 	// reduce the size of the buffer (fast, no copy necessary)
550cdf0e10cSrcweir 	seqText.realloc( nTargetCount );
551cdf0e10cSrcweir 
552cdf0e10cSrcweir 	return seqText;
553cdf0e10cSrcweir }
554cdf0e10cSrcweir 
init(rtl_TextEncoding encoding)555cdf0e10cSrcweir void Unicode2TextConverter::init( rtl_TextEncoding encoding )
556cdf0e10cSrcweir {
557cdf0e10cSrcweir 	m_bCanContinue = sal_True;
558cdf0e10cSrcweir 	m_bInitialized = sal_True;
559cdf0e10cSrcweir 
560cdf0e10cSrcweir 	m_convUnicode2Text 	= rtl_createUnicodeToTextConverter( encoding );
561cdf0e10cSrcweir 	m_contextUnicode2Text = rtl_createUnicodeToTextContext( m_convUnicode2Text );
562cdf0e10cSrcweir 	m_rtlEncoding = encoding;
563cdf0e10cSrcweir };
564cdf0e10cSrcweir 
565cdf0e10cSrcweir 
566cdf0e10cSrcweir }
567