xref: /trunk/main/sax/inc/xml2utf.hxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
// TODO: Where from? (translated from the German "Woher?"; presumably asks
//       which common header these helpers should really come from)
//
// Function-like helpers yielding the larger / smaller of two values.
// Each argument appears twice in the expansion, so an argument with side
// effects (e.g. i++) may be evaluated twice.
#define Max( lhs, rhs )		( ((lhs) > (rhs)) ? (lhs) : (rhs) )
#define Min( lhs, rhs )		( ((lhs) < (rhs)) ? (lhs) : (rhs) )
31 
32 /*
33 *
34 * Text2UnicodeConverter
35 *
36 **/
37 namespace sax_expatwrap {
38 
39 class Text2UnicodeConverter
40 {
41 
42 public:
43 	Text2UnicodeConverter( const ::rtl::OString & sEncoding );
44 	~Text2UnicodeConverter();
45 
46 	::com::sun::star::uno::Sequence < sal_Unicode > convert( const ::com::sun::star::uno::Sequence<sal_Int8> & );
47 	sal_Bool canContinue() {  return m_bCanContinue; }
48 
49 private:
50 	void init( rtl_TextEncoding encoding );
51 
52 	rtl_TextToUnicodeConverter 	m_convText2Unicode;
53 	rtl_TextToUnicodeContext   	m_contextText2Unicode;
54 	sal_Bool					m_bCanContinue;
55 	sal_Bool					m_bInitialized;
56 	rtl_TextEncoding			m_rtlEncoding;
57 	::com::sun::star::uno::Sequence<sal_Int8> m_seqSource;
58 };
59 
60 /*----------------------------------------
61 *
62 * Unicode2TextConverter
63 *
64 **-----------------------------------------*/
65 class Unicode2TextConverter
66 {
67 public:
68 	Unicode2TextConverter( rtl_TextEncoding encoding );
69 	~Unicode2TextConverter();
70 
71 	inline ::com::sun::star::uno::Sequence<sal_Int8> convert( const ::rtl::OUString &s )
72 		{
73 			return convert( s.getStr() , s.getLength() );
74 		}
75 	::com::sun::star::uno::Sequence<sal_Int8> convert( const sal_Unicode * , sal_Int32 nLength );
76 	sal_Bool canContinue() {  return m_bCanContinue; }
77 
78 private:
79 	void init( rtl_TextEncoding encoding );
80 
81 	rtl_UnicodeToTextConverter 	m_convUnicode2Text;
82 	rtl_UnicodeToTextContext   	m_contextUnicode2Text;
83 	sal_Bool					m_bCanContinue;
84 	sal_Bool					m_bInitialized;
85 	rtl_TextEncoding			m_rtlEncoding;
86 	::com::sun::star::uno::Sequence<sal_Unicode>		m_seqSource;
87 };
88 
89 
90 
91 /*----------------------------------------
92 *
93 * XMLFile2UTFConverter
94 *
95 **-----------------------------------------*/
96 class XMLFile2UTFConverter
97 {
98 public:
99 	XMLFile2UTFConverter( ):
100 		m_bStarted( sal_False ),
101 		m_pText2Unicode( 0 ),
102 		m_pUnicode2Text( 0 )
103 		{}
104 
105 	~XMLFile2UTFConverter();
106 
107 	void setInputStream( ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > &r ) { m_in = r; }
108 	void setEncoding( const ::rtl::OString &s ) { m_sEncoding = s; }
109 
110 
111 
112 	// @param nMaxToRead The number of chars, that should be read. Note that this is no exact number. There
113 	//                   may be returned less or more bytes than ordered.
114 	sal_Int32 readAndConvert( ::com::sun::star::uno::Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
115 		throw ( ::com::sun::star::io::IOException,
116 				::com::sun::star::io::NotConnectedException ,
117 				::com::sun::star::io::BufferSizeExceededException ,
118 				::com::sun::star::uno::RuntimeException );
119 
120 private:
121 
122 	// Called only on first Sequence of bytes. Tries to figure out file format and encoding information.
123 	// @return TRUE, when encoding information could be retrieved
124 	// @return FALSE, when no encoding information was found in file
125 	sal_Bool scanForEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
126 
127 	// Called only on first Sequence of bytes. Tries to figure out
128 	// if enough data is available to scan encoding
129 	// @return TRUE, when encoding is retrievable
130 	// @return FALSE, when more data is needed
131 	sal_Bool isEncodingRecognizable( const ::com::sun::star::uno::Sequence< sal_Int8 > & seq );
132 
133 	// When encoding attribute is within the text (in the first line), it is removed.
134 	void removeEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
135 
136 	// Initializes decoding depending on m_sEncoding setting
137 	void initializeDecoding();
138 private:
139 	::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream >  m_in;
140 
141 	sal_Bool m_bStarted;
142 	::rtl::OString m_sEncoding;
143 
144 	Text2UnicodeConverter *m_pText2Unicode;
145 	Unicode2TextConverter *m_pUnicode2Text;
146 };
147 }
148