xref: /trunk/main/sax/inc/xml2utf.hxx (revision 8d192041)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
// TODO: where should these helpers come from? (They are defined locally here.)
//
// Max( a, b ) / Min( a, b ) yield the larger / smaller of the two arguments.
// NOTE: these are function-like macros, so the selected argument is evaluated
// twice. Do not pass expressions with side effects (e.g. Max( i++, j )).
#define Max( a, b )		(((a)>(b)) ? (a) : (b) )
#define Min( a, b )		(((a)<(b)) ? (a) : (b) )
27 
28 /*
29 *
30 * Text2UnicodeConverter
31 *
32 **/
33 namespace sax_expatwrap {
34 
35 class Text2UnicodeConverter
36 {
37 
38 public:
39 	Text2UnicodeConverter( const ::rtl::OString & sEncoding );
40 	~Text2UnicodeConverter();
41 
42 	::com::sun::star::uno::Sequence < sal_Unicode > convert( const ::com::sun::star::uno::Sequence<sal_Int8> & );
canContinue()43 	sal_Bool canContinue() {  return m_bCanContinue; }
44 
45 private:
46 	void init( rtl_TextEncoding encoding );
47 
48 	rtl_TextToUnicodeConverter 	m_convText2Unicode;
49 	rtl_TextToUnicodeContext   	m_contextText2Unicode;
50 	sal_Bool					m_bCanContinue;
51 	sal_Bool					m_bInitialized;
52 	rtl_TextEncoding			m_rtlEncoding;
53 	::com::sun::star::uno::Sequence<sal_Int8> m_seqSource;
54 };
55 
56 /*----------------------------------------
57 *
58 * Unicode2TextConverter
59 *
60 **-----------------------------------------*/
61 class Unicode2TextConverter
62 {
63 public:
64 	Unicode2TextConverter( rtl_TextEncoding encoding );
65 	~Unicode2TextConverter();
66 
convert(const::rtl::OUString & s)67 	inline ::com::sun::star::uno::Sequence<sal_Int8> convert( const ::rtl::OUString &s )
68 		{
69 			return convert( s.getStr() , s.getLength() );
70 		}
71 	::com::sun::star::uno::Sequence<sal_Int8> convert( const sal_Unicode * , sal_Int32 nLength );
canContinue()72 	sal_Bool canContinue() {  return m_bCanContinue; }
73 
74 private:
75 	void init( rtl_TextEncoding encoding );
76 
77 	rtl_UnicodeToTextConverter 	m_convUnicode2Text;
78 	rtl_UnicodeToTextContext   	m_contextUnicode2Text;
79 	sal_Bool					m_bCanContinue;
80 	sal_Bool					m_bInitialized;
81 	rtl_TextEncoding			m_rtlEncoding;
82 	::com::sun::star::uno::Sequence<sal_Unicode>		m_seqSource;
83 };
84 
85 
86 
87 /*----------------------------------------
88 *
89 * XMLFile2UTFConverter
90 *
91 **-----------------------------------------*/
92 class XMLFile2UTFConverter
93 {
94 public:
XMLFile2UTFConverter()95 	XMLFile2UTFConverter( ):
96 		m_bStarted( sal_False ),
97 		m_pText2Unicode( 0 ),
98 		m_pUnicode2Text( 0 )
99 		{}
100 
101 	~XMLFile2UTFConverter();
102 
setInputStream(::com::sun::star::uno::Reference<::com::sun::star::io::XInputStream> & r)103 	void setInputStream( ::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream > &r ) { m_in = r; }
setEncoding(const::rtl::OString & s)104 	void setEncoding( const ::rtl::OString &s ) { m_sEncoding = s; }
105 
106 
107 
108 	// @param nMaxToRead The number of chars, that should be read. Note that this is no exact number. There
109 	//                   may be returned less or more bytes than ordered.
110 	sal_Int32 readAndConvert( ::com::sun::star::uno::Sequence<sal_Int8> &seq , sal_Int32 nMaxToRead )
111 		throw ( ::com::sun::star::io::IOException,
112 				::com::sun::star::io::NotConnectedException ,
113 				::com::sun::star::io::BufferSizeExceededException ,
114 				::com::sun::star::uno::RuntimeException );
115 
116 private:
117 
118 	// Called only on first Sequence of bytes. Tries to figure out file format and encoding information.
119 	// @return TRUE, when encoding information could be retrieved
120 	// @return FALSE, when no encoding information was found in file
121 	sal_Bool scanForEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
122 
123 	// Called only on first Sequence of bytes. Tries to figure out
124 	// if enough data is available to scan encoding
125 	// @return TRUE, when encoding is retrievable
126 	// @return FALSE, when more data is needed
127 	sal_Bool isEncodingRecognizable( const ::com::sun::star::uno::Sequence< sal_Int8 > & seq );
128 
129 	// When encoding attribute is within the text (in the first line), it is removed.
130 	void removeEncoding( ::com::sun::star::uno::Sequence<sal_Int8> &seq );
131 
132 	// Initializes decoding depending on m_sEncoding setting
133 	void initializeDecoding();
134 private:
135 	::com::sun::star::uno::Reference< ::com::sun::star::io::XInputStream >  m_in;
136 
137 	sal_Bool m_bStarted;
138 	::rtl::OString m_sEncoding;
139 
140 	Text2UnicodeConverter *m_pText2Unicode;
141 	Unicode2TextConverter *m_pUnicode2Text;
142 };
143 }
144