xref: /aoo42x/main/oox/source/vml/vmlinputstream.cxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 #include "oox/vml/vmlinputstream.hxx"
29 
30 #include <com/sun/star/io/XTextInputStream.hpp>
31 #include <map>
32 #include <string.h>
33 #include <rtl/strbuf.hxx>
34 #include "oox/helper/helper.hxx"
35 #include "oox/helper/textinputstream.hxx"
36 
37 namespace oox {
38 namespace vml {
39 
40 // ============================================================================
41 
42 using namespace ::com::sun::star::io;
43 using namespace ::com::sun::star::uno;
44 
45 using ::rtl::OString;
46 using ::rtl::OStringBuffer;
47 
48 // ============================================================================
49 
50 namespace {
51 
52 inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar )
53 {
54     sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar );
55     return (nIndex < 0) ? pcEnd : (pcBeg + nIndex);
56 }
57 
58 inline bool lclIsWhiteSpace( sal_Char cChar )
59 {
60     return cChar < 32;
61 }
62 
63 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
64 {
65     for( ; pcBeg < pcEnd; ++pcBeg )
66         if( lclIsWhiteSpace( *pcBeg ) )
67             return pcBeg;
68     return pcEnd;
69 }
70 
71 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd )
72 {
73     for( ; pcBeg < pcEnd; ++pcBeg )
74         if( !lclIsWhiteSpace( *pcBeg ) )
75             return pcBeg;
76     return pcEnd;
77 }
78 
79 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd )
80 {
81     while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) )
82         --pcEnd;
83     return pcEnd;
84 }
85 
86 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
87 {
88     rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) );
89 }
90 
91 // ----------------------------------------------------------------------------
92 
93 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd )
94 {
95     /*  Map attribute names to char-pointer of all attributes. This map is used
96         to find multiple occurences of attributes with the same name. The
97         mapped pointers are used as map key in the next map below. */
98     typedef ::std::map< OString, const sal_Char* > AttributeNameMap;
99     AttributeNameMap aAttributeNames;
100 
101     /*  Map the char-pointers of all attributes to the full attribute definition
102         string. This preserves the original order of the used attributes. */
103     typedef ::std::map< const sal_Char*, OString > AttributeDataMap;
104     AttributeDataMap aAttributes;
105 
106     bool bOk = true;
107     const sal_Char* pcNameBeg = pcBeg;
108     while( bOk && (pcNameBeg < pcEnd) )
109     {
110         // pcNameBeg points to begin of attribute name, find equality sign
111         const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' );
112         if( (bOk = pcEqualSign < pcEnd) == true )
113         {
114             // find end of attribute name (ignore whitespace between name and equality sign)
115             const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign );
116             if( (bOk = pcNameBeg < pcNameEnd) == true )
117             {
118                 // find begin of attribute value (must be single or double quote)
119                 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd );
120                 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true )
121                 {
122                     // find end of attribute value (matching quote character)
123                     const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg );
124                     if( (bOk = pcValueEnd < pcEnd) == true )
125                     {
126                         ++pcValueEnd;
127                         OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) );
128                         OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) );
129                         // search for an existing attribute with the same name
130                         AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName );
131                         // remove its definition from the data map
132                         if( aIt != aAttributeNames.end() )
133                             aAttributes.erase( aIt->second );
134                         // insert the attribute into both maps
135                         aAttributeNames[ aAttribName ] = pcNameBeg;
136                         aAttributes[ pcNameBeg ] = aAttribData;
137                         // continue with next attribute (skip whitespace after this attribute)
138                         pcNameBeg = pcValueEnd;
139                         if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) )
140                             pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd );
141                     }
142                 }
143             }
144         }
145     }
146 
147     // if no error has occured, build the resulting attribute list
148     if( bOk )
149         for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt )
150             rBuffer.append( ' ' ).append( aIt->second );
151     // on error, just append the complete passed string
152     else
153         lclAppendToBuffer( rBuffer, pcBeg, pcEnd );
154 }
155 
156 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement )
157 {
158     // check that passed string starts and ends with the brackets of an XML element
159     sal_Int32 nElementLen = rElement.getLength();
160     if( nElementLen == 0 )
161         return;
162 
163     const sal_Char* pcOpen = rElement.getStr();
164     const sal_Char* pcClose = pcOpen + nElementLen - 1;
165 
166     // no complete element found
167     if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') )
168     {
169         // just append all passed characters
170         rBuffer.append( rElement );
171     }
172 
173     // skip parser instructions: '<![...]>'
174     else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') )
175     {
176         // do nothing
177     }
178 
179     // replace '<br>' element with newline
180     else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) )
181     {
182         rBuffer.append( '\n' );
183     }
184 
185     // check start elements and simple elements for repeated attributes
186     else if( pcOpen[ 1 ] != '/' )
187     {
188         // find positions of text content inside brackets, exclude '/' in '<simpleelement/>'
189         const sal_Char* pcContentBeg = pcOpen + 1;
190         bool bIsEmptyElement = pcClose[ -1 ] == '/';
191         const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose;
192         // append opening bracket and element name to buffer
193         const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd );
194         lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace );
195         // find begin of attributes, and process all attributes
196         const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd );
197         if( pcAttribBeg < pcContentEnd )
198             lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd );
199         // close the element
200         if( bIsEmptyElement )
201             rBuffer.append( '/' );
202         rBuffer.append( '>' );
203     }
204 
205     // append end elements without further processing
206     else
207     {
208         rBuffer.append( rElement );
209     }
210 }
211 
212 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars )
213 {
214     /*  MSO has a very weird way to store and handle whitespaces. The stream
215         may contain lots of spaces, tabs, and newlines which have to be handled
216         as single space character. This will be done in this function.
217 
218         If the element text contains a literal line break, it will be stored as
219         <br> tag (without matching </br> element). This input stream wrapper
220         will replace this element with a literal LF character (see below).
221 
222         A single space character for its own is stored as is. Example: The
223         element
224             <font> </font>
225         represents a single space character. The XML parser will ignore this
226         space character completely without issuing a 'characters' event. The
227         VML import filter implementation has to react on this case manually.
228 
229         A single space character following another character is stored
230         literally and must not be stipped away here. Example: The element
231             <font>abc </font>
232         contains the three letters a, b, and c, followed by a space character.
233 
234         Consecutive space characters, or a leading single space character, are
235         stored in a <span> element. If there are N space characters (N > 1),
236         then the <span> element contains exactly (N-1) NBSP (non-breaking
237         space) characters, followed by a regular space character. Examples:
238         The element
239             <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font>
240         represents 4 consecutive space characters. Has to be handled by the
241         implementation. The element
242             <font><span style='mso-spacerun:yes'> abc</span></font>
243         represents a space characters followed by the letters a, b, c. These
244         strings have to be handled by the VML import filter implementation.
245      */
246 
247     // passed string ends with the leading opening bracket of an XML element
248     const sal_Char* pcBeg = rChars.getStr();
249     const sal_Char* pcEnd = pcBeg + rChars.getLength();
250     bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<');
251     if( bHasBracket ) --pcEnd;
252 
253     // skip leading whitespace
254     const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd );
255     while( pcContentsBeg < pcEnd )
256     {
257         const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd );
258         lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg );
259         if( pcWhitespaceBeg < pcEnd )
260             rBuffer.append( ' ' );
261         pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd );
262     }
263 
264     return bHasBracket;
265 }
266 
267 } // namespace
268 
269 // ============================================================================
270 
271 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) :
272     // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters
273     mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ),
274     maOpeningBracket( 1 ),
275     maClosingBracket( 1 ),
276     maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ),
277     maClosingCData( CREATE_OSTRING( "]]>" ) ),
278     mnBufferPos( 0 )
279 {
280     maOpeningBracket[ 0 ] = '<';
281     maClosingBracket[ 0 ] = '>';
282 }
283 
284 InputStream::~InputStream()
285 {
286 }
287 
288 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead )
289         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
290 {
291     if( nBytesToRead < 0 )
292         throw IOException();
293 
294     rData.realloc( nBytesToRead );
295     sal_Int8* pcDest = rData.getArray();
296     sal_Int32 nRet = 0;
297     while( (nBytesToRead > 0) && !mxTextStrm->isEOF() )
298     {
299         updateBuffer();
300         sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos );
301         if( nReadSize > 0 )
302         {
303             memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) );
304             mnBufferPos += nReadSize;
305             nBytesToRead -= nReadSize;
306             nRet += nReadSize;
307         }
308     }
309     if( nRet < rData.getLength() )
310         rData.realloc( nRet );
311     return nRet;
312 }
313 
314 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead )
315         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
316 {
317     return readBytes( rData, nMaxBytesToRead );
318 }
319 
320 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip )
321         throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException)
322 {
323     if( nBytesToSkip < 0 )
324         throw IOException();
325 
326     while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() )
327     {
328         updateBuffer();
329         sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos );
330         mnBufferPos += nSkipSize;
331         nBytesToSkip -= nSkipSize;
332     }
333 }
334 
335 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException)
336 {
337     updateBuffer();
338     return maBuffer.getLength() - mnBufferPos;
339 }
340 
341 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException)
342 {
343     mxTextStrm->closeInput();
344 }
345 
346 // private --------------------------------------------------------------------
347 
348 void InputStream::updateBuffer() throw (IOException, RuntimeException)
349 {
350     while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() )
351     {
352         // collect new contents in a string buffer
353         OStringBuffer aBuffer;
354 
355         // read and process characters until the opening bracket of the next XML element
356         OString aChars = readToElementBegin();
357         bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars );
358 
359         // read and process characters until (and including) closing bracket (an XML element)
360         OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" );
361         if( bHasOpeningBracket && !mxTextStrm->isEOF() )
362         {
363             // read the element text (add the leading opening bracket manually)
364             OString aElement = OString( '<' ) + readToElementEnd();
365             // check for CDATA part, starting with '<![CDATA['
366             if( aElement.match( maOpeningCData ) )
367             {
368                 // search the end tag ']]>'
369                 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.match( maClosingCData, aElement.getLength() - maClosingCData.getLength() )) && !mxTextStrm->isEOF() )
370                     aElement += readToElementEnd();
371                 // copy the entire CDATA part
372                 aBuffer.append( aElement );
373             }
374             else
375             {
376                 // no CDATA part - process the contents of the element
377                 lclProcessElement( aBuffer, aElement );
378             }
379         }
380 
381         maBuffer = aBuffer.makeStringAndClear();
382         mnBufferPos = 0;
383     }
384 }
385 
386 OString InputStream::readToElementBegin() throw (IOException, RuntimeException)
387 {
388     return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
389 }
390 
391 OString InputStream::readToElementEnd() throw (IOException, RuntimeException)
392 {
393     OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 );
394     OSL_ENSURE( (aText.getLength() > 0) && (aText[ aText.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" );
395     return aText;
396 }
397 
398 // ============================================================================
399 
400 } // namespace vml
401 } // namespace oox
402