1 /************************************************************************* 2 * 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * Copyright 2000, 2010 Oracle and/or its affiliates. 6 * 7 * OpenOffice.org - a multi-platform office productivity suite 8 * 9 * This file is part of OpenOffice.org. 10 * 11 * OpenOffice.org is free software: you can redistribute it and/or modify 12 * it under the terms of the GNU Lesser General Public License version 3 13 * only, as published by the Free Software Foundation. 14 * 15 * OpenOffice.org is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 * GNU Lesser General Public License version 3 for more details 19 * (a copy is included in the LICENSE file that accompanied this code). 20 * 21 * You should have received a copy of the GNU Lesser General Public License 22 * version 3 along with OpenOffice.org. If not, see 23 * <http://www.openoffice.org/license.html> 24 * for a copy of the LGPLv3 License. 25 * 26 ************************************************************************/ 27 28 #include "oox/vml/vmlinputstream.hxx" 29 30 #include <com/sun/star/io/XTextInputStream.hpp> 31 #include <map> 32 #include <string.h> 33 #include <rtl/strbuf.hxx> 34 #include "oox/helper/helper.hxx" 35 #include "oox/helper/textinputstream.hxx" 36 37 namespace oox { 38 namespace vml { 39 40 // ============================================================================ 41 42 using namespace ::com::sun::star::io; 43 using namespace ::com::sun::star::uno; 44 45 using ::rtl::OString; 46 using ::rtl::OStringBuffer; 47 48 // ============================================================================ 49 50 namespace { 51 52 inline const sal_Char* lclFindCharacter( const sal_Char* pcBeg, const sal_Char* pcEnd, sal_Char cChar ) 53 { 54 sal_Int32 nIndex = rtl_str_indexOfChar_WithLength( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ), cChar ); 55 return (nIndex < 0) ? pcEnd : (pcBeg + nIndex); 56 } 57 58 inline bool lclIsWhiteSpace( sal_Char cChar ) 59 { 60 return cChar < 32; 61 } 62 63 const sal_Char* lclFindWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd ) 64 { 65 for( ; pcBeg < pcEnd; ++pcBeg ) 66 if( lclIsWhiteSpace( *pcBeg ) ) 67 return pcBeg; 68 return pcEnd; 69 } 70 71 const sal_Char* lclFindNonWhiteSpace( const sal_Char* pcBeg, const sal_Char* pcEnd ) 72 { 73 for( ; pcBeg < pcEnd; ++pcBeg ) 74 if( !lclIsWhiteSpace( *pcBeg ) ) 75 return pcBeg; 76 return pcEnd; 77 } 78 79 const sal_Char* lclTrimWhiteSpaceFromEnd( const sal_Char* pcBeg, const sal_Char* pcEnd ) 80 { 81 while( (pcBeg < pcEnd) && lclIsWhiteSpace( pcEnd[ -1 ] ) ) 82 --pcEnd; 83 return pcEnd; 84 } 85 86 inline void lclAppendToBuffer( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd ) 87 { 88 rBuffer.append( pcBeg, static_cast< sal_Int32 >( pcEnd - pcBeg ) ); 89 } 90 91 // ---------------------------------------------------------------------------- 92 93 void lclProcessAttribs( OStringBuffer& rBuffer, const sal_Char* pcBeg, const sal_Char* pcEnd ) 94 { 95 /* Map attribute names to char-pointer of all attributes. This map is used 96 to find multiple occurences of attributes with the same name. The 97 mapped pointers are used as map key in the next map below. */ 98 typedef ::std::map< OString, const sal_Char* > AttributeNameMap; 99 AttributeNameMap aAttributeNames; 100 101 /* Map the char-pointers of all attributes to the full attribute definition 102 string. This preserves the original order of the used attributes. */ 103 typedef ::std::map< const sal_Char*, OString > AttributeDataMap; 104 AttributeDataMap aAttributes; 105 106 bool bOk = true; 107 const sal_Char* pcNameBeg = pcBeg; 108 while( bOk && (pcNameBeg < pcEnd) ) 109 { 110 // pcNameBeg points to begin of attribute name, find equality sign 111 const sal_Char* pcEqualSign = lclFindCharacter( pcNameBeg, pcEnd, '=' ); 112 if( (bOk = pcEqualSign < pcEnd) == true ) 113 { 114 // find end of attribute name (ignore whitespace between name and equality sign) 115 const sal_Char* pcNameEnd = lclTrimWhiteSpaceFromEnd( pcNameBeg, pcEqualSign ); 116 if( (bOk = pcNameBeg < pcNameEnd) == true ) 117 { 118 // find begin of attribute value (must be single or double quote) 119 const sal_Char* pcValueBeg = lclFindNonWhiteSpace( pcEqualSign + 1, pcEnd ); 120 if( (bOk = (pcValueBeg < pcEnd) && ((*pcValueBeg == '\'') || (*pcValueBeg == '"'))) == true ) 121 { 122 // find end of attribute value (matching quote character) 123 const sal_Char* pcValueEnd = lclFindCharacter( pcValueBeg + 1, pcEnd, *pcValueBeg ); 124 if( (bOk = pcValueEnd < pcEnd) == true ) 125 { 126 ++pcValueEnd; 127 OString aAttribName( pcNameBeg, static_cast< sal_Int32 >( pcNameEnd - pcNameBeg ) ); 128 OString aAttribData( pcNameBeg, static_cast< sal_Int32 >( pcValueEnd - pcNameBeg ) ); 129 // search for an existing attribute with the same name 130 AttributeNameMap::iterator aIt = aAttributeNames.find( aAttribName ); 131 // remove its definition from the data map 132 if( aIt != aAttributeNames.end() ) 133 aAttributes.erase( aIt->second ); 134 // insert the attribute into both maps 135 aAttributeNames[ aAttribName ] = pcNameBeg; 136 aAttributes[ pcNameBeg ] = aAttribData; 137 // continue with next attribute (skip whitespace after this attribute) 138 pcNameBeg = pcValueEnd; 139 if( (pcNameBeg < pcEnd) && ((bOk = lclIsWhiteSpace( *pcNameBeg )) == true) ) 140 pcNameBeg = lclFindNonWhiteSpace( pcNameBeg + 1, pcEnd ); 141 } 142 } 143 } 144 } 145 } 146 147 // if no error has occured, build the resulting attribute list 148 if( bOk ) 149 for( AttributeDataMap::iterator aIt = aAttributes.begin(), aEnd = aAttributes.end(); aIt != aEnd; ++aIt ) 150 rBuffer.append( ' ' ).append( aIt->second ); 151 // on error, just append the complete passed string 152 else 153 lclAppendToBuffer( rBuffer, pcBeg, pcEnd ); 154 } 155 156 void lclProcessElement( OStringBuffer& rBuffer, const OString& rElement ) 157 { 158 // check that passed string starts and ends with the brackets of an XML element 159 sal_Int32 nElementLen = rElement.getLength(); 160 if( nElementLen == 0 ) 161 return; 162 163 const sal_Char* pcOpen = rElement.getStr(); 164 const sal_Char* pcClose = pcOpen + nElementLen - 1; 165 166 // no complete element found 167 if( (pcOpen >= pcClose) || (*pcOpen != '<') || (*pcClose != '>') ) 168 { 169 // just append all passed characters 170 rBuffer.append( rElement ); 171 } 172 173 // skip parser instructions: '<![...]>' 174 else if( (nElementLen >= 5) && (pcOpen[ 1 ] == '!') && (pcOpen[ 2 ] == '[') && (pcClose[ -1 ] == ']') ) 175 { 176 // do nothing 177 } 178 179 // replace '<br>' element with newline 180 else if( (nElementLen >= 4) && (pcOpen[ 1 ] == 'b') && (pcOpen[ 2 ] == 'r') && (lclFindNonWhiteSpace( pcOpen + 3, pcClose ) == pcClose) ) 181 { 182 rBuffer.append( '\n' ); 183 } 184 185 // check start elements and simple elements for repeated attributes 186 else if( pcOpen[ 1 ] != '/' ) 187 { 188 // find positions of text content inside brackets, exclude '/' in '<simpleelement/>' 189 const sal_Char* pcContentBeg = pcOpen + 1; 190 bool bIsEmptyElement = pcClose[ -1 ] == '/'; 191 const sal_Char* pcContentEnd = bIsEmptyElement ? (pcClose - 1) : pcClose; 192 // append opening bracket and element name to buffer 193 const sal_Char* pcWhiteSpace = lclFindWhiteSpace( pcContentBeg, pcContentEnd ); 194 lclAppendToBuffer( rBuffer, pcOpen, pcWhiteSpace ); 195 // find begin of attributes, and process all attributes 196 const sal_Char* pcAttribBeg = lclFindNonWhiteSpace( pcWhiteSpace, pcContentEnd ); 197 if( pcAttribBeg < pcContentEnd ) 198 lclProcessAttribs( rBuffer, pcAttribBeg, pcContentEnd ); 199 // close the element 200 if( bIsEmptyElement ) 201 rBuffer.append( '/' ); 202 rBuffer.append( '>' ); 203 } 204 205 // append end elements without further processing 206 else 207 { 208 rBuffer.append( rElement ); 209 } 210 } 211 212 bool lclProcessCharacters( OStringBuffer& rBuffer, const OString& rChars ) 213 { 214 /* MSO has a very weird way to store and handle whitespaces. The stream 215 may contain lots of spaces, tabs, and newlines which have to be handled 216 as single space character. This will be done in this function. 217 218 If the element text contains a literal line break, it will be stored as 219 <br> tag (without matching </br> element). This input stream wrapper 220 will replace this element with a literal LF character (see below). 221 222 A single space character for its own is stored as is. Example: The 223 element 224 <font> </font> 225 represents a single space character. The XML parser will ignore this 226 space character completely without issuing a 'characters' event. The 227 VML import filter implementation has to react on this case manually. 228 229 A single space character following another character is stored 230 literally and must not be stipped away here. Example: The element 231 <font>abc </font> 232 contains the three letters a, b, and c, followed by a space character. 233 234 Consecutive space characters, or a leading single space character, are 235 stored in a <span> element. If there are N space characters (N > 1), 236 then the <span> element contains exactly (N-1) NBSP (non-breaking 237 space) characters, followed by a regular space character. Examples: 238 The element 239 <font><span style='mso-spacerun:yes'>\xA0\xA0\xA0 </span></font> 240 represents 4 consecutive space characters. Has to be handled by the 241 implementation. The element 242 <font><span style='mso-spacerun:yes'> abc</span></font> 243 represents a space characters followed by the letters a, b, c. These 244 strings have to be handled by the VML import filter implementation. 245 */ 246 247 // passed string ends with the leading opening bracket of an XML element 248 const sal_Char* pcBeg = rChars.getStr(); 249 const sal_Char* pcEnd = pcBeg + rChars.getLength(); 250 bool bHasBracket = (pcBeg < pcEnd) && (pcEnd[ -1 ] == '<'); 251 if( bHasBracket ) --pcEnd; 252 253 // skip leading whitespace 254 const sal_Char* pcContentsBeg = lclFindNonWhiteSpace( pcBeg, pcEnd ); 255 while( pcContentsBeg < pcEnd ) 256 { 257 const sal_Char* pcWhitespaceBeg = lclFindWhiteSpace( pcContentsBeg + 1, pcEnd ); 258 lclAppendToBuffer( rBuffer, pcContentsBeg, pcWhitespaceBeg ); 259 if( pcWhitespaceBeg < pcEnd ) 260 rBuffer.append( ' ' ); 261 pcContentsBeg = lclFindNonWhiteSpace( pcWhitespaceBeg, pcEnd ); 262 } 263 264 return bHasBracket; 265 } 266 267 } // namespace 268 269 // ============================================================================ 270 271 InputStream::InputStream( const Reference< XComponentContext >& rxContext, const Reference< XInputStream >& rxInStrm ) : 272 // use single-byte ISO-8859-1 encoding which maps all byte characters to the first 256 Unicode characters 273 mxTextStrm( TextInputStream::createXTextInputStream( rxContext, rxInStrm, RTL_TEXTENCODING_ISO_8859_1 ) ), 274 maOpeningBracket( 1 ), 275 maClosingBracket( 1 ), 276 maOpeningCData( CREATE_OSTRING( "<![CDATA[" ) ), 277 maClosingCData( CREATE_OSTRING( "]]>" ) ), 278 mnBufferPos( 0 ) 279 { 280 maOpeningBracket[ 0 ] = '<'; 281 maClosingBracket[ 0 ] = '>'; 282 } 283 284 InputStream::~InputStream() 285 { 286 } 287 288 sal_Int32 SAL_CALL InputStream::readBytes( Sequence< sal_Int8 >& rData, sal_Int32 nBytesToRead ) 289 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException) 290 { 291 if( nBytesToRead < 0 ) 292 throw IOException(); 293 294 rData.realloc( nBytesToRead ); 295 sal_Int8* pcDest = rData.getArray(); 296 sal_Int32 nRet = 0; 297 while( (nBytesToRead > 0) && !mxTextStrm->isEOF() ) 298 { 299 updateBuffer(); 300 sal_Int32 nReadSize = ::std::min( nBytesToRead, maBuffer.getLength() - mnBufferPos ); 301 if( nReadSize > 0 ) 302 { 303 memcpy( pcDest + nRet, maBuffer.getStr() + mnBufferPos, static_cast< size_t >( nReadSize ) ); 304 mnBufferPos += nReadSize; 305 nBytesToRead -= nReadSize; 306 nRet += nReadSize; 307 } 308 } 309 if( nRet < rData.getLength() ) 310 rData.realloc( nRet ); 311 return nRet; 312 } 313 314 sal_Int32 SAL_CALL InputStream::readSomeBytes( Sequence< sal_Int8 >& rData, sal_Int32 nMaxBytesToRead ) 315 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException) 316 { 317 return readBytes( rData, nMaxBytesToRead ); 318 } 319 320 void SAL_CALL InputStream::skipBytes( sal_Int32 nBytesToSkip ) 321 throw (NotConnectedException, BufferSizeExceededException, IOException, RuntimeException) 322 { 323 if( nBytesToSkip < 0 ) 324 throw IOException(); 325 326 while( (nBytesToSkip > 0) && !mxTextStrm->isEOF() ) 327 { 328 updateBuffer(); 329 sal_Int32 nSkipSize = ::std::min( nBytesToSkip, maBuffer.getLength() - mnBufferPos ); 330 mnBufferPos += nSkipSize; 331 nBytesToSkip -= nSkipSize; 332 } 333 } 334 335 sal_Int32 SAL_CALL InputStream::available() throw (NotConnectedException, IOException, RuntimeException) 336 { 337 updateBuffer(); 338 return maBuffer.getLength() - mnBufferPos; 339 } 340 341 void SAL_CALL InputStream::closeInput() throw (NotConnectedException, IOException, RuntimeException) 342 { 343 mxTextStrm->closeInput(); 344 } 345 346 // private -------------------------------------------------------------------- 347 348 void InputStream::updateBuffer() throw (IOException, RuntimeException) 349 { 350 while( (mnBufferPos >= maBuffer.getLength()) && !mxTextStrm->isEOF() ) 351 { 352 // collect new contents in a string buffer 353 OStringBuffer aBuffer; 354 355 // read and process characters until the opening bracket of the next XML element 356 OString aChars = readToElementBegin(); 357 bool bHasOpeningBracket = lclProcessCharacters( aBuffer, aChars ); 358 359 // read and process characters until (and including) closing bracket (an XML element) 360 OSL_ENSURE( bHasOpeningBracket || mxTextStrm->isEOF(), "InputStream::updateBuffer - missing opening bracket of XML element" ); 361 if( bHasOpeningBracket && !mxTextStrm->isEOF() ) 362 { 363 // read the element text (add the leading opening bracket manually) 364 OString aElement = OString( '<' ) + readToElementEnd(); 365 // check for CDATA part, starting with '<![CDATA[' 366 if( aElement.match( maOpeningCData ) ) 367 { 368 // search the end tag ']]>' 369 while( ((aElement.getLength() < maClosingCData.getLength()) || !aElement.match( maClosingCData, aElement.getLength() - maClosingCData.getLength() )) && !mxTextStrm->isEOF() ) 370 aElement += readToElementEnd(); 371 // copy the entire CDATA part 372 aBuffer.append( aElement ); 373 } 374 else 375 { 376 // no CDATA part - process the contents of the element 377 lclProcessElement( aBuffer, aElement ); 378 } 379 } 380 381 maBuffer = aBuffer.makeStringAndClear(); 382 mnBufferPos = 0; 383 } 384 } 385 386 OString InputStream::readToElementBegin() throw (IOException, RuntimeException) 387 { 388 return OUStringToOString( mxTextStrm->readString( maOpeningBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 ); 389 } 390 391 OString InputStream::readToElementEnd() throw (IOException, RuntimeException) 392 { 393 OString aText = OUStringToOString( mxTextStrm->readString( maClosingBracket, sal_False ), RTL_TEXTENCODING_ISO_8859_1 ); 394 OSL_ENSURE( (aText.getLength() > 0) && (aText[ aText.getLength() - 1 ] == '>'), "InputStream::readToElementEnd - missing closing bracket of XML element" ); 395 return aText; 396 } 397 398 // ============================================================================ 399 400 } // namespace vml 401 } // namespave oox 402