1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26
27 #include <stdio.h> // for EOF
28 #include <rtl/tencinfo.h>
29 #include <tools/stream.hxx>
30 #include <tools/debug.hxx>
31 #include <svtools/rtftoken.h>
32 #include <svtools/rtfkeywd.hxx>
33 #include <svtools/parrtf.hxx>
34
// Upper bound on the number of characters ScanText() collects per text token.
const int MAX_STRING_LEN = 1024;
// Size of the chunk buffer used while collecting a control word.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification.  The argument is fully parenthesized
// so that expression arguments (e.g. "a ? b : c" or "x & 0xff") are
// classified as a whole.  Note that the argument is still evaluated more than
// once, so it must not have side effects.
#define RTF_ISDIGIT( c ) ((c) >= '0' && (c) <= '9')
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
40
SvRTFParser(SvStream & rIn,sal_uInt8 nStackSize)41 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
42 : SvParser( rIn, nStackSize ),
43 eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default is ANSI code set
44 nUCharOverread( 1 )
45 {
46 // default is ANSI code set
47 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
48 bRTF_InTextRead = false;
49 }
50
~SvRTFParser()51 SvRTFParser::~SvRTFParser()
52 {
53 }
54
55
56
57
_GetNextToken()58 int SvRTFParser::_GetNextToken()
59 {
60 int nRet = 0;
61 do {
62 int bNextCh = true;
63 switch( nNextCh )
64 {
65 case '\\':
66 {
67 // Steuerzeichen
68 switch( nNextCh = GetNextChar() )
69 {
70 case '{':
71 case '}':
72 case '\\':
73 case '+': // habe ich in einem RTF-File gefunden
74 case '~': // nonbreaking space
75 case '-': // optional hyphen
76 case '_': // nonbreaking hyphen
77 case '\'': // HexValue
78 nNextCh = '\\';
79 rInput.SeekRel( -1 );
80 ScanText();
81 nRet = RTF_TEXTTOKEN;
82 bNextCh = 0 == nNextCh;
83 break;
84
85 case '*': // ignoreflag
86 nRet = RTF_IGNOREFLAG;
87 break;
88 case ':': // subentry in an index entry
89 nRet = RTF_SUBENTRYINDEX;
90 break;
91 case '|': // formula-character
92 nRet = RTF_FORMULA;
93 break;
94
95 case 0x0a:
96 case 0x0d:
97 nRet = RTF_PAR;
98 break;
99
100 default:
101 if( RTF_ISALPHA( nNextCh ) )
102 {
103 aToken = '\\';
104 {
105 String aStrBuffer;
106 sal_Unicode* pStr = aStrBuffer.AllocBuffer(
107 MAX_TOKEN_LEN );
108 xub_StrLen nStrLen = 0;
109 do {
110 *(pStr + nStrLen++) = nNextCh;
111 if( MAX_TOKEN_LEN == nStrLen )
112 {
113 aToken += aStrBuffer;
114 aToken.GetBufferAccess(); // make unique string!
115 nStrLen = 0;
116 }
117 nNextCh = GetNextChar();
118 } while( RTF_ISALPHA( nNextCh ) );
119 if( nStrLen )
120 {
121 aStrBuffer.ReleaseBufferAccess( nStrLen );
122 aToken += aStrBuffer;
123 }
124 }
125
126 // Minus fuer numerischen Parameter
127 int bNegValue = false;
128 if( '-' == nNextCh )
129 {
130 bNegValue = true;
131 nNextCh = GetNextChar();
132 }
133
134 // evt. Numerischer Parameter
135 if( RTF_ISDIGIT( nNextCh ) )
136 {
137 nTokenValue = 0;
138 do {
139 nTokenValue *= 10;
140 nTokenValue += nNextCh - '0';
141 nNextCh = GetNextChar();
142 } while( RTF_ISDIGIT( nNextCh ) );
143 if( bNegValue )
144 nTokenValue = -nTokenValue;
145 bTokenHasValue=true;
146 }
147 else if( bNegValue ) // das Minus wieder zurueck
148 {
149 nNextCh = '-';
150 rInput.SeekRel( -1 );
151 }
152 if( ' ' == nNextCh ) // Blank gehoert zum Token!
153 nNextCh = GetNextChar();
154
155 // suche das Token in der Tabelle:
156 if( 0 == (nRet = GetRTFToken( aToken )) )
157 // Unknown Control
158 nRet = RTF_UNKNOWNCONTROL;
159
160 // bug 76812 - unicode token handled as normal text
161 bNextCh = false;
162 switch( nRet )
163 {
164 case RTF_UC:
165 if( 0 <= nTokenValue )
166 {
167 nUCharOverread = (sal_uInt8)nTokenValue;
168 #if 1
169 // cmc: other ifdef breaks #i3584
170 aParserStates.top().
171 nUCharOverread = nUCharOverread;
172 #else
173 if( !nUCharOverread )
174 nUCharOverread = aParserStates.top().nUCharOverread;
175 else
176 aParserStates.top().
177 nUCharOverread = nUCharOverread;
178 #endif
179 }
180 aToken.Erase(); // #i47831# erase token to prevent the token from being treated as text
181 // read next token
182 nRet = 0;
183 break;
184
185 case RTF_UPR:
186 if (!_inSkipGroup) {
187 // UPR - overread the group with the ansi
188 // information
189 while( '{' != _GetNextToken() )
190 ;
191 SkipGroup();
192 _GetNextToken(); // overread the last bracket
193 nRet = 0;
194 }
195 break;
196
197 case RTF_U:
198 if( !bRTF_InTextRead )
199 {
200 nRet = RTF_TEXTTOKEN;
201 aToken = (sal_Unicode)nTokenValue;
202
203 // overread the next n "RTF" characters. This
204 // can be also \{, \}, \'88
205 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
206 {
207 sal_Unicode cAnsi = nNextCh;
208 while( 0xD == cAnsi )
209 cAnsi = GetNextChar();
210 while( 0xA == cAnsi )
211 cAnsi = GetNextChar();
212
213 if( '\\' == cAnsi &&
214 '\'' == ( cAnsi = GetNextChar() ))
215 // HexValue ueberlesen
216 cAnsi = GetHexValue();
217 nNextCh = GetNextChar();
218 }
219 ScanText();
220 bNextCh = 0 == nNextCh;
221 }
222 break;
223 }
224 }
225 else if( SVPAR_PENDING != eState )
226 {
227 // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
228 // eState = SVPAR_ERROR;
229 bNextCh = false;
230 }
231 break;
232 }
233 }
234 break;
235
236 case sal_Unicode(EOF):
237 eState = SVPAR_ACCEPTED;
238 nRet = nNextCh;
239 break;
240
241 case '{':
242 {
243 if( 0 <= nOpenBrakets )
244 {
245 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
246 aParserStates.push( aState );
247 }
248 ++nOpenBrakets;
249 DBG_ASSERT(
250 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
251 "ParserStateStack unequal to bracket count" );
252 nRet = nNextCh;
253 }
254 break;
255
256 case '}':
257 --nOpenBrakets;
258 if( 0 <= nOpenBrakets )
259 {
260 aParserStates.pop();
261 if( !aParserStates.empty() )
262 {
263 const RtfParserState_Impl& rRPS =
264 aParserStates.top();
265 nUCharOverread = rRPS.nUCharOverread;
266 SetSrcEncoding( rRPS.eCodeSet );
267 }
268 else
269 {
270 nUCharOverread = 1;
271 SetSrcEncoding( GetCodeSet() );
272 }
273 }
274 DBG_ASSERT(
275 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
276 "ParserStateStack unequal to bracket count" );
277 nRet = nNextCh;
278 break;
279
280 case 0x0d:
281 case 0x0a:
282 break;
283
284 default:
285 // es folgt normaler Text
286 ScanText();
287 nRet = RTF_TEXTTOKEN;
288 bNextCh = 0 == nNextCh;
289 break;
290 }
291
292 if( bNextCh )
293 nNextCh = GetNextChar();
294
295 } while( !nRet && SVPAR_WORKING == eState );
296 return nRet;
297 }
298
299
GetHexValue()300 sal_Unicode SvRTFParser::GetHexValue()
301 {
302 // Hex-Wert sammeln
303 register int n;
304 register sal_Unicode nHexVal = 0;
305
306 for( n = 0; n < 2; ++n )
307 {
308 nHexVal *= 16;
309 nNextCh = GetNextChar();
310 if( nNextCh >= '0' && nNextCh <= '9' )
311 nHexVal += (nNextCh - 48);
312 else if( nNextCh >= 'a' && nNextCh <= 'f' )
313 nHexVal += (nNextCh - 87);
314 else if( nNextCh >= 'A' && nNextCh <= 'F' )
315 nHexVal += (nNextCh - 55);
316 }
317 return nHexVal;
318 }
319
ScanText(const sal_Unicode cBreak)320 void SvRTFParser::ScanText( const sal_Unicode cBreak )
321 {
322 String aStrBuffer;
323 int bWeiter = true;
324 while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
325 {
326 int bNextCh = true;
327 switch( nNextCh )
328 {
329 case '\\':
330 {
331 switch (nNextCh = GetNextChar())
332 {
333 case '\'':
334 {
335
336 #if 0
337 // #i35653 patch from cmc
338 ByteString aByteString(static_cast<char>(GetHexValue()));
339 if (aByteString.Len())
340 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
341 #else
342 ByteString aByteString;
343 while (1)
344 {
345 aByteString.Append((char)GetHexValue());
346
347 bool bBreak = false;
348 sal_Char nSlash = '\\';
349 while (!bBreak)
350 {
351 wchar_t __next=GetNextChar();
352 if (__next>0xFF) // fix for #i43933# and #i35653#
353 {
354 if (aByteString.Len())
355 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
356 aStrBuffer.Append((sal_Unicode)__next);
357
358 aByteString.Erase();
359 continue;
360 }
361 nSlash = (sal_Char)__next;
362 while (nSlash == 0xD || nSlash == 0xA)
363 nSlash = (sal_Char)GetNextChar();
364
365 switch (nSlash)
366 {
367 case '{':
368 case '}':
369 case '\\':
370 bBreak = true;
371 break;
372 default:
373 aByteString.Append(nSlash);
374 break;
375 }
376 }
377
378 nNextCh = GetNextChar();
379
380 if (nSlash != '\\' || nNextCh != '\'')
381 {
382 rInput.SeekRel(-1);
383 nNextCh = nSlash;
384 break;
385 }
386 }
387
388 bNextCh = false;
389
390 if (aByteString.Len())
391 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
392 #endif
393 }
394 break;
395 case '\\':
396 case '}':
397 case '{':
398 case '+': // habe ich in einem RTF-File gefunden
399 aStrBuffer.Append(nNextCh);
400 break;
401 case '~': // nonbreaking space
402 aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
403 break;
404 case '-': // optional hyphen
405 aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
406 break;
407 case '_': // nonbreaking hyphen
408 aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
409 break;
410
411 case 'u':
412 // UNI-Code Zeichen lesen
413 {
414 nNextCh = GetNextChar();
415 rInput.SeekRel( -2 );
416
417 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
418 {
419 bRTF_InTextRead = true;
420
421 String sSave( aToken );
422 nNextCh = '\\';
423 #ifdef DBG_UTIL
424 int nToken =
425 #endif
426 _GetNextToken();
427 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
428 // dont convert symbol chars
429 aStrBuffer.Append(
430 static_cast< sal_Unicode >(nTokenValue));
431
432 // overread the next n "RTF" characters. This
433 // can be also \{, \}, \'88
434 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
435 {
436 sal_Unicode cAnsi = nNextCh;
437 while( 0xD == cAnsi )
438 cAnsi = GetNextChar();
439 while( 0xA == cAnsi )
440 cAnsi = GetNextChar();
441
442 if( '\\' == cAnsi &&
443 '\'' == ( cAnsi = GetNextChar() ))
444 // HexValue ueberlesen
445 cAnsi = GetHexValue();
446 nNextCh = GetNextChar();
447 }
448 bNextCh = false;
449 aToken = sSave;
450 bRTF_InTextRead = false;
451 }
452 else
453 {
454 nNextCh = '\\';
455 bWeiter = false; // Abbrechen, String zusammen
456 }
457 }
458 break;
459
460 default:
461 rInput.SeekRel( -1 );
462 nNextCh = '\\';
463 bWeiter = false; // Abbrechen, String zusammen
464 break;
465 }
466 }
467 break;
468
469 case sal_Unicode(EOF):
470 eState = SVPAR_ERROR;
471 // weiter
472 case '{':
473 case '}':
474 bWeiter = false;
475 break;
476
477 case 0x0a:
478 case 0x0d:
479 break;
480
481 default:
482 if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
483 bWeiter = false;
484 else
485 {
486 do {
487 // alle anderen Zeichen kommen in den Text
488 aStrBuffer.Append(nNextCh);
489
490 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
491 {
492 if (aStrBuffer.Len())
493 aToken += aStrBuffer;
494 return;
495 }
496 } while
497 (
498 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
499 (aStrBuffer.Len() < MAX_STRING_LEN)
500 );
501 bNextCh = false;
502 }
503 }
504
505 if( bWeiter && bNextCh )
506 nNextCh = GetNextChar();
507 }
508
509 if (aStrBuffer.Len())
510 aToken += aStrBuffer;
511 }
512
513
514 short SvRTFParser::_inSkipGroup=0;
515
SkipGroup()516 void SvRTFParser::SkipGroup()
517 {
518 short nBrackets=1;
519 if (_inSkipGroup>0)
520 return;
521 _inSkipGroup++;
522 #if 1 // #i16185# fecking \bin keyword
523 do
524 {
525 switch (nNextCh)
526 {
527 case '{':
528 ++nBrackets;
529 break;
530 case '}':
531 if (!--nBrackets) {
532 _inSkipGroup--;
533 return;
534 }
535 break;
536 }
537 int nToken = _GetNextToken();
538 if (nToken == RTF_BIN)
539 {
540 rInput.SeekRel(-1);
541 rInput.SeekRel(nTokenValue);
542 nNextCh = GetNextChar();
543 }
544 while (nNextCh==0xa || nNextCh==0xd)
545 {
546 nNextCh = GetNextChar();
547 }
548 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
549 #else
550 sal_Unicode cPrev = 0;
551 do {
552 switch( nNextCh )
553 {
554 case '{':
555 if( '\\' != cPrev )
556 ++nBrackets;
557 break;
558
559 case '}':
560 if( '\\' != cPrev && !--nBrackets )
561 return;
562 break;
563
564 case '\\':
565 if( '\\' == cPrev )
566 nNextCh = 0;
567 break;
568 }
569 cPrev = nNextCh;
570 nNextCh = GetNextChar();
571 } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
572 #endif
573
574 if( SVPAR_PENDING != eState && '}' != nNextCh )
575 eState = SVPAR_ERROR;
576 _inSkipGroup--;
577 }
578
ReadUnknownData()579 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
ReadBitmapData()580 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
ReadOLEData()581 void SvRTFParser::ReadOLEData() { SkipGroup(); }
582
583
CallParser()584 SvParserState SvRTFParser::CallParser()
585 {
586 sal_Char cFirstCh;
587 nNextChPos = rInput.Tell();
588 rInput >> cFirstCh; nNextCh = cFirstCh;
589 eState = SVPAR_WORKING;
590 nOpenBrakets = 0;
591 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
592 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet
593
594 // die 1. beiden Token muessen '{' und \\rtf sein !!
595 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
596 {
597 AddRef();
598 Continue( 0 );
599 if( SVPAR_PENDING != eState )
600 ReleaseRef(); // dann brauchen wir den Parser nicht mehr!
601 }
602 else
603 eState = SVPAR_ERROR;
604
605 return eState;
606 }
607
Continue(int nToken)608 void SvRTFParser::Continue( int nToken )
609 {
610 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
611 // "Zeichensatz wurde geaendert." );
612
613 if( !nToken )
614 nToken = GetNextToken();
615
616 while( IsParserWorking() )
617 {
618 SaveState( nToken );
619 switch( nToken )
620 {
621 case '}':
622 if( nOpenBrakets )
623 goto NEXTTOKEN;
624 eState = SVPAR_ACCEPTED;
625 break;
626
627 case '{':
628 // eine unbekannte Gruppe ?
629 {
630 if( RTF_IGNOREFLAG != GetNextToken() )
631 nToken = SkipToken( -1 );
632 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
633 nToken = SkipToken( -2 );
634 else
635 {
636 // gleich herausfiltern
637 ReadUnknownData();
638 nToken = GetNextToken();
639 if( '}' != nToken )
640 eState = SVPAR_ERROR;
641 break; // auf zum naechsten Token!!
642 }
643 }
644 goto NEXTTOKEN;
645
646 case RTF_UNKNOWNCONTROL:
647 break; // unbekannte Token ueberspringen
648 case RTF_NEXTTYPE:
649 case RTF_ANSITYPE:
650 SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
651 break;
652 case RTF_MACTYPE:
653 SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
654 break;
655 case RTF_PCTYPE:
656 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
657 break;
658 case RTF_PCATYPE:
659 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
660 break;
661 case RTF_ANSICPG:
662 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
663 SetEncoding(eCodeSet);
664 break;
665 default:
666 NEXTTOKEN:
667 NextToken( nToken );
668 break;
669 }
670 if( IsParserWorking() )
671 SaveState( 0 ); // bis hierhin abgearbeitet,
672 // weiter mit neuem Token!
673 nToken = GetNextToken();
674 }
675 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
676 eState = SVPAR_ERROR;
677 }
678
SetEncoding(rtl_TextEncoding eEnc)679 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
680 {
681 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
682 eEnc = GetCodeSet();
683
684 if (!aParserStates.empty())
685 aParserStates.top().eCodeSet = eEnc;
686 SetSrcEncoding(eEnc);
687 }
688
#ifdef USED
// Only compiled when USED is defined: plain forwards to the base class.

/** Forwards to SvParser::SaveState() with the given token. */
void SvRTFParser::SaveState( int nToken )
{
    SvParser::SaveState( nToken );
}

/** Forwards to SvParser::RestoreState(). */
void SvRTFParser::RestoreState()
{
    SvParser::RestoreState();
}

#endif
700
701 /* vim: set noet sw=4 ts=4: */
702