1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26
27 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
28
29 #include <stdio.h> // for EOF
30 #include <rtl/tencinfo.h>
31 #include <tools/stream.hxx>
32 #include <tools/debug.hxx>
33 #include <svtools/rtftoken.h>
34 #include <svtools/rtfkeywd.hxx>
35 #include <svtools/parrtf.hxx>
36
// Upper bound for the number of characters ScanText() collects per call.
const int MAX_STRING_LEN = 1024;
// Size of the chunk buffer used by _GetNextToken() while reading a control word.
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification.
// The argument is fully parenthesised so that expressions containing
// lower-precedence operators (e.g. ?:) are classified correctly.
// NOTE: the argument is evaluated more than once - do not pass expressions
// with side effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ( (c) >= 'A' && (c) <= 'Z' ) || ( (c) >= 'a' && (c) <= 'z' ) )
42
SvRTFParser(SvStream & rIn,sal_uInt8 nStackSize)43 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
44 : SvParser( rIn, nStackSize ),
45 eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), // default ist ANSI-CodeSet
46 nUCharOverread( 1 )
47 {
48 // default ist ANSI-CodeSet
49 SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
50 bRTF_InTextRead = false;
51 }
52
~SvRTFParser()53 SvRTFParser::~SvRTFParser()
54 {
55 }
56
57
58
59
_GetNextToken()60 int SvRTFParser::_GetNextToken()
61 {
62 int nRet = 0;
63 do {
64 int bNextCh = true;
65 switch( nNextCh )
66 {
67 case '\\':
68 {
69 // Steuerzeichen
70 switch( nNextCh = GetNextChar() )
71 {
72 case '{':
73 case '}':
74 case '\\':
75 case '+': // habe ich in einem RTF-File gefunden
76 case '~': // nonbreaking space
77 case '-': // optional hyphen
78 case '_': // nonbreaking hyphen
79 case '\'': // HexValue
80 nNextCh = '\\';
81 rInput.SeekRel( -1 );
82 ScanText();
83 nRet = RTF_TEXTTOKEN;
84 bNextCh = 0 == nNextCh;
85 break;
86
87 case '*': // ignoreflag
88 nRet = RTF_IGNOREFLAG;
89 break;
90 case ':': // subentry in an index entry
91 nRet = RTF_SUBENTRYINDEX;
92 break;
93 case '|': // formula-charakter
94 nRet = RTF_FORMULA;
95 break;
96
97 case 0x0a:
98 case 0x0d:
99 nRet = RTF_PAR;
100 break;
101
102 default:
103 if( RTF_ISALPHA( nNextCh ) )
104 {
105 aToken = '\\';
106 {
107 String aStrBuffer;
108 sal_Unicode* pStr = aStrBuffer.AllocBuffer(
109 MAX_TOKEN_LEN );
110 xub_StrLen nStrLen = 0;
111 do {
112 *(pStr + nStrLen++) = nNextCh;
113 if( MAX_TOKEN_LEN == nStrLen )
114 {
115 aToken += aStrBuffer;
116 aToken.GetBufferAccess(); // make unique string!
117 nStrLen = 0;
118 }
119 nNextCh = GetNextChar();
120 } while( RTF_ISALPHA( nNextCh ) );
121 if( nStrLen )
122 {
123 aStrBuffer.ReleaseBufferAccess( nStrLen );
124 aToken += aStrBuffer;
125 }
126 }
127
128 // Minus fuer numerischen Parameter
129 int bNegValue = false;
130 if( '-' == nNextCh )
131 {
132 bNegValue = true;
133 nNextCh = GetNextChar();
134 }
135
136 // evt. Numerischer Parameter
137 if( RTF_ISDIGIT( nNextCh ) )
138 {
139 nTokenValue = 0;
140 do {
141 nTokenValue *= 10;
142 nTokenValue += nNextCh - '0';
143 nNextCh = GetNextChar();
144 } while( RTF_ISDIGIT( nNextCh ) );
145 if( bNegValue )
146 nTokenValue = -nTokenValue;
147 bTokenHasValue=true;
148 }
149 else if( bNegValue ) // das Minus wieder zurueck
150 {
151 nNextCh = '-';
152 rInput.SeekRel( -1 );
153 }
154 if( ' ' == nNextCh ) // Blank gehoert zum Token!
155 nNextCh = GetNextChar();
156
157 // suche das Token in der Tabelle:
158 if( 0 == (nRet = GetRTFToken( aToken )) )
159 // Unknown Control
160 nRet = RTF_UNKNOWNCONTROL;
161
162 // bug 76812 - unicode token handled as normal text
163 bNextCh = false;
164 switch( nRet )
165 {
166 case RTF_UC:
167 if( 0 <= nTokenValue )
168 {
169 nUCharOverread = (sal_uInt8)nTokenValue;
170 #if 1
171 //cmc: other ifdef breaks #i3584
172 aParserStates.top().
173 nUCharOverread = nUCharOverread;
174 #else
175 if( !nUCharOverread )
176 nUCharOverread = aParserStates.top().nUCharOverread;
177 else
178 aParserStates.top().
179 nUCharOverread = nUCharOverread;
180 #endif
181 }
182 aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
183 // read next token
184 nRet = 0;
185 break;
186
187 case RTF_UPR:
188 if (!_inSkipGroup) {
189 // UPR - overread the group with the ansi
190 // informations
191 while( '{' != _GetNextToken() )
192 ;
193 SkipGroup();
194 _GetNextToken(); // overread the last bracket
195 nRet = 0;
196 }
197 break;
198
199 case RTF_U:
200 if( !bRTF_InTextRead )
201 {
202 nRet = RTF_TEXTTOKEN;
203 aToken = (sal_Unicode)nTokenValue;
204
205 // overread the next n "RTF" characters. This
206 // can be also \{, \}, \'88
207 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
208 {
209 sal_Unicode cAnsi = nNextCh;
210 while( 0xD == cAnsi )
211 cAnsi = GetNextChar();
212 while( 0xA == cAnsi )
213 cAnsi = GetNextChar();
214
215 if( '\\' == cAnsi &&
216 '\'' == ( cAnsi = GetNextChar() ))
217 // HexValue ueberlesen
218 cAnsi = GetHexValue();
219 nNextCh = GetNextChar();
220 }
221 ScanText();
222 bNextCh = 0 == nNextCh;
223 }
224 break;
225 }
226 }
227 else if( SVPAR_PENDING != eState )
228 {
229 // Bug 34631 - "\ " ueberlesen - Blank als Zeichen
230 // eState = SVPAR_ERROR;
231 bNextCh = false;
232 }
233 break;
234 }
235 }
236 break;
237
238 case sal_Unicode(EOF):
239 eState = SVPAR_ACCEPTED;
240 nRet = nNextCh;
241 break;
242
243 case '{':
244 {
245 if( 0 <= nOpenBrakets )
246 {
247 RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
248 aParserStates.push( aState );
249 }
250 ++nOpenBrakets;
251 DBG_ASSERT(
252 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
253 "ParserStateStack unequal to bracket count" );
254 nRet = nNextCh;
255 }
256 break;
257
258 case '}':
259 --nOpenBrakets;
260 if( 0 <= nOpenBrakets )
261 {
262 aParserStates.pop();
263 if( !aParserStates.empty() )
264 {
265 const RtfParserState_Impl& rRPS =
266 aParserStates.top();
267 nUCharOverread = rRPS.nUCharOverread;
268 SetSrcEncoding( rRPS.eCodeSet );
269 }
270 else
271 {
272 nUCharOverread = 1;
273 SetSrcEncoding( GetCodeSet() );
274 }
275 }
276 DBG_ASSERT(
277 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
278 "ParserStateStack unequal to bracket count" );
279 nRet = nNextCh;
280 break;
281
282 case 0x0d:
283 case 0x0a:
284 break;
285
286 default:
287 // es folgt normaler Text
288 ScanText();
289 nRet = RTF_TEXTTOKEN;
290 bNextCh = 0 == nNextCh;
291 break;
292 }
293
294 if( bNextCh )
295 nNextCh = GetNextChar();
296
297 } while( !nRet && SVPAR_WORKING == eState );
298 return nRet;
299 }
300
301
GetHexValue()302 sal_Unicode SvRTFParser::GetHexValue()
303 {
304 // Hex-Wert sammeln
305 register int n;
306 register sal_Unicode nHexVal = 0;
307
308 for( n = 0; n < 2; ++n )
309 {
310 nHexVal *= 16;
311 nNextCh = GetNextChar();
312 if( nNextCh >= '0' && nNextCh <= '9' )
313 nHexVal += (nNextCh - 48);
314 else if( nNextCh >= 'a' && nNextCh <= 'f' )
315 nHexVal += (nNextCh - 87);
316 else if( nNextCh >= 'A' && nNextCh <= 'F' )
317 nHexVal += (nNextCh - 55);
318 }
319 return nHexVal;
320 }
321
ScanText(const sal_Unicode cBreak)322 void SvRTFParser::ScanText( const sal_Unicode cBreak )
323 {
324 String aStrBuffer;
325 int bWeiter = true;
326 while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
327 {
328 int bNextCh = true;
329 switch( nNextCh )
330 {
331 case '\\':
332 {
333 switch (nNextCh = GetNextChar())
334 {
335 case '\'':
336 {
337
338 #if 0
339 // #i35653 patch from cmc
340 ByteString aByteString(static_cast<char>(GetHexValue()));
341 if (aByteString.Len())
342 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
343 #else
344 ByteString aByteString;
345 while (1)
346 {
347 aByteString.Append((char)GetHexValue());
348
349 bool bBreak = false;
350 sal_Char nSlash = '\\';
351 while (!bBreak)
352 {
353 wchar_t __next=GetNextChar();
354 if (__next>0xFF) // fix for #i43933# and #i35653#
355 {
356 if (aByteString.Len())
357 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
358 aStrBuffer.Append((sal_Unicode)__next);
359
360 aByteString.Erase();
361 continue;
362 }
363 nSlash = (sal_Char)__next;
364 while (nSlash == 0xD || nSlash == 0xA)
365 nSlash = (sal_Char)GetNextChar();
366
367 switch (nSlash)
368 {
369 case '{':
370 case '}':
371 case '\\':
372 bBreak = true;
373 break;
374 default:
375 aByteString.Append(nSlash);
376 break;
377 }
378 }
379
380 nNextCh = GetNextChar();
381
382 if (nSlash != '\\' || nNextCh != '\'')
383 {
384 rInput.SeekRel(-1);
385 nNextCh = nSlash;
386 break;
387 }
388 }
389
390 bNextCh = false;
391
392 if (aByteString.Len())
393 aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
394 #endif
395 }
396 break;
397 case '\\':
398 case '}':
399 case '{':
400 case '+': // habe ich in einem RTF-File gefunden
401 aStrBuffer.Append(nNextCh);
402 break;
403 case '~': // nonbreaking space
404 aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
405 break;
406 case '-': // optional hyphen
407 aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
408 break;
409 case '_': // nonbreaking hyphen
410 aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
411 break;
412
413 case 'u':
414 // UNI-Code Zeichen lesen
415 {
416 nNextCh = GetNextChar();
417 rInput.SeekRel( -2 );
418
419 if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
420 {
421 bRTF_InTextRead = true;
422
423 String sSave( aToken );
424 nNextCh = '\\';
425 #ifdef DBG_UTIL
426 int nToken =
427 #endif
428 _GetNextToken();
429 DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
430 // dont convert symbol chars
431 aStrBuffer.Append(
432 static_cast< sal_Unicode >(nTokenValue));
433
434 // overread the next n "RTF" characters. This
435 // can be also \{, \}, \'88
436 for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
437 {
438 sal_Unicode cAnsi = nNextCh;
439 while( 0xD == cAnsi )
440 cAnsi = GetNextChar();
441 while( 0xA == cAnsi )
442 cAnsi = GetNextChar();
443
444 if( '\\' == cAnsi &&
445 '\'' == ( cAnsi = GetNextChar() ))
446 // HexValue ueberlesen
447 cAnsi = GetHexValue();
448 nNextCh = GetNextChar();
449 }
450 bNextCh = false;
451 aToken = sSave;
452 bRTF_InTextRead = false;
453 }
454 else
455 {
456 nNextCh = '\\';
457 bWeiter = false; // Abbrechen, String zusammen
458 }
459 }
460 break;
461
462 default:
463 rInput.SeekRel( -1 );
464 nNextCh = '\\';
465 bWeiter = false; // Abbrechen, String zusammen
466 break;
467 }
468 }
469 break;
470
471 case sal_Unicode(EOF):
472 eState = SVPAR_ERROR;
473 // weiter
474 case '{':
475 case '}':
476 bWeiter = false;
477 break;
478
479 case 0x0a:
480 case 0x0d:
481 break;
482
483 default:
484 if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
485 bWeiter = false;
486 else
487 {
488 do {
489 // alle anderen Zeichen kommen in den Text
490 aStrBuffer.Append(nNextCh);
491
492 if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
493 {
494 if (aStrBuffer.Len())
495 aToken += aStrBuffer;
496 return;
497 }
498 } while
499 (
500 (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
501 (aStrBuffer.Len() < MAX_STRING_LEN)
502 );
503 bNextCh = false;
504 }
505 }
506
507 if( bWeiter && bNextCh )
508 nNextCh = GetNextChar();
509 }
510
511 if (aStrBuffer.Len())
512 aToken += aStrBuffer;
513 }
514
515
516 short SvRTFParser::_inSkipGroup=0;
517
SkipGroup()518 void SvRTFParser::SkipGroup()
519 {
520 short nBrackets=1;
521 if (_inSkipGroup>0)
522 return;
523 _inSkipGroup++;
524 #if 1 //#i16185# fecking \bin keyword
525 do
526 {
527 switch (nNextCh)
528 {
529 case '{':
530 ++nBrackets;
531 break;
532 case '}':
533 if (!--nBrackets) {
534 _inSkipGroup--;
535 return;
536 }
537 break;
538 }
539 int nToken = _GetNextToken();
540 if (nToken == RTF_BIN)
541 {
542 rInput.SeekRel(-1);
543 rInput.SeekRel(nTokenValue);
544 nNextCh = GetNextChar();
545 }
546 while (nNextCh==0xa || nNextCh==0xd)
547 {
548 nNextCh = GetNextChar();
549 }
550 } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
551 #else
552 sal_Unicode cPrev = 0;
553 do {
554 switch( nNextCh )
555 {
556 case '{':
557 if( '\\' != cPrev )
558 ++nBrackets;
559 break;
560
561 case '}':
562 if( '\\' != cPrev && !--nBrackets )
563 return;
564 break;
565
566 case '\\':
567 if( '\\' == cPrev )
568 nNextCh = 0;
569 break;
570 }
571 cPrev = nNextCh;
572 nNextCh = GetNextChar();
573 } while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
574 #endif
575
576 if( SVPAR_PENDING != eState && '}' != nNextCh )
577 eState = SVPAR_ERROR;
578 _inSkipGroup--;
579 }
580
ReadUnknownData()581 void SvRTFParser::ReadUnknownData() { SkipGroup(); }
ReadBitmapData()582 void SvRTFParser::ReadBitmapData() { SkipGroup(); }
ReadOLEData()583 void SvRTFParser::ReadOLEData() { SkipGroup(); }
584
585
CallParser()586 SvParserState SvRTFParser::CallParser()
587 {
588 sal_Char cFirstCh;
589 nNextChPos = rInput.Tell();
590 rInput >> cFirstCh; nNextCh = cFirstCh;
591 eState = SVPAR_WORKING;
592 nOpenBrakets = 0;
593 SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
594 eUNICodeSet = RTL_TEXTENCODING_MS_1252; // default ist ANSI-CodeSet
595
596 // die 1. beiden Token muessen '{' und \\rtf sein !!
597 if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
598 {
599 AddRef();
600 Continue( 0 );
601 if( SVPAR_PENDING != eState )
602 ReleaseRef(); // dann brauchen wir den Parser nicht mehr!
603 }
604 else
605 eState = SVPAR_ERROR;
606
607 return eState;
608 }
609
Continue(int nToken)610 void SvRTFParser::Continue( int nToken )
611 {
612 // DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
613 // "Zeichensatz wurde geaendert." );
614
615 if( !nToken )
616 nToken = GetNextToken();
617
618 while( IsParserWorking() )
619 {
620 SaveState( nToken );
621 switch( nToken )
622 {
623 case '}':
624 if( nOpenBrakets )
625 goto NEXTTOKEN;
626 eState = SVPAR_ACCEPTED;
627 break;
628
629 case '{':
630 // eine unbekannte Gruppe ?
631 {
632 if( RTF_IGNOREFLAG != GetNextToken() )
633 nToken = SkipToken( -1 );
634 else if( RTF_UNKNOWNCONTROL != GetNextToken() )
635 nToken = SkipToken( -2 );
636 else
637 {
638 // gleich herausfiltern
639 ReadUnknownData();
640 nToken = GetNextToken();
641 if( '}' != nToken )
642 eState = SVPAR_ERROR;
643 break; // auf zum naechsten Token!!
644 }
645 }
646 goto NEXTTOKEN;
647
648 case RTF_UNKNOWNCONTROL:
649 break; // unbekannte Token ueberspringen
650 case RTF_NEXTTYPE:
651 case RTF_ANSITYPE:
652 SetEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
653 break;
654 case RTF_MACTYPE:
655 SetEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
656 break;
657 case RTF_PCTYPE:
658 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
659 break;
660 case RTF_PCATYPE:
661 SetEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
662 break;
663 case RTF_ANSICPG:
664 eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
665 SetEncoding(eCodeSet);
666 break;
667 default:
668 NEXTTOKEN:
669 NextToken( nToken );
670 break;
671 }
672 if( IsParserWorking() )
673 SaveState( 0 ); // bis hierhin abgearbeitet,
674 // weiter mit neuem Token!
675 nToken = GetNextToken();
676 }
677 if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
678 eState = SVPAR_ERROR;
679 }
680
SetEncoding(rtl_TextEncoding eEnc)681 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
682 {
683 if (eEnc == RTL_TEXTENCODING_DONTKNOW)
684 eEnc = GetCodeSet();
685
686 if (!aParserStates.empty())
687 aParserStates.top().eCodeSet = eEnc;
688 SetSrcEncoding(eEnc);
689 }
690
#ifdef USED
// The two forwarders below are compiled only when USED is defined.

/** Forwards to SvParser::SaveState(). */
void SvRTFParser::SaveState( int nToken )
{
    SvParser::SaveState( nToken );
}

/** Forwards to SvParser::RestoreState(). */
void SvRTFParser::RestoreState()
{
    SvParser::RestoreState();
}

#endif
702
703 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
704