xref: /trunk/main/svtools/source/svrtf/parrtf.cxx (revision 5900e8ec)
1 /**************************************************************
2  *
3  * Licensed to the Apache Software Foundation (ASF) under one
4  * or more contributor license agreements.  See the NOTICE file
5  * distributed with this work for additional information
6  * regarding copyright ownership.  The ASF licenses this file
7  * to you under the Apache License, Version 2.0 (the
8  * "License"); you may not use this file except in compliance
9  * with the License.  You may obtain a copy of the License at
10  *
11  *   http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing,
14  * software distributed under the License is distributed on an
15  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16  * KIND, either express or implied.  See the License for the
17  * specific language governing permissions and limitations
18  * under the License.
19  *
20  *************************************************************/
21 
22 
23 
24 // MARKER(update_precomp.py): autogen include statement, do not remove
25 #include "precompiled_svtools.hxx"
26 
27 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
28 
29 #include <stdio.h>		                // for EOF
30 #include <rtl/tencinfo.h>
31 #include <tools/stream.hxx>
32 #include <tools/debug.hxx>
33 #include <svtools/rtftoken.h>
34 #include <svtools/rtfkeywd.hxx>
35 #include <svtools/parrtf.hxx>
36 
// Upper bound (in characters) for one text run collected by ScanText().
const int MAX_STRING_LEN = 1024;
// Chunk size (in characters) of the scratch buffer used while reading a
// control word in _GetNextToken().
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification. The argument is fully parenthesized
// so that expression arguments (e.g. a conditional expression) expand
// correctly. NOTE: the argument is still evaluated twice - do not pass
// expressions with side effects.
#define RTF_ISDIGIT( c ) ( ((c) >= '0') && ((c) <= '9') )
#define RTF_ISALPHA( c ) ( (((c) >= 'A') && ((c) <= 'Z')) || (((c) >= 'a') && ((c) <= 'z')) )
42 
43 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
44 	: SvParser( rIn, nStackSize ),
45 	eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), 	// default ist ANSI-CodeSet
46 	nUCharOverread( 1 )
47 {
48 	// default ist ANSI-CodeSet
49 	SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
50 	bRTF_InTextRead = false;
51 }
52 
53 SvRTFParser::~SvRTFParser()
54 {
55 }
56 
57 
58 
59 
60 int SvRTFParser::_GetNextToken()
61 {
62 	int nRet = 0;
63 	do {
64 		int bNextCh = true;
65 		switch( nNextCh )
66 		{
67 		case '\\':
68 			{
69 				// Steuerzeichen
70 				switch( nNextCh = GetNextChar() )
71 				{
72 				case '{':
73 				case '}':
74 				case '\\':
75 				case '+':		// habe ich in einem RTF-File gefunden
76 				case '~':		// nonbreaking space
77 				case '-':		// optional hyphen
78 				case '_':		// nonbreaking hyphen
79 				case '\'':		// HexValue
80 					nNextCh = '\\';
81 					rInput.SeekRel( -1 );
82 					ScanText();
83 					nRet = RTF_TEXTTOKEN;
84 					bNextCh = 0 == nNextCh;
85 					break;
86 
87 				case '*':		// ignoreflag
88 					nRet = RTF_IGNOREFLAG;
89 					break;
90 				case ':':	 	// subentry in an index entry
91 					nRet = RTF_SUBENTRYINDEX;
92 					break;
93 				case '|':		// formula-charakter
94 					nRet = RTF_FORMULA;
95 					break;
96 
97 				case 0x0a:
98 				case 0x0d:
99 					nRet = RTF_PAR;
100 					break;
101 
102 				default:
103 					if( RTF_ISALPHA( nNextCh ) )
104 					{
105 						aToken = '\\';
106 						{
107 							String aStrBuffer;
108 							sal_Unicode* pStr = aStrBuffer.AllocBuffer(
109 															MAX_TOKEN_LEN );
110 							xub_StrLen nStrLen = 0;
111 							do {
112 								*(pStr + nStrLen++) = nNextCh;
113 								if( MAX_TOKEN_LEN == nStrLen )
114 								{
115 									aToken += aStrBuffer;
116 									aToken.GetBufferAccess();  // make unique string!
117 									nStrLen = 0;
118 								}
119 								nNextCh = GetNextChar();
120 							} while( RTF_ISALPHA( nNextCh ) );
121 							if( nStrLen )
122 							{
123 								aStrBuffer.ReleaseBufferAccess( nStrLen );
124 								aToken += aStrBuffer;
125 							}
126 						}
127 
128 						// Minus fuer numerischen Parameter
129 						int bNegValue = false;
130 						if( '-' == nNextCh )
131 						{
132 							bNegValue = true;
133 							nNextCh = GetNextChar();
134 						}
135 
136 						// evt. Numerischer Parameter
137 						if( RTF_ISDIGIT( nNextCh ) )
138 						{
139 							nTokenValue = 0;
140 							do {
141 								nTokenValue *= 10;
142 								nTokenValue += nNextCh - '0';
143 								nNextCh = GetNextChar();
144 							} while( RTF_ISDIGIT( nNextCh ) );
145 							if( bNegValue )
146 								nTokenValue = -nTokenValue;
147 							bTokenHasValue=true;
148 						}
149 						else if( bNegValue )		// das Minus wieder zurueck
150 						{
151 							nNextCh = '-';
152 							rInput.SeekRel( -1 );
153 						}
154 						if( ' ' == nNextCh )		// Blank gehoert zum Token!
155 							nNextCh = GetNextChar();
156 
157 						// suche das Token in der Tabelle:
158 						if( 0 == (nRet = GetRTFToken( aToken )) )
159 							// Unknown Control
160 							nRet = RTF_UNKNOWNCONTROL;
161 
162 						// bug 76812 - unicode token handled as normal text
163 						bNextCh = false;
164 						switch( nRet )
165 						{
166 						case RTF_UC:
167 							if( 0 <= nTokenValue )
168 							{
169 								nUCharOverread = (sal_uInt8)nTokenValue;
170 #if 1
171                                 //cmc: other ifdef breaks #i3584
172 								aParserStates.top().
173 									nUCharOverread = nUCharOverread;
174 #else
175 								if( !nUCharOverread )
176 									nUCharOverread = aParserStates.top().nUCharOverread;
177 								else
178 									aParserStates.top().
179 										nUCharOverread = nUCharOverread;
180 #endif
181 							}
182 							aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
183 							// read next token
184 							nRet = 0;
185 							break;
186 
187 						case RTF_UPR:
188 							if (!_inSkipGroup) {
189 							// UPR - overread the group with the ansi
190 							//       informations
191 							while( '{' != _GetNextToken() )
192 								;
193 							SkipGroup();
194 							_GetNextToken();  // overread the last bracket
195 							nRet = 0;
196 							}
197 							break;
198 
199 						case RTF_U:
200 							if( !bRTF_InTextRead )
201 							{
202 								nRet = RTF_TEXTTOKEN;
203 								aToken = (sal_Unicode)nTokenValue;
204 
205 								// overread the next n "RTF" characters. This
206 								// can be also \{, \}, \'88
207 								for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
208 								{
209 									sal_Unicode cAnsi = nNextCh;
210 									while( 0xD == cAnsi )
211 										cAnsi = GetNextChar();
212 									while( 0xA == cAnsi )
213 										cAnsi = GetNextChar();
214 
215 									if( '\\' == cAnsi &&
216 										'\'' == ( cAnsi = GetNextChar() ))
217 										// HexValue ueberlesen
218 										cAnsi = GetHexValue();
219 									nNextCh = GetNextChar();
220 								}
221 								ScanText();
222 								bNextCh = 0 == nNextCh;
223 							}
224 							break;
225 						}
226 					}
227 					else if( SVPAR_PENDING != eState )
228 					{
229 						// Bug 34631 - "\ " ueberlesen - Blank als Zeichen
230 						// eState = SVPAR_ERROR;
231 						bNextCh = false;
232 					}
233 					break;
234 				}
235 			}
236 			break;
237 
238 		case sal_Unicode(EOF):
239 			eState = SVPAR_ACCEPTED;
240 			nRet = nNextCh;
241 			break;
242 
243 		case '{':
244 			{
245 				if( 0 <= nOpenBrakets )
246 				{
247 					RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
248                     aParserStates.push( aState );
249 				}
250 				++nOpenBrakets;
251                 DBG_ASSERT(
252                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
253                     "ParserStateStack unequal to bracket count" );
254 				nRet = nNextCh;
255 			}
256 			break;
257 
258 		case '}':
259 			--nOpenBrakets;
260 			if( 0 <= nOpenBrakets )
261 			{
262                 aParserStates.pop();
263 				if( !aParserStates.empty() )
264 				{
265 					const RtfParserState_Impl& rRPS =
266 							aParserStates.top();
267 					nUCharOverread = rRPS.nUCharOverread;
268 					SetSrcEncoding( rRPS.eCodeSet );
269 				}
270 				else
271 				{
272 					nUCharOverread = 1;
273 					SetSrcEncoding( GetCodeSet() );
274 				}
275 			}
276             DBG_ASSERT(
277                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
278                 "ParserStateStack unequal to bracket count" );
279 			nRet = nNextCh;
280 			break;
281 
282 		case 0x0d:
283 		case 0x0a:
284 			break;
285 
286 		default:
287 			// es folgt normaler Text
288 			ScanText();
289 			nRet = RTF_TEXTTOKEN;
290 			bNextCh = 0 == nNextCh;
291 			break;
292 		}
293 
294 		if( bNextCh )
295 			nNextCh = GetNextChar();
296 
297 	} while( !nRet && SVPAR_WORKING == eState );
298 	return nRet;
299 }
300 
301 
302 sal_Unicode SvRTFParser::GetHexValue()
303 {
304 	// Hex-Wert sammeln
305 	register int n;
306 	register sal_Unicode nHexVal = 0;
307 
308 	for( n = 0; n < 2; ++n )
309 	{
310 		nHexVal *= 16;
311 		nNextCh = GetNextChar();
312 		if( nNextCh >= '0' && nNextCh <= '9' )
313 			nHexVal += (nNextCh - 48);
314 		else if( nNextCh >= 'a' && nNextCh <= 'f' )
315 			nHexVal += (nNextCh - 87);
316 		else if( nNextCh >= 'A' && nNextCh <= 'F' )
317 			nHexVal += (nNextCh - 55);
318 	}
319 	return nHexVal;
320 }
321 
322 void SvRTFParser::ScanText( const sal_Unicode cBreak )
323 {
324 	String aStrBuffer;
325 	int bWeiter = true;
326 	while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
327 	{
328 		int bNextCh = true;
329 		switch( nNextCh )
330 		{
331 		case '\\':
332 			{
333 				switch (nNextCh = GetNextChar())
334 				{
335 				case '\'':
336 					{
337 
338 #if 0
339                         // #i35653 patch from cmc
340                         ByteString aByteString(static_cast<char>(GetHexValue()));
341                         if (aByteString.Len())
342                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
343 #else
344                         ByteString aByteString;
345                         while (1)
346                         {
347                             aByteString.Append((char)GetHexValue());
348 
349                             bool bBreak = false;
350                             sal_Char nSlash = '\\';
351                             while (!bBreak)
352                             {
353 								wchar_t __next=GetNextChar();
354 								if (__next>0xFF) // fix for #i43933# and #i35653#
355 								{
356 									if (aByteString.Len())
357 										aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
358 									aStrBuffer.Append((sal_Unicode)__next);
359 
360 									aByteString.Erase();
361 									continue;
362 								}
363                                 nSlash = (sal_Char)__next;
364                                 while (nSlash == 0xD || nSlash == 0xA)
365                                     nSlash = (sal_Char)GetNextChar();
366 
367                                 switch (nSlash)
368                                 {
369                                     case '{':
370                                     case '}':
371                                     case '\\':
372                                         bBreak = true;
373                                         break;
374                                     default:
375                                         aByteString.Append(nSlash);
376                                         break;
377                                 }
378                             }
379 
380                             nNextCh = GetNextChar();
381 
382                             if (nSlash != '\\' || nNextCh != '\'')
383                             {
384                                 rInput.SeekRel(-1);
385                                 nNextCh = nSlash;
386                                 break;
387                             }
388                         }
389 
390                         bNextCh = false;
391 
392                         if (aByteString.Len())
393                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
394 #endif
395                     }
396 					break;
397 				case '\\':
398 				case '}':
399 				case '{':
400 				case '+':		// habe ich in einem RTF-File gefunden
401 					aStrBuffer.Append(nNextCh);
402 					break;
403 				case '~':		// nonbreaking space
404 					aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
405 					break;
406 				case '-':		// optional hyphen
407 					aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
408 					break;
409 				case '_':		// nonbreaking hyphen
410 					aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
411 					break;
412 
413 				case 'u':
414 					// UNI-Code Zeichen lesen
415 					{
416 						nNextCh = GetNextChar();
417 						rInput.SeekRel( -2 );
418 
419 						if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
420 						{
421 							bRTF_InTextRead = true;
422 
423 							String sSave( aToken );
424 							nNextCh = '\\';
425                             #ifdef DBG_UTIL
426 							int nToken =
427                             #endif
428                                 _GetNextToken();
429 							DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
430 							// dont convert symbol chars
431 							aStrBuffer.Append(
432                                 static_cast< sal_Unicode >(nTokenValue));
433 
434 							// overread the next n "RTF" characters. This
435 							// can be also \{, \}, \'88
436 							for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
437 							{
438 								sal_Unicode cAnsi = nNextCh;
439 								while( 0xD == cAnsi )
440 									cAnsi = GetNextChar();
441 								while( 0xA == cAnsi )
442 									cAnsi = GetNextChar();
443 
444 								if( '\\' == cAnsi &&
445 									'\'' == ( cAnsi = GetNextChar() ))
446 									// HexValue ueberlesen
447 									cAnsi = GetHexValue();
448 								nNextCh = GetNextChar();
449 							}
450 							bNextCh = false;
451 							aToken = sSave;
452 							bRTF_InTextRead = false;
453 						}
454 						else
455 						{
456 							nNextCh = '\\';
457 							bWeiter = false;		// Abbrechen, String zusammen
458 						}
459 					}
460 					break;
461 
462 				default:
463 					rInput.SeekRel( -1 );
464 					nNextCh = '\\';
465 					bWeiter = false;		// Abbrechen, String zusammen
466 					break;
467 				}
468 			}
469 			break;
470 
471 		case sal_Unicode(EOF):
472 				eState = SVPAR_ERROR;
473 				// weiter
474 		case '{':
475 		case '}':
476 			bWeiter = false;
477 			break;
478 
479 		case 0x0a:
480 		case 0x0d:
481 			break;
482 
483 		default:
484 			if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
485 				bWeiter = false;
486 			else
487 			{
488 				do {
489 					// alle anderen Zeichen kommen in den Text
490 					aStrBuffer.Append(nNextCh);
491 
492 					if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
493 					{
494                         if (aStrBuffer.Len())
495 		                    aToken += aStrBuffer;
496 						return;
497 					}
498 				} while
499                 (
500                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
501                     (aStrBuffer.Len() < MAX_STRING_LEN)
502                 );
503 				bNextCh = false;
504 			}
505 		}
506 
507 		if( bWeiter && bNextCh )
508 			nNextCh = GetNextChar();
509 	}
510 
511 	if (aStrBuffer.Len())
512 		aToken += aStrBuffer;
513 }
514 
515 
516 short SvRTFParser::_inSkipGroup=0;
517 
518 void SvRTFParser::SkipGroup()
519 {
520 short nBrackets=1;
521 if (_inSkipGroup>0)
522 	return;
523 _inSkipGroup++;
524 #if 1	//#i16185# fecking \bin keyword
525     do
526     {
527         switch (nNextCh)
528         {
529             case '{':
530                 ++nBrackets;
531                 break;
532             case '}':
533 				if (!--nBrackets) {
534 					_inSkipGroup--;
535                     return;
536 				}
537                 break;
538         }
539         int nToken = _GetNextToken();
540         if (nToken == RTF_BIN)
541         {
542             rInput.SeekRel(-1);
543             rInput.SeekRel(nTokenValue);
544 		    nNextCh = GetNextChar();
545         }
546 		while (nNextCh==0xa || nNextCh==0xd)
547 		{
548 			nNextCh = GetNextChar();
549 		}
550     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
551 #else
552 	sal_Unicode cPrev = 0;
553 	do {
554 		switch( nNextCh )
555 		{
556 		case '{':
557 			if( '\\' != cPrev )
558 				++nBrackets;
559 			break;
560 
561 		case '}':
562 			if( '\\' != cPrev && !--nBrackets )
563 				return;
564 			break;
565 
566 		case '\\':
567 			if( '\\' == cPrev )
568 				nNextCh = 0;
569 			break;
570 		}
571 		cPrev = nNextCh;
572 		nNextCh = GetNextChar();
573 	} while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
574 #endif
575 
576 	if( SVPAR_PENDING != eState && '}' != nNextCh )
577 		eState = SVPAR_ERROR;
578 	_inSkipGroup--;
579 }
580 
581 void SvRTFParser::ReadUnknownData()	{ SkipGroup(); }
582 void SvRTFParser::ReadBitmapData()	{ SkipGroup(); }
583 void SvRTFParser::ReadOLEData()		{ SkipGroup(); }
584 
585 
586 SvParserState SvRTFParser::CallParser()
587 {
588 	sal_Char cFirstCh;
589     nNextChPos = rInput.Tell();
590 	rInput >> cFirstCh; nNextCh = cFirstCh;
591 	eState = SVPAR_WORKING;
592 	nOpenBrakets = 0;
593 	SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
594 	eUNICodeSet = RTL_TEXTENCODING_MS_1252; 	// default ist ANSI-CodeSet
595 
596 	// die 1. beiden Token muessen '{' und \\rtf sein !!
597 	if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
598 	{
599 		AddRef();
600 		Continue( 0 );
601 		if( SVPAR_PENDING != eState )
602 			ReleaseRef();		// dann brauchen wir den Parser nicht mehr!
603 	}
604 	else
605 		eState = SVPAR_ERROR;
606 
607 	return eState;
608 }
609 
610 void SvRTFParser::Continue( int nToken )
611 {
612 //	DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
613 //				"Zeichensatz wurde geaendert." );
614 
615 	if( !nToken )
616 		nToken = GetNextToken();
617 
618 	while( IsParserWorking() )
619 	{
620 		SaveState( nToken );
621 		switch( nToken )
622 		{
623 		case '}':
624 			if( nOpenBrakets )
625 				goto NEXTTOKEN;
626 			eState = SVPAR_ACCEPTED;
627 			break;
628 
629 		case '{':
630 			// eine unbekannte Gruppe ?
631 			{
632 				if( RTF_IGNOREFLAG != GetNextToken() )
633 					nToken = SkipToken( -1 );
634 				else if( RTF_UNKNOWNCONTROL != GetNextToken() )
635 					nToken = SkipToken( -2 );
636 				else
637 				{
638 					// gleich herausfiltern
639 					ReadUnknownData();
640 					nToken = GetNextToken();
641 					if( '}' != nToken )
642 						eState = SVPAR_ERROR;
643 					break;		// auf zum naechsten Token!!
644 				}
645 			}
646 			goto NEXTTOKEN;
647 
648 		case RTF_UNKNOWNCONTROL:
649 			break;		// unbekannte Token ueberspringen
650 		case RTF_NEXTTYPE:
651 		case RTF_ANSITYPE:
652             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
653             break;
654 		case RTF_MACTYPE:
655             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
656             break;
657 		case RTF_PCTYPE:
658             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
659             break;
660 		case RTF_PCATYPE:
661             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
662             break;
663 		case RTF_ANSICPG:
664             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
665             SetSrcEncoding(eCodeSet);
666 			break;
667 		default:
668 NEXTTOKEN:
669 			NextToken( nToken );
670 			break;
671 		}
672 		if( IsParserWorking() )
673 			SaveState( 0 );			// bis hierhin abgearbeitet,
674 									// weiter mit neuem Token!
675 		nToken = GetNextToken();
676 	}
677 	if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
678 		eState = SVPAR_ERROR;
679 }
680 
681 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
682 {
683 	if (eEnc == RTL_TEXTENCODING_DONTKNOW)
684 		eEnc = GetCodeSet();
685 
686 	if (!aParserStates.empty())
687 		aParserStates.top().eCodeSet = eEnc;
688 	SetSrcEncoding(eEnc);
689 }
690 
691 #ifdef USED
692 void SvRTFParser::SaveState( int nToken )
693 {
694 	SvParser::SaveState( nToken );
695 }
696 
697 void SvRTFParser::RestoreState()
698 {
699 	SvParser::RestoreState();
700 }
701 #endif
702 
703 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
704