xref: /trunk/main/svtools/source/svrtf/parrtf.cxx (revision cdf0e10c)
1 /*************************************************************************
2  *
3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4  *
5  * Copyright 2000, 2010 Oracle and/or its affiliates.
6  *
7  * OpenOffice.org - a multi-platform office productivity suite
8  *
9  * This file is part of OpenOffice.org.
10  *
11  * OpenOffice.org is free software: you can redistribute it and/or modify
12  * it under the terms of the GNU Lesser General Public License version 3
13  * only, as published by the Free Software Foundation.
14  *
15  * OpenOffice.org is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU Lesser General Public License version 3 for more details
19  * (a copy is included in the LICENSE file that accompanied this code).
20  *
21  * You should have received a copy of the GNU Lesser General Public License
22  * version 3 along with OpenOffice.org.  If not, see
23  * <http://www.openoffice.org/license.html>
24  * for a copy of the LGPLv3 License.
25  *
26  ************************************************************************/
27 
28 // MARKER(update_precomp.py): autogen include statement, do not remove
29 #include "precompiled_svtools.hxx"
30 
31 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil -*- */
32 
33 #include <stdio.h>		                // for EOF
34 #include <rtl/tencinfo.h>
35 #include <tools/stream.hxx>
36 #include <tools/debug.hxx>
37 #include <svtools/rtftoken.h>
38 #include <svtools/rtfkeywd.hxx>
39 #include <svtools/parrtf.hxx>
40 
// Upper bound for the number of characters ScanText() gathers in its local
// buffer before it hands the text over as one token.
const int MAX_STRING_LEN = 1024;
// Size of the chunk buffer used while the letters of a control word are
// collected in _GetNextToken().
const int MAX_TOKEN_LEN = 128;

// ASCII-only character classification for the RTF scanner.
// The argument is fully parenthesized so that an expression argument (e.g. a
// conditional expression) is classified as a whole. It is evaluated more than
// once, so it must not have side effects.
#define RTF_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define RTF_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
46 
47 SvRTFParser::SvRTFParser( SvStream& rIn, sal_uInt8 nStackSize )
48 	: SvParser( rIn, nStackSize ),
49 	eUNICodeSet( RTL_TEXTENCODING_MS_1252 ), 	// default ist ANSI-CodeSet
50 	nUCharOverread( 1 )
51 {
52 	// default ist ANSI-CodeSet
53 	SetSrcEncoding( RTL_TEXTENCODING_MS_1252 );
54 	bRTF_InTextRead = false;
55 }
56 
57 SvRTFParser::~SvRTFParser()
58 {
59 }
60 
61 
62 
63 
64 int SvRTFParser::_GetNextToken()
65 {
66 	int nRet = 0;
67 	do {
68 		int bNextCh = true;
69 		switch( nNextCh )
70 		{
71 		case '\\':
72 			{
73 				// Steuerzeichen
74 				switch( nNextCh = GetNextChar() )
75 				{
76 				case '{':
77 				case '}':
78 				case '\\':
79 				case '+':		// habe ich in einem RTF-File gefunden
80 				case '~':		// nonbreaking space
81 				case '-':		// optional hyphen
82 				case '_':		// nonbreaking hyphen
83 				case '\'':		// HexValue
84 					nNextCh = '\\';
85 					rInput.SeekRel( -1 );
86 					ScanText();
87 					nRet = RTF_TEXTTOKEN;
88 					bNextCh = 0 == nNextCh;
89 					break;
90 
91 				case '*':		// ignoreflag
92 					nRet = RTF_IGNOREFLAG;
93 					break;
94 				case ':':	 	// subentry in an index entry
95 					nRet = RTF_SUBENTRYINDEX;
96 					break;
97 				case '|':		// formula-charakter
98 					nRet = RTF_FORMULA;
99 					break;
100 
101 				case 0x0a:
102 				case 0x0d:
103 					nRet = RTF_PAR;
104 					break;
105 
106 				default:
107 					if( RTF_ISALPHA( nNextCh ) )
108 					{
109 						aToken = '\\';
110 						{
111 							String aStrBuffer;
112 							sal_Unicode* pStr = aStrBuffer.AllocBuffer(
113 															MAX_TOKEN_LEN );
114 							xub_StrLen nStrLen = 0;
115 							do {
116 								*(pStr + nStrLen++) = nNextCh;
117 								if( MAX_TOKEN_LEN == nStrLen )
118 								{
119 									aToken += aStrBuffer;
120 									aToken.GetBufferAccess();  // make unique string!
121 									nStrLen = 0;
122 								}
123 								nNextCh = GetNextChar();
124 							} while( RTF_ISALPHA( nNextCh ) );
125 							if( nStrLen )
126 							{
127 								aStrBuffer.ReleaseBufferAccess( nStrLen );
128 								aToken += aStrBuffer;
129 							}
130 						}
131 
132 						// Minus fuer numerischen Parameter
133 						int bNegValue = false;
134 						if( '-' == nNextCh )
135 						{
136 							bNegValue = true;
137 							nNextCh = GetNextChar();
138 						}
139 
140 						// evt. Numerischer Parameter
141 						if( RTF_ISDIGIT( nNextCh ) )
142 						{
143 							nTokenValue = 0;
144 							do {
145 								nTokenValue *= 10;
146 								nTokenValue += nNextCh - '0';
147 								nNextCh = GetNextChar();
148 							} while( RTF_ISDIGIT( nNextCh ) );
149 							if( bNegValue )
150 								nTokenValue = -nTokenValue;
151 							bTokenHasValue=true;
152 						}
153 						else if( bNegValue )		// das Minus wieder zurueck
154 						{
155 							nNextCh = '-';
156 							rInput.SeekRel( -1 );
157 						}
158 						if( ' ' == nNextCh )		// Blank gehoert zum Token!
159 							nNextCh = GetNextChar();
160 
161 						// suche das Token in der Tabelle:
162 						if( 0 == (nRet = GetRTFToken( aToken )) )
163 							// Unknown Control
164 							nRet = RTF_UNKNOWNCONTROL;
165 
166 						// bug 76812 - unicode token handled as normal text
167 						bNextCh = false;
168 						switch( nRet )
169 						{
170 						case RTF_UC:
171 							if( 0 <= nTokenValue )
172 							{
173 								nUCharOverread = (sal_uInt8)nTokenValue;
174 #if 1
175                                 //cmc: other ifdef breaks #i3584
176 								aParserStates.top().
177 									nUCharOverread = nUCharOverread;
178 #else
179 								if( !nUCharOverread )
180 									nUCharOverread = aParserStates.top().nUCharOverread;
181 								else
182 									aParserStates.top().
183 										nUCharOverread = nUCharOverread;
184 #endif
185 							}
186 							aToken.Erase(); // #i47831# erase token to prevent the token from beeing treated as text
187 							// read next token
188 							nRet = 0;
189 							break;
190 
191 						case RTF_UPR:
192 							if (!_inSkipGroup) {
193 							// UPR - overread the group with the ansi
194 							//       informations
195 							while( '{' != _GetNextToken() )
196 								;
197 							SkipGroup();
198 							_GetNextToken();  // overread the last bracket
199 							nRet = 0;
200 							}
201 							break;
202 
203 						case RTF_U:
204 							if( !bRTF_InTextRead )
205 							{
206 								nRet = RTF_TEXTTOKEN;
207 								aToken = (sal_Unicode)nTokenValue;
208 
209 								// overread the next n "RTF" characters. This
210 								// can be also \{, \}, \'88
211 								for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
212 								{
213 									sal_Unicode cAnsi = nNextCh;
214 									while( 0xD == cAnsi )
215 										cAnsi = GetNextChar();
216 									while( 0xA == cAnsi )
217 										cAnsi = GetNextChar();
218 
219 									if( '\\' == cAnsi &&
220 										'\'' == ( cAnsi = GetNextChar() ))
221 										// HexValue ueberlesen
222 										cAnsi = GetHexValue();
223 									nNextCh = GetNextChar();
224 								}
225 								ScanText();
226 								bNextCh = 0 == nNextCh;
227 							}
228 							break;
229 						}
230 					}
231 					else if( SVPAR_PENDING != eState )
232 					{
233 						// Bug 34631 - "\ " ueberlesen - Blank als Zeichen
234 						// eState = SVPAR_ERROR;
235 						bNextCh = false;
236 					}
237 					break;
238 				}
239 			}
240 			break;
241 
242 		case sal_Unicode(EOF):
243 			eState = SVPAR_ACCEPTED;
244 			nRet = nNextCh;
245 			break;
246 
247 		case '{':
248 			{
249 				if( 0 <= nOpenBrakets )
250 				{
251 					RtfParserState_Impl aState( nUCharOverread, GetSrcEncoding() );
252                     aParserStates.push( aState );
253 				}
254 				++nOpenBrakets;
255                 DBG_ASSERT(
256                     static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
257                     "ParserStateStack unequal to bracket count" );
258 				nRet = nNextCh;
259 			}
260 			break;
261 
262 		case '}':
263 			--nOpenBrakets;
264 			if( 0 <= nOpenBrakets )
265 			{
266                 aParserStates.pop();
267 				if( !aParserStates.empty() )
268 				{
269 					const RtfParserState_Impl& rRPS =
270 							aParserStates.top();
271 					nUCharOverread = rRPS.nUCharOverread;
272 					SetSrcEncoding( rRPS.eCodeSet );
273 				}
274 				else
275 				{
276 					nUCharOverread = 1;
277 					SetSrcEncoding( GetCodeSet() );
278 				}
279 			}
280             DBG_ASSERT(
281                 static_cast<size_t>(nOpenBrakets) == aParserStates.size(),
282                 "ParserStateStack unequal to bracket count" );
283 			nRet = nNextCh;
284 			break;
285 
286 		case 0x0d:
287 		case 0x0a:
288 			break;
289 
290 		default:
291 			// es folgt normaler Text
292 			ScanText();
293 			nRet = RTF_TEXTTOKEN;
294 			bNextCh = 0 == nNextCh;
295 			break;
296 		}
297 
298 		if( bNextCh )
299 			nNextCh = GetNextChar();
300 
301 	} while( !nRet && SVPAR_WORKING == eState );
302 	return nRet;
303 }
304 
305 
306 sal_Unicode SvRTFParser::GetHexValue()
307 {
308 	// Hex-Wert sammeln
309 	register int n;
310 	register sal_Unicode nHexVal = 0;
311 
312 	for( n = 0; n < 2; ++n )
313 	{
314 		nHexVal *= 16;
315 		nNextCh = GetNextChar();
316 		if( nNextCh >= '0' && nNextCh <= '9' )
317 			nHexVal += (nNextCh - 48);
318 		else if( nNextCh >= 'a' && nNextCh <= 'f' )
319 			nHexVal += (nNextCh - 87);
320 		else if( nNextCh >= 'A' && nNextCh <= 'F' )
321 			nHexVal += (nNextCh - 55);
322 	}
323 	return nHexVal;
324 }
325 
326 void SvRTFParser::ScanText( const sal_Unicode cBreak )
327 {
328 	String aStrBuffer;
329 	int bWeiter = true;
330 	while( bWeiter && IsParserWorking() && aStrBuffer.Len() < MAX_STRING_LEN)
331 	{
332 		int bNextCh = true;
333 		switch( nNextCh )
334 		{
335 		case '\\':
336 			{
337 				switch (nNextCh = GetNextChar())
338 				{
339 				case '\'':
340 					{
341 
342 #if 0
343                         // #i35653 patch from cmc
344                         ByteString aByteString(static_cast<char>(GetHexValue()));
345                         if (aByteString.Len())
346                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
347 #else
348                         ByteString aByteString;
349                         while (1)
350                         {
351                             aByteString.Append((char)GetHexValue());
352 
353                             bool bBreak = false;
354                             sal_Char nSlash = '\\';
355                             while (!bBreak)
356                             {
357 								wchar_t __next=GetNextChar();
358 								if (__next>0xFF) // fix for #i43933# and #i35653#
359 								{
360 									if (aByteString.Len())
361 										aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
362 									aStrBuffer.Append((sal_Unicode)__next);
363 
364 									aByteString.Erase();
365 									continue;
366 								}
367                                 nSlash = (sal_Char)__next;
368                                 while (nSlash == 0xD || nSlash == 0xA)
369                                     nSlash = (sal_Char)GetNextChar();
370 
371                                 switch (nSlash)
372                                 {
373                                     case '{':
374                                     case '}':
375                                     case '\\':
376                                         bBreak = true;
377                                         break;
378                                     default:
379                                         aByteString.Append(nSlash);
380                                         break;
381                                 }
382                             }
383 
384                             nNextCh = GetNextChar();
385 
386                             if (nSlash != '\\' || nNextCh != '\'')
387                             {
388                                 rInput.SeekRel(-1);
389                                 nNextCh = nSlash;
390                                 break;
391                             }
392                         }
393 
394                         bNextCh = false;
395 
396                         if (aByteString.Len())
397                             aStrBuffer.Append(String(aByteString, GetSrcEncoding()));
398 #endif
399                     }
400 					break;
401 				case '\\':
402 				case '}':
403 				case '{':
404 				case '+':		// habe ich in einem RTF-File gefunden
405 					aStrBuffer.Append(nNextCh);
406 					break;
407 				case '~':		// nonbreaking space
408 					aStrBuffer.Append(static_cast< sal_Unicode >(0xA0));
409 					break;
410 				case '-':		// optional hyphen
411 					aStrBuffer.Append(static_cast< sal_Unicode >(0xAD));
412 					break;
413 				case '_':		// nonbreaking hyphen
414 					aStrBuffer.Append(static_cast< sal_Unicode >(0x2011));
415 					break;
416 
417 				case 'u':
418 					// UNI-Code Zeichen lesen
419 					{
420 						nNextCh = GetNextChar();
421 						rInput.SeekRel( -2 );
422 
423 						if( '-' == nNextCh || RTF_ISDIGIT( nNextCh ) )
424 						{
425 							bRTF_InTextRead = true;
426 
427 							String sSave( aToken );
428 							nNextCh = '\\';
429                             #ifdef DBG_UTIL
430 							int nToken =
431                             #endif
432                                 _GetNextToken();
433 							DBG_ASSERT( RTF_U == nToken, "doch kein UNI-Code Zeichen" );
434 							// dont convert symbol chars
435 							aStrBuffer.Append(
436                                 static_cast< sal_Unicode >(nTokenValue));
437 
438 							// overread the next n "RTF" characters. This
439 							// can be also \{, \}, \'88
440 							for( sal_uInt8 m = 0; m < nUCharOverread; ++m )
441 							{
442 								sal_Unicode cAnsi = nNextCh;
443 								while( 0xD == cAnsi )
444 									cAnsi = GetNextChar();
445 								while( 0xA == cAnsi )
446 									cAnsi = GetNextChar();
447 
448 								if( '\\' == cAnsi &&
449 									'\'' == ( cAnsi = GetNextChar() ))
450 									// HexValue ueberlesen
451 									cAnsi = GetHexValue();
452 								nNextCh = GetNextChar();
453 							}
454 							bNextCh = false;
455 							aToken = sSave;
456 							bRTF_InTextRead = false;
457 						}
458 						else
459 						{
460 							nNextCh = '\\';
461 							bWeiter = false;		// Abbrechen, String zusammen
462 						}
463 					}
464 					break;
465 
466 				default:
467 					rInput.SeekRel( -1 );
468 					nNextCh = '\\';
469 					bWeiter = false;		// Abbrechen, String zusammen
470 					break;
471 				}
472 			}
473 			break;
474 
475 		case sal_Unicode(EOF):
476 				eState = SVPAR_ERROR;
477 				// weiter
478 		case '{':
479 		case '}':
480 			bWeiter = false;
481 			break;
482 
483 		case 0x0a:
484 		case 0x0d:
485 			break;
486 
487 		default:
488 			if( nNextCh == cBreak || aStrBuffer.Len() >= MAX_STRING_LEN)
489 				bWeiter = false;
490 			else
491 			{
492 				do {
493 					// alle anderen Zeichen kommen in den Text
494 					aStrBuffer.Append(nNextCh);
495 
496 					if (sal_Unicode(EOF) == (nNextCh = GetNextChar()))
497 					{
498                         if (aStrBuffer.Len())
499 		                    aToken += aStrBuffer;
500 						return;
501 					}
502 				} while
503                 (
504                     (RTF_ISALPHA(nNextCh) || RTF_ISDIGIT(nNextCh)) &&
505                     (aStrBuffer.Len() < MAX_STRING_LEN)
506                 );
507 				bNextCh = false;
508 			}
509 		}
510 
511 		if( bWeiter && bNextCh )
512 			nNextCh = GetNextChar();
513 	}
514 
515 	if (aStrBuffer.Len())
516 		aToken += aStrBuffer;
517 }
518 
519 
520 short SvRTFParser::_inSkipGroup=0;
521 
522 void SvRTFParser::SkipGroup()
523 {
524 short nBrackets=1;
525 if (_inSkipGroup>0)
526 	return;
527 _inSkipGroup++;
528 #if 1	//#i16185# fecking \bin keyword
529     do
530     {
531         switch (nNextCh)
532         {
533             case '{':
534                 ++nBrackets;
535                 break;
536             case '}':
537 				if (!--nBrackets) {
538 					_inSkipGroup--;
539                     return;
540 				}
541                 break;
542         }
543         int nToken = _GetNextToken();
544         if (nToken == RTF_BIN)
545         {
546             rInput.SeekRel(-1);
547             rInput.SeekRel(nTokenValue);
548 		    nNextCh = GetNextChar();
549         }
550 		while (nNextCh==0xa || nNextCh==0xd)
551 		{
552 			nNextCh = GetNextChar();
553 		}
554     } while (sal_Unicode(EOF) != nNextCh && IsParserWorking());
555 #else
556 	sal_Unicode cPrev = 0;
557 	do {
558 		switch( nNextCh )
559 		{
560 		case '{':
561 			if( '\\' != cPrev )
562 				++nBrackets;
563 			break;
564 
565 		case '}':
566 			if( '\\' != cPrev && !--nBrackets )
567 				return;
568 			break;
569 
570 		case '\\':
571 			if( '\\' == cPrev )
572 				nNextCh = 0;
573 			break;
574 		}
575 		cPrev = nNextCh;
576 		nNextCh = GetNextChar();
577 	} while( sal_Unicode(EOF) != nNextCh && IsParserWorking() );
578 #endif
579 
580 	if( SVPAR_PENDING != eState && '}' != nNextCh )
581 		eState = SVPAR_ERROR;
582 	_inSkipGroup--;
583 }
584 
585 void SvRTFParser::ReadUnknownData()	{ SkipGroup(); }
586 void SvRTFParser::ReadBitmapData()	{ SkipGroup(); }
587 void SvRTFParser::ReadOLEData()		{ SkipGroup(); }
588 
589 
590 SvParserState SvRTFParser::CallParser()
591 {
592 	sal_Char cFirstCh;
593     nNextChPos = rInput.Tell();
594 	rInput >> cFirstCh; nNextCh = cFirstCh;
595 	eState = SVPAR_WORKING;
596 	nOpenBrakets = 0;
597 	SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
598 	eUNICodeSet = RTL_TEXTENCODING_MS_1252; 	// default ist ANSI-CodeSet
599 
600 	// die 1. beiden Token muessen '{' und \\rtf sein !!
601 	if( '{' == GetNextToken() && RTF_RTF == GetNextToken() )
602 	{
603 		AddRef();
604 		Continue( 0 );
605 		if( SVPAR_PENDING != eState )
606 			ReleaseRef();		// dann brauchen wir den Parser nicht mehr!
607 	}
608 	else
609 		eState = SVPAR_ERROR;
610 
611 	return eState;
612 }
613 
614 void SvRTFParser::Continue( int nToken )
615 {
616 //	DBG_ASSERT( SVPAR_CS_DONTKNOW == GetCharSet(),
617 //				"Zeichensatz wurde geaendert." );
618 
619 	if( !nToken )
620 		nToken = GetNextToken();
621 
622 	while( IsParserWorking() )
623 	{
624 		SaveState( nToken );
625 		switch( nToken )
626 		{
627 		case '}':
628 			if( nOpenBrakets )
629 				goto NEXTTOKEN;
630 			eState = SVPAR_ACCEPTED;
631 			break;
632 
633 		case '{':
634 			// eine unbekannte Gruppe ?
635 			{
636 				if( RTF_IGNOREFLAG != GetNextToken() )
637 					nToken = SkipToken( -1 );
638 				else if( RTF_UNKNOWNCONTROL != GetNextToken() )
639 					nToken = SkipToken( -2 );
640 				else
641 				{
642 					// gleich herausfiltern
643 					ReadUnknownData();
644 					nToken = GetNextToken();
645 					if( '}' != nToken )
646 						eState = SVPAR_ERROR;
647 					break;		// auf zum naechsten Token!!
648 				}
649 			}
650 			goto NEXTTOKEN;
651 
652 		case RTF_UNKNOWNCONTROL:
653 			break;		// unbekannte Token ueberspringen
654 		case RTF_NEXTTYPE:
655 		case RTF_ANSITYPE:
656             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_MS_1252 );
657             break;
658 		case RTF_MACTYPE:
659             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_APPLE_ROMAN );
660             break;
661 		case RTF_PCTYPE:
662             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_437 );
663             break;
664 		case RTF_PCATYPE:
665             SetSrcEncoding( eCodeSet = RTL_TEXTENCODING_IBM_850 );
666             break;
667 		case RTF_ANSICPG:
668             eCodeSet = rtl_getTextEncodingFromWindowsCodePage(nTokenValue);
669             SetSrcEncoding(eCodeSet);
670 			break;
671 		default:
672 NEXTTOKEN:
673 			NextToken( nToken );
674 			break;
675 		}
676 		if( IsParserWorking() )
677 			SaveState( 0 );			// bis hierhin abgearbeitet,
678 									// weiter mit neuem Token!
679 		nToken = GetNextToken();
680 	}
681 	if( SVPAR_ACCEPTED == eState && 0 < nOpenBrakets )
682 		eState = SVPAR_ERROR;
683 }
684 
685 void SvRTFParser::SetEncoding( rtl_TextEncoding eEnc )
686 {
687 	if (eEnc == RTL_TEXTENCODING_DONTKNOW)
688 		eEnc = GetCodeSet();
689 
690 	if (!aParserStates.empty())
691 		aParserStates.top().eCodeSet = eEnc;
692 	SetSrcEncoding(eEnc);
693 }
694 
#ifdef USED
// Only compiled when USED is defined.
// Forwards unchanged to the base class implementation.
void SvRTFParser::SaveState( int nToken )
{
	this->SvParser::SaveState( nToken );
}

// Forwards unchanged to the base class implementation.
void SvRTFParser::RestoreState()
{
	this->SvParser::RestoreState();
}
#endif
706 
707 /* vi:set tabstop=4 shiftwidth=4 expandtab: */
708