xref: /aoo41x/main/svtools/source/svhtml/parhtml.cxx (revision 5900e8ec)
/**************************************************************
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 *************************************************************/
21*5900e8ecSAndrew Rist 
22*5900e8ecSAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25cdf0e10cSrcweir #include "precompiled_svtools.hxx"
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #include <ctype.h>
28cdf0e10cSrcweir #include <stdio.h>
29cdf0e10cSrcweir #include <tools/stream.hxx>
30cdf0e10cSrcweir #include <tools/debug.hxx>
31cdf0e10cSrcweir #include <tools/color.hxx>
32cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
33cdf0e10cSrcweir #include <rtl/strbuf.hxx>
34cdf0e10cSrcweir #ifndef _SVSTDARR_HXX
35cdf0e10cSrcweir #define _SVSTDARR_ULONGS
36cdf0e10cSrcweir #include <svl/svstdarr.hxx>
37cdf0e10cSrcweir #endif
38cdf0e10cSrcweir 
39cdf0e10cSrcweir #include <tools/tenccvt.hxx>
40cdf0e10cSrcweir #include <tools/datetime.hxx>
41cdf0e10cSrcweir #include <svl/inettype.hxx>
42cdf0e10cSrcweir #include <comphelper/string.hxx>
43cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp>
44cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp>
45cdf0e10cSrcweir 
46cdf0e10cSrcweir #include <svtools/parhtml.hxx>
47cdf0e10cSrcweir #include <svtools/htmltokn.h>
48cdf0e10cSrcweir #include <svtools/htmlkywd.hxx>
49cdf0e10cSrcweir 
50cdf0e10cSrcweir 
51cdf0e10cSrcweir using namespace ::com::sun::star;
52cdf0e10cSrcweir 
53cdf0e10cSrcweir 
54cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L );
55cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ];
56cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 );
57cdf0e10cSrcweir 
58cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L );
59cdf0e10cSrcweir 
60cdf0e10cSrcweir /*  */
61cdf0e10cSrcweir 
// Tables mapping HTML option value strings to their enum values
63cdf0e10cSrcweir 
64cdf0e10cSrcweir // <INPUT TYPE=xxx>
65cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] =
66cdf0e10cSrcweir {
67cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_text,		HTML_IT_TEXT		},
68cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_password,	HTML_IT_PASSWORD	},
69cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_checkbox,	HTML_IT_CHECKBOX	},
70cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_radio,   	HTML_IT_RADIO		},
71cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_range,   	HTML_IT_RANGE		},
72cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_scribble,	HTML_IT_SCRIBBLE	},
73cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_file,    	HTML_IT_FILE		},
74cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_hidden,  	HTML_IT_HIDDEN		},
75cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_submit,  	HTML_IT_SUBMIT		},
76cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_image,   	HTML_IT_IMAGE		},
77cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_reset,   	HTML_IT_RESET		},
78cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_IT_button,   	HTML_IT_BUTTON		},
79cdf0e10cSrcweir 	{ 0,					0					}
80cdf0e10cSrcweir };
81cdf0e10cSrcweir 
82cdf0e10cSrcweir // <TABLE FRAME=xxx>
83cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] =
84cdf0e10cSrcweir {
85cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_void,	HTML_TF_VOID	},
86cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_above,	HTML_TF_ABOVE	},
87cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_below,	HTML_TF_BELOW	},
88cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_hsides,	HTML_TF_HSIDES	},
89cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_lhs,		HTML_TF_LHS		},
90cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_rhs,		HTML_TF_RHS		},
91cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_vsides,	HTML_TF_VSIDES	},
92cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_box,		HTML_TF_BOX		},
93cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TF_border,	HTML_TF_BOX		},
94cdf0e10cSrcweir 	{ 0,				0				}
95cdf0e10cSrcweir };
96cdf0e10cSrcweir 
97cdf0e10cSrcweir // <TABLE RULES=xxx>
98cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] =
99cdf0e10cSrcweir {
100cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TR_none,	HTML_TR_NONE	},
101cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TR_groups,	HTML_TR_GROUPS	},
102cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TR_rows,	HTML_TR_ROWS	},
103cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TR_cols,	HTML_TR_COLS	},
104cdf0e10cSrcweir 	{ OOO_STRING_SVTOOLS_HTML_TR_all,		HTML_TR_ALL		},
105cdf0e10cSrcweir 	{ 0,				0				}
106cdf0e10cSrcweir };
107cdf0e10cSrcweir 
108cdf0e10cSrcweir 
// Implementation part of the HTMLOptions pointer array (presumably the
// counterpart of a declaration macro in the header -- confirm there).
SV_IMPL_PTRARR( HTMLOptions, HTMLOptionPtr )
110cdf0e10cSrcweir 
111cdf0e10cSrcweir /*  */
112cdf0e10cSrcweir 
113cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
114cdf0e10cSrcweir {
115cdf0e10cSrcweir 	sal_uInt16 nValue = nDflt;
116cdf0e10cSrcweir 
117cdf0e10cSrcweir 	while( pOptEnums->pName )
118cdf0e10cSrcweir 		if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
119cdf0e10cSrcweir 			break;
120cdf0e10cSrcweir 		else
121cdf0e10cSrcweir 			pOptEnums++;
122cdf0e10cSrcweir 
123cdf0e10cSrcweir 	if( pOptEnums->pName )
124cdf0e10cSrcweir 		nValue = pOptEnums->nValue;
125cdf0e10cSrcweir 
126cdf0e10cSrcweir 	return nValue;
127cdf0e10cSrcweir }
128cdf0e10cSrcweir 
GetEnum(sal_uInt16 & rEnum,const HTMLOptionEnum * pOptEnums) const129cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
130cdf0e10cSrcweir {
131cdf0e10cSrcweir 	while( pOptEnums->pName )
132cdf0e10cSrcweir 	{
133cdf0e10cSrcweir 		if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
134cdf0e10cSrcweir 			break;
135cdf0e10cSrcweir 		else
136cdf0e10cSrcweir 			pOptEnums++;
137cdf0e10cSrcweir 	}
138cdf0e10cSrcweir 
139cdf0e10cSrcweir 	const sal_Char *pName = pOptEnums->pName;
140cdf0e10cSrcweir 	if( pName )
141cdf0e10cSrcweir 		rEnum = pOptEnums->nValue;
142cdf0e10cSrcweir 
143cdf0e10cSrcweir 	return (pName != 0);
144cdf0e10cSrcweir }
145cdf0e10cSrcweir 
HTMLOption(sal_uInt16 nTok,const String & rToken,const String & rValue)146cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken,
147cdf0e10cSrcweir 						const String& rValue )
148cdf0e10cSrcweir 	: aValue(rValue)
149cdf0e10cSrcweir 	, aToken(rToken)
150cdf0e10cSrcweir 	, nToken( nTok )
151cdf0e10cSrcweir {
152cdf0e10cSrcweir 	DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
153cdf0e10cSrcweir 		"HTMLOption: unbekanntes Token" );
154cdf0e10cSrcweir }
155cdf0e10cSrcweir 
GetNumber() const156cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const
157cdf0e10cSrcweir {
158cdf0e10cSrcweir 	DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
159cdf0e10cSrcweir 				 nToken<HTML_OPTION_NUMBER_END) ||
160cdf0e10cSrcweir 				(nToken>=HTML_OPTION_CONTEXT_START &&
161cdf0e10cSrcweir 				 nToken<HTML_OPTION_CONTEXT_END) ||
162cdf0e10cSrcweir 				nToken==HTML_O_VALUE,
163cdf0e10cSrcweir 		"GetNumber: Option ist nicht numerisch" );
164cdf0e10cSrcweir 	String aTmp( aValue );
165cdf0e10cSrcweir 	aTmp.EraseLeadingChars();
166cdf0e10cSrcweir 	sal_Int32 nTmp = aTmp.ToInt32();
167cdf0e10cSrcweir 	return nTmp >= 0 ? (sal_uInt32)nTmp : 0;
168cdf0e10cSrcweir }
169cdf0e10cSrcweir 
GetSNumber() const170cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const
171cdf0e10cSrcweir {
172cdf0e10cSrcweir 	DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
173cdf0e10cSrcweir 				(nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
174cdf0e10cSrcweir 		"GetSNumber: Option ist nicht numerisch" );
175cdf0e10cSrcweir 	String aTmp( aValue );
176cdf0e10cSrcweir 	aTmp.EraseLeadingChars();
177cdf0e10cSrcweir 	return aTmp.ToInt32();
178cdf0e10cSrcweir }
179cdf0e10cSrcweir 
GetNumbers(SvULongs & rLongs,sal_Bool bSpaceDelim) const180cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const
181cdf0e10cSrcweir {
182cdf0e10cSrcweir 	if( rLongs.Count() )
183cdf0e10cSrcweir 		rLongs.Remove( 0, rLongs.Count() );
184cdf0e10cSrcweir 
185cdf0e10cSrcweir 	if( bSpaceDelim )
186cdf0e10cSrcweir 	{
187cdf0e10cSrcweir 		// das ist ein sehr stark vereinfachter Scanner. Er sucht einfach
188cdf0e10cSrcweir 		// alle Tiffern aus dem String
189cdf0e10cSrcweir 		sal_Bool bInNum = sal_False;
190cdf0e10cSrcweir 		sal_uLong nNum = 0;
191cdf0e10cSrcweir 		for( xub_StrLen i=0; i<aValue.Len(); i++ )
192cdf0e10cSrcweir 		{
193cdf0e10cSrcweir 			register sal_Unicode c = aValue.GetChar( i );
194cdf0e10cSrcweir 			if( c>='0' && c<='9' )
195cdf0e10cSrcweir 			{
196cdf0e10cSrcweir 				nNum *= 10;
197cdf0e10cSrcweir 				nNum += (c - '0');
198cdf0e10cSrcweir 				bInNum = sal_True;
199cdf0e10cSrcweir 			}
200cdf0e10cSrcweir 			else if( bInNum )
201cdf0e10cSrcweir 			{
202cdf0e10cSrcweir 				rLongs.Insert( nNum, rLongs.Count() );
203cdf0e10cSrcweir 				bInNum = sal_False;
204cdf0e10cSrcweir 				nNum = 0;
205cdf0e10cSrcweir 			}
206cdf0e10cSrcweir 		}
207cdf0e10cSrcweir 		if( bInNum )
208cdf0e10cSrcweir 		{
209cdf0e10cSrcweir 			rLongs.Insert( nNum, rLongs.Count() );
210cdf0e10cSrcweir 		}
211cdf0e10cSrcweir 	}
212cdf0e10cSrcweir 	else
213cdf0e10cSrcweir 	{
214cdf0e10cSrcweir 		// hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet
215cdf0e10cSrcweir 		// und auch mal eine 0 eingefuegt
216cdf0e10cSrcweir 		xub_StrLen nPos = 0;
217cdf0e10cSrcweir 		while( nPos < aValue.Len() )
218cdf0e10cSrcweir 		{
219cdf0e10cSrcweir 			register sal_Unicode c;
220cdf0e10cSrcweir 			while( nPos < aValue.Len() &&
221cdf0e10cSrcweir 				   ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' ||
222cdf0e10cSrcweir 				   c == '\n' || c== '\r' ) )
223cdf0e10cSrcweir 				nPos++;
224cdf0e10cSrcweir 
225cdf0e10cSrcweir 			if( nPos==aValue.Len() )
226cdf0e10cSrcweir 				rLongs.Insert( sal_uLong(0), rLongs.Count() );
227cdf0e10cSrcweir 			else
228cdf0e10cSrcweir 			{
229cdf0e10cSrcweir 				xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos );
230cdf0e10cSrcweir 				if( STRING_NOTFOUND==nEnd )
231cdf0e10cSrcweir 				{
232cdf0e10cSrcweir 					sal_Int32 nTmp = aValue.Copy(nPos).ToInt32();
233cdf0e10cSrcweir 					rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
234cdf0e10cSrcweir 								   rLongs.Count() );
235cdf0e10cSrcweir 					nPos = aValue.Len();
236cdf0e10cSrcweir 				}
237cdf0e10cSrcweir 				else
238cdf0e10cSrcweir 				{
239cdf0e10cSrcweir 					sal_Int32 nTmp =
240cdf0e10cSrcweir 						aValue.Copy(nPos,nEnd-nPos).ToInt32();
241cdf0e10cSrcweir 					rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
242cdf0e10cSrcweir 								   rLongs.Count() );
243cdf0e10cSrcweir 					nPos = nEnd+1;
244cdf0e10cSrcweir 				}
245cdf0e10cSrcweir 			}
246cdf0e10cSrcweir 		}
247cdf0e10cSrcweir 	}
248cdf0e10cSrcweir }
249cdf0e10cSrcweir 
GetColor(Color & rColor) const250cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const
251cdf0e10cSrcweir {
252cdf0e10cSrcweir 	DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
253cdf0e10cSrcweir 		"GetColor: Option spezifiziert keine Farbe" );
254cdf0e10cSrcweir 
255cdf0e10cSrcweir 	String aTmp( aValue );
256cdf0e10cSrcweir 	aTmp.ToUpperAscii();
257cdf0e10cSrcweir 	sal_uLong nColor = ULONG_MAX;
258cdf0e10cSrcweir 	if( '#'!=aTmp.GetChar( 0 ) )
259cdf0e10cSrcweir 		nColor = GetHTMLColor( aTmp );
260cdf0e10cSrcweir 
261cdf0e10cSrcweir 	if( ULONG_MAX == nColor )
262cdf0e10cSrcweir 	{
263cdf0e10cSrcweir 		nColor = 0;
264cdf0e10cSrcweir 		xub_StrLen nPos = 0;
265cdf0e10cSrcweir 		for( sal_uInt32 i=0; i<6; i++ )
266cdf0e10cSrcweir 		{
267cdf0e10cSrcweir 			// MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt,
268cdf0e10cSrcweir 			// maximal drei Zeichen, die kleiner als '0' sind werden
269cdf0e10cSrcweir 			// ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich
270cdf0e10cSrcweir 			// irgendwelche HTML-Autoren noch so einfallen lassen...
271cdf0e10cSrcweir 			register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ )
272cdf0e10cSrcweir 													 : '0';
273cdf0e10cSrcweir 			if( c < '0' )
274cdf0e10cSrcweir 			{
275cdf0e10cSrcweir 				c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
276cdf0e10cSrcweir 				if( c < '0' )
277cdf0e10cSrcweir 					c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
278cdf0e10cSrcweir 			}
279cdf0e10cSrcweir 			nColor *= 16;
280cdf0e10cSrcweir 			if( c >= '0' && c <= '9' )
281cdf0e10cSrcweir 				nColor += (c - 48);
282cdf0e10cSrcweir 			else if( c >= 'A' && c <= 'F' )
283cdf0e10cSrcweir 				nColor += (c - 55);
284cdf0e10cSrcweir 		}
285cdf0e10cSrcweir 	}
286cdf0e10cSrcweir 
287cdf0e10cSrcweir 	rColor.SetRed(   (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
288cdf0e10cSrcweir 	rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
289cdf0e10cSrcweir 	rColor.SetBlue(  (sal_uInt8)(nColor & 0x000000ff) );
290cdf0e10cSrcweir }
291cdf0e10cSrcweir 
GetInputType() const292cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const
293cdf0e10cSrcweir {
294cdf0e10cSrcweir 	DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" );
295cdf0e10cSrcweir 	return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
296cdf0e10cSrcweir }
297cdf0e10cSrcweir 
GetTableFrame() const298cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const
299cdf0e10cSrcweir {
300cdf0e10cSrcweir 	DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" );
301cdf0e10cSrcweir 	return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
302cdf0e10cSrcweir }
303cdf0e10cSrcweir 
GetTableRules() const304cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const
305cdf0e10cSrcweir {
306cdf0e10cSrcweir 	DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" );
307cdf0e10cSrcweir 	return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
308cdf0e10cSrcweir }
309cdf0e10cSrcweir 
310cdf0e10cSrcweir /*  */
311cdf0e10cSrcweir 
HTMLParser(SvStream & rIn,int bReadNewDoc)312cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc )
313cdf0e10cSrcweir 	: SvParser( rIn )
314cdf0e10cSrcweir {
315cdf0e10cSrcweir 	bNewDoc = bReadNewDoc;
316cdf0e10cSrcweir 	bReadListing = bReadXMP = bReadPRE = bReadTextArea =
317cdf0e10cSrcweir 		bReadScript = bReadStyle =
318cdf0e10cSrcweir 		bEndTokenFound = bIsInBody = bReadNextChar =
319cdf0e10cSrcweir 		bReadComment = sal_False;
320cdf0e10cSrcweir 	bIsInHeader = sal_True;
321cdf0e10cSrcweir 	pOptions = new HTMLOptions;
3228d621361SPedro Giffuni 
3238d621361SPedro Giffuni 	//#i76649, default to UTF-8 for HTML unless we know differently
3248d621361SPedro Giffuni 	SetSrcEncoding(RTL_TEXTENCODING_UTF8);
325cdf0e10cSrcweir }
326cdf0e10cSrcweir 
~HTMLParser()327cdf0e10cSrcweir HTMLParser::~HTMLParser()
328cdf0e10cSrcweir {
329cdf0e10cSrcweir 	if( pOptions && pOptions->Count() )
330cdf0e10cSrcweir 		pOptions->DeleteAndDestroy( 0, pOptions->Count() );
331cdf0e10cSrcweir 	delete pOptions;
332cdf0e10cSrcweir }
333cdf0e10cSrcweir 
CallParser()334cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser()
335cdf0e10cSrcweir {
336cdf0e10cSrcweir 	eState = SVPAR_WORKING;
337cdf0e10cSrcweir 	nNextCh = GetNextChar();
338cdf0e10cSrcweir 	SaveState( 0 );
339cdf0e10cSrcweir 
340cdf0e10cSrcweir 	nPre_LinePos = 0;
341cdf0e10cSrcweir 	bPre_IgnoreNewPara = sal_False;
342cdf0e10cSrcweir 
343cdf0e10cSrcweir 	AddRef();
344cdf0e10cSrcweir 	Continue( 0 );
345cdf0e10cSrcweir 	if( SVPAR_PENDING != eState )
346cdf0e10cSrcweir 		ReleaseRef();		// dann brauchen wir den Parser nicht mehr!
347cdf0e10cSrcweir 
348cdf0e10cSrcweir 	return eState;
349cdf0e10cSrcweir }
350cdf0e10cSrcweir 
Continue(int nToken)351cdf0e10cSrcweir void HTMLParser::Continue( int nToken )
352cdf0e10cSrcweir {
353cdf0e10cSrcweir 	if( !nToken )
354cdf0e10cSrcweir 		nToken = GetNextToken();
355cdf0e10cSrcweir 
356cdf0e10cSrcweir 	while( IsParserWorking() )
357cdf0e10cSrcweir 	{
358cdf0e10cSrcweir 		SaveState( nToken );
359cdf0e10cSrcweir 		nToken = FilterToken( nToken );
360cdf0e10cSrcweir 
361cdf0e10cSrcweir 		if( nToken )
362cdf0e10cSrcweir 			NextToken( nToken );
363cdf0e10cSrcweir 
364cdf0e10cSrcweir 		if( IsParserWorking() )
365cdf0e10cSrcweir 			SaveState( 0 );			// bis hierhin abgearbeitet,
366cdf0e10cSrcweir 									// weiter mit neuem Token!
367cdf0e10cSrcweir 		nToken = GetNextToken();
368cdf0e10cSrcweir 	}
369cdf0e10cSrcweir }
370cdf0e10cSrcweir 
FilterToken(int nToken)371cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken )
372cdf0e10cSrcweir {
373cdf0e10cSrcweir 	switch( nToken )
374cdf0e10cSrcweir 	{
375cdf0e10cSrcweir 	case sal_Unicode(EOF):
376cdf0e10cSrcweir 		nToken = 0;
377cdf0e10cSrcweir 		break;			// nicht verschicken
378cdf0e10cSrcweir 
379cdf0e10cSrcweir 	case HTML_HEAD_OFF:
380cdf0e10cSrcweir 		bIsInBody = sal_True;
381cdf0e10cSrcweir 	case HTML_HEAD_ON:
382cdf0e10cSrcweir 		bIsInHeader = HTML_HEAD_ON == nToken;
383cdf0e10cSrcweir 		break;
384cdf0e10cSrcweir 
385cdf0e10cSrcweir 	case HTML_BODY_ON:
386cdf0e10cSrcweir 	case HTML_FRAMESET_ON:
387cdf0e10cSrcweir 		bIsInHeader = sal_False;
388cdf0e10cSrcweir 		bIsInBody = HTML_BODY_ON == nToken;
389cdf0e10cSrcweir 		break;
390cdf0e10cSrcweir 
391cdf0e10cSrcweir 	case HTML_BODY_OFF:
392cdf0e10cSrcweir 		bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False;
393cdf0e10cSrcweir 		break;
394cdf0e10cSrcweir 
395cdf0e10cSrcweir 	case HTML_HTML_OFF:
396cdf0e10cSrcweir 		nToken = 0;
397cdf0e10cSrcweir 		bReadPRE = bReadListing = bReadXMP = sal_False;
398cdf0e10cSrcweir 		break;		// HTML_ON wurde auch nicht verschickt !
399cdf0e10cSrcweir 
400cdf0e10cSrcweir 	case HTML_PREFORMTXT_ON:
401cdf0e10cSrcweir 		StartPRE();
402cdf0e10cSrcweir 		break;
403cdf0e10cSrcweir 
404cdf0e10cSrcweir 	case HTML_PREFORMTXT_OFF:
405cdf0e10cSrcweir 		FinishPRE();
406cdf0e10cSrcweir 		break;
407cdf0e10cSrcweir 
408cdf0e10cSrcweir 	case HTML_LISTING_ON:
409cdf0e10cSrcweir 		StartListing();
410cdf0e10cSrcweir 		break;
411cdf0e10cSrcweir 
412cdf0e10cSrcweir 	case HTML_LISTING_OFF:
413cdf0e10cSrcweir 		FinishListing();
414cdf0e10cSrcweir 		break;
415cdf0e10cSrcweir 
416cdf0e10cSrcweir 	case HTML_XMP_ON:
417cdf0e10cSrcweir 		StartXMP();
418cdf0e10cSrcweir 		break;
419cdf0e10cSrcweir 
420cdf0e10cSrcweir 	case HTML_XMP_OFF:
421cdf0e10cSrcweir 		FinishXMP();
422cdf0e10cSrcweir 		break;
423cdf0e10cSrcweir 
424cdf0e10cSrcweir 	default:
425cdf0e10cSrcweir 		if( bReadPRE )
426cdf0e10cSrcweir 			nToken = FilterPRE( nToken );
427cdf0e10cSrcweir 		else if( bReadListing )
428cdf0e10cSrcweir 			nToken = FilterListing( nToken );
429cdf0e10cSrcweir 		else if( bReadXMP )
430cdf0e10cSrcweir 			nToken = FilterXMP( nToken );
431cdf0e10cSrcweir 
432cdf0e10cSrcweir 		break;
433cdf0e10cSrcweir 	}
434cdf0e10cSrcweir 
435cdf0e10cSrcweir 	return nToken;
436cdf0e10cSrcweir }
437cdf0e10cSrcweir 
// ASCII-only character classification helpers for the scanner.
// The argument is fully parenthesised so that expression arguments (for
// example a conditional expression) are classified as a whole.
// NOTE: the argument is evaluated more than once -- do not pass expressions
// with side effects.
#define HTML_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define HTML_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
#define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) )
#define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) )
#define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 )
// --> OD 2006-07-26 #138464#
#define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || ((c) >= 'A' && (c) <= 'F') || ((c) >= 'a' && (c) <= 'f') )
// <--
446cdf0e10cSrcweir 
ScanText(const sal_Unicode cBreak)447cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak )
448cdf0e10cSrcweir {
449cdf0e10cSrcweir 	::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
450cdf0e10cSrcweir 	int bWeiter = sal_True;
451cdf0e10cSrcweir 	int bEqSignFound = sal_False;
452cdf0e10cSrcweir 	sal_Unicode cQuote = 0U;
453cdf0e10cSrcweir 
454cdf0e10cSrcweir 	while( bWeiter && IsParserWorking() )
455cdf0e10cSrcweir 	{
456cdf0e10cSrcweir 		int bNextCh = sal_True;
457cdf0e10cSrcweir 		switch( nNextCh )
458cdf0e10cSrcweir 		{
459cdf0e10cSrcweir 		case '&':
460cdf0e10cSrcweir 			bEqSignFound = sal_False;
461cdf0e10cSrcweir 			if( bReadXMP )
462cdf0e10cSrcweir 				sTmpBuffer.append( (sal_Unicode)'&' );
463cdf0e10cSrcweir 			else
464cdf0e10cSrcweir 			{
465cdf0e10cSrcweir 				sal_uLong nStreamPos = rInput.Tell();
466cdf0e10cSrcweir 				sal_uLong nLinePos = GetLinePos();
467cdf0e10cSrcweir 
468cdf0e10cSrcweir 				sal_Unicode cChar = 0U;
469cdf0e10cSrcweir 				if( '#' == (nNextCh = GetNextChar()) )
470cdf0e10cSrcweir 				{
471cdf0e10cSrcweir 					nNextCh = GetNextChar();
472cdf0e10cSrcweir                     // --> OD 2006-07-26 #138464#
473cdf0e10cSrcweir                     // consider hexadecimal digits
474cdf0e10cSrcweir                     const sal_Bool bIsHex( 'x' == nNextCh );
475cdf0e10cSrcweir                     const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
476cdf0e10cSrcweir                     if ( bIsDecOrHex )
477cdf0e10cSrcweir 					{
478cdf0e10cSrcweir                         if ( bIsHex )
479cdf0e10cSrcweir                         {
480cdf0e10cSrcweir                             nNextCh = GetNextChar();
481cdf0e10cSrcweir                             while ( HTML_ISHEXDIGIT(nNextCh) )
482cdf0e10cSrcweir                             {
483cdf0e10cSrcweir                                 cChar = cChar * 16U +
484cdf0e10cSrcweir                                         ( nNextCh <= '9'
485cdf0e10cSrcweir                                           ? sal_Unicode( nNextCh - '0' )
486cdf0e10cSrcweir                                           : ( nNextCh <= 'F'
487cdf0e10cSrcweir                                               ? sal_Unicode( nNextCh - 'A' + 10 )
488cdf0e10cSrcweir                                               : sal_Unicode( nNextCh - 'a' + 10 ) ) );
489cdf0e10cSrcweir                                 nNextCh = GetNextChar();
490cdf0e10cSrcweir                             }
491cdf0e10cSrcweir                         }
492cdf0e10cSrcweir                         else
493cdf0e10cSrcweir                         {
494cdf0e10cSrcweir                             do
495cdf0e10cSrcweir                             {
496cdf0e10cSrcweir                                 cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
497cdf0e10cSrcweir                                 nNextCh = GetNextChar();
498cdf0e10cSrcweir                             }
499cdf0e10cSrcweir                             while( HTML_ISDIGIT(nNextCh) );
500cdf0e10cSrcweir                         }
501cdf0e10cSrcweir 
502cdf0e10cSrcweir 						if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
503cdf0e10cSrcweir 							RTL_TEXTENCODING_UCS2 != eSrcEnc &&
504cdf0e10cSrcweir 							RTL_TEXTENCODING_UTF8 != eSrcEnc &&
505cdf0e10cSrcweir 						 	cChar < 256 )
506cdf0e10cSrcweir 						{
507cdf0e10cSrcweir 						 	sal_Unicode cOrig = cChar;
508cdf0e10cSrcweir 							cChar = ByteString::ConvertToUnicode(
509cdf0e10cSrcweir 											(sal_Char)cChar, eSrcEnc );
510cdf0e10cSrcweir 							if( 0U == cChar )
511cdf0e10cSrcweir 							{
512cdf0e10cSrcweir 								// #73398#: If the character could not be
513cdf0e10cSrcweir 								// converted, because a conversion is not
514cdf0e10cSrcweir 								// available, do no conversion at all.
515cdf0e10cSrcweir 								cChar = cOrig;
516cdf0e10cSrcweir 							}
517cdf0e10cSrcweir 						}
518cdf0e10cSrcweir 					}
519cdf0e10cSrcweir                     // <--
520cdf0e10cSrcweir 					else
521cdf0e10cSrcweir 						nNextCh = 0U;
522cdf0e10cSrcweir 				}
523cdf0e10cSrcweir 				else if( HTML_ISALPHA( nNextCh ) )
524cdf0e10cSrcweir 				{
525cdf0e10cSrcweir 					::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
526cdf0e10cSrcweir 					xub_StrLen nPos = 0L;
527cdf0e10cSrcweir 					do
528cdf0e10cSrcweir 					{
529cdf0e10cSrcweir 						sEntityBuffer.append( nNextCh );
530cdf0e10cSrcweir 						nPos++;
531cdf0e10cSrcweir 						nNextCh = GetNextChar();
532cdf0e10cSrcweir 					}
533cdf0e10cSrcweir 					while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
534cdf0e10cSrcweir 						   !rInput.IsEof() );
535cdf0e10cSrcweir 
536cdf0e10cSrcweir 					if( IsParserWorking() && !rInput.IsEof() )
537cdf0e10cSrcweir 					{
538cdf0e10cSrcweir 						String sEntity( sEntityBuffer.getStr(), nPos );
539cdf0e10cSrcweir 						cChar = GetHTMLCharName( sEntity );
540cdf0e10cSrcweir 
541cdf0e10cSrcweir 						// nicht gefunden ( == 0 ), dann Klartext
542cdf0e10cSrcweir 						// oder ein Zeichen das als Attribut eingefuegt
543cdf0e10cSrcweir 						// wird
544cdf0e10cSrcweir 						if( 0U == cChar && ';' != nNextCh )
545cdf0e10cSrcweir 						{
546cdf0e10cSrcweir 							DBG_ASSERT( rInput.Tell() - nStreamPos ==
547cdf0e10cSrcweir 										(sal_uLong)(nPos+1L)*GetCharSize(),
548cdf0e10cSrcweir 										"UTF-8 geht hier schief" );
549cdf0e10cSrcweir 							for( xub_StrLen i=nPos-1L; i>1L; i-- )
550cdf0e10cSrcweir 							{
551cdf0e10cSrcweir 								nNextCh = sEntityBuffer[i];
552cdf0e10cSrcweir 								sEntityBuffer.setLength( i );
553cdf0e10cSrcweir 								sEntity.Assign( sEntityBuffer.getStr(), i );
554cdf0e10cSrcweir  								cChar = GetHTMLCharName( sEntity );
555cdf0e10cSrcweir 								if( cChar )
556cdf0e10cSrcweir 								{
557cdf0e10cSrcweir 									rInput.SeekRel( -(long)
558cdf0e10cSrcweir 											((nPos-i)*GetCharSize()) );
559cdf0e10cSrcweir 									nlLinePos -= sal_uInt32(nPos-i);
560cdf0e10cSrcweir 									nPos = i;
561cdf0e10cSrcweir 									ClearTxtConvContext();
562cdf0e10cSrcweir 									break;
563cdf0e10cSrcweir 								}
564cdf0e10cSrcweir 							}
565cdf0e10cSrcweir 						}
566cdf0e10cSrcweir 
567cdf0e10cSrcweir 						if( !cChar )		// unbekanntes Zeichen?
568cdf0e10cSrcweir 						{
569cdf0e10cSrcweir 							// dann im Stream zurueck, das '&' als Zeichen
570cdf0e10cSrcweir 							// einfuegen und mit dem nachfolgenden Zeichen
571cdf0e10cSrcweir 							// wieder aufsetzen
572cdf0e10cSrcweir 							sTmpBuffer.append( (sal_Unicode)'&' );
573cdf0e10cSrcweir 
574cdf0e10cSrcweir //							rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
575cdf0e10cSrcweir //							nlLinePos -= nPos;
576cdf0e10cSrcweir 							DBG_ASSERT( rInput.Tell()-nStreamPos ==
577cdf0e10cSrcweir 										(sal_uLong)(nPos+1)*GetCharSize(),
578cdf0e10cSrcweir 										"Falsche Stream-Position" );
579cdf0e10cSrcweir 							DBG_ASSERT( nlLinePos-nLinePos ==
580cdf0e10cSrcweir 										(sal_uLong)(nPos+1),
581cdf0e10cSrcweir 										"Falsche Zeilen-Position" );
582cdf0e10cSrcweir 							rInput.Seek( nStreamPos );
583cdf0e10cSrcweir 							nlLinePos = nLinePos;
584cdf0e10cSrcweir 							ClearTxtConvContext();
585cdf0e10cSrcweir 							break;
586cdf0e10cSrcweir 						}
587cdf0e10cSrcweir 
588cdf0e10cSrcweir 						// 1 == Non Breaking Space
589cdf0e10cSrcweir 						// 2 == SoftHyphen
590cdf0e10cSrcweir 
591cdf0e10cSrcweir 						if( cChar < 3U )
592cdf0e10cSrcweir 						{
593cdf0e10cSrcweir 							if( '>' == cBreak )
594cdf0e10cSrcweir 							{
595cdf0e10cSrcweir 								// Wenn der Inhalt eines Tags gelesen wird,
596cdf0e10cSrcweir 								// muessen wir ein Space bzw. - daraus machen
597cdf0e10cSrcweir 								switch( cChar )
598cdf0e10cSrcweir 								{
599cdf0e10cSrcweir 								case 1U: cChar = ' '; break;
600cdf0e10cSrcweir 								case 2U: cChar = '-'; break;
601cdf0e10cSrcweir 								default:
602cdf0e10cSrcweir 									DBG_ASSERT( cChar==1U,
603cdf0e10cSrcweir 							"\0x00 sollte doch schon laengt abgefangen sein!" );
604cdf0e10cSrcweir 									break;
605cdf0e10cSrcweir 								}
606cdf0e10cSrcweir 							}
607cdf0e10cSrcweir 							else
608cdf0e10cSrcweir 							{
609cdf0e10cSrcweir 								// Wenn kein Tag gescannt wird, enstprechendes
610cdf0e10cSrcweir 								// Token zurueckgeben
611cdf0e10cSrcweir 								aToken +=
612cdf0e10cSrcweir 									String( sTmpBuffer.makeStringAndClear() );
613cdf0e10cSrcweir 								if( cChar )
614cdf0e10cSrcweir 								{
615cdf0e10cSrcweir 									if( aToken.Len() )
616cdf0e10cSrcweir 									{
617cdf0e10cSrcweir 										// mit dem Zeichen wieder aufsetzen
618cdf0e10cSrcweir 										nNextCh = '&';
619cdf0e10cSrcweir //										rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
620cdf0e10cSrcweir //										nlLinePos -= nPos;
621cdf0e10cSrcweir 										DBG_ASSERT( rInput.Tell()-nStreamPos ==
622cdf0e10cSrcweir 													(sal_uLong)(nPos+1)*GetCharSize(),
623cdf0e10cSrcweir 													"Falsche Stream-Position" );
624cdf0e10cSrcweir 										DBG_ASSERT( nlLinePos-nLinePos ==
625cdf0e10cSrcweir 													(sal_uLong)(nPos+1),
626cdf0e10cSrcweir 													"Falsche Zeilen-Position" );
627cdf0e10cSrcweir 										rInput.Seek( nStreamPos );
628cdf0e10cSrcweir 										nlLinePos = nLinePos;
629cdf0e10cSrcweir 										ClearTxtConvContext();
630cdf0e10cSrcweir 										return HTML_TEXTTOKEN;
631cdf0e10cSrcweir 									}
632cdf0e10cSrcweir 
633cdf0e10cSrcweir 									// Hack: _GetNextChar soll nicht das
634cdf0e10cSrcweir 									// naechste Zeichen lesen
635cdf0e10cSrcweir 									if( ';' != nNextCh )
636cdf0e10cSrcweir 										aToken += ' ';
637cdf0e10cSrcweir 									if( 1U == cChar )
638cdf0e10cSrcweir 										return HTML_NONBREAKSPACE;
639cdf0e10cSrcweir 									if( 2U == cChar )
640cdf0e10cSrcweir 										return HTML_SOFTHYPH;
641cdf0e10cSrcweir 								}
642cdf0e10cSrcweir 								aToken += (sal_Unicode)'&';
643cdf0e10cSrcweir 								aToken +=
644cdf0e10cSrcweir 									String(sEntityBuffer.makeStringAndClear());
645cdf0e10cSrcweir 								break;
646cdf0e10cSrcweir 							}
647cdf0e10cSrcweir 						}
648cdf0e10cSrcweir 					}
649cdf0e10cSrcweir 					else
650cdf0e10cSrcweir 						nNextCh = 0U;
651cdf0e10cSrcweir 				}
652cdf0e10cSrcweir 				// MIB 03/02/2000: &{...};-JavaScript-Macros are not
653cdf0e10cSrcweir 				// supported any longer.
654cdf0e10cSrcweir 				else if( IsParserWorking() )
655cdf0e10cSrcweir 				{
656cdf0e10cSrcweir 					sTmpBuffer.append( (sal_Unicode)'&' );
657cdf0e10cSrcweir 					bNextCh = sal_False;
658cdf0e10cSrcweir 					break;
659cdf0e10cSrcweir 				}
660cdf0e10cSrcweir 
661cdf0e10cSrcweir 				bNextCh = (';' == nNextCh);
662cdf0e10cSrcweir 				if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
663cdf0e10cSrcweir 									cChar=='\"' || cChar==' ') )
664cdf0e10cSrcweir 				{
665cdf0e10cSrcweir 					// ' und " mussen innerhalb von Tags mit einem
666cdf0e10cSrcweir 					// gekennzeichnet werden, um sie von ' und " als Klammern
667cdf0e10cSrcweir 					// um Optionen zu unterscheiden. Logischerweise muss
668cdf0e10cSrcweir 					// deshalb auch ein \ gekeenzeichnet werden. Ausserdem
669cdf0e10cSrcweir 					// schuetzen wir ein Space, weil es kein Trennzeichen
670cdf0e10cSrcweir 					// zwischen Optionen ist.
671cdf0e10cSrcweir 					sTmpBuffer.append( (sal_Unicode)'\\' );
672cdf0e10cSrcweir 					if( MAX_LEN == sTmpBuffer.getLength() )
673cdf0e10cSrcweir 						aToken += String(sTmpBuffer.makeStringAndClear());
674cdf0e10cSrcweir 				}
675cdf0e10cSrcweir 				if( IsParserWorking() )
676cdf0e10cSrcweir 				{
677cdf0e10cSrcweir 					if( cChar )
678cdf0e10cSrcweir 						sTmpBuffer.append( cChar );
679cdf0e10cSrcweir 				}
680cdf0e10cSrcweir 				else if( SVPAR_PENDING==eState && '>'!=cBreak )
681cdf0e10cSrcweir 				{
682cdf0e10cSrcweir 					// Mit dem '&' Zeichen wieder aufsetzen, der Rest
683cdf0e10cSrcweir 					// wird als Texttoken zurueckgegeben.
684cdf0e10cSrcweir 					if( aToken.Len() || sTmpBuffer.getLength() )
685cdf0e10cSrcweir 					{
686cdf0e10cSrcweir 						// Der bisherige Text wird von _GetNextChar()
687cdf0e10cSrcweir 						// zurueckgegeben und beim naechsten Aufruf wird
688cdf0e10cSrcweir 						// ein neues Zeichen gelesen. Also muessen wir uns
689cdf0e10cSrcweir 						// noch vor das & stellen.
690cdf0e10cSrcweir 						nNextCh = 0U;
691cdf0e10cSrcweir 						rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
692cdf0e10cSrcweir 						nlLinePos = nLinePos-1;
693cdf0e10cSrcweir 						ClearTxtConvContext();
694cdf0e10cSrcweir 						bReadNextChar = sal_True;
695cdf0e10cSrcweir 					}
696cdf0e10cSrcweir 					bNextCh = sal_False;
697cdf0e10cSrcweir 				}
698cdf0e10cSrcweir 			}
699cdf0e10cSrcweir 			break;
700cdf0e10cSrcweir 		case '=':
701cdf0e10cSrcweir 			if( '>'==cBreak && !cQuote )
702cdf0e10cSrcweir 				bEqSignFound = sal_True;
703cdf0e10cSrcweir 			sTmpBuffer.append( nNextCh );
704cdf0e10cSrcweir 			break;
705cdf0e10cSrcweir 
706cdf0e10cSrcweir 		case '\\':
707cdf0e10cSrcweir 			if( '>'==cBreak )
708cdf0e10cSrcweir 			{
709cdf0e10cSrcweir 				// Innerhalb von Tags kennzeichnen
710cdf0e10cSrcweir 				sTmpBuffer.append( (sal_Unicode)'\\' );
711cdf0e10cSrcweir 				if( MAX_LEN == sTmpBuffer.getLength() )
712cdf0e10cSrcweir 					aToken += String(sTmpBuffer.makeStringAndClear());
713cdf0e10cSrcweir 			}
714cdf0e10cSrcweir 			sTmpBuffer.append( (sal_Unicode)'\\' );
715cdf0e10cSrcweir 			break;
716cdf0e10cSrcweir 
717cdf0e10cSrcweir 		case '\"':
718cdf0e10cSrcweir 		case '\'':
719cdf0e10cSrcweir 			if( '>'==cBreak )
720cdf0e10cSrcweir 			{
721cdf0e10cSrcweir 				if( bEqSignFound )
722cdf0e10cSrcweir 					cQuote = nNextCh;
723cdf0e10cSrcweir 				else if( cQuote && (cQuote==nNextCh ) )
724cdf0e10cSrcweir 					cQuote = 0U;
725cdf0e10cSrcweir 			}
726cdf0e10cSrcweir 			sTmpBuffer.append( nNextCh );
727cdf0e10cSrcweir 			bEqSignFound = sal_False;
728cdf0e10cSrcweir 			break;
729cdf0e10cSrcweir 
730cdf0e10cSrcweir 		case sal_Unicode(EOF):
731cdf0e10cSrcweir 			if( rInput.IsEof() )
732cdf0e10cSrcweir 			{
733cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird
734cdf0e10cSrcweir // abc&auml;<EOF> nicht angezeigt, also lassen wir das in Zukunft.
735cdf0e10cSrcweir //				if( '>' != cBreak )
736cdf0e10cSrcweir //					eState = SVPAR_ACCEPTED;
737cdf0e10cSrcweir 				bWeiter = sal_False;
738cdf0e10cSrcweir 			}
739cdf0e10cSrcweir 			else
740cdf0e10cSrcweir 			{
741cdf0e10cSrcweir 				sTmpBuffer.append( nNextCh );
742cdf0e10cSrcweir 			}
743cdf0e10cSrcweir 			break;
744cdf0e10cSrcweir 
745cdf0e10cSrcweir 		case '<':
746cdf0e10cSrcweir 			bEqSignFound = sal_False;
747cdf0e10cSrcweir 			if( '>'==cBreak )
748cdf0e10cSrcweir 				sTmpBuffer.append( nNextCh );
749cdf0e10cSrcweir 			else
750cdf0e10cSrcweir 				bWeiter = sal_False;		// Abbrechen, String zusammen
751cdf0e10cSrcweir 			break;
752cdf0e10cSrcweir 
753cdf0e10cSrcweir 		case '\f':
754cdf0e10cSrcweir 			if( '>' == cBreak )
755cdf0e10cSrcweir 			{
756cdf0e10cSrcweir 				// Beim Scannen von Optionen wie ein Space behandeln
757cdf0e10cSrcweir 				sTmpBuffer.append( (sal_Unicode)' ' );
758cdf0e10cSrcweir 			}
759cdf0e10cSrcweir 			else
760cdf0e10cSrcweir 			{
761cdf0e10cSrcweir 				// sonst wird es ein eigenes Token
762cdf0e10cSrcweir 				bWeiter = sal_False;
763cdf0e10cSrcweir 			}
764cdf0e10cSrcweir 			break;
765cdf0e10cSrcweir 
766cdf0e10cSrcweir 		case '\r':
767cdf0e10cSrcweir 		case '\n':
768cdf0e10cSrcweir 			if( '>'==cBreak )
769cdf0e10cSrcweir 			{
770cdf0e10cSrcweir 				// #26979# cr/lf in Tag wird in _GetNextToken() behandeln
771cdf0e10cSrcweir 				sTmpBuffer.append( nNextCh );
772cdf0e10cSrcweir 				break;
773cdf0e10cSrcweir 			}
774cdf0e10cSrcweir 			else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
775cdf0e10cSrcweir 			{
776cdf0e10cSrcweir 				bWeiter = sal_False;
777cdf0e10cSrcweir 				break;
778cdf0e10cSrcweir 			}
779cdf0e10cSrcweir 			// Bug 18984: CR-LF -> Blank
780cdf0e10cSrcweir 			// 		Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln
781cdf0e10cSrcweir 			// kein break!!
782cdf0e10cSrcweir 		case '\t':
783cdf0e10cSrcweir 			if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
784cdf0e10cSrcweir 			{
785cdf0e10cSrcweir 				// In <PRE>: Tabs nach oben durchreichen
786cdf0e10cSrcweir 				bWeiter = sal_False;
787cdf0e10cSrcweir 				break;
788cdf0e10cSrcweir 			}
789cdf0e10cSrcweir 			// kein break
790cdf0e10cSrcweir 		case '\x0b':
791cdf0e10cSrcweir 			if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
792cdf0e10cSrcweir 				'>'!=cBreak )
793cdf0e10cSrcweir 			{
794cdf0e10cSrcweir 				break;
795cdf0e10cSrcweir 			}
796cdf0e10cSrcweir 			nNextCh = ' ';
797cdf0e10cSrcweir 			// kein break;
798cdf0e10cSrcweir 		case ' ':
799cdf0e10cSrcweir 			sTmpBuffer.append( nNextCh );
800cdf0e10cSrcweir 			if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
801cdf0e10cSrcweir 								!bReadPRE && !bReadTextArea) )
802cdf0e10cSrcweir 			{
803cdf0e10cSrcweir 				// alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln
804cdf0e10cSrcweir 				do {
805cdf0e10cSrcweir 					if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
806cdf0e10cSrcweir 						rInput.IsEof() )
807cdf0e10cSrcweir 					{
808cdf0e10cSrcweir 						if( aToken.Len() || sTmpBuffer.getLength() > 1L )
809cdf0e10cSrcweir 						{
810cdf0e10cSrcweir 							// ausser den Blanks wurde noch etwas geselen
811cdf0e10cSrcweir 							aToken += String(sTmpBuffer.makeStringAndClear());
812cdf0e10cSrcweir 							return HTML_TEXTTOKEN;
813cdf0e10cSrcweir 						}
814cdf0e10cSrcweir 						else
815cdf0e10cSrcweir 							// nur Blanks gelesen: dann darf kein Text
816cdf0e10cSrcweir 							// mehr zurueckgegeben werden und _GetNextToken
817cdf0e10cSrcweir 							// muss auf EOF laufen
818cdf0e10cSrcweir 							return 0;
819cdf0e10cSrcweir 					}
820cdf0e10cSrcweir 				} while ( ' ' == nNextCh || '\t' == nNextCh ||
821cdf0e10cSrcweir 						  '\r' == nNextCh || '\n' == nNextCh ||
822cdf0e10cSrcweir 						  '\x0b' == nNextCh );
823cdf0e10cSrcweir 				bNextCh = sal_False;
824cdf0e10cSrcweir 			}
825cdf0e10cSrcweir 			break;
826cdf0e10cSrcweir 
827cdf0e10cSrcweir 		default:
828cdf0e10cSrcweir 			bEqSignFound = sal_False;
829cdf0e10cSrcweir 			if( (nNextCh==cBreak && !cQuote) ||
830cdf0e10cSrcweir 				(sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 ))
831cdf0e10cSrcweir 				bWeiter = sal_False;
832cdf0e10cSrcweir 			else
833cdf0e10cSrcweir 			{
834cdf0e10cSrcweir 				do {
835cdf0e10cSrcweir 					// alle anderen Zeichen kommen in den Text
836cdf0e10cSrcweir 					sTmpBuffer.append( nNextCh );
837cdf0e10cSrcweir 					if( MAX_LEN == sTmpBuffer.getLength() )
838cdf0e10cSrcweir 					{
839cdf0e10cSrcweir 						aToken += String(sTmpBuffer.makeStringAndClear());
840cdf0e10cSrcweir 						if( (sal_uLong(aToken.Len()) + MAX_LEN) >
841cdf0e10cSrcweir 								sal_uLong(STRING_MAXLEN & ~1 ) )
842cdf0e10cSrcweir 						{
843cdf0e10cSrcweir 							nNextCh = GetNextChar();
844cdf0e10cSrcweir 							return HTML_TEXTTOKEN;
845cdf0e10cSrcweir 						}
846cdf0e10cSrcweir 					}
847cdf0e10cSrcweir 					if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
848cdf0e10cSrcweir 						  rInput.IsEof() ) ||
849cdf0e10cSrcweir 						!IsParserWorking() )
850cdf0e10cSrcweir 					{
851cdf0e10cSrcweir 						if( sTmpBuffer.getLength() )
852cdf0e10cSrcweir 							aToken += String(sTmpBuffer.makeStringAndClear());
853cdf0e10cSrcweir 						return HTML_TEXTTOKEN;
854cdf0e10cSrcweir 					}
855cdf0e10cSrcweir 				} while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
856cdf0e10cSrcweir 				bNextCh = sal_False;
857cdf0e10cSrcweir 			}
858cdf0e10cSrcweir 		}
859cdf0e10cSrcweir 
860cdf0e10cSrcweir 		if( MAX_LEN == sTmpBuffer.getLength() )
861cdf0e10cSrcweir 			aToken += String(sTmpBuffer.makeStringAndClear());
862cdf0e10cSrcweir 
863cdf0e10cSrcweir 		if( bWeiter && bNextCh )
864cdf0e10cSrcweir 			nNextCh = GetNextChar();
865cdf0e10cSrcweir 	}
866cdf0e10cSrcweir 
867cdf0e10cSrcweir 	if( sTmpBuffer.getLength() )
868cdf0e10cSrcweir 		aToken += String(sTmpBuffer.makeStringAndClear());
869cdf0e10cSrcweir 
870cdf0e10cSrcweir 	return HTML_TEXTTOKEN;
871cdf0e10cSrcweir }
872cdf0e10cSrcweir 
// Reads the next piece of raw (unparsed) data. _GetNextToken() calls this
// while bReadScript or bReadStyle is set or aEndToken is non-empty.
//
// Characters are collected in a local buffer and flushed into aToken. A CR,
// CR LF or LF terminates the current token (even if it is empty). On '<' the
// following tag name is inspected to decide whether the raw section ends:
//   - script / aEndToken mode: "</SCRIPT" resp. "</" + aEndToken, but only
//     while not inside a "<!--" comment (tracked in bReadComment);
//   - otherwise (style sheet): "</STYLE", "</HEAD" or "<BODY".
// When the end is found the stream is rewound to just behind that '<' so the
// end tag can be parsed normally afterwards.
//
// Returns HTML_RAWDATA by default. Returns 0 when the end token had already
// been found by the previous call, when the end token or EOF is reached with
// nothing collected in aToken, or when the parser is no longer working after
// the loop.
_GetNextRawToken()873cdf0e10cSrcweir int HTMLParser::_GetNextRawToken()
874cdf0e10cSrcweir {
875cdf0e10cSrcweir 	::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
876cdf0e10cSrcweir 
877cdf0e10cSrcweir 	if( bEndTokenFound )
878cdf0e10cSrcweir 	{
879cdf0e10cSrcweir 		// The previous call already found the end token,
880cdf0e10cSrcweir 		// so there is no need to search for it again.
881cdf0e10cSrcweir 		bReadScript = sal_False;
882cdf0e10cSrcweir 		bReadStyle = sal_False;
883cdf0e10cSrcweir 		aEndToken.Erase();
884cdf0e10cSrcweir 		bEndTokenFound = sal_False;
885cdf0e10cSrcweir 
886cdf0e10cSrcweir 		return 0;
887cdf0e10cSrcweir 	}
888cdf0e10cSrcweir 
889cdf0e10cSrcweir 	// HTML_RAWDATA is returned by default
890cdf0e10cSrcweir 	int bWeiter = sal_True;
891cdf0e10cSrcweir 	int nToken = HTML_RAWDATA;
892cdf0e10cSrcweir 	SaveState( 0 );
893cdf0e10cSrcweir 	while( bWeiter && IsParserWorking() )
894cdf0e10cSrcweir 	{
		// bNextCh: read a fresh character at the end of this iteration;
		// cleared by branches that have already read ahead into nNextCh.
895cdf0e10cSrcweir 		int bNextCh = sal_True;
896cdf0e10cSrcweir 		switch( nNextCh )
897cdf0e10cSrcweir 		{
898cdf0e10cSrcweir 		case '<':
899cdf0e10cSrcweir 			{
900cdf0e10cSrcweir 				// We may have reached the end
901cdf0e10cSrcweir 
902cdf0e10cSrcweir 				// first save what has been read so far
903cdf0e10cSrcweir 				aToken += String(sTmpBuffer.makeStringAndClear());
904cdf0e10cSrcweir 
905cdf0e10cSrcweir 				// and remember the position in the stream
906cdf0e10cSrcweir 				sal_uLong nStreamPos = rInput.Tell();
907cdf0e10cSrcweir 				sal_uLong nLineNr = GetLineNr();
908cdf0e10cSrcweir 				sal_uLong nLinePos = GetLinePos();
909cdf0e10cSrcweir 
910cdf0e10cSrcweir 				// Start of an end token?
911cdf0e10cSrcweir 				int bOffState = sal_False;
912cdf0e10cSrcweir 				if( '/' == (nNextCh = GetNextChar()) )
913cdf0e10cSrcweir 				{
914cdf0e10cSrcweir 					bOffState = sal_True;
915cdf0e10cSrcweir 					nNextCh = GetNextChar();
916cdf0e10cSrcweir 				}
917cdf0e10cSrcweir 				else if( '!' == nNextCh )
918cdf0e10cSrcweir 				{
919cdf0e10cSrcweir 					sTmpBuffer.append( nNextCh );
920cdf0e10cSrcweir 					nNextCh = GetNextChar();
921cdf0e10cSrcweir 				}
922cdf0e10cSrcweir 
923cdf0e10cSrcweir 				// now read the letters (and '-') that follow
924cdf0e10cSrcweir 				while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
925cdf0e10cSrcweir 					   IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
926cdf0e10cSrcweir 				{
927cdf0e10cSrcweir 					sTmpBuffer.append( nNextCh );
928cdf0e10cSrcweir 					nNextCh = GetNextChar();
929cdf0e10cSrcweir 				}
930cdf0e10cSrcweir 
				// Upper-cased copy of the tag name for the comparisons below;
				// sTmpBuffer itself keeps the text as it was read.
931cdf0e10cSrcweir 				String aTok( sTmpBuffer.getStr(),
932cdf0e10cSrcweir 							 sal::static_int_cast< xub_StrLen >(
933cdf0e10cSrcweir                                  sTmpBuffer.getLength()) );
934cdf0e10cSrcweir 				aTok.ToUpperAscii();
935cdf0e10cSrcweir 				sal_Bool bDone = sal_False;
936cdf0e10cSrcweir 				if( bReadScript || aEndToken.Len() )
937cdf0e10cSrcweir 				{
938cdf0e10cSrcweir 					if( !bReadComment )
939cdf0e10cSrcweir 					{
						// only the first 3 characters are compared here
940cdf0e10cSrcweir 						if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 )
941cdf0e10cSrcweir 								== COMPARE_EQUAL )
942cdf0e10cSrcweir 						{
943cdf0e10cSrcweir 							bReadComment = sal_True;
944cdf0e10cSrcweir 						}
945cdf0e10cSrcweir 						else
946cdf0e10cSrcweir 						{
947cdf0e10cSrcweir 							// a script must end with "</SCRIPT>"; for
948cdf0e10cSrcweir 							// safety reasons we are not strict about the
949cdf0e10cSrcweir 							// ">" for now
950cdf0e10cSrcweir 							bDone = bOffState && // '>'==nNextCh &&
951cdf0e10cSrcweir 							COMPARE_EQUAL == ( bReadScript
952cdf0e10cSrcweir 								? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
953cdf0e10cSrcweir 								: aTok.CompareTo(aEndToken) );
954cdf0e10cSrcweir 						}
955cdf0e10cSrcweir 					}
956cdf0e10cSrcweir 					if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
957cdf0e10cSrcweir 						aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
958cdf0e10cSrcweir 					{
959cdf0e10cSrcweir 						// a comment of the form <!-----> ends here
960cdf0e10cSrcweir 						bReadComment = sal_False;
961cdf0e10cSrcweir 					}
962cdf0e10cSrcweir 				}
963cdf0e10cSrcweir 				else
964cdf0e10cSrcweir 				{
965cdf0e10cSrcweir 					// a style sheet may end with </STYLE>, </HEAD> or
966cdf0e10cSrcweir 					// <BODY>
967cdf0e10cSrcweir 					if( bOffState )
968cdf0e10cSrcweir 						bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
969cdf0e10cSrcweir 									== COMPARE_EQUAL ||
970cdf0e10cSrcweir 								aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
971cdf0e10cSrcweir 									== COMPARE_EQUAL;
972cdf0e10cSrcweir 					else
973cdf0e10cSrcweir 						bDone =
974cdf0e10cSrcweir 							aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
975cdf0e10cSrcweir 				}
976cdf0e10cSrcweir 
977cdf0e10cSrcweir 				if( bDone )
978cdf0e10cSrcweir 				{
979cdf0e10cSrcweir 					// that's it; now return the string read so far,
980cdf0e10cSrcweir 					// if there is one, and continue normally
981cdf0e10cSrcweir 					// afterwards
982cdf0e10cSrcweir 
983cdf0e10cSrcweir 					bWeiter = sal_False;
984cdf0e10cSrcweir 
985cdf0e10cSrcweir 					// nToken==0 means that _GetNextToken continues reading right away
986cdf0e10cSrcweir 					if( !aToken.Len() && (bReadStyle || bReadScript) )
987cdf0e10cSrcweir 					{
988cdf0e10cSrcweir 						// we can leave the raw environment immediately
989cdf0e10cSrcweir 						// and parse the end token
990cdf0e10cSrcweir 						bReadScript = sal_False;
991cdf0e10cSrcweir 						bReadStyle = sal_False;
992cdf0e10cSrcweir 						aEndToken.Erase();
993cdf0e10cSrcweir 						nToken = 0;
994cdf0e10cSrcweir 					}
995cdf0e10cSrcweir 					else
996cdf0e10cSrcweir 					{
997cdf0e10cSrcweir 						// bReadScript/bReadStyle must be kept alive for
998cdf0e10cSrcweir 						// now; the end token can only be parsed on the
999cdf0e10cSrcweir 						// next call
1000cdf0e10cSrcweir 						bEndTokenFound = sal_True;
1001cdf0e10cSrcweir 					}
1002cdf0e10cSrcweir 
1003cdf0e10cSrcweir 					// now rewind the stream back to the '<'
1004cdf0e10cSrcweir 					rInput.Seek( nStreamPos );
1005cdf0e10cSrcweir 					SetLineNr( nLineNr );
1006cdf0e10cSrcweir 					SetLinePos( nLinePos );
1007cdf0e10cSrcweir 					ClearTxtConvContext();
1008cdf0e10cSrcweir 					nNextCh = '<';
1009cdf0e10cSrcweir 
1010cdf0e10cSrcweir 					// the tag name must not be appended to the token
1011cdf0e10cSrcweir 					sTmpBuffer.setLength( 0L );
1012cdf0e10cSrcweir 				}
1013cdf0e10cSrcweir 				else
1014cdf0e10cSrcweir 				{
1015cdf0e10cSrcweir 					// remember the "<" (and "/"); everything else is still in the buffer
1016cdf0e10cSrcweir 					aToken += (sal_Unicode)'<';
1017cdf0e10cSrcweir 					if( bOffState )
1018cdf0e10cSrcweir 						aToken += (sal_Unicode)'/';
1019cdf0e10cSrcweir 
					// nNextCh already holds the first character behind the name
1020cdf0e10cSrcweir 					bNextCh = sal_False;
1021cdf0e10cSrcweir 				}
1022cdf0e10cSrcweir 			}
1023cdf0e10cSrcweir 			break;
1024cdf0e10cSrcweir 		case '-':
1025cdf0e10cSrcweir 			sTmpBuffer.append( nNextCh );
			// Inside a comment, a run of two or more '-' directly followed
			// by '>' ends the comment.
1026cdf0e10cSrcweir 			if( bReadComment )
1027cdf0e10cSrcweir 			{
1028cdf0e10cSrcweir 				sal_Bool bTwoMinus = sal_False;
1029cdf0e10cSrcweir 				nNextCh = GetNextChar();
1030cdf0e10cSrcweir 				while( '-' == nNextCh && IsParserWorking() )
1031cdf0e10cSrcweir 				{
1032cdf0e10cSrcweir 					bTwoMinus = sal_True;
1033cdf0e10cSrcweir 
1034cdf0e10cSrcweir 					if( MAX_LEN == sTmpBuffer.getLength() )
1035cdf0e10cSrcweir 						aToken += String(sTmpBuffer.makeStringAndClear());
1036cdf0e10cSrcweir 					sTmpBuffer.append( nNextCh );
1037cdf0e10cSrcweir 					nNextCh = GetNextChar();
1038cdf0e10cSrcweir 				}
1039cdf0e10cSrcweir 
1040cdf0e10cSrcweir 				if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
1041cdf0e10cSrcweir 					bReadComment = sal_False;
1042cdf0e10cSrcweir 
1043cdf0e10cSrcweir 				bNextCh = sal_False;
1044cdf0e10cSrcweir 			}
1045cdf0e10cSrcweir 			break;
1046cdf0e10cSrcweir 
1047cdf0e10cSrcweir 		case '\r':
1048cdf0e10cSrcweir 			// \r\n? ends the current text token (even if it is empty)
1049cdf0e10cSrcweir 			nNextCh = GetNextChar();
1050cdf0e10cSrcweir 			if( nNextCh=='\n' )
1051cdf0e10cSrcweir 				nNextCh = GetNextChar();
1052cdf0e10cSrcweir 			bWeiter = sal_False;
1053cdf0e10cSrcweir 			break;
1054cdf0e10cSrcweir 		case '\n':
1055cdf0e10cSrcweir 			// \n ends the current text token (even if it is empty)
1056cdf0e10cSrcweir 			nNextCh = GetNextChar();
1057cdf0e10cSrcweir 			bWeiter = sal_False;
1058cdf0e10cSrcweir 			break;
1059cdf0e10cSrcweir 		case sal_Unicode(EOF):
1060cdf0e10cSrcweir 			// EOF ends the current text token and acts as if
1061cdf0e10cSrcweir 			// an end token had been read
1062cdf0e10cSrcweir 			if( rInput.IsEof() )
1063cdf0e10cSrcweir 			{
1064cdf0e10cSrcweir 				bWeiter = sal_False;
1065cdf0e10cSrcweir 				if( aToken.Len() || sTmpBuffer.getLength() )
1066cdf0e10cSrcweir 				{
1067cdf0e10cSrcweir 					bEndTokenFound = sal_True;
1068cdf0e10cSrcweir 				}
1069cdf0e10cSrcweir 				else
1070cdf0e10cSrcweir 				{
1071cdf0e10cSrcweir 					bReadScript = sal_False;
1072cdf0e10cSrcweir 					bReadStyle = sal_False;
1073cdf0e10cSrcweir 					aEndToken.Erase();
1074cdf0e10cSrcweir 					nToken = 0;
1075cdf0e10cSrcweir 				}
1076cdf0e10cSrcweir 				break;
1077cdf0e10cSrcweir 			}
1078cdf0e10cSrcweir 			// no break: a character equal to EOF that is not the real end of the stream is ordinary data
1079cdf0e10cSrcweir 		default:
1080cdf0e10cSrcweir 			// all other characters go into the buffer
1081cdf0e10cSrcweir 			sTmpBuffer.append( nNextCh );
1082cdf0e10cSrcweir 			break;
1083cdf0e10cSrcweir 		}
1084cdf0e10cSrcweir 
		// Flush the buffer into aToken when the loop is about to end or
		// the buffer has reached MAX_LEN.
1085cdf0e10cSrcweir 		if( (!bWeiter && sTmpBuffer.getLength() > 0L) ||
1086cdf0e10cSrcweir 			MAX_LEN == sTmpBuffer.getLength() )
1087cdf0e10cSrcweir 			aToken += String(sTmpBuffer.makeStringAndClear());
1088cdf0e10cSrcweir 
1089cdf0e10cSrcweir 		if( bWeiter && bNextCh )
1090cdf0e10cSrcweir 			nNextCh = GetNextChar();
1091cdf0e10cSrcweir 	}
1092cdf0e10cSrcweir 
	// If the parser stopped working inside the loop, no token is reported.
1093cdf0e10cSrcweir 	if( IsParserWorking() )
1094cdf0e10cSrcweir 		SaveState( 0 );
1095cdf0e10cSrcweir 	else
1096cdf0e10cSrcweir 		nToken = 0;
1097cdf0e10cSrcweir 
1098cdf0e10cSrcweir 	return nToken;
1099cdf0e10cSrcweir }
1100cdf0e10cSrcweir 
1101cdf0e10cSrcweir // Scan the next token.
_GetNextToken()1102cdf0e10cSrcweir int __EXPORT HTMLParser::_GetNextToken()
1103cdf0e10cSrcweir {
1104cdf0e10cSrcweir 	int nRet = 0;
1105cdf0e10cSrcweir 	sSaveToken.Erase();
1106cdf0e10cSrcweir 
1107cdf0e10cSrcweir 	// die Optionen loeschen
1108cdf0e10cSrcweir 	if( pOptions->Count() )
1109cdf0e10cSrcweir 		pOptions->DeleteAndDestroy( 0, pOptions->Count() );
1110cdf0e10cSrcweir 
1111cdf0e10cSrcweir 	if( !IsParserWorking() )		// wenn schon Fehler, dann nicht weiter!
1112cdf0e10cSrcweir 		return 0;
1113cdf0e10cSrcweir 
1114cdf0e10cSrcweir 	sal_Bool bReadNextCharSave = bReadNextChar;
1115cdf0e10cSrcweir 	if( bReadNextChar )
1116cdf0e10cSrcweir 	{
1117cdf0e10cSrcweir 		DBG_ASSERT( !bEndTokenFound,
1118cdf0e10cSrcweir 					"</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
1119cdf0e10cSrcweir 		nNextCh = GetNextChar();
1120cdf0e10cSrcweir 		if( !IsParserWorking() )		// wenn schon Fehler, dann nicht weiter!
1121cdf0e10cSrcweir 			return 0;
1122cdf0e10cSrcweir 		bReadNextChar = sal_False;
1123cdf0e10cSrcweir 	}
1124cdf0e10cSrcweir 
1125cdf0e10cSrcweir 	if( bReadScript || bReadStyle || aEndToken.Len() )
1126cdf0e10cSrcweir 	{
1127cdf0e10cSrcweir 		nRet = _GetNextRawToken();
1128cdf0e10cSrcweir 		if( nRet || !IsParserWorking() )
1129cdf0e10cSrcweir 			return nRet;
1130cdf0e10cSrcweir 	}
1131cdf0e10cSrcweir 
1132cdf0e10cSrcweir 	do {
1133cdf0e10cSrcweir 		int bNextCh = sal_True;
1134cdf0e10cSrcweir 		switch( nNextCh )
1135cdf0e10cSrcweir 		{
1136cdf0e10cSrcweir 		case '<':
1137cdf0e10cSrcweir 			{
1138cdf0e10cSrcweir 				sal_uLong nStreamPos = rInput.Tell();
1139cdf0e10cSrcweir 				sal_uLong nLineNr = GetLineNr();
1140cdf0e10cSrcweir 				sal_uLong nLinePos = GetLinePos();
1141cdf0e10cSrcweir 
1142cdf0e10cSrcweir 				int bOffState = sal_False;
1143cdf0e10cSrcweir 				if( '/' == (nNextCh = GetNextChar()) )
1144cdf0e10cSrcweir 				{
1145cdf0e10cSrcweir 					bOffState = sal_True;
1146cdf0e10cSrcweir 					nNextCh = GetNextChar();
1147cdf0e10cSrcweir 				}
1148cdf0e10cSrcweir 				if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
1149cdf0e10cSrcweir 				{
1150cdf0e10cSrcweir 					::rtl::OUStringBuffer sTmpBuffer;
1151cdf0e10cSrcweir 					do {
1152cdf0e10cSrcweir 						sTmpBuffer.append( nNextCh );
1153cdf0e10cSrcweir 						if( MAX_LEN == sTmpBuffer.getLength() )
1154cdf0e10cSrcweir 							aToken += String(sTmpBuffer.makeStringAndClear());
1155cdf0e10cSrcweir 						nNextCh = GetNextChar();
1156cdf0e10cSrcweir 					} while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
1157cdf0e10cSrcweir 							 IsParserWorking() && !rInput.IsEof() );
1158cdf0e10cSrcweir 
1159cdf0e10cSrcweir 					if( sTmpBuffer.getLength() )
1160cdf0e10cSrcweir 						aToken += String(sTmpBuffer.makeStringAndClear());
1161cdf0e10cSrcweir 
1162cdf0e10cSrcweir 					// Blanks ueberlesen
1163cdf0e10cSrcweir 					while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
1164cdf0e10cSrcweir 						nNextCh = GetNextChar();
1165cdf0e10cSrcweir 
1166cdf0e10cSrcweir 					if( !IsParserWorking() )
1167cdf0e10cSrcweir 					{
1168cdf0e10cSrcweir 						if( SVPAR_PENDING == eState )
1169cdf0e10cSrcweir 							bReadNextChar = bReadNextCharSave;
1170cdf0e10cSrcweir 						break;
1171cdf0e10cSrcweir 					}
1172cdf0e10cSrcweir 
1173cdf0e10cSrcweir 					// suche das Token in der Tabelle:
1174cdf0e10cSrcweir 					sSaveToken = aToken;
1175cdf0e10cSrcweir 					aToken.ToUpperAscii();
1176cdf0e10cSrcweir 					if( 0 == (nRet = GetHTMLToken( aToken )) )
1177cdf0e10cSrcweir 						// Unknown Control
1178cdf0e10cSrcweir 						nRet = HTML_UNKNOWNCONTROL_ON;
1179cdf0e10cSrcweir 
1180cdf0e10cSrcweir 					// Wenn es ein Token zum ausschalten ist ...
1181cdf0e10cSrcweir 					if( bOffState )
1182cdf0e10cSrcweir 					{
1183cdf0e10cSrcweir 						 if( HTML_TOKEN_ONOFF & nRet )
1184cdf0e10cSrcweir 						 {
1185cdf0e10cSrcweir 							// und es ein Off-Token gibt, das daraus machen
1186cdf0e10cSrcweir 							++nRet;
1187cdf0e10cSrcweir 						 }
1188cdf0e10cSrcweir 						 else if( HTML_LINEBREAK!=nRet )
1189cdf0e10cSrcweir 						 {
1190cdf0e10cSrcweir 							// und es kein Off-Token gibt, ein unbekanntes
1191cdf0e10cSrcweir 							// Token daraus machen (ausser </BR>, das wird
1192cdf0e10cSrcweir 							// wie <BR> behandelt
1193cdf0e10cSrcweir 							nRet = HTML_UNKNOWNCONTROL_OFF;
1194cdf0e10cSrcweir 						 }
1195cdf0e10cSrcweir 					}
1196cdf0e10cSrcweir 
1197cdf0e10cSrcweir 					if( nRet == HTML_COMMENT )
1198cdf0e10cSrcweir 					{
1199cdf0e10cSrcweir 						// fix: sSaveToken wegen Gross-/Kleinschreibung
1200cdf0e10cSrcweir 						// als Anfang des Kommentars benutzen und ein
1201cdf0e10cSrcweir 						// Space anhaengen.
1202cdf0e10cSrcweir 						aToken = sSaveToken;
1203cdf0e10cSrcweir 						if( '>'!=nNextCh )
1204cdf0e10cSrcweir 							aToken += (sal_Unicode)' ';
1205cdf0e10cSrcweir 						sal_uLong nCStreamPos = 0;
1206cdf0e10cSrcweir 						sal_uLong nCLineNr = 0;
1207cdf0e10cSrcweir 						sal_uLong nCLinePos = 0;
1208cdf0e10cSrcweir 						xub_StrLen nCStrLen = 0;
1209cdf0e10cSrcweir 
1210cdf0e10cSrcweir 						sal_Bool bDone = sal_False;
1211cdf0e10cSrcweir 						// bis zum schliessenden --> lesen. wenn keins gefunden
1212cdf0e10cSrcweir 						// wurde beim der ersten > wieder aufsetzen
1213cdf0e10cSrcweir 						while( !bDone && !rInput.IsEof() && IsParserWorking() )
1214cdf0e10cSrcweir 						{
1215cdf0e10cSrcweir 							if( '>'==nNextCh )
1216cdf0e10cSrcweir 							{
1217cdf0e10cSrcweir 								if( !nCStreamPos )
1218cdf0e10cSrcweir 								{
1219cdf0e10cSrcweir 									nCStreamPos = rInput.Tell();
1220cdf0e10cSrcweir 									nCStrLen = aToken.Len();
1221cdf0e10cSrcweir 									nCLineNr = GetLineNr();
1222cdf0e10cSrcweir 									nCLinePos = GetLinePos();
1223cdf0e10cSrcweir 								}
1224cdf0e10cSrcweir 								bDone = aToken.Len() >= 2 &&
1225cdf0e10cSrcweir 										aToken.Copy(aToken.Len()-2,2).
1226cdf0e10cSrcweir 														EqualsAscii( "--" );
1227cdf0e10cSrcweir 								if( !bDone )
1228cdf0e10cSrcweir 								aToken += nNextCh;
1229cdf0e10cSrcweir 							}
1230cdf0e10cSrcweir 							else
1231cdf0e10cSrcweir 								aToken += nNextCh;
1232cdf0e10cSrcweir 							if( !bDone )
1233cdf0e10cSrcweir 								nNextCh = GetNextChar();
1234cdf0e10cSrcweir 						}
1235cdf0e10cSrcweir 						if( !bDone && IsParserWorking() && nCStreamPos )
1236cdf0e10cSrcweir 						{
1237cdf0e10cSrcweir 							rInput.Seek( nCStreamPos );
1238cdf0e10cSrcweir 							SetLineNr( nCLineNr );
1239cdf0e10cSrcweir 							SetLinePos( nCLinePos );
1240cdf0e10cSrcweir 							ClearTxtConvContext();
1241cdf0e10cSrcweir 							aToken.Erase( nCStrLen );
1242cdf0e10cSrcweir 							nNextCh = '>';
1243cdf0e10cSrcweir 						}
1244cdf0e10cSrcweir 					}
1245cdf0e10cSrcweir 					else
1246cdf0e10cSrcweir 					{
1247cdf0e10cSrcweir 						// den TokenString koennen wir jetzt verwerfen
1248cdf0e10cSrcweir 						aToken.Erase();
1249cdf0e10cSrcweir 					}
1250cdf0e10cSrcweir 
1251cdf0e10cSrcweir 					// dann lesen wir mal alles bis zur schliessenden '>'
1252cdf0e10cSrcweir 					if( '>' != nNextCh && IsParserWorking() )
1253cdf0e10cSrcweir 					{
1254cdf0e10cSrcweir 						ScanText( '>' );
1255cdf0e10cSrcweir 						if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1256cdf0e10cSrcweir 						{
1257cdf0e10cSrcweir 							// zurueck hinter die < gehen  und dort neu
1258cdf0e10cSrcweir 							// aufsetzen, das < als Text zurueckgeben
1259cdf0e10cSrcweir 							rInput.Seek( nStreamPos );
1260cdf0e10cSrcweir 							SetLineNr( nLineNr );
1261cdf0e10cSrcweir 							SetLinePos( nLinePos );
1262cdf0e10cSrcweir 							ClearTxtConvContext();
1263cdf0e10cSrcweir 
1264cdf0e10cSrcweir 							aToken = '<';
1265cdf0e10cSrcweir 							nRet = HTML_TEXTTOKEN;
1266cdf0e10cSrcweir 							nNextCh = GetNextChar();
1267cdf0e10cSrcweir 							bNextCh = sal_False;
1268cdf0e10cSrcweir 							break;
1269cdf0e10cSrcweir 						}
1270cdf0e10cSrcweir 					}
1271cdf0e10cSrcweir 					if( SVPAR_PENDING == eState )
1272cdf0e10cSrcweir 						bReadNextChar = bReadNextCharSave;
1273cdf0e10cSrcweir 				}
1274cdf0e10cSrcweir 				else
1275cdf0e10cSrcweir 				{
1276cdf0e10cSrcweir 					if( bOffState )
1277cdf0e10cSrcweir 					{
1278cdf0e10cSrcweir 						// einfach alles wegschmeissen
1279cdf0e10cSrcweir 						ScanText( '>' );
1280cdf0e10cSrcweir 						if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1281cdf0e10cSrcweir 						{
1282cdf0e10cSrcweir 							// zurueck hinter die < gehen  und dort neu
1283cdf0e10cSrcweir 							// aufsetzen, das < als Text zurueckgeben
1284cdf0e10cSrcweir 							rInput.Seek( nStreamPos );
1285cdf0e10cSrcweir 							SetLineNr( nLineNr );
1286cdf0e10cSrcweir 							SetLinePos( nLinePos );
1287cdf0e10cSrcweir 							ClearTxtConvContext();
1288cdf0e10cSrcweir 
1289cdf0e10cSrcweir 							aToken = '<';
1290cdf0e10cSrcweir 							nRet = HTML_TEXTTOKEN;
1291cdf0e10cSrcweir 							nNextCh = GetNextChar();
1292cdf0e10cSrcweir 							bNextCh = sal_False;
1293cdf0e10cSrcweir 							break;
1294cdf0e10cSrcweir 						}
1295cdf0e10cSrcweir 						if( SVPAR_PENDING == eState )
1296cdf0e10cSrcweir 							bReadNextChar = bReadNextCharSave;
1297cdf0e10cSrcweir 						aToken.Erase();
1298cdf0e10cSrcweir 					}
1299cdf0e10cSrcweir 					else if( '%' == nNextCh )
1300cdf0e10cSrcweir 					{
1301cdf0e10cSrcweir 						nRet = HTML_UNKNOWNCONTROL_ON;
1302cdf0e10cSrcweir 
1303cdf0e10cSrcweir 						sal_uLong nCStreamPos = rInput.Tell();
1304cdf0e10cSrcweir 						sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1305cdf0e10cSrcweir 
1306cdf0e10cSrcweir 						sal_Bool bDone = sal_False;
1307cdf0e10cSrcweir 						// bis zum schliessenden %> lesen. wenn keins gefunden
1308cdf0e10cSrcweir 						// wurde beim der ersten > wieder aufsetzen
1309cdf0e10cSrcweir 						while( !bDone && !rInput.IsEof() && IsParserWorking() )
1310cdf0e10cSrcweir 						{
1311cdf0e10cSrcweir 							bDone = '>'==nNextCh && aToken.Len() >= 1 &&
1312cdf0e10cSrcweir 									'%' == aToken.GetChar( aToken.Len()-1 );
1313cdf0e10cSrcweir 							if( !bDone )
1314cdf0e10cSrcweir 							{
1315cdf0e10cSrcweir 								aToken += nNextCh;
1316cdf0e10cSrcweir 								nNextCh = GetNextChar();
1317cdf0e10cSrcweir 							}
1318cdf0e10cSrcweir 						}
1319cdf0e10cSrcweir 						if( !bDone && IsParserWorking() )
1320cdf0e10cSrcweir 						{
1321cdf0e10cSrcweir 							rInput.Seek( nCStreamPos );
1322cdf0e10cSrcweir 							SetLineNr( nCLineNr );
1323cdf0e10cSrcweir 							SetLinePos( nCLinePos );
1324cdf0e10cSrcweir 							ClearTxtConvContext();
1325cdf0e10cSrcweir 							aToken.AssignAscii( "<%", 2 );
1326cdf0e10cSrcweir 							nRet = HTML_TEXTTOKEN;
1327cdf0e10cSrcweir 							break;
1328cdf0e10cSrcweir 						}
1329cdf0e10cSrcweir 						if( IsParserWorking() )
1330cdf0e10cSrcweir 						{
1331cdf0e10cSrcweir 							sSaveToken = aToken;
1332cdf0e10cSrcweir 							aToken.Erase();
1333cdf0e10cSrcweir 						}
1334cdf0e10cSrcweir 					}
1335cdf0e10cSrcweir 					else
1336cdf0e10cSrcweir 					{
1337cdf0e10cSrcweir 						aToken = '<';
1338cdf0e10cSrcweir 						nRet = HTML_TEXTTOKEN;
1339cdf0e10cSrcweir 						bNextCh = sal_False;
1340cdf0e10cSrcweir 						break;
1341cdf0e10cSrcweir 					}
1342cdf0e10cSrcweir 				}
1343cdf0e10cSrcweir 
1344cdf0e10cSrcweir 				if( IsParserWorking() )
1345cdf0e10cSrcweir 				{
1346cdf0e10cSrcweir 					bNextCh = '>' == nNextCh;
1347cdf0e10cSrcweir 					switch( nRet )
1348cdf0e10cSrcweir 					{
1349cdf0e10cSrcweir 					case HTML_TEXTAREA_ON:
1350cdf0e10cSrcweir 						bReadTextArea = sal_True;
1351cdf0e10cSrcweir 						break;
1352cdf0e10cSrcweir 					case HTML_TEXTAREA_OFF:
1353cdf0e10cSrcweir 						bReadTextArea = sal_False;
1354cdf0e10cSrcweir 						break;
1355cdf0e10cSrcweir 					case HTML_SCRIPT_ON:
1356cdf0e10cSrcweir 						if( !bReadTextArea )
1357cdf0e10cSrcweir 							bReadScript = sal_True;
1358cdf0e10cSrcweir 						break;
1359cdf0e10cSrcweir 					case HTML_SCRIPT_OFF:
1360cdf0e10cSrcweir 						if( !bReadTextArea )
1361cdf0e10cSrcweir 						{
1362cdf0e10cSrcweir 							bReadScript = sal_False;
1363cdf0e10cSrcweir 							// JavaScript kann den Stream veraendern
1364cdf0e10cSrcweir 							// also muss das letzte Zeichen nochmals
1365cdf0e10cSrcweir 							// gelesen werden
1366cdf0e10cSrcweir 							bReadNextChar = sal_True;
1367cdf0e10cSrcweir 							bNextCh = sal_False;
1368cdf0e10cSrcweir 						}
1369cdf0e10cSrcweir 						break;
1370cdf0e10cSrcweir 
1371cdf0e10cSrcweir 					case HTML_STYLE_ON:
1372cdf0e10cSrcweir 						bReadStyle = sal_True;
1373cdf0e10cSrcweir 						break;
1374cdf0e10cSrcweir 					case HTML_STYLE_OFF:
1375cdf0e10cSrcweir 						bReadStyle = sal_False;
1376cdf0e10cSrcweir 						break;
1377cdf0e10cSrcweir 					}
1378cdf0e10cSrcweir 
1379cdf0e10cSrcweir 				}
1380cdf0e10cSrcweir 			}
1381cdf0e10cSrcweir 			break;
1382cdf0e10cSrcweir 
1383cdf0e10cSrcweir 		case sal_Unicode(EOF):
1384cdf0e10cSrcweir 			if( rInput.IsEof() )
1385cdf0e10cSrcweir 			{
1386cdf0e10cSrcweir 				eState = SVPAR_ACCEPTED;
1387cdf0e10cSrcweir 				nRet = nNextCh;
1388cdf0e10cSrcweir 			}
1389cdf0e10cSrcweir 			else
1390cdf0e10cSrcweir 			{
1391cdf0e10cSrcweir 				// normalen Text lesen
1392cdf0e10cSrcweir 				goto scan_text;
1393cdf0e10cSrcweir 			}
1394cdf0e10cSrcweir 			break;
1395cdf0e10cSrcweir 
1396cdf0e10cSrcweir 		case '\f':
1397cdf0e10cSrcweir 			// Form-Feeds werden jetzt extra nach oben gereicht
1398cdf0e10cSrcweir 			nRet = HTML_LINEFEEDCHAR; // !!! eigentlich FORMFEEDCHAR
1399cdf0e10cSrcweir 			break;
1400cdf0e10cSrcweir 
1401cdf0e10cSrcweir 		case '\n':
1402cdf0e10cSrcweir 		case '\r':
1403cdf0e10cSrcweir 			if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1404cdf0e10cSrcweir 			{
1405cdf0e10cSrcweir 				sal_Unicode c = GetNextChar();
1406cdf0e10cSrcweir 				if( ( '\n' != nNextCh || '\r' != c ) &&
1407cdf0e10cSrcweir 					( '\r' != nNextCh || '\n' != c ) )
1408cdf0e10cSrcweir 				{
1409cdf0e10cSrcweir 					bNextCh = sal_False;
1410cdf0e10cSrcweir 					nNextCh = c;
1411cdf0e10cSrcweir 				}
1412cdf0e10cSrcweir 				nRet = HTML_NEWPARA;
1413cdf0e10cSrcweir 				break;
1414cdf0e10cSrcweir 			}
1415cdf0e10cSrcweir 			// kein break !
1416cdf0e10cSrcweir 		case '\t':
1417cdf0e10cSrcweir 			if( bReadPRE )
1418cdf0e10cSrcweir 			{
1419cdf0e10cSrcweir 				nRet = HTML_TABCHAR;
1420cdf0e10cSrcweir 				break;
1421cdf0e10cSrcweir 			}
1422cdf0e10cSrcweir 			// kein break !
1423cdf0e10cSrcweir 		case ' ':
1424cdf0e10cSrcweir 			// kein break !
1425cdf0e10cSrcweir 		default:
1426cdf0e10cSrcweir 
1427cdf0e10cSrcweir scan_text:
1428cdf0e10cSrcweir 			// es folgt "normaler" Text
1429cdf0e10cSrcweir 			nRet = ScanText();
1430cdf0e10cSrcweir 			bNextCh = 0 == aToken.Len();
1431cdf0e10cSrcweir 
1432cdf0e10cSrcweir 			// der Text sollte noch verarbeitet werden
1433cdf0e10cSrcweir 			if( !bNextCh && eState == SVPAR_PENDING )
1434cdf0e10cSrcweir 			{
1435cdf0e10cSrcweir 				eState = SVPAR_WORKING;
1436cdf0e10cSrcweir 				bReadNextChar = sal_True;
1437cdf0e10cSrcweir 			}
1438cdf0e10cSrcweir 
1439cdf0e10cSrcweir 			break;
1440cdf0e10cSrcweir 		}
1441cdf0e10cSrcweir 
1442cdf0e10cSrcweir 		if( bNextCh && SVPAR_WORKING == eState )
1443cdf0e10cSrcweir 		{
1444cdf0e10cSrcweir 			nNextCh = GetNextChar();
1445cdf0e10cSrcweir 			if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
1446cdf0e10cSrcweir 			{
1447cdf0e10cSrcweir 				bReadNextChar = sal_True;
1448cdf0e10cSrcweir 				eState = SVPAR_WORKING;
1449cdf0e10cSrcweir 			}
1450cdf0e10cSrcweir 		}
1451cdf0e10cSrcweir 
1452cdf0e10cSrcweir 	} while( !nRet && SVPAR_WORKING == eState );
1453cdf0e10cSrcweir 
1454cdf0e10cSrcweir 	if( SVPAR_PENDING == eState )
1455cdf0e10cSrcweir 		nRet = -1;		// irgendwas ungueltiges
1456cdf0e10cSrcweir 
1457cdf0e10cSrcweir 	return nRet;
1458cdf0e10cSrcweir }
1459cdf0e10cSrcweir 
UnescapeToken()1460cdf0e10cSrcweir void HTMLParser::UnescapeToken()
1461cdf0e10cSrcweir {
1462cdf0e10cSrcweir 	xub_StrLen nPos=0;
1463cdf0e10cSrcweir 
1464cdf0e10cSrcweir 	sal_Bool bEscape = sal_False;
1465cdf0e10cSrcweir 	while( nPos < aToken.Len() )
1466cdf0e10cSrcweir 	{
1467cdf0e10cSrcweir 		sal_Bool bOldEscape = bEscape;
1468cdf0e10cSrcweir 		bEscape = sal_False;
1469cdf0e10cSrcweir 		if( '\\'==aToken.GetChar(nPos) && !bOldEscape )
1470cdf0e10cSrcweir 		{
1471cdf0e10cSrcweir 			aToken.Erase( nPos, 1 );
1472cdf0e10cSrcweir 			bEscape = sal_True;
1473cdf0e10cSrcweir 		}
1474cdf0e10cSrcweir 		else
1475cdf0e10cSrcweir 		{
1476cdf0e10cSrcweir 			nPos++;
1477cdf0e10cSrcweir 		}
1478cdf0e10cSrcweir 	}
1479cdf0e10cSrcweir }
1480cdf0e10cSrcweir 
1481cdf0e10cSrcweir // hole die Optionen
GetOptions(sal_uInt16 * pNoConvertToken) const1482cdf0e10cSrcweir const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
1483cdf0e10cSrcweir {
1484cdf0e10cSrcweir 	// wenn die Option fuer das aktuelle Token schon einmal
1485cdf0e10cSrcweir 	// geholt wurden, geben wir sie noch einmal zurueck
1486cdf0e10cSrcweir 	if( pOptions->Count() )
1487cdf0e10cSrcweir 		return pOptions;
1488cdf0e10cSrcweir 
1489cdf0e10cSrcweir 	xub_StrLen nPos = 0;
1490cdf0e10cSrcweir 	while( nPos < aToken.Len() )
1491cdf0e10cSrcweir 	{
1492cdf0e10cSrcweir 		// ein Zeichen ? Dann faengt hier eine Option an
1493cdf0e10cSrcweir 		if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
1494cdf0e10cSrcweir 		{
1495cdf0e10cSrcweir 			int nToken;
1496cdf0e10cSrcweir 			String aValue;
1497cdf0e10cSrcweir 			xub_StrLen nStt = nPos;
1498cdf0e10cSrcweir 			sal_Unicode cChar = 0;
1499cdf0e10cSrcweir 
1500cdf0e10cSrcweir 			// Eigentlich sind hier nur ganz bestimmte Zeichen erlaubt.
1501cdf0e10cSrcweir 			// Netscape achtet aber nur auf "=" und Leerzeichen (siehe
1502cdf0e10cSrcweir 			// Mozilla: PA_FetchRequestedNameValues in
1503cdf0e10cSrcweir 			// lipparse/pa_mdl.c
1504cdf0e10cSrcweir //			while( nPos < aToken.Len() &&
1505cdf0e10cSrcweir //					( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) )
1506cdf0e10cSrcweir 			while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
1507cdf0e10cSrcweir 				   HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
1508cdf0e10cSrcweir 				nPos++;
1509cdf0e10cSrcweir 
1510cdf0e10cSrcweir 			String sName( aToken.Copy( nStt, nPos-nStt ) );
1511cdf0e10cSrcweir 
1512cdf0e10cSrcweir //JP 23.03.97: die PlugIns wollen die TokenName im "Original" haben
1513cdf0e10cSrcweir //				also nur fuers Suchen in UpperCase wandeln
1514cdf0e10cSrcweir 			String sNameUpperCase( sName );
1515cdf0e10cSrcweir 			sNameUpperCase.ToUpperAscii();
1516cdf0e10cSrcweir 
1517cdf0e10cSrcweir 			nToken = GetHTMLOption( sNameUpperCase ); // der Name ist fertig
1518cdf0e10cSrcweir 			DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
1519cdf0e10cSrcweir 						"GetOption: unbekannte HTML-Option" );
1520cdf0e10cSrcweir 			sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
1521cdf0e10cSrcweir 							   nToken >= HTML_OPTION_SCRIPT_END) &&
1522cdf0e10cSrcweir 							  (!pNoConvertToken || nToken != *pNoConvertToken);
1523cdf0e10cSrcweir 
1524cdf0e10cSrcweir 			while( nPos < aToken.Len() &&
1525cdf0e10cSrcweir 				   ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1526cdf0e10cSrcweir 					 HTML_ISSPACE(cChar) ) )
1527cdf0e10cSrcweir 				nPos++;
1528cdf0e10cSrcweir 
1529cdf0e10cSrcweir 			// hat die Option auch einen Wert?
1530cdf0e10cSrcweir 			if( nPos!=aToken.Len() && '='==cChar )
1531cdf0e10cSrcweir 			{
1532cdf0e10cSrcweir 				nPos++;
1533cdf0e10cSrcweir 
1534cdf0e10cSrcweir 				while( nPos < aToken.Len() &&
1535cdf0e10cSrcweir 						( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1536cdf0e10cSrcweir 						  ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
1537cdf0e10cSrcweir 					nPos++;
1538cdf0e10cSrcweir 
1539cdf0e10cSrcweir 				if( nPos != aToken.Len() )
1540cdf0e10cSrcweir 				{
1541cdf0e10cSrcweir 					xub_StrLen nLen = 0;
1542cdf0e10cSrcweir 					nStt = nPos;
1543cdf0e10cSrcweir 					if( ('"'==cChar) || ('\'')==cChar )
1544cdf0e10cSrcweir 					{
1545cdf0e10cSrcweir 						sal_Unicode cEnd = cChar;
1546cdf0e10cSrcweir 						nPos++; nStt++;
1547cdf0e10cSrcweir 						sal_Bool bDone = sal_False;
1548cdf0e10cSrcweir 						sal_Bool bEscape = sal_False;
1549cdf0e10cSrcweir 						while( nPos < aToken.Len() && !bDone )
1550cdf0e10cSrcweir 						{
1551cdf0e10cSrcweir 							sal_Bool bOldEscape = bEscape;
1552cdf0e10cSrcweir 							bEscape = sal_False;
1553cdf0e10cSrcweir 							cChar = aToken.GetChar(nPos);
1554cdf0e10cSrcweir 							switch( cChar )
1555cdf0e10cSrcweir 							{
1556cdf0e10cSrcweir 							case '\r':
1557cdf0e10cSrcweir 							case '\n':
1558cdf0e10cSrcweir 								if( bStripCRLF )
1559cdf0e10cSrcweir 									((String &)aToken).Erase( nPos, 1 );
1560cdf0e10cSrcweir 								else
1561cdf0e10cSrcweir 									nPos++, nLen++;
1562cdf0e10cSrcweir 								break;
1563cdf0e10cSrcweir 							case '\\':
1564cdf0e10cSrcweir 								if( bOldEscape )
1565cdf0e10cSrcweir 								{
1566cdf0e10cSrcweir 									nPos++, nLen++;
1567cdf0e10cSrcweir 								}
1568cdf0e10cSrcweir 								else
1569cdf0e10cSrcweir 								{
1570cdf0e10cSrcweir 									((String &)aToken).Erase( nPos, 1 );
1571cdf0e10cSrcweir 									bEscape = sal_True;
1572cdf0e10cSrcweir 								}
1573cdf0e10cSrcweir 								break;
1574cdf0e10cSrcweir 							case '"':
1575cdf0e10cSrcweir 							case '\'':
1576cdf0e10cSrcweir 								bDone = !bOldEscape && cChar==cEnd;
1577cdf0e10cSrcweir 								if( !bDone )
1578cdf0e10cSrcweir 									nPos++, nLen++;
1579cdf0e10cSrcweir 								break;
1580cdf0e10cSrcweir 							default:
1581cdf0e10cSrcweir 								nPos++, nLen++;
1582cdf0e10cSrcweir 								break;
1583cdf0e10cSrcweir 							}
1584cdf0e10cSrcweir 						}
1585cdf0e10cSrcweir 						if( nPos!=aToken.Len() )
1586cdf0e10cSrcweir 							nPos++;
1587cdf0e10cSrcweir 					}
1588cdf0e10cSrcweir 					else
1589cdf0e10cSrcweir 					{
1590cdf0e10cSrcweir 						// hier sind wir etwas laxer als der
1591cdf0e10cSrcweir 						// Standard und erlauben alles druckbare
1592cdf0e10cSrcweir 						sal_Bool bEscape = sal_False;
1593cdf0e10cSrcweir 						sal_Bool bDone = sal_False;
1594cdf0e10cSrcweir 						while( nPos < aToken.Len() && !bDone )
1595cdf0e10cSrcweir 						{
1596cdf0e10cSrcweir 							sal_Bool bOldEscape = bEscape;
1597cdf0e10cSrcweir 							bEscape = sal_False;
1598cdf0e10cSrcweir 							sal_Unicode c = aToken.GetChar(nPos);
1599cdf0e10cSrcweir 							switch( c )
1600cdf0e10cSrcweir 							{
1601cdf0e10cSrcweir 							case ' ':
1602cdf0e10cSrcweir 								bDone = !bOldEscape;
1603cdf0e10cSrcweir 								if( !bDone )
1604cdf0e10cSrcweir 									nPos++, nLen++;
1605cdf0e10cSrcweir 								break;
1606cdf0e10cSrcweir 
1607cdf0e10cSrcweir 							case '\t':
1608cdf0e10cSrcweir 							case '\r':
1609cdf0e10cSrcweir 							case '\n':
1610cdf0e10cSrcweir 								bDone = sal_True;
1611cdf0e10cSrcweir 								break;
1612cdf0e10cSrcweir 
1613cdf0e10cSrcweir 							case '\\':
1614cdf0e10cSrcweir 								if( bOldEscape )
1615cdf0e10cSrcweir 								{
1616cdf0e10cSrcweir 									nPos++, nLen++;
1617cdf0e10cSrcweir 								}
1618cdf0e10cSrcweir 								else
1619cdf0e10cSrcweir 								{
1620cdf0e10cSrcweir 									((String &)aToken).Erase( nPos, 1 );
1621cdf0e10cSrcweir 									bEscape = sal_True;
1622cdf0e10cSrcweir 								}
1623cdf0e10cSrcweir 								break;
1624cdf0e10cSrcweir 
1625cdf0e10cSrcweir 							default:
1626cdf0e10cSrcweir 								if( HTML_ISPRINTABLE( c ) )
1627cdf0e10cSrcweir 									nPos++, nLen++;
1628cdf0e10cSrcweir 								else
1629cdf0e10cSrcweir 									bDone = sal_True;
1630cdf0e10cSrcweir 								break;
1631cdf0e10cSrcweir 							}
1632cdf0e10cSrcweir 						}
1633cdf0e10cSrcweir 					}
1634cdf0e10cSrcweir 
1635cdf0e10cSrcweir 					if( nLen )
1636cdf0e10cSrcweir 						aValue = aToken.Copy( nStt, nLen );
1637cdf0e10cSrcweir 				}
1638cdf0e10cSrcweir 			}
1639cdf0e10cSrcweir 
1640cdf0e10cSrcweir 			// Wir kennen das Token und koennen es Speichern
1641cdf0e10cSrcweir 			HTMLOption *pOption =
1642cdf0e10cSrcweir 				new HTMLOption(
1643cdf0e10cSrcweir                     sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue );
1644cdf0e10cSrcweir 
1645cdf0e10cSrcweir 			pOptions->Insert( pOption, pOptions->Count() );
1646cdf0e10cSrcweir 
1647cdf0e10cSrcweir 		}
1648cdf0e10cSrcweir 		else
1649cdf0e10cSrcweir 			// white space un unerwartete Zeichen ignorieren wie
1650cdf0e10cSrcweir 			nPos++;
1651cdf0e10cSrcweir 	}
1652cdf0e10cSrcweir 
1653cdf0e10cSrcweir 	return pOptions;
1654cdf0e10cSrcweir }
1655cdf0e10cSrcweir 
FilterPRE(int nToken)1656cdf0e10cSrcweir int HTMLParser::FilterPRE( int nToken )
1657cdf0e10cSrcweir {
1658cdf0e10cSrcweir 	switch( nToken )
1659cdf0e10cSrcweir 	{
1660cdf0e10cSrcweir #ifdef HTML_BEHAVIOUR
1661cdf0e10cSrcweir 	// diese werden laut Definition zu LFs
1662cdf0e10cSrcweir 	case HTML_PARABREAK_ON:
1663cdf0e10cSrcweir 	case HTML_LINEBREAK:
1664cdf0e10cSrcweir 		nToken = HTML_NEWPARA;
1665cdf0e10cSrcweir #else
1666cdf0e10cSrcweir 	// in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung
1667cdf0e10cSrcweir 	case HTML_PARABREAK_ON:
1668cdf0e10cSrcweir 		nToken = HTML_LINEBREAK;
1669cdf0e10cSrcweir 	case HTML_LINEBREAK:
1670cdf0e10cSrcweir #endif
1671cdf0e10cSrcweir 	case HTML_NEWPARA:
1672cdf0e10cSrcweir 		nPre_LinePos = 0;
1673cdf0e10cSrcweir 		if( bPre_IgnoreNewPara )
1674cdf0e10cSrcweir 			nToken = 0;
1675cdf0e10cSrcweir 		break;
1676cdf0e10cSrcweir 
1677cdf0e10cSrcweir 	case HTML_TABCHAR:
1678cdf0e10cSrcweir 		{
1679cdf0e10cSrcweir 			xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >(
1680cdf0e10cSrcweir                 8 - (nPre_LinePos % 8));
1681cdf0e10cSrcweir 			DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?" );
1682cdf0e10cSrcweir 			aToken.Expand( nSpaces, ' ' );
1683cdf0e10cSrcweir 			nPre_LinePos += nSpaces;
1684cdf0e10cSrcweir 			nToken = HTML_TEXTTOKEN;
1685cdf0e10cSrcweir 		}
1686cdf0e10cSrcweir 		break;
1687cdf0e10cSrcweir 	// diese bleiben erhalten
1688cdf0e10cSrcweir 	case HTML_TEXTTOKEN:
1689cdf0e10cSrcweir 		nPre_LinePos += aToken.Len();
1690cdf0e10cSrcweir 		break;
1691cdf0e10cSrcweir 
1692cdf0e10cSrcweir 	case HTML_SELECT_ON:
1693cdf0e10cSrcweir 	case HTML_SELECT_OFF:
1694cdf0e10cSrcweir 	case HTML_BODY_ON:
1695cdf0e10cSrcweir 	case HTML_FORM_ON:
1696cdf0e10cSrcweir 	case HTML_FORM_OFF:
1697cdf0e10cSrcweir 	case HTML_INPUT:
1698cdf0e10cSrcweir 	case HTML_OPTION:
1699cdf0e10cSrcweir 	case HTML_TEXTAREA_ON:
1700cdf0e10cSrcweir 	case HTML_TEXTAREA_OFF:
1701cdf0e10cSrcweir 
1702cdf0e10cSrcweir 	case HTML_IMAGE:
1703cdf0e10cSrcweir 	case HTML_APPLET_ON:
1704cdf0e10cSrcweir 	case HTML_APPLET_OFF:
1705cdf0e10cSrcweir 	case HTML_PARAM:
1706cdf0e10cSrcweir 	case HTML_EMBED:
1707cdf0e10cSrcweir 
1708cdf0e10cSrcweir 	case HTML_HEAD1_ON:
1709cdf0e10cSrcweir 	case HTML_HEAD1_OFF:
1710cdf0e10cSrcweir 	case HTML_HEAD2_ON:
1711cdf0e10cSrcweir 	case HTML_HEAD2_OFF:
1712cdf0e10cSrcweir 	case HTML_HEAD3_ON:
1713cdf0e10cSrcweir 	case HTML_HEAD3_OFF:
1714cdf0e10cSrcweir 	case HTML_HEAD4_ON:
1715cdf0e10cSrcweir 	case HTML_HEAD4_OFF:
1716cdf0e10cSrcweir 	case HTML_HEAD5_ON:
1717cdf0e10cSrcweir 	case HTML_HEAD5_OFF:
1718cdf0e10cSrcweir 	case HTML_HEAD6_ON:
1719cdf0e10cSrcweir 	case HTML_HEAD6_OFF:
1720cdf0e10cSrcweir 	case HTML_BLOCKQUOTE_ON:
1721cdf0e10cSrcweir 	case HTML_BLOCKQUOTE_OFF:
1722cdf0e10cSrcweir 	case HTML_ADDRESS_ON:
1723cdf0e10cSrcweir 	case HTML_ADDRESS_OFF:
1724cdf0e10cSrcweir 	case HTML_HORZRULE:
1725cdf0e10cSrcweir 
1726cdf0e10cSrcweir 	case HTML_CENTER_ON:
1727cdf0e10cSrcweir 	case HTML_CENTER_OFF:
1728cdf0e10cSrcweir 	case HTML_DIVISION_ON:
1729cdf0e10cSrcweir 	case HTML_DIVISION_OFF:
1730cdf0e10cSrcweir 
1731cdf0e10cSrcweir 	case HTML_SCRIPT_ON:
1732cdf0e10cSrcweir 	case HTML_SCRIPT_OFF:
1733cdf0e10cSrcweir 	case HTML_RAWDATA:
1734cdf0e10cSrcweir 
1735cdf0e10cSrcweir 	case HTML_TABLE_ON:
1736cdf0e10cSrcweir 	case HTML_TABLE_OFF:
1737cdf0e10cSrcweir 	case HTML_CAPTION_ON:
1738cdf0e10cSrcweir 	case HTML_CAPTION_OFF:
1739cdf0e10cSrcweir 	case HTML_COLGROUP_ON:
1740cdf0e10cSrcweir 	case HTML_COLGROUP_OFF:
1741cdf0e10cSrcweir 	case HTML_COL_ON:
1742cdf0e10cSrcweir 	case HTML_COL_OFF:
1743cdf0e10cSrcweir 	case HTML_THEAD_ON:
1744cdf0e10cSrcweir 	case HTML_THEAD_OFF:
1745cdf0e10cSrcweir 	case HTML_TFOOT_ON:
1746cdf0e10cSrcweir 	case HTML_TFOOT_OFF:
1747cdf0e10cSrcweir 	case HTML_TBODY_ON:
1748cdf0e10cSrcweir 	case HTML_TBODY_OFF:
1749cdf0e10cSrcweir 	case HTML_TABLEROW_ON:
1750cdf0e10cSrcweir 	case HTML_TABLEROW_OFF:
1751cdf0e10cSrcweir 	case HTML_TABLEDATA_ON:
1752cdf0e10cSrcweir 	case HTML_TABLEDATA_OFF:
1753cdf0e10cSrcweir 	case HTML_TABLEHEADER_ON:
1754cdf0e10cSrcweir 	case HTML_TABLEHEADER_OFF:
1755cdf0e10cSrcweir 
1756cdf0e10cSrcweir 	case HTML_ANCHOR_ON:
1757cdf0e10cSrcweir 	case HTML_ANCHOR_OFF:
1758cdf0e10cSrcweir 	case HTML_BOLD_ON:
1759cdf0e10cSrcweir 	case HTML_BOLD_OFF:
1760cdf0e10cSrcweir 	case HTML_ITALIC_ON:
1761cdf0e10cSrcweir 	case HTML_ITALIC_OFF:
1762cdf0e10cSrcweir 	case HTML_STRIKE_ON:
1763cdf0e10cSrcweir 	case HTML_STRIKE_OFF:
1764cdf0e10cSrcweir 	case HTML_STRIKETHROUGH_ON:
1765cdf0e10cSrcweir 	case HTML_STRIKETHROUGH_OFF:
1766cdf0e10cSrcweir 	case HTML_UNDERLINE_ON:
1767cdf0e10cSrcweir 	case HTML_UNDERLINE_OFF:
1768cdf0e10cSrcweir 	case HTML_BASEFONT_ON:
1769cdf0e10cSrcweir 	case HTML_BASEFONT_OFF:
1770cdf0e10cSrcweir 	case HTML_FONT_ON:
1771cdf0e10cSrcweir 	case HTML_FONT_OFF:
1772cdf0e10cSrcweir 	case HTML_BLINK_ON:
1773cdf0e10cSrcweir 	case HTML_BLINK_OFF:
1774cdf0e10cSrcweir 	case HTML_SPAN_ON:
1775cdf0e10cSrcweir 	case HTML_SPAN_OFF:
1776cdf0e10cSrcweir 	case HTML_SUBSCRIPT_ON:
1777cdf0e10cSrcweir 	case HTML_SUBSCRIPT_OFF:
1778cdf0e10cSrcweir 	case HTML_SUPERSCRIPT_ON:
1779cdf0e10cSrcweir 	case HTML_SUPERSCRIPT_OFF:
1780cdf0e10cSrcweir 	case HTML_BIGPRINT_ON:
1781cdf0e10cSrcweir 	case HTML_BIGPRINT_OFF:
1782cdf0e10cSrcweir 	case HTML_SMALLPRINT_OFF:
1783cdf0e10cSrcweir 	case HTML_SMALLPRINT_ON:
1784cdf0e10cSrcweir 
1785cdf0e10cSrcweir 	case HTML_EMPHASIS_ON:
1786cdf0e10cSrcweir 	case HTML_EMPHASIS_OFF:
1787cdf0e10cSrcweir 	case HTML_CITIATION_ON:
1788cdf0e10cSrcweir 	case HTML_CITIATION_OFF:
1789cdf0e10cSrcweir 	case HTML_STRONG_ON:
1790cdf0e10cSrcweir 	case HTML_STRONG_OFF:
1791cdf0e10cSrcweir 	case HTML_CODE_ON:
1792cdf0e10cSrcweir 	case HTML_CODE_OFF:
1793cdf0e10cSrcweir 	case HTML_SAMPLE_ON:
1794cdf0e10cSrcweir 	case HTML_SAMPLE_OFF:
1795cdf0e10cSrcweir 	case HTML_KEYBOARD_ON:
1796cdf0e10cSrcweir 	case HTML_KEYBOARD_OFF:
1797cdf0e10cSrcweir 	case HTML_VARIABLE_ON:
1798cdf0e10cSrcweir 	case HTML_VARIABLE_OFF:
1799cdf0e10cSrcweir 	case HTML_DEFINSTANCE_ON:
1800cdf0e10cSrcweir 	case HTML_DEFINSTANCE_OFF:
1801cdf0e10cSrcweir 	case HTML_SHORTQUOTE_ON:
1802cdf0e10cSrcweir 	case HTML_SHORTQUOTE_OFF:
1803cdf0e10cSrcweir 	case HTML_LANGUAGE_ON:
1804cdf0e10cSrcweir 	case HTML_LANGUAGE_OFF:
1805cdf0e10cSrcweir 	case HTML_AUTHOR_ON:
1806cdf0e10cSrcweir 	case HTML_AUTHOR_OFF:
1807cdf0e10cSrcweir 	case HTML_PERSON_ON:
1808cdf0e10cSrcweir 	case HTML_PERSON_OFF:
1809cdf0e10cSrcweir 	case HTML_ACRONYM_ON:
1810cdf0e10cSrcweir 	case HTML_ACRONYM_OFF:
1811cdf0e10cSrcweir 	case HTML_ABBREVIATION_ON:
1812cdf0e10cSrcweir 	case HTML_ABBREVIATION_OFF:
1813cdf0e10cSrcweir 	case HTML_INSERTEDTEXT_ON:
1814cdf0e10cSrcweir 	case HTML_INSERTEDTEXT_OFF:
1815cdf0e10cSrcweir 	case HTML_DELETEDTEXT_ON:
1816cdf0e10cSrcweir 	case HTML_DELETEDTEXT_OFF:
1817cdf0e10cSrcweir 	case HTML_TELETYPE_ON:
1818cdf0e10cSrcweir 	case HTML_TELETYPE_OFF:
1819cdf0e10cSrcweir 
1820cdf0e10cSrcweir 		break;
1821cdf0e10cSrcweir 
1822cdf0e10cSrcweir 	// der Rest wird als unbekanntes Token behandelt
1823cdf0e10cSrcweir 	default:
1824cdf0e10cSrcweir 		if( nToken )
1825cdf0e10cSrcweir 		{
1826cdf0e10cSrcweir 			nToken =
1827cdf0e10cSrcweir 				( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1828cdf0e10cSrcweir 					? HTML_UNKNOWNCONTROL_OFF
1829cdf0e10cSrcweir 					: HTML_UNKNOWNCONTROL_ON );
1830cdf0e10cSrcweir 		}
1831cdf0e10cSrcweir 		break;
1832cdf0e10cSrcweir 	}
1833cdf0e10cSrcweir 
1834cdf0e10cSrcweir 	bPre_IgnoreNewPara = sal_False;
1835cdf0e10cSrcweir 
1836cdf0e10cSrcweir 	return nToken;
1837cdf0e10cSrcweir }
1838cdf0e10cSrcweir 
FilterXMP(int nToken)1839cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken )
1840cdf0e10cSrcweir {
1841cdf0e10cSrcweir 	switch( nToken )
1842cdf0e10cSrcweir 	{
1843cdf0e10cSrcweir 	case HTML_NEWPARA:
1844cdf0e10cSrcweir 		if( bPre_IgnoreNewPara )
1845cdf0e10cSrcweir 			nToken = 0;
1846cdf0e10cSrcweir 	case HTML_TEXTTOKEN:
1847cdf0e10cSrcweir 	case HTML_NONBREAKSPACE:
1848cdf0e10cSrcweir 	case HTML_SOFTHYPH:
1849cdf0e10cSrcweir 		break;				// bleiben erhalten
1850cdf0e10cSrcweir 
1851cdf0e10cSrcweir 	default:
1852cdf0e10cSrcweir 		if( nToken )
1853cdf0e10cSrcweir 		{
1854cdf0e10cSrcweir 			if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
1855cdf0e10cSrcweir 			{
1856cdf0e10cSrcweir 				sSaveToken.Insert( '<', 0 );
1857cdf0e10cSrcweir 				sSaveToken.Insert( '/', 1 );
1858cdf0e10cSrcweir 			}
1859cdf0e10cSrcweir 			else
1860cdf0e10cSrcweir 				sSaveToken.Insert( '<', 0 );
1861cdf0e10cSrcweir 			if( aToken.Len() )
1862cdf0e10cSrcweir 			{
1863cdf0e10cSrcweir 				UnescapeToken();
1864cdf0e10cSrcweir 				sSaveToken += (sal_Unicode)' ';
1865cdf0e10cSrcweir 				aToken.Insert( sSaveToken, 0 );
1866cdf0e10cSrcweir 			}
1867cdf0e10cSrcweir 			else
1868cdf0e10cSrcweir 				aToken = sSaveToken;
1869cdf0e10cSrcweir 			aToken += (sal_Unicode)'>';
1870cdf0e10cSrcweir 			nToken = HTML_TEXTTOKEN;
1871cdf0e10cSrcweir 		}
1872cdf0e10cSrcweir 		break;
1873cdf0e10cSrcweir 	}
1874cdf0e10cSrcweir 
1875cdf0e10cSrcweir 	bPre_IgnoreNewPara = sal_False;
1876cdf0e10cSrcweir 
1877cdf0e10cSrcweir 	return nToken;
1878cdf0e10cSrcweir }
1879cdf0e10cSrcweir 
FilterListing(int nToken)1880cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken )
1881cdf0e10cSrcweir {
1882cdf0e10cSrcweir 	switch( nToken )
1883cdf0e10cSrcweir 	{
1884cdf0e10cSrcweir 	case HTML_NEWPARA:
1885cdf0e10cSrcweir 		if( bPre_IgnoreNewPara )
1886cdf0e10cSrcweir 			nToken = 0;
1887cdf0e10cSrcweir 	case HTML_TEXTTOKEN:
1888cdf0e10cSrcweir 	case HTML_NONBREAKSPACE:
1889cdf0e10cSrcweir 	case HTML_SOFTHYPH:
1890cdf0e10cSrcweir 		break;		// bleiben erhalten
1891cdf0e10cSrcweir 
1892cdf0e10cSrcweir 	default:
1893cdf0e10cSrcweir 		if( nToken )
1894cdf0e10cSrcweir 		{
1895cdf0e10cSrcweir 			nToken =
1896cdf0e10cSrcweir 				( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1897cdf0e10cSrcweir 					? HTML_UNKNOWNCONTROL_OFF
1898cdf0e10cSrcweir 					: HTML_UNKNOWNCONTROL_ON );
1899cdf0e10cSrcweir 		}
1900cdf0e10cSrcweir 		break;
1901cdf0e10cSrcweir 	}
1902cdf0e10cSrcweir 
1903cdf0e10cSrcweir 	bPre_IgnoreNewPara = sal_False;
1904cdf0e10cSrcweir 
1905cdf0e10cSrcweir 	return nToken;
1906cdf0e10cSrcweir }
1907cdf0e10cSrcweir 
IsHTMLFormat(const sal_Char * pHeader,sal_Bool bSwitchToUCS2,rtl_TextEncoding eEnc)1908cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader,
1909cdf0e10cSrcweir 								   sal_Bool bSwitchToUCS2,
1910cdf0e10cSrcweir 								   rtl_TextEncoding eEnc )
1911cdf0e10cSrcweir {
1912cdf0e10cSrcweir 	// Einer der folgenden regulaeren Ausdrucke muss sich auf den String
1913cdf0e10cSrcweir 	// anwenden lassen, damit das Dok ein HTML-Dokument ist.
1914cdf0e10cSrcweir 	//
1915cdf0e10cSrcweir 	// ^[^<]*<[^ \t]*[> \t]
1916cdf0e10cSrcweir 	//        -------
1917cdf0e10cSrcweir 	// ^<!
1918cdf0e10cSrcweir 	//
1919cdf0e10cSrcweir 	// wobei der unterstrichene Teilausdruck einem HTML-Token
1920cdf0e10cSrcweir 	// ensprechen muss
1921cdf0e10cSrcweir 
1922cdf0e10cSrcweir 	ByteString sCmp;
1923cdf0e10cSrcweir 	sal_Bool bUCS2B = sal_False;
1924cdf0e10cSrcweir 	if( bSwitchToUCS2 )
1925cdf0e10cSrcweir 	{
1926cdf0e10cSrcweir 		if( 0xfeU == (sal_uChar)pHeader[0] &&
1927cdf0e10cSrcweir 			0xffU == (sal_uChar)pHeader[1] )
1928cdf0e10cSrcweir 		{
1929cdf0e10cSrcweir 			eEnc = RTL_TEXTENCODING_UCS2;
1930cdf0e10cSrcweir 			bUCS2B = sal_True;
1931cdf0e10cSrcweir 		}
1932cdf0e10cSrcweir 		else if( 0xffU == (sal_uChar)pHeader[0] &&
1933cdf0e10cSrcweir 				 0xfeU == (sal_uChar)pHeader[1] )
1934cdf0e10cSrcweir 		{
1935cdf0e10cSrcweir 			eEnc = RTL_TEXTENCODING_UCS2;
1936cdf0e10cSrcweir 		}
1937cdf0e10cSrcweir 	}
1938cdf0e10cSrcweir 	if
1939cdf0e10cSrcweir        (
1940cdf0e10cSrcweir         RTL_TEXTENCODING_UCS2 == eEnc &&
1941cdf0e10cSrcweir         (
1942cdf0e10cSrcweir          (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) ||
1943cdf0e10cSrcweir          (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1])
1944cdf0e10cSrcweir         )
1945cdf0e10cSrcweir        )
1946cdf0e10cSrcweir 	{
1947cdf0e10cSrcweir 		if( 0xfe == (sal_uChar)pHeader[0] )
1948cdf0e10cSrcweir 			bUCS2B = sal_True;
1949cdf0e10cSrcweir 
1950cdf0e10cSrcweir 		xub_StrLen nLen;
1951cdf0e10cSrcweir 		for( nLen = 2;
1952cdf0e10cSrcweir 			 pHeader[nLen] != 0 || pHeader[nLen+1] != 0;
1953cdf0e10cSrcweir 			 nLen+=2 )
1954cdf0e10cSrcweir 			;
1955cdf0e10cSrcweir 
1956cdf0e10cSrcweir 		::rtl::OStringBuffer sTmp( (nLen - 2)/2 );
1957cdf0e10cSrcweir 		for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 )
1958cdf0e10cSrcweir 		{
1959cdf0e10cSrcweir 			sal_Unicode cUC;
1960cdf0e10cSrcweir 			if( bUCS2B )
1961cdf0e10cSrcweir 				cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1];
1962cdf0e10cSrcweir 			else
1963cdf0e10cSrcweir 				cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos];
1964cdf0e10cSrcweir 			if( 0U == cUC )
1965cdf0e10cSrcweir 				break;
1966cdf0e10cSrcweir 
1967cdf0e10cSrcweir 			sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' );
1968cdf0e10cSrcweir 		}
1969cdf0e10cSrcweir 		sCmp = ByteString( sTmp.makeStringAndClear() );
1970cdf0e10cSrcweir 	}
1971cdf0e10cSrcweir 	else
1972cdf0e10cSrcweir 	{
1973cdf0e10cSrcweir 		sCmp = (sal_Char *)pHeader;
1974cdf0e10cSrcweir 	}
1975cdf0e10cSrcweir 
1976cdf0e10cSrcweir 	sCmp.ToUpperAscii();
1977cdf0e10cSrcweir 
1978cdf0e10cSrcweir 	// Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen
1979cdf0e10cSrcweir 	xub_StrLen nStart = sCmp.Search( '<' );
1980cdf0e10cSrcweir 	if( STRING_NOTFOUND  == nStart )
1981cdf0e10cSrcweir 		return sal_False;
1982cdf0e10cSrcweir 	nStart++;
1983cdf0e10cSrcweir 
1984cdf0e10cSrcweir 	// danach duerfen beliebige andere Zeichen bis zu einem blank oder
1985cdf0e10cSrcweir 	// '>' kommen
1986cdf0e10cSrcweir 	sal_Char c;
1987cdf0e10cSrcweir 	xub_StrLen nPos;
1988cdf0e10cSrcweir 	for( nPos = nStart; nPos<sCmp.Len(); nPos++ )
1989cdf0e10cSrcweir 	{
1990cdf0e10cSrcweir 		if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) )
1991cdf0e10cSrcweir 			break;
1992cdf0e10cSrcweir 	}
1993cdf0e10cSrcweir 
1994cdf0e10cSrcweir 	// wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML
1995cdf0e10cSrcweir 	if( nPos==nStart )
1996cdf0e10cSrcweir 		return sal_False;
1997cdf0e10cSrcweir 
1998cdf0e10cSrcweir 	// die Zeichenkette nach dem '<' muss ausserdem ein bekanntes
1999cdf0e10cSrcweir 	// HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht
2000cdf0e10cSrcweir 	// als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML
2001cdf0e10cSrcweir 	// interpretiert.
2002cdf0e10cSrcweir 	String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US );
2003cdf0e10cSrcweir 	int nTok = GetHTMLToken( sTest );
2004cdf0e10cSrcweir 	if( 0 != nTok && HTML_DIRLIST_ON != nTok )
2005cdf0e10cSrcweir 		return sal_True;
2006cdf0e10cSrcweir 
2007cdf0e10cSrcweir 	// oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#)
2008cdf0e10cSrcweir 	if( nStart == 1 && '!' == sCmp.GetChar( 1 ) )
2009cdf0e10cSrcweir 		return sal_True;
2010cdf0e10cSrcweir 
2011cdf0e10cSrcweir 	// oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen
2012cdf0e10cSrcweir 	nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html );
2013cdf0e10cSrcweir 	if( nStart!=STRING_NOTFOUND &&
2014cdf0e10cSrcweir 		nStart>0 && '<'==sCmp.GetChar(nStart-1) &&
2015cdf0e10cSrcweir 		nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) )
2016cdf0e10cSrcweir 		return sal_True;
2017cdf0e10cSrcweir 
2018cdf0e10cSrcweir 	// sonst ist es wohl doch eher kein HTML-Dokument
2019cdf0e10cSrcweir 	return sal_False;
2020cdf0e10cSrcweir }
2021cdf0e10cSrcweir 
InternalImgToPrivateURL(String & rURL)2022cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL )
2023cdf0e10cSrcweir {
2024cdf0e10cSrcweir 	if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) ||
2025cdf0e10cSrcweir 		rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL )
2026cdf0e10cSrcweir 		return sal_False;
2027cdf0e10cSrcweir 
2028cdf0e10cSrcweir 	sal_Bool bFound = sal_False;
2029cdf0e10cSrcweir 
2030cdf0e10cSrcweir 	if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL )
2031cdf0e10cSrcweir 	{
2032cdf0e10cSrcweir 		String aName( rURL.Copy(16) );
2033cdf0e10cSrcweir 		switch( aName.GetChar(0) )
2034cdf0e10cSrcweir 		{
2035cdf0e10cSrcweir 		case 'b':
2036cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary );
2037cdf0e10cSrcweir 			break;
2038cdf0e10cSrcweir 		case 'i':
2039cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) ||
2040cdf0e10cSrcweir 					 aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index );
2041cdf0e10cSrcweir 			break;
2042cdf0e10cSrcweir 		case 'm':
2043cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) ||
2044cdf0e10cSrcweir 					 aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie );
2045cdf0e10cSrcweir 			break;
2046cdf0e10cSrcweir 		case 's':
2047cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound );
2048cdf0e10cSrcweir 			break;
2049cdf0e10cSrcweir 		case 't':
2050cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) ||
2051cdf0e10cSrcweir 					 aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text );
2052cdf0e10cSrcweir 			break;
2053cdf0e10cSrcweir 		case 'u':
2054cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown );
2055cdf0e10cSrcweir 			break;
2056cdf0e10cSrcweir 		}
2057cdf0e10cSrcweir 	}
2058cdf0e10cSrcweir 	else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL )
2059cdf0e10cSrcweir 	{
2060cdf0e10cSrcweir 		String aName( rURL.Copy(14) );
2061cdf0e10cSrcweir 		switch( aName.GetChar(0) )
2062cdf0e10cSrcweir 		{
2063cdf0e10cSrcweir 		case 'b':
2064cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata );
2065cdf0e10cSrcweir 			break;
2066cdf0e10cSrcweir 		case 'd':
2067cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed );
2068cdf0e10cSrcweir 			break;
2069cdf0e10cSrcweir 		case 'e':
2070cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed );
2071cdf0e10cSrcweir 			break;
2072cdf0e10cSrcweir 		case 'i':
2073cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure );
2074cdf0e10cSrcweir 			break;
2075cdf0e10cSrcweir 		case 'n':
2076cdf0e10cSrcweir 			bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound );
2077cdf0e10cSrcweir 			break;
2078cdf0e10cSrcweir 		}
2079cdf0e10cSrcweir 	}
2080cdf0e10cSrcweir 	if( bFound )
2081cdf0e10cSrcweir 	{
2082cdf0e10cSrcweir 		String sTmp ( rURL );
2083cdf0e10cSrcweir 		rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image );
2084cdf0e10cSrcweir 		rURL.Append( sTmp );
2085cdf0e10cSrcweir 	}
2086cdf0e10cSrcweir 
2087cdf0e10cSrcweir 	return bFound;
2088cdf0e10cSrcweir }
2089cdf0e10cSrcweir 
2090cdf0e10cSrcweir #ifdef USED
SaveState(int nToken)2091cdf0e10cSrcweir void HTMLParser::SaveState( int nToken )
2092cdf0e10cSrcweir {
2093cdf0e10cSrcweir 	SvParser::SaveState( nToken );
2094cdf0e10cSrcweir }
2095cdf0e10cSrcweir 
RestoreState()2096cdf0e10cSrcweir void HTMLParser::RestoreState()
2097cdf0e10cSrcweir {
2098cdf0e10cSrcweir 	SvParser::RestoreState();
2099cdf0e10cSrcweir }
2100cdf0e10cSrcweir #endif
2101cdf0e10cSrcweir 
2102cdf0e10cSrcweir 
// Ids for the kinds of <META> information the parser distinguishes.
// aHTMLMetaNameTable maps the NAME strings to these ids; HTML_META_NONE (0)
// stands for "nothing recognized".
enum eHtmlMetas
{
    HTML_META_NONE = 0,         // no / unknown meta name
    HTML_META_AUTHOR,
    HTML_META_DESCRIPTION,
    HTML_META_KEYWORDS,
    HTML_META_REFRESH,
    HTML_META_CLASSIFICATION,
    HTML_META_CREATED,
    HTML_META_CHANGEDBY,
    HTML_META_CHANGED,
    HTML_META_GENERATOR,
    HTML_META_SDFOOTNOTE,
    HTML_META_SDENDNOTE,
    HTML_META_CONTENT_TYPE
};
2118cdf0e10cSrcweir 
2119cdf0e10cSrcweir // <META NAME=xxx>
2120cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] =
2121cdf0e10cSrcweir {
2122cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_author,        HTML_META_AUTHOR        },
2123cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_changed,       HTML_META_CHANGED       },
2124cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_changedby,     HTML_META_CHANGEDBY     },
2125cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
2126cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_content_type,  HTML_META_CONTENT_TYPE  },
2127cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_created,       HTML_META_CREATED       },
2128cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_description,   HTML_META_DESCRIPTION   },
2129cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_keywords,      HTML_META_KEYWORDS      },
2130cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_generator,     HTML_META_GENERATOR     },
2131cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_refresh,       HTML_META_REFRESH       },
2132cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_sdendnote,     HTML_META_SDENDNOTE     },
2133cdf0e10cSrcweir     { OOO_STRING_SVTOOLS_HTML_META_sdfootnote,    HTML_META_SDFOOTNOTE    },
2134cdf0e10cSrcweir     { 0,                                          0                       }
2135cdf0e10cSrcweir };
2136cdf0e10cSrcweir 
2137cdf0e10cSrcweir 
AddMetaUserDefined(::rtl::OUString const &)2138cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & )
2139cdf0e10cSrcweir {
2140cdf0e10cSrcweir }
2141cdf0e10cSrcweir 
ParseMetaOptionsImpl(const uno::Reference<document::XDocumentProperties> & i_xDocProps,SvKeyValueIterator * i_pHTTPHeader,const HTMLOptions * i_pOptions,rtl_TextEncoding & o_rEnc)2142cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl(
2143cdf0e10cSrcweir         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2144cdf0e10cSrcweir         SvKeyValueIterator *i_pHTTPHeader,
2145cdf0e10cSrcweir         const HTMLOptions *i_pOptions,
2146cdf0e10cSrcweir         rtl_TextEncoding& o_rEnc )
2147cdf0e10cSrcweir {
2148cdf0e10cSrcweir     String aName, aContent;
2149cdf0e10cSrcweir     sal_uInt16 nAction = HTML_META_NONE;
2150cdf0e10cSrcweir     bool bHTTPEquiv = false, bChanged = false;
2151cdf0e10cSrcweir 
2152cdf0e10cSrcweir     for ( sal_uInt16 i = i_pOptions->Count(); i; )
2153cdf0e10cSrcweir     {
2154cdf0e10cSrcweir         const HTMLOption *pOption = (*i_pOptions)[ --i ];
2155cdf0e10cSrcweir         switch ( pOption->GetToken() )
2156cdf0e10cSrcweir         {
2157cdf0e10cSrcweir             case HTML_O_NAME:
2158cdf0e10cSrcweir                 aName = pOption->GetString();
2159cdf0e10cSrcweir                 if ( HTML_META_NONE==nAction )
2160cdf0e10cSrcweir                 {
2161cdf0e10cSrcweir                     pOption->GetEnum( nAction, aHTMLMetaNameTable );
2162cdf0e10cSrcweir                 }
2163cdf0e10cSrcweir                 break;
2164cdf0e10cSrcweir             case HTML_O_HTTPEQUIV:
2165cdf0e10cSrcweir                 aName = pOption->GetString();
2166cdf0e10cSrcweir                 pOption->GetEnum( nAction, aHTMLMetaNameTable );
2167cdf0e10cSrcweir                 bHTTPEquiv = true;
2168cdf0e10cSrcweir                 break;
2169cdf0e10cSrcweir             case HTML_O_CONTENT:
2170cdf0e10cSrcweir                 aContent = pOption->GetString();
2171cdf0e10cSrcweir                 break;
2172cdf0e10cSrcweir         }
2173cdf0e10cSrcweir     }
2174cdf0e10cSrcweir 
2175cdf0e10cSrcweir     if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
2176cdf0e10cSrcweir     {
2177cdf0e10cSrcweir         // if it is not a Description, remove CRs and LFs from CONTENT
2178cdf0e10cSrcweir         aContent.EraseAllChars( _CR );
2179cdf0e10cSrcweir         aContent.EraseAllChars( _LF );
2180cdf0e10cSrcweir     }
2181cdf0e10cSrcweir     else
2182cdf0e10cSrcweir     {
2183cdf0e10cSrcweir         // convert line endings for Description
2184cdf0e10cSrcweir         aContent.ConvertLineEnd();
2185cdf0e10cSrcweir     }
2186cdf0e10cSrcweir 
2187cdf0e10cSrcweir 
2188cdf0e10cSrcweir     if ( bHTTPEquiv && i_pHTTPHeader )
2189cdf0e10cSrcweir     {
2190cdf0e10cSrcweir         // #57232#: Netscape seems to just ignore a closing ", so we do too
2191cdf0e10cSrcweir         if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) )
2192cdf0e10cSrcweir         {
2193cdf0e10cSrcweir             aContent.Erase( aContent.Len() - 1 );
2194cdf0e10cSrcweir         }
2195cdf0e10cSrcweir         SvKeyValue aKeyValue( aName, aContent );
2196cdf0e10cSrcweir         i_pHTTPHeader->Append( aKeyValue );
2197cdf0e10cSrcweir     }
2198cdf0e10cSrcweir 
2199cdf0e10cSrcweir     switch ( nAction )
2200cdf0e10cSrcweir     {
2201cdf0e10cSrcweir         case HTML_META_AUTHOR:
2202cdf0e10cSrcweir             if (i_xDocProps.is()) {
2203cdf0e10cSrcweir                 i_xDocProps->setAuthor( aContent );
2204cdf0e10cSrcweir                 bChanged = true;
2205cdf0e10cSrcweir             }
2206cdf0e10cSrcweir             break;
2207cdf0e10cSrcweir         case HTML_META_DESCRIPTION:
2208cdf0e10cSrcweir             if (i_xDocProps.is()) {
2209cdf0e10cSrcweir                 i_xDocProps->setDescription( aContent );
2210cdf0e10cSrcweir                 bChanged = true;
2211cdf0e10cSrcweir             }
2212cdf0e10cSrcweir             break;
2213cdf0e10cSrcweir         case HTML_META_KEYWORDS:
2214cdf0e10cSrcweir             if (i_xDocProps.is()) {
2215cdf0e10cSrcweir                 i_xDocProps->setKeywords(
2216cdf0e10cSrcweir                     ::comphelper::string::convertCommaSeparated(aContent));
2217cdf0e10cSrcweir                 bChanged = true;
2218cdf0e10cSrcweir             }
2219cdf0e10cSrcweir             break;
2220cdf0e10cSrcweir         case HTML_META_CLASSIFICATION:
2221cdf0e10cSrcweir             if (i_xDocProps.is()) {
2222cdf0e10cSrcweir                 i_xDocProps->setSubject( aContent );
2223cdf0e10cSrcweir                 bChanged = true;
2224cdf0e10cSrcweir             }
2225cdf0e10cSrcweir             break;
2226cdf0e10cSrcweir 
2227cdf0e10cSrcweir         case HTML_META_CHANGEDBY:
2228cdf0e10cSrcweir             if (i_xDocProps.is()) {
2229cdf0e10cSrcweir                 i_xDocProps->setModifiedBy( aContent );
2230cdf0e10cSrcweir             }
2231cdf0e10cSrcweir             break;
2232cdf0e10cSrcweir 
2233cdf0e10cSrcweir         case HTML_META_CREATED:
2234cdf0e10cSrcweir         case HTML_META_CHANGED:
2235cdf0e10cSrcweir             if ( i_xDocProps.is() && aContent.Len() &&
2236cdf0e10cSrcweir                  aContent.GetTokenCount() == 2 )
2237cdf0e10cSrcweir             {
2238cdf0e10cSrcweir                 Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() );
2239cdf0e10cSrcweir                 Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() );
2240cdf0e10cSrcweir                 DateTime aDateTime( aDate, aTime );
2241cdf0e10cSrcweir                 ::util::DateTime uDT(aDateTime.Get100Sec(),
2242cdf0e10cSrcweir                     aDateTime.GetSec(), aDateTime.GetMin(),
2243cdf0e10cSrcweir                     aDateTime.GetHour(), aDateTime.GetDay(),
2244cdf0e10cSrcweir                     aDateTime.GetMonth(), aDateTime.GetYear());
2245cdf0e10cSrcweir                 if ( HTML_META_CREATED==nAction )
2246cdf0e10cSrcweir                     i_xDocProps->setCreationDate( uDT );
2247cdf0e10cSrcweir                 else
2248cdf0e10cSrcweir                     i_xDocProps->setModificationDate( uDT );
2249cdf0e10cSrcweir                 bChanged = true;
2250cdf0e10cSrcweir             }
2251cdf0e10cSrcweir             break;
2252cdf0e10cSrcweir 
2253cdf0e10cSrcweir         case HTML_META_REFRESH:
2254cdf0e10cSrcweir             DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
2255cdf0e10cSrcweir         "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2256cdf0e10cSrcweir             break;
2257cdf0e10cSrcweir 
2258cdf0e10cSrcweir         case HTML_META_CONTENT_TYPE:
2259cdf0e10cSrcweir             if ( aContent.Len() )
2260cdf0e10cSrcweir             {
2261cdf0e10cSrcweir                 o_rEnc = GetEncodingByMIME( aContent );
2262cdf0e10cSrcweir             }
2263cdf0e10cSrcweir             break;
2264cdf0e10cSrcweir 
2265cdf0e10cSrcweir         case HTML_META_NONE:
2266cdf0e10cSrcweir             if ( !bHTTPEquiv )
2267cdf0e10cSrcweir             {
2268cdf0e10cSrcweir                 if (i_xDocProps.is())
2269cdf0e10cSrcweir                 {
2270cdf0e10cSrcweir                     uno::Reference<beans::XPropertyContainer> xUDProps
2271cdf0e10cSrcweir                         = i_xDocProps->getUserDefinedProperties();
2272cdf0e10cSrcweir                     try {
2273cdf0e10cSrcweir                         xUDProps->addProperty(aName,
2274cdf0e10cSrcweir                             beans::PropertyAttribute::REMOVEABLE,
2275cdf0e10cSrcweir                             uno::makeAny(::rtl::OUString(aContent)));
2276cdf0e10cSrcweir                         AddMetaUserDefined(aName);
2277cdf0e10cSrcweir                         bChanged = true;
2278cdf0e10cSrcweir                     } catch (uno::Exception &) {
2279cdf0e10cSrcweir                         // ignore
2280cdf0e10cSrcweir                     }
2281cdf0e10cSrcweir                 }
2282cdf0e10cSrcweir             }
2283cdf0e10cSrcweir             break;
2284cdf0e10cSrcweir         default:
2285cdf0e10cSrcweir             break;
2286cdf0e10cSrcweir     }
2287cdf0e10cSrcweir 
2288cdf0e10cSrcweir     return bChanged;
2289cdf0e10cSrcweir }
2290cdf0e10cSrcweir 
ParseMetaOptions(const uno::Reference<document::XDocumentProperties> & i_xDocProps,SvKeyValueIterator * i_pHeader)2291cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions(
2292cdf0e10cSrcweir         const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2293cdf0e10cSrcweir         SvKeyValueIterator *i_pHeader )
2294cdf0e10cSrcweir {
2295cdf0e10cSrcweir     sal_uInt16 nContentOption = HTML_O_CONTENT;
2296cdf0e10cSrcweir     rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2297cdf0e10cSrcweir 
2298cdf0e10cSrcweir     bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2299cdf0e10cSrcweir 				      GetOptions(&nContentOption),
2300cdf0e10cSrcweir 				      eEnc );
2301cdf0e10cSrcweir 
2302cdf0e10cSrcweir     // If the encoding is set by a META tag, it may only overwrite the
2303cdf0e10cSrcweir     // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2304cdf0e10cSrcweir     // encodings. Everything else cannot lead to reasonable results.
2305cdf0e10cSrcweir     if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2306cdf0e10cSrcweir         rtl_isOctetTextEncoding( eEnc ) &&
2307cdf0e10cSrcweir         rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2308cdf0e10cSrcweir     {
2309cdf0e10cSrcweir         eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973#
2310cdf0e10cSrcweir         SetSrcEncoding( eEnc );
2311cdf0e10cSrcweir     }
2312cdf0e10cSrcweir 
2313cdf0e10cSrcweir     return bRet;
2314cdf0e10cSrcweir }
2315cdf0e10cSrcweir 
GetEncodingByMIME(const String & rMime)2316cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime )
2317cdf0e10cSrcweir {
2318cdf0e10cSrcweir     ByteString sType;
2319cdf0e10cSrcweir     ByteString sSubType;
2320cdf0e10cSrcweir     INetContentTypeParameterList aParameters;
2321cdf0e10cSrcweir     ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US );
2322cdf0e10cSrcweir     if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters))
2323cdf0e10cSrcweir     {
2324cdf0e10cSrcweir         const INetContentTypeParameter * pCharset
2325cdf0e10cSrcweir             = aParameters.find("charset");
2326cdf0e10cSrcweir         if (pCharset != 0)
2327cdf0e10cSrcweir         {
2328cdf0e10cSrcweir             ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US );
2329cdf0e10cSrcweir             return GetExtendedCompatibilityTextEncoding(
2330cdf0e10cSrcweir                     rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) );
2331cdf0e10cSrcweir         }
2332cdf0e10cSrcweir     }
2333cdf0e10cSrcweir     return RTL_TEXTENCODING_DONTKNOW;
2334cdf0e10cSrcweir }
2335cdf0e10cSrcweir 
GetEncodingByHttpHeader(SvKeyValueIterator * pHTTPHeader)2336cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2337cdf0e10cSrcweir {
2338cdf0e10cSrcweir     rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2339cdf0e10cSrcweir     if( pHTTPHeader )
2340cdf0e10cSrcweir 	{
2341cdf0e10cSrcweir         SvKeyValue aKV;
2342cdf0e10cSrcweir 		for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2343cdf0e10cSrcweir 			 bCont = pHTTPHeader->GetNext( aKV ) )
2344cdf0e10cSrcweir 		{
2345cdf0e10cSrcweir 			if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2346cdf0e10cSrcweir 			{
2347cdf0e10cSrcweir 				if( aKV.GetValue().Len() )
2348cdf0e10cSrcweir 				{
2349cdf0e10cSrcweir                     eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2350cdf0e10cSrcweir                 }
2351cdf0e10cSrcweir 			}
2352cdf0e10cSrcweir 		}
2353cdf0e10cSrcweir 	}
2354cdf0e10cSrcweir     return eRet;
2355cdf0e10cSrcweir }
2356cdf0e10cSrcweir 
SetEncodingByHTTPHeader(SvKeyValueIterator * pHTTPHeader)2357cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader(
2358cdf0e10cSrcweir 								SvKeyValueIterator *pHTTPHeader )
2359cdf0e10cSrcweir {
2360cdf0e10cSrcweir 	sal_Bool bRet = sal_False;
2361cdf0e10cSrcweir     rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2362cdf0e10cSrcweir     if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2363cdf0e10cSrcweir 	{
2364cdf0e10cSrcweir         SetSrcEncoding( eEnc );
2365cdf0e10cSrcweir         bRet = sal_True;
2366cdf0e10cSrcweir     }
2367cdf0e10cSrcweir 	return bRet;
2368cdf0e10cSrcweir }
2369cdf0e10cSrcweir 
2370cdf0e10cSrcweir 
2371