1*5900e8ecSAndrew Rist /**************************************************************
2cdf0e10cSrcweir *
3*5900e8ecSAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
4*5900e8ecSAndrew Rist * or more contributor license agreements. See the NOTICE file
5*5900e8ecSAndrew Rist * distributed with this work for additional information
6*5900e8ecSAndrew Rist * regarding copyright ownership. The ASF licenses this file
7*5900e8ecSAndrew Rist * to you under the Apache License, Version 2.0 (the
8*5900e8ecSAndrew Rist * "License"); you may not use this file except in compliance
9*5900e8ecSAndrew Rist * with the License. You may obtain a copy of the License at
10*5900e8ecSAndrew Rist *
11*5900e8ecSAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0
12*5900e8ecSAndrew Rist *
13*5900e8ecSAndrew Rist * Unless required by applicable law or agreed to in writing,
14*5900e8ecSAndrew Rist * software distributed under the License is distributed on an
15*5900e8ecSAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*5900e8ecSAndrew Rist * KIND, either express or implied. See the License for the
17*5900e8ecSAndrew Rist * specific language governing permissions and limitations
18*5900e8ecSAndrew Rist * under the License.
19*5900e8ecSAndrew Rist *
20*5900e8ecSAndrew Rist *************************************************************/
21*5900e8ecSAndrew Rist
22*5900e8ecSAndrew Rist
23cdf0e10cSrcweir
24cdf0e10cSrcweir // MARKER(update_precomp.py): autogen include statement, do not remove
25cdf0e10cSrcweir #include "precompiled_svtools.hxx"
26cdf0e10cSrcweir
27cdf0e10cSrcweir #include <ctype.h>
28cdf0e10cSrcweir #include <stdio.h>
29cdf0e10cSrcweir #include <tools/stream.hxx>
30cdf0e10cSrcweir #include <tools/debug.hxx>
31cdf0e10cSrcweir #include <tools/color.hxx>
32cdf0e10cSrcweir #include <rtl/ustrbuf.hxx>
33cdf0e10cSrcweir #include <rtl/strbuf.hxx>
34cdf0e10cSrcweir #ifndef _SVSTDARR_HXX
35cdf0e10cSrcweir #define _SVSTDARR_ULONGS
36cdf0e10cSrcweir #include <svl/svstdarr.hxx>
37cdf0e10cSrcweir #endif
38cdf0e10cSrcweir
39cdf0e10cSrcweir #include <tools/tenccvt.hxx>
40cdf0e10cSrcweir #include <tools/datetime.hxx>
41cdf0e10cSrcweir #include <svl/inettype.hxx>
42cdf0e10cSrcweir #include <comphelper/string.hxx>
43cdf0e10cSrcweir #include <com/sun/star/beans/PropertyAttribute.hpp>
44cdf0e10cSrcweir #include <com/sun/star/document/XDocumentProperties.hpp>
45cdf0e10cSrcweir
46cdf0e10cSrcweir #include <svtools/parhtml.hxx>
47cdf0e10cSrcweir #include <svtools/htmltokn.h>
48cdf0e10cSrcweir #include <svtools/htmlkywd.hxx>
49cdf0e10cSrcweir
50cdf0e10cSrcweir
51cdf0e10cSrcweir using namespace ::com::sun::star;
52cdf0e10cSrcweir
53cdf0e10cSrcweir
54cdf0e10cSrcweir const sal_Int32 MAX_LEN( 1024L );
55cdf0e10cSrcweir //static sal_Unicode sTmpBuffer[ MAX_LEN+1 ];
56cdf0e10cSrcweir const sal_Int32 MAX_MACRO_LEN( 1024 );
57cdf0e10cSrcweir
58cdf0e10cSrcweir const sal_Int32 MAX_ENTITY_LEN( 8L );
59cdf0e10cSrcweir
60cdf0e10cSrcweir /* */
61cdf0e10cSrcweir
// Tables for converting between option values and their string names
63cdf0e10cSrcweir
64cdf0e10cSrcweir // <INPUT TYPE=xxx>
65cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aInputTypeOptEnums[] =
66cdf0e10cSrcweir {
67cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_text, HTML_IT_TEXT },
68cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_password, HTML_IT_PASSWORD },
69cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_checkbox, HTML_IT_CHECKBOX },
70cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_radio, HTML_IT_RADIO },
71cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_range, HTML_IT_RANGE },
72cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_scribble, HTML_IT_SCRIBBLE },
73cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_file, HTML_IT_FILE },
74cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_hidden, HTML_IT_HIDDEN },
75cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_submit, HTML_IT_SUBMIT },
76cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_image, HTML_IT_IMAGE },
77cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_reset, HTML_IT_RESET },
78cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_IT_button, HTML_IT_BUTTON },
79cdf0e10cSrcweir { 0, 0 }
80cdf0e10cSrcweir };
81cdf0e10cSrcweir
82cdf0e10cSrcweir // <TABLE FRAME=xxx>
83cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableFrameOptEnums[] =
84cdf0e10cSrcweir {
85cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_void, HTML_TF_VOID },
86cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_above, HTML_TF_ABOVE },
87cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_below, HTML_TF_BELOW },
88cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_hsides, HTML_TF_HSIDES },
89cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_lhs, HTML_TF_LHS },
90cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_rhs, HTML_TF_RHS },
91cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_vsides, HTML_TF_VSIDES },
92cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_box, HTML_TF_BOX },
93cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TF_border, HTML_TF_BOX },
94cdf0e10cSrcweir { 0, 0 }
95cdf0e10cSrcweir };
96cdf0e10cSrcweir
97cdf0e10cSrcweir // <TABLE RULES=xxx>
98cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aTableRulesOptEnums[] =
99cdf0e10cSrcweir {
100cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_none, HTML_TR_NONE },
101cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_groups, HTML_TR_GROUPS },
102cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_rows, HTML_TR_ROWS },
103cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_cols, HTML_TR_COLS },
104cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_TR_all, HTML_TR_ALL },
105cdf0e10cSrcweir { 0, 0 }
106cdf0e10cSrcweir };
107cdf0e10cSrcweir
108cdf0e10cSrcweir
// Expands SV_IMPL_PTRARR for the HTMLOptions array of HTMLOptionPtr elements.
// NOTE(review): presumably this emits the out-of-line implementation of the
// HTMLOptions pointer array declared in the header - confirm against the
// macro definition.
SV_IMPL_PTRARR( HTMLOptions, HTMLOptionPtr )
110cdf0e10cSrcweir
111cdf0e10cSrcweir /* */
112cdf0e10cSrcweir
113cdf0e10cSrcweir sal_uInt16 HTMLOption::GetEnum( const HTMLOptionEnum *pOptEnums, sal_uInt16 nDflt ) const
114cdf0e10cSrcweir {
115cdf0e10cSrcweir sal_uInt16 nValue = nDflt;
116cdf0e10cSrcweir
117cdf0e10cSrcweir while( pOptEnums->pName )
118cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
119cdf0e10cSrcweir break;
120cdf0e10cSrcweir else
121cdf0e10cSrcweir pOptEnums++;
122cdf0e10cSrcweir
123cdf0e10cSrcweir if( pOptEnums->pName )
124cdf0e10cSrcweir nValue = pOptEnums->nValue;
125cdf0e10cSrcweir
126cdf0e10cSrcweir return nValue;
127cdf0e10cSrcweir }
128cdf0e10cSrcweir
GetEnum(sal_uInt16 & rEnum,const HTMLOptionEnum * pOptEnums) const129cdf0e10cSrcweir sal_Bool HTMLOption::GetEnum( sal_uInt16 &rEnum, const HTMLOptionEnum *pOptEnums ) const
130cdf0e10cSrcweir {
131cdf0e10cSrcweir while( pOptEnums->pName )
132cdf0e10cSrcweir {
133cdf0e10cSrcweir if( aValue.EqualsIgnoreCaseAscii( pOptEnums->pName ) )
134cdf0e10cSrcweir break;
135cdf0e10cSrcweir else
136cdf0e10cSrcweir pOptEnums++;
137cdf0e10cSrcweir }
138cdf0e10cSrcweir
139cdf0e10cSrcweir const sal_Char *pName = pOptEnums->pName;
140cdf0e10cSrcweir if( pName )
141cdf0e10cSrcweir rEnum = pOptEnums->nValue;
142cdf0e10cSrcweir
143cdf0e10cSrcweir return (pName != 0);
144cdf0e10cSrcweir }
145cdf0e10cSrcweir
HTMLOption(sal_uInt16 nTok,const String & rToken,const String & rValue)146cdf0e10cSrcweir HTMLOption::HTMLOption( sal_uInt16 nTok, const String& rToken,
147cdf0e10cSrcweir const String& rValue )
148cdf0e10cSrcweir : aValue(rValue)
149cdf0e10cSrcweir , aToken(rToken)
150cdf0e10cSrcweir , nToken( nTok )
151cdf0e10cSrcweir {
152cdf0e10cSrcweir DBG_ASSERT( nToken>=HTML_OPTION_START && nToken<HTML_OPTION_END,
153cdf0e10cSrcweir "HTMLOption: unbekanntes Token" );
154cdf0e10cSrcweir }
155cdf0e10cSrcweir
GetNumber() const156cdf0e10cSrcweir sal_uInt32 HTMLOption::GetNumber() const
157cdf0e10cSrcweir {
158cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START &&
159cdf0e10cSrcweir nToken<HTML_OPTION_NUMBER_END) ||
160cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START &&
161cdf0e10cSrcweir nToken<HTML_OPTION_CONTEXT_END) ||
162cdf0e10cSrcweir nToken==HTML_O_VALUE,
163cdf0e10cSrcweir "GetNumber: Option ist nicht numerisch" );
164cdf0e10cSrcweir String aTmp( aValue );
165cdf0e10cSrcweir aTmp.EraseLeadingChars();
166cdf0e10cSrcweir sal_Int32 nTmp = aTmp.ToInt32();
167cdf0e10cSrcweir return nTmp >= 0 ? (sal_uInt32)nTmp : 0;
168cdf0e10cSrcweir }
169cdf0e10cSrcweir
GetSNumber() const170cdf0e10cSrcweir sal_Int32 HTMLOption::GetSNumber() const
171cdf0e10cSrcweir {
172cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_NUMBER_START && nToken<HTML_OPTION_NUMBER_END) ||
173cdf0e10cSrcweir (nToken>=HTML_OPTION_CONTEXT_START && nToken<HTML_OPTION_CONTEXT_END),
174cdf0e10cSrcweir "GetSNumber: Option ist nicht numerisch" );
175cdf0e10cSrcweir String aTmp( aValue );
176cdf0e10cSrcweir aTmp.EraseLeadingChars();
177cdf0e10cSrcweir return aTmp.ToInt32();
178cdf0e10cSrcweir }
179cdf0e10cSrcweir
GetNumbers(SvULongs & rLongs,sal_Bool bSpaceDelim) const180cdf0e10cSrcweir void HTMLOption::GetNumbers( SvULongs &rLongs, sal_Bool bSpaceDelim ) const
181cdf0e10cSrcweir {
182cdf0e10cSrcweir if( rLongs.Count() )
183cdf0e10cSrcweir rLongs.Remove( 0, rLongs.Count() );
184cdf0e10cSrcweir
185cdf0e10cSrcweir if( bSpaceDelim )
186cdf0e10cSrcweir {
187cdf0e10cSrcweir // das ist ein sehr stark vereinfachter Scanner. Er sucht einfach
188cdf0e10cSrcweir // alle Tiffern aus dem String
189cdf0e10cSrcweir sal_Bool bInNum = sal_False;
190cdf0e10cSrcweir sal_uLong nNum = 0;
191cdf0e10cSrcweir for( xub_StrLen i=0; i<aValue.Len(); i++ )
192cdf0e10cSrcweir {
193cdf0e10cSrcweir register sal_Unicode c = aValue.GetChar( i );
194cdf0e10cSrcweir if( c>='0' && c<='9' )
195cdf0e10cSrcweir {
196cdf0e10cSrcweir nNum *= 10;
197cdf0e10cSrcweir nNum += (c - '0');
198cdf0e10cSrcweir bInNum = sal_True;
199cdf0e10cSrcweir }
200cdf0e10cSrcweir else if( bInNum )
201cdf0e10cSrcweir {
202cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() );
203cdf0e10cSrcweir bInNum = sal_False;
204cdf0e10cSrcweir nNum = 0;
205cdf0e10cSrcweir }
206cdf0e10cSrcweir }
207cdf0e10cSrcweir if( bInNum )
208cdf0e10cSrcweir {
209cdf0e10cSrcweir rLongs.Insert( nNum, rLongs.Count() );
210cdf0e10cSrcweir }
211cdf0e10cSrcweir }
212cdf0e10cSrcweir else
213cdf0e10cSrcweir {
214cdf0e10cSrcweir // hier wird auf die korrekte Trennung der Zahlen durch ',' geachtet
215cdf0e10cSrcweir // und auch mal eine 0 eingefuegt
216cdf0e10cSrcweir xub_StrLen nPos = 0;
217cdf0e10cSrcweir while( nPos < aValue.Len() )
218cdf0e10cSrcweir {
219cdf0e10cSrcweir register sal_Unicode c;
220cdf0e10cSrcweir while( nPos < aValue.Len() &&
221cdf0e10cSrcweir ((c=aValue.GetChar(nPos)) == ' ' || c == '\t' ||
222cdf0e10cSrcweir c == '\n' || c== '\r' ) )
223cdf0e10cSrcweir nPos++;
224cdf0e10cSrcweir
225cdf0e10cSrcweir if( nPos==aValue.Len() )
226cdf0e10cSrcweir rLongs.Insert( sal_uLong(0), rLongs.Count() );
227cdf0e10cSrcweir else
228cdf0e10cSrcweir {
229cdf0e10cSrcweir xub_StrLen nEnd = aValue.Search( (sal_Unicode)',', nPos );
230cdf0e10cSrcweir if( STRING_NOTFOUND==nEnd )
231cdf0e10cSrcweir {
232cdf0e10cSrcweir sal_Int32 nTmp = aValue.Copy(nPos).ToInt32();
233cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
234cdf0e10cSrcweir rLongs.Count() );
235cdf0e10cSrcweir nPos = aValue.Len();
236cdf0e10cSrcweir }
237cdf0e10cSrcweir else
238cdf0e10cSrcweir {
239cdf0e10cSrcweir sal_Int32 nTmp =
240cdf0e10cSrcweir aValue.Copy(nPos,nEnd-nPos).ToInt32();
241cdf0e10cSrcweir rLongs.Insert( nTmp >= 0 ? (sal_uInt32)nTmp : 0,
242cdf0e10cSrcweir rLongs.Count() );
243cdf0e10cSrcweir nPos = nEnd+1;
244cdf0e10cSrcweir }
245cdf0e10cSrcweir }
246cdf0e10cSrcweir }
247cdf0e10cSrcweir }
248cdf0e10cSrcweir }
249cdf0e10cSrcweir
GetColor(Color & rColor) const250cdf0e10cSrcweir void HTMLOption::GetColor( Color& rColor ) const
251cdf0e10cSrcweir {
252cdf0e10cSrcweir DBG_ASSERT( (nToken>=HTML_OPTION_COLOR_START && nToken<HTML_OPTION_COLOR_END) || nToken==HTML_O_SIZE,
253cdf0e10cSrcweir "GetColor: Option spezifiziert keine Farbe" );
254cdf0e10cSrcweir
255cdf0e10cSrcweir String aTmp( aValue );
256cdf0e10cSrcweir aTmp.ToUpperAscii();
257cdf0e10cSrcweir sal_uLong nColor = ULONG_MAX;
258cdf0e10cSrcweir if( '#'!=aTmp.GetChar( 0 ) )
259cdf0e10cSrcweir nColor = GetHTMLColor( aTmp );
260cdf0e10cSrcweir
261cdf0e10cSrcweir if( ULONG_MAX == nColor )
262cdf0e10cSrcweir {
263cdf0e10cSrcweir nColor = 0;
264cdf0e10cSrcweir xub_StrLen nPos = 0;
265cdf0e10cSrcweir for( sal_uInt32 i=0; i<6; i++ )
266cdf0e10cSrcweir {
267cdf0e10cSrcweir // MIB 26.06.97: Wie auch immer Netscape Farbwerte ermittelt,
268cdf0e10cSrcweir // maximal drei Zeichen, die kleiner als '0' sind werden
269cdf0e10cSrcweir // ignoriert. Bug #40901# stimmt damit. Mal schauen, was sich
270cdf0e10cSrcweir // irgendwelche HTML-Autoren noch so einfallen lassen...
271cdf0e10cSrcweir register sal_Unicode c = nPos<aTmp.Len() ? aTmp.GetChar( nPos++ )
272cdf0e10cSrcweir : '0';
273cdf0e10cSrcweir if( c < '0' )
274cdf0e10cSrcweir {
275cdf0e10cSrcweir c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
276cdf0e10cSrcweir if( c < '0' )
277cdf0e10cSrcweir c = nPos<aTmp.Len() ? aTmp.GetChar(nPos++) : '0';
278cdf0e10cSrcweir }
279cdf0e10cSrcweir nColor *= 16;
280cdf0e10cSrcweir if( c >= '0' && c <= '9' )
281cdf0e10cSrcweir nColor += (c - 48);
282cdf0e10cSrcweir else if( c >= 'A' && c <= 'F' )
283cdf0e10cSrcweir nColor += (c - 55);
284cdf0e10cSrcweir }
285cdf0e10cSrcweir }
286cdf0e10cSrcweir
287cdf0e10cSrcweir rColor.SetRed( (sal_uInt8)((nColor & 0x00ff0000) >> 16) );
288cdf0e10cSrcweir rColor.SetGreen( (sal_uInt8)((nColor & 0x0000ff00) >> 8));
289cdf0e10cSrcweir rColor.SetBlue( (sal_uInt8)(nColor & 0x000000ff) );
290cdf0e10cSrcweir }
291cdf0e10cSrcweir
GetInputType() const292cdf0e10cSrcweir HTMLInputType HTMLOption::GetInputType() const
293cdf0e10cSrcweir {
294cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_TYPE, "GetInputType: Option nicht TYPE" );
295cdf0e10cSrcweir return (HTMLInputType)GetEnum( aInputTypeOptEnums, HTML_IT_TEXT );
296cdf0e10cSrcweir }
297cdf0e10cSrcweir
GetTableFrame() const298cdf0e10cSrcweir HTMLTableFrame HTMLOption::GetTableFrame() const
299cdf0e10cSrcweir {
300cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_FRAME, "GetTableFrame: Option nicht FRAME" );
301cdf0e10cSrcweir return (HTMLTableFrame)GetEnum( aTableFrameOptEnums, HTML_TF_VOID );
302cdf0e10cSrcweir }
303cdf0e10cSrcweir
GetTableRules() const304cdf0e10cSrcweir HTMLTableRules HTMLOption::GetTableRules() const
305cdf0e10cSrcweir {
306cdf0e10cSrcweir DBG_ASSERT( nToken==HTML_O_RULES, "GetTableRules: Option nicht RULES" );
307cdf0e10cSrcweir return (HTMLTableRules)GetEnum( aTableRulesOptEnums, HTML_TR_NONE );
308cdf0e10cSrcweir }
309cdf0e10cSrcweir
310cdf0e10cSrcweir /* */
311cdf0e10cSrcweir
HTMLParser(SvStream & rIn,int bReadNewDoc)312cdf0e10cSrcweir HTMLParser::HTMLParser( SvStream& rIn, int bReadNewDoc )
313cdf0e10cSrcweir : SvParser( rIn )
314cdf0e10cSrcweir {
315cdf0e10cSrcweir bNewDoc = bReadNewDoc;
316cdf0e10cSrcweir bReadListing = bReadXMP = bReadPRE = bReadTextArea =
317cdf0e10cSrcweir bReadScript = bReadStyle =
318cdf0e10cSrcweir bEndTokenFound = bIsInBody = bReadNextChar =
319cdf0e10cSrcweir bReadComment = sal_False;
320cdf0e10cSrcweir bIsInHeader = sal_True;
321cdf0e10cSrcweir pOptions = new HTMLOptions;
3228d621361SPedro Giffuni
3238d621361SPedro Giffuni //#i76649, default to UTF-8 for HTML unless we know differently
3248d621361SPedro Giffuni SetSrcEncoding(RTL_TEXTENCODING_UTF8);
325cdf0e10cSrcweir }
326cdf0e10cSrcweir
~HTMLParser()327cdf0e10cSrcweir HTMLParser::~HTMLParser()
328cdf0e10cSrcweir {
329cdf0e10cSrcweir if( pOptions && pOptions->Count() )
330cdf0e10cSrcweir pOptions->DeleteAndDestroy( 0, pOptions->Count() );
331cdf0e10cSrcweir delete pOptions;
332cdf0e10cSrcweir }
333cdf0e10cSrcweir
CallParser()334cdf0e10cSrcweir SvParserState __EXPORT HTMLParser::CallParser()
335cdf0e10cSrcweir {
336cdf0e10cSrcweir eState = SVPAR_WORKING;
337cdf0e10cSrcweir nNextCh = GetNextChar();
338cdf0e10cSrcweir SaveState( 0 );
339cdf0e10cSrcweir
340cdf0e10cSrcweir nPre_LinePos = 0;
341cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False;
342cdf0e10cSrcweir
343cdf0e10cSrcweir AddRef();
344cdf0e10cSrcweir Continue( 0 );
345cdf0e10cSrcweir if( SVPAR_PENDING != eState )
346cdf0e10cSrcweir ReleaseRef(); // dann brauchen wir den Parser nicht mehr!
347cdf0e10cSrcweir
348cdf0e10cSrcweir return eState;
349cdf0e10cSrcweir }
350cdf0e10cSrcweir
Continue(int nToken)351cdf0e10cSrcweir void HTMLParser::Continue( int nToken )
352cdf0e10cSrcweir {
353cdf0e10cSrcweir if( !nToken )
354cdf0e10cSrcweir nToken = GetNextToken();
355cdf0e10cSrcweir
356cdf0e10cSrcweir while( IsParserWorking() )
357cdf0e10cSrcweir {
358cdf0e10cSrcweir SaveState( nToken );
359cdf0e10cSrcweir nToken = FilterToken( nToken );
360cdf0e10cSrcweir
361cdf0e10cSrcweir if( nToken )
362cdf0e10cSrcweir NextToken( nToken );
363cdf0e10cSrcweir
364cdf0e10cSrcweir if( IsParserWorking() )
365cdf0e10cSrcweir SaveState( 0 ); // bis hierhin abgearbeitet,
366cdf0e10cSrcweir // weiter mit neuem Token!
367cdf0e10cSrcweir nToken = GetNextToken();
368cdf0e10cSrcweir }
369cdf0e10cSrcweir }
370cdf0e10cSrcweir
FilterToken(int nToken)371cdf0e10cSrcweir int HTMLParser::FilterToken( int nToken )
372cdf0e10cSrcweir {
373cdf0e10cSrcweir switch( nToken )
374cdf0e10cSrcweir {
375cdf0e10cSrcweir case sal_Unicode(EOF):
376cdf0e10cSrcweir nToken = 0;
377cdf0e10cSrcweir break; // nicht verschicken
378cdf0e10cSrcweir
379cdf0e10cSrcweir case HTML_HEAD_OFF:
380cdf0e10cSrcweir bIsInBody = sal_True;
381cdf0e10cSrcweir case HTML_HEAD_ON:
382cdf0e10cSrcweir bIsInHeader = HTML_HEAD_ON == nToken;
383cdf0e10cSrcweir break;
384cdf0e10cSrcweir
385cdf0e10cSrcweir case HTML_BODY_ON:
386cdf0e10cSrcweir case HTML_FRAMESET_ON:
387cdf0e10cSrcweir bIsInHeader = sal_False;
388cdf0e10cSrcweir bIsInBody = HTML_BODY_ON == nToken;
389cdf0e10cSrcweir break;
390cdf0e10cSrcweir
391cdf0e10cSrcweir case HTML_BODY_OFF:
392cdf0e10cSrcweir bIsInBody = bReadPRE = bReadListing = bReadXMP = sal_False;
393cdf0e10cSrcweir break;
394cdf0e10cSrcweir
395cdf0e10cSrcweir case HTML_HTML_OFF:
396cdf0e10cSrcweir nToken = 0;
397cdf0e10cSrcweir bReadPRE = bReadListing = bReadXMP = sal_False;
398cdf0e10cSrcweir break; // HTML_ON wurde auch nicht verschickt !
399cdf0e10cSrcweir
400cdf0e10cSrcweir case HTML_PREFORMTXT_ON:
401cdf0e10cSrcweir StartPRE();
402cdf0e10cSrcweir break;
403cdf0e10cSrcweir
404cdf0e10cSrcweir case HTML_PREFORMTXT_OFF:
405cdf0e10cSrcweir FinishPRE();
406cdf0e10cSrcweir break;
407cdf0e10cSrcweir
408cdf0e10cSrcweir case HTML_LISTING_ON:
409cdf0e10cSrcweir StartListing();
410cdf0e10cSrcweir break;
411cdf0e10cSrcweir
412cdf0e10cSrcweir case HTML_LISTING_OFF:
413cdf0e10cSrcweir FinishListing();
414cdf0e10cSrcweir break;
415cdf0e10cSrcweir
416cdf0e10cSrcweir case HTML_XMP_ON:
417cdf0e10cSrcweir StartXMP();
418cdf0e10cSrcweir break;
419cdf0e10cSrcweir
420cdf0e10cSrcweir case HTML_XMP_OFF:
421cdf0e10cSrcweir FinishXMP();
422cdf0e10cSrcweir break;
423cdf0e10cSrcweir
424cdf0e10cSrcweir default:
425cdf0e10cSrcweir if( bReadPRE )
426cdf0e10cSrcweir nToken = FilterPRE( nToken );
427cdf0e10cSrcweir else if( bReadListing )
428cdf0e10cSrcweir nToken = FilterListing( nToken );
429cdf0e10cSrcweir else if( bReadXMP )
430cdf0e10cSrcweir nToken = FilterXMP( nToken );
431cdf0e10cSrcweir
432cdf0e10cSrcweir break;
433cdf0e10cSrcweir }
434cdf0e10cSrcweir
435cdf0e10cSrcweir return nToken;
436cdf0e10cSrcweir }
437cdf0e10cSrcweir
// Locale independent character classification for the scanner.
//
// Every use of the macro parameter is parenthesised so that an argument
// which is itself an expression (e.g. "b ? x : y") is classified as a
// whole. The argument is still evaluated more than once, so it must not
// have side effects.
#define HTML_ISDIGIT( c ) ( (c) >= '0' && (c) <= '9' )
#define HTML_ISALPHA( c ) ( ((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') )
#define HTML_ISALNUM( c ) ( HTML_ISALPHA(c) || HTML_ISDIGIT(c) )
// blank, or one of the control characters 0x09 (tab) to 0x0d (carriage return)
#define HTML_ISSPACE( c ) ( ' ' == (c) || ((c) >= 0x09 && (c) <= 0x0d) )
// everything from 32 (blank) upwards except 127 (DEL)
#define HTML_ISPRINTABLE( c ) ( (c) >= 32 && (c) != 127 )
// --> OD 2006-07-26 #138464#
#define HTML_ISHEXDIGIT( c ) ( HTML_ISDIGIT(c) || ((c) >= 'A' && (c) <= 'F') || ((c) >= 'a' && (c) <= 'f') )
// <--
446cdf0e10cSrcweir
ScanText(const sal_Unicode cBreak)447cdf0e10cSrcweir int HTMLParser::ScanText( const sal_Unicode cBreak )
448cdf0e10cSrcweir {
449cdf0e10cSrcweir ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
450cdf0e10cSrcweir int bWeiter = sal_True;
451cdf0e10cSrcweir int bEqSignFound = sal_False;
452cdf0e10cSrcweir sal_Unicode cQuote = 0U;
453cdf0e10cSrcweir
454cdf0e10cSrcweir while( bWeiter && IsParserWorking() )
455cdf0e10cSrcweir {
456cdf0e10cSrcweir int bNextCh = sal_True;
457cdf0e10cSrcweir switch( nNextCh )
458cdf0e10cSrcweir {
459cdf0e10cSrcweir case '&':
460cdf0e10cSrcweir bEqSignFound = sal_False;
461cdf0e10cSrcweir if( bReadXMP )
462cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' );
463cdf0e10cSrcweir else
464cdf0e10cSrcweir {
465cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell();
466cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos();
467cdf0e10cSrcweir
468cdf0e10cSrcweir sal_Unicode cChar = 0U;
469cdf0e10cSrcweir if( '#' == (nNextCh = GetNextChar()) )
470cdf0e10cSrcweir {
471cdf0e10cSrcweir nNextCh = GetNextChar();
472cdf0e10cSrcweir // --> OD 2006-07-26 #138464#
473cdf0e10cSrcweir // consider hexadecimal digits
474cdf0e10cSrcweir const sal_Bool bIsHex( 'x' == nNextCh );
475cdf0e10cSrcweir const sal_Bool bIsDecOrHex( bIsHex || HTML_ISDIGIT(nNextCh) );
476cdf0e10cSrcweir if ( bIsDecOrHex )
477cdf0e10cSrcweir {
478cdf0e10cSrcweir if ( bIsHex )
479cdf0e10cSrcweir {
480cdf0e10cSrcweir nNextCh = GetNextChar();
481cdf0e10cSrcweir while ( HTML_ISHEXDIGIT(nNextCh) )
482cdf0e10cSrcweir {
483cdf0e10cSrcweir cChar = cChar * 16U +
484cdf0e10cSrcweir ( nNextCh <= '9'
485cdf0e10cSrcweir ? sal_Unicode( nNextCh - '0' )
486cdf0e10cSrcweir : ( nNextCh <= 'F'
487cdf0e10cSrcweir ? sal_Unicode( nNextCh - 'A' + 10 )
488cdf0e10cSrcweir : sal_Unicode( nNextCh - 'a' + 10 ) ) );
489cdf0e10cSrcweir nNextCh = GetNextChar();
490cdf0e10cSrcweir }
491cdf0e10cSrcweir }
492cdf0e10cSrcweir else
493cdf0e10cSrcweir {
494cdf0e10cSrcweir do
495cdf0e10cSrcweir {
496cdf0e10cSrcweir cChar = cChar * 10U + sal_Unicode( nNextCh - '0');
497cdf0e10cSrcweir nNextCh = GetNextChar();
498cdf0e10cSrcweir }
499cdf0e10cSrcweir while( HTML_ISDIGIT(nNextCh) );
500cdf0e10cSrcweir }
501cdf0e10cSrcweir
502cdf0e10cSrcweir if( RTL_TEXTENCODING_DONTKNOW != eSrcEnc &&
503cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 != eSrcEnc &&
504cdf0e10cSrcweir RTL_TEXTENCODING_UTF8 != eSrcEnc &&
505cdf0e10cSrcweir cChar < 256 )
506cdf0e10cSrcweir {
507cdf0e10cSrcweir sal_Unicode cOrig = cChar;
508cdf0e10cSrcweir cChar = ByteString::ConvertToUnicode(
509cdf0e10cSrcweir (sal_Char)cChar, eSrcEnc );
510cdf0e10cSrcweir if( 0U == cChar )
511cdf0e10cSrcweir {
512cdf0e10cSrcweir // #73398#: If the character could not be
513cdf0e10cSrcweir // converted, because a conversion is not
514cdf0e10cSrcweir // available, do no conversion at all.
515cdf0e10cSrcweir cChar = cOrig;
516cdf0e10cSrcweir }
517cdf0e10cSrcweir }
518cdf0e10cSrcweir }
519cdf0e10cSrcweir // <--
520cdf0e10cSrcweir else
521cdf0e10cSrcweir nNextCh = 0U;
522cdf0e10cSrcweir }
523cdf0e10cSrcweir else if( HTML_ISALPHA( nNextCh ) )
524cdf0e10cSrcweir {
525cdf0e10cSrcweir ::rtl::OUStringBuffer sEntityBuffer( MAX_ENTITY_LEN );
526cdf0e10cSrcweir xub_StrLen nPos = 0L;
527cdf0e10cSrcweir do
528cdf0e10cSrcweir {
529cdf0e10cSrcweir sEntityBuffer.append( nNextCh );
530cdf0e10cSrcweir nPos++;
531cdf0e10cSrcweir nNextCh = GetNextChar();
532cdf0e10cSrcweir }
533cdf0e10cSrcweir while( nPos < MAX_ENTITY_LEN && HTML_ISALNUM( nNextCh ) &&
534cdf0e10cSrcweir !rInput.IsEof() );
535cdf0e10cSrcweir
536cdf0e10cSrcweir if( IsParserWorking() && !rInput.IsEof() )
537cdf0e10cSrcweir {
538cdf0e10cSrcweir String sEntity( sEntityBuffer.getStr(), nPos );
539cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity );
540cdf0e10cSrcweir
541cdf0e10cSrcweir // nicht gefunden ( == 0 ), dann Klartext
542cdf0e10cSrcweir // oder ein Zeichen das als Attribut eingefuegt
543cdf0e10cSrcweir // wird
544cdf0e10cSrcweir if( 0U == cChar && ';' != nNextCh )
545cdf0e10cSrcweir {
546cdf0e10cSrcweir DBG_ASSERT( rInput.Tell() - nStreamPos ==
547cdf0e10cSrcweir (sal_uLong)(nPos+1L)*GetCharSize(),
548cdf0e10cSrcweir "UTF-8 geht hier schief" );
549cdf0e10cSrcweir for( xub_StrLen i=nPos-1L; i>1L; i-- )
550cdf0e10cSrcweir {
551cdf0e10cSrcweir nNextCh = sEntityBuffer[i];
552cdf0e10cSrcweir sEntityBuffer.setLength( i );
553cdf0e10cSrcweir sEntity.Assign( sEntityBuffer.getStr(), i );
554cdf0e10cSrcweir cChar = GetHTMLCharName( sEntity );
555cdf0e10cSrcweir if( cChar )
556cdf0e10cSrcweir {
557cdf0e10cSrcweir rInput.SeekRel( -(long)
558cdf0e10cSrcweir ((nPos-i)*GetCharSize()) );
559cdf0e10cSrcweir nlLinePos -= sal_uInt32(nPos-i);
560cdf0e10cSrcweir nPos = i;
561cdf0e10cSrcweir ClearTxtConvContext();
562cdf0e10cSrcweir break;
563cdf0e10cSrcweir }
564cdf0e10cSrcweir }
565cdf0e10cSrcweir }
566cdf0e10cSrcweir
567cdf0e10cSrcweir if( !cChar ) // unbekanntes Zeichen?
568cdf0e10cSrcweir {
569cdf0e10cSrcweir // dann im Stream zurueck, das '&' als Zeichen
570cdf0e10cSrcweir // einfuegen und mit dem nachfolgenden Zeichen
571cdf0e10cSrcweir // wieder aufsetzen
572cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' );
573cdf0e10cSrcweir
574cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
575cdf0e10cSrcweir // nlLinePos -= nPos;
576cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos ==
577cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(),
578cdf0e10cSrcweir "Falsche Stream-Position" );
579cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos ==
580cdf0e10cSrcweir (sal_uLong)(nPos+1),
581cdf0e10cSrcweir "Falsche Zeilen-Position" );
582cdf0e10cSrcweir rInput.Seek( nStreamPos );
583cdf0e10cSrcweir nlLinePos = nLinePos;
584cdf0e10cSrcweir ClearTxtConvContext();
585cdf0e10cSrcweir break;
586cdf0e10cSrcweir }
587cdf0e10cSrcweir
588cdf0e10cSrcweir // 1 == Non Breaking Space
589cdf0e10cSrcweir // 2 == SoftHyphen
590cdf0e10cSrcweir
591cdf0e10cSrcweir if( cChar < 3U )
592cdf0e10cSrcweir {
593cdf0e10cSrcweir if( '>' == cBreak )
594cdf0e10cSrcweir {
595cdf0e10cSrcweir // Wenn der Inhalt eines Tags gelesen wird,
596cdf0e10cSrcweir // muessen wir ein Space bzw. - daraus machen
597cdf0e10cSrcweir switch( cChar )
598cdf0e10cSrcweir {
599cdf0e10cSrcweir case 1U: cChar = ' '; break;
600cdf0e10cSrcweir case 2U: cChar = '-'; break;
601cdf0e10cSrcweir default:
602cdf0e10cSrcweir DBG_ASSERT( cChar==1U,
603cdf0e10cSrcweir "\0x00 sollte doch schon laengt abgefangen sein!" );
604cdf0e10cSrcweir break;
605cdf0e10cSrcweir }
606cdf0e10cSrcweir }
607cdf0e10cSrcweir else
608cdf0e10cSrcweir {
609cdf0e10cSrcweir // Wenn kein Tag gescannt wird, enstprechendes
610cdf0e10cSrcweir // Token zurueckgeben
611cdf0e10cSrcweir aToken +=
612cdf0e10cSrcweir String( sTmpBuffer.makeStringAndClear() );
613cdf0e10cSrcweir if( cChar )
614cdf0e10cSrcweir {
615cdf0e10cSrcweir if( aToken.Len() )
616cdf0e10cSrcweir {
617cdf0e10cSrcweir // mit dem Zeichen wieder aufsetzen
618cdf0e10cSrcweir nNextCh = '&';
619cdf0e10cSrcweir // rInput.SeekRel( -(long)(++nPos*GetCharSize()) );
620cdf0e10cSrcweir // nlLinePos -= nPos;
621cdf0e10cSrcweir DBG_ASSERT( rInput.Tell()-nStreamPos ==
622cdf0e10cSrcweir (sal_uLong)(nPos+1)*GetCharSize(),
623cdf0e10cSrcweir "Falsche Stream-Position" );
624cdf0e10cSrcweir DBG_ASSERT( nlLinePos-nLinePos ==
625cdf0e10cSrcweir (sal_uLong)(nPos+1),
626cdf0e10cSrcweir "Falsche Zeilen-Position" );
627cdf0e10cSrcweir rInput.Seek( nStreamPos );
628cdf0e10cSrcweir nlLinePos = nLinePos;
629cdf0e10cSrcweir ClearTxtConvContext();
630cdf0e10cSrcweir return HTML_TEXTTOKEN;
631cdf0e10cSrcweir }
632cdf0e10cSrcweir
633cdf0e10cSrcweir // Hack: _GetNextChar soll nicht das
634cdf0e10cSrcweir // naechste Zeichen lesen
635cdf0e10cSrcweir if( ';' != nNextCh )
636cdf0e10cSrcweir aToken += ' ';
637cdf0e10cSrcweir if( 1U == cChar )
638cdf0e10cSrcweir return HTML_NONBREAKSPACE;
639cdf0e10cSrcweir if( 2U == cChar )
640cdf0e10cSrcweir return HTML_SOFTHYPH;
641cdf0e10cSrcweir }
642cdf0e10cSrcweir aToken += (sal_Unicode)'&';
643cdf0e10cSrcweir aToken +=
644cdf0e10cSrcweir String(sEntityBuffer.makeStringAndClear());
645cdf0e10cSrcweir break;
646cdf0e10cSrcweir }
647cdf0e10cSrcweir }
648cdf0e10cSrcweir }
649cdf0e10cSrcweir else
650cdf0e10cSrcweir nNextCh = 0U;
651cdf0e10cSrcweir }
652cdf0e10cSrcweir // MIB 03/02/2000: &{...};-JavaScript-Macros are not
653cdf0e10cSrcweir // supported any longer.
654cdf0e10cSrcweir else if( IsParserWorking() )
655cdf0e10cSrcweir {
656cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'&' );
657cdf0e10cSrcweir bNextCh = sal_False;
658cdf0e10cSrcweir break;
659cdf0e10cSrcweir }
660cdf0e10cSrcweir
661cdf0e10cSrcweir bNextCh = (';' == nNextCh);
662cdf0e10cSrcweir if( cBreak=='>' && (cChar=='\\' || cChar=='\'' ||
663cdf0e10cSrcweir cChar=='\"' || cChar==' ') )
664cdf0e10cSrcweir {
665cdf0e10cSrcweir // ' und " mussen innerhalb von Tags mit einem
666cdf0e10cSrcweir // gekennzeichnet werden, um sie von ' und " als Klammern
667cdf0e10cSrcweir // um Optionen zu unterscheiden. Logischerweise muss
668cdf0e10cSrcweir // deshalb auch ein \ gekeenzeichnet werden. Ausserdem
669cdf0e10cSrcweir // schuetzen wir ein Space, weil es kein Trennzeichen
670cdf0e10cSrcweir // zwischen Optionen ist.
671cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' );
672cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() )
673cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
674cdf0e10cSrcweir }
675cdf0e10cSrcweir if( IsParserWorking() )
676cdf0e10cSrcweir {
677cdf0e10cSrcweir if( cChar )
678cdf0e10cSrcweir sTmpBuffer.append( cChar );
679cdf0e10cSrcweir }
680cdf0e10cSrcweir else if( SVPAR_PENDING==eState && '>'!=cBreak )
681cdf0e10cSrcweir {
682cdf0e10cSrcweir // Mit dem '&' Zeichen wieder aufsetzen, der Rest
683cdf0e10cSrcweir // wird als Texttoken zurueckgegeben.
684cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() )
685cdf0e10cSrcweir {
686cdf0e10cSrcweir // Der bisherige Text wird von _GetNextChar()
687cdf0e10cSrcweir // zurueckgegeben und beim naechsten Aufruf wird
688cdf0e10cSrcweir // ein neues Zeichen gelesen. Also muessen wir uns
689cdf0e10cSrcweir // noch vor das & stellen.
690cdf0e10cSrcweir nNextCh = 0U;
691cdf0e10cSrcweir rInput.Seek( nStreamPos-(sal_uInt32)GetCharSize() );
692cdf0e10cSrcweir nlLinePos = nLinePos-1;
693cdf0e10cSrcweir ClearTxtConvContext();
694cdf0e10cSrcweir bReadNextChar = sal_True;
695cdf0e10cSrcweir }
696cdf0e10cSrcweir bNextCh = sal_False;
697cdf0e10cSrcweir }
698cdf0e10cSrcweir }
699cdf0e10cSrcweir break;
700cdf0e10cSrcweir case '=':
701cdf0e10cSrcweir if( '>'==cBreak && !cQuote )
702cdf0e10cSrcweir bEqSignFound = sal_True;
703cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
704cdf0e10cSrcweir break;
705cdf0e10cSrcweir
706cdf0e10cSrcweir case '\\':
707cdf0e10cSrcweir if( '>'==cBreak )
708cdf0e10cSrcweir {
709cdf0e10cSrcweir // Innerhalb von Tags kennzeichnen
710cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' );
711cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() )
712cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
713cdf0e10cSrcweir }
714cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)'\\' );
715cdf0e10cSrcweir break;
716cdf0e10cSrcweir
717cdf0e10cSrcweir case '\"':
718cdf0e10cSrcweir case '\'':
719cdf0e10cSrcweir if( '>'==cBreak )
720cdf0e10cSrcweir {
721cdf0e10cSrcweir if( bEqSignFound )
722cdf0e10cSrcweir cQuote = nNextCh;
723cdf0e10cSrcweir else if( cQuote && (cQuote==nNextCh ) )
724cdf0e10cSrcweir cQuote = 0U;
725cdf0e10cSrcweir }
726cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
727cdf0e10cSrcweir bEqSignFound = sal_False;
728cdf0e10cSrcweir break;
729cdf0e10cSrcweir
730cdf0e10cSrcweir case sal_Unicode(EOF):
731cdf0e10cSrcweir if( rInput.IsEof() )
732cdf0e10cSrcweir {
733cdf0e10cSrcweir // MIB 20.11.98: Das macht hier keinen Sinn, oder doch: Zumindest wird
734cdf0e10cSrcweir // abcä<EOF> nicht angezeigt, also lassen wir das in Zukunft.
735cdf0e10cSrcweir // if( '>' != cBreak )
736cdf0e10cSrcweir // eState = SVPAR_ACCEPTED;
737cdf0e10cSrcweir bWeiter = sal_False;
738cdf0e10cSrcweir }
739cdf0e10cSrcweir else
740cdf0e10cSrcweir {
741cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
742cdf0e10cSrcweir }
743cdf0e10cSrcweir break;
744cdf0e10cSrcweir
745cdf0e10cSrcweir case '<':
746cdf0e10cSrcweir bEqSignFound = sal_False;
747cdf0e10cSrcweir if( '>'==cBreak )
748cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
749cdf0e10cSrcweir else
750cdf0e10cSrcweir bWeiter = sal_False; // Abbrechen, String zusammen
751cdf0e10cSrcweir break;
752cdf0e10cSrcweir
753cdf0e10cSrcweir case '\f':
754cdf0e10cSrcweir if( '>' == cBreak )
755cdf0e10cSrcweir {
756cdf0e10cSrcweir // Beim Scannen von Optionen wie ein Space behandeln
757cdf0e10cSrcweir sTmpBuffer.append( (sal_Unicode)' ' );
758cdf0e10cSrcweir }
759cdf0e10cSrcweir else
760cdf0e10cSrcweir {
761cdf0e10cSrcweir // sonst wird es ein eigenes Token
762cdf0e10cSrcweir bWeiter = sal_False;
763cdf0e10cSrcweir }
764cdf0e10cSrcweir break;
765cdf0e10cSrcweir
766cdf0e10cSrcweir case '\r':
767cdf0e10cSrcweir case '\n':
768cdf0e10cSrcweir if( '>'==cBreak )
769cdf0e10cSrcweir {
770cdf0e10cSrcweir // #26979# cr/lf in Tag wird in _GetNextToken() behandeln
771cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
772cdf0e10cSrcweir break;
773cdf0e10cSrcweir }
774cdf0e10cSrcweir else if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
775cdf0e10cSrcweir {
776cdf0e10cSrcweir bWeiter = sal_False;
777cdf0e10cSrcweir break;
778cdf0e10cSrcweir }
779cdf0e10cSrcweir // Bug 18984: CR-LF -> Blank
780cdf0e10cSrcweir // Folge von CR/LF/BLANK/TAB nur in ein Blank wandeln
781cdf0e10cSrcweir // kein break!!
782cdf0e10cSrcweir case '\t':
783cdf0e10cSrcweir if( '\t'==nNextCh && bReadPRE && '>'!=cBreak )
784cdf0e10cSrcweir {
785cdf0e10cSrcweir // In <PRE>: Tabs nach oben durchreichen
786cdf0e10cSrcweir bWeiter = sal_False;
787cdf0e10cSrcweir break;
788cdf0e10cSrcweir }
789cdf0e10cSrcweir // kein break
790cdf0e10cSrcweir case '\x0b':
791cdf0e10cSrcweir if( '\x0b'==nNextCh && (bReadPRE || bReadXMP ||bReadListing) &&
792cdf0e10cSrcweir '>'!=cBreak )
793cdf0e10cSrcweir {
794cdf0e10cSrcweir break;
795cdf0e10cSrcweir }
796cdf0e10cSrcweir nNextCh = ' ';
797cdf0e10cSrcweir // kein break;
798cdf0e10cSrcweir case ' ':
799cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
800cdf0e10cSrcweir if( '>'!=cBreak && (!bReadListing && !bReadXMP &&
801cdf0e10cSrcweir !bReadPRE && !bReadTextArea) )
802cdf0e10cSrcweir {
803cdf0e10cSrcweir // alle Folgen von Blanks/Tabs/CR/LF zu einem Blank umwandeln
804cdf0e10cSrcweir do {
805cdf0e10cSrcweir if( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
806cdf0e10cSrcweir rInput.IsEof() )
807cdf0e10cSrcweir {
808cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() > 1L )
809cdf0e10cSrcweir {
810cdf0e10cSrcweir // ausser den Blanks wurde noch etwas geselen
811cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
812cdf0e10cSrcweir return HTML_TEXTTOKEN;
813cdf0e10cSrcweir }
814cdf0e10cSrcweir else
815cdf0e10cSrcweir // nur Blanks gelesen: dann darf kein Text
816cdf0e10cSrcweir // mehr zurueckgegeben werden und _GetNextToken
817cdf0e10cSrcweir // muss auf EOF laufen
818cdf0e10cSrcweir return 0;
819cdf0e10cSrcweir }
820cdf0e10cSrcweir } while ( ' ' == nNextCh || '\t' == nNextCh ||
821cdf0e10cSrcweir '\r' == nNextCh || '\n' == nNextCh ||
822cdf0e10cSrcweir '\x0b' == nNextCh );
823cdf0e10cSrcweir bNextCh = sal_False;
824cdf0e10cSrcweir }
825cdf0e10cSrcweir break;
826cdf0e10cSrcweir
827cdf0e10cSrcweir default:
828cdf0e10cSrcweir bEqSignFound = sal_False;
829cdf0e10cSrcweir if( (nNextCh==cBreak && !cQuote) ||
830cdf0e10cSrcweir (sal_uLong(aToken.Len()) + MAX_LEN) > sal_uLong(STRING_MAXLEN & ~1 ))
831cdf0e10cSrcweir bWeiter = sal_False;
832cdf0e10cSrcweir else
833cdf0e10cSrcweir {
834cdf0e10cSrcweir do {
835cdf0e10cSrcweir // alle anderen Zeichen kommen in den Text
836cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
837cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() )
838cdf0e10cSrcweir {
839cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
840cdf0e10cSrcweir if( (sal_uLong(aToken.Len()) + MAX_LEN) >
841cdf0e10cSrcweir sal_uLong(STRING_MAXLEN & ~1 ) )
842cdf0e10cSrcweir {
843cdf0e10cSrcweir nNextCh = GetNextChar();
844cdf0e10cSrcweir return HTML_TEXTTOKEN;
845cdf0e10cSrcweir }
846cdf0e10cSrcweir }
847cdf0e10cSrcweir if( ( sal_Unicode(EOF) == (nNextCh = GetNextChar()) &&
848cdf0e10cSrcweir rInput.IsEof() ) ||
849cdf0e10cSrcweir !IsParserWorking() )
850cdf0e10cSrcweir {
851cdf0e10cSrcweir if( sTmpBuffer.getLength() )
852cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
853cdf0e10cSrcweir return HTML_TEXTTOKEN;
854cdf0e10cSrcweir }
855cdf0e10cSrcweir } while( HTML_ISALPHA( nNextCh ) || HTML_ISDIGIT( nNextCh ) );
856cdf0e10cSrcweir bNextCh = sal_False;
857cdf0e10cSrcweir }
858cdf0e10cSrcweir }
859cdf0e10cSrcweir
860cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() )
861cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
862cdf0e10cSrcweir
863cdf0e10cSrcweir if( bWeiter && bNextCh )
864cdf0e10cSrcweir nNextCh = GetNextChar();
865cdf0e10cSrcweir }
866cdf0e10cSrcweir
867cdf0e10cSrcweir if( sTmpBuffer.getLength() )
868cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
869cdf0e10cSrcweir
870cdf0e10cSrcweir return HTML_TEXTTOKEN;
871cdf0e10cSrcweir }
872cdf0e10cSrcweir
// Reads the next piece of raw (unparsed) data; _GetNextToken() calls this
// while bReadScript or bReadStyle is set or aEndToken is non-empty.
//
// Characters are collected in a local buffer and appended to aToken. The
// loop ends at a line end (CR, LF or CR LF), at the real end of the input,
// when a terminating tag is recognised, or when the parser stops working.
//
// Returns HTML_RAWDATA by default. Returns 0 when
//  - the terminating tag was already seen by the previous call
//    (bEndTokenFound), or
//  - the terminating tag / end of input is reached and no data has been
//    collected, or
//  - the parser is no longer working when the loop is left.
_GetNextRawToken()873cdf0e10cSrcweir int HTMLParser::_GetNextRawToken()
874cdf0e10cSrcweir {
875cdf0e10cSrcweir ::rtl::OUStringBuffer sTmpBuffer( MAX_LEN );
876cdf0e10cSrcweir
877cdf0e10cSrcweir if( bEndTokenFound )
878cdf0e10cSrcweir {
879cdf0e10cSrcweir // the end token was already found on the previous call,
880cdf0e10cSrcweir // so we do not have to search for it again
881cdf0e10cSrcweir bReadScript = sal_False;
882cdf0e10cSrcweir bReadStyle = sal_False;
883cdf0e10cSrcweir aEndToken.Erase();
884cdf0e10cSrcweir bEndTokenFound = sal_False;
885cdf0e10cSrcweir
886cdf0e10cSrcweir return 0;
887cdf0e10cSrcweir }
888cdf0e10cSrcweir
889cdf0e10cSrcweir // by default we return HTML_RAWDATA
890cdf0e10cSrcweir int bWeiter = sal_True;
891cdf0e10cSrcweir int nToken = HTML_RAWDATA;
892cdf0e10cSrcweir SaveState( 0 );
893cdf0e10cSrcweir while( bWeiter && IsParserWorking() )
894cdf0e10cSrcweir {
// bNextCh: fetch a fresh character at the bottom of the loop; cleared by
// branches that have already read ahead into nNextCh
895cdf0e10cSrcweir int bNextCh = sal_True;
896cdf0e10cSrcweir switch( nNextCh )
897cdf0e10cSrcweir {
898cdf0e10cSrcweir case '<':
899cdf0e10cSrcweir {
900cdf0e10cSrcweir // maybe we have reached the end
901cdf0e10cSrcweir
902cdf0e10cSrcweir // first save what has been read so far
903cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
904cdf0e10cSrcweir
905cdf0e10cSrcweir // and remember the position in the stream
906cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell();
907cdf0e10cSrcweir sal_uLong nLineNr = GetLineNr();
908cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos();
909cdf0e10cSrcweir
910cdf0e10cSrcweir // start of an end token?
911cdf0e10cSrcweir int bOffState = sal_False;
912cdf0e10cSrcweir if( '/' == (nNextCh = GetNextChar()) )
913cdf0e10cSrcweir {
914cdf0e10cSrcweir bOffState = sal_True;
915cdf0e10cSrcweir nNextCh = GetNextChar();
916cdf0e10cSrcweir }
917cdf0e10cSrcweir else if( '!' == nNextCh )
918cdf0e10cSrcweir {
919cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
920cdf0e10cSrcweir nNextCh = GetNextChar();
921cdf0e10cSrcweir }
922cdf0e10cSrcweir
923cdf0e10cSrcweir // now read the letters that follow
// ('-' is accepted as well; reading stops once the buffer holds MAX_LEN
// characters)
924cdf0e10cSrcweir while( (HTML_ISALPHA(nNextCh) || '-'==nNextCh) &&
925cdf0e10cSrcweir IsParserWorking() && sTmpBuffer.getLength() < MAX_LEN )
926cdf0e10cSrcweir {
927cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
928cdf0e10cSrcweir nNextCh = GetNextChar();
929cdf0e10cSrcweir }
930cdf0e10cSrcweir
// upper-cased copy of the candidate tag name; sTmpBuffer itself is left
// untouched
931cdf0e10cSrcweir String aTok( sTmpBuffer.getStr(),
932cdf0e10cSrcweir sal::static_int_cast< xub_StrLen >(
933cdf0e10cSrcweir sTmpBuffer.getLength()) );
934cdf0e10cSrcweir aTok.ToUpperAscii();
935cdf0e10cSrcweir sal_Bool bDone = sal_False;
936cdf0e10cSrcweir if( bReadScript || aEndToken.Len() )
937cdf0e10cSrcweir {
938cdf0e10cSrcweir if( !bReadComment )
939cdf0e10cSrcweir {
// NOTE(review): the second argument presumably limits the comparison to
// the first 3 characters - confirm against String::CompareToAscii
940cdf0e10cSrcweir if( aTok.CompareToAscii( OOO_STRING_SVTOOLS_HTML_comment, 3 )
941cdf0e10cSrcweir == COMPARE_EQUAL )
942cdf0e10cSrcweir {
943cdf0e10cSrcweir bReadComment = sal_True;
944cdf0e10cSrcweir }
945cdf0e10cSrcweir else
946cdf0e10cSrcweir {
947cdf0e10cSrcweir // a script has to end with "</SCRIPT>", although
948cdf0e10cSrcweir // for safety reasons we are not that strict about
949cdf0e10cSrcweir // the ">" for now
950cdf0e10cSrcweir bDone = bOffState && // '>'==nNextCh &&
951cdf0e10cSrcweir COMPARE_EQUAL == ( bReadScript
952cdf0e10cSrcweir ? aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_script)
953cdf0e10cSrcweir : aTok.CompareTo(aEndToken) );
954cdf0e10cSrcweir }
955cdf0e10cSrcweir }
956cdf0e10cSrcweir if( bReadComment && '>'==nNextCh && aTok.Len() >= 2 &&
957cdf0e10cSrcweir aTok.Copy( aTok.Len()-2 ).EqualsAscii( "--" ) )
958cdf0e10cSrcweir {
959cdf0e10cSrcweir // a comment of the form <!-----> ends here
960cdf0e10cSrcweir bReadComment = sal_False;
961cdf0e10cSrcweir }
962cdf0e10cSrcweir }
963cdf0e10cSrcweir else
964cdf0e10cSrcweir {
965cdf0e10cSrcweir // a style sheet can end with </STYLE>, </HEAD> or
966cdf0e10cSrcweir // <BODY>
967cdf0e10cSrcweir if( bOffState )
968cdf0e10cSrcweir bDone = aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_style)
969cdf0e10cSrcweir == COMPARE_EQUAL ||
970cdf0e10cSrcweir aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_head)
971cdf0e10cSrcweir == COMPARE_EQUAL;
972cdf0e10cSrcweir else
973cdf0e10cSrcweir bDone =
974cdf0e10cSrcweir aTok.CompareToAscii(OOO_STRING_SVTOOLS_HTML_body) == COMPARE_EQUAL;
975cdf0e10cSrcweir }
976cdf0e10cSrcweir
977cdf0e10cSrcweir if( bDone )
978cdf0e10cSrcweir {
979cdf0e10cSrcweir // that's it; now we may have to return the string read
980cdf0e10cSrcweir // so far and then continue normally
981cdf0e10cSrcweir
982cdf0e10cSrcweir
983cdf0e10cSrcweir bWeiter = sal_False;
984cdf0e10cSrcweir
985cdf0e10cSrcweir // nToken==0 means that _GetNextToken continues reading immediately
986cdf0e10cSrcweir if( !aToken.Len() && (bReadStyle || bReadScript) )
987cdf0e10cSrcweir {
988cdf0e10cSrcweir // we can leave the environment right away and
989cdf0e10cSrcweir // parse the end token
990cdf0e10cSrcweir bReadScript = sal_False;
991cdf0e10cSrcweir bReadStyle = sal_False;
992cdf0e10cSrcweir aEndToken.Erase();
993cdf0e10cSrcweir nToken = 0;
994cdf0e10cSrcweir }
995cdf0e10cSrcweir else
996cdf0e10cSrcweir {
997cdf0e10cSrcweir // we have to keep bReadScript/bReadStyle alive for
998cdf0e10cSrcweir // now and can parse the end token only on the next
999cdf0e10cSrcweir // call
1000cdf0e10cSrcweir bEndTokenFound = sal_True;
1001cdf0e10cSrcweir }
1002cdf0e10cSrcweir
1003cdf0e10cSrcweir // now move back in the stream to the '<'
1004cdf0e10cSrcweir rInput.Seek( nStreamPos );
1005cdf0e10cSrcweir SetLineNr( nLineNr );
1006cdf0e10cSrcweir SetLinePos( nLinePos );
1007cdf0e10cSrcweir ClearTxtConvContext();
1008cdf0e10cSrcweir nNextCh = '<';
1009cdf0e10cSrcweir
1010cdf0e10cSrcweir // we do not want to append the string to the token
1011cdf0e10cSrcweir sTmpBuffer.setLength( 0L );
1012cdf0e10cSrcweir }
1013cdf0e10cSrcweir else
1014cdf0e10cSrcweir {
1015cdf0e10cSrcweir // remember the "</"; everything else is still in the buffer
1016cdf0e10cSrcweir aToken += (sal_Unicode)'<';
1017cdf0e10cSrcweir if( bOffState )
1018cdf0e10cSrcweir aToken += (sal_Unicode)'/';
1019cdf0e10cSrcweir
// nNextCh already holds the first character after the tag name
1020cdf0e10cSrcweir bNextCh = sal_False;
1021cdf0e10cSrcweir }
1022cdf0e10cSrcweir }
1023cdf0e10cSrcweir break;
1024cdf0e10cSrcweir case '-':
1025cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
1026cdf0e10cSrcweir if( bReadComment )
1027cdf0e10cSrcweir {
// inside a comment: consume the run of '-' that follows; a '>' directly
// after at least two '-' ends the comment
1028cdf0e10cSrcweir sal_Bool bTwoMinus = sal_False;
1029cdf0e10cSrcweir nNextCh = GetNextChar();
1030cdf0e10cSrcweir while( '-' == nNextCh && IsParserWorking() )
1031cdf0e10cSrcweir {
1032cdf0e10cSrcweir bTwoMinus = sal_True;
1033cdf0e10cSrcweir
1034cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() )
1035cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
1036cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
1037cdf0e10cSrcweir nNextCh = GetNextChar();
1038cdf0e10cSrcweir }
1039cdf0e10cSrcweir
1040cdf0e10cSrcweir if( '>' == nNextCh && IsParserWorking() && bTwoMinus )
1041cdf0e10cSrcweir bReadComment = sal_False;
1042cdf0e10cSrcweir
1043cdf0e10cSrcweir bNextCh = sal_False;
1044cdf0e10cSrcweir }
1045cdf0e10cSrcweir break;
1046cdf0e10cSrcweir
1047cdf0e10cSrcweir case '\r':
1048cdf0e10cSrcweir // \r\n? ends the current text token (even if it is empty)
1049cdf0e10cSrcweir nNextCh = GetNextChar();
1050cdf0e10cSrcweir if( nNextCh=='\n' )
1051cdf0e10cSrcweir nNextCh = GetNextChar();
1052cdf0e10cSrcweir bWeiter = sal_False;
1053cdf0e10cSrcweir break;
1054cdf0e10cSrcweir case '\n':
1055cdf0e10cSrcweir // \n ends the current text token (even if it is empty)
1056cdf0e10cSrcweir nNextCh = GetNextChar();
1057cdf0e10cSrcweir bWeiter = sal_False;
1058cdf0e10cSrcweir break;
1059cdf0e10cSrcweir case sal_Unicode(EOF):
1060cdf0e10cSrcweir // eof ends the current text token and pretends that
1061cdf0e10cSrcweir // an end token has been read
1062cdf0e10cSrcweir if( rInput.IsEof() )
1063cdf0e10cSrcweir {
1064cdf0e10cSrcweir bWeiter = sal_False;
1065cdf0e10cSrcweir if( aToken.Len() || sTmpBuffer.getLength() )
1066cdf0e10cSrcweir {
1067cdf0e10cSrcweir bEndTokenFound = sal_True;
1068cdf0e10cSrcweir }
1069cdf0e10cSrcweir else
1070cdf0e10cSrcweir {
1071cdf0e10cSrcweir bReadScript = sal_False;
1072cdf0e10cSrcweir bReadStyle = sal_False;
1073cdf0e10cSrcweir aEndToken.Erase();
1074cdf0e10cSrcweir nToken = 0;
1075cdf0e10cSrcweir }
1076cdf0e10cSrcweir break;
1077cdf0e10cSrcweir }
1078cdf0e10cSrcweir // no break (the stream is not at its end, so the character is
// handled like any other one)
1079cdf0e10cSrcweir default:
1080cdf0e10cSrcweir // all other characters end up in the buffer
1081cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
1082cdf0e10cSrcweir break;
1083cdf0e10cSrcweir }
1084cdf0e10cSrcweir
// move the buffer content into aToken when the loop is about to end or
// the buffer has reached MAX_LEN characters
1085cdf0e10cSrcweir if( (!bWeiter && sTmpBuffer.getLength() > 0L) ||
1086cdf0e10cSrcweir MAX_LEN == sTmpBuffer.getLength() )
1087cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
1088cdf0e10cSrcweir
1089cdf0e10cSrcweir if( bWeiter && bNextCh )
1090cdf0e10cSrcweir nNextCh = GetNextChar();
1091cdf0e10cSrcweir }
1092cdf0e10cSrcweir
// if the parser stopped working inside the loop, no token is reported
1093cdf0e10cSrcweir if( IsParserWorking() )
1094cdf0e10cSrcweir SaveState( 0 );
1095cdf0e10cSrcweir else
1096cdf0e10cSrcweir nToken = 0;
1097cdf0e10cSrcweir
1098cdf0e10cSrcweir return nToken;
1099cdf0e10cSrcweir }
1100cdf0e10cSrcweir
1101cdf0e10cSrcweir // scan the next token
//
// Clears sSaveToken and any options stored in pOptions, then dispatches on
// the current character nNextCh:
//  - while bReadScript, bReadStyle or aEndToken is set, _GetNextRawToken()
//    is tried first and its result is returned if it is non-zero;
//  - '<' starts a tag, a comment, a "<% ... %>" section, or is returned as
//    plain text when what follows cannot be a tag;
//  - line ends, tabs and form feeds yield their own tokens in some modes;
//  - everything else is read by ScanText().
//
// Returns the token found, 0 if the parser is not working on entry, or -1
// if eState is SVPAR_PENDING when the function is left.
_GetNextToken()1102cdf0e10cSrcweir int __EXPORT HTMLParser::_GetNextToken()
1103cdf0e10cSrcweir {
1104cdf0e10cSrcweir int nRet = 0;
1105cdf0e10cSrcweir sSaveToken.Erase();
1106cdf0e10cSrcweir
1107cdf0e10cSrcweir // delete the options
1108cdf0e10cSrcweir if( pOptions->Count() )
1109cdf0e10cSrcweir pOptions->DeleteAndDestroy( 0, pOptions->Count() );
1110cdf0e10cSrcweir
1111cdf0e10cSrcweir if( !IsParserWorking() ) // if there is already an error, do not continue!
1112cdf0e10cSrcweir return 0;
1113cdf0e10cSrcweir
// remembered so that bReadNextChar can be restored when the state turns
// to SVPAR_PENDING while a tag is being read
1114cdf0e10cSrcweir sal_Bool bReadNextCharSave = bReadNextChar;
1115cdf0e10cSrcweir if( bReadNextChar )
1116cdf0e10cSrcweir {
1117cdf0e10cSrcweir DBG_ASSERT( !bEndTokenFound,
1118cdf0e10cSrcweir "</SCRIPT> gelesen und trotzdem noch ein Zeichen lesen?" );
1119cdf0e10cSrcweir nNextCh = GetNextChar();
1120cdf0e10cSrcweir if( !IsParserWorking() ) // if there is already an error, do not continue!
1121cdf0e10cSrcweir return 0;
1122cdf0e10cSrcweir bReadNextChar = sal_False;
1123cdf0e10cSrcweir }
1124cdf0e10cSrcweir
1125cdf0e10cSrcweir if( bReadScript || bReadStyle || aEndToken.Len() )
1126cdf0e10cSrcweir {
1127cdf0e10cSrcweir nRet = _GetNextRawToken();
1128cdf0e10cSrcweir if( nRet || !IsParserWorking() )
1129cdf0e10cSrcweir return nRet;
1130cdf0e10cSrcweir }
1131cdf0e10cSrcweir
1132cdf0e10cSrcweir do {
// bNextCh: read a fresh character at the bottom of the loop
1133cdf0e10cSrcweir int bNextCh = sal_True;
1134cdf0e10cSrcweir switch( nNextCh )
1135cdf0e10cSrcweir {
1136cdf0e10cSrcweir case '<':
1137cdf0e10cSrcweir {
// position just behind the '<', used to rewind if no closing '>' is found
1138cdf0e10cSrcweir sal_uLong nStreamPos = rInput.Tell();
1139cdf0e10cSrcweir sal_uLong nLineNr = GetLineNr();
1140cdf0e10cSrcweir sal_uLong nLinePos = GetLinePos();
1141cdf0e10cSrcweir
1142cdf0e10cSrcweir int bOffState = sal_False;
1143cdf0e10cSrcweir if( '/' == (nNextCh = GetNextChar()) )
1144cdf0e10cSrcweir {
1145cdf0e10cSrcweir bOffState = sal_True;
1146cdf0e10cSrcweir nNextCh = GetNextChar();
1147cdf0e10cSrcweir }
1148cdf0e10cSrcweir if( HTML_ISALPHA( nNextCh ) || '!'==nNextCh ) // fix #26984#
1149cdf0e10cSrcweir {
// read the tag name up to '>' or white space into aToken
1150cdf0e10cSrcweir ::rtl::OUStringBuffer sTmpBuffer;
1151cdf0e10cSrcweir do {
1152cdf0e10cSrcweir sTmpBuffer.append( nNextCh );
1153cdf0e10cSrcweir if( MAX_LEN == sTmpBuffer.getLength() )
1154cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
1155cdf0e10cSrcweir nNextCh = GetNextChar();
1156cdf0e10cSrcweir } while( '>' != nNextCh && !HTML_ISSPACE( nNextCh ) &&
1157cdf0e10cSrcweir IsParserWorking() && !rInput.IsEof() );
1158cdf0e10cSrcweir
1159cdf0e10cSrcweir if( sTmpBuffer.getLength() )
1160cdf0e10cSrcweir aToken += String(sTmpBuffer.makeStringAndClear());
1161cdf0e10cSrcweir
1162cdf0e10cSrcweir // skip blanks
1163cdf0e10cSrcweir while( HTML_ISSPACE( nNextCh ) && IsParserWorking() )
1164cdf0e10cSrcweir nNextCh = GetNextChar();
1165cdf0e10cSrcweir
1166cdf0e10cSrcweir if( !IsParserWorking() )
1167cdf0e10cSrcweir {
1168cdf0e10cSrcweir if( SVPAR_PENDING == eState )
1169cdf0e10cSrcweir bReadNextChar = bReadNextCharSave;
1170cdf0e10cSrcweir break;
1171cdf0e10cSrcweir }
1172cdf0e10cSrcweir
1173cdf0e10cSrcweir // look up the token in the table:
1174cdf0e10cSrcweir sSaveToken = aToken;
1175cdf0e10cSrcweir aToken.ToUpperAscii();
1176cdf0e10cSrcweir if( 0 == (nRet = GetHTMLToken( aToken )) )
1177cdf0e10cSrcweir // Unknown Control
1178cdf0e10cSrcweir nRet = HTML_UNKNOWNCONTROL_ON;
1179cdf0e10cSrcweir
1180cdf0e10cSrcweir // If it is a token for switching off ...
1181cdf0e10cSrcweir if( bOffState )
1182cdf0e10cSrcweir {
1183cdf0e10cSrcweir if( HTML_TOKEN_ONOFF & nRet )
1184cdf0e10cSrcweir {
1185cdf0e10cSrcweir // and an off token exists, turn it into that
1186cdf0e10cSrcweir ++nRet;
1187cdf0e10cSrcweir }
1188cdf0e10cSrcweir else if( HTML_LINEBREAK!=nRet )
1189cdf0e10cSrcweir {
1190cdf0e10cSrcweir // and no off token exists, turn it into an unknown
1191cdf0e10cSrcweir // token (except </BR>, which is treated
1192cdf0e10cSrcweir // like <BR>)
1193cdf0e10cSrcweir nRet = HTML_UNKNOWNCONTROL_OFF;
1194cdf0e10cSrcweir }
1195cdf0e10cSrcweir }
1196cdf0e10cSrcweir
1197cdf0e10cSrcweir if( nRet == HTML_COMMENT )
1198cdf0e10cSrcweir {
1199cdf0e10cSrcweir // fix: use sSaveToken as the start of the comment
1200cdf0e10cSrcweir // because of upper/lower case and append a
1201cdf0e10cSrcweir // space.
1202cdf0e10cSrcweir aToken = sSaveToken;
1203cdf0e10cSrcweir if( '>'!=nNextCh )
1204cdf0e10cSrcweir aToken += (sal_Unicode)' ';
// stream position, line position and token length at the first '>' seen
// inside the comment; nCStreamPos == 0 means that none has been seen yet
1205cdf0e10cSrcweir sal_uLong nCStreamPos = 0;
1206cdf0e10cSrcweir sal_uLong nCLineNr = 0;
1207cdf0e10cSrcweir sal_uLong nCLinePos = 0;
1208cdf0e10cSrcweir xub_StrLen nCStrLen = 0;
1209cdf0e10cSrcweir
1210cdf0e10cSrcweir sal_Bool bDone = sal_False;
1211cdf0e10cSrcweir // read up to the closing -->. if none was found,
1212cdf0e10cSrcweir // resume at the first >
1213cdf0e10cSrcweir while( !bDone && !rInput.IsEof() && IsParserWorking() )
1214cdf0e10cSrcweir {
1215cdf0e10cSrcweir if( '>'==nNextCh )
1216cdf0e10cSrcweir {
1217cdf0e10cSrcweir if( !nCStreamPos )
1218cdf0e10cSrcweir {
1219cdf0e10cSrcweir nCStreamPos = rInput.Tell();
1220cdf0e10cSrcweir nCStrLen = aToken.Len();
1221cdf0e10cSrcweir nCLineNr = GetLineNr();
1222cdf0e10cSrcweir nCLinePos = GetLinePos();
1223cdf0e10cSrcweir }
1224cdf0e10cSrcweir bDone = aToken.Len() >= 2 &&
1225cdf0e10cSrcweir aToken.Copy(aToken.Len()-2,2).
1226cdf0e10cSrcweir EqualsAscii( "--" );
1227cdf0e10cSrcweir if( !bDone )
1228cdf0e10cSrcweir aToken += nNextCh;
1229cdf0e10cSrcweir }
1230cdf0e10cSrcweir else
1231cdf0e10cSrcweir aToken += nNextCh;
1232cdf0e10cSrcweir if( !bDone )
1233cdf0e10cSrcweir nNextCh = GetNextChar();
1234cdf0e10cSrcweir }
1235cdf0e10cSrcweir if( !bDone && IsParserWorking() && nCStreamPos )
1236cdf0e10cSrcweir {
1237cdf0e10cSrcweir rInput.Seek( nCStreamPos );
1238cdf0e10cSrcweir SetLineNr( nCLineNr );
1239cdf0e10cSrcweir SetLinePos( nCLinePos );
1240cdf0e10cSrcweir ClearTxtConvContext();
1241cdf0e10cSrcweir aToken.Erase( nCStrLen );
1242cdf0e10cSrcweir nNextCh = '>';
1243cdf0e10cSrcweir }
1244cdf0e10cSrcweir }
1245cdf0e10cSrcweir else
1246cdf0e10cSrcweir {
1247cdf0e10cSrcweir // we can discard the token string now
1248cdf0e10cSrcweir aToken.Erase();
1249cdf0e10cSrcweir }
1250cdf0e10cSrcweir
1251cdf0e10cSrcweir // then read everything up to the closing '>'
1252cdf0e10cSrcweir if( '>' != nNextCh && IsParserWorking() )
1253cdf0e10cSrcweir {
1254cdf0e10cSrcweir ScanText( '>' );
1255cdf0e10cSrcweir if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1256cdf0e10cSrcweir {
1257cdf0e10cSrcweir // go back to just behind the < and resume
1258cdf0e10cSrcweir // there, return the < as text
1259cdf0e10cSrcweir rInput.Seek( nStreamPos );
1260cdf0e10cSrcweir SetLineNr( nLineNr );
1261cdf0e10cSrcweir SetLinePos( nLinePos );
1262cdf0e10cSrcweir ClearTxtConvContext();
1263cdf0e10cSrcweir
1264cdf0e10cSrcweir aToken = '<';
1265cdf0e10cSrcweir nRet = HTML_TEXTTOKEN;
1266cdf0e10cSrcweir nNextCh = GetNextChar();
1267cdf0e10cSrcweir bNextCh = sal_False;
1268cdf0e10cSrcweir break;
1269cdf0e10cSrcweir }
1270cdf0e10cSrcweir }
1271cdf0e10cSrcweir if( SVPAR_PENDING == eState )
1272cdf0e10cSrcweir bReadNextChar = bReadNextCharSave;
1273cdf0e10cSrcweir }
1274cdf0e10cSrcweir else
1275cdf0e10cSrcweir {
1276cdf0e10cSrcweir if( bOffState )
1277cdf0e10cSrcweir {
1278cdf0e10cSrcweir // simply throw everything away
1279cdf0e10cSrcweir ScanText( '>' );
1280cdf0e10cSrcweir if( sal_Unicode(EOF) == nNextCh && rInput.IsEof() )
1281cdf0e10cSrcweir {
1282cdf0e10cSrcweir // go back to just behind the < and resume
1283cdf0e10cSrcweir // there, return the < as text
1284cdf0e10cSrcweir rInput.Seek( nStreamPos );
1285cdf0e10cSrcweir SetLineNr( nLineNr );
1286cdf0e10cSrcweir SetLinePos( nLinePos );
1287cdf0e10cSrcweir ClearTxtConvContext();
1288cdf0e10cSrcweir
1289cdf0e10cSrcweir aToken = '<';
1290cdf0e10cSrcweir nRet = HTML_TEXTTOKEN;
1291cdf0e10cSrcweir nNextCh = GetNextChar();
1292cdf0e10cSrcweir bNextCh = sal_False;
1293cdf0e10cSrcweir break;
1294cdf0e10cSrcweir }
1295cdf0e10cSrcweir if( SVPAR_PENDING == eState )
1296cdf0e10cSrcweir bReadNextChar = bReadNextCharSave;
1297cdf0e10cSrcweir aToken.Erase();
1298cdf0e10cSrcweir }
1299cdf0e10cSrcweir else if( '%' == nNextCh )
1300cdf0e10cSrcweir {
1301cdf0e10cSrcweir nRet = HTML_UNKNOWNCONTROL_ON;
1302cdf0e10cSrcweir
1303cdf0e10cSrcweir sal_uLong nCStreamPos = rInput.Tell();
1304cdf0e10cSrcweir sal_uLong nCLineNr = GetLineNr(), nCLinePos = GetLinePos();
1305cdf0e10cSrcweir
1306cdf0e10cSrcweir sal_Bool bDone = sal_False;
1307cdf0e10cSrcweir // read up to the closing %>. if none was found,
1308cdf0e10cSrcweir // resume at the first >
1309cdf0e10cSrcweir while( !bDone && !rInput.IsEof() && IsParserWorking() )
1310cdf0e10cSrcweir {
1311cdf0e10cSrcweir bDone = '>'==nNextCh && aToken.Len() >= 1 &&
1312cdf0e10cSrcweir '%' == aToken.GetChar( aToken.Len()-1 );
1313cdf0e10cSrcweir if( !bDone )
1314cdf0e10cSrcweir {
1315cdf0e10cSrcweir aToken += nNextCh;
1316cdf0e10cSrcweir nNextCh = GetNextChar();
1317cdf0e10cSrcweir }
1318cdf0e10cSrcweir }
1319cdf0e10cSrcweir if( !bDone && IsParserWorking() )
1320cdf0e10cSrcweir {
// no closing "%>": rewind to the position saved above and return "<%"
// as text
1321cdf0e10cSrcweir rInput.Seek( nCStreamPos );
1322cdf0e10cSrcweir SetLineNr( nCLineNr );
1323cdf0e10cSrcweir SetLinePos( nCLinePos );
1324cdf0e10cSrcweir ClearTxtConvContext();
1325cdf0e10cSrcweir aToken.AssignAscii( "<%", 2 );
1326cdf0e10cSrcweir nRet = HTML_TEXTTOKEN;
1327cdf0e10cSrcweir break;
1328cdf0e10cSrcweir }
1329cdf0e10cSrcweir if( IsParserWorking() )
1330cdf0e10cSrcweir {
1331cdf0e10cSrcweir sSaveToken = aToken;
1332cdf0e10cSrcweir aToken.Erase();
1333cdf0e10cSrcweir }
1334cdf0e10cSrcweir }
1335cdf0e10cSrcweir else
1336cdf0e10cSrcweir {
// the '<' is not followed by anything tag-like: return it as text and
// keep the character already read in nNextCh
1337cdf0e10cSrcweir aToken = '<';
1338cdf0e10cSrcweir nRet = HTML_TEXTTOKEN;
1339cdf0e10cSrcweir bNextCh = sal_False;
1340cdf0e10cSrcweir break;
1341cdf0e10cSrcweir }
1342cdf0e10cSrcweir }
1343cdf0e10cSrcweir
1344cdf0e10cSrcweir if( IsParserWorking() )
1345cdf0e10cSrcweir {
// only read on behind the tag if nNextCh is its closing '>', then update
// the mode flags that depend on the tag just read
1346cdf0e10cSrcweir bNextCh = '>' == nNextCh;
1347cdf0e10cSrcweir switch( nRet )
1348cdf0e10cSrcweir {
1349cdf0e10cSrcweir case HTML_TEXTAREA_ON:
1350cdf0e10cSrcweir bReadTextArea = sal_True;
1351cdf0e10cSrcweir break;
1352cdf0e10cSrcweir case HTML_TEXTAREA_OFF:
1353cdf0e10cSrcweir bReadTextArea = sal_False;
1354cdf0e10cSrcweir break;
1355cdf0e10cSrcweir case HTML_SCRIPT_ON:
1356cdf0e10cSrcweir if( !bReadTextArea )
1357cdf0e10cSrcweir bReadScript = sal_True;
1358cdf0e10cSrcweir break;
1359cdf0e10cSrcweir case HTML_SCRIPT_OFF:
1360cdf0e10cSrcweir if( !bReadTextArea )
1361cdf0e10cSrcweir {
1362cdf0e10cSrcweir bReadScript = sal_False;
1363cdf0e10cSrcweir // JavaScript can modify the stream,
1364cdf0e10cSrcweir // so the last character has to be
1365cdf0e10cSrcweir // read again
1366cdf0e10cSrcweir bReadNextChar = sal_True;
1367cdf0e10cSrcweir bNextCh = sal_False;
1368cdf0e10cSrcweir }
1369cdf0e10cSrcweir break;
1370cdf0e10cSrcweir
1371cdf0e10cSrcweir case HTML_STYLE_ON:
1372cdf0e10cSrcweir bReadStyle = sal_True;
1373cdf0e10cSrcweir break;
1374cdf0e10cSrcweir case HTML_STYLE_OFF:
1375cdf0e10cSrcweir bReadStyle = sal_False;
1376cdf0e10cSrcweir break;
1377cdf0e10cSrcweir }
1378cdf0e10cSrcweir
1379cdf0e10cSrcweir }
1380cdf0e10cSrcweir }
1381cdf0e10cSrcweir break;
1382cdf0e10cSrcweir
1383cdf0e10cSrcweir case sal_Unicode(EOF):
1384cdf0e10cSrcweir if( rInput.IsEof() )
1385cdf0e10cSrcweir {
1386cdf0e10cSrcweir eState = SVPAR_ACCEPTED;
1387cdf0e10cSrcweir nRet = nNextCh;
1388cdf0e10cSrcweir }
1389cdf0e10cSrcweir else
1390cdf0e10cSrcweir {
1391cdf0e10cSrcweir // read normal text
1392cdf0e10cSrcweir goto scan_text;
1393cdf0e10cSrcweir }
1394cdf0e10cSrcweir break;
1395cdf0e10cSrcweir
1396cdf0e10cSrcweir case '\f':
1397cdf0e10cSrcweir // form feeds are now passed up separately
1398cdf0e10cSrcweir nRet = HTML_LINEFEEDCHAR; // !!! actually FORMFEEDCHAR
1399cdf0e10cSrcweir break;
1400cdf0e10cSrcweir
1401cdf0e10cSrcweir case '\n':
1402cdf0e10cSrcweir case '\r':
1403cdf0e10cSrcweir if( bReadListing || bReadXMP || bReadPRE || bReadTextArea )
1404cdf0e10cSrcweir {
// a "\n\r" or "\r\n" pair counts as a single line end; any other
// following character is kept in nNextCh for the next round
1405cdf0e10cSrcweir sal_Unicode c = GetNextChar();
1406cdf0e10cSrcweir if( ( '\n' != nNextCh || '\r' != c ) &&
1407cdf0e10cSrcweir ( '\r' != nNextCh || '\n' != c ) )
1408cdf0e10cSrcweir {
1409cdf0e10cSrcweir bNextCh = sal_False;
1410cdf0e10cSrcweir nNextCh = c;
1411cdf0e10cSrcweir }
1412cdf0e10cSrcweir nRet = HTML_NEWPARA;
1413cdf0e10cSrcweir break;
1414cdf0e10cSrcweir }
1415cdf0e10cSrcweir // no break !
1416cdf0e10cSrcweir case '\t':
1417cdf0e10cSrcweir if( bReadPRE )
1418cdf0e10cSrcweir {
1419cdf0e10cSrcweir nRet = HTML_TABCHAR;
1420cdf0e10cSrcweir break;
1421cdf0e10cSrcweir }
1422cdf0e10cSrcweir // no break !
1423cdf0e10cSrcweir case ' ':
1424cdf0e10cSrcweir // no break !
1425cdf0e10cSrcweir default:
1426cdf0e10cSrcweir
1427cdf0e10cSrcweir scan_text:
1428cdf0e10cSrcweir // "normal" text follows
1429cdf0e10cSrcweir nRet = ScanText();
1430cdf0e10cSrcweir bNextCh = 0 == aToken.Len();
1431cdf0e10cSrcweir
1432cdf0e10cSrcweir // the text should still be processed
1433cdf0e10cSrcweir if( !bNextCh && eState == SVPAR_PENDING )
1434cdf0e10cSrcweir {
1435cdf0e10cSrcweir eState = SVPAR_WORKING;
1436cdf0e10cSrcweir bReadNextChar = sal_True;
1437cdf0e10cSrcweir }
1438cdf0e10cSrcweir
1439cdf0e10cSrcweir break;
1440cdf0e10cSrcweir }
1441cdf0e10cSrcweir
1442cdf0e10cSrcweir if( bNextCh && SVPAR_WORKING == eState )
1443cdf0e10cSrcweir {
1444cdf0e10cSrcweir nNextCh = GetNextChar();
// if reading ahead switched the state to pending although a non-text
// token is ready, deliver that token now and read the character again
// on the next call
1445cdf0e10cSrcweir if( SVPAR_PENDING == eState && nRet && HTML_TEXTTOKEN != nRet )
1446cdf0e10cSrcweir {
1447cdf0e10cSrcweir bReadNextChar = sal_True;
1448cdf0e10cSrcweir eState = SVPAR_WORKING;
1449cdf0e10cSrcweir }
1450cdf0e10cSrcweir }
1451cdf0e10cSrcweir
1452cdf0e10cSrcweir } while( !nRet && SVPAR_WORKING == eState );
1453cdf0e10cSrcweir
1454cdf0e10cSrcweir if( SVPAR_PENDING == eState )
1455cdf0e10cSrcweir nRet = -1; // something invalid
1456cdf0e10cSrcweir
1457cdf0e10cSrcweir return nRet;
1458cdf0e10cSrcweir }
1459cdf0e10cSrcweir
UnescapeToken()1460cdf0e10cSrcweir void HTMLParser::UnescapeToken()
1461cdf0e10cSrcweir {
1462cdf0e10cSrcweir xub_StrLen nPos=0;
1463cdf0e10cSrcweir
1464cdf0e10cSrcweir sal_Bool bEscape = sal_False;
1465cdf0e10cSrcweir while( nPos < aToken.Len() )
1466cdf0e10cSrcweir {
1467cdf0e10cSrcweir sal_Bool bOldEscape = bEscape;
1468cdf0e10cSrcweir bEscape = sal_False;
1469cdf0e10cSrcweir if( '\\'==aToken.GetChar(nPos) && !bOldEscape )
1470cdf0e10cSrcweir {
1471cdf0e10cSrcweir aToken.Erase( nPos, 1 );
1472cdf0e10cSrcweir bEscape = sal_True;
1473cdf0e10cSrcweir }
1474cdf0e10cSrcweir else
1475cdf0e10cSrcweir {
1476cdf0e10cSrcweir nPos++;
1477cdf0e10cSrcweir }
1478cdf0e10cSrcweir }
1479cdf0e10cSrcweir }
1480cdf0e10cSrcweir
1481cdf0e10cSrcweir // hole die Optionen
GetOptions(sal_uInt16 * pNoConvertToken) const1482cdf0e10cSrcweir const HTMLOptions *HTMLParser::GetOptions( sal_uInt16 *pNoConvertToken ) const
1483cdf0e10cSrcweir {
1484cdf0e10cSrcweir // wenn die Option fuer das aktuelle Token schon einmal
1485cdf0e10cSrcweir // geholt wurden, geben wir sie noch einmal zurueck
1486cdf0e10cSrcweir if( pOptions->Count() )
1487cdf0e10cSrcweir return pOptions;
1488cdf0e10cSrcweir
1489cdf0e10cSrcweir xub_StrLen nPos = 0;
1490cdf0e10cSrcweir while( nPos < aToken.Len() )
1491cdf0e10cSrcweir {
1492cdf0e10cSrcweir // ein Zeichen ? Dann faengt hier eine Option an
1493cdf0e10cSrcweir if( HTML_ISALPHA( aToken.GetChar(nPos) ) )
1494cdf0e10cSrcweir {
1495cdf0e10cSrcweir int nToken;
1496cdf0e10cSrcweir String aValue;
1497cdf0e10cSrcweir xub_StrLen nStt = nPos;
1498cdf0e10cSrcweir sal_Unicode cChar = 0;
1499cdf0e10cSrcweir
1500cdf0e10cSrcweir // Eigentlich sind hier nur ganz bestimmte Zeichen erlaubt.
1501cdf0e10cSrcweir // Netscape achtet aber nur auf "=" und Leerzeichen (siehe
1502cdf0e10cSrcweir // Mozilla: PA_FetchRequestedNameValues in
1503cdf0e10cSrcweir // lipparse/pa_mdl.c
1504cdf0e10cSrcweir // while( nPos < aToken.Len() &&
1505cdf0e10cSrcweir // ( '-'==(c=aToken[nPos]) || isalnum(c) || '.'==c || '_'==c) )
1506cdf0e10cSrcweir while( nPos < aToken.Len() && '=' != (cChar=aToken.GetChar(nPos)) &&
1507cdf0e10cSrcweir HTML_ISPRINTABLE(cChar) && !HTML_ISSPACE(cChar) )
1508cdf0e10cSrcweir nPos++;
1509cdf0e10cSrcweir
1510cdf0e10cSrcweir String sName( aToken.Copy( nStt, nPos-nStt ) );
1511cdf0e10cSrcweir
1512cdf0e10cSrcweir //JP 23.03.97: die PlugIns wollen die TokenName im "Original" haben
1513cdf0e10cSrcweir // also nur fuers Suchen in UpperCase wandeln
1514cdf0e10cSrcweir String sNameUpperCase( sName );
1515cdf0e10cSrcweir sNameUpperCase.ToUpperAscii();
1516cdf0e10cSrcweir
1517cdf0e10cSrcweir nToken = GetHTMLOption( sNameUpperCase ); // der Name ist fertig
1518cdf0e10cSrcweir DBG_ASSERTWARNING( nToken!=HTML_O_UNKNOWN,
1519cdf0e10cSrcweir "GetOption: unbekannte HTML-Option" );
1520cdf0e10cSrcweir sal_Bool bStripCRLF = (nToken < HTML_OPTION_SCRIPT_START ||
1521cdf0e10cSrcweir nToken >= HTML_OPTION_SCRIPT_END) &&
1522cdf0e10cSrcweir (!pNoConvertToken || nToken != *pNoConvertToken);
1523cdf0e10cSrcweir
1524cdf0e10cSrcweir while( nPos < aToken.Len() &&
1525cdf0e10cSrcweir ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1526cdf0e10cSrcweir HTML_ISSPACE(cChar) ) )
1527cdf0e10cSrcweir nPos++;
1528cdf0e10cSrcweir
1529cdf0e10cSrcweir // hat die Option auch einen Wert?
1530cdf0e10cSrcweir if( nPos!=aToken.Len() && '='==cChar )
1531cdf0e10cSrcweir {
1532cdf0e10cSrcweir nPos++;
1533cdf0e10cSrcweir
1534cdf0e10cSrcweir while( nPos < aToken.Len() &&
1535cdf0e10cSrcweir ( !HTML_ISPRINTABLE( (cChar=aToken.GetChar(nPos)) ) ||
1536cdf0e10cSrcweir ' '==cChar || '\t'==cChar || '\r'==cChar || '\n'==cChar ) )
1537cdf0e10cSrcweir nPos++;
1538cdf0e10cSrcweir
1539cdf0e10cSrcweir if( nPos != aToken.Len() )
1540cdf0e10cSrcweir {
1541cdf0e10cSrcweir xub_StrLen nLen = 0;
1542cdf0e10cSrcweir nStt = nPos;
1543cdf0e10cSrcweir if( ('"'==cChar) || ('\'')==cChar )
1544cdf0e10cSrcweir {
1545cdf0e10cSrcweir sal_Unicode cEnd = cChar;
1546cdf0e10cSrcweir nPos++; nStt++;
1547cdf0e10cSrcweir sal_Bool bDone = sal_False;
1548cdf0e10cSrcweir sal_Bool bEscape = sal_False;
1549cdf0e10cSrcweir while( nPos < aToken.Len() && !bDone )
1550cdf0e10cSrcweir {
1551cdf0e10cSrcweir sal_Bool bOldEscape = bEscape;
1552cdf0e10cSrcweir bEscape = sal_False;
1553cdf0e10cSrcweir cChar = aToken.GetChar(nPos);
1554cdf0e10cSrcweir switch( cChar )
1555cdf0e10cSrcweir {
1556cdf0e10cSrcweir case '\r':
1557cdf0e10cSrcweir case '\n':
1558cdf0e10cSrcweir if( bStripCRLF )
1559cdf0e10cSrcweir ((String &)aToken).Erase( nPos, 1 );
1560cdf0e10cSrcweir else
1561cdf0e10cSrcweir nPos++, nLen++;
1562cdf0e10cSrcweir break;
1563cdf0e10cSrcweir case '\\':
1564cdf0e10cSrcweir if( bOldEscape )
1565cdf0e10cSrcweir {
1566cdf0e10cSrcweir nPos++, nLen++;
1567cdf0e10cSrcweir }
1568cdf0e10cSrcweir else
1569cdf0e10cSrcweir {
1570cdf0e10cSrcweir ((String &)aToken).Erase( nPos, 1 );
1571cdf0e10cSrcweir bEscape = sal_True;
1572cdf0e10cSrcweir }
1573cdf0e10cSrcweir break;
1574cdf0e10cSrcweir case '"':
1575cdf0e10cSrcweir case '\'':
1576cdf0e10cSrcweir bDone = !bOldEscape && cChar==cEnd;
1577cdf0e10cSrcweir if( !bDone )
1578cdf0e10cSrcweir nPos++, nLen++;
1579cdf0e10cSrcweir break;
1580cdf0e10cSrcweir default:
1581cdf0e10cSrcweir nPos++, nLen++;
1582cdf0e10cSrcweir break;
1583cdf0e10cSrcweir }
1584cdf0e10cSrcweir }
1585cdf0e10cSrcweir if( nPos!=aToken.Len() )
1586cdf0e10cSrcweir nPos++;
1587cdf0e10cSrcweir }
1588cdf0e10cSrcweir else
1589cdf0e10cSrcweir {
1590cdf0e10cSrcweir // hier sind wir etwas laxer als der
1591cdf0e10cSrcweir // Standard und erlauben alles druckbare
1592cdf0e10cSrcweir sal_Bool bEscape = sal_False;
1593cdf0e10cSrcweir sal_Bool bDone = sal_False;
1594cdf0e10cSrcweir while( nPos < aToken.Len() && !bDone )
1595cdf0e10cSrcweir {
1596cdf0e10cSrcweir sal_Bool bOldEscape = bEscape;
1597cdf0e10cSrcweir bEscape = sal_False;
1598cdf0e10cSrcweir sal_Unicode c = aToken.GetChar(nPos);
1599cdf0e10cSrcweir switch( c )
1600cdf0e10cSrcweir {
1601cdf0e10cSrcweir case ' ':
1602cdf0e10cSrcweir bDone = !bOldEscape;
1603cdf0e10cSrcweir if( !bDone )
1604cdf0e10cSrcweir nPos++, nLen++;
1605cdf0e10cSrcweir break;
1606cdf0e10cSrcweir
1607cdf0e10cSrcweir case '\t':
1608cdf0e10cSrcweir case '\r':
1609cdf0e10cSrcweir case '\n':
1610cdf0e10cSrcweir bDone = sal_True;
1611cdf0e10cSrcweir break;
1612cdf0e10cSrcweir
1613cdf0e10cSrcweir case '\\':
1614cdf0e10cSrcweir if( bOldEscape )
1615cdf0e10cSrcweir {
1616cdf0e10cSrcweir nPos++, nLen++;
1617cdf0e10cSrcweir }
1618cdf0e10cSrcweir else
1619cdf0e10cSrcweir {
1620cdf0e10cSrcweir ((String &)aToken).Erase( nPos, 1 );
1621cdf0e10cSrcweir bEscape = sal_True;
1622cdf0e10cSrcweir }
1623cdf0e10cSrcweir break;
1624cdf0e10cSrcweir
1625cdf0e10cSrcweir default:
1626cdf0e10cSrcweir if( HTML_ISPRINTABLE( c ) )
1627cdf0e10cSrcweir nPos++, nLen++;
1628cdf0e10cSrcweir else
1629cdf0e10cSrcweir bDone = sal_True;
1630cdf0e10cSrcweir break;
1631cdf0e10cSrcweir }
1632cdf0e10cSrcweir }
1633cdf0e10cSrcweir }
1634cdf0e10cSrcweir
1635cdf0e10cSrcweir if( nLen )
1636cdf0e10cSrcweir aValue = aToken.Copy( nStt, nLen );
1637cdf0e10cSrcweir }
1638cdf0e10cSrcweir }
1639cdf0e10cSrcweir
1640cdf0e10cSrcweir // Wir kennen das Token und koennen es Speichern
1641cdf0e10cSrcweir HTMLOption *pOption =
1642cdf0e10cSrcweir new HTMLOption(
1643cdf0e10cSrcweir sal::static_int_cast< sal_uInt16 >(nToken), sName, aValue );
1644cdf0e10cSrcweir
1645cdf0e10cSrcweir pOptions->Insert( pOption, pOptions->Count() );
1646cdf0e10cSrcweir
1647cdf0e10cSrcweir }
1648cdf0e10cSrcweir else
1649cdf0e10cSrcweir // white space un unerwartete Zeichen ignorieren wie
1650cdf0e10cSrcweir nPos++;
1651cdf0e10cSrcweir }
1652cdf0e10cSrcweir
1653cdf0e10cSrcweir return pOptions;
1654cdf0e10cSrcweir }
1655cdf0e10cSrcweir
FilterPRE(int nToken)1656cdf0e10cSrcweir int HTMLParser::FilterPRE( int nToken )
1657cdf0e10cSrcweir {
1658cdf0e10cSrcweir switch( nToken )
1659cdf0e10cSrcweir {
1660cdf0e10cSrcweir #ifdef HTML_BEHAVIOUR
1661cdf0e10cSrcweir // diese werden laut Definition zu LFs
1662cdf0e10cSrcweir case HTML_PARABREAK_ON:
1663cdf0e10cSrcweir case HTML_LINEBREAK:
1664cdf0e10cSrcweir nToken = HTML_NEWPARA;
1665cdf0e10cSrcweir #else
1666cdf0e10cSrcweir // in Netscape zeigen sie aber nur in nicht-leeren Absaetzen Wirkung
1667cdf0e10cSrcweir case HTML_PARABREAK_ON:
1668cdf0e10cSrcweir nToken = HTML_LINEBREAK;
1669cdf0e10cSrcweir case HTML_LINEBREAK:
1670cdf0e10cSrcweir #endif
1671cdf0e10cSrcweir case HTML_NEWPARA:
1672cdf0e10cSrcweir nPre_LinePos = 0;
1673cdf0e10cSrcweir if( bPre_IgnoreNewPara )
1674cdf0e10cSrcweir nToken = 0;
1675cdf0e10cSrcweir break;
1676cdf0e10cSrcweir
1677cdf0e10cSrcweir case HTML_TABCHAR:
1678cdf0e10cSrcweir {
1679cdf0e10cSrcweir xub_StrLen nSpaces = sal::static_int_cast< xub_StrLen >(
1680cdf0e10cSrcweir 8 - (nPre_LinePos % 8));
1681cdf0e10cSrcweir DBG_ASSERT( !aToken.Len(), "Wieso ist das Token nicht leer?" );
1682cdf0e10cSrcweir aToken.Expand( nSpaces, ' ' );
1683cdf0e10cSrcweir nPre_LinePos += nSpaces;
1684cdf0e10cSrcweir nToken = HTML_TEXTTOKEN;
1685cdf0e10cSrcweir }
1686cdf0e10cSrcweir break;
1687cdf0e10cSrcweir // diese bleiben erhalten
1688cdf0e10cSrcweir case HTML_TEXTTOKEN:
1689cdf0e10cSrcweir nPre_LinePos += aToken.Len();
1690cdf0e10cSrcweir break;
1691cdf0e10cSrcweir
1692cdf0e10cSrcweir case HTML_SELECT_ON:
1693cdf0e10cSrcweir case HTML_SELECT_OFF:
1694cdf0e10cSrcweir case HTML_BODY_ON:
1695cdf0e10cSrcweir case HTML_FORM_ON:
1696cdf0e10cSrcweir case HTML_FORM_OFF:
1697cdf0e10cSrcweir case HTML_INPUT:
1698cdf0e10cSrcweir case HTML_OPTION:
1699cdf0e10cSrcweir case HTML_TEXTAREA_ON:
1700cdf0e10cSrcweir case HTML_TEXTAREA_OFF:
1701cdf0e10cSrcweir
1702cdf0e10cSrcweir case HTML_IMAGE:
1703cdf0e10cSrcweir case HTML_APPLET_ON:
1704cdf0e10cSrcweir case HTML_APPLET_OFF:
1705cdf0e10cSrcweir case HTML_PARAM:
1706cdf0e10cSrcweir case HTML_EMBED:
1707cdf0e10cSrcweir
1708cdf0e10cSrcweir case HTML_HEAD1_ON:
1709cdf0e10cSrcweir case HTML_HEAD1_OFF:
1710cdf0e10cSrcweir case HTML_HEAD2_ON:
1711cdf0e10cSrcweir case HTML_HEAD2_OFF:
1712cdf0e10cSrcweir case HTML_HEAD3_ON:
1713cdf0e10cSrcweir case HTML_HEAD3_OFF:
1714cdf0e10cSrcweir case HTML_HEAD4_ON:
1715cdf0e10cSrcweir case HTML_HEAD4_OFF:
1716cdf0e10cSrcweir case HTML_HEAD5_ON:
1717cdf0e10cSrcweir case HTML_HEAD5_OFF:
1718cdf0e10cSrcweir case HTML_HEAD6_ON:
1719cdf0e10cSrcweir case HTML_HEAD6_OFF:
1720cdf0e10cSrcweir case HTML_BLOCKQUOTE_ON:
1721cdf0e10cSrcweir case HTML_BLOCKQUOTE_OFF:
1722cdf0e10cSrcweir case HTML_ADDRESS_ON:
1723cdf0e10cSrcweir case HTML_ADDRESS_OFF:
1724cdf0e10cSrcweir case HTML_HORZRULE:
1725cdf0e10cSrcweir
1726cdf0e10cSrcweir case HTML_CENTER_ON:
1727cdf0e10cSrcweir case HTML_CENTER_OFF:
1728cdf0e10cSrcweir case HTML_DIVISION_ON:
1729cdf0e10cSrcweir case HTML_DIVISION_OFF:
1730cdf0e10cSrcweir
1731cdf0e10cSrcweir case HTML_SCRIPT_ON:
1732cdf0e10cSrcweir case HTML_SCRIPT_OFF:
1733cdf0e10cSrcweir case HTML_RAWDATA:
1734cdf0e10cSrcweir
1735cdf0e10cSrcweir case HTML_TABLE_ON:
1736cdf0e10cSrcweir case HTML_TABLE_OFF:
1737cdf0e10cSrcweir case HTML_CAPTION_ON:
1738cdf0e10cSrcweir case HTML_CAPTION_OFF:
1739cdf0e10cSrcweir case HTML_COLGROUP_ON:
1740cdf0e10cSrcweir case HTML_COLGROUP_OFF:
1741cdf0e10cSrcweir case HTML_COL_ON:
1742cdf0e10cSrcweir case HTML_COL_OFF:
1743cdf0e10cSrcweir case HTML_THEAD_ON:
1744cdf0e10cSrcweir case HTML_THEAD_OFF:
1745cdf0e10cSrcweir case HTML_TFOOT_ON:
1746cdf0e10cSrcweir case HTML_TFOOT_OFF:
1747cdf0e10cSrcweir case HTML_TBODY_ON:
1748cdf0e10cSrcweir case HTML_TBODY_OFF:
1749cdf0e10cSrcweir case HTML_TABLEROW_ON:
1750cdf0e10cSrcweir case HTML_TABLEROW_OFF:
1751cdf0e10cSrcweir case HTML_TABLEDATA_ON:
1752cdf0e10cSrcweir case HTML_TABLEDATA_OFF:
1753cdf0e10cSrcweir case HTML_TABLEHEADER_ON:
1754cdf0e10cSrcweir case HTML_TABLEHEADER_OFF:
1755cdf0e10cSrcweir
1756cdf0e10cSrcweir case HTML_ANCHOR_ON:
1757cdf0e10cSrcweir case HTML_ANCHOR_OFF:
1758cdf0e10cSrcweir case HTML_BOLD_ON:
1759cdf0e10cSrcweir case HTML_BOLD_OFF:
1760cdf0e10cSrcweir case HTML_ITALIC_ON:
1761cdf0e10cSrcweir case HTML_ITALIC_OFF:
1762cdf0e10cSrcweir case HTML_STRIKE_ON:
1763cdf0e10cSrcweir case HTML_STRIKE_OFF:
1764cdf0e10cSrcweir case HTML_STRIKETHROUGH_ON:
1765cdf0e10cSrcweir case HTML_STRIKETHROUGH_OFF:
1766cdf0e10cSrcweir case HTML_UNDERLINE_ON:
1767cdf0e10cSrcweir case HTML_UNDERLINE_OFF:
1768cdf0e10cSrcweir case HTML_BASEFONT_ON:
1769cdf0e10cSrcweir case HTML_BASEFONT_OFF:
1770cdf0e10cSrcweir case HTML_FONT_ON:
1771cdf0e10cSrcweir case HTML_FONT_OFF:
1772cdf0e10cSrcweir case HTML_BLINK_ON:
1773cdf0e10cSrcweir case HTML_BLINK_OFF:
1774cdf0e10cSrcweir case HTML_SPAN_ON:
1775cdf0e10cSrcweir case HTML_SPAN_OFF:
1776cdf0e10cSrcweir case HTML_SUBSCRIPT_ON:
1777cdf0e10cSrcweir case HTML_SUBSCRIPT_OFF:
1778cdf0e10cSrcweir case HTML_SUPERSCRIPT_ON:
1779cdf0e10cSrcweir case HTML_SUPERSCRIPT_OFF:
1780cdf0e10cSrcweir case HTML_BIGPRINT_ON:
1781cdf0e10cSrcweir case HTML_BIGPRINT_OFF:
1782cdf0e10cSrcweir case HTML_SMALLPRINT_OFF:
1783cdf0e10cSrcweir case HTML_SMALLPRINT_ON:
1784cdf0e10cSrcweir
1785cdf0e10cSrcweir case HTML_EMPHASIS_ON:
1786cdf0e10cSrcweir case HTML_EMPHASIS_OFF:
1787cdf0e10cSrcweir case HTML_CITIATION_ON:
1788cdf0e10cSrcweir case HTML_CITIATION_OFF:
1789cdf0e10cSrcweir case HTML_STRONG_ON:
1790cdf0e10cSrcweir case HTML_STRONG_OFF:
1791cdf0e10cSrcweir case HTML_CODE_ON:
1792cdf0e10cSrcweir case HTML_CODE_OFF:
1793cdf0e10cSrcweir case HTML_SAMPLE_ON:
1794cdf0e10cSrcweir case HTML_SAMPLE_OFF:
1795cdf0e10cSrcweir case HTML_KEYBOARD_ON:
1796cdf0e10cSrcweir case HTML_KEYBOARD_OFF:
1797cdf0e10cSrcweir case HTML_VARIABLE_ON:
1798cdf0e10cSrcweir case HTML_VARIABLE_OFF:
1799cdf0e10cSrcweir case HTML_DEFINSTANCE_ON:
1800cdf0e10cSrcweir case HTML_DEFINSTANCE_OFF:
1801cdf0e10cSrcweir case HTML_SHORTQUOTE_ON:
1802cdf0e10cSrcweir case HTML_SHORTQUOTE_OFF:
1803cdf0e10cSrcweir case HTML_LANGUAGE_ON:
1804cdf0e10cSrcweir case HTML_LANGUAGE_OFF:
1805cdf0e10cSrcweir case HTML_AUTHOR_ON:
1806cdf0e10cSrcweir case HTML_AUTHOR_OFF:
1807cdf0e10cSrcweir case HTML_PERSON_ON:
1808cdf0e10cSrcweir case HTML_PERSON_OFF:
1809cdf0e10cSrcweir case HTML_ACRONYM_ON:
1810cdf0e10cSrcweir case HTML_ACRONYM_OFF:
1811cdf0e10cSrcweir case HTML_ABBREVIATION_ON:
1812cdf0e10cSrcweir case HTML_ABBREVIATION_OFF:
1813cdf0e10cSrcweir case HTML_INSERTEDTEXT_ON:
1814cdf0e10cSrcweir case HTML_INSERTEDTEXT_OFF:
1815cdf0e10cSrcweir case HTML_DELETEDTEXT_ON:
1816cdf0e10cSrcweir case HTML_DELETEDTEXT_OFF:
1817cdf0e10cSrcweir case HTML_TELETYPE_ON:
1818cdf0e10cSrcweir case HTML_TELETYPE_OFF:
1819cdf0e10cSrcweir
1820cdf0e10cSrcweir break;
1821cdf0e10cSrcweir
1822cdf0e10cSrcweir // der Rest wird als unbekanntes Token behandelt
1823cdf0e10cSrcweir default:
1824cdf0e10cSrcweir if( nToken )
1825cdf0e10cSrcweir {
1826cdf0e10cSrcweir nToken =
1827cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1828cdf0e10cSrcweir ? HTML_UNKNOWNCONTROL_OFF
1829cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON );
1830cdf0e10cSrcweir }
1831cdf0e10cSrcweir break;
1832cdf0e10cSrcweir }
1833cdf0e10cSrcweir
1834cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False;
1835cdf0e10cSrcweir
1836cdf0e10cSrcweir return nToken;
1837cdf0e10cSrcweir }
1838cdf0e10cSrcweir
FilterXMP(int nToken)1839cdf0e10cSrcweir int HTMLParser::FilterXMP( int nToken )
1840cdf0e10cSrcweir {
1841cdf0e10cSrcweir switch( nToken )
1842cdf0e10cSrcweir {
1843cdf0e10cSrcweir case HTML_NEWPARA:
1844cdf0e10cSrcweir if( bPre_IgnoreNewPara )
1845cdf0e10cSrcweir nToken = 0;
1846cdf0e10cSrcweir case HTML_TEXTTOKEN:
1847cdf0e10cSrcweir case HTML_NONBREAKSPACE:
1848cdf0e10cSrcweir case HTML_SOFTHYPH:
1849cdf0e10cSrcweir break; // bleiben erhalten
1850cdf0e10cSrcweir
1851cdf0e10cSrcweir default:
1852cdf0e10cSrcweir if( nToken )
1853cdf0e10cSrcweir {
1854cdf0e10cSrcweir if( (HTML_TOKEN_ONOFF & nToken) && (1 & nToken) )
1855cdf0e10cSrcweir {
1856cdf0e10cSrcweir sSaveToken.Insert( '<', 0 );
1857cdf0e10cSrcweir sSaveToken.Insert( '/', 1 );
1858cdf0e10cSrcweir }
1859cdf0e10cSrcweir else
1860cdf0e10cSrcweir sSaveToken.Insert( '<', 0 );
1861cdf0e10cSrcweir if( aToken.Len() )
1862cdf0e10cSrcweir {
1863cdf0e10cSrcweir UnescapeToken();
1864cdf0e10cSrcweir sSaveToken += (sal_Unicode)' ';
1865cdf0e10cSrcweir aToken.Insert( sSaveToken, 0 );
1866cdf0e10cSrcweir }
1867cdf0e10cSrcweir else
1868cdf0e10cSrcweir aToken = sSaveToken;
1869cdf0e10cSrcweir aToken += (sal_Unicode)'>';
1870cdf0e10cSrcweir nToken = HTML_TEXTTOKEN;
1871cdf0e10cSrcweir }
1872cdf0e10cSrcweir break;
1873cdf0e10cSrcweir }
1874cdf0e10cSrcweir
1875cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False;
1876cdf0e10cSrcweir
1877cdf0e10cSrcweir return nToken;
1878cdf0e10cSrcweir }
1879cdf0e10cSrcweir
FilterListing(int nToken)1880cdf0e10cSrcweir int HTMLParser::FilterListing( int nToken )
1881cdf0e10cSrcweir {
1882cdf0e10cSrcweir switch( nToken )
1883cdf0e10cSrcweir {
1884cdf0e10cSrcweir case HTML_NEWPARA:
1885cdf0e10cSrcweir if( bPre_IgnoreNewPara )
1886cdf0e10cSrcweir nToken = 0;
1887cdf0e10cSrcweir case HTML_TEXTTOKEN:
1888cdf0e10cSrcweir case HTML_NONBREAKSPACE:
1889cdf0e10cSrcweir case HTML_SOFTHYPH:
1890cdf0e10cSrcweir break; // bleiben erhalten
1891cdf0e10cSrcweir
1892cdf0e10cSrcweir default:
1893cdf0e10cSrcweir if( nToken )
1894cdf0e10cSrcweir {
1895cdf0e10cSrcweir nToken =
1896cdf0e10cSrcweir ( ((HTML_TOKEN_ONOFF & nToken) && (1 & nToken))
1897cdf0e10cSrcweir ? HTML_UNKNOWNCONTROL_OFF
1898cdf0e10cSrcweir : HTML_UNKNOWNCONTROL_ON );
1899cdf0e10cSrcweir }
1900cdf0e10cSrcweir break;
1901cdf0e10cSrcweir }
1902cdf0e10cSrcweir
1903cdf0e10cSrcweir bPre_IgnoreNewPara = sal_False;
1904cdf0e10cSrcweir
1905cdf0e10cSrcweir return nToken;
1906cdf0e10cSrcweir }
1907cdf0e10cSrcweir
IsHTMLFormat(const sal_Char * pHeader,sal_Bool bSwitchToUCS2,rtl_TextEncoding eEnc)1908cdf0e10cSrcweir FASTBOOL HTMLParser::IsHTMLFormat( const sal_Char* pHeader,
1909cdf0e10cSrcweir sal_Bool bSwitchToUCS2,
1910cdf0e10cSrcweir rtl_TextEncoding eEnc )
1911cdf0e10cSrcweir {
1912cdf0e10cSrcweir // Einer der folgenden regulaeren Ausdrucke muss sich auf den String
1913cdf0e10cSrcweir // anwenden lassen, damit das Dok ein HTML-Dokument ist.
1914cdf0e10cSrcweir //
1915cdf0e10cSrcweir // ^[^<]*<[^ \t]*[> \t]
1916cdf0e10cSrcweir // -------
1917cdf0e10cSrcweir // ^<!
1918cdf0e10cSrcweir //
1919cdf0e10cSrcweir // wobei der unterstrichene Teilausdruck einem HTML-Token
1920cdf0e10cSrcweir // ensprechen muss
1921cdf0e10cSrcweir
1922cdf0e10cSrcweir ByteString sCmp;
1923cdf0e10cSrcweir sal_Bool bUCS2B = sal_False;
1924cdf0e10cSrcweir if( bSwitchToUCS2 )
1925cdf0e10cSrcweir {
1926cdf0e10cSrcweir if( 0xfeU == (sal_uChar)pHeader[0] &&
1927cdf0e10cSrcweir 0xffU == (sal_uChar)pHeader[1] )
1928cdf0e10cSrcweir {
1929cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2;
1930cdf0e10cSrcweir bUCS2B = sal_True;
1931cdf0e10cSrcweir }
1932cdf0e10cSrcweir else if( 0xffU == (sal_uChar)pHeader[0] &&
1933cdf0e10cSrcweir 0xfeU == (sal_uChar)pHeader[1] )
1934cdf0e10cSrcweir {
1935cdf0e10cSrcweir eEnc = RTL_TEXTENCODING_UCS2;
1936cdf0e10cSrcweir }
1937cdf0e10cSrcweir }
1938cdf0e10cSrcweir if
1939cdf0e10cSrcweir (
1940cdf0e10cSrcweir RTL_TEXTENCODING_UCS2 == eEnc &&
1941cdf0e10cSrcweir (
1942cdf0e10cSrcweir (0xfe == (sal_uChar)pHeader[0] && 0xff == (sal_uChar)pHeader[1]) ||
1943cdf0e10cSrcweir (0xff == (sal_uChar)pHeader[0] && 0xfe == (sal_uChar)pHeader[1])
1944cdf0e10cSrcweir )
1945cdf0e10cSrcweir )
1946cdf0e10cSrcweir {
1947cdf0e10cSrcweir if( 0xfe == (sal_uChar)pHeader[0] )
1948cdf0e10cSrcweir bUCS2B = sal_True;
1949cdf0e10cSrcweir
1950cdf0e10cSrcweir xub_StrLen nLen;
1951cdf0e10cSrcweir for( nLen = 2;
1952cdf0e10cSrcweir pHeader[nLen] != 0 || pHeader[nLen+1] != 0;
1953cdf0e10cSrcweir nLen+=2 )
1954cdf0e10cSrcweir ;
1955cdf0e10cSrcweir
1956cdf0e10cSrcweir ::rtl::OStringBuffer sTmp( (nLen - 2)/2 );
1957cdf0e10cSrcweir for( xub_StrLen nPos = 2; nPos < nLen; nPos += 2 )
1958cdf0e10cSrcweir {
1959cdf0e10cSrcweir sal_Unicode cUC;
1960cdf0e10cSrcweir if( bUCS2B )
1961cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos]) << 8) | pHeader[nPos+1];
1962cdf0e10cSrcweir else
1963cdf0e10cSrcweir cUC = (sal_Unicode(pHeader[nPos+1]) << 8) | pHeader[nPos];
1964cdf0e10cSrcweir if( 0U == cUC )
1965cdf0e10cSrcweir break;
1966cdf0e10cSrcweir
1967cdf0e10cSrcweir sTmp.append( cUC < 256U ? (sal_Char)cUC : '.' );
1968cdf0e10cSrcweir }
1969cdf0e10cSrcweir sCmp = ByteString( sTmp.makeStringAndClear() );
1970cdf0e10cSrcweir }
1971cdf0e10cSrcweir else
1972cdf0e10cSrcweir {
1973cdf0e10cSrcweir sCmp = (sal_Char *)pHeader;
1974cdf0e10cSrcweir }
1975cdf0e10cSrcweir
1976cdf0e10cSrcweir sCmp.ToUpperAscii();
1977cdf0e10cSrcweir
1978cdf0e10cSrcweir // Ein HTML-Dokument muss in der ersten Zeile ein '<' besitzen
1979cdf0e10cSrcweir xub_StrLen nStart = sCmp.Search( '<' );
1980cdf0e10cSrcweir if( STRING_NOTFOUND == nStart )
1981cdf0e10cSrcweir return sal_False;
1982cdf0e10cSrcweir nStart++;
1983cdf0e10cSrcweir
1984cdf0e10cSrcweir // danach duerfen beliebige andere Zeichen bis zu einem blank oder
1985cdf0e10cSrcweir // '>' kommen
1986cdf0e10cSrcweir sal_Char c;
1987cdf0e10cSrcweir xub_StrLen nPos;
1988cdf0e10cSrcweir for( nPos = nStart; nPos<sCmp.Len(); nPos++ )
1989cdf0e10cSrcweir {
1990cdf0e10cSrcweir if( '>'==(c=sCmp.GetChar(nPos)) || HTML_ISSPACE(c) )
1991cdf0e10cSrcweir break;
1992cdf0e10cSrcweir }
1993cdf0e10cSrcweir
1994cdf0e10cSrcweir // wenn das Dokeument hinter dem < aufhoert ist es wohl kein HTML
1995cdf0e10cSrcweir if( nPos==nStart )
1996cdf0e10cSrcweir return sal_False;
1997cdf0e10cSrcweir
1998cdf0e10cSrcweir // die Zeichenkette nach dem '<' muss ausserdem ein bekanntes
1999cdf0e10cSrcweir // HTML Token sein. Damit die Ausgabe eines DOS-dir-Befehls nicht
2000cdf0e10cSrcweir // als HTML interpretiert wird, wird ein <DIR> jedoch nicht als HTML
2001cdf0e10cSrcweir // interpretiert.
2002cdf0e10cSrcweir String sTest( sCmp.Copy( nStart, nPos-nStart ), RTL_TEXTENCODING_ASCII_US );
2003cdf0e10cSrcweir int nTok = GetHTMLToken( sTest );
2004cdf0e10cSrcweir if( 0 != nTok && HTML_DIRLIST_ON != nTok )
2005cdf0e10cSrcweir return sal_True;
2006cdf0e10cSrcweir
2007cdf0e10cSrcweir // oder es handelt sich um ein "<!" ganz am Anfang der Datei (fix #27092#)
2008cdf0e10cSrcweir if( nStart == 1 && '!' == sCmp.GetChar( 1 ) )
2009cdf0e10cSrcweir return sal_True;
2010cdf0e10cSrcweir
2011cdf0e10cSrcweir // oder wir finden irgendwo ein <HTML> in den ersten 80 Zeichen
2012cdf0e10cSrcweir nStart = sCmp.Search( OOO_STRING_SVTOOLS_HTML_html );
2013cdf0e10cSrcweir if( nStart!=STRING_NOTFOUND &&
2014cdf0e10cSrcweir nStart>0 && '<'==sCmp.GetChar(nStart-1) &&
2015cdf0e10cSrcweir nStart+4 < sCmp.Len() && '>'==sCmp.GetChar(nStart+4) )
2016cdf0e10cSrcweir return sal_True;
2017cdf0e10cSrcweir
2018cdf0e10cSrcweir // sonst ist es wohl doch eher kein HTML-Dokument
2019cdf0e10cSrcweir return sal_False;
2020cdf0e10cSrcweir }
2021cdf0e10cSrcweir
InternalImgToPrivateURL(String & rURL)2022cdf0e10cSrcweir sal_Bool HTMLParser::InternalImgToPrivateURL( String& rURL )
2023cdf0e10cSrcweir {
2024cdf0e10cSrcweir if( rURL.Len() < 19 || 'i' != rURL.GetChar(0) ||
2025cdf0e10cSrcweir rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher, 9 ) != COMPARE_EQUAL )
2026cdf0e10cSrcweir return sal_False;
2027cdf0e10cSrcweir
2028cdf0e10cSrcweir sal_Bool bFound = sal_False;
2029cdf0e10cSrcweir
2030cdf0e10cSrcweir if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_gopher,16) == COMPARE_EQUAL )
2031cdf0e10cSrcweir {
2032cdf0e10cSrcweir String aName( rURL.Copy(16) );
2033cdf0e10cSrcweir switch( aName.GetChar(0) )
2034cdf0e10cSrcweir {
2035cdf0e10cSrcweir case 'b':
2036cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_binary );
2037cdf0e10cSrcweir break;
2038cdf0e10cSrcweir case 'i':
2039cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_image ) ||
2040cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_index );
2041cdf0e10cSrcweir break;
2042cdf0e10cSrcweir case 'm':
2043cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_menu ) ||
2044cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_movie );
2045cdf0e10cSrcweir break;
2046cdf0e10cSrcweir case 's':
2047cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_sound );
2048cdf0e10cSrcweir break;
2049cdf0e10cSrcweir case 't':
2050cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_telnet ) ||
2051cdf0e10cSrcweir aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_text );
2052cdf0e10cSrcweir break;
2053cdf0e10cSrcweir case 'u':
2054cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_GOPHER_unknown );
2055cdf0e10cSrcweir break;
2056cdf0e10cSrcweir }
2057cdf0e10cSrcweir }
2058cdf0e10cSrcweir else if( rURL.CompareToAscii( OOO_STRING_SVTOOLS_HTML_internal_icon,14) == COMPARE_EQUAL )
2059cdf0e10cSrcweir {
2060cdf0e10cSrcweir String aName( rURL.Copy(14) );
2061cdf0e10cSrcweir switch( aName.GetChar(0) )
2062cdf0e10cSrcweir {
2063cdf0e10cSrcweir case 'b':
2064cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_baddata );
2065cdf0e10cSrcweir break;
2066cdf0e10cSrcweir case 'd':
2067cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_delayed );
2068cdf0e10cSrcweir break;
2069cdf0e10cSrcweir case 'e':
2070cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_embed );
2071cdf0e10cSrcweir break;
2072cdf0e10cSrcweir case 'i':
2073cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_insecure );
2074cdf0e10cSrcweir break;
2075cdf0e10cSrcweir case 'n':
2076cdf0e10cSrcweir bFound = aName.EqualsAscii( OOO_STRING_SVTOOLS_HTML_INT_ICON_notfound );
2077cdf0e10cSrcweir break;
2078cdf0e10cSrcweir }
2079cdf0e10cSrcweir }
2080cdf0e10cSrcweir if( bFound )
2081cdf0e10cSrcweir {
2082cdf0e10cSrcweir String sTmp ( rURL );
2083cdf0e10cSrcweir rURL.AssignAscii( OOO_STRING_SVTOOLS_HTML_private_image );
2084cdf0e10cSrcweir rURL.Append( sTmp );
2085cdf0e10cSrcweir }
2086cdf0e10cSrcweir
2087cdf0e10cSrcweir return bFound;
2088cdf0e10cSrcweir }
2089cdf0e10cSrcweir
2090cdf0e10cSrcweir #ifdef USED
SaveState(int nToken)2091cdf0e10cSrcweir void HTMLParser::SaveState( int nToken )
2092cdf0e10cSrcweir {
2093cdf0e10cSrcweir SvParser::SaveState( nToken );
2094cdf0e10cSrcweir }
2095cdf0e10cSrcweir
RestoreState()2096cdf0e10cSrcweir void HTMLParser::RestoreState()
2097cdf0e10cSrcweir {
2098cdf0e10cSrcweir SvParser::RestoreState();
2099cdf0e10cSrcweir }
2100cdf0e10cSrcweir #endif
2101cdf0e10cSrcweir
2102cdf0e10cSrcweir
/** Kinds of <META> entries distinguished when parsing META options.
 *
 *  The values are looked up by name through aHTMLMetaNameTable and are
 *  evaluated in HTMLParser::ParseMetaOptionsImpl().
 */
enum eHtmlMetas {
	HTML_META_NONE           = 0,	// no known name: candidate for a user-defined property
	HTML_META_AUTHOR         = 1,	// document author
	HTML_META_DESCRIPTION    = 2,	// document description (line ends are converted, not removed)
	HTML_META_KEYWORDS       = 3,	// comma separated keyword list
	HTML_META_REFRESH        = 4,	// reload request
	HTML_META_CLASSIFICATION = 5,	// stored as the document subject
	HTML_META_CREATED        = 6,	// creation date and time
	HTML_META_CHANGEDBY      = 7,	// name of the last modifier
	HTML_META_CHANGED        = 8,	// modification date and time
	HTML_META_GENERATOR      = 9,	// not evaluated by ParseMetaOptionsImpl()
	HTML_META_SDFOOTNOTE     = 10,	// not evaluated by ParseMetaOptionsImpl()
	HTML_META_SDENDNOTE      = 11,	// not evaluated by ParseMetaOptionsImpl()
	HTML_META_CONTENT_TYPE   = 12	// MIME content type, may carry a charset
};
2118cdf0e10cSrcweir
2119cdf0e10cSrcweir // <META NAME=xxx>
2120cdf0e10cSrcweir static HTMLOptionEnum __READONLY_DATA aHTMLMetaNameTable[] =
2121cdf0e10cSrcweir {
2122cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_author, HTML_META_AUTHOR },
2123cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changed, HTML_META_CHANGED },
2124cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_changedby, HTML_META_CHANGEDBY },
2125cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_classification,HTML_META_CLASSIFICATION},
2126cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_content_type, HTML_META_CONTENT_TYPE },
2127cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_created, HTML_META_CREATED },
2128cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_description, HTML_META_DESCRIPTION },
2129cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_keywords, HTML_META_KEYWORDS },
2130cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_generator, HTML_META_GENERATOR },
2131cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_refresh, HTML_META_REFRESH },
2132cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdendnote, HTML_META_SDENDNOTE },
2133cdf0e10cSrcweir { OOO_STRING_SVTOOLS_HTML_META_sdfootnote, HTML_META_SDFOOTNOTE },
2134cdf0e10cSrcweir { 0, 0 }
2135cdf0e10cSrcweir };
2136cdf0e10cSrcweir
2137cdf0e10cSrcweir
AddMetaUserDefined(::rtl::OUString const &)2138cdf0e10cSrcweir void HTMLParser::AddMetaUserDefined( ::rtl::OUString const & )
2139cdf0e10cSrcweir {
2140cdf0e10cSrcweir }
2141cdf0e10cSrcweir
ParseMetaOptionsImpl(const uno::Reference<document::XDocumentProperties> & i_xDocProps,SvKeyValueIterator * i_pHTTPHeader,const HTMLOptions * i_pOptions,rtl_TextEncoding & o_rEnc)2142cdf0e10cSrcweir bool HTMLParser::ParseMetaOptionsImpl(
2143cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2144cdf0e10cSrcweir SvKeyValueIterator *i_pHTTPHeader,
2145cdf0e10cSrcweir const HTMLOptions *i_pOptions,
2146cdf0e10cSrcweir rtl_TextEncoding& o_rEnc )
2147cdf0e10cSrcweir {
2148cdf0e10cSrcweir String aName, aContent;
2149cdf0e10cSrcweir sal_uInt16 nAction = HTML_META_NONE;
2150cdf0e10cSrcweir bool bHTTPEquiv = false, bChanged = false;
2151cdf0e10cSrcweir
2152cdf0e10cSrcweir for ( sal_uInt16 i = i_pOptions->Count(); i; )
2153cdf0e10cSrcweir {
2154cdf0e10cSrcweir const HTMLOption *pOption = (*i_pOptions)[ --i ];
2155cdf0e10cSrcweir switch ( pOption->GetToken() )
2156cdf0e10cSrcweir {
2157cdf0e10cSrcweir case HTML_O_NAME:
2158cdf0e10cSrcweir aName = pOption->GetString();
2159cdf0e10cSrcweir if ( HTML_META_NONE==nAction )
2160cdf0e10cSrcweir {
2161cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable );
2162cdf0e10cSrcweir }
2163cdf0e10cSrcweir break;
2164cdf0e10cSrcweir case HTML_O_HTTPEQUIV:
2165cdf0e10cSrcweir aName = pOption->GetString();
2166cdf0e10cSrcweir pOption->GetEnum( nAction, aHTMLMetaNameTable );
2167cdf0e10cSrcweir bHTTPEquiv = true;
2168cdf0e10cSrcweir break;
2169cdf0e10cSrcweir case HTML_O_CONTENT:
2170cdf0e10cSrcweir aContent = pOption->GetString();
2171cdf0e10cSrcweir break;
2172cdf0e10cSrcweir }
2173cdf0e10cSrcweir }
2174cdf0e10cSrcweir
2175cdf0e10cSrcweir if ( bHTTPEquiv || HTML_META_DESCRIPTION != nAction )
2176cdf0e10cSrcweir {
2177cdf0e10cSrcweir // if it is not a Description, remove CRs and LFs from CONTENT
2178cdf0e10cSrcweir aContent.EraseAllChars( _CR );
2179cdf0e10cSrcweir aContent.EraseAllChars( _LF );
2180cdf0e10cSrcweir }
2181cdf0e10cSrcweir else
2182cdf0e10cSrcweir {
2183cdf0e10cSrcweir // convert line endings for Description
2184cdf0e10cSrcweir aContent.ConvertLineEnd();
2185cdf0e10cSrcweir }
2186cdf0e10cSrcweir
2187cdf0e10cSrcweir
2188cdf0e10cSrcweir if ( bHTTPEquiv && i_pHTTPHeader )
2189cdf0e10cSrcweir {
2190cdf0e10cSrcweir // #57232#: Netscape seems to just ignore a closing ", so we do too
2191cdf0e10cSrcweir if ( aContent.Len() && '"' == aContent.GetChar( aContent.Len()-1 ) )
2192cdf0e10cSrcweir {
2193cdf0e10cSrcweir aContent.Erase( aContent.Len() - 1 );
2194cdf0e10cSrcweir }
2195cdf0e10cSrcweir SvKeyValue aKeyValue( aName, aContent );
2196cdf0e10cSrcweir i_pHTTPHeader->Append( aKeyValue );
2197cdf0e10cSrcweir }
2198cdf0e10cSrcweir
2199cdf0e10cSrcweir switch ( nAction )
2200cdf0e10cSrcweir {
2201cdf0e10cSrcweir case HTML_META_AUTHOR:
2202cdf0e10cSrcweir if (i_xDocProps.is()) {
2203cdf0e10cSrcweir i_xDocProps->setAuthor( aContent );
2204cdf0e10cSrcweir bChanged = true;
2205cdf0e10cSrcweir }
2206cdf0e10cSrcweir break;
2207cdf0e10cSrcweir case HTML_META_DESCRIPTION:
2208cdf0e10cSrcweir if (i_xDocProps.is()) {
2209cdf0e10cSrcweir i_xDocProps->setDescription( aContent );
2210cdf0e10cSrcweir bChanged = true;
2211cdf0e10cSrcweir }
2212cdf0e10cSrcweir break;
2213cdf0e10cSrcweir case HTML_META_KEYWORDS:
2214cdf0e10cSrcweir if (i_xDocProps.is()) {
2215cdf0e10cSrcweir i_xDocProps->setKeywords(
2216cdf0e10cSrcweir ::comphelper::string::convertCommaSeparated(aContent));
2217cdf0e10cSrcweir bChanged = true;
2218cdf0e10cSrcweir }
2219cdf0e10cSrcweir break;
2220cdf0e10cSrcweir case HTML_META_CLASSIFICATION:
2221cdf0e10cSrcweir if (i_xDocProps.is()) {
2222cdf0e10cSrcweir i_xDocProps->setSubject( aContent );
2223cdf0e10cSrcweir bChanged = true;
2224cdf0e10cSrcweir }
2225cdf0e10cSrcweir break;
2226cdf0e10cSrcweir
2227cdf0e10cSrcweir case HTML_META_CHANGEDBY:
2228cdf0e10cSrcweir if (i_xDocProps.is()) {
2229cdf0e10cSrcweir i_xDocProps->setModifiedBy( aContent );
2230cdf0e10cSrcweir }
2231cdf0e10cSrcweir break;
2232cdf0e10cSrcweir
2233cdf0e10cSrcweir case HTML_META_CREATED:
2234cdf0e10cSrcweir case HTML_META_CHANGED:
2235cdf0e10cSrcweir if ( i_xDocProps.is() && aContent.Len() &&
2236cdf0e10cSrcweir aContent.GetTokenCount() == 2 )
2237cdf0e10cSrcweir {
2238cdf0e10cSrcweir Date aDate( (sal_uLong)aContent.GetToken(0).ToInt32() );
2239cdf0e10cSrcweir Time aTime( (sal_uLong)aContent.GetToken(1).ToInt32() );
2240cdf0e10cSrcweir DateTime aDateTime( aDate, aTime );
2241cdf0e10cSrcweir ::util::DateTime uDT(aDateTime.Get100Sec(),
2242cdf0e10cSrcweir aDateTime.GetSec(), aDateTime.GetMin(),
2243cdf0e10cSrcweir aDateTime.GetHour(), aDateTime.GetDay(),
2244cdf0e10cSrcweir aDateTime.GetMonth(), aDateTime.GetYear());
2245cdf0e10cSrcweir if ( HTML_META_CREATED==nAction )
2246cdf0e10cSrcweir i_xDocProps->setCreationDate( uDT );
2247cdf0e10cSrcweir else
2248cdf0e10cSrcweir i_xDocProps->setModificationDate( uDT );
2249cdf0e10cSrcweir bChanged = true;
2250cdf0e10cSrcweir }
2251cdf0e10cSrcweir break;
2252cdf0e10cSrcweir
2253cdf0e10cSrcweir case HTML_META_REFRESH:
2254cdf0e10cSrcweir DBG_ASSERT( !bHTTPEquiv || i_pHTTPHeader,
2255cdf0e10cSrcweir "Reload-URL aufgrund unterlassener MUSS-Aenderung verlorengegangen" );
2256cdf0e10cSrcweir break;
2257cdf0e10cSrcweir
2258cdf0e10cSrcweir case HTML_META_CONTENT_TYPE:
2259cdf0e10cSrcweir if ( aContent.Len() )
2260cdf0e10cSrcweir {
2261cdf0e10cSrcweir o_rEnc = GetEncodingByMIME( aContent );
2262cdf0e10cSrcweir }
2263cdf0e10cSrcweir break;
2264cdf0e10cSrcweir
2265cdf0e10cSrcweir case HTML_META_NONE:
2266cdf0e10cSrcweir if ( !bHTTPEquiv )
2267cdf0e10cSrcweir {
2268cdf0e10cSrcweir if (i_xDocProps.is())
2269cdf0e10cSrcweir {
2270cdf0e10cSrcweir uno::Reference<beans::XPropertyContainer> xUDProps
2271cdf0e10cSrcweir = i_xDocProps->getUserDefinedProperties();
2272cdf0e10cSrcweir try {
2273cdf0e10cSrcweir xUDProps->addProperty(aName,
2274cdf0e10cSrcweir beans::PropertyAttribute::REMOVEABLE,
2275cdf0e10cSrcweir uno::makeAny(::rtl::OUString(aContent)));
2276cdf0e10cSrcweir AddMetaUserDefined(aName);
2277cdf0e10cSrcweir bChanged = true;
2278cdf0e10cSrcweir } catch (uno::Exception &) {
2279cdf0e10cSrcweir // ignore
2280cdf0e10cSrcweir }
2281cdf0e10cSrcweir }
2282cdf0e10cSrcweir }
2283cdf0e10cSrcweir break;
2284cdf0e10cSrcweir default:
2285cdf0e10cSrcweir break;
2286cdf0e10cSrcweir }
2287cdf0e10cSrcweir
2288cdf0e10cSrcweir return bChanged;
2289cdf0e10cSrcweir }
2290cdf0e10cSrcweir
ParseMetaOptions(const uno::Reference<document::XDocumentProperties> & i_xDocProps,SvKeyValueIterator * i_pHeader)2291cdf0e10cSrcweir bool HTMLParser::ParseMetaOptions(
2292cdf0e10cSrcweir const uno::Reference<document::XDocumentProperties> & i_xDocProps,
2293cdf0e10cSrcweir SvKeyValueIterator *i_pHeader )
2294cdf0e10cSrcweir {
2295cdf0e10cSrcweir sal_uInt16 nContentOption = HTML_O_CONTENT;
2296cdf0e10cSrcweir rtl_TextEncoding eEnc = RTL_TEXTENCODING_DONTKNOW;
2297cdf0e10cSrcweir
2298cdf0e10cSrcweir bool bRet = ParseMetaOptionsImpl( i_xDocProps, i_pHeader,
2299cdf0e10cSrcweir GetOptions(&nContentOption),
2300cdf0e10cSrcweir eEnc );
2301cdf0e10cSrcweir
2302cdf0e10cSrcweir // If the encoding is set by a META tag, it may only overwrite the
2303cdf0e10cSrcweir // current encoding if both, the current and the new encoding, are 1-sal_uInt8
2304cdf0e10cSrcweir // encodings. Everything else cannot lead to reasonable results.
2305cdf0e10cSrcweir if (RTL_TEXTENCODING_DONTKNOW != eEnc &&
2306cdf0e10cSrcweir rtl_isOctetTextEncoding( eEnc ) &&
2307cdf0e10cSrcweir rtl_isOctetTextEncoding( GetSrcEncoding() ) )
2308cdf0e10cSrcweir {
2309cdf0e10cSrcweir eEnc = GetExtendedCompatibilityTextEncoding( eEnc ); // #89973#
2310cdf0e10cSrcweir SetSrcEncoding( eEnc );
2311cdf0e10cSrcweir }
2312cdf0e10cSrcweir
2313cdf0e10cSrcweir return bRet;
2314cdf0e10cSrcweir }
2315cdf0e10cSrcweir
GetEncodingByMIME(const String & rMime)2316cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByMIME( const String& rMime )
2317cdf0e10cSrcweir {
2318cdf0e10cSrcweir ByteString sType;
2319cdf0e10cSrcweir ByteString sSubType;
2320cdf0e10cSrcweir INetContentTypeParameterList aParameters;
2321cdf0e10cSrcweir ByteString sMime( rMime, RTL_TEXTENCODING_ASCII_US );
2322cdf0e10cSrcweir if (INetContentTypes::parse(sMime, sType, sSubType, &aParameters))
2323cdf0e10cSrcweir {
2324cdf0e10cSrcweir const INetContentTypeParameter * pCharset
2325cdf0e10cSrcweir = aParameters.find("charset");
2326cdf0e10cSrcweir if (pCharset != 0)
2327cdf0e10cSrcweir {
2328cdf0e10cSrcweir ByteString sValue( pCharset->m_sValue, RTL_TEXTENCODING_ASCII_US );
2329cdf0e10cSrcweir return GetExtendedCompatibilityTextEncoding(
2330cdf0e10cSrcweir rtl_getTextEncodingFromMimeCharset( sValue.GetBuffer() ) );
2331cdf0e10cSrcweir }
2332cdf0e10cSrcweir }
2333cdf0e10cSrcweir return RTL_TEXTENCODING_DONTKNOW;
2334cdf0e10cSrcweir }
2335cdf0e10cSrcweir
GetEncodingByHttpHeader(SvKeyValueIterator * pHTTPHeader)2336cdf0e10cSrcweir rtl_TextEncoding HTMLParser::GetEncodingByHttpHeader( SvKeyValueIterator *pHTTPHeader )
2337cdf0e10cSrcweir {
2338cdf0e10cSrcweir rtl_TextEncoding eRet = RTL_TEXTENCODING_DONTKNOW;
2339cdf0e10cSrcweir if( pHTTPHeader )
2340cdf0e10cSrcweir {
2341cdf0e10cSrcweir SvKeyValue aKV;
2342cdf0e10cSrcweir for( sal_Bool bCont = pHTTPHeader->GetFirst( aKV ); bCont;
2343cdf0e10cSrcweir bCont = pHTTPHeader->GetNext( aKV ) )
2344cdf0e10cSrcweir {
2345cdf0e10cSrcweir if( aKV.GetKey().EqualsIgnoreCaseAscii( OOO_STRING_SVTOOLS_HTML_META_content_type ) )
2346cdf0e10cSrcweir {
2347cdf0e10cSrcweir if( aKV.GetValue().Len() )
2348cdf0e10cSrcweir {
2349cdf0e10cSrcweir eRet = HTMLParser::GetEncodingByMIME( aKV.GetValue() );
2350cdf0e10cSrcweir }
2351cdf0e10cSrcweir }
2352cdf0e10cSrcweir }
2353cdf0e10cSrcweir }
2354cdf0e10cSrcweir return eRet;
2355cdf0e10cSrcweir }
2356cdf0e10cSrcweir
SetEncodingByHTTPHeader(SvKeyValueIterator * pHTTPHeader)2357cdf0e10cSrcweir sal_Bool HTMLParser::SetEncodingByHTTPHeader(
2358cdf0e10cSrcweir SvKeyValueIterator *pHTTPHeader )
2359cdf0e10cSrcweir {
2360cdf0e10cSrcweir sal_Bool bRet = sal_False;
2361cdf0e10cSrcweir rtl_TextEncoding eEnc = HTMLParser::GetEncodingByHttpHeader( pHTTPHeader );
2362cdf0e10cSrcweir if(RTL_TEXTENCODING_DONTKNOW != eEnc)
2363cdf0e10cSrcweir {
2364cdf0e10cSrcweir SetSrcEncoding( eEnc );
2365cdf0e10cSrcweir bRet = sal_True;
2366cdf0e10cSrcweir }
2367cdf0e10cSrcweir return bRet;
2368cdf0e10cSrcweir }
2369cdf0e10cSrcweir
2370cdf0e10cSrcweir
2371