xref: /aoo41x/main/xmlreader/source/xmlreader.cxx (revision b725e8eb)
1*b725e8ebSAndrew Rist /**************************************************************
2*b725e8ebSAndrew Rist  *
3*b725e8ebSAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*b725e8ebSAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*b725e8ebSAndrew Rist  * distributed with this work for additional information
6*b725e8ebSAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*b725e8ebSAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*b725e8ebSAndrew Rist  * "License"); you may not use this file except in compliance
9*b725e8ebSAndrew Rist  * with the License.  You may obtain a copy of the License at
10*b725e8ebSAndrew Rist  *
11*b725e8ebSAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*b725e8ebSAndrew Rist  *
13*b725e8ebSAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*b725e8ebSAndrew Rist  * software distributed under the License is distributed on an
15*b725e8ebSAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*b725e8ebSAndrew Rist  * KIND, either express or implied.  See the License for the
17*b725e8ebSAndrew Rist  * specific language governing permissions and limitations
18*b725e8ebSAndrew Rist  * under the License.
19*b725e8ebSAndrew Rist  *
20*b725e8ebSAndrew Rist  *************************************************************/
21*b725e8ebSAndrew Rist 
22*b725e8ebSAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir #include "precompiled_xmlreader.hxx"
25cdf0e10cSrcweir #include "sal/config.h"
26cdf0e10cSrcweir 
27cdf0e10cSrcweir #include <climits>
28cdf0e10cSrcweir #include <cstddef>
29cdf0e10cSrcweir 
30cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp"
31cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx"
32cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp"
33cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp"
34cdf0e10cSrcweir #include "osl/diagnose.h"
35cdf0e10cSrcweir #include "osl/file.h"
36cdf0e10cSrcweir #include "rtl/string.h"
37cdf0e10cSrcweir #include "rtl/ustring.h"
38cdf0e10cSrcweir #include "rtl/ustring.hxx"
39cdf0e10cSrcweir #include "sal/types.h"
40cdf0e10cSrcweir #include "xmlreader/pad.hxx"
41cdf0e10cSrcweir #include "xmlreader/span.hxx"
42cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx"
43cdf0e10cSrcweir 
44cdf0e10cSrcweir namespace xmlreader {
45cdf0e10cSrcweir 
46cdf0e10cSrcweir namespace {
47cdf0e10cSrcweir 
48cdf0e10cSrcweir namespace css = com::sun::star;
49cdf0e10cSrcweir 
// Tells whether c is one of the four characters treated as white space here:
// tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
61cdf0e10cSrcweir 
62cdf0e10cSrcweir }
63cdf0e10cSrcweir 
XmlReader(rtl::OUString const & fileUrl)64cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl)
65cdf0e10cSrcweir     SAL_THROW((
66cdf0e10cSrcweir         css::container::NoSuchElementException, css::uno::RuntimeException)):
67cdf0e10cSrcweir     fileUrl_(fileUrl)
68cdf0e10cSrcweir {
69cdf0e10cSrcweir     switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
70cdf0e10cSrcweir     {
71cdf0e10cSrcweir     case osl_File_E_None:
72cdf0e10cSrcweir         break;
73cdf0e10cSrcweir     case osl_File_E_NOENT:
74cdf0e10cSrcweir         throw css::container::NoSuchElementException(
75cdf0e10cSrcweir             fileUrl_, css::uno::Reference< css::uno::XInterface >());
76cdf0e10cSrcweir     default:
77cdf0e10cSrcweir         throw css::uno::RuntimeException(
78cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
79cdf0e10cSrcweir              fileUrl_),
80cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
81cdf0e10cSrcweir     }
82cdf0e10cSrcweir     oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
83cdf0e10cSrcweir     if (e == osl_File_E_None) {
84cdf0e10cSrcweir         e = osl_mapFile(
85cdf0e10cSrcweir             fileHandle_, &fileAddress_, fileSize_, 0,
86cdf0e10cSrcweir             osl_File_MapFlag_WillNeed);
87cdf0e10cSrcweir     }
88cdf0e10cSrcweir     if (e != osl_File_E_None) {
89cdf0e10cSrcweir         e = osl_closeFile(fileHandle_);
90cdf0e10cSrcweir         if (e != osl_File_E_None) {
91cdf0e10cSrcweir             OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
92cdf0e10cSrcweir         }
93cdf0e10cSrcweir         throw css::uno::RuntimeException(
94cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
95cdf0e10cSrcweir              fileUrl_),
96cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
97cdf0e10cSrcweir     }
98cdf0e10cSrcweir     namespaceIris_.push_back(
99cdf0e10cSrcweir         Span(
100cdf0e10cSrcweir             RTL_CONSTASCII_STRINGPARAM(
101cdf0e10cSrcweir                 "http://www.w3.org/XML/1998/namespace")));
102cdf0e10cSrcweir     namespaces_.push_back(
103cdf0e10cSrcweir         NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
104cdf0e10cSrcweir     pos_ = static_cast< char * >(fileAddress_);
105cdf0e10cSrcweir     end_ = pos_ + fileSize_;
106cdf0e10cSrcweir     state_ = STATE_CONTENT;
107cdf0e10cSrcweir }
108cdf0e10cSrcweir 
~XmlReader()109cdf0e10cSrcweir XmlReader::~XmlReader() {
110cdf0e10cSrcweir     oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
111cdf0e10cSrcweir     if (e != osl_File_E_None) {
112cdf0e10cSrcweir         OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
113cdf0e10cSrcweir     }
114cdf0e10cSrcweir     e = osl_closeFile(fileHandle_);
115cdf0e10cSrcweir     if (e != osl_File_E_None) {
116cdf0e10cSrcweir         OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
117cdf0e10cSrcweir     }
118cdf0e10cSrcweir }
119cdf0e10cSrcweir 
registerNamespaceIri(Span const & iri)120cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) {
121cdf0e10cSrcweir     int id = toNamespaceId(namespaceIris_.size());
122cdf0e10cSrcweir     namespaceIris_.push_back(iri);
123cdf0e10cSrcweir     if (iri.equals(
124cdf0e10cSrcweir             Span(
125cdf0e10cSrcweir                 RTL_CONSTASCII_STRINGPARAM(
126cdf0e10cSrcweir                     "http://www.w3.org/2001/XMLSchema-instance"))))
127cdf0e10cSrcweir     {
128cdf0e10cSrcweir         // Old user layer .xcu files used the xsi namespace prefix without
129cdf0e10cSrcweir         // declaring a corresponding namespace binding, see issue 77174; reading
130cdf0e10cSrcweir         // those files during migration would fail without this hack that can be
131cdf0e10cSrcweir         // removed once migration is no longer relevant (see
132cdf0e10cSrcweir         // configmgr::Components::parseModificationLayer):
133cdf0e10cSrcweir         namespaces_.push_back(
134cdf0e10cSrcweir             NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
135cdf0e10cSrcweir     }
136cdf0e10cSrcweir     return id;
137cdf0e10cSrcweir }
138cdf0e10cSrcweir 
nextItem(Text reportText,Span * data,int * nsId)139cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
140cdf0e10cSrcweir {
141cdf0e10cSrcweir     switch (state_) {
142cdf0e10cSrcweir     case STATE_CONTENT:
143cdf0e10cSrcweir         switch (reportText) {
144cdf0e10cSrcweir         case TEXT_NONE:
145cdf0e10cSrcweir             return handleSkippedText(data, nsId);
146cdf0e10cSrcweir         case TEXT_RAW:
147cdf0e10cSrcweir             return handleRawText(data);
148cdf0e10cSrcweir         case TEXT_NORMALIZED:
149cdf0e10cSrcweir             return handleNormalizedText(data);
150cdf0e10cSrcweir         }
151cdf0e10cSrcweir     case STATE_START_TAG:
152cdf0e10cSrcweir         return handleStartTag(nsId, data);
153cdf0e10cSrcweir     case STATE_END_TAG:
154cdf0e10cSrcweir         return handleEndTag();
155cdf0e10cSrcweir     case STATE_EMPTY_ELEMENT_TAG:
156cdf0e10cSrcweir         handleElementEnd();
157cdf0e10cSrcweir         return RESULT_END;
158cdf0e10cSrcweir     default: // STATE_DONE
159cdf0e10cSrcweir         return RESULT_DONE;
160cdf0e10cSrcweir     }
161cdf0e10cSrcweir }
162cdf0e10cSrcweir 
nextAttribute(int * nsId,Span * localName)163cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) {
164cdf0e10cSrcweir     OSL_ASSERT(nsId != 0 && localName != 0);
165cdf0e10cSrcweir     if (firstAttribute_) {
166cdf0e10cSrcweir         currentAttribute_ = attributes_.begin();
167cdf0e10cSrcweir         firstAttribute_ = false;
168cdf0e10cSrcweir     } else {
169cdf0e10cSrcweir         ++currentAttribute_;
170cdf0e10cSrcweir     }
171cdf0e10cSrcweir     if (currentAttribute_ == attributes_.end()) {
172cdf0e10cSrcweir         return false;
173cdf0e10cSrcweir     }
174cdf0e10cSrcweir     if (currentAttribute_->nameColon == 0) {
175cdf0e10cSrcweir         *nsId = NAMESPACE_NONE;
176cdf0e10cSrcweir         *localName = Span(
177cdf0e10cSrcweir             currentAttribute_->nameBegin,
178cdf0e10cSrcweir             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
179cdf0e10cSrcweir     } else {
180cdf0e10cSrcweir         *nsId = getNamespaceId(
181cdf0e10cSrcweir             Span(
182cdf0e10cSrcweir                 currentAttribute_->nameBegin,
183cdf0e10cSrcweir                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
184cdf0e10cSrcweir         *localName = Span(
185cdf0e10cSrcweir             currentAttribute_->nameColon + 1,
186cdf0e10cSrcweir             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
187cdf0e10cSrcweir     }
188cdf0e10cSrcweir     return true;
189cdf0e10cSrcweir }
190cdf0e10cSrcweir 
getAttributeValue(bool fullyNormalize)191cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) {
192cdf0e10cSrcweir     return handleAttributeValue(
193cdf0e10cSrcweir         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
194cdf0e10cSrcweir         fullyNormalize);
195cdf0e10cSrcweir }
196cdf0e10cSrcweir 
getNamespaceId(Span const & prefix) const197cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const {
198cdf0e10cSrcweir     for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
199cdf0e10cSrcweir          i != namespaces_.rend(); ++i)
200cdf0e10cSrcweir     {
201cdf0e10cSrcweir         if (prefix.equals(i->prefix)) {
202cdf0e10cSrcweir             return i->nsId;
203cdf0e10cSrcweir         }
204cdf0e10cSrcweir     }
205cdf0e10cSrcweir     return NAMESPACE_UNKNOWN;
206cdf0e10cSrcweir }
207cdf0e10cSrcweir 
getUrl() const208cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const {
209cdf0e10cSrcweir     return fileUrl_;
210cdf0e10cSrcweir }
211cdf0e10cSrcweir 
normalizeLineEnds(Span const & text)212cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) {
213cdf0e10cSrcweir     char const * p = text.begin;
214cdf0e10cSrcweir     sal_Int32 n = text.length;
215cdf0e10cSrcweir     for (;;) {
216cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
217cdf0e10cSrcweir         if (i < 0) {
218cdf0e10cSrcweir             break;
219cdf0e10cSrcweir         }
220cdf0e10cSrcweir         pad_.add(p, i);
221cdf0e10cSrcweir         p += i + 1;
222cdf0e10cSrcweir         n -= i + 1;
223cdf0e10cSrcweir         if (n == 0 || *p != '\x0A') {
224cdf0e10cSrcweir             pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
225cdf0e10cSrcweir         }
226cdf0e10cSrcweir     }
227cdf0e10cSrcweir     pad_.add(p, n);
228cdf0e10cSrcweir }
229cdf0e10cSrcweir 
skipSpace()230cdf0e10cSrcweir void XmlReader::skipSpace() {
231cdf0e10cSrcweir     while (isSpace(peek())) {
232cdf0e10cSrcweir         ++pos_;
233cdf0e10cSrcweir     }
234cdf0e10cSrcweir }
235cdf0e10cSrcweir 
skipComment()236cdf0e10cSrcweir bool XmlReader::skipComment() {
237cdf0e10cSrcweir     if (rtl_str_shortenedCompare_WithLength(
238cdf0e10cSrcweir             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
239cdf0e10cSrcweir             RTL_CONSTASCII_LENGTH("--")) !=
240cdf0e10cSrcweir         0)
241cdf0e10cSrcweir     {
242cdf0e10cSrcweir         return false;
243cdf0e10cSrcweir     }
244cdf0e10cSrcweir     pos_ += RTL_CONSTASCII_LENGTH("--");
245cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
246cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
247cdf0e10cSrcweir     if (i < 0) {
248cdf0e10cSrcweir         throw css::uno::RuntimeException(
249cdf0e10cSrcweir             (rtl::OUString(
250cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
251cdf0e10cSrcweir                     "premature end (within comment) of ")) +
252cdf0e10cSrcweir              fileUrl_),
253cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
254cdf0e10cSrcweir     }
255cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("--");
256cdf0e10cSrcweir     if (read() != '>') {
257cdf0e10cSrcweir         throw css::uno::RuntimeException(
258cdf0e10cSrcweir             (rtl::OUString(
259cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
260cdf0e10cSrcweir                     "illegal \"--\" within comment in ")) +
261cdf0e10cSrcweir              fileUrl_),
262cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
263cdf0e10cSrcweir     }
264cdf0e10cSrcweir     return true;
265cdf0e10cSrcweir }
266cdf0e10cSrcweir 
skipProcessingInstruction()267cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() {
268cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
269cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
270cdf0e10cSrcweir     if (i < 0) {
271cdf0e10cSrcweir         throw css::uno::RuntimeException(
272cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
273cdf0e10cSrcweir              fileUrl_),
274cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
275cdf0e10cSrcweir     }
276cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
277cdf0e10cSrcweir }
278cdf0e10cSrcweir 
skipDocumentTypeDeclaration()279cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() {
280cdf0e10cSrcweir     // Neither is it checked that the doctypedecl is at the correct position in
281cdf0e10cSrcweir     // the document, nor that it is well-formed:
282cdf0e10cSrcweir     for (;;) {
283cdf0e10cSrcweir         char c = read();
284cdf0e10cSrcweir         switch (c) {
285cdf0e10cSrcweir         case '\0': // i.e., EOF
286cdf0e10cSrcweir             throw css::uno::RuntimeException(
287cdf0e10cSrcweir                 (rtl::OUString(
288cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
289cdf0e10cSrcweir                         "premature end (within DTD) of ")) +
290cdf0e10cSrcweir                  fileUrl_),
291cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
292cdf0e10cSrcweir         case '"':
293cdf0e10cSrcweir         case '\'':
294cdf0e10cSrcweir             {
295cdf0e10cSrcweir                 sal_Int32 i = rtl_str_indexOfChar_WithLength(
296cdf0e10cSrcweir                     pos_, end_ - pos_, c);
297cdf0e10cSrcweir                 if (i < 0) {
298cdf0e10cSrcweir                     throw css::uno::RuntimeException(
299cdf0e10cSrcweir                         (rtl::OUString(
300cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
301cdf0e10cSrcweir                                 "premature end (within DTD) of ")) +
302cdf0e10cSrcweir                          fileUrl_),
303cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
304cdf0e10cSrcweir                 }
305cdf0e10cSrcweir                 pos_ += i + 1;
306cdf0e10cSrcweir             }
307cdf0e10cSrcweir             break;
308cdf0e10cSrcweir         case '>':
309cdf0e10cSrcweir             return;
310cdf0e10cSrcweir         case '[':
311cdf0e10cSrcweir             for (;;) {
312cdf0e10cSrcweir                 c = read();
313cdf0e10cSrcweir                 switch (c) {
314cdf0e10cSrcweir                 case '\0': // i.e., EOF
315cdf0e10cSrcweir                     throw css::uno::RuntimeException(
316cdf0e10cSrcweir                         (rtl::OUString(
317cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
318cdf0e10cSrcweir                                 "premature end (within DTD) of ")) +
319cdf0e10cSrcweir                          fileUrl_),
320cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
321cdf0e10cSrcweir                 case '"':
322cdf0e10cSrcweir                 case '\'':
323cdf0e10cSrcweir                     {
324cdf0e10cSrcweir                         sal_Int32 i = rtl_str_indexOfChar_WithLength(
325cdf0e10cSrcweir                             pos_, end_ - pos_, c);
326cdf0e10cSrcweir                         if (i < 0) {
327cdf0e10cSrcweir                             throw css::uno::RuntimeException(
328cdf0e10cSrcweir                             (rtl::OUString(
329cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
330cdf0e10cSrcweir                                     "premature end (within DTD) of ")) +
331cdf0e10cSrcweir                              fileUrl_),
332cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
333cdf0e10cSrcweir                         }
334cdf0e10cSrcweir                         pos_ += i + 1;
335cdf0e10cSrcweir                     }
336cdf0e10cSrcweir                     break;
337cdf0e10cSrcweir                 case '<':
338cdf0e10cSrcweir                     switch (read()) {
339cdf0e10cSrcweir                     case '\0': // i.e., EOF
340cdf0e10cSrcweir                         throw css::uno::RuntimeException(
341cdf0e10cSrcweir                             (rtl::OUString(
342cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
343cdf0e10cSrcweir                                     "premature end (within DTD) of ")) +
344cdf0e10cSrcweir                              fileUrl_),
345cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
346cdf0e10cSrcweir                     case '!':
347cdf0e10cSrcweir                         skipComment();
348cdf0e10cSrcweir                         break;
349cdf0e10cSrcweir                     case '?':
350cdf0e10cSrcweir                         skipProcessingInstruction();
351cdf0e10cSrcweir                         break;
352cdf0e10cSrcweir                     default:
353cdf0e10cSrcweir                         break;
354cdf0e10cSrcweir                     }
355cdf0e10cSrcweir                     break;
356cdf0e10cSrcweir                 case ']':
357cdf0e10cSrcweir                     skipSpace();
358cdf0e10cSrcweir                     if (read() != '>') {
359cdf0e10cSrcweir                         throw css::uno::RuntimeException(
360cdf0e10cSrcweir                             (rtl::OUString(
361cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
362cdf0e10cSrcweir                                     "missing \">\" of DTD in ")) +
363cdf0e10cSrcweir                              fileUrl_),
364cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
365cdf0e10cSrcweir                     }
366cdf0e10cSrcweir                     return;
367cdf0e10cSrcweir                 default:
368cdf0e10cSrcweir                     break;
369cdf0e10cSrcweir                 }
370cdf0e10cSrcweir             }
371cdf0e10cSrcweir         default:
372cdf0e10cSrcweir             break;
373cdf0e10cSrcweir         }
374cdf0e10cSrcweir     }
375cdf0e10cSrcweir }
376cdf0e10cSrcweir 
scanCdataSection()377cdf0e10cSrcweir Span XmlReader::scanCdataSection() {
378cdf0e10cSrcweir     if (rtl_str_shortenedCompare_WithLength(
379cdf0e10cSrcweir             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
380cdf0e10cSrcweir             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
381cdf0e10cSrcweir         0)
382cdf0e10cSrcweir     {
383cdf0e10cSrcweir         return Span();
384cdf0e10cSrcweir     }
385cdf0e10cSrcweir     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
386cdf0e10cSrcweir     char const * begin = pos_;
387cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
388cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
389cdf0e10cSrcweir     if (i < 0) {
390cdf0e10cSrcweir         throw css::uno::RuntimeException(
391cdf0e10cSrcweir             (rtl::OUString(
392cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
393cdf0e10cSrcweir                     "premature end (within CDATA section) of ")) +
394cdf0e10cSrcweir              fileUrl_),
395cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
396cdf0e10cSrcweir     }
397cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
398cdf0e10cSrcweir     return Span(begin, i);
399cdf0e10cSrcweir }
400cdf0e10cSrcweir 
scanName(char const ** nameColon)401cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) {
402cdf0e10cSrcweir     OSL_ASSERT(nameColon != 0 && *nameColon == 0);
403cdf0e10cSrcweir     for (char const * begin = pos_;; ++pos_) {
404cdf0e10cSrcweir         switch (peek()) {
405cdf0e10cSrcweir         case '\0': // i.e., EOF
406cdf0e10cSrcweir         case '\x09':
407cdf0e10cSrcweir         case '\x0A':
408cdf0e10cSrcweir         case '\x0D':
409cdf0e10cSrcweir         case ' ':
410cdf0e10cSrcweir         case '/':
411cdf0e10cSrcweir         case '=':
412cdf0e10cSrcweir         case '>':
413cdf0e10cSrcweir             return pos_ != begin;
414cdf0e10cSrcweir         case ':':
415cdf0e10cSrcweir             *nameColon = pos_;
416cdf0e10cSrcweir             break;
417cdf0e10cSrcweir         default:
418cdf0e10cSrcweir             break;
419cdf0e10cSrcweir         }
420cdf0e10cSrcweir     }
421cdf0e10cSrcweir }
422cdf0e10cSrcweir 
scanNamespaceIri(char const * begin,char const * end)423cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
424cdf0e10cSrcweir     OSL_ASSERT(begin != 0 && begin <= end);
425cdf0e10cSrcweir     Span iri(handleAttributeValue(begin, end, false));
426cdf0e10cSrcweir     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
427cdf0e10cSrcweir         if (namespaceIris_[i].equals(iri)) {
428cdf0e10cSrcweir             return toNamespaceId(i);
429cdf0e10cSrcweir         }
430cdf0e10cSrcweir     }
431cdf0e10cSrcweir     return XmlReader::NAMESPACE_UNKNOWN;
432cdf0e10cSrcweir }
433cdf0e10cSrcweir 
handleReference(char const * position,char const * end)434cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end)
435cdf0e10cSrcweir {
436cdf0e10cSrcweir     OSL_ASSERT(position != 0 && *position == '&' && position < end);
437cdf0e10cSrcweir     ++position;
438cdf0e10cSrcweir     if (*position == '#') {
439cdf0e10cSrcweir         ++position;
440cdf0e10cSrcweir         sal_Int32 val = 0;
441cdf0e10cSrcweir         char const * p;
442cdf0e10cSrcweir         if (*position == 'x') {
443cdf0e10cSrcweir             ++position;
444cdf0e10cSrcweir             p = position;
445cdf0e10cSrcweir             for (;; ++position) {
446cdf0e10cSrcweir                 char c = *position;
447cdf0e10cSrcweir                 if (c >= '0' && c <= '9') {
448cdf0e10cSrcweir                     val = 16 * val + (c - '0');
449cdf0e10cSrcweir                 } else if (c >= 'A' && c <= 'F') {
450cdf0e10cSrcweir                     val = 16 * val + (c - 'A') + 10;
451cdf0e10cSrcweir                 } else if (c >= 'a' && c <= 'f') {
452cdf0e10cSrcweir                     val = 16 * val + (c - 'a') + 10;
453cdf0e10cSrcweir                 } else {
454cdf0e10cSrcweir                     break;
455cdf0e10cSrcweir                 }
456cdf0e10cSrcweir                 if (val > 0x10FFFF) { // avoid overflow
457cdf0e10cSrcweir                     throw css::uno::RuntimeException(
458cdf0e10cSrcweir                         (rtl::OUString(
459cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
460cdf0e10cSrcweir                                 "'&#x...' too large in ")) +
461cdf0e10cSrcweir                          fileUrl_),
462cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
463cdf0e10cSrcweir                 }
464cdf0e10cSrcweir             }
465cdf0e10cSrcweir         } else {
466cdf0e10cSrcweir             p = position;
467cdf0e10cSrcweir             for (;; ++position) {
468cdf0e10cSrcweir                 char c = *position;
469cdf0e10cSrcweir                 if (c >= '0' && c <= '9') {
470cdf0e10cSrcweir                     val = 10 * val + (c - '0');
471cdf0e10cSrcweir                 } else {
472cdf0e10cSrcweir                     break;
473cdf0e10cSrcweir                 }
474cdf0e10cSrcweir                 if (val > 0x10FFFF) { // avoid overflow
475cdf0e10cSrcweir                     throw css::uno::RuntimeException(
476cdf0e10cSrcweir                         (rtl::OUString(
477cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
478cdf0e10cSrcweir                                 "'&#...' too large in ")) +
479cdf0e10cSrcweir                          fileUrl_),
480cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
481cdf0e10cSrcweir                 }
482cdf0e10cSrcweir             }
483cdf0e10cSrcweir         }
484cdf0e10cSrcweir         if (position == p || *position++ != ';') {
485cdf0e10cSrcweir             throw css::uno::RuntimeException(
486cdf0e10cSrcweir                 (rtl::OUString(
487cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
488cdf0e10cSrcweir                  fileUrl_),
489cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
490cdf0e10cSrcweir         }
491cdf0e10cSrcweir         OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
492cdf0e10cSrcweir         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
493cdf0e10cSrcweir             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
494cdf0e10cSrcweir         {
495cdf0e10cSrcweir             throw css::uno::RuntimeException(
496cdf0e10cSrcweir                 (rtl::OUString(
497cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
498cdf0e10cSrcweir                         "character reference denoting invalid character in ")) +
499cdf0e10cSrcweir                  fileUrl_),
500cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
501cdf0e10cSrcweir         }
502cdf0e10cSrcweir         char buf[4];
503cdf0e10cSrcweir         sal_Int32 len;
504cdf0e10cSrcweir         if (val < 0x80) {
505cdf0e10cSrcweir             buf[0] = static_cast< char >(val);
506cdf0e10cSrcweir             len = 1;
507cdf0e10cSrcweir         } else if (val < 0x800) {
508cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 6) | 0xC0);
509cdf0e10cSrcweir             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
510cdf0e10cSrcweir             len = 2;
511cdf0e10cSrcweir         } else if (val < 0x10000) {
512cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 12) | 0xE0);
513cdf0e10cSrcweir             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
514cdf0e10cSrcweir             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
515cdf0e10cSrcweir             len = 3;
516cdf0e10cSrcweir         } else {
517cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 18) | 0xF0);
518cdf0e10cSrcweir             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
519cdf0e10cSrcweir             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
520cdf0e10cSrcweir             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
521cdf0e10cSrcweir             len = 4;
522cdf0e10cSrcweir         }
523cdf0e10cSrcweir         pad_.addEphemeral(buf, len);
524cdf0e10cSrcweir         return position;
525cdf0e10cSrcweir     } else {
526cdf0e10cSrcweir         struct EntityRef {
527cdf0e10cSrcweir             char const * inBegin;
528cdf0e10cSrcweir             sal_Int32 inLength;
529cdf0e10cSrcweir             char const * outBegin;
530cdf0e10cSrcweir             sal_Int32 outLength;
531cdf0e10cSrcweir         };
532cdf0e10cSrcweir         static EntityRef const refs[] = {
533cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("amp;"),
534cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("&") },
535cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("lt;"),
536cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("<") },
537cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("gt;"),
538cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM(">") },
539cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("apos;"),
540cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("'") },
541cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("quot;"),
542cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("\"") } };
543cdf0e10cSrcweir         for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
544cdf0e10cSrcweir             if (rtl_str_shortenedCompare_WithLength(
545cdf0e10cSrcweir                     position, end - position, refs[i].inBegin, refs[i].inLength,
546cdf0e10cSrcweir                     refs[i].inLength) ==
547cdf0e10cSrcweir                 0)
548cdf0e10cSrcweir             {
549cdf0e10cSrcweir                 position += refs[i].inLength;
550cdf0e10cSrcweir                 pad_.add(refs[i].outBegin, refs[i].outLength);
551cdf0e10cSrcweir                 return position;
552cdf0e10cSrcweir             }
553cdf0e10cSrcweir         }
554cdf0e10cSrcweir         throw css::uno::RuntimeException(
555cdf0e10cSrcweir             (rtl::OUString(
556cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
557cdf0e10cSrcweir              fileUrl_),
558cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
559cdf0e10cSrcweir     }
560cdf0e10cSrcweir }
561cdf0e10cSrcweir 
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)562cdf0e10cSrcweir Span XmlReader::handleAttributeValue(
563cdf0e10cSrcweir     char const * begin, char const * end, bool fullyNormalize)
564cdf0e10cSrcweir {
565cdf0e10cSrcweir     pad_.clear();
566cdf0e10cSrcweir     if (fullyNormalize) {
567cdf0e10cSrcweir         while (begin != end && isSpace(*begin)) {
568cdf0e10cSrcweir             ++begin;
569cdf0e10cSrcweir         }
570cdf0e10cSrcweir         while (end != begin && isSpace(end[-1])) {
571cdf0e10cSrcweir             --end;
572cdf0e10cSrcweir         }
573cdf0e10cSrcweir         char const * p = begin;
574cdf0e10cSrcweir         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
575cdf0e10cSrcweir             // a single true space character can go into the current span,
576cdf0e10cSrcweir             // everything else breaks the span
577cdf0e10cSrcweir         Space space = SPACE_NONE;
578cdf0e10cSrcweir         while (p != end) {
579cdf0e10cSrcweir             switch (*p) {
580cdf0e10cSrcweir             case '\x09':
581cdf0e10cSrcweir             case '\x0A':
582cdf0e10cSrcweir             case '\x0D':
583cdf0e10cSrcweir                 switch (space) {
584cdf0e10cSrcweir                 case SPACE_NONE:
585cdf0e10cSrcweir                     pad_.add(begin, p - begin);
586cdf0e10cSrcweir                     pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
587cdf0e10cSrcweir                     space = SPACE_BREAK;
588cdf0e10cSrcweir                     break;
589cdf0e10cSrcweir                 case SPACE_SPAN:
590cdf0e10cSrcweir                     pad_.add(begin, p - begin);
591cdf0e10cSrcweir                     space = SPACE_BREAK;
592cdf0e10cSrcweir                     break;
593cdf0e10cSrcweir                 case SPACE_BREAK:
594cdf0e10cSrcweir                     break;
595cdf0e10cSrcweir                 }
596cdf0e10cSrcweir                 begin = ++p;
597cdf0e10cSrcweir                 break;
598cdf0e10cSrcweir             case ' ':
599cdf0e10cSrcweir                 switch (space) {
600cdf0e10cSrcweir                 case SPACE_NONE:
601cdf0e10cSrcweir                     ++p;
602cdf0e10cSrcweir                     space = SPACE_SPAN;
603cdf0e10cSrcweir                     break;
604cdf0e10cSrcweir                 case SPACE_SPAN:
605cdf0e10cSrcweir                     pad_.add(begin, p - begin);
606cdf0e10cSrcweir                     begin = ++p;
607cdf0e10cSrcweir                     space = SPACE_BREAK;
608cdf0e10cSrcweir                     break;
609cdf0e10cSrcweir                 case SPACE_BREAK:
610cdf0e10cSrcweir                     begin = ++p;
611cdf0e10cSrcweir                     break;
612cdf0e10cSrcweir                 }
613cdf0e10cSrcweir                 break;
614cdf0e10cSrcweir             case '&':
615cdf0e10cSrcweir                 pad_.add(begin, p - begin);
616cdf0e10cSrcweir                 p = handleReference(p, end);
617cdf0e10cSrcweir                 begin = p;
618cdf0e10cSrcweir                 space = SPACE_NONE;
619cdf0e10cSrcweir                 break;
620cdf0e10cSrcweir             default:
621cdf0e10cSrcweir                 ++p;
622cdf0e10cSrcweir                 space = SPACE_NONE;
623cdf0e10cSrcweir                 break;
624cdf0e10cSrcweir             }
625cdf0e10cSrcweir         }
626cdf0e10cSrcweir         pad_.add(begin, p - begin);
627cdf0e10cSrcweir     } else {
628cdf0e10cSrcweir         char const * p = begin;
629cdf0e10cSrcweir         while (p != end) {
630cdf0e10cSrcweir             switch (*p) {
631cdf0e10cSrcweir             case '\x09':
632cdf0e10cSrcweir             case '\x0A':
633cdf0e10cSrcweir                 pad_.add(begin, p - begin);
634cdf0e10cSrcweir                 begin = ++p;
635cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
636cdf0e10cSrcweir                 break;
637cdf0e10cSrcweir             case '\x0D':
638cdf0e10cSrcweir                 pad_.add(begin, p - begin);
639cdf0e10cSrcweir                 ++p;
640cdf0e10cSrcweir                 if (peek() == '\x0A') {
641cdf0e10cSrcweir                     ++p;
642cdf0e10cSrcweir                 }
643cdf0e10cSrcweir                 begin = p;
644cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
645cdf0e10cSrcweir                 break;
646cdf0e10cSrcweir             case '&':
647cdf0e10cSrcweir                 pad_.add(begin, p - begin);
648cdf0e10cSrcweir                 p = handleReference(p, end);
649cdf0e10cSrcweir                 begin = p;
650cdf0e10cSrcweir                 break;
651cdf0e10cSrcweir             default:
652cdf0e10cSrcweir                 ++p;
653cdf0e10cSrcweir                 break;
654cdf0e10cSrcweir             }
655cdf0e10cSrcweir         }
656cdf0e10cSrcweir         pad_.add(begin, p - begin);
657cdf0e10cSrcweir     }
658cdf0e10cSrcweir     return pad_.get();
659cdf0e10cSrcweir }
660cdf0e10cSrcweir 
handleStartTag(int * nsId,Span * localName)661cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
662cdf0e10cSrcweir     OSL_ASSERT(nsId != 0 && localName);
663cdf0e10cSrcweir     char const * nameBegin = pos_;
664cdf0e10cSrcweir     char const * nameColon = 0;
665cdf0e10cSrcweir     if (!scanName(&nameColon)) {
666cdf0e10cSrcweir         throw css::uno::RuntimeException(
667cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
668cdf0e10cSrcweir              fileUrl_),
669cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
670cdf0e10cSrcweir     }
671cdf0e10cSrcweir     char const * nameEnd = pos_;
672cdf0e10cSrcweir     NamespaceList::size_type inheritedNamespaces = namespaces_.size();
673cdf0e10cSrcweir     bool hasDefaultNs = false;
674cdf0e10cSrcweir     int defaultNsId = NAMESPACE_NONE;
675cdf0e10cSrcweir     attributes_.clear();
676cdf0e10cSrcweir     for (;;) {
677cdf0e10cSrcweir         char const * p = pos_;
678cdf0e10cSrcweir         skipSpace();
679cdf0e10cSrcweir         if (peek() == '/' || peek() == '>') {
680cdf0e10cSrcweir             break;
681cdf0e10cSrcweir         }
682cdf0e10cSrcweir         if (pos_ == p) {
683cdf0e10cSrcweir             throw css::uno::RuntimeException(
684cdf0e10cSrcweir                 (rtl::OUString(
685cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
686cdf0e10cSrcweir                         "missing whitespace before attribute in ")) +
687cdf0e10cSrcweir                  fileUrl_),
688cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
689cdf0e10cSrcweir         }
690cdf0e10cSrcweir         char const * attrNameBegin = pos_;
691cdf0e10cSrcweir         char const * attrNameColon = 0;
692cdf0e10cSrcweir         if (!scanName(&attrNameColon)) {
693cdf0e10cSrcweir             throw css::uno::RuntimeException(
694cdf0e10cSrcweir                 (rtl::OUString(
695cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
696cdf0e10cSrcweir                  fileUrl_),
697cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
698cdf0e10cSrcweir         }
699cdf0e10cSrcweir         char const * attrNameEnd = pos_;
700cdf0e10cSrcweir         skipSpace();
701cdf0e10cSrcweir         if (read() != '=') {
702cdf0e10cSrcweir             throw css::uno::RuntimeException(
703cdf0e10cSrcweir                 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
704cdf0e10cSrcweir                  fileUrl_),
705cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
706cdf0e10cSrcweir         }
707cdf0e10cSrcweir         skipSpace();
708cdf0e10cSrcweir         char del = read();
709cdf0e10cSrcweir         if (del != '\'' && del != '"') {
710cdf0e10cSrcweir             throw css::uno::RuntimeException(
711cdf0e10cSrcweir                 (rtl::OUString(
712cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
713cdf0e10cSrcweir                  fileUrl_),
714cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
715cdf0e10cSrcweir         }
716cdf0e10cSrcweir         char const * valueBegin = pos_;
717cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
718cdf0e10cSrcweir         if (i < 0) {
719cdf0e10cSrcweir             throw css::uno::RuntimeException(
720cdf0e10cSrcweir                 (rtl::OUString(
721cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
722cdf0e10cSrcweir                         "unterminated attribute value in ")) +
723cdf0e10cSrcweir                  fileUrl_),
724cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
725cdf0e10cSrcweir         }
726cdf0e10cSrcweir         char const * valueEnd = pos_ + i;
727cdf0e10cSrcweir         pos_ += i + 1;
728cdf0e10cSrcweir         if (attrNameColon == 0 &&
729cdf0e10cSrcweir             Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
730cdf0e10cSrcweir                 RTL_CONSTASCII_STRINGPARAM("xmlns")))
731cdf0e10cSrcweir         {
732cdf0e10cSrcweir             hasDefaultNs = true;
733cdf0e10cSrcweir             defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
734cdf0e10cSrcweir         } else if (attrNameColon != 0 &&
735cdf0e10cSrcweir                    Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
736cdf0e10cSrcweir                        RTL_CONSTASCII_STRINGPARAM("xmlns")))
737cdf0e10cSrcweir         {
738cdf0e10cSrcweir             namespaces_.push_back(
739cdf0e10cSrcweir                 NamespaceData(
740cdf0e10cSrcweir                     Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
741cdf0e10cSrcweir                     scanNamespaceIri(valueBegin, valueEnd)));
742cdf0e10cSrcweir         } else {
743cdf0e10cSrcweir             attributes_.push_back(
744cdf0e10cSrcweir                 AttributeData(
745cdf0e10cSrcweir                     attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
746cdf0e10cSrcweir                     valueEnd));
747cdf0e10cSrcweir         }
748cdf0e10cSrcweir     }
749cdf0e10cSrcweir     if (!hasDefaultNs && !elements_.empty()) {
750cdf0e10cSrcweir         defaultNsId = elements_.top().defaultNamespaceId;
751cdf0e10cSrcweir     }
752cdf0e10cSrcweir     firstAttribute_ = true;
753cdf0e10cSrcweir     if (peek() == '/') {
754cdf0e10cSrcweir         state_ = STATE_EMPTY_ELEMENT_TAG;
755cdf0e10cSrcweir         ++pos_;
756cdf0e10cSrcweir     } else {
757cdf0e10cSrcweir         state_ = STATE_CONTENT;
758cdf0e10cSrcweir     }
759cdf0e10cSrcweir     if (peek() != '>') {
760cdf0e10cSrcweir         throw css::uno::RuntimeException(
761cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
762cdf0e10cSrcweir              fileUrl_),
763cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
764cdf0e10cSrcweir     }
765cdf0e10cSrcweir     ++pos_;
766cdf0e10cSrcweir     elements_.push(
767cdf0e10cSrcweir         ElementData(
768cdf0e10cSrcweir             Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
769cdf0e10cSrcweir             defaultNsId));
770cdf0e10cSrcweir     if (nameColon == 0) {
771cdf0e10cSrcweir         *nsId = defaultNsId;
772cdf0e10cSrcweir         *localName = Span(nameBegin, nameEnd - nameBegin);
773cdf0e10cSrcweir     } else {
774cdf0e10cSrcweir         *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
775cdf0e10cSrcweir         *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
776cdf0e10cSrcweir     }
777cdf0e10cSrcweir     return RESULT_BEGIN;
778cdf0e10cSrcweir }
779cdf0e10cSrcweir 
handleEndTag()780cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() {
781cdf0e10cSrcweir     if (elements_.empty()) {
782cdf0e10cSrcweir         throw css::uno::RuntimeException(
783cdf0e10cSrcweir             (rtl::OUString(
784cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
785cdf0e10cSrcweir              fileUrl_),
786cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
787cdf0e10cSrcweir     }
788cdf0e10cSrcweir     char const * nameBegin = pos_;
789cdf0e10cSrcweir     char const * nameColon = 0;
790cdf0e10cSrcweir     if (!scanName(&nameColon) ||
791cdf0e10cSrcweir         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
792cdf0e10cSrcweir     {
793cdf0e10cSrcweir         throw css::uno::RuntimeException(
794cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
795cdf0e10cSrcweir              fileUrl_),
796cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
797cdf0e10cSrcweir     }
798cdf0e10cSrcweir     handleElementEnd();
799cdf0e10cSrcweir     skipSpace();
800cdf0e10cSrcweir     if (peek() != '>') {
801cdf0e10cSrcweir         throw css::uno::RuntimeException(
802cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
803cdf0e10cSrcweir              fileUrl_),
804cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
805cdf0e10cSrcweir     }
806cdf0e10cSrcweir     ++pos_;
807cdf0e10cSrcweir     return RESULT_END;
808cdf0e10cSrcweir }
809cdf0e10cSrcweir 
handleElementEnd()810cdf0e10cSrcweir void XmlReader::handleElementEnd() {
811cdf0e10cSrcweir     OSL_ASSERT(!elements_.empty());
812cdf0e10cSrcweir     namespaces_.resize(elements_.top().inheritedNamespaces);
813cdf0e10cSrcweir     elements_.pop();
814cdf0e10cSrcweir     state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
815cdf0e10cSrcweir }
816cdf0e10cSrcweir 
handleSkippedText(Span * data,int * nsId)817cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
818cdf0e10cSrcweir     for (;;) {
819cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
820cdf0e10cSrcweir         if (i < 0) {
821cdf0e10cSrcweir             throw css::uno::RuntimeException(
822cdf0e10cSrcweir                 (rtl::OUString(
823cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
824cdf0e10cSrcweir                  fileUrl_),
825cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
826cdf0e10cSrcweir         }
827cdf0e10cSrcweir         pos_ += i + 1;
828cdf0e10cSrcweir         switch (peek()) {
829cdf0e10cSrcweir         case '!':
830cdf0e10cSrcweir             ++pos_;
831cdf0e10cSrcweir             if (!skipComment() && !scanCdataSection().is()) {
832cdf0e10cSrcweir                 skipDocumentTypeDeclaration();
833cdf0e10cSrcweir             }
834cdf0e10cSrcweir             break;
835cdf0e10cSrcweir         case '/':
836cdf0e10cSrcweir             ++pos_;
837cdf0e10cSrcweir             return handleEndTag();
838cdf0e10cSrcweir         case '?':
839cdf0e10cSrcweir             ++pos_;
840cdf0e10cSrcweir             skipProcessingInstruction();
841cdf0e10cSrcweir             break;
842cdf0e10cSrcweir         default:
843cdf0e10cSrcweir             return handleStartTag(nsId, data);
844cdf0e10cSrcweir         }
845cdf0e10cSrcweir     }
846cdf0e10cSrcweir }
847cdf0e10cSrcweir 
handleRawText(Span * text)848cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) {
849cdf0e10cSrcweir     pad_.clear();
850cdf0e10cSrcweir     for (char const * begin = pos_;;) {
851cdf0e10cSrcweir         switch (peek()) {
852cdf0e10cSrcweir         case '\0': // i.e., EOF
853cdf0e10cSrcweir             throw css::uno::RuntimeException(
854cdf0e10cSrcweir                 (rtl::OUString(
855cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
856cdf0e10cSrcweir                  fileUrl_),
857cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
858cdf0e10cSrcweir         case '\x0D':
859cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
860cdf0e10cSrcweir             ++pos_;
861cdf0e10cSrcweir             if (peek() != '\x0A') {
862cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
863cdf0e10cSrcweir             }
864cdf0e10cSrcweir             begin = pos_;
865cdf0e10cSrcweir             break;
866cdf0e10cSrcweir         case '&':
867cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
868cdf0e10cSrcweir             pos_ = handleReference(pos_, end_);
869cdf0e10cSrcweir             begin = pos_;
870cdf0e10cSrcweir             break;
871cdf0e10cSrcweir         case '<':
872cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
873cdf0e10cSrcweir             ++pos_;
874cdf0e10cSrcweir             switch (peek()) {
875cdf0e10cSrcweir             case '!':
876cdf0e10cSrcweir                 ++pos_;
877cdf0e10cSrcweir                 if (!skipComment()) {
878cdf0e10cSrcweir                     Span cdata(scanCdataSection());
879cdf0e10cSrcweir                     if (cdata.is()) {
880cdf0e10cSrcweir                         normalizeLineEnds(cdata);
881cdf0e10cSrcweir                     } else {
882cdf0e10cSrcweir                         skipDocumentTypeDeclaration();
883cdf0e10cSrcweir                     }
884cdf0e10cSrcweir                 }
885cdf0e10cSrcweir                 begin = pos_;
886cdf0e10cSrcweir                 break;
887cdf0e10cSrcweir             case '/':
888cdf0e10cSrcweir                 *text = pad_.get();
889cdf0e10cSrcweir                 ++pos_;
890cdf0e10cSrcweir                 state_ = STATE_END_TAG;
891cdf0e10cSrcweir                 return RESULT_TEXT;
892cdf0e10cSrcweir             case '?':
893cdf0e10cSrcweir                 ++pos_;
894cdf0e10cSrcweir                 skipProcessingInstruction();
895cdf0e10cSrcweir                 begin = pos_;
896cdf0e10cSrcweir                 break;
897cdf0e10cSrcweir             default:
898cdf0e10cSrcweir                 *text = pad_.get();
899cdf0e10cSrcweir                 state_ = STATE_START_TAG;
900cdf0e10cSrcweir                 return RESULT_TEXT;
901cdf0e10cSrcweir             }
902cdf0e10cSrcweir             break;
903cdf0e10cSrcweir         default:
904cdf0e10cSrcweir             ++pos_;
905cdf0e10cSrcweir             break;
906cdf0e10cSrcweir         }
907cdf0e10cSrcweir     }
908cdf0e10cSrcweir }
909cdf0e10cSrcweir 
handleNormalizedText(Span * text)910cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
911cdf0e10cSrcweir     pad_.clear();
912cdf0e10cSrcweir     char const * flowBegin = pos_;
913cdf0e10cSrcweir     char const * flowEnd = pos_;
914cdf0e10cSrcweir     enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
915cdf0e10cSrcweir         // a single true space character can go into the current flow,
916cdf0e10cSrcweir         // everything else breaks the flow
917cdf0e10cSrcweir     Space space = SPACE_START;
918cdf0e10cSrcweir     for (;;) {
919cdf0e10cSrcweir         switch (peek()) {
920cdf0e10cSrcweir         case '\0': // i.e., EOF
921cdf0e10cSrcweir             throw css::uno::RuntimeException(
922cdf0e10cSrcweir                 (rtl::OUString(
923cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
924cdf0e10cSrcweir                  fileUrl_),
925cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
926cdf0e10cSrcweir         case '\x09':
927cdf0e10cSrcweir         case '\x0A':
928cdf0e10cSrcweir         case '\x0D':
929cdf0e10cSrcweir             switch (space) {
930cdf0e10cSrcweir             case SPACE_START:
931cdf0e10cSrcweir             case SPACE_BREAK:
932cdf0e10cSrcweir                 break;
933cdf0e10cSrcweir             case SPACE_NONE:
934cdf0e10cSrcweir             case SPACE_SPAN:
935cdf0e10cSrcweir                 space = SPACE_BREAK;
936cdf0e10cSrcweir                 break;
937cdf0e10cSrcweir             }
938cdf0e10cSrcweir             ++pos_;
939cdf0e10cSrcweir             break;
940cdf0e10cSrcweir         case ' ':
941cdf0e10cSrcweir             switch (space) {
942cdf0e10cSrcweir             case SPACE_START:
943cdf0e10cSrcweir             case SPACE_BREAK:
944cdf0e10cSrcweir                 break;
945cdf0e10cSrcweir             case SPACE_NONE:
946cdf0e10cSrcweir                 space = SPACE_SPAN;
947cdf0e10cSrcweir                 break;
948cdf0e10cSrcweir             case SPACE_SPAN:
949cdf0e10cSrcweir                 space = SPACE_BREAK;
950cdf0e10cSrcweir                 break;
951cdf0e10cSrcweir             }
952cdf0e10cSrcweir             ++pos_;
953cdf0e10cSrcweir             break;
954cdf0e10cSrcweir         case '&':
955cdf0e10cSrcweir             switch (space) {
956cdf0e10cSrcweir             case SPACE_START:
957cdf0e10cSrcweir                 break;
958cdf0e10cSrcweir             case SPACE_NONE:
959cdf0e10cSrcweir             case SPACE_SPAN:
960cdf0e10cSrcweir                 pad_.add(flowBegin, pos_ - flowBegin);
961cdf0e10cSrcweir                 break;
962cdf0e10cSrcweir             case SPACE_BREAK:
963cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
964cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
965cdf0e10cSrcweir                 break;
966cdf0e10cSrcweir             }
967cdf0e10cSrcweir             pos_ = handleReference(pos_, end_);
968cdf0e10cSrcweir             flowBegin = pos_;
969cdf0e10cSrcweir             flowEnd = pos_;
970cdf0e10cSrcweir             space = SPACE_NONE;
971cdf0e10cSrcweir             break;
972cdf0e10cSrcweir         case '<':
973cdf0e10cSrcweir             ++pos_;
974cdf0e10cSrcweir             switch (peek()) {
975cdf0e10cSrcweir             case '!':
976cdf0e10cSrcweir                 ++pos_;
977cdf0e10cSrcweir                 if (skipComment()) {
978cdf0e10cSrcweir                     space = SPACE_BREAK;
979cdf0e10cSrcweir                 } else {
980cdf0e10cSrcweir                     Span cdata(scanCdataSection());
981cdf0e10cSrcweir                     if (cdata.is()) {
982cdf0e10cSrcweir                         // CDATA is not normalized (similar to character
983cdf0e10cSrcweir                         // references; it keeps the code simple), but it might
984cdf0e10cSrcweir                         // arguably be better to normalize it:
985cdf0e10cSrcweir                         switch (space) {
986cdf0e10cSrcweir                         case SPACE_START:
987cdf0e10cSrcweir                             break;
988cdf0e10cSrcweir                         case SPACE_NONE:
989cdf0e10cSrcweir                         case SPACE_SPAN:
990cdf0e10cSrcweir                             pad_.add(flowBegin, pos_ - flowBegin);
991cdf0e10cSrcweir                             break;
992cdf0e10cSrcweir                         case SPACE_BREAK:
993cdf0e10cSrcweir                             pad_.add(flowBegin, flowEnd - flowBegin);
994cdf0e10cSrcweir                             pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
995cdf0e10cSrcweir                             break;
996cdf0e10cSrcweir                         }
997cdf0e10cSrcweir                         normalizeLineEnds(cdata);
998cdf0e10cSrcweir                         flowBegin = pos_;
999cdf0e10cSrcweir                         flowEnd = pos_;
1000cdf0e10cSrcweir                         space = SPACE_NONE;
1001cdf0e10cSrcweir                     } else {
1002cdf0e10cSrcweir                         skipDocumentTypeDeclaration();
1003cdf0e10cSrcweir                     }
1004cdf0e10cSrcweir                 }
1005cdf0e10cSrcweir                 break;
1006cdf0e10cSrcweir             case '/':
1007cdf0e10cSrcweir                 ++pos_;
1008cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1009cdf0e10cSrcweir                 *text = pad_.get();
1010cdf0e10cSrcweir                 state_ = STATE_END_TAG;
1011cdf0e10cSrcweir                 return RESULT_TEXT;
1012cdf0e10cSrcweir             case '?':
1013cdf0e10cSrcweir                 ++pos_;
1014cdf0e10cSrcweir                 skipProcessingInstruction();
1015cdf0e10cSrcweir                 space = SPACE_BREAK;
1016cdf0e10cSrcweir                 break;
1017cdf0e10cSrcweir             default:
1018cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1019cdf0e10cSrcweir                 *text = pad_.get();
1020cdf0e10cSrcweir                 state_ = STATE_START_TAG;
1021cdf0e10cSrcweir                 return RESULT_TEXT;
1022cdf0e10cSrcweir             }
1023cdf0e10cSrcweir             break;
1024cdf0e10cSrcweir         default:
1025cdf0e10cSrcweir             switch (space) {
1026cdf0e10cSrcweir             case SPACE_START:
1027cdf0e10cSrcweir                 flowBegin = pos_;
1028cdf0e10cSrcweir                 break;
1029cdf0e10cSrcweir             case SPACE_NONE:
1030cdf0e10cSrcweir             case SPACE_SPAN:
1031cdf0e10cSrcweir                 break;
1032cdf0e10cSrcweir             case SPACE_BREAK:
1033cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1034cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1035cdf0e10cSrcweir                 flowBegin = pos_;
1036cdf0e10cSrcweir                 break;
1037cdf0e10cSrcweir             }
1038cdf0e10cSrcweir             flowEnd = ++pos_;
1039cdf0e10cSrcweir             space = SPACE_NONE;
1040cdf0e10cSrcweir             break;
1041cdf0e10cSrcweir         }
1042cdf0e10cSrcweir     }
1043cdf0e10cSrcweir }
1044cdf0e10cSrcweir 
toNamespaceId(NamespaceIris::size_type pos)1045cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1046cdf0e10cSrcweir     OSL_ASSERT(pos <= INT_MAX);
1047cdf0e10cSrcweir     return static_cast< int >(pos);
1048cdf0e10cSrcweir }
1049cdf0e10cSrcweir 
1050cdf0e10cSrcweir }
1051