xref: /aoo42x/main/xmlreader/source/xmlreader.cxx (revision b63233d8)
1*b725e8ebSAndrew Rist /**************************************************************
2*b725e8ebSAndrew Rist  *
3*b725e8ebSAndrew Rist  * Licensed to the Apache Software Foundation (ASF) under one
4*b725e8ebSAndrew Rist  * or more contributor license agreements.  See the NOTICE file
5*b725e8ebSAndrew Rist  * distributed with this work for additional information
6*b725e8ebSAndrew Rist  * regarding copyright ownership.  The ASF licenses this file
7*b725e8ebSAndrew Rist  * to you under the Apache License, Version 2.0 (the
8*b725e8ebSAndrew Rist  * "License"); you may not use this file except in compliance
9*b725e8ebSAndrew Rist  * with the License.  You may obtain a copy of the License at
10*b725e8ebSAndrew Rist  *
11*b725e8ebSAndrew Rist  *   http://www.apache.org/licenses/LICENSE-2.0
12*b725e8ebSAndrew Rist  *
13*b725e8ebSAndrew Rist  * Unless required by applicable law or agreed to in writing,
14*b725e8ebSAndrew Rist  * software distributed under the License is distributed on an
15*b725e8ebSAndrew Rist  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*b725e8ebSAndrew Rist  * KIND, either express or implied.  See the License for the
17*b725e8ebSAndrew Rist  * specific language governing permissions and limitations
18*b725e8ebSAndrew Rist  * under the License.
19*b725e8ebSAndrew Rist  *
20*b725e8ebSAndrew Rist  *************************************************************/
21*b725e8ebSAndrew Rist 
22*b725e8ebSAndrew Rist 
23cdf0e10cSrcweir 
24cdf0e10cSrcweir #include "sal/config.h"
25cdf0e10cSrcweir 
26cdf0e10cSrcweir #include <climits>
27cdf0e10cSrcweir #include <cstddef>
28cdf0e10cSrcweir 
29cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp"
30cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx"
31cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp"
32cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp"
33cdf0e10cSrcweir #include "osl/diagnose.h"
34cdf0e10cSrcweir #include "osl/file.h"
35cdf0e10cSrcweir #include "rtl/string.h"
36cdf0e10cSrcweir #include "rtl/ustring.h"
37cdf0e10cSrcweir #include "rtl/ustring.hxx"
38cdf0e10cSrcweir #include "sal/types.h"
39cdf0e10cSrcweir #include "xmlreader/pad.hxx"
40cdf0e10cSrcweir #include "xmlreader/span.hxx"
41cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx"
42cdf0e10cSrcweir 
43cdf0e10cSrcweir namespace xmlreader {
44cdf0e10cSrcweir 
45cdf0e10cSrcweir namespace {
46cdf0e10cSrcweir 
47cdf0e10cSrcweir namespace css = com::sun::star;
48cdf0e10cSrcweir 
// Returns true iff c is one of the four characters this reader treats as
// white space: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
60cdf0e10cSrcweir 
61cdf0e10cSrcweir }
62cdf0e10cSrcweir 
XmlReader(rtl::OUString const & fileUrl)63cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl)
64cdf0e10cSrcweir     SAL_THROW((
65cdf0e10cSrcweir         css::container::NoSuchElementException, css::uno::RuntimeException)):
66cdf0e10cSrcweir     fileUrl_(fileUrl)
67cdf0e10cSrcweir {
68cdf0e10cSrcweir     switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
69cdf0e10cSrcweir     {
70cdf0e10cSrcweir     case osl_File_E_None:
71cdf0e10cSrcweir         break;
72cdf0e10cSrcweir     case osl_File_E_NOENT:
73cdf0e10cSrcweir         throw css::container::NoSuchElementException(
74cdf0e10cSrcweir             fileUrl_, css::uno::Reference< css::uno::XInterface >());
75cdf0e10cSrcweir     default:
76cdf0e10cSrcweir         throw css::uno::RuntimeException(
77cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
78cdf0e10cSrcweir              fileUrl_),
79cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
80cdf0e10cSrcweir     }
81cdf0e10cSrcweir     oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
82cdf0e10cSrcweir     if (e == osl_File_E_None) {
83cdf0e10cSrcweir         e = osl_mapFile(
84cdf0e10cSrcweir             fileHandle_, &fileAddress_, fileSize_, 0,
85cdf0e10cSrcweir             osl_File_MapFlag_WillNeed);
86cdf0e10cSrcweir     }
87cdf0e10cSrcweir     if (e != osl_File_E_None) {
88cdf0e10cSrcweir         e = osl_closeFile(fileHandle_);
89cdf0e10cSrcweir         if (e != osl_File_E_None) {
90cdf0e10cSrcweir             OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
91cdf0e10cSrcweir         }
92cdf0e10cSrcweir         throw css::uno::RuntimeException(
93cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
94cdf0e10cSrcweir              fileUrl_),
95cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
96cdf0e10cSrcweir     }
97cdf0e10cSrcweir     namespaceIris_.push_back(
98cdf0e10cSrcweir         Span(
99cdf0e10cSrcweir             RTL_CONSTASCII_STRINGPARAM(
100cdf0e10cSrcweir                 "http://www.w3.org/XML/1998/namespace")));
101cdf0e10cSrcweir     namespaces_.push_back(
102cdf0e10cSrcweir         NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
103cdf0e10cSrcweir     pos_ = static_cast< char * >(fileAddress_);
104cdf0e10cSrcweir     end_ = pos_ + fileSize_;
105cdf0e10cSrcweir     state_ = STATE_CONTENT;
106cdf0e10cSrcweir }
107cdf0e10cSrcweir 
~XmlReader()108cdf0e10cSrcweir XmlReader::~XmlReader() {
109cdf0e10cSrcweir     oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
110cdf0e10cSrcweir     if (e != osl_File_E_None) {
111cdf0e10cSrcweir         OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
112cdf0e10cSrcweir     }
113cdf0e10cSrcweir     e = osl_closeFile(fileHandle_);
114cdf0e10cSrcweir     if (e != osl_File_E_None) {
115cdf0e10cSrcweir         OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
116cdf0e10cSrcweir     }
117cdf0e10cSrcweir }
118cdf0e10cSrcweir 
registerNamespaceIri(Span const & iri)119cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) {
120cdf0e10cSrcweir     int id = toNamespaceId(namespaceIris_.size());
121cdf0e10cSrcweir     namespaceIris_.push_back(iri);
122cdf0e10cSrcweir     if (iri.equals(
123cdf0e10cSrcweir             Span(
124cdf0e10cSrcweir                 RTL_CONSTASCII_STRINGPARAM(
125cdf0e10cSrcweir                     "http://www.w3.org/2001/XMLSchema-instance"))))
126cdf0e10cSrcweir     {
127cdf0e10cSrcweir         // Old user layer .xcu files used the xsi namespace prefix without
128cdf0e10cSrcweir         // declaring a corresponding namespace binding, see issue 77174; reading
129cdf0e10cSrcweir         // those files during migration would fail without this hack that can be
130cdf0e10cSrcweir         // removed once migration is no longer relevant (see
131cdf0e10cSrcweir         // configmgr::Components::parseModificationLayer):
132cdf0e10cSrcweir         namespaces_.push_back(
133cdf0e10cSrcweir             NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
134cdf0e10cSrcweir     }
135cdf0e10cSrcweir     return id;
136cdf0e10cSrcweir }
137cdf0e10cSrcweir 
nextItem(Text reportText,Span * data,int * nsId)138cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
139cdf0e10cSrcweir {
140cdf0e10cSrcweir     switch (state_) {
141cdf0e10cSrcweir     case STATE_CONTENT:
142cdf0e10cSrcweir         switch (reportText) {
143cdf0e10cSrcweir         case TEXT_NONE:
144cdf0e10cSrcweir             return handleSkippedText(data, nsId);
145cdf0e10cSrcweir         case TEXT_RAW:
146cdf0e10cSrcweir             return handleRawText(data);
147cdf0e10cSrcweir         case TEXT_NORMALIZED:
148cdf0e10cSrcweir             return handleNormalizedText(data);
149cdf0e10cSrcweir         }
150cdf0e10cSrcweir     case STATE_START_TAG:
151cdf0e10cSrcweir         return handleStartTag(nsId, data);
152cdf0e10cSrcweir     case STATE_END_TAG:
153cdf0e10cSrcweir         return handleEndTag();
154cdf0e10cSrcweir     case STATE_EMPTY_ELEMENT_TAG:
155cdf0e10cSrcweir         handleElementEnd();
156cdf0e10cSrcweir         return RESULT_END;
157cdf0e10cSrcweir     default: // STATE_DONE
158cdf0e10cSrcweir         return RESULT_DONE;
159cdf0e10cSrcweir     }
160cdf0e10cSrcweir }
161cdf0e10cSrcweir 
nextAttribute(int * nsId,Span * localName)162cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) {
163cdf0e10cSrcweir     OSL_ASSERT(nsId != 0 && localName != 0);
164cdf0e10cSrcweir     if (firstAttribute_) {
165cdf0e10cSrcweir         currentAttribute_ = attributes_.begin();
166cdf0e10cSrcweir         firstAttribute_ = false;
167cdf0e10cSrcweir     } else {
168cdf0e10cSrcweir         ++currentAttribute_;
169cdf0e10cSrcweir     }
170cdf0e10cSrcweir     if (currentAttribute_ == attributes_.end()) {
171cdf0e10cSrcweir         return false;
172cdf0e10cSrcweir     }
173cdf0e10cSrcweir     if (currentAttribute_->nameColon == 0) {
174cdf0e10cSrcweir         *nsId = NAMESPACE_NONE;
175cdf0e10cSrcweir         *localName = Span(
176cdf0e10cSrcweir             currentAttribute_->nameBegin,
177cdf0e10cSrcweir             currentAttribute_->nameEnd - currentAttribute_->nameBegin);
178cdf0e10cSrcweir     } else {
179cdf0e10cSrcweir         *nsId = getNamespaceId(
180cdf0e10cSrcweir             Span(
181cdf0e10cSrcweir                 currentAttribute_->nameBegin,
182cdf0e10cSrcweir                 currentAttribute_->nameColon - currentAttribute_->nameBegin));
183cdf0e10cSrcweir         *localName = Span(
184cdf0e10cSrcweir             currentAttribute_->nameColon + 1,
185cdf0e10cSrcweir             currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
186cdf0e10cSrcweir     }
187cdf0e10cSrcweir     return true;
188cdf0e10cSrcweir }
189cdf0e10cSrcweir 
getAttributeValue(bool fullyNormalize)190cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) {
191cdf0e10cSrcweir     return handleAttributeValue(
192cdf0e10cSrcweir         currentAttribute_->valueBegin, currentAttribute_->valueEnd,
193cdf0e10cSrcweir         fullyNormalize);
194cdf0e10cSrcweir }
195cdf0e10cSrcweir 
getNamespaceId(Span const & prefix) const196cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const {
197cdf0e10cSrcweir     for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
198cdf0e10cSrcweir          i != namespaces_.rend(); ++i)
199cdf0e10cSrcweir     {
200cdf0e10cSrcweir         if (prefix.equals(i->prefix)) {
201cdf0e10cSrcweir             return i->nsId;
202cdf0e10cSrcweir         }
203cdf0e10cSrcweir     }
204cdf0e10cSrcweir     return NAMESPACE_UNKNOWN;
205cdf0e10cSrcweir }
206cdf0e10cSrcweir 
getUrl() const207cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const {
208cdf0e10cSrcweir     return fileUrl_;
209cdf0e10cSrcweir }
210cdf0e10cSrcweir 
normalizeLineEnds(Span const & text)211cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) {
212cdf0e10cSrcweir     char const * p = text.begin;
213cdf0e10cSrcweir     sal_Int32 n = text.length;
214cdf0e10cSrcweir     for (;;) {
215cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
216cdf0e10cSrcweir         if (i < 0) {
217cdf0e10cSrcweir             break;
218cdf0e10cSrcweir         }
219cdf0e10cSrcweir         pad_.add(p, i);
220cdf0e10cSrcweir         p += i + 1;
221cdf0e10cSrcweir         n -= i + 1;
222cdf0e10cSrcweir         if (n == 0 || *p != '\x0A') {
223cdf0e10cSrcweir             pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
224cdf0e10cSrcweir         }
225cdf0e10cSrcweir     }
226cdf0e10cSrcweir     pad_.add(p, n);
227cdf0e10cSrcweir }
228cdf0e10cSrcweir 
skipSpace()229cdf0e10cSrcweir void XmlReader::skipSpace() {
230cdf0e10cSrcweir     while (isSpace(peek())) {
231cdf0e10cSrcweir         ++pos_;
232cdf0e10cSrcweir     }
233cdf0e10cSrcweir }
234cdf0e10cSrcweir 
skipComment()235cdf0e10cSrcweir bool XmlReader::skipComment() {
236cdf0e10cSrcweir     if (rtl_str_shortenedCompare_WithLength(
237cdf0e10cSrcweir             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
238cdf0e10cSrcweir             RTL_CONSTASCII_LENGTH("--")) !=
239cdf0e10cSrcweir         0)
240cdf0e10cSrcweir     {
241cdf0e10cSrcweir         return false;
242cdf0e10cSrcweir     }
243cdf0e10cSrcweir     pos_ += RTL_CONSTASCII_LENGTH("--");
244cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
245cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
246cdf0e10cSrcweir     if (i < 0) {
247cdf0e10cSrcweir         throw css::uno::RuntimeException(
248cdf0e10cSrcweir             (rtl::OUString(
249cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
250cdf0e10cSrcweir                     "premature end (within comment) of ")) +
251cdf0e10cSrcweir              fileUrl_),
252cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
253cdf0e10cSrcweir     }
254cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("--");
255cdf0e10cSrcweir     if (read() != '>') {
256cdf0e10cSrcweir         throw css::uno::RuntimeException(
257cdf0e10cSrcweir             (rtl::OUString(
258cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
259cdf0e10cSrcweir                     "illegal \"--\" within comment in ")) +
260cdf0e10cSrcweir              fileUrl_),
261cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
262cdf0e10cSrcweir     }
263cdf0e10cSrcweir     return true;
264cdf0e10cSrcweir }
265cdf0e10cSrcweir 
skipProcessingInstruction()266cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() {
267cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
268cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
269cdf0e10cSrcweir     if (i < 0) {
270cdf0e10cSrcweir         throw css::uno::RuntimeException(
271cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
272cdf0e10cSrcweir              fileUrl_),
273cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
274cdf0e10cSrcweir     }
275cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("?>");
276cdf0e10cSrcweir }
277cdf0e10cSrcweir 
skipDocumentTypeDeclaration()278cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() {
279cdf0e10cSrcweir     // Neither is it checked that the doctypedecl is at the correct position in
280cdf0e10cSrcweir     // the document, nor that it is well-formed:
281cdf0e10cSrcweir     for (;;) {
282cdf0e10cSrcweir         char c = read();
283cdf0e10cSrcweir         switch (c) {
284cdf0e10cSrcweir         case '\0': // i.e., EOF
285cdf0e10cSrcweir             throw css::uno::RuntimeException(
286cdf0e10cSrcweir                 (rtl::OUString(
287cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
288cdf0e10cSrcweir                         "premature end (within DTD) of ")) +
289cdf0e10cSrcweir                  fileUrl_),
290cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
291cdf0e10cSrcweir         case '"':
292cdf0e10cSrcweir         case '\'':
293cdf0e10cSrcweir             {
294cdf0e10cSrcweir                 sal_Int32 i = rtl_str_indexOfChar_WithLength(
295cdf0e10cSrcweir                     pos_, end_ - pos_, c);
296cdf0e10cSrcweir                 if (i < 0) {
297cdf0e10cSrcweir                     throw css::uno::RuntimeException(
298cdf0e10cSrcweir                         (rtl::OUString(
299cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
300cdf0e10cSrcweir                                 "premature end (within DTD) of ")) +
301cdf0e10cSrcweir                          fileUrl_),
302cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
303cdf0e10cSrcweir                 }
304cdf0e10cSrcweir                 pos_ += i + 1;
305cdf0e10cSrcweir             }
306cdf0e10cSrcweir             break;
307cdf0e10cSrcweir         case '>':
308cdf0e10cSrcweir             return;
309cdf0e10cSrcweir         case '[':
310cdf0e10cSrcweir             for (;;) {
311cdf0e10cSrcweir                 c = read();
312cdf0e10cSrcweir                 switch (c) {
313cdf0e10cSrcweir                 case '\0': // i.e., EOF
314cdf0e10cSrcweir                     throw css::uno::RuntimeException(
315cdf0e10cSrcweir                         (rtl::OUString(
316cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
317cdf0e10cSrcweir                                 "premature end (within DTD) of ")) +
318cdf0e10cSrcweir                          fileUrl_),
319cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
320cdf0e10cSrcweir                 case '"':
321cdf0e10cSrcweir                 case '\'':
322cdf0e10cSrcweir                     {
323cdf0e10cSrcweir                         sal_Int32 i = rtl_str_indexOfChar_WithLength(
324cdf0e10cSrcweir                             pos_, end_ - pos_, c);
325cdf0e10cSrcweir                         if (i < 0) {
326cdf0e10cSrcweir                             throw css::uno::RuntimeException(
327cdf0e10cSrcweir                             (rtl::OUString(
328cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
329cdf0e10cSrcweir                                     "premature end (within DTD) of ")) +
330cdf0e10cSrcweir                              fileUrl_),
331cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
332cdf0e10cSrcweir                         }
333cdf0e10cSrcweir                         pos_ += i + 1;
334cdf0e10cSrcweir                     }
335cdf0e10cSrcweir                     break;
336cdf0e10cSrcweir                 case '<':
337cdf0e10cSrcweir                     switch (read()) {
338cdf0e10cSrcweir                     case '\0': // i.e., EOF
339cdf0e10cSrcweir                         throw css::uno::RuntimeException(
340cdf0e10cSrcweir                             (rtl::OUString(
341cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
342cdf0e10cSrcweir                                     "premature end (within DTD) of ")) +
343cdf0e10cSrcweir                              fileUrl_),
344cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
345cdf0e10cSrcweir                     case '!':
346cdf0e10cSrcweir                         skipComment();
347cdf0e10cSrcweir                         break;
348cdf0e10cSrcweir                     case '?':
349cdf0e10cSrcweir                         skipProcessingInstruction();
350cdf0e10cSrcweir                         break;
351cdf0e10cSrcweir                     default:
352cdf0e10cSrcweir                         break;
353cdf0e10cSrcweir                     }
354cdf0e10cSrcweir                     break;
355cdf0e10cSrcweir                 case ']':
356cdf0e10cSrcweir                     skipSpace();
357cdf0e10cSrcweir                     if (read() != '>') {
358cdf0e10cSrcweir                         throw css::uno::RuntimeException(
359cdf0e10cSrcweir                             (rtl::OUString(
360cdf0e10cSrcweir                                 RTL_CONSTASCII_USTRINGPARAM(
361cdf0e10cSrcweir                                     "missing \">\" of DTD in ")) +
362cdf0e10cSrcweir                              fileUrl_),
363cdf0e10cSrcweir                             css::uno::Reference< css::uno::XInterface >());
364cdf0e10cSrcweir                     }
365cdf0e10cSrcweir                     return;
366cdf0e10cSrcweir                 default:
367cdf0e10cSrcweir                     break;
368cdf0e10cSrcweir                 }
369cdf0e10cSrcweir             }
370cdf0e10cSrcweir         default:
371cdf0e10cSrcweir             break;
372cdf0e10cSrcweir         }
373cdf0e10cSrcweir     }
374cdf0e10cSrcweir }
375cdf0e10cSrcweir 
scanCdataSection()376cdf0e10cSrcweir Span XmlReader::scanCdataSection() {
377cdf0e10cSrcweir     if (rtl_str_shortenedCompare_WithLength(
378cdf0e10cSrcweir             pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
379cdf0e10cSrcweir             RTL_CONSTASCII_LENGTH("[CDATA[")) !=
380cdf0e10cSrcweir         0)
381cdf0e10cSrcweir     {
382cdf0e10cSrcweir         return Span();
383cdf0e10cSrcweir     }
384cdf0e10cSrcweir     pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
385cdf0e10cSrcweir     char const * begin = pos_;
386cdf0e10cSrcweir     sal_Int32 i = rtl_str_indexOfStr_WithLength(
387cdf0e10cSrcweir         pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
388cdf0e10cSrcweir     if (i < 0) {
389cdf0e10cSrcweir         throw css::uno::RuntimeException(
390cdf0e10cSrcweir             (rtl::OUString(
391cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM(
392cdf0e10cSrcweir                     "premature end (within CDATA section) of ")) +
393cdf0e10cSrcweir              fileUrl_),
394cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
395cdf0e10cSrcweir     }
396cdf0e10cSrcweir     pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
397cdf0e10cSrcweir     return Span(begin, i);
398cdf0e10cSrcweir }
399cdf0e10cSrcweir 
scanName(char const ** nameColon)400cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) {
401cdf0e10cSrcweir     OSL_ASSERT(nameColon != 0 && *nameColon == 0);
402cdf0e10cSrcweir     for (char const * begin = pos_;; ++pos_) {
403cdf0e10cSrcweir         switch (peek()) {
404cdf0e10cSrcweir         case '\0': // i.e., EOF
405cdf0e10cSrcweir         case '\x09':
406cdf0e10cSrcweir         case '\x0A':
407cdf0e10cSrcweir         case '\x0D':
408cdf0e10cSrcweir         case ' ':
409cdf0e10cSrcweir         case '/':
410cdf0e10cSrcweir         case '=':
411cdf0e10cSrcweir         case '>':
412cdf0e10cSrcweir             return pos_ != begin;
413cdf0e10cSrcweir         case ':':
414cdf0e10cSrcweir             *nameColon = pos_;
415cdf0e10cSrcweir             break;
416cdf0e10cSrcweir         default:
417cdf0e10cSrcweir             break;
418cdf0e10cSrcweir         }
419cdf0e10cSrcweir     }
420cdf0e10cSrcweir }
421cdf0e10cSrcweir 
scanNamespaceIri(char const * begin,char const * end)422cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
423cdf0e10cSrcweir     OSL_ASSERT(begin != 0 && begin <= end);
424cdf0e10cSrcweir     Span iri(handleAttributeValue(begin, end, false));
425cdf0e10cSrcweir     for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
426cdf0e10cSrcweir         if (namespaceIris_[i].equals(iri)) {
427cdf0e10cSrcweir             return toNamespaceId(i);
428cdf0e10cSrcweir         }
429cdf0e10cSrcweir     }
430cdf0e10cSrcweir     return XmlReader::NAMESPACE_UNKNOWN;
431cdf0e10cSrcweir }
432cdf0e10cSrcweir 
handleReference(char const * position,char const * end)433cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end)
434cdf0e10cSrcweir {
435cdf0e10cSrcweir     OSL_ASSERT(position != 0 && *position == '&' && position < end);
436cdf0e10cSrcweir     ++position;
437cdf0e10cSrcweir     if (*position == '#') {
438cdf0e10cSrcweir         ++position;
439cdf0e10cSrcweir         sal_Int32 val = 0;
440cdf0e10cSrcweir         char const * p;
441cdf0e10cSrcweir         if (*position == 'x') {
442cdf0e10cSrcweir             ++position;
443cdf0e10cSrcweir             p = position;
444cdf0e10cSrcweir             for (;; ++position) {
445cdf0e10cSrcweir                 char c = *position;
446cdf0e10cSrcweir                 if (c >= '0' && c <= '9') {
447cdf0e10cSrcweir                     val = 16 * val + (c - '0');
448cdf0e10cSrcweir                 } else if (c >= 'A' && c <= 'F') {
449cdf0e10cSrcweir                     val = 16 * val + (c - 'A') + 10;
450cdf0e10cSrcweir                 } else if (c >= 'a' && c <= 'f') {
451cdf0e10cSrcweir                     val = 16 * val + (c - 'a') + 10;
452cdf0e10cSrcweir                 } else {
453cdf0e10cSrcweir                     break;
454cdf0e10cSrcweir                 }
455cdf0e10cSrcweir                 if (val > 0x10FFFF) { // avoid overflow
456cdf0e10cSrcweir                     throw css::uno::RuntimeException(
457cdf0e10cSrcweir                         (rtl::OUString(
458cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
459cdf0e10cSrcweir                                 "'&#x...' too large in ")) +
460cdf0e10cSrcweir                          fileUrl_),
461cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
462cdf0e10cSrcweir                 }
463cdf0e10cSrcweir             }
464cdf0e10cSrcweir         } else {
465cdf0e10cSrcweir             p = position;
466cdf0e10cSrcweir             for (;; ++position) {
467cdf0e10cSrcweir                 char c = *position;
468cdf0e10cSrcweir                 if (c >= '0' && c <= '9') {
469cdf0e10cSrcweir                     val = 10 * val + (c - '0');
470cdf0e10cSrcweir                 } else {
471cdf0e10cSrcweir                     break;
472cdf0e10cSrcweir                 }
473cdf0e10cSrcweir                 if (val > 0x10FFFF) { // avoid overflow
474cdf0e10cSrcweir                     throw css::uno::RuntimeException(
475cdf0e10cSrcweir                         (rtl::OUString(
476cdf0e10cSrcweir                             RTL_CONSTASCII_USTRINGPARAM(
477cdf0e10cSrcweir                                 "'&#...' too large in ")) +
478cdf0e10cSrcweir                          fileUrl_),
479cdf0e10cSrcweir                         css::uno::Reference< css::uno::XInterface >());
480cdf0e10cSrcweir                 }
481cdf0e10cSrcweir             }
482cdf0e10cSrcweir         }
483cdf0e10cSrcweir         if (position == p || *position++ != ';') {
484cdf0e10cSrcweir             throw css::uno::RuntimeException(
485cdf0e10cSrcweir                 (rtl::OUString(
486cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
487cdf0e10cSrcweir                  fileUrl_),
488cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
489cdf0e10cSrcweir         }
490cdf0e10cSrcweir         OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
491cdf0e10cSrcweir         if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
492cdf0e10cSrcweir             (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
493cdf0e10cSrcweir         {
494cdf0e10cSrcweir             throw css::uno::RuntimeException(
495cdf0e10cSrcweir                 (rtl::OUString(
496cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
497cdf0e10cSrcweir                         "character reference denoting invalid character in ")) +
498cdf0e10cSrcweir                  fileUrl_),
499cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
500cdf0e10cSrcweir         }
501cdf0e10cSrcweir         char buf[4];
502cdf0e10cSrcweir         sal_Int32 len;
503cdf0e10cSrcweir         if (val < 0x80) {
504cdf0e10cSrcweir             buf[0] = static_cast< char >(val);
505cdf0e10cSrcweir             len = 1;
506cdf0e10cSrcweir         } else if (val < 0x800) {
507cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 6) | 0xC0);
508cdf0e10cSrcweir             buf[1] = static_cast< char >((val & 0x3F) | 0x80);
509cdf0e10cSrcweir             len = 2;
510cdf0e10cSrcweir         } else if (val < 0x10000) {
511cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 12) | 0xE0);
512cdf0e10cSrcweir             buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
513cdf0e10cSrcweir             buf[2] = static_cast< char >((val & 0x3F) | 0x80);
514cdf0e10cSrcweir             len = 3;
515cdf0e10cSrcweir         } else {
516cdf0e10cSrcweir             buf[0] = static_cast< char >((val >> 18) | 0xF0);
517cdf0e10cSrcweir             buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
518cdf0e10cSrcweir             buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
519cdf0e10cSrcweir             buf[3] = static_cast< char >((val & 0x3F) | 0x80);
520cdf0e10cSrcweir             len = 4;
521cdf0e10cSrcweir         }
522cdf0e10cSrcweir         pad_.addEphemeral(buf, len);
523cdf0e10cSrcweir         return position;
524cdf0e10cSrcweir     } else {
525cdf0e10cSrcweir         struct EntityRef {
526cdf0e10cSrcweir             char const * inBegin;
527cdf0e10cSrcweir             sal_Int32 inLength;
528cdf0e10cSrcweir             char const * outBegin;
529cdf0e10cSrcweir             sal_Int32 outLength;
530cdf0e10cSrcweir         };
531cdf0e10cSrcweir         static EntityRef const refs[] = {
532cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("amp;"),
533cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("&") },
534cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("lt;"),
535cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("<") },
536cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("gt;"),
537cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM(">") },
538cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("apos;"),
539cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("'") },
540cdf0e10cSrcweir             { RTL_CONSTASCII_STRINGPARAM("quot;"),
541cdf0e10cSrcweir               RTL_CONSTASCII_STRINGPARAM("\"") } };
542cdf0e10cSrcweir         for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
543cdf0e10cSrcweir             if (rtl_str_shortenedCompare_WithLength(
544cdf0e10cSrcweir                     position, end - position, refs[i].inBegin, refs[i].inLength,
545cdf0e10cSrcweir                     refs[i].inLength) ==
546cdf0e10cSrcweir                 0)
547cdf0e10cSrcweir             {
548cdf0e10cSrcweir                 position += refs[i].inLength;
549cdf0e10cSrcweir                 pad_.add(refs[i].outBegin, refs[i].outLength);
550cdf0e10cSrcweir                 return position;
551cdf0e10cSrcweir             }
552cdf0e10cSrcweir         }
553cdf0e10cSrcweir         throw css::uno::RuntimeException(
554cdf0e10cSrcweir             (rtl::OUString(
555cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
556cdf0e10cSrcweir              fileUrl_),
557cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
558cdf0e10cSrcweir     }
559cdf0e10cSrcweir }
560cdf0e10cSrcweir 
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)561cdf0e10cSrcweir Span XmlReader::handleAttributeValue(
562cdf0e10cSrcweir     char const * begin, char const * end, bool fullyNormalize)
563cdf0e10cSrcweir {
564cdf0e10cSrcweir     pad_.clear();
565cdf0e10cSrcweir     if (fullyNormalize) {
566cdf0e10cSrcweir         while (begin != end && isSpace(*begin)) {
567cdf0e10cSrcweir             ++begin;
568cdf0e10cSrcweir         }
569cdf0e10cSrcweir         while (end != begin && isSpace(end[-1])) {
570cdf0e10cSrcweir             --end;
571cdf0e10cSrcweir         }
572cdf0e10cSrcweir         char const * p = begin;
573cdf0e10cSrcweir         enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
574cdf0e10cSrcweir             // a single true space character can go into the current span,
575cdf0e10cSrcweir             // everything else breaks the span
576cdf0e10cSrcweir         Space space = SPACE_NONE;
577cdf0e10cSrcweir         while (p != end) {
578cdf0e10cSrcweir             switch (*p) {
579cdf0e10cSrcweir             case '\x09':
580cdf0e10cSrcweir             case '\x0A':
581cdf0e10cSrcweir             case '\x0D':
582cdf0e10cSrcweir                 switch (space) {
583cdf0e10cSrcweir                 case SPACE_NONE:
584cdf0e10cSrcweir                     pad_.add(begin, p - begin);
585cdf0e10cSrcweir                     pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
586cdf0e10cSrcweir                     space = SPACE_BREAK;
587cdf0e10cSrcweir                     break;
588cdf0e10cSrcweir                 case SPACE_SPAN:
589cdf0e10cSrcweir                     pad_.add(begin, p - begin);
590cdf0e10cSrcweir                     space = SPACE_BREAK;
591cdf0e10cSrcweir                     break;
592cdf0e10cSrcweir                 case SPACE_BREAK:
593cdf0e10cSrcweir                     break;
594cdf0e10cSrcweir                 }
595cdf0e10cSrcweir                 begin = ++p;
596cdf0e10cSrcweir                 break;
597cdf0e10cSrcweir             case ' ':
598cdf0e10cSrcweir                 switch (space) {
599cdf0e10cSrcweir                 case SPACE_NONE:
600cdf0e10cSrcweir                     ++p;
601cdf0e10cSrcweir                     space = SPACE_SPAN;
602cdf0e10cSrcweir                     break;
603cdf0e10cSrcweir                 case SPACE_SPAN:
604cdf0e10cSrcweir                     pad_.add(begin, p - begin);
605cdf0e10cSrcweir                     begin = ++p;
606cdf0e10cSrcweir                     space = SPACE_BREAK;
607cdf0e10cSrcweir                     break;
608cdf0e10cSrcweir                 case SPACE_BREAK:
609cdf0e10cSrcweir                     begin = ++p;
610cdf0e10cSrcweir                     break;
611cdf0e10cSrcweir                 }
612cdf0e10cSrcweir                 break;
613cdf0e10cSrcweir             case '&':
614cdf0e10cSrcweir                 pad_.add(begin, p - begin);
615cdf0e10cSrcweir                 p = handleReference(p, end);
616cdf0e10cSrcweir                 begin = p;
617cdf0e10cSrcweir                 space = SPACE_NONE;
618cdf0e10cSrcweir                 break;
619cdf0e10cSrcweir             default:
620cdf0e10cSrcweir                 ++p;
621cdf0e10cSrcweir                 space = SPACE_NONE;
622cdf0e10cSrcweir                 break;
623cdf0e10cSrcweir             }
624cdf0e10cSrcweir         }
625cdf0e10cSrcweir         pad_.add(begin, p - begin);
626cdf0e10cSrcweir     } else {
627cdf0e10cSrcweir         char const * p = begin;
628cdf0e10cSrcweir         while (p != end) {
629cdf0e10cSrcweir             switch (*p) {
630cdf0e10cSrcweir             case '\x09':
631cdf0e10cSrcweir             case '\x0A':
632cdf0e10cSrcweir                 pad_.add(begin, p - begin);
633cdf0e10cSrcweir                 begin = ++p;
634cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
635cdf0e10cSrcweir                 break;
636cdf0e10cSrcweir             case '\x0D':
637cdf0e10cSrcweir                 pad_.add(begin, p - begin);
638cdf0e10cSrcweir                 ++p;
639cdf0e10cSrcweir                 if (peek() == '\x0A') {
640cdf0e10cSrcweir                     ++p;
641cdf0e10cSrcweir                 }
642cdf0e10cSrcweir                 begin = p;
643cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
644cdf0e10cSrcweir                 break;
645cdf0e10cSrcweir             case '&':
646cdf0e10cSrcweir                 pad_.add(begin, p - begin);
647cdf0e10cSrcweir                 p = handleReference(p, end);
648cdf0e10cSrcweir                 begin = p;
649cdf0e10cSrcweir                 break;
650cdf0e10cSrcweir             default:
651cdf0e10cSrcweir                 ++p;
652cdf0e10cSrcweir                 break;
653cdf0e10cSrcweir             }
654cdf0e10cSrcweir         }
655cdf0e10cSrcweir         pad_.add(begin, p - begin);
656cdf0e10cSrcweir     }
657cdf0e10cSrcweir     return pad_.get();
658cdf0e10cSrcweir }
659cdf0e10cSrcweir 
handleStartTag(int * nsId,Span * localName)660cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
661cdf0e10cSrcweir     OSL_ASSERT(nsId != 0 && localName);
662cdf0e10cSrcweir     char const * nameBegin = pos_;
663cdf0e10cSrcweir     char const * nameColon = 0;
664cdf0e10cSrcweir     if (!scanName(&nameColon)) {
665cdf0e10cSrcweir         throw css::uno::RuntimeException(
666cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
667cdf0e10cSrcweir              fileUrl_),
668cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
669cdf0e10cSrcweir     }
670cdf0e10cSrcweir     char const * nameEnd = pos_;
671cdf0e10cSrcweir     NamespaceList::size_type inheritedNamespaces = namespaces_.size();
672cdf0e10cSrcweir     bool hasDefaultNs = false;
673cdf0e10cSrcweir     int defaultNsId = NAMESPACE_NONE;
674cdf0e10cSrcweir     attributes_.clear();
675cdf0e10cSrcweir     for (;;) {
676cdf0e10cSrcweir         char const * p = pos_;
677cdf0e10cSrcweir         skipSpace();
678cdf0e10cSrcweir         if (peek() == '/' || peek() == '>') {
679cdf0e10cSrcweir             break;
680cdf0e10cSrcweir         }
681cdf0e10cSrcweir         if (pos_ == p) {
682cdf0e10cSrcweir             throw css::uno::RuntimeException(
683cdf0e10cSrcweir                 (rtl::OUString(
684cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
685cdf0e10cSrcweir                         "missing whitespace before attribute in ")) +
686cdf0e10cSrcweir                  fileUrl_),
687cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
688cdf0e10cSrcweir         }
689cdf0e10cSrcweir         char const * attrNameBegin = pos_;
690cdf0e10cSrcweir         char const * attrNameColon = 0;
691cdf0e10cSrcweir         if (!scanName(&attrNameColon)) {
692cdf0e10cSrcweir             throw css::uno::RuntimeException(
693cdf0e10cSrcweir                 (rtl::OUString(
694cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
695cdf0e10cSrcweir                  fileUrl_),
696cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
697cdf0e10cSrcweir         }
698cdf0e10cSrcweir         char const * attrNameEnd = pos_;
699cdf0e10cSrcweir         skipSpace();
700cdf0e10cSrcweir         if (read() != '=') {
701cdf0e10cSrcweir             throw css::uno::RuntimeException(
702cdf0e10cSrcweir                 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
703cdf0e10cSrcweir                  fileUrl_),
704cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
705cdf0e10cSrcweir         }
706cdf0e10cSrcweir         skipSpace();
707cdf0e10cSrcweir         char del = read();
708cdf0e10cSrcweir         if (del != '\'' && del != '"') {
709cdf0e10cSrcweir             throw css::uno::RuntimeException(
710cdf0e10cSrcweir                 (rtl::OUString(
711cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
712cdf0e10cSrcweir                  fileUrl_),
713cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
714cdf0e10cSrcweir         }
715cdf0e10cSrcweir         char const * valueBegin = pos_;
716cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
717cdf0e10cSrcweir         if (i < 0) {
718cdf0e10cSrcweir             throw css::uno::RuntimeException(
719cdf0e10cSrcweir                 (rtl::OUString(
720cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM(
721cdf0e10cSrcweir                         "unterminated attribute value in ")) +
722cdf0e10cSrcweir                  fileUrl_),
723cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
724cdf0e10cSrcweir         }
725cdf0e10cSrcweir         char const * valueEnd = pos_ + i;
726cdf0e10cSrcweir         pos_ += i + 1;
727cdf0e10cSrcweir         if (attrNameColon == 0 &&
728cdf0e10cSrcweir             Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
729cdf0e10cSrcweir                 RTL_CONSTASCII_STRINGPARAM("xmlns")))
730cdf0e10cSrcweir         {
731cdf0e10cSrcweir             hasDefaultNs = true;
732cdf0e10cSrcweir             defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
733cdf0e10cSrcweir         } else if (attrNameColon != 0 &&
734cdf0e10cSrcweir                    Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
735cdf0e10cSrcweir                        RTL_CONSTASCII_STRINGPARAM("xmlns")))
736cdf0e10cSrcweir         {
737cdf0e10cSrcweir             namespaces_.push_back(
738cdf0e10cSrcweir                 NamespaceData(
739cdf0e10cSrcweir                     Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
740cdf0e10cSrcweir                     scanNamespaceIri(valueBegin, valueEnd)));
741cdf0e10cSrcweir         } else {
742cdf0e10cSrcweir             attributes_.push_back(
743cdf0e10cSrcweir                 AttributeData(
744cdf0e10cSrcweir                     attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
745cdf0e10cSrcweir                     valueEnd));
746cdf0e10cSrcweir         }
747cdf0e10cSrcweir     }
748cdf0e10cSrcweir     if (!hasDefaultNs && !elements_.empty()) {
749cdf0e10cSrcweir         defaultNsId = elements_.top().defaultNamespaceId;
750cdf0e10cSrcweir     }
751cdf0e10cSrcweir     firstAttribute_ = true;
752cdf0e10cSrcweir     if (peek() == '/') {
753cdf0e10cSrcweir         state_ = STATE_EMPTY_ELEMENT_TAG;
754cdf0e10cSrcweir         ++pos_;
755cdf0e10cSrcweir     } else {
756cdf0e10cSrcweir         state_ = STATE_CONTENT;
757cdf0e10cSrcweir     }
758cdf0e10cSrcweir     if (peek() != '>') {
759cdf0e10cSrcweir         throw css::uno::RuntimeException(
760cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
761cdf0e10cSrcweir              fileUrl_),
762cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
763cdf0e10cSrcweir     }
764cdf0e10cSrcweir     ++pos_;
765cdf0e10cSrcweir     elements_.push(
766cdf0e10cSrcweir         ElementData(
767cdf0e10cSrcweir             Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
768cdf0e10cSrcweir             defaultNsId));
769cdf0e10cSrcweir     if (nameColon == 0) {
770cdf0e10cSrcweir         *nsId = defaultNsId;
771cdf0e10cSrcweir         *localName = Span(nameBegin, nameEnd - nameBegin);
772cdf0e10cSrcweir     } else {
773cdf0e10cSrcweir         *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
774cdf0e10cSrcweir         *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
775cdf0e10cSrcweir     }
776cdf0e10cSrcweir     return RESULT_BEGIN;
777cdf0e10cSrcweir }
778cdf0e10cSrcweir 
handleEndTag()779cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() {
780cdf0e10cSrcweir     if (elements_.empty()) {
781cdf0e10cSrcweir         throw css::uno::RuntimeException(
782cdf0e10cSrcweir             (rtl::OUString(
783cdf0e10cSrcweir                 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
784cdf0e10cSrcweir              fileUrl_),
785cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
786cdf0e10cSrcweir     }
787cdf0e10cSrcweir     char const * nameBegin = pos_;
788cdf0e10cSrcweir     char const * nameColon = 0;
789cdf0e10cSrcweir     if (!scanName(&nameColon) ||
790cdf0e10cSrcweir         !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
791cdf0e10cSrcweir     {
792cdf0e10cSrcweir         throw css::uno::RuntimeException(
793cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
794cdf0e10cSrcweir              fileUrl_),
795cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
796cdf0e10cSrcweir     }
797cdf0e10cSrcweir     handleElementEnd();
798cdf0e10cSrcweir     skipSpace();
799cdf0e10cSrcweir     if (peek() != '>') {
800cdf0e10cSrcweir         throw css::uno::RuntimeException(
801cdf0e10cSrcweir             (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
802cdf0e10cSrcweir              fileUrl_),
803cdf0e10cSrcweir             css::uno::Reference< css::uno::XInterface >());
804cdf0e10cSrcweir     }
805cdf0e10cSrcweir     ++pos_;
806cdf0e10cSrcweir     return RESULT_END;
807cdf0e10cSrcweir }
808cdf0e10cSrcweir 
handleElementEnd()809cdf0e10cSrcweir void XmlReader::handleElementEnd() {
810cdf0e10cSrcweir     OSL_ASSERT(!elements_.empty());
811cdf0e10cSrcweir     namespaces_.resize(elements_.top().inheritedNamespaces);
812cdf0e10cSrcweir     elements_.pop();
813cdf0e10cSrcweir     state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
814cdf0e10cSrcweir }
815cdf0e10cSrcweir 
handleSkippedText(Span * data,int * nsId)816cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
817cdf0e10cSrcweir     for (;;) {
818cdf0e10cSrcweir         sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
819cdf0e10cSrcweir         if (i < 0) {
820cdf0e10cSrcweir             throw css::uno::RuntimeException(
821cdf0e10cSrcweir                 (rtl::OUString(
822cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
823cdf0e10cSrcweir                  fileUrl_),
824cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
825cdf0e10cSrcweir         }
826cdf0e10cSrcweir         pos_ += i + 1;
827cdf0e10cSrcweir         switch (peek()) {
828cdf0e10cSrcweir         case '!':
829cdf0e10cSrcweir             ++pos_;
830cdf0e10cSrcweir             if (!skipComment() && !scanCdataSection().is()) {
831cdf0e10cSrcweir                 skipDocumentTypeDeclaration();
832cdf0e10cSrcweir             }
833cdf0e10cSrcweir             break;
834cdf0e10cSrcweir         case '/':
835cdf0e10cSrcweir             ++pos_;
836cdf0e10cSrcweir             return handleEndTag();
837cdf0e10cSrcweir         case '?':
838cdf0e10cSrcweir             ++pos_;
839cdf0e10cSrcweir             skipProcessingInstruction();
840cdf0e10cSrcweir             break;
841cdf0e10cSrcweir         default:
842cdf0e10cSrcweir             return handleStartTag(nsId, data);
843cdf0e10cSrcweir         }
844cdf0e10cSrcweir     }
845cdf0e10cSrcweir }
846cdf0e10cSrcweir 
handleRawText(Span * text)847cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) {
848cdf0e10cSrcweir     pad_.clear();
849cdf0e10cSrcweir     for (char const * begin = pos_;;) {
850cdf0e10cSrcweir         switch (peek()) {
851cdf0e10cSrcweir         case '\0': // i.e., EOF
852cdf0e10cSrcweir             throw css::uno::RuntimeException(
853cdf0e10cSrcweir                 (rtl::OUString(
854cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
855cdf0e10cSrcweir                  fileUrl_),
856cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
857cdf0e10cSrcweir         case '\x0D':
858cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
859cdf0e10cSrcweir             ++pos_;
860cdf0e10cSrcweir             if (peek() != '\x0A') {
861cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
862cdf0e10cSrcweir             }
863cdf0e10cSrcweir             begin = pos_;
864cdf0e10cSrcweir             break;
865cdf0e10cSrcweir         case '&':
866cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
867cdf0e10cSrcweir             pos_ = handleReference(pos_, end_);
868cdf0e10cSrcweir             begin = pos_;
869cdf0e10cSrcweir             break;
870cdf0e10cSrcweir         case '<':
871cdf0e10cSrcweir             pad_.add(begin, pos_ - begin);
872cdf0e10cSrcweir             ++pos_;
873cdf0e10cSrcweir             switch (peek()) {
874cdf0e10cSrcweir             case '!':
875cdf0e10cSrcweir                 ++pos_;
876cdf0e10cSrcweir                 if (!skipComment()) {
877cdf0e10cSrcweir                     Span cdata(scanCdataSection());
878cdf0e10cSrcweir                     if (cdata.is()) {
879cdf0e10cSrcweir                         normalizeLineEnds(cdata);
880cdf0e10cSrcweir                     } else {
881cdf0e10cSrcweir                         skipDocumentTypeDeclaration();
882cdf0e10cSrcweir                     }
883cdf0e10cSrcweir                 }
884cdf0e10cSrcweir                 begin = pos_;
885cdf0e10cSrcweir                 break;
886cdf0e10cSrcweir             case '/':
887cdf0e10cSrcweir                 *text = pad_.get();
888cdf0e10cSrcweir                 ++pos_;
889cdf0e10cSrcweir                 state_ = STATE_END_TAG;
890cdf0e10cSrcweir                 return RESULT_TEXT;
891cdf0e10cSrcweir             case '?':
892cdf0e10cSrcweir                 ++pos_;
893cdf0e10cSrcweir                 skipProcessingInstruction();
894cdf0e10cSrcweir                 begin = pos_;
895cdf0e10cSrcweir                 break;
896cdf0e10cSrcweir             default:
897cdf0e10cSrcweir                 *text = pad_.get();
898cdf0e10cSrcweir                 state_ = STATE_START_TAG;
899cdf0e10cSrcweir                 return RESULT_TEXT;
900cdf0e10cSrcweir             }
901cdf0e10cSrcweir             break;
902cdf0e10cSrcweir         default:
903cdf0e10cSrcweir             ++pos_;
904cdf0e10cSrcweir             break;
905cdf0e10cSrcweir         }
906cdf0e10cSrcweir     }
907cdf0e10cSrcweir }
908cdf0e10cSrcweir 
handleNormalizedText(Span * text)909cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
910cdf0e10cSrcweir     pad_.clear();
911cdf0e10cSrcweir     char const * flowBegin = pos_;
912cdf0e10cSrcweir     char const * flowEnd = pos_;
913cdf0e10cSrcweir     enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
914cdf0e10cSrcweir         // a single true space character can go into the current flow,
915cdf0e10cSrcweir         // everything else breaks the flow
916cdf0e10cSrcweir     Space space = SPACE_START;
917cdf0e10cSrcweir     for (;;) {
918cdf0e10cSrcweir         switch (peek()) {
919cdf0e10cSrcweir         case '\0': // i.e., EOF
920cdf0e10cSrcweir             throw css::uno::RuntimeException(
921cdf0e10cSrcweir                 (rtl::OUString(
922cdf0e10cSrcweir                     RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
923cdf0e10cSrcweir                  fileUrl_),
924cdf0e10cSrcweir                 css::uno::Reference< css::uno::XInterface >());
925cdf0e10cSrcweir         case '\x09':
926cdf0e10cSrcweir         case '\x0A':
927cdf0e10cSrcweir         case '\x0D':
928cdf0e10cSrcweir             switch (space) {
929cdf0e10cSrcweir             case SPACE_START:
930cdf0e10cSrcweir             case SPACE_BREAK:
931cdf0e10cSrcweir                 break;
932cdf0e10cSrcweir             case SPACE_NONE:
933cdf0e10cSrcweir             case SPACE_SPAN:
934cdf0e10cSrcweir                 space = SPACE_BREAK;
935cdf0e10cSrcweir                 break;
936cdf0e10cSrcweir             }
937cdf0e10cSrcweir             ++pos_;
938cdf0e10cSrcweir             break;
939cdf0e10cSrcweir         case ' ':
940cdf0e10cSrcweir             switch (space) {
941cdf0e10cSrcweir             case SPACE_START:
942cdf0e10cSrcweir             case SPACE_BREAK:
943cdf0e10cSrcweir                 break;
944cdf0e10cSrcweir             case SPACE_NONE:
945cdf0e10cSrcweir                 space = SPACE_SPAN;
946cdf0e10cSrcweir                 break;
947cdf0e10cSrcweir             case SPACE_SPAN:
948cdf0e10cSrcweir                 space = SPACE_BREAK;
949cdf0e10cSrcweir                 break;
950cdf0e10cSrcweir             }
951cdf0e10cSrcweir             ++pos_;
952cdf0e10cSrcweir             break;
953cdf0e10cSrcweir         case '&':
954cdf0e10cSrcweir             switch (space) {
955cdf0e10cSrcweir             case SPACE_START:
956cdf0e10cSrcweir                 break;
957cdf0e10cSrcweir             case SPACE_NONE:
958cdf0e10cSrcweir             case SPACE_SPAN:
959cdf0e10cSrcweir                 pad_.add(flowBegin, pos_ - flowBegin);
960cdf0e10cSrcweir                 break;
961cdf0e10cSrcweir             case SPACE_BREAK:
962cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
963cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
964cdf0e10cSrcweir                 break;
965cdf0e10cSrcweir             }
966cdf0e10cSrcweir             pos_ = handleReference(pos_, end_);
967cdf0e10cSrcweir             flowBegin = pos_;
968cdf0e10cSrcweir             flowEnd = pos_;
969cdf0e10cSrcweir             space = SPACE_NONE;
970cdf0e10cSrcweir             break;
971cdf0e10cSrcweir         case '<':
972cdf0e10cSrcweir             ++pos_;
973cdf0e10cSrcweir             switch (peek()) {
974cdf0e10cSrcweir             case '!':
975cdf0e10cSrcweir                 ++pos_;
976cdf0e10cSrcweir                 if (skipComment()) {
977cdf0e10cSrcweir                     space = SPACE_BREAK;
978cdf0e10cSrcweir                 } else {
979cdf0e10cSrcweir                     Span cdata(scanCdataSection());
980cdf0e10cSrcweir                     if (cdata.is()) {
981cdf0e10cSrcweir                         // CDATA is not normalized (similar to character
982cdf0e10cSrcweir                         // references; it keeps the code simple), but it might
983cdf0e10cSrcweir                         // arguably be better to normalize it:
984cdf0e10cSrcweir                         switch (space) {
985cdf0e10cSrcweir                         case SPACE_START:
986cdf0e10cSrcweir                             break;
987cdf0e10cSrcweir                         case SPACE_NONE:
988cdf0e10cSrcweir                         case SPACE_SPAN:
989cdf0e10cSrcweir                             pad_.add(flowBegin, pos_ - flowBegin);
990cdf0e10cSrcweir                             break;
991cdf0e10cSrcweir                         case SPACE_BREAK:
992cdf0e10cSrcweir                             pad_.add(flowBegin, flowEnd - flowBegin);
993cdf0e10cSrcweir                             pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
994cdf0e10cSrcweir                             break;
995cdf0e10cSrcweir                         }
996cdf0e10cSrcweir                         normalizeLineEnds(cdata);
997cdf0e10cSrcweir                         flowBegin = pos_;
998cdf0e10cSrcweir                         flowEnd = pos_;
999cdf0e10cSrcweir                         space = SPACE_NONE;
1000cdf0e10cSrcweir                     } else {
1001cdf0e10cSrcweir                         skipDocumentTypeDeclaration();
1002cdf0e10cSrcweir                     }
1003cdf0e10cSrcweir                 }
1004cdf0e10cSrcweir                 break;
1005cdf0e10cSrcweir             case '/':
1006cdf0e10cSrcweir                 ++pos_;
1007cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1008cdf0e10cSrcweir                 *text = pad_.get();
1009cdf0e10cSrcweir                 state_ = STATE_END_TAG;
1010cdf0e10cSrcweir                 return RESULT_TEXT;
1011cdf0e10cSrcweir             case '?':
1012cdf0e10cSrcweir                 ++pos_;
1013cdf0e10cSrcweir                 skipProcessingInstruction();
1014cdf0e10cSrcweir                 space = SPACE_BREAK;
1015cdf0e10cSrcweir                 break;
1016cdf0e10cSrcweir             default:
1017cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1018cdf0e10cSrcweir                 *text = pad_.get();
1019cdf0e10cSrcweir                 state_ = STATE_START_TAG;
1020cdf0e10cSrcweir                 return RESULT_TEXT;
1021cdf0e10cSrcweir             }
1022cdf0e10cSrcweir             break;
1023cdf0e10cSrcweir         default:
1024cdf0e10cSrcweir             switch (space) {
1025cdf0e10cSrcweir             case SPACE_START:
1026cdf0e10cSrcweir                 flowBegin = pos_;
1027cdf0e10cSrcweir                 break;
1028cdf0e10cSrcweir             case SPACE_NONE:
1029cdf0e10cSrcweir             case SPACE_SPAN:
1030cdf0e10cSrcweir                 break;
1031cdf0e10cSrcweir             case SPACE_BREAK:
1032cdf0e10cSrcweir                 pad_.add(flowBegin, flowEnd - flowBegin);
1033cdf0e10cSrcweir                 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1034cdf0e10cSrcweir                 flowBegin = pos_;
1035cdf0e10cSrcweir                 break;
1036cdf0e10cSrcweir             }
1037cdf0e10cSrcweir             flowEnd = ++pos_;
1038cdf0e10cSrcweir             space = SPACE_NONE;
1039cdf0e10cSrcweir             break;
1040cdf0e10cSrcweir         }
1041cdf0e10cSrcweir     }
1042cdf0e10cSrcweir }
1043cdf0e10cSrcweir 
toNamespaceId(NamespaceIris::size_type pos)1044cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1045cdf0e10cSrcweir     OSL_ASSERT(pos <= INT_MAX);
1046cdf0e10cSrcweir     return static_cast< int >(pos);
1047cdf0e10cSrcweir }
1048cdf0e10cSrcweir 
1049cdf0e10cSrcweir }
1050