1*b725e8ebSAndrew Rist /**************************************************************
2*b725e8ebSAndrew Rist *
3*b725e8ebSAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
4*b725e8ebSAndrew Rist * or more contributor license agreements. See the NOTICE file
5*b725e8ebSAndrew Rist * distributed with this work for additional information
6*b725e8ebSAndrew Rist * regarding copyright ownership. The ASF licenses this file
7*b725e8ebSAndrew Rist * to you under the Apache License, Version 2.0 (the
8*b725e8ebSAndrew Rist * "License"); you may not use this file except in compliance
9*b725e8ebSAndrew Rist * with the License. You may obtain a copy of the License at
10*b725e8ebSAndrew Rist *
11*b725e8ebSAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0
12*b725e8ebSAndrew Rist *
13*b725e8ebSAndrew Rist * Unless required by applicable law or agreed to in writing,
14*b725e8ebSAndrew Rist * software distributed under the License is distributed on an
15*b725e8ebSAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*b725e8ebSAndrew Rist * KIND, either express or implied. See the License for the
17*b725e8ebSAndrew Rist * specific language governing permissions and limitations
18*b725e8ebSAndrew Rist * under the License.
19*b725e8ebSAndrew Rist *
20*b725e8ebSAndrew Rist *************************************************************/
21*b725e8ebSAndrew Rist
22*b725e8ebSAndrew Rist
23cdf0e10cSrcweir
24cdf0e10cSrcweir #include "precompiled_xmlreader.hxx"
25cdf0e10cSrcweir #include "sal/config.h"
26cdf0e10cSrcweir
27cdf0e10cSrcweir #include <climits>
28cdf0e10cSrcweir #include <cstddef>
29cdf0e10cSrcweir
30cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp"
31cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx"
32cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp"
33cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp"
34cdf0e10cSrcweir #include "osl/diagnose.h"
35cdf0e10cSrcweir #include "osl/file.h"
36cdf0e10cSrcweir #include "rtl/string.h"
37cdf0e10cSrcweir #include "rtl/ustring.h"
38cdf0e10cSrcweir #include "rtl/ustring.hxx"
39cdf0e10cSrcweir #include "sal/types.h"
40cdf0e10cSrcweir #include "xmlreader/pad.hxx"
41cdf0e10cSrcweir #include "xmlreader/span.hxx"
42cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx"
43cdf0e10cSrcweir
44cdf0e10cSrcweir namespace xmlreader {
45cdf0e10cSrcweir
46cdf0e10cSrcweir namespace {
47cdf0e10cSrcweir
48cdf0e10cSrcweir namespace css = com::sun::star;
49cdf0e10cSrcweir
// Tells whether c is one of the four whitespace characters this reader
// recognizes: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
61cdf0e10cSrcweir
62cdf0e10cSrcweir }
63cdf0e10cSrcweir
XmlReader(rtl::OUString const & fileUrl)64cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl)
65cdf0e10cSrcweir SAL_THROW((
66cdf0e10cSrcweir css::container::NoSuchElementException, css::uno::RuntimeException)):
67cdf0e10cSrcweir fileUrl_(fileUrl)
68cdf0e10cSrcweir {
69cdf0e10cSrcweir switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
70cdf0e10cSrcweir {
71cdf0e10cSrcweir case osl_File_E_None:
72cdf0e10cSrcweir break;
73cdf0e10cSrcweir case osl_File_E_NOENT:
74cdf0e10cSrcweir throw css::container::NoSuchElementException(
75cdf0e10cSrcweir fileUrl_, css::uno::Reference< css::uno::XInterface >());
76cdf0e10cSrcweir default:
77cdf0e10cSrcweir throw css::uno::RuntimeException(
78cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
79cdf0e10cSrcweir fileUrl_),
80cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
81cdf0e10cSrcweir }
82cdf0e10cSrcweir oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
83cdf0e10cSrcweir if (e == osl_File_E_None) {
84cdf0e10cSrcweir e = osl_mapFile(
85cdf0e10cSrcweir fileHandle_, &fileAddress_, fileSize_, 0,
86cdf0e10cSrcweir osl_File_MapFlag_WillNeed);
87cdf0e10cSrcweir }
88cdf0e10cSrcweir if (e != osl_File_E_None) {
89cdf0e10cSrcweir e = osl_closeFile(fileHandle_);
90cdf0e10cSrcweir if (e != osl_File_E_None) {
91cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
92cdf0e10cSrcweir }
93cdf0e10cSrcweir throw css::uno::RuntimeException(
94cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
95cdf0e10cSrcweir fileUrl_),
96cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
97cdf0e10cSrcweir }
98cdf0e10cSrcweir namespaceIris_.push_back(
99cdf0e10cSrcweir Span(
100cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(
101cdf0e10cSrcweir "http://www.w3.org/XML/1998/namespace")));
102cdf0e10cSrcweir namespaces_.push_back(
103cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
104cdf0e10cSrcweir pos_ = static_cast< char * >(fileAddress_);
105cdf0e10cSrcweir end_ = pos_ + fileSize_;
106cdf0e10cSrcweir state_ = STATE_CONTENT;
107cdf0e10cSrcweir }
108cdf0e10cSrcweir
~XmlReader()109cdf0e10cSrcweir XmlReader::~XmlReader() {
110cdf0e10cSrcweir oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
111cdf0e10cSrcweir if (e != osl_File_E_None) {
112cdf0e10cSrcweir OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
113cdf0e10cSrcweir }
114cdf0e10cSrcweir e = osl_closeFile(fileHandle_);
115cdf0e10cSrcweir if (e != osl_File_E_None) {
116cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
117cdf0e10cSrcweir }
118cdf0e10cSrcweir }
119cdf0e10cSrcweir
registerNamespaceIri(Span const & iri)120cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) {
121cdf0e10cSrcweir int id = toNamespaceId(namespaceIris_.size());
122cdf0e10cSrcweir namespaceIris_.push_back(iri);
123cdf0e10cSrcweir if (iri.equals(
124cdf0e10cSrcweir Span(
125cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(
126cdf0e10cSrcweir "http://www.w3.org/2001/XMLSchema-instance"))))
127cdf0e10cSrcweir {
128cdf0e10cSrcweir // Old user layer .xcu files used the xsi namespace prefix without
129cdf0e10cSrcweir // declaring a corresponding namespace binding, see issue 77174; reading
130cdf0e10cSrcweir // those files during migration would fail without this hack that can be
131cdf0e10cSrcweir // removed once migration is no longer relevant (see
132cdf0e10cSrcweir // configmgr::Components::parseModificationLayer):
133cdf0e10cSrcweir namespaces_.push_back(
134cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
135cdf0e10cSrcweir }
136cdf0e10cSrcweir return id;
137cdf0e10cSrcweir }
138cdf0e10cSrcweir
nextItem(Text reportText,Span * data,int * nsId)139cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
140cdf0e10cSrcweir {
141cdf0e10cSrcweir switch (state_) {
142cdf0e10cSrcweir case STATE_CONTENT:
143cdf0e10cSrcweir switch (reportText) {
144cdf0e10cSrcweir case TEXT_NONE:
145cdf0e10cSrcweir return handleSkippedText(data, nsId);
146cdf0e10cSrcweir case TEXT_RAW:
147cdf0e10cSrcweir return handleRawText(data);
148cdf0e10cSrcweir case TEXT_NORMALIZED:
149cdf0e10cSrcweir return handleNormalizedText(data);
150cdf0e10cSrcweir }
151cdf0e10cSrcweir case STATE_START_TAG:
152cdf0e10cSrcweir return handleStartTag(nsId, data);
153cdf0e10cSrcweir case STATE_END_TAG:
154cdf0e10cSrcweir return handleEndTag();
155cdf0e10cSrcweir case STATE_EMPTY_ELEMENT_TAG:
156cdf0e10cSrcweir handleElementEnd();
157cdf0e10cSrcweir return RESULT_END;
158cdf0e10cSrcweir default: // STATE_DONE
159cdf0e10cSrcweir return RESULT_DONE;
160cdf0e10cSrcweir }
161cdf0e10cSrcweir }
162cdf0e10cSrcweir
nextAttribute(int * nsId,Span * localName)163cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) {
164cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName != 0);
165cdf0e10cSrcweir if (firstAttribute_) {
166cdf0e10cSrcweir currentAttribute_ = attributes_.begin();
167cdf0e10cSrcweir firstAttribute_ = false;
168cdf0e10cSrcweir } else {
169cdf0e10cSrcweir ++currentAttribute_;
170cdf0e10cSrcweir }
171cdf0e10cSrcweir if (currentAttribute_ == attributes_.end()) {
172cdf0e10cSrcweir return false;
173cdf0e10cSrcweir }
174cdf0e10cSrcweir if (currentAttribute_->nameColon == 0) {
175cdf0e10cSrcweir *nsId = NAMESPACE_NONE;
176cdf0e10cSrcweir *localName = Span(
177cdf0e10cSrcweir currentAttribute_->nameBegin,
178cdf0e10cSrcweir currentAttribute_->nameEnd - currentAttribute_->nameBegin);
179cdf0e10cSrcweir } else {
180cdf0e10cSrcweir *nsId = getNamespaceId(
181cdf0e10cSrcweir Span(
182cdf0e10cSrcweir currentAttribute_->nameBegin,
183cdf0e10cSrcweir currentAttribute_->nameColon - currentAttribute_->nameBegin));
184cdf0e10cSrcweir *localName = Span(
185cdf0e10cSrcweir currentAttribute_->nameColon + 1,
186cdf0e10cSrcweir currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
187cdf0e10cSrcweir }
188cdf0e10cSrcweir return true;
189cdf0e10cSrcweir }
190cdf0e10cSrcweir
getAttributeValue(bool fullyNormalize)191cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) {
192cdf0e10cSrcweir return handleAttributeValue(
193cdf0e10cSrcweir currentAttribute_->valueBegin, currentAttribute_->valueEnd,
194cdf0e10cSrcweir fullyNormalize);
195cdf0e10cSrcweir }
196cdf0e10cSrcweir
getNamespaceId(Span const & prefix) const197cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const {
198cdf0e10cSrcweir for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
199cdf0e10cSrcweir i != namespaces_.rend(); ++i)
200cdf0e10cSrcweir {
201cdf0e10cSrcweir if (prefix.equals(i->prefix)) {
202cdf0e10cSrcweir return i->nsId;
203cdf0e10cSrcweir }
204cdf0e10cSrcweir }
205cdf0e10cSrcweir return NAMESPACE_UNKNOWN;
206cdf0e10cSrcweir }
207cdf0e10cSrcweir
getUrl() const208cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const {
209cdf0e10cSrcweir return fileUrl_;
210cdf0e10cSrcweir }
211cdf0e10cSrcweir
normalizeLineEnds(Span const & text)212cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) {
213cdf0e10cSrcweir char const * p = text.begin;
214cdf0e10cSrcweir sal_Int32 n = text.length;
215cdf0e10cSrcweir for (;;) {
216cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
217cdf0e10cSrcweir if (i < 0) {
218cdf0e10cSrcweir break;
219cdf0e10cSrcweir }
220cdf0e10cSrcweir pad_.add(p, i);
221cdf0e10cSrcweir p += i + 1;
222cdf0e10cSrcweir n -= i + 1;
223cdf0e10cSrcweir if (n == 0 || *p != '\x0A') {
224cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
225cdf0e10cSrcweir }
226cdf0e10cSrcweir }
227cdf0e10cSrcweir pad_.add(p, n);
228cdf0e10cSrcweir }
229cdf0e10cSrcweir
skipSpace()230cdf0e10cSrcweir void XmlReader::skipSpace() {
231cdf0e10cSrcweir while (isSpace(peek())) {
232cdf0e10cSrcweir ++pos_;
233cdf0e10cSrcweir }
234cdf0e10cSrcweir }
235cdf0e10cSrcweir
skipComment()236cdf0e10cSrcweir bool XmlReader::skipComment() {
237cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength(
238cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
239cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("--")) !=
240cdf0e10cSrcweir 0)
241cdf0e10cSrcweir {
242cdf0e10cSrcweir return false;
243cdf0e10cSrcweir }
244cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("--");
245cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength(
246cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
247cdf0e10cSrcweir if (i < 0) {
248cdf0e10cSrcweir throw css::uno::RuntimeException(
249cdf0e10cSrcweir (rtl::OUString(
250cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
251cdf0e10cSrcweir "premature end (within comment) of ")) +
252cdf0e10cSrcweir fileUrl_),
253cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
254cdf0e10cSrcweir }
255cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("--");
256cdf0e10cSrcweir if (read() != '>') {
257cdf0e10cSrcweir throw css::uno::RuntimeException(
258cdf0e10cSrcweir (rtl::OUString(
259cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
260cdf0e10cSrcweir "illegal \"--\" within comment in ")) +
261cdf0e10cSrcweir fileUrl_),
262cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
263cdf0e10cSrcweir }
264cdf0e10cSrcweir return true;
265cdf0e10cSrcweir }
266cdf0e10cSrcweir
skipProcessingInstruction()267cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() {
268cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength(
269cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
270cdf0e10cSrcweir if (i < 0) {
271cdf0e10cSrcweir throw css::uno::RuntimeException(
272cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
273cdf0e10cSrcweir fileUrl_),
274cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
275cdf0e10cSrcweir }
276cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("?>");
277cdf0e10cSrcweir }
278cdf0e10cSrcweir
skipDocumentTypeDeclaration()279cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() {
280cdf0e10cSrcweir // Neither is it checked that the doctypedecl is at the correct position in
281cdf0e10cSrcweir // the document, nor that it is well-formed:
282cdf0e10cSrcweir for (;;) {
283cdf0e10cSrcweir char c = read();
284cdf0e10cSrcweir switch (c) {
285cdf0e10cSrcweir case '\0': // i.e., EOF
286cdf0e10cSrcweir throw css::uno::RuntimeException(
287cdf0e10cSrcweir (rtl::OUString(
288cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
289cdf0e10cSrcweir "premature end (within DTD) of ")) +
290cdf0e10cSrcweir fileUrl_),
291cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
292cdf0e10cSrcweir case '"':
293cdf0e10cSrcweir case '\'':
294cdf0e10cSrcweir {
295cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(
296cdf0e10cSrcweir pos_, end_ - pos_, c);
297cdf0e10cSrcweir if (i < 0) {
298cdf0e10cSrcweir throw css::uno::RuntimeException(
299cdf0e10cSrcweir (rtl::OUString(
300cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
301cdf0e10cSrcweir "premature end (within DTD) of ")) +
302cdf0e10cSrcweir fileUrl_),
303cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
304cdf0e10cSrcweir }
305cdf0e10cSrcweir pos_ += i + 1;
306cdf0e10cSrcweir }
307cdf0e10cSrcweir break;
308cdf0e10cSrcweir case '>':
309cdf0e10cSrcweir return;
310cdf0e10cSrcweir case '[':
311cdf0e10cSrcweir for (;;) {
312cdf0e10cSrcweir c = read();
313cdf0e10cSrcweir switch (c) {
314cdf0e10cSrcweir case '\0': // i.e., EOF
315cdf0e10cSrcweir throw css::uno::RuntimeException(
316cdf0e10cSrcweir (rtl::OUString(
317cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
318cdf0e10cSrcweir "premature end (within DTD) of ")) +
319cdf0e10cSrcweir fileUrl_),
320cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
321cdf0e10cSrcweir case '"':
322cdf0e10cSrcweir case '\'':
323cdf0e10cSrcweir {
324cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(
325cdf0e10cSrcweir pos_, end_ - pos_, c);
326cdf0e10cSrcweir if (i < 0) {
327cdf0e10cSrcweir throw css::uno::RuntimeException(
328cdf0e10cSrcweir (rtl::OUString(
329cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
330cdf0e10cSrcweir "premature end (within DTD) of ")) +
331cdf0e10cSrcweir fileUrl_),
332cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
333cdf0e10cSrcweir }
334cdf0e10cSrcweir pos_ += i + 1;
335cdf0e10cSrcweir }
336cdf0e10cSrcweir break;
337cdf0e10cSrcweir case '<':
338cdf0e10cSrcweir switch (read()) {
339cdf0e10cSrcweir case '\0': // i.e., EOF
340cdf0e10cSrcweir throw css::uno::RuntimeException(
341cdf0e10cSrcweir (rtl::OUString(
342cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
343cdf0e10cSrcweir "premature end (within DTD) of ")) +
344cdf0e10cSrcweir fileUrl_),
345cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
346cdf0e10cSrcweir case '!':
347cdf0e10cSrcweir skipComment();
348cdf0e10cSrcweir break;
349cdf0e10cSrcweir case '?':
350cdf0e10cSrcweir skipProcessingInstruction();
351cdf0e10cSrcweir break;
352cdf0e10cSrcweir default:
353cdf0e10cSrcweir break;
354cdf0e10cSrcweir }
355cdf0e10cSrcweir break;
356cdf0e10cSrcweir case ']':
357cdf0e10cSrcweir skipSpace();
358cdf0e10cSrcweir if (read() != '>') {
359cdf0e10cSrcweir throw css::uno::RuntimeException(
360cdf0e10cSrcweir (rtl::OUString(
361cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
362cdf0e10cSrcweir "missing \">\" of DTD in ")) +
363cdf0e10cSrcweir fileUrl_),
364cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
365cdf0e10cSrcweir }
366cdf0e10cSrcweir return;
367cdf0e10cSrcweir default:
368cdf0e10cSrcweir break;
369cdf0e10cSrcweir }
370cdf0e10cSrcweir }
371cdf0e10cSrcweir default:
372cdf0e10cSrcweir break;
373cdf0e10cSrcweir }
374cdf0e10cSrcweir }
375cdf0e10cSrcweir }
376cdf0e10cSrcweir
scanCdataSection()377cdf0e10cSrcweir Span XmlReader::scanCdataSection() {
378cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength(
379cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
380cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("[CDATA[")) !=
381cdf0e10cSrcweir 0)
382cdf0e10cSrcweir {
383cdf0e10cSrcweir return Span();
384cdf0e10cSrcweir }
385cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
386cdf0e10cSrcweir char const * begin = pos_;
387cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength(
388cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
389cdf0e10cSrcweir if (i < 0) {
390cdf0e10cSrcweir throw css::uno::RuntimeException(
391cdf0e10cSrcweir (rtl::OUString(
392cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
393cdf0e10cSrcweir "premature end (within CDATA section) of ")) +
394cdf0e10cSrcweir fileUrl_),
395cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
396cdf0e10cSrcweir }
397cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
398cdf0e10cSrcweir return Span(begin, i);
399cdf0e10cSrcweir }
400cdf0e10cSrcweir
scanName(char const ** nameColon)401cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) {
402cdf0e10cSrcweir OSL_ASSERT(nameColon != 0 && *nameColon == 0);
403cdf0e10cSrcweir for (char const * begin = pos_;; ++pos_) {
404cdf0e10cSrcweir switch (peek()) {
405cdf0e10cSrcweir case '\0': // i.e., EOF
406cdf0e10cSrcweir case '\x09':
407cdf0e10cSrcweir case '\x0A':
408cdf0e10cSrcweir case '\x0D':
409cdf0e10cSrcweir case ' ':
410cdf0e10cSrcweir case '/':
411cdf0e10cSrcweir case '=':
412cdf0e10cSrcweir case '>':
413cdf0e10cSrcweir return pos_ != begin;
414cdf0e10cSrcweir case ':':
415cdf0e10cSrcweir *nameColon = pos_;
416cdf0e10cSrcweir break;
417cdf0e10cSrcweir default:
418cdf0e10cSrcweir break;
419cdf0e10cSrcweir }
420cdf0e10cSrcweir }
421cdf0e10cSrcweir }
422cdf0e10cSrcweir
scanNamespaceIri(char const * begin,char const * end)423cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
424cdf0e10cSrcweir OSL_ASSERT(begin != 0 && begin <= end);
425cdf0e10cSrcweir Span iri(handleAttributeValue(begin, end, false));
426cdf0e10cSrcweir for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
427cdf0e10cSrcweir if (namespaceIris_[i].equals(iri)) {
428cdf0e10cSrcweir return toNamespaceId(i);
429cdf0e10cSrcweir }
430cdf0e10cSrcweir }
431cdf0e10cSrcweir return XmlReader::NAMESPACE_UNKNOWN;
432cdf0e10cSrcweir }
433cdf0e10cSrcweir
handleReference(char const * position,char const * end)434cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end)
435cdf0e10cSrcweir {
436cdf0e10cSrcweir OSL_ASSERT(position != 0 && *position == '&' && position < end);
437cdf0e10cSrcweir ++position;
438cdf0e10cSrcweir if (*position == '#') {
439cdf0e10cSrcweir ++position;
440cdf0e10cSrcweir sal_Int32 val = 0;
441cdf0e10cSrcweir char const * p;
442cdf0e10cSrcweir if (*position == 'x') {
443cdf0e10cSrcweir ++position;
444cdf0e10cSrcweir p = position;
445cdf0e10cSrcweir for (;; ++position) {
446cdf0e10cSrcweir char c = *position;
447cdf0e10cSrcweir if (c >= '0' && c <= '9') {
448cdf0e10cSrcweir val = 16 * val + (c - '0');
449cdf0e10cSrcweir } else if (c >= 'A' && c <= 'F') {
450cdf0e10cSrcweir val = 16 * val + (c - 'A') + 10;
451cdf0e10cSrcweir } else if (c >= 'a' && c <= 'f') {
452cdf0e10cSrcweir val = 16 * val + (c - 'a') + 10;
453cdf0e10cSrcweir } else {
454cdf0e10cSrcweir break;
455cdf0e10cSrcweir }
456cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow
457cdf0e10cSrcweir throw css::uno::RuntimeException(
458cdf0e10cSrcweir (rtl::OUString(
459cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
460cdf0e10cSrcweir "'&#x...' too large in ")) +
461cdf0e10cSrcweir fileUrl_),
462cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
463cdf0e10cSrcweir }
464cdf0e10cSrcweir }
465cdf0e10cSrcweir } else {
466cdf0e10cSrcweir p = position;
467cdf0e10cSrcweir for (;; ++position) {
468cdf0e10cSrcweir char c = *position;
469cdf0e10cSrcweir if (c >= '0' && c <= '9') {
470cdf0e10cSrcweir val = 10 * val + (c - '0');
471cdf0e10cSrcweir } else {
472cdf0e10cSrcweir break;
473cdf0e10cSrcweir }
474cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow
475cdf0e10cSrcweir throw css::uno::RuntimeException(
476cdf0e10cSrcweir (rtl::OUString(
477cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
478cdf0e10cSrcweir "'&#...' too large in ")) +
479cdf0e10cSrcweir fileUrl_),
480cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
481cdf0e10cSrcweir }
482cdf0e10cSrcweir }
483cdf0e10cSrcweir }
484cdf0e10cSrcweir if (position == p || *position++ != ';') {
485cdf0e10cSrcweir throw css::uno::RuntimeException(
486cdf0e10cSrcweir (rtl::OUString(
487cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
488cdf0e10cSrcweir fileUrl_),
489cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
490cdf0e10cSrcweir }
491cdf0e10cSrcweir OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
492cdf0e10cSrcweir if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
493cdf0e10cSrcweir (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
494cdf0e10cSrcweir {
495cdf0e10cSrcweir throw css::uno::RuntimeException(
496cdf0e10cSrcweir (rtl::OUString(
497cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
498cdf0e10cSrcweir "character reference denoting invalid character in ")) +
499cdf0e10cSrcweir fileUrl_),
500cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
501cdf0e10cSrcweir }
502cdf0e10cSrcweir char buf[4];
503cdf0e10cSrcweir sal_Int32 len;
504cdf0e10cSrcweir if (val < 0x80) {
505cdf0e10cSrcweir buf[0] = static_cast< char >(val);
506cdf0e10cSrcweir len = 1;
507cdf0e10cSrcweir } else if (val < 0x800) {
508cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 6) | 0xC0);
509cdf0e10cSrcweir buf[1] = static_cast< char >((val & 0x3F) | 0x80);
510cdf0e10cSrcweir len = 2;
511cdf0e10cSrcweir } else if (val < 0x10000) {
512cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 12) | 0xE0);
513cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
514cdf0e10cSrcweir buf[2] = static_cast< char >((val & 0x3F) | 0x80);
515cdf0e10cSrcweir len = 3;
516cdf0e10cSrcweir } else {
517cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 18) | 0xF0);
518cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
519cdf0e10cSrcweir buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
520cdf0e10cSrcweir buf[3] = static_cast< char >((val & 0x3F) | 0x80);
521cdf0e10cSrcweir len = 4;
522cdf0e10cSrcweir }
523cdf0e10cSrcweir pad_.addEphemeral(buf, len);
524cdf0e10cSrcweir return position;
525cdf0e10cSrcweir } else {
526cdf0e10cSrcweir struct EntityRef {
527cdf0e10cSrcweir char const * inBegin;
528cdf0e10cSrcweir sal_Int32 inLength;
529cdf0e10cSrcweir char const * outBegin;
530cdf0e10cSrcweir sal_Int32 outLength;
531cdf0e10cSrcweir };
532cdf0e10cSrcweir static EntityRef const refs[] = {
533cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("amp;"),
534cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("&") },
535cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("lt;"),
536cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("<") },
537cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("gt;"),
538cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(">") },
539cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("apos;"),
540cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("'") },
541cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("quot;"),
542cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("\"") } };
543cdf0e10cSrcweir for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
544cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength(
545cdf0e10cSrcweir position, end - position, refs[i].inBegin, refs[i].inLength,
546cdf0e10cSrcweir refs[i].inLength) ==
547cdf0e10cSrcweir 0)
548cdf0e10cSrcweir {
549cdf0e10cSrcweir position += refs[i].inLength;
550cdf0e10cSrcweir pad_.add(refs[i].outBegin, refs[i].outLength);
551cdf0e10cSrcweir return position;
552cdf0e10cSrcweir }
553cdf0e10cSrcweir }
554cdf0e10cSrcweir throw css::uno::RuntimeException(
555cdf0e10cSrcweir (rtl::OUString(
556cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
557cdf0e10cSrcweir fileUrl_),
558cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
559cdf0e10cSrcweir }
560cdf0e10cSrcweir }
561cdf0e10cSrcweir
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)562cdf0e10cSrcweir Span XmlReader::handleAttributeValue(
563cdf0e10cSrcweir char const * begin, char const * end, bool fullyNormalize)
564cdf0e10cSrcweir {
565cdf0e10cSrcweir pad_.clear();
566cdf0e10cSrcweir if (fullyNormalize) {
567cdf0e10cSrcweir while (begin != end && isSpace(*begin)) {
568cdf0e10cSrcweir ++begin;
569cdf0e10cSrcweir }
570cdf0e10cSrcweir while (end != begin && isSpace(end[-1])) {
571cdf0e10cSrcweir --end;
572cdf0e10cSrcweir }
573cdf0e10cSrcweir char const * p = begin;
574cdf0e10cSrcweir enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
575cdf0e10cSrcweir // a single true space character can go into the current span,
576cdf0e10cSrcweir // everything else breaks the span
577cdf0e10cSrcweir Space space = SPACE_NONE;
578cdf0e10cSrcweir while (p != end) {
579cdf0e10cSrcweir switch (*p) {
580cdf0e10cSrcweir case '\x09':
581cdf0e10cSrcweir case '\x0A':
582cdf0e10cSrcweir case '\x0D':
583cdf0e10cSrcweir switch (space) {
584cdf0e10cSrcweir case SPACE_NONE:
585cdf0e10cSrcweir pad_.add(begin, p - begin);
586cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
587cdf0e10cSrcweir space = SPACE_BREAK;
588cdf0e10cSrcweir break;
589cdf0e10cSrcweir case SPACE_SPAN:
590cdf0e10cSrcweir pad_.add(begin, p - begin);
591cdf0e10cSrcweir space = SPACE_BREAK;
592cdf0e10cSrcweir break;
593cdf0e10cSrcweir case SPACE_BREAK:
594cdf0e10cSrcweir break;
595cdf0e10cSrcweir }
596cdf0e10cSrcweir begin = ++p;
597cdf0e10cSrcweir break;
598cdf0e10cSrcweir case ' ':
599cdf0e10cSrcweir switch (space) {
600cdf0e10cSrcweir case SPACE_NONE:
601cdf0e10cSrcweir ++p;
602cdf0e10cSrcweir space = SPACE_SPAN;
603cdf0e10cSrcweir break;
604cdf0e10cSrcweir case SPACE_SPAN:
605cdf0e10cSrcweir pad_.add(begin, p - begin);
606cdf0e10cSrcweir begin = ++p;
607cdf0e10cSrcweir space = SPACE_BREAK;
608cdf0e10cSrcweir break;
609cdf0e10cSrcweir case SPACE_BREAK:
610cdf0e10cSrcweir begin = ++p;
611cdf0e10cSrcweir break;
612cdf0e10cSrcweir }
613cdf0e10cSrcweir break;
614cdf0e10cSrcweir case '&':
615cdf0e10cSrcweir pad_.add(begin, p - begin);
616cdf0e10cSrcweir p = handleReference(p, end);
617cdf0e10cSrcweir begin = p;
618cdf0e10cSrcweir space = SPACE_NONE;
619cdf0e10cSrcweir break;
620cdf0e10cSrcweir default:
621cdf0e10cSrcweir ++p;
622cdf0e10cSrcweir space = SPACE_NONE;
623cdf0e10cSrcweir break;
624cdf0e10cSrcweir }
625cdf0e10cSrcweir }
626cdf0e10cSrcweir pad_.add(begin, p - begin);
627cdf0e10cSrcweir } else {
628cdf0e10cSrcweir char const * p = begin;
629cdf0e10cSrcweir while (p != end) {
630cdf0e10cSrcweir switch (*p) {
631cdf0e10cSrcweir case '\x09':
632cdf0e10cSrcweir case '\x0A':
633cdf0e10cSrcweir pad_.add(begin, p - begin);
634cdf0e10cSrcweir begin = ++p;
635cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
636cdf0e10cSrcweir break;
637cdf0e10cSrcweir case '\x0D':
638cdf0e10cSrcweir pad_.add(begin, p - begin);
639cdf0e10cSrcweir ++p;
640cdf0e10cSrcweir if (peek() == '\x0A') {
641cdf0e10cSrcweir ++p;
642cdf0e10cSrcweir }
643cdf0e10cSrcweir begin = p;
644cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
645cdf0e10cSrcweir break;
646cdf0e10cSrcweir case '&':
647cdf0e10cSrcweir pad_.add(begin, p - begin);
648cdf0e10cSrcweir p = handleReference(p, end);
649cdf0e10cSrcweir begin = p;
650cdf0e10cSrcweir break;
651cdf0e10cSrcweir default:
652cdf0e10cSrcweir ++p;
653cdf0e10cSrcweir break;
654cdf0e10cSrcweir }
655cdf0e10cSrcweir }
656cdf0e10cSrcweir pad_.add(begin, p - begin);
657cdf0e10cSrcweir }
658cdf0e10cSrcweir return pad_.get();
659cdf0e10cSrcweir }
660cdf0e10cSrcweir
handleStartTag(int * nsId,Span * localName)661cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
662cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName);
663cdf0e10cSrcweir char const * nameBegin = pos_;
664cdf0e10cSrcweir char const * nameColon = 0;
665cdf0e10cSrcweir if (!scanName(&nameColon)) {
666cdf0e10cSrcweir throw css::uno::RuntimeException(
667cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
668cdf0e10cSrcweir fileUrl_),
669cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
670cdf0e10cSrcweir }
671cdf0e10cSrcweir char const * nameEnd = pos_;
672cdf0e10cSrcweir NamespaceList::size_type inheritedNamespaces = namespaces_.size();
673cdf0e10cSrcweir bool hasDefaultNs = false;
674cdf0e10cSrcweir int defaultNsId = NAMESPACE_NONE;
675cdf0e10cSrcweir attributes_.clear();
676cdf0e10cSrcweir for (;;) {
677cdf0e10cSrcweir char const * p = pos_;
678cdf0e10cSrcweir skipSpace();
679cdf0e10cSrcweir if (peek() == '/' || peek() == '>') {
680cdf0e10cSrcweir break;
681cdf0e10cSrcweir }
682cdf0e10cSrcweir if (pos_ == p) {
683cdf0e10cSrcweir throw css::uno::RuntimeException(
684cdf0e10cSrcweir (rtl::OUString(
685cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
686cdf0e10cSrcweir "missing whitespace before attribute in ")) +
687cdf0e10cSrcweir fileUrl_),
688cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
689cdf0e10cSrcweir }
690cdf0e10cSrcweir char const * attrNameBegin = pos_;
691cdf0e10cSrcweir char const * attrNameColon = 0;
692cdf0e10cSrcweir if (!scanName(&attrNameColon)) {
693cdf0e10cSrcweir throw css::uno::RuntimeException(
694cdf0e10cSrcweir (rtl::OUString(
695cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
696cdf0e10cSrcweir fileUrl_),
697cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
698cdf0e10cSrcweir }
699cdf0e10cSrcweir char const * attrNameEnd = pos_;
700cdf0e10cSrcweir skipSpace();
701cdf0e10cSrcweir if (read() != '=') {
702cdf0e10cSrcweir throw css::uno::RuntimeException(
703cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
704cdf0e10cSrcweir fileUrl_),
705cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
706cdf0e10cSrcweir }
707cdf0e10cSrcweir skipSpace();
708cdf0e10cSrcweir char del = read();
709cdf0e10cSrcweir if (del != '\'' && del != '"') {
710cdf0e10cSrcweir throw css::uno::RuntimeException(
711cdf0e10cSrcweir (rtl::OUString(
712cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
713cdf0e10cSrcweir fileUrl_),
714cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
715cdf0e10cSrcweir }
716cdf0e10cSrcweir char const * valueBegin = pos_;
717cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
718cdf0e10cSrcweir if (i < 0) {
719cdf0e10cSrcweir throw css::uno::RuntimeException(
720cdf0e10cSrcweir (rtl::OUString(
721cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
722cdf0e10cSrcweir "unterminated attribute value in ")) +
723cdf0e10cSrcweir fileUrl_),
724cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
725cdf0e10cSrcweir }
726cdf0e10cSrcweir char const * valueEnd = pos_ + i;
727cdf0e10cSrcweir pos_ += i + 1;
728cdf0e10cSrcweir if (attrNameColon == 0 &&
729cdf0e10cSrcweir Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
730cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns")))
731cdf0e10cSrcweir {
732cdf0e10cSrcweir hasDefaultNs = true;
733cdf0e10cSrcweir defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
734cdf0e10cSrcweir } else if (attrNameColon != 0 &&
735cdf0e10cSrcweir Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
736cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns")))
737cdf0e10cSrcweir {
738cdf0e10cSrcweir namespaces_.push_back(
739cdf0e10cSrcweir NamespaceData(
740cdf0e10cSrcweir Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
741cdf0e10cSrcweir scanNamespaceIri(valueBegin, valueEnd)));
742cdf0e10cSrcweir } else {
743cdf0e10cSrcweir attributes_.push_back(
744cdf0e10cSrcweir AttributeData(
745cdf0e10cSrcweir attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
746cdf0e10cSrcweir valueEnd));
747cdf0e10cSrcweir }
748cdf0e10cSrcweir }
749cdf0e10cSrcweir if (!hasDefaultNs && !elements_.empty()) {
750cdf0e10cSrcweir defaultNsId = elements_.top().defaultNamespaceId;
751cdf0e10cSrcweir }
752cdf0e10cSrcweir firstAttribute_ = true;
753cdf0e10cSrcweir if (peek() == '/') {
754cdf0e10cSrcweir state_ = STATE_EMPTY_ELEMENT_TAG;
755cdf0e10cSrcweir ++pos_;
756cdf0e10cSrcweir } else {
757cdf0e10cSrcweir state_ = STATE_CONTENT;
758cdf0e10cSrcweir }
759cdf0e10cSrcweir if (peek() != '>') {
760cdf0e10cSrcweir throw css::uno::RuntimeException(
761cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
762cdf0e10cSrcweir fileUrl_),
763cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
764cdf0e10cSrcweir }
765cdf0e10cSrcweir ++pos_;
766cdf0e10cSrcweir elements_.push(
767cdf0e10cSrcweir ElementData(
768cdf0e10cSrcweir Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
769cdf0e10cSrcweir defaultNsId));
770cdf0e10cSrcweir if (nameColon == 0) {
771cdf0e10cSrcweir *nsId = defaultNsId;
772cdf0e10cSrcweir *localName = Span(nameBegin, nameEnd - nameBegin);
773cdf0e10cSrcweir } else {
774cdf0e10cSrcweir *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
775cdf0e10cSrcweir *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
776cdf0e10cSrcweir }
777cdf0e10cSrcweir return RESULT_BEGIN;
778cdf0e10cSrcweir }
779cdf0e10cSrcweir
handleEndTag()780cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() {
781cdf0e10cSrcweir if (elements_.empty()) {
782cdf0e10cSrcweir throw css::uno::RuntimeException(
783cdf0e10cSrcweir (rtl::OUString(
784cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
785cdf0e10cSrcweir fileUrl_),
786cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
787cdf0e10cSrcweir }
788cdf0e10cSrcweir char const * nameBegin = pos_;
789cdf0e10cSrcweir char const * nameColon = 0;
790cdf0e10cSrcweir if (!scanName(&nameColon) ||
791cdf0e10cSrcweir !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
792cdf0e10cSrcweir {
793cdf0e10cSrcweir throw css::uno::RuntimeException(
794cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
795cdf0e10cSrcweir fileUrl_),
796cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
797cdf0e10cSrcweir }
798cdf0e10cSrcweir handleElementEnd();
799cdf0e10cSrcweir skipSpace();
800cdf0e10cSrcweir if (peek() != '>') {
801cdf0e10cSrcweir throw css::uno::RuntimeException(
802cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
803cdf0e10cSrcweir fileUrl_),
804cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
805cdf0e10cSrcweir }
806cdf0e10cSrcweir ++pos_;
807cdf0e10cSrcweir return RESULT_END;
808cdf0e10cSrcweir }
809cdf0e10cSrcweir
handleElementEnd()810cdf0e10cSrcweir void XmlReader::handleElementEnd() {
811cdf0e10cSrcweir OSL_ASSERT(!elements_.empty());
812cdf0e10cSrcweir namespaces_.resize(elements_.top().inheritedNamespaces);
813cdf0e10cSrcweir elements_.pop();
814cdf0e10cSrcweir state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
815cdf0e10cSrcweir }
816cdf0e10cSrcweir
handleSkippedText(Span * data,int * nsId)817cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
818cdf0e10cSrcweir for (;;) {
819cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
820cdf0e10cSrcweir if (i < 0) {
821cdf0e10cSrcweir throw css::uno::RuntimeException(
822cdf0e10cSrcweir (rtl::OUString(
823cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
824cdf0e10cSrcweir fileUrl_),
825cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
826cdf0e10cSrcweir }
827cdf0e10cSrcweir pos_ += i + 1;
828cdf0e10cSrcweir switch (peek()) {
829cdf0e10cSrcweir case '!':
830cdf0e10cSrcweir ++pos_;
831cdf0e10cSrcweir if (!skipComment() && !scanCdataSection().is()) {
832cdf0e10cSrcweir skipDocumentTypeDeclaration();
833cdf0e10cSrcweir }
834cdf0e10cSrcweir break;
835cdf0e10cSrcweir case '/':
836cdf0e10cSrcweir ++pos_;
837cdf0e10cSrcweir return handleEndTag();
838cdf0e10cSrcweir case '?':
839cdf0e10cSrcweir ++pos_;
840cdf0e10cSrcweir skipProcessingInstruction();
841cdf0e10cSrcweir break;
842cdf0e10cSrcweir default:
843cdf0e10cSrcweir return handleStartTag(nsId, data);
844cdf0e10cSrcweir }
845cdf0e10cSrcweir }
846cdf0e10cSrcweir }
847cdf0e10cSrcweir
handleRawText(Span * text)848cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) {
849cdf0e10cSrcweir pad_.clear();
850cdf0e10cSrcweir for (char const * begin = pos_;;) {
851cdf0e10cSrcweir switch (peek()) {
852cdf0e10cSrcweir case '\0': // i.e., EOF
853cdf0e10cSrcweir throw css::uno::RuntimeException(
854cdf0e10cSrcweir (rtl::OUString(
855cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
856cdf0e10cSrcweir fileUrl_),
857cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
858cdf0e10cSrcweir case '\x0D':
859cdf0e10cSrcweir pad_.add(begin, pos_ - begin);
860cdf0e10cSrcweir ++pos_;
861cdf0e10cSrcweir if (peek() != '\x0A') {
862cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
863cdf0e10cSrcweir }
864cdf0e10cSrcweir begin = pos_;
865cdf0e10cSrcweir break;
866cdf0e10cSrcweir case '&':
867cdf0e10cSrcweir pad_.add(begin, pos_ - begin);
868cdf0e10cSrcweir pos_ = handleReference(pos_, end_);
869cdf0e10cSrcweir begin = pos_;
870cdf0e10cSrcweir break;
871cdf0e10cSrcweir case '<':
872cdf0e10cSrcweir pad_.add(begin, pos_ - begin);
873cdf0e10cSrcweir ++pos_;
874cdf0e10cSrcweir switch (peek()) {
875cdf0e10cSrcweir case '!':
876cdf0e10cSrcweir ++pos_;
877cdf0e10cSrcweir if (!skipComment()) {
878cdf0e10cSrcweir Span cdata(scanCdataSection());
879cdf0e10cSrcweir if (cdata.is()) {
880cdf0e10cSrcweir normalizeLineEnds(cdata);
881cdf0e10cSrcweir } else {
882cdf0e10cSrcweir skipDocumentTypeDeclaration();
883cdf0e10cSrcweir }
884cdf0e10cSrcweir }
885cdf0e10cSrcweir begin = pos_;
886cdf0e10cSrcweir break;
887cdf0e10cSrcweir case '/':
888cdf0e10cSrcweir *text = pad_.get();
889cdf0e10cSrcweir ++pos_;
890cdf0e10cSrcweir state_ = STATE_END_TAG;
891cdf0e10cSrcweir return RESULT_TEXT;
892cdf0e10cSrcweir case '?':
893cdf0e10cSrcweir ++pos_;
894cdf0e10cSrcweir skipProcessingInstruction();
895cdf0e10cSrcweir begin = pos_;
896cdf0e10cSrcweir break;
897cdf0e10cSrcweir default:
898cdf0e10cSrcweir *text = pad_.get();
899cdf0e10cSrcweir state_ = STATE_START_TAG;
900cdf0e10cSrcweir return RESULT_TEXT;
901cdf0e10cSrcweir }
902cdf0e10cSrcweir break;
903cdf0e10cSrcweir default:
904cdf0e10cSrcweir ++pos_;
905cdf0e10cSrcweir break;
906cdf0e10cSrcweir }
907cdf0e10cSrcweir }
908cdf0e10cSrcweir }
909cdf0e10cSrcweir
handleNormalizedText(Span * text)910cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
911cdf0e10cSrcweir pad_.clear();
912cdf0e10cSrcweir char const * flowBegin = pos_;
913cdf0e10cSrcweir char const * flowEnd = pos_;
914cdf0e10cSrcweir enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
915cdf0e10cSrcweir // a single true space character can go into the current flow,
916cdf0e10cSrcweir // everything else breaks the flow
917cdf0e10cSrcweir Space space = SPACE_START;
918cdf0e10cSrcweir for (;;) {
919cdf0e10cSrcweir switch (peek()) {
920cdf0e10cSrcweir case '\0': // i.e., EOF
921cdf0e10cSrcweir throw css::uno::RuntimeException(
922cdf0e10cSrcweir (rtl::OUString(
923cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
924cdf0e10cSrcweir fileUrl_),
925cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
926cdf0e10cSrcweir case '\x09':
927cdf0e10cSrcweir case '\x0A':
928cdf0e10cSrcweir case '\x0D':
929cdf0e10cSrcweir switch (space) {
930cdf0e10cSrcweir case SPACE_START:
931cdf0e10cSrcweir case SPACE_BREAK:
932cdf0e10cSrcweir break;
933cdf0e10cSrcweir case SPACE_NONE:
934cdf0e10cSrcweir case SPACE_SPAN:
935cdf0e10cSrcweir space = SPACE_BREAK;
936cdf0e10cSrcweir break;
937cdf0e10cSrcweir }
938cdf0e10cSrcweir ++pos_;
939cdf0e10cSrcweir break;
940cdf0e10cSrcweir case ' ':
941cdf0e10cSrcweir switch (space) {
942cdf0e10cSrcweir case SPACE_START:
943cdf0e10cSrcweir case SPACE_BREAK:
944cdf0e10cSrcweir break;
945cdf0e10cSrcweir case SPACE_NONE:
946cdf0e10cSrcweir space = SPACE_SPAN;
947cdf0e10cSrcweir break;
948cdf0e10cSrcweir case SPACE_SPAN:
949cdf0e10cSrcweir space = SPACE_BREAK;
950cdf0e10cSrcweir break;
951cdf0e10cSrcweir }
952cdf0e10cSrcweir ++pos_;
953cdf0e10cSrcweir break;
954cdf0e10cSrcweir case '&':
955cdf0e10cSrcweir switch (space) {
956cdf0e10cSrcweir case SPACE_START:
957cdf0e10cSrcweir break;
958cdf0e10cSrcweir case SPACE_NONE:
959cdf0e10cSrcweir case SPACE_SPAN:
960cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin);
961cdf0e10cSrcweir break;
962cdf0e10cSrcweir case SPACE_BREAK:
963cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
964cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
965cdf0e10cSrcweir break;
966cdf0e10cSrcweir }
967cdf0e10cSrcweir pos_ = handleReference(pos_, end_);
968cdf0e10cSrcweir flowBegin = pos_;
969cdf0e10cSrcweir flowEnd = pos_;
970cdf0e10cSrcweir space = SPACE_NONE;
971cdf0e10cSrcweir break;
972cdf0e10cSrcweir case '<':
973cdf0e10cSrcweir ++pos_;
974cdf0e10cSrcweir switch (peek()) {
975cdf0e10cSrcweir case '!':
976cdf0e10cSrcweir ++pos_;
977cdf0e10cSrcweir if (skipComment()) {
978cdf0e10cSrcweir space = SPACE_BREAK;
979cdf0e10cSrcweir } else {
980cdf0e10cSrcweir Span cdata(scanCdataSection());
981cdf0e10cSrcweir if (cdata.is()) {
982cdf0e10cSrcweir // CDATA is not normalized (similar to character
983cdf0e10cSrcweir // references; it keeps the code simple), but it might
984cdf0e10cSrcweir // arguably be better to normalize it:
985cdf0e10cSrcweir switch (space) {
986cdf0e10cSrcweir case SPACE_START:
987cdf0e10cSrcweir break;
988cdf0e10cSrcweir case SPACE_NONE:
989cdf0e10cSrcweir case SPACE_SPAN:
990cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin);
991cdf0e10cSrcweir break;
992cdf0e10cSrcweir case SPACE_BREAK:
993cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
994cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
995cdf0e10cSrcweir break;
996cdf0e10cSrcweir }
997cdf0e10cSrcweir normalizeLineEnds(cdata);
998cdf0e10cSrcweir flowBegin = pos_;
999cdf0e10cSrcweir flowEnd = pos_;
1000cdf0e10cSrcweir space = SPACE_NONE;
1001cdf0e10cSrcweir } else {
1002cdf0e10cSrcweir skipDocumentTypeDeclaration();
1003cdf0e10cSrcweir }
1004cdf0e10cSrcweir }
1005cdf0e10cSrcweir break;
1006cdf0e10cSrcweir case '/':
1007cdf0e10cSrcweir ++pos_;
1008cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
1009cdf0e10cSrcweir *text = pad_.get();
1010cdf0e10cSrcweir state_ = STATE_END_TAG;
1011cdf0e10cSrcweir return RESULT_TEXT;
1012cdf0e10cSrcweir case '?':
1013cdf0e10cSrcweir ++pos_;
1014cdf0e10cSrcweir skipProcessingInstruction();
1015cdf0e10cSrcweir space = SPACE_BREAK;
1016cdf0e10cSrcweir break;
1017cdf0e10cSrcweir default:
1018cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
1019cdf0e10cSrcweir *text = pad_.get();
1020cdf0e10cSrcweir state_ = STATE_START_TAG;
1021cdf0e10cSrcweir return RESULT_TEXT;
1022cdf0e10cSrcweir }
1023cdf0e10cSrcweir break;
1024cdf0e10cSrcweir default:
1025cdf0e10cSrcweir switch (space) {
1026cdf0e10cSrcweir case SPACE_START:
1027cdf0e10cSrcweir flowBegin = pos_;
1028cdf0e10cSrcweir break;
1029cdf0e10cSrcweir case SPACE_NONE:
1030cdf0e10cSrcweir case SPACE_SPAN:
1031cdf0e10cSrcweir break;
1032cdf0e10cSrcweir case SPACE_BREAK:
1033cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
1034cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1035cdf0e10cSrcweir flowBegin = pos_;
1036cdf0e10cSrcweir break;
1037cdf0e10cSrcweir }
1038cdf0e10cSrcweir flowEnd = ++pos_;
1039cdf0e10cSrcweir space = SPACE_NONE;
1040cdf0e10cSrcweir break;
1041cdf0e10cSrcweir }
1042cdf0e10cSrcweir }
1043cdf0e10cSrcweir }
1044cdf0e10cSrcweir
toNamespaceId(NamespaceIris::size_type pos)1045cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1046cdf0e10cSrcweir OSL_ASSERT(pos <= INT_MAX);
1047cdf0e10cSrcweir return static_cast< int >(pos);
1048cdf0e10cSrcweir }
1049cdf0e10cSrcweir
1050cdf0e10cSrcweir }
1051