1*b725e8ebSAndrew Rist /**************************************************************
2*b725e8ebSAndrew Rist *
3*b725e8ebSAndrew Rist * Licensed to the Apache Software Foundation (ASF) under one
4*b725e8ebSAndrew Rist * or more contributor license agreements. See the NOTICE file
5*b725e8ebSAndrew Rist * distributed with this work for additional information
6*b725e8ebSAndrew Rist * regarding copyright ownership. The ASF licenses this file
7*b725e8ebSAndrew Rist * to you under the Apache License, Version 2.0 (the
8*b725e8ebSAndrew Rist * "License"); you may not use this file except in compliance
9*b725e8ebSAndrew Rist * with the License. You may obtain a copy of the License at
10*b725e8ebSAndrew Rist *
11*b725e8ebSAndrew Rist * http://www.apache.org/licenses/LICENSE-2.0
12*b725e8ebSAndrew Rist *
13*b725e8ebSAndrew Rist * Unless required by applicable law or agreed to in writing,
14*b725e8ebSAndrew Rist * software distributed under the License is distributed on an
15*b725e8ebSAndrew Rist * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16*b725e8ebSAndrew Rist * KIND, either express or implied. See the License for the
17*b725e8ebSAndrew Rist * specific language governing permissions and limitations
18*b725e8ebSAndrew Rist * under the License.
19*b725e8ebSAndrew Rist *
20*b725e8ebSAndrew Rist *************************************************************/
21*b725e8ebSAndrew Rist
22*b725e8ebSAndrew Rist
23cdf0e10cSrcweir
24cdf0e10cSrcweir #include "sal/config.h"
25cdf0e10cSrcweir
26cdf0e10cSrcweir #include <climits>
27cdf0e10cSrcweir #include <cstddef>
28cdf0e10cSrcweir
29cdf0e10cSrcweir #include "com/sun/star/container/NoSuchElementException.hpp"
30cdf0e10cSrcweir #include "com/sun/star/uno/Reference.hxx"
31cdf0e10cSrcweir #include "com/sun/star/uno/RuntimeException.hpp"
32cdf0e10cSrcweir #include "com/sun/star/uno/XInterface.hpp"
33cdf0e10cSrcweir #include "osl/diagnose.h"
34cdf0e10cSrcweir #include "osl/file.h"
35cdf0e10cSrcweir #include "rtl/string.h"
36cdf0e10cSrcweir #include "rtl/ustring.h"
37cdf0e10cSrcweir #include "rtl/ustring.hxx"
38cdf0e10cSrcweir #include "sal/types.h"
39cdf0e10cSrcweir #include "xmlreader/pad.hxx"
40cdf0e10cSrcweir #include "xmlreader/span.hxx"
41cdf0e10cSrcweir #include "xmlreader/xmlreader.hxx"
42cdf0e10cSrcweir
43cdf0e10cSrcweir namespace xmlreader {
44cdf0e10cSrcweir
45cdf0e10cSrcweir namespace {
46cdf0e10cSrcweir
47cdf0e10cSrcweir namespace css = com::sun::star;
48cdf0e10cSrcweir
// Returns true iff c is one of the four white space characters recognized by
// this reader: tab (0x09), line feed (0x0A), carriage return (0x0D) or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
60cdf0e10cSrcweir
61cdf0e10cSrcweir }
62cdf0e10cSrcweir
XmlReader(rtl::OUString const & fileUrl)63cdf0e10cSrcweir XmlReader::XmlReader(rtl::OUString const & fileUrl)
64cdf0e10cSrcweir SAL_THROW((
65cdf0e10cSrcweir css::container::NoSuchElementException, css::uno::RuntimeException)):
66cdf0e10cSrcweir fileUrl_(fileUrl)
67cdf0e10cSrcweir {
68cdf0e10cSrcweir switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
69cdf0e10cSrcweir {
70cdf0e10cSrcweir case osl_File_E_None:
71cdf0e10cSrcweir break;
72cdf0e10cSrcweir case osl_File_E_NOENT:
73cdf0e10cSrcweir throw css::container::NoSuchElementException(
74cdf0e10cSrcweir fileUrl_, css::uno::Reference< css::uno::XInterface >());
75cdf0e10cSrcweir default:
76cdf0e10cSrcweir throw css::uno::RuntimeException(
77cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
78cdf0e10cSrcweir fileUrl_),
79cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
80cdf0e10cSrcweir }
81cdf0e10cSrcweir oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
82cdf0e10cSrcweir if (e == osl_File_E_None) {
83cdf0e10cSrcweir e = osl_mapFile(
84cdf0e10cSrcweir fileHandle_, &fileAddress_, fileSize_, 0,
85cdf0e10cSrcweir osl_File_MapFlag_WillNeed);
86cdf0e10cSrcweir }
87cdf0e10cSrcweir if (e != osl_File_E_None) {
88cdf0e10cSrcweir e = osl_closeFile(fileHandle_);
89cdf0e10cSrcweir if (e != osl_File_E_None) {
90cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
91cdf0e10cSrcweir }
92cdf0e10cSrcweir throw css::uno::RuntimeException(
93cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
94cdf0e10cSrcweir fileUrl_),
95cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
96cdf0e10cSrcweir }
97cdf0e10cSrcweir namespaceIris_.push_back(
98cdf0e10cSrcweir Span(
99cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(
100cdf0e10cSrcweir "http://www.w3.org/XML/1998/namespace")));
101cdf0e10cSrcweir namespaces_.push_back(
102cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
103cdf0e10cSrcweir pos_ = static_cast< char * >(fileAddress_);
104cdf0e10cSrcweir end_ = pos_ + fileSize_;
105cdf0e10cSrcweir state_ = STATE_CONTENT;
106cdf0e10cSrcweir }
107cdf0e10cSrcweir
~XmlReader()108cdf0e10cSrcweir XmlReader::~XmlReader() {
109cdf0e10cSrcweir oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
110cdf0e10cSrcweir if (e != osl_File_E_None) {
111cdf0e10cSrcweir OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
112cdf0e10cSrcweir }
113cdf0e10cSrcweir e = osl_closeFile(fileHandle_);
114cdf0e10cSrcweir if (e != osl_File_E_None) {
115cdf0e10cSrcweir OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
116cdf0e10cSrcweir }
117cdf0e10cSrcweir }
118cdf0e10cSrcweir
registerNamespaceIri(Span const & iri)119cdf0e10cSrcweir int XmlReader::registerNamespaceIri(Span const & iri) {
120cdf0e10cSrcweir int id = toNamespaceId(namespaceIris_.size());
121cdf0e10cSrcweir namespaceIris_.push_back(iri);
122cdf0e10cSrcweir if (iri.equals(
123cdf0e10cSrcweir Span(
124cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(
125cdf0e10cSrcweir "http://www.w3.org/2001/XMLSchema-instance"))))
126cdf0e10cSrcweir {
127cdf0e10cSrcweir // Old user layer .xcu files used the xsi namespace prefix without
128cdf0e10cSrcweir // declaring a corresponding namespace binding, see issue 77174; reading
129cdf0e10cSrcweir // those files during migration would fail without this hack that can be
130cdf0e10cSrcweir // removed once migration is no longer relevant (see
131cdf0e10cSrcweir // configmgr::Components::parseModificationLayer):
132cdf0e10cSrcweir namespaces_.push_back(
133cdf0e10cSrcweir NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
134cdf0e10cSrcweir }
135cdf0e10cSrcweir return id;
136cdf0e10cSrcweir }
137cdf0e10cSrcweir
nextItem(Text reportText,Span * data,int * nsId)138cdf0e10cSrcweir XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
139cdf0e10cSrcweir {
140cdf0e10cSrcweir switch (state_) {
141cdf0e10cSrcweir case STATE_CONTENT:
142cdf0e10cSrcweir switch (reportText) {
143cdf0e10cSrcweir case TEXT_NONE:
144cdf0e10cSrcweir return handleSkippedText(data, nsId);
145cdf0e10cSrcweir case TEXT_RAW:
146cdf0e10cSrcweir return handleRawText(data);
147cdf0e10cSrcweir case TEXT_NORMALIZED:
148cdf0e10cSrcweir return handleNormalizedText(data);
149cdf0e10cSrcweir }
150cdf0e10cSrcweir case STATE_START_TAG:
151cdf0e10cSrcweir return handleStartTag(nsId, data);
152cdf0e10cSrcweir case STATE_END_TAG:
153cdf0e10cSrcweir return handleEndTag();
154cdf0e10cSrcweir case STATE_EMPTY_ELEMENT_TAG:
155cdf0e10cSrcweir handleElementEnd();
156cdf0e10cSrcweir return RESULT_END;
157cdf0e10cSrcweir default: // STATE_DONE
158cdf0e10cSrcweir return RESULT_DONE;
159cdf0e10cSrcweir }
160cdf0e10cSrcweir }
161cdf0e10cSrcweir
nextAttribute(int * nsId,Span * localName)162cdf0e10cSrcweir bool XmlReader::nextAttribute(int * nsId, Span * localName) {
163cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName != 0);
164cdf0e10cSrcweir if (firstAttribute_) {
165cdf0e10cSrcweir currentAttribute_ = attributes_.begin();
166cdf0e10cSrcweir firstAttribute_ = false;
167cdf0e10cSrcweir } else {
168cdf0e10cSrcweir ++currentAttribute_;
169cdf0e10cSrcweir }
170cdf0e10cSrcweir if (currentAttribute_ == attributes_.end()) {
171cdf0e10cSrcweir return false;
172cdf0e10cSrcweir }
173cdf0e10cSrcweir if (currentAttribute_->nameColon == 0) {
174cdf0e10cSrcweir *nsId = NAMESPACE_NONE;
175cdf0e10cSrcweir *localName = Span(
176cdf0e10cSrcweir currentAttribute_->nameBegin,
177cdf0e10cSrcweir currentAttribute_->nameEnd - currentAttribute_->nameBegin);
178cdf0e10cSrcweir } else {
179cdf0e10cSrcweir *nsId = getNamespaceId(
180cdf0e10cSrcweir Span(
181cdf0e10cSrcweir currentAttribute_->nameBegin,
182cdf0e10cSrcweir currentAttribute_->nameColon - currentAttribute_->nameBegin));
183cdf0e10cSrcweir *localName = Span(
184cdf0e10cSrcweir currentAttribute_->nameColon + 1,
185cdf0e10cSrcweir currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
186cdf0e10cSrcweir }
187cdf0e10cSrcweir return true;
188cdf0e10cSrcweir }
189cdf0e10cSrcweir
getAttributeValue(bool fullyNormalize)190cdf0e10cSrcweir Span XmlReader::getAttributeValue(bool fullyNormalize) {
191cdf0e10cSrcweir return handleAttributeValue(
192cdf0e10cSrcweir currentAttribute_->valueBegin, currentAttribute_->valueEnd,
193cdf0e10cSrcweir fullyNormalize);
194cdf0e10cSrcweir }
195cdf0e10cSrcweir
getNamespaceId(Span const & prefix) const196cdf0e10cSrcweir int XmlReader::getNamespaceId(Span const & prefix) const {
197cdf0e10cSrcweir for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
198cdf0e10cSrcweir i != namespaces_.rend(); ++i)
199cdf0e10cSrcweir {
200cdf0e10cSrcweir if (prefix.equals(i->prefix)) {
201cdf0e10cSrcweir return i->nsId;
202cdf0e10cSrcweir }
203cdf0e10cSrcweir }
204cdf0e10cSrcweir return NAMESPACE_UNKNOWN;
205cdf0e10cSrcweir }
206cdf0e10cSrcweir
getUrl() const207cdf0e10cSrcweir rtl::OUString XmlReader::getUrl() const {
208cdf0e10cSrcweir return fileUrl_;
209cdf0e10cSrcweir }
210cdf0e10cSrcweir
normalizeLineEnds(Span const & text)211cdf0e10cSrcweir void XmlReader::normalizeLineEnds(Span const & text) {
212cdf0e10cSrcweir char const * p = text.begin;
213cdf0e10cSrcweir sal_Int32 n = text.length;
214cdf0e10cSrcweir for (;;) {
215cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
216cdf0e10cSrcweir if (i < 0) {
217cdf0e10cSrcweir break;
218cdf0e10cSrcweir }
219cdf0e10cSrcweir pad_.add(p, i);
220cdf0e10cSrcweir p += i + 1;
221cdf0e10cSrcweir n -= i + 1;
222cdf0e10cSrcweir if (n == 0 || *p != '\x0A') {
223cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
224cdf0e10cSrcweir }
225cdf0e10cSrcweir }
226cdf0e10cSrcweir pad_.add(p, n);
227cdf0e10cSrcweir }
228cdf0e10cSrcweir
skipSpace()229cdf0e10cSrcweir void XmlReader::skipSpace() {
230cdf0e10cSrcweir while (isSpace(peek())) {
231cdf0e10cSrcweir ++pos_;
232cdf0e10cSrcweir }
233cdf0e10cSrcweir }
234cdf0e10cSrcweir
skipComment()235cdf0e10cSrcweir bool XmlReader::skipComment() {
236cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength(
237cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
238cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("--")) !=
239cdf0e10cSrcweir 0)
240cdf0e10cSrcweir {
241cdf0e10cSrcweir return false;
242cdf0e10cSrcweir }
243cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("--");
244cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength(
245cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
246cdf0e10cSrcweir if (i < 0) {
247cdf0e10cSrcweir throw css::uno::RuntimeException(
248cdf0e10cSrcweir (rtl::OUString(
249cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
250cdf0e10cSrcweir "premature end (within comment) of ")) +
251cdf0e10cSrcweir fileUrl_),
252cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
253cdf0e10cSrcweir }
254cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("--");
255cdf0e10cSrcweir if (read() != '>') {
256cdf0e10cSrcweir throw css::uno::RuntimeException(
257cdf0e10cSrcweir (rtl::OUString(
258cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
259cdf0e10cSrcweir "illegal \"--\" within comment in ")) +
260cdf0e10cSrcweir fileUrl_),
261cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
262cdf0e10cSrcweir }
263cdf0e10cSrcweir return true;
264cdf0e10cSrcweir }
265cdf0e10cSrcweir
skipProcessingInstruction()266cdf0e10cSrcweir void XmlReader::skipProcessingInstruction() {
267cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength(
268cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
269cdf0e10cSrcweir if (i < 0) {
270cdf0e10cSrcweir throw css::uno::RuntimeException(
271cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
272cdf0e10cSrcweir fileUrl_),
273cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
274cdf0e10cSrcweir }
275cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("?>");
276cdf0e10cSrcweir }
277cdf0e10cSrcweir
skipDocumentTypeDeclaration()278cdf0e10cSrcweir void XmlReader::skipDocumentTypeDeclaration() {
279cdf0e10cSrcweir // Neither is it checked that the doctypedecl is at the correct position in
280cdf0e10cSrcweir // the document, nor that it is well-formed:
281cdf0e10cSrcweir for (;;) {
282cdf0e10cSrcweir char c = read();
283cdf0e10cSrcweir switch (c) {
284cdf0e10cSrcweir case '\0': // i.e., EOF
285cdf0e10cSrcweir throw css::uno::RuntimeException(
286cdf0e10cSrcweir (rtl::OUString(
287cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
288cdf0e10cSrcweir "premature end (within DTD) of ")) +
289cdf0e10cSrcweir fileUrl_),
290cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
291cdf0e10cSrcweir case '"':
292cdf0e10cSrcweir case '\'':
293cdf0e10cSrcweir {
294cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(
295cdf0e10cSrcweir pos_, end_ - pos_, c);
296cdf0e10cSrcweir if (i < 0) {
297cdf0e10cSrcweir throw css::uno::RuntimeException(
298cdf0e10cSrcweir (rtl::OUString(
299cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
300cdf0e10cSrcweir "premature end (within DTD) of ")) +
301cdf0e10cSrcweir fileUrl_),
302cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
303cdf0e10cSrcweir }
304cdf0e10cSrcweir pos_ += i + 1;
305cdf0e10cSrcweir }
306cdf0e10cSrcweir break;
307cdf0e10cSrcweir case '>':
308cdf0e10cSrcweir return;
309cdf0e10cSrcweir case '[':
310cdf0e10cSrcweir for (;;) {
311cdf0e10cSrcweir c = read();
312cdf0e10cSrcweir switch (c) {
313cdf0e10cSrcweir case '\0': // i.e., EOF
314cdf0e10cSrcweir throw css::uno::RuntimeException(
315cdf0e10cSrcweir (rtl::OUString(
316cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
317cdf0e10cSrcweir "premature end (within DTD) of ")) +
318cdf0e10cSrcweir fileUrl_),
319cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
320cdf0e10cSrcweir case '"':
321cdf0e10cSrcweir case '\'':
322cdf0e10cSrcweir {
323cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(
324cdf0e10cSrcweir pos_, end_ - pos_, c);
325cdf0e10cSrcweir if (i < 0) {
326cdf0e10cSrcweir throw css::uno::RuntimeException(
327cdf0e10cSrcweir (rtl::OUString(
328cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
329cdf0e10cSrcweir "premature end (within DTD) of ")) +
330cdf0e10cSrcweir fileUrl_),
331cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
332cdf0e10cSrcweir }
333cdf0e10cSrcweir pos_ += i + 1;
334cdf0e10cSrcweir }
335cdf0e10cSrcweir break;
336cdf0e10cSrcweir case '<':
337cdf0e10cSrcweir switch (read()) {
338cdf0e10cSrcweir case '\0': // i.e., EOF
339cdf0e10cSrcweir throw css::uno::RuntimeException(
340cdf0e10cSrcweir (rtl::OUString(
341cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
342cdf0e10cSrcweir "premature end (within DTD) of ")) +
343cdf0e10cSrcweir fileUrl_),
344cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
345cdf0e10cSrcweir case '!':
346cdf0e10cSrcweir skipComment();
347cdf0e10cSrcweir break;
348cdf0e10cSrcweir case '?':
349cdf0e10cSrcweir skipProcessingInstruction();
350cdf0e10cSrcweir break;
351cdf0e10cSrcweir default:
352cdf0e10cSrcweir break;
353cdf0e10cSrcweir }
354cdf0e10cSrcweir break;
355cdf0e10cSrcweir case ']':
356cdf0e10cSrcweir skipSpace();
357cdf0e10cSrcweir if (read() != '>') {
358cdf0e10cSrcweir throw css::uno::RuntimeException(
359cdf0e10cSrcweir (rtl::OUString(
360cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
361cdf0e10cSrcweir "missing \">\" of DTD in ")) +
362cdf0e10cSrcweir fileUrl_),
363cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
364cdf0e10cSrcweir }
365cdf0e10cSrcweir return;
366cdf0e10cSrcweir default:
367cdf0e10cSrcweir break;
368cdf0e10cSrcweir }
369cdf0e10cSrcweir }
370cdf0e10cSrcweir default:
371cdf0e10cSrcweir break;
372cdf0e10cSrcweir }
373cdf0e10cSrcweir }
374cdf0e10cSrcweir }
375cdf0e10cSrcweir
scanCdataSection()376cdf0e10cSrcweir Span XmlReader::scanCdataSection() {
377cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength(
378cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
379cdf0e10cSrcweir RTL_CONSTASCII_LENGTH("[CDATA[")) !=
380cdf0e10cSrcweir 0)
381cdf0e10cSrcweir {
382cdf0e10cSrcweir return Span();
383cdf0e10cSrcweir }
384cdf0e10cSrcweir pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
385cdf0e10cSrcweir char const * begin = pos_;
386cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfStr_WithLength(
387cdf0e10cSrcweir pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
388cdf0e10cSrcweir if (i < 0) {
389cdf0e10cSrcweir throw css::uno::RuntimeException(
390cdf0e10cSrcweir (rtl::OUString(
391cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
392cdf0e10cSrcweir "premature end (within CDATA section) of ")) +
393cdf0e10cSrcweir fileUrl_),
394cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
395cdf0e10cSrcweir }
396cdf0e10cSrcweir pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
397cdf0e10cSrcweir return Span(begin, i);
398cdf0e10cSrcweir }
399cdf0e10cSrcweir
scanName(char const ** nameColon)400cdf0e10cSrcweir bool XmlReader::scanName(char const ** nameColon) {
401cdf0e10cSrcweir OSL_ASSERT(nameColon != 0 && *nameColon == 0);
402cdf0e10cSrcweir for (char const * begin = pos_;; ++pos_) {
403cdf0e10cSrcweir switch (peek()) {
404cdf0e10cSrcweir case '\0': // i.e., EOF
405cdf0e10cSrcweir case '\x09':
406cdf0e10cSrcweir case '\x0A':
407cdf0e10cSrcweir case '\x0D':
408cdf0e10cSrcweir case ' ':
409cdf0e10cSrcweir case '/':
410cdf0e10cSrcweir case '=':
411cdf0e10cSrcweir case '>':
412cdf0e10cSrcweir return pos_ != begin;
413cdf0e10cSrcweir case ':':
414cdf0e10cSrcweir *nameColon = pos_;
415cdf0e10cSrcweir break;
416cdf0e10cSrcweir default:
417cdf0e10cSrcweir break;
418cdf0e10cSrcweir }
419cdf0e10cSrcweir }
420cdf0e10cSrcweir }
421cdf0e10cSrcweir
scanNamespaceIri(char const * begin,char const * end)422cdf0e10cSrcweir int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
423cdf0e10cSrcweir OSL_ASSERT(begin != 0 && begin <= end);
424cdf0e10cSrcweir Span iri(handleAttributeValue(begin, end, false));
425cdf0e10cSrcweir for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
426cdf0e10cSrcweir if (namespaceIris_[i].equals(iri)) {
427cdf0e10cSrcweir return toNamespaceId(i);
428cdf0e10cSrcweir }
429cdf0e10cSrcweir }
430cdf0e10cSrcweir return XmlReader::NAMESPACE_UNKNOWN;
431cdf0e10cSrcweir }
432cdf0e10cSrcweir
handleReference(char const * position,char const * end)433cdf0e10cSrcweir char const * XmlReader::handleReference(char const * position, char const * end)
434cdf0e10cSrcweir {
435cdf0e10cSrcweir OSL_ASSERT(position != 0 && *position == '&' && position < end);
436cdf0e10cSrcweir ++position;
437cdf0e10cSrcweir if (*position == '#') {
438cdf0e10cSrcweir ++position;
439cdf0e10cSrcweir sal_Int32 val = 0;
440cdf0e10cSrcweir char const * p;
441cdf0e10cSrcweir if (*position == 'x') {
442cdf0e10cSrcweir ++position;
443cdf0e10cSrcweir p = position;
444cdf0e10cSrcweir for (;; ++position) {
445cdf0e10cSrcweir char c = *position;
446cdf0e10cSrcweir if (c >= '0' && c <= '9') {
447cdf0e10cSrcweir val = 16 * val + (c - '0');
448cdf0e10cSrcweir } else if (c >= 'A' && c <= 'F') {
449cdf0e10cSrcweir val = 16 * val + (c - 'A') + 10;
450cdf0e10cSrcweir } else if (c >= 'a' && c <= 'f') {
451cdf0e10cSrcweir val = 16 * val + (c - 'a') + 10;
452cdf0e10cSrcweir } else {
453cdf0e10cSrcweir break;
454cdf0e10cSrcweir }
455cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow
456cdf0e10cSrcweir throw css::uno::RuntimeException(
457cdf0e10cSrcweir (rtl::OUString(
458cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
459cdf0e10cSrcweir "'&#x...' too large in ")) +
460cdf0e10cSrcweir fileUrl_),
461cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
462cdf0e10cSrcweir }
463cdf0e10cSrcweir }
464cdf0e10cSrcweir } else {
465cdf0e10cSrcweir p = position;
466cdf0e10cSrcweir for (;; ++position) {
467cdf0e10cSrcweir char c = *position;
468cdf0e10cSrcweir if (c >= '0' && c <= '9') {
469cdf0e10cSrcweir val = 10 * val + (c - '0');
470cdf0e10cSrcweir } else {
471cdf0e10cSrcweir break;
472cdf0e10cSrcweir }
473cdf0e10cSrcweir if (val > 0x10FFFF) { // avoid overflow
474cdf0e10cSrcweir throw css::uno::RuntimeException(
475cdf0e10cSrcweir (rtl::OUString(
476cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
477cdf0e10cSrcweir "'&#...' too large in ")) +
478cdf0e10cSrcweir fileUrl_),
479cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
480cdf0e10cSrcweir }
481cdf0e10cSrcweir }
482cdf0e10cSrcweir }
483cdf0e10cSrcweir if (position == p || *position++ != ';') {
484cdf0e10cSrcweir throw css::uno::RuntimeException(
485cdf0e10cSrcweir (rtl::OUString(
486cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
487cdf0e10cSrcweir fileUrl_),
488cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
489cdf0e10cSrcweir }
490cdf0e10cSrcweir OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
491cdf0e10cSrcweir if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
492cdf0e10cSrcweir (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
493cdf0e10cSrcweir {
494cdf0e10cSrcweir throw css::uno::RuntimeException(
495cdf0e10cSrcweir (rtl::OUString(
496cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
497cdf0e10cSrcweir "character reference denoting invalid character in ")) +
498cdf0e10cSrcweir fileUrl_),
499cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
500cdf0e10cSrcweir }
501cdf0e10cSrcweir char buf[4];
502cdf0e10cSrcweir sal_Int32 len;
503cdf0e10cSrcweir if (val < 0x80) {
504cdf0e10cSrcweir buf[0] = static_cast< char >(val);
505cdf0e10cSrcweir len = 1;
506cdf0e10cSrcweir } else if (val < 0x800) {
507cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 6) | 0xC0);
508cdf0e10cSrcweir buf[1] = static_cast< char >((val & 0x3F) | 0x80);
509cdf0e10cSrcweir len = 2;
510cdf0e10cSrcweir } else if (val < 0x10000) {
511cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 12) | 0xE0);
512cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
513cdf0e10cSrcweir buf[2] = static_cast< char >((val & 0x3F) | 0x80);
514cdf0e10cSrcweir len = 3;
515cdf0e10cSrcweir } else {
516cdf0e10cSrcweir buf[0] = static_cast< char >((val >> 18) | 0xF0);
517cdf0e10cSrcweir buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
518cdf0e10cSrcweir buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
519cdf0e10cSrcweir buf[3] = static_cast< char >((val & 0x3F) | 0x80);
520cdf0e10cSrcweir len = 4;
521cdf0e10cSrcweir }
522cdf0e10cSrcweir pad_.addEphemeral(buf, len);
523cdf0e10cSrcweir return position;
524cdf0e10cSrcweir } else {
525cdf0e10cSrcweir struct EntityRef {
526cdf0e10cSrcweir char const * inBegin;
527cdf0e10cSrcweir sal_Int32 inLength;
528cdf0e10cSrcweir char const * outBegin;
529cdf0e10cSrcweir sal_Int32 outLength;
530cdf0e10cSrcweir };
531cdf0e10cSrcweir static EntityRef const refs[] = {
532cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("amp;"),
533cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("&") },
534cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("lt;"),
535cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("<") },
536cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("gt;"),
537cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM(">") },
538cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("apos;"),
539cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("'") },
540cdf0e10cSrcweir { RTL_CONSTASCII_STRINGPARAM("quot;"),
541cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("\"") } };
542cdf0e10cSrcweir for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
543cdf0e10cSrcweir if (rtl_str_shortenedCompare_WithLength(
544cdf0e10cSrcweir position, end - position, refs[i].inBegin, refs[i].inLength,
545cdf0e10cSrcweir refs[i].inLength) ==
546cdf0e10cSrcweir 0)
547cdf0e10cSrcweir {
548cdf0e10cSrcweir position += refs[i].inLength;
549cdf0e10cSrcweir pad_.add(refs[i].outBegin, refs[i].outLength);
550cdf0e10cSrcweir return position;
551cdf0e10cSrcweir }
552cdf0e10cSrcweir }
553cdf0e10cSrcweir throw css::uno::RuntimeException(
554cdf0e10cSrcweir (rtl::OUString(
555cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
556cdf0e10cSrcweir fileUrl_),
557cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
558cdf0e10cSrcweir }
559cdf0e10cSrcweir }
560cdf0e10cSrcweir
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)561cdf0e10cSrcweir Span XmlReader::handleAttributeValue(
562cdf0e10cSrcweir char const * begin, char const * end, bool fullyNormalize)
563cdf0e10cSrcweir {
564cdf0e10cSrcweir pad_.clear();
565cdf0e10cSrcweir if (fullyNormalize) {
566cdf0e10cSrcweir while (begin != end && isSpace(*begin)) {
567cdf0e10cSrcweir ++begin;
568cdf0e10cSrcweir }
569cdf0e10cSrcweir while (end != begin && isSpace(end[-1])) {
570cdf0e10cSrcweir --end;
571cdf0e10cSrcweir }
572cdf0e10cSrcweir char const * p = begin;
573cdf0e10cSrcweir enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
574cdf0e10cSrcweir // a single true space character can go into the current span,
575cdf0e10cSrcweir // everything else breaks the span
576cdf0e10cSrcweir Space space = SPACE_NONE;
577cdf0e10cSrcweir while (p != end) {
578cdf0e10cSrcweir switch (*p) {
579cdf0e10cSrcweir case '\x09':
580cdf0e10cSrcweir case '\x0A':
581cdf0e10cSrcweir case '\x0D':
582cdf0e10cSrcweir switch (space) {
583cdf0e10cSrcweir case SPACE_NONE:
584cdf0e10cSrcweir pad_.add(begin, p - begin);
585cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
586cdf0e10cSrcweir space = SPACE_BREAK;
587cdf0e10cSrcweir break;
588cdf0e10cSrcweir case SPACE_SPAN:
589cdf0e10cSrcweir pad_.add(begin, p - begin);
590cdf0e10cSrcweir space = SPACE_BREAK;
591cdf0e10cSrcweir break;
592cdf0e10cSrcweir case SPACE_BREAK:
593cdf0e10cSrcweir break;
594cdf0e10cSrcweir }
595cdf0e10cSrcweir begin = ++p;
596cdf0e10cSrcweir break;
597cdf0e10cSrcweir case ' ':
598cdf0e10cSrcweir switch (space) {
599cdf0e10cSrcweir case SPACE_NONE:
600cdf0e10cSrcweir ++p;
601cdf0e10cSrcweir space = SPACE_SPAN;
602cdf0e10cSrcweir break;
603cdf0e10cSrcweir case SPACE_SPAN:
604cdf0e10cSrcweir pad_.add(begin, p - begin);
605cdf0e10cSrcweir begin = ++p;
606cdf0e10cSrcweir space = SPACE_BREAK;
607cdf0e10cSrcweir break;
608cdf0e10cSrcweir case SPACE_BREAK:
609cdf0e10cSrcweir begin = ++p;
610cdf0e10cSrcweir break;
611cdf0e10cSrcweir }
612cdf0e10cSrcweir break;
613cdf0e10cSrcweir case '&':
614cdf0e10cSrcweir pad_.add(begin, p - begin);
615cdf0e10cSrcweir p = handleReference(p, end);
616cdf0e10cSrcweir begin = p;
617cdf0e10cSrcweir space = SPACE_NONE;
618cdf0e10cSrcweir break;
619cdf0e10cSrcweir default:
620cdf0e10cSrcweir ++p;
621cdf0e10cSrcweir space = SPACE_NONE;
622cdf0e10cSrcweir break;
623cdf0e10cSrcweir }
624cdf0e10cSrcweir }
625cdf0e10cSrcweir pad_.add(begin, p - begin);
626cdf0e10cSrcweir } else {
627cdf0e10cSrcweir char const * p = begin;
628cdf0e10cSrcweir while (p != end) {
629cdf0e10cSrcweir switch (*p) {
630cdf0e10cSrcweir case '\x09':
631cdf0e10cSrcweir case '\x0A':
632cdf0e10cSrcweir pad_.add(begin, p - begin);
633cdf0e10cSrcweir begin = ++p;
634cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
635cdf0e10cSrcweir break;
636cdf0e10cSrcweir case '\x0D':
637cdf0e10cSrcweir pad_.add(begin, p - begin);
638cdf0e10cSrcweir ++p;
639cdf0e10cSrcweir if (peek() == '\x0A') {
640cdf0e10cSrcweir ++p;
641cdf0e10cSrcweir }
642cdf0e10cSrcweir begin = p;
643cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
644cdf0e10cSrcweir break;
645cdf0e10cSrcweir case '&':
646cdf0e10cSrcweir pad_.add(begin, p - begin);
647cdf0e10cSrcweir p = handleReference(p, end);
648cdf0e10cSrcweir begin = p;
649cdf0e10cSrcweir break;
650cdf0e10cSrcweir default:
651cdf0e10cSrcweir ++p;
652cdf0e10cSrcweir break;
653cdf0e10cSrcweir }
654cdf0e10cSrcweir }
655cdf0e10cSrcweir pad_.add(begin, p - begin);
656cdf0e10cSrcweir }
657cdf0e10cSrcweir return pad_.get();
658cdf0e10cSrcweir }
659cdf0e10cSrcweir
handleStartTag(int * nsId,Span * localName)660cdf0e10cSrcweir XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
661cdf0e10cSrcweir OSL_ASSERT(nsId != 0 && localName);
662cdf0e10cSrcweir char const * nameBegin = pos_;
663cdf0e10cSrcweir char const * nameColon = 0;
664cdf0e10cSrcweir if (!scanName(&nameColon)) {
665cdf0e10cSrcweir throw css::uno::RuntimeException(
666cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
667cdf0e10cSrcweir fileUrl_),
668cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
669cdf0e10cSrcweir }
670cdf0e10cSrcweir char const * nameEnd = pos_;
671cdf0e10cSrcweir NamespaceList::size_type inheritedNamespaces = namespaces_.size();
672cdf0e10cSrcweir bool hasDefaultNs = false;
673cdf0e10cSrcweir int defaultNsId = NAMESPACE_NONE;
674cdf0e10cSrcweir attributes_.clear();
675cdf0e10cSrcweir for (;;) {
676cdf0e10cSrcweir char const * p = pos_;
677cdf0e10cSrcweir skipSpace();
678cdf0e10cSrcweir if (peek() == '/' || peek() == '>') {
679cdf0e10cSrcweir break;
680cdf0e10cSrcweir }
681cdf0e10cSrcweir if (pos_ == p) {
682cdf0e10cSrcweir throw css::uno::RuntimeException(
683cdf0e10cSrcweir (rtl::OUString(
684cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
685cdf0e10cSrcweir "missing whitespace before attribute in ")) +
686cdf0e10cSrcweir fileUrl_),
687cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
688cdf0e10cSrcweir }
689cdf0e10cSrcweir char const * attrNameBegin = pos_;
690cdf0e10cSrcweir char const * attrNameColon = 0;
691cdf0e10cSrcweir if (!scanName(&attrNameColon)) {
692cdf0e10cSrcweir throw css::uno::RuntimeException(
693cdf0e10cSrcweir (rtl::OUString(
694cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
695cdf0e10cSrcweir fileUrl_),
696cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
697cdf0e10cSrcweir }
698cdf0e10cSrcweir char const * attrNameEnd = pos_;
699cdf0e10cSrcweir skipSpace();
700cdf0e10cSrcweir if (read() != '=') {
701cdf0e10cSrcweir throw css::uno::RuntimeException(
702cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
703cdf0e10cSrcweir fileUrl_),
704cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
705cdf0e10cSrcweir }
706cdf0e10cSrcweir skipSpace();
707cdf0e10cSrcweir char del = read();
708cdf0e10cSrcweir if (del != '\'' && del != '"') {
709cdf0e10cSrcweir throw css::uno::RuntimeException(
710cdf0e10cSrcweir (rtl::OUString(
711cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
712cdf0e10cSrcweir fileUrl_),
713cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
714cdf0e10cSrcweir }
715cdf0e10cSrcweir char const * valueBegin = pos_;
716cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
717cdf0e10cSrcweir if (i < 0) {
718cdf0e10cSrcweir throw css::uno::RuntimeException(
719cdf0e10cSrcweir (rtl::OUString(
720cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM(
721cdf0e10cSrcweir "unterminated attribute value in ")) +
722cdf0e10cSrcweir fileUrl_),
723cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
724cdf0e10cSrcweir }
725cdf0e10cSrcweir char const * valueEnd = pos_ + i;
726cdf0e10cSrcweir pos_ += i + 1;
727cdf0e10cSrcweir if (attrNameColon == 0 &&
728cdf0e10cSrcweir Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
729cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns")))
730cdf0e10cSrcweir {
731cdf0e10cSrcweir hasDefaultNs = true;
732cdf0e10cSrcweir defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
733cdf0e10cSrcweir } else if (attrNameColon != 0 &&
734cdf0e10cSrcweir Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
735cdf0e10cSrcweir RTL_CONSTASCII_STRINGPARAM("xmlns")))
736cdf0e10cSrcweir {
737cdf0e10cSrcweir namespaces_.push_back(
738cdf0e10cSrcweir NamespaceData(
739cdf0e10cSrcweir Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
740cdf0e10cSrcweir scanNamespaceIri(valueBegin, valueEnd)));
741cdf0e10cSrcweir } else {
742cdf0e10cSrcweir attributes_.push_back(
743cdf0e10cSrcweir AttributeData(
744cdf0e10cSrcweir attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
745cdf0e10cSrcweir valueEnd));
746cdf0e10cSrcweir }
747cdf0e10cSrcweir }
748cdf0e10cSrcweir if (!hasDefaultNs && !elements_.empty()) {
749cdf0e10cSrcweir defaultNsId = elements_.top().defaultNamespaceId;
750cdf0e10cSrcweir }
751cdf0e10cSrcweir firstAttribute_ = true;
752cdf0e10cSrcweir if (peek() == '/') {
753cdf0e10cSrcweir state_ = STATE_EMPTY_ELEMENT_TAG;
754cdf0e10cSrcweir ++pos_;
755cdf0e10cSrcweir } else {
756cdf0e10cSrcweir state_ = STATE_CONTENT;
757cdf0e10cSrcweir }
758cdf0e10cSrcweir if (peek() != '>') {
759cdf0e10cSrcweir throw css::uno::RuntimeException(
760cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
761cdf0e10cSrcweir fileUrl_),
762cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
763cdf0e10cSrcweir }
764cdf0e10cSrcweir ++pos_;
765cdf0e10cSrcweir elements_.push(
766cdf0e10cSrcweir ElementData(
767cdf0e10cSrcweir Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
768cdf0e10cSrcweir defaultNsId));
769cdf0e10cSrcweir if (nameColon == 0) {
770cdf0e10cSrcweir *nsId = defaultNsId;
771cdf0e10cSrcweir *localName = Span(nameBegin, nameEnd - nameBegin);
772cdf0e10cSrcweir } else {
773cdf0e10cSrcweir *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
774cdf0e10cSrcweir *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
775cdf0e10cSrcweir }
776cdf0e10cSrcweir return RESULT_BEGIN;
777cdf0e10cSrcweir }
778cdf0e10cSrcweir
handleEndTag()779cdf0e10cSrcweir XmlReader::Result XmlReader::handleEndTag() {
780cdf0e10cSrcweir if (elements_.empty()) {
781cdf0e10cSrcweir throw css::uno::RuntimeException(
782cdf0e10cSrcweir (rtl::OUString(
783cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
784cdf0e10cSrcweir fileUrl_),
785cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
786cdf0e10cSrcweir }
787cdf0e10cSrcweir char const * nameBegin = pos_;
788cdf0e10cSrcweir char const * nameColon = 0;
789cdf0e10cSrcweir if (!scanName(&nameColon) ||
790cdf0e10cSrcweir !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
791cdf0e10cSrcweir {
792cdf0e10cSrcweir throw css::uno::RuntimeException(
793cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
794cdf0e10cSrcweir fileUrl_),
795cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
796cdf0e10cSrcweir }
797cdf0e10cSrcweir handleElementEnd();
798cdf0e10cSrcweir skipSpace();
799cdf0e10cSrcweir if (peek() != '>') {
800cdf0e10cSrcweir throw css::uno::RuntimeException(
801cdf0e10cSrcweir (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
802cdf0e10cSrcweir fileUrl_),
803cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
804cdf0e10cSrcweir }
805cdf0e10cSrcweir ++pos_;
806cdf0e10cSrcweir return RESULT_END;
807cdf0e10cSrcweir }
808cdf0e10cSrcweir
handleElementEnd()809cdf0e10cSrcweir void XmlReader::handleElementEnd() {
810cdf0e10cSrcweir OSL_ASSERT(!elements_.empty());
811cdf0e10cSrcweir namespaces_.resize(elements_.top().inheritedNamespaces);
812cdf0e10cSrcweir elements_.pop();
813cdf0e10cSrcweir state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
814cdf0e10cSrcweir }
815cdf0e10cSrcweir
handleSkippedText(Span * data,int * nsId)816cdf0e10cSrcweir XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
817cdf0e10cSrcweir for (;;) {
818cdf0e10cSrcweir sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
819cdf0e10cSrcweir if (i < 0) {
820cdf0e10cSrcweir throw css::uno::RuntimeException(
821cdf0e10cSrcweir (rtl::OUString(
822cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
823cdf0e10cSrcweir fileUrl_),
824cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
825cdf0e10cSrcweir }
826cdf0e10cSrcweir pos_ += i + 1;
827cdf0e10cSrcweir switch (peek()) {
828cdf0e10cSrcweir case '!':
829cdf0e10cSrcweir ++pos_;
830cdf0e10cSrcweir if (!skipComment() && !scanCdataSection().is()) {
831cdf0e10cSrcweir skipDocumentTypeDeclaration();
832cdf0e10cSrcweir }
833cdf0e10cSrcweir break;
834cdf0e10cSrcweir case '/':
835cdf0e10cSrcweir ++pos_;
836cdf0e10cSrcweir return handleEndTag();
837cdf0e10cSrcweir case '?':
838cdf0e10cSrcweir ++pos_;
839cdf0e10cSrcweir skipProcessingInstruction();
840cdf0e10cSrcweir break;
841cdf0e10cSrcweir default:
842cdf0e10cSrcweir return handleStartTag(nsId, data);
843cdf0e10cSrcweir }
844cdf0e10cSrcweir }
845cdf0e10cSrcweir }
846cdf0e10cSrcweir
handleRawText(Span * text)847cdf0e10cSrcweir XmlReader::Result XmlReader::handleRawText(Span * text) {
848cdf0e10cSrcweir pad_.clear();
849cdf0e10cSrcweir for (char const * begin = pos_;;) {
850cdf0e10cSrcweir switch (peek()) {
851cdf0e10cSrcweir case '\0': // i.e., EOF
852cdf0e10cSrcweir throw css::uno::RuntimeException(
853cdf0e10cSrcweir (rtl::OUString(
854cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
855cdf0e10cSrcweir fileUrl_),
856cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
857cdf0e10cSrcweir case '\x0D':
858cdf0e10cSrcweir pad_.add(begin, pos_ - begin);
859cdf0e10cSrcweir ++pos_;
860cdf0e10cSrcweir if (peek() != '\x0A') {
861cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
862cdf0e10cSrcweir }
863cdf0e10cSrcweir begin = pos_;
864cdf0e10cSrcweir break;
865cdf0e10cSrcweir case '&':
866cdf0e10cSrcweir pad_.add(begin, pos_ - begin);
867cdf0e10cSrcweir pos_ = handleReference(pos_, end_);
868cdf0e10cSrcweir begin = pos_;
869cdf0e10cSrcweir break;
870cdf0e10cSrcweir case '<':
871cdf0e10cSrcweir pad_.add(begin, pos_ - begin);
872cdf0e10cSrcweir ++pos_;
873cdf0e10cSrcweir switch (peek()) {
874cdf0e10cSrcweir case '!':
875cdf0e10cSrcweir ++pos_;
876cdf0e10cSrcweir if (!skipComment()) {
877cdf0e10cSrcweir Span cdata(scanCdataSection());
878cdf0e10cSrcweir if (cdata.is()) {
879cdf0e10cSrcweir normalizeLineEnds(cdata);
880cdf0e10cSrcweir } else {
881cdf0e10cSrcweir skipDocumentTypeDeclaration();
882cdf0e10cSrcweir }
883cdf0e10cSrcweir }
884cdf0e10cSrcweir begin = pos_;
885cdf0e10cSrcweir break;
886cdf0e10cSrcweir case '/':
887cdf0e10cSrcweir *text = pad_.get();
888cdf0e10cSrcweir ++pos_;
889cdf0e10cSrcweir state_ = STATE_END_TAG;
890cdf0e10cSrcweir return RESULT_TEXT;
891cdf0e10cSrcweir case '?':
892cdf0e10cSrcweir ++pos_;
893cdf0e10cSrcweir skipProcessingInstruction();
894cdf0e10cSrcweir begin = pos_;
895cdf0e10cSrcweir break;
896cdf0e10cSrcweir default:
897cdf0e10cSrcweir *text = pad_.get();
898cdf0e10cSrcweir state_ = STATE_START_TAG;
899cdf0e10cSrcweir return RESULT_TEXT;
900cdf0e10cSrcweir }
901cdf0e10cSrcweir break;
902cdf0e10cSrcweir default:
903cdf0e10cSrcweir ++pos_;
904cdf0e10cSrcweir break;
905cdf0e10cSrcweir }
906cdf0e10cSrcweir }
907cdf0e10cSrcweir }
908cdf0e10cSrcweir
handleNormalizedText(Span * text)909cdf0e10cSrcweir XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
910cdf0e10cSrcweir pad_.clear();
911cdf0e10cSrcweir char const * flowBegin = pos_;
912cdf0e10cSrcweir char const * flowEnd = pos_;
913cdf0e10cSrcweir enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
914cdf0e10cSrcweir // a single true space character can go into the current flow,
915cdf0e10cSrcweir // everything else breaks the flow
916cdf0e10cSrcweir Space space = SPACE_START;
917cdf0e10cSrcweir for (;;) {
918cdf0e10cSrcweir switch (peek()) {
919cdf0e10cSrcweir case '\0': // i.e., EOF
920cdf0e10cSrcweir throw css::uno::RuntimeException(
921cdf0e10cSrcweir (rtl::OUString(
922cdf0e10cSrcweir RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
923cdf0e10cSrcweir fileUrl_),
924cdf0e10cSrcweir css::uno::Reference< css::uno::XInterface >());
925cdf0e10cSrcweir case '\x09':
926cdf0e10cSrcweir case '\x0A':
927cdf0e10cSrcweir case '\x0D':
928cdf0e10cSrcweir switch (space) {
929cdf0e10cSrcweir case SPACE_START:
930cdf0e10cSrcweir case SPACE_BREAK:
931cdf0e10cSrcweir break;
932cdf0e10cSrcweir case SPACE_NONE:
933cdf0e10cSrcweir case SPACE_SPAN:
934cdf0e10cSrcweir space = SPACE_BREAK;
935cdf0e10cSrcweir break;
936cdf0e10cSrcweir }
937cdf0e10cSrcweir ++pos_;
938cdf0e10cSrcweir break;
939cdf0e10cSrcweir case ' ':
940cdf0e10cSrcweir switch (space) {
941cdf0e10cSrcweir case SPACE_START:
942cdf0e10cSrcweir case SPACE_BREAK:
943cdf0e10cSrcweir break;
944cdf0e10cSrcweir case SPACE_NONE:
945cdf0e10cSrcweir space = SPACE_SPAN;
946cdf0e10cSrcweir break;
947cdf0e10cSrcweir case SPACE_SPAN:
948cdf0e10cSrcweir space = SPACE_BREAK;
949cdf0e10cSrcweir break;
950cdf0e10cSrcweir }
951cdf0e10cSrcweir ++pos_;
952cdf0e10cSrcweir break;
953cdf0e10cSrcweir case '&':
954cdf0e10cSrcweir switch (space) {
955cdf0e10cSrcweir case SPACE_START:
956cdf0e10cSrcweir break;
957cdf0e10cSrcweir case SPACE_NONE:
958cdf0e10cSrcweir case SPACE_SPAN:
959cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin);
960cdf0e10cSrcweir break;
961cdf0e10cSrcweir case SPACE_BREAK:
962cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
963cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
964cdf0e10cSrcweir break;
965cdf0e10cSrcweir }
966cdf0e10cSrcweir pos_ = handleReference(pos_, end_);
967cdf0e10cSrcweir flowBegin = pos_;
968cdf0e10cSrcweir flowEnd = pos_;
969cdf0e10cSrcweir space = SPACE_NONE;
970cdf0e10cSrcweir break;
971cdf0e10cSrcweir case '<':
972cdf0e10cSrcweir ++pos_;
973cdf0e10cSrcweir switch (peek()) {
974cdf0e10cSrcweir case '!':
975cdf0e10cSrcweir ++pos_;
976cdf0e10cSrcweir if (skipComment()) {
977cdf0e10cSrcweir space = SPACE_BREAK;
978cdf0e10cSrcweir } else {
979cdf0e10cSrcweir Span cdata(scanCdataSection());
980cdf0e10cSrcweir if (cdata.is()) {
981cdf0e10cSrcweir // CDATA is not normalized (similar to character
982cdf0e10cSrcweir // references; it keeps the code simple), but it might
983cdf0e10cSrcweir // arguably be better to normalize it:
984cdf0e10cSrcweir switch (space) {
985cdf0e10cSrcweir case SPACE_START:
986cdf0e10cSrcweir break;
987cdf0e10cSrcweir case SPACE_NONE:
988cdf0e10cSrcweir case SPACE_SPAN:
989cdf0e10cSrcweir pad_.add(flowBegin, pos_ - flowBegin);
990cdf0e10cSrcweir break;
991cdf0e10cSrcweir case SPACE_BREAK:
992cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
993cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
994cdf0e10cSrcweir break;
995cdf0e10cSrcweir }
996cdf0e10cSrcweir normalizeLineEnds(cdata);
997cdf0e10cSrcweir flowBegin = pos_;
998cdf0e10cSrcweir flowEnd = pos_;
999cdf0e10cSrcweir space = SPACE_NONE;
1000cdf0e10cSrcweir } else {
1001cdf0e10cSrcweir skipDocumentTypeDeclaration();
1002cdf0e10cSrcweir }
1003cdf0e10cSrcweir }
1004cdf0e10cSrcweir break;
1005cdf0e10cSrcweir case '/':
1006cdf0e10cSrcweir ++pos_;
1007cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
1008cdf0e10cSrcweir *text = pad_.get();
1009cdf0e10cSrcweir state_ = STATE_END_TAG;
1010cdf0e10cSrcweir return RESULT_TEXT;
1011cdf0e10cSrcweir case '?':
1012cdf0e10cSrcweir ++pos_;
1013cdf0e10cSrcweir skipProcessingInstruction();
1014cdf0e10cSrcweir space = SPACE_BREAK;
1015cdf0e10cSrcweir break;
1016cdf0e10cSrcweir default:
1017cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
1018cdf0e10cSrcweir *text = pad_.get();
1019cdf0e10cSrcweir state_ = STATE_START_TAG;
1020cdf0e10cSrcweir return RESULT_TEXT;
1021cdf0e10cSrcweir }
1022cdf0e10cSrcweir break;
1023cdf0e10cSrcweir default:
1024cdf0e10cSrcweir switch (space) {
1025cdf0e10cSrcweir case SPACE_START:
1026cdf0e10cSrcweir flowBegin = pos_;
1027cdf0e10cSrcweir break;
1028cdf0e10cSrcweir case SPACE_NONE:
1029cdf0e10cSrcweir case SPACE_SPAN:
1030cdf0e10cSrcweir break;
1031cdf0e10cSrcweir case SPACE_BREAK:
1032cdf0e10cSrcweir pad_.add(flowBegin, flowEnd - flowBegin);
1033cdf0e10cSrcweir pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1034cdf0e10cSrcweir flowBegin = pos_;
1035cdf0e10cSrcweir break;
1036cdf0e10cSrcweir }
1037cdf0e10cSrcweir flowEnd = ++pos_;
1038cdf0e10cSrcweir space = SPACE_NONE;
1039cdf0e10cSrcweir break;
1040cdf0e10cSrcweir }
1041cdf0e10cSrcweir }
1042cdf0e10cSrcweir }
1043cdf0e10cSrcweir
toNamespaceId(NamespaceIris::size_type pos)1044cdf0e10cSrcweir int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1045cdf0e10cSrcweir OSL_ASSERT(pos <= INT_MAX);
1046cdf0e10cSrcweir return static_cast< int >(pos);
1047cdf0e10cSrcweir }
1048cdf0e10cSrcweir
1049cdf0e10cSrcweir }
1050