1 /**************************************************************
2 *
3 * Licensed to the Apache Software Foundation (ASF) under one
4 * or more contributor license agreements. See the NOTICE file
5 * distributed with this work for additional information
6 * regarding copyright ownership. The ASF licenses this file
7 * to you under the Apache License, Version 2.0 (the
8 * "License"); you may not use this file except in compliance
9 * with the License. You may obtain a copy of the License at
10 *
11 * http://www.apache.org/licenses/LICENSE-2.0
12 *
13 * Unless required by applicable law or agreed to in writing,
14 * software distributed under the License is distributed on an
15 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16 * KIND, either express or implied. See the License for the
17 * specific language governing permissions and limitations
18 * under the License.
19 *
20 *************************************************************/
21
22
23
24 #include "sal/config.h"
25
26 #include <climits>
27 #include <cstddef>
28
29 #include "com/sun/star/container/NoSuchElementException.hpp"
30 #include "com/sun/star/uno/Reference.hxx"
31 #include "com/sun/star/uno/RuntimeException.hpp"
32 #include "com/sun/star/uno/XInterface.hpp"
33 #include "osl/diagnose.h"
34 #include "osl/file.h"
35 #include "rtl/string.h"
36 #include "rtl/ustring.h"
37 #include "rtl/ustring.hxx"
38 #include "sal/types.h"
39 #include "xmlreader/pad.hxx"
40 #include "xmlreader/span.hxx"
41 #include "xmlreader/xmlreader.hxx"
42
43 namespace xmlreader {
44
45 namespace {
46
47 namespace css = com::sun::star;
48
// Tells whether c is one of the four white space characters recognized by
// this reader: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
60
61 }
62
XmlReader(rtl::OUString const & fileUrl)63 XmlReader::XmlReader(rtl::OUString const & fileUrl)
64 SAL_THROW((
65 css::container::NoSuchElementException, css::uno::RuntimeException)):
66 fileUrl_(fileUrl)
67 {
68 switch (osl_openFile(fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read))
69 {
70 case osl_File_E_None:
71 break;
72 case osl_File_E_NOENT:
73 throw css::container::NoSuchElementException(
74 fileUrl_, css::uno::Reference< css::uno::XInterface >());
75 default:
76 throw css::uno::RuntimeException(
77 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot open ")) +
78 fileUrl_),
79 css::uno::Reference< css::uno::XInterface >());
80 }
81 oslFileError e = osl_getFileSize(fileHandle_, &fileSize_);
82 if (e == osl_File_E_None) {
83 e = osl_mapFile(
84 fileHandle_, &fileAddress_, fileSize_, 0,
85 osl_File_MapFlag_WillNeed);
86 }
87 if (e != osl_File_E_None) {
88 e = osl_closeFile(fileHandle_);
89 if (e != osl_File_E_None) {
90 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
91 }
92 throw css::uno::RuntimeException(
93 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("cannot mmap ")) +
94 fileUrl_),
95 css::uno::Reference< css::uno::XInterface >());
96 }
97 namespaceIris_.push_back(
98 Span(
99 RTL_CONSTASCII_STRINGPARAM(
100 "http://www.w3.org/XML/1998/namespace")));
101 namespaces_.push_back(
102 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xml")), NAMESPACE_XML));
103 pos_ = static_cast< char * >(fileAddress_);
104 end_ = pos_ + fileSize_;
105 state_ = STATE_CONTENT;
106 }
107
~XmlReader()108 XmlReader::~XmlReader() {
109 oslFileError e = osl_unmapFile(fileAddress_, fileSize_);
110 if (e != osl_File_E_None) {
111 OSL_TRACE("osl_unmapFile failed with %ld", static_cast< long >(e));
112 }
113 e = osl_closeFile(fileHandle_);
114 if (e != osl_File_E_None) {
115 OSL_TRACE("osl_closeFile failed with %ld", static_cast< long >(e));
116 }
117 }
118
registerNamespaceIri(Span const & iri)119 int XmlReader::registerNamespaceIri(Span const & iri) {
120 int id = toNamespaceId(namespaceIris_.size());
121 namespaceIris_.push_back(iri);
122 if (iri.equals(
123 Span(
124 RTL_CONSTASCII_STRINGPARAM(
125 "http://www.w3.org/2001/XMLSchema-instance"))))
126 {
127 // Old user layer .xcu files used the xsi namespace prefix without
128 // declaring a corresponding namespace binding, see issue 77174; reading
129 // those files during migration would fail without this hack that can be
130 // removed once migration is no longer relevant (see
131 // configmgr::Components::parseModificationLayer):
132 namespaces_.push_back(
133 NamespaceData(Span(RTL_CONSTASCII_STRINGPARAM("xsi")), id));
134 }
135 return id;
136 }
137
nextItem(Text reportText,Span * data,int * nsId)138 XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
139 {
140 switch (state_) {
141 case STATE_CONTENT:
142 switch (reportText) {
143 case TEXT_NONE:
144 return handleSkippedText(data, nsId);
145 case TEXT_RAW:
146 return handleRawText(data);
147 case TEXT_NORMALIZED:
148 return handleNormalizedText(data);
149 }
150 case STATE_START_TAG:
151 return handleStartTag(nsId, data);
152 case STATE_END_TAG:
153 return handleEndTag();
154 case STATE_EMPTY_ELEMENT_TAG:
155 handleElementEnd();
156 return RESULT_END;
157 default: // STATE_DONE
158 return RESULT_DONE;
159 }
160 }
161
nextAttribute(int * nsId,Span * localName)162 bool XmlReader::nextAttribute(int * nsId, Span * localName) {
163 OSL_ASSERT(nsId != 0 && localName != 0);
164 if (firstAttribute_) {
165 currentAttribute_ = attributes_.begin();
166 firstAttribute_ = false;
167 } else {
168 ++currentAttribute_;
169 }
170 if (currentAttribute_ == attributes_.end()) {
171 return false;
172 }
173 if (currentAttribute_->nameColon == 0) {
174 *nsId = NAMESPACE_NONE;
175 *localName = Span(
176 currentAttribute_->nameBegin,
177 currentAttribute_->nameEnd - currentAttribute_->nameBegin);
178 } else {
179 *nsId = getNamespaceId(
180 Span(
181 currentAttribute_->nameBegin,
182 currentAttribute_->nameColon - currentAttribute_->nameBegin));
183 *localName = Span(
184 currentAttribute_->nameColon + 1,
185 currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
186 }
187 return true;
188 }
189
getAttributeValue(bool fullyNormalize)190 Span XmlReader::getAttributeValue(bool fullyNormalize) {
191 return handleAttributeValue(
192 currentAttribute_->valueBegin, currentAttribute_->valueEnd,
193 fullyNormalize);
194 }
195
getNamespaceId(Span const & prefix) const196 int XmlReader::getNamespaceId(Span const & prefix) const {
197 for (NamespaceList::const_reverse_iterator i(namespaces_.rbegin());
198 i != namespaces_.rend(); ++i)
199 {
200 if (prefix.equals(i->prefix)) {
201 return i->nsId;
202 }
203 }
204 return NAMESPACE_UNKNOWN;
205 }
206
getUrl() const207 rtl::OUString XmlReader::getUrl() const {
208 return fileUrl_;
209 }
210
normalizeLineEnds(Span const & text)211 void XmlReader::normalizeLineEnds(Span const & text) {
212 char const * p = text.begin;
213 sal_Int32 n = text.length;
214 for (;;) {
215 sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
216 if (i < 0) {
217 break;
218 }
219 pad_.add(p, i);
220 p += i + 1;
221 n -= i + 1;
222 if (n == 0 || *p != '\x0A') {
223 pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
224 }
225 }
226 pad_.add(p, n);
227 }
228
skipSpace()229 void XmlReader::skipSpace() {
230 while (isSpace(peek())) {
231 ++pos_;
232 }
233 }
234
skipComment()235 bool XmlReader::skipComment() {
236 if (rtl_str_shortenedCompare_WithLength(
237 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
238 RTL_CONSTASCII_LENGTH("--")) !=
239 0)
240 {
241 return false;
242 }
243 pos_ += RTL_CONSTASCII_LENGTH("--");
244 sal_Int32 i = rtl_str_indexOfStr_WithLength(
245 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
246 if (i < 0) {
247 throw css::uno::RuntimeException(
248 (rtl::OUString(
249 RTL_CONSTASCII_USTRINGPARAM(
250 "premature end (within comment) of ")) +
251 fileUrl_),
252 css::uno::Reference< css::uno::XInterface >());
253 }
254 pos_ += i + RTL_CONSTASCII_LENGTH("--");
255 if (read() != '>') {
256 throw css::uno::RuntimeException(
257 (rtl::OUString(
258 RTL_CONSTASCII_USTRINGPARAM(
259 "illegal \"--\" within comment in ")) +
260 fileUrl_),
261 css::uno::Reference< css::uno::XInterface >());
262 }
263 return true;
264 }
265
skipProcessingInstruction()266 void XmlReader::skipProcessingInstruction() {
267 sal_Int32 i = rtl_str_indexOfStr_WithLength(
268 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
269 if (i < 0) {
270 throw css::uno::RuntimeException(
271 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad '<?' in ")) +
272 fileUrl_),
273 css::uno::Reference< css::uno::XInterface >());
274 }
275 pos_ += i + RTL_CONSTASCII_LENGTH("?>");
276 }
277
// Skips over a document type declaration, consuming input up to and including
// the '>' that terminates it.
//
// Outside of an internal subset ("[...]"), quoted literals are skipped as a
// whole so that a '>' or '[' inside them is not misinterpreted.  Inside an
// internal subset, quoted literals, comments, and processing instructions are
// skipped as well, and the subset's closing ']' must be followed (after
// optional white space) by '>'.
//
// Throws css::uno::RuntimeException if the input ends within the declaration
// or if the ']' of the internal subset is not followed by '>'.
void XmlReader::skipDocumentTypeDeclaration() {
    // Neither is it checked that the doctypedecl is at the correct position in
    // the document, nor that it is well-formed:
    for (;;) {
        char c = read();
        switch (c) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "premature end (within DTD) of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        case '"':
        case '\'':
            {
                // Skip to (and past) the matching closing quote:
                sal_Int32 i = rtl_str_indexOfChar_WithLength(
                    pos_, end_ - pos_, c);
                if (i < 0) {
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "premature end (within DTD) of ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                }
                pos_ += i + 1;
            }
            break;
        case '>':
            // End of a declaration without (further) internal subset:
            return;
        case '[':
            // Internal subset; this inner loop is only left via return or
            // throw, so control never reaches the default label below:
            for (;;) {
                c = read();
                switch (c) {
                case '\0': // i.e., EOF
                    throw css::uno::RuntimeException(
                        (rtl::OUString(
                            RTL_CONSTASCII_USTRINGPARAM(
                                "premature end (within DTD) of ")) +
                         fileUrl_),
                        css::uno::Reference< css::uno::XInterface >());
                case '"':
                case '\'':
                    {
                        // Skip to (and past) the matching closing quote:
                        sal_Int32 i = rtl_str_indexOfChar_WithLength(
                            pos_, end_ - pos_, c);
                        if (i < 0) {
                            throw css::uno::RuntimeException(
                                (rtl::OUString(
                                    RTL_CONSTASCII_USTRINGPARAM(
                                        "premature end (within DTD) of ")) +
                                 fileUrl_),
                                css::uno::Reference< css::uno::XInterface >());
                        }
                        pos_ += i + 1;
                    }
                    break;
                case '<':
                    // Markup within the subset: comments and processing
                    // instructions are skipped as a whole, anything else is
                    // scanned character by character by the enclosing loop:
                    switch (read()) {
                    case '\0': // i.e., EOF
                        throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "premature end (within DTD) of ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                    case '!':
                        // The result is deliberately ignored; if this is not a
                        // comment, nothing is consumed here:
                        skipComment();
                        break;
                    case '?':
                        skipProcessingInstruction();
                        break;
                    default:
                        break;
                    }
                    break;
                case ']':
                    // End of the internal subset; only white space may
                    // precede the terminating '>':
                    skipSpace();
                    if (read() != '>') {
                        throw css::uno::RuntimeException(
                            (rtl::OUString(
                                RTL_CONSTASCII_USTRINGPARAM(
                                    "missing \">\" of DTD in ")) +
                             fileUrl_),
                            css::uno::Reference< css::uno::XInterface >());
                    }
                    return;
                default:
                    break;
                }
            }
        default:
            break;
        }
    }
}
375
scanCdataSection()376 Span XmlReader::scanCdataSection() {
377 if (rtl_str_shortenedCompare_WithLength(
378 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
379 RTL_CONSTASCII_LENGTH("[CDATA[")) !=
380 0)
381 {
382 return Span();
383 }
384 pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
385 char const * begin = pos_;
386 sal_Int32 i = rtl_str_indexOfStr_WithLength(
387 pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
388 if (i < 0) {
389 throw css::uno::RuntimeException(
390 (rtl::OUString(
391 RTL_CONSTASCII_USTRINGPARAM(
392 "premature end (within CDATA section) of ")) +
393 fileUrl_),
394 css::uno::Reference< css::uno::XInterface >());
395 }
396 pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
397 return Span(begin, i);
398 }
399
scanName(char const ** nameColon)400 bool XmlReader::scanName(char const ** nameColon) {
401 OSL_ASSERT(nameColon != 0 && *nameColon == 0);
402 for (char const * begin = pos_;; ++pos_) {
403 switch (peek()) {
404 case '\0': // i.e., EOF
405 case '\x09':
406 case '\x0A':
407 case '\x0D':
408 case ' ':
409 case '/':
410 case '=':
411 case '>':
412 return pos_ != begin;
413 case ':':
414 *nameColon = pos_;
415 break;
416 default:
417 break;
418 }
419 }
420 }
421
scanNamespaceIri(char const * begin,char const * end)422 int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
423 OSL_ASSERT(begin != 0 && begin <= end);
424 Span iri(handleAttributeValue(begin, end, false));
425 for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
426 if (namespaceIris_[i].equals(iri)) {
427 return toNamespaceId(i);
428 }
429 }
430 return XmlReader::NAMESPACE_UNKNOWN;
431 }
432
handleReference(char const * position,char const * end)433 char const * XmlReader::handleReference(char const * position, char const * end)
434 {
435 OSL_ASSERT(position != 0 && *position == '&' && position < end);
436 ++position;
437 if (*position == '#') {
438 ++position;
439 sal_Int32 val = 0;
440 char const * p;
441 if (*position == 'x') {
442 ++position;
443 p = position;
444 for (;; ++position) {
445 char c = *position;
446 if (c >= '0' && c <= '9') {
447 val = 16 * val + (c - '0');
448 } else if (c >= 'A' && c <= 'F') {
449 val = 16 * val + (c - 'A') + 10;
450 } else if (c >= 'a' && c <= 'f') {
451 val = 16 * val + (c - 'a') + 10;
452 } else {
453 break;
454 }
455 if (val > 0x10FFFF) { // avoid overflow
456 throw css::uno::RuntimeException(
457 (rtl::OUString(
458 RTL_CONSTASCII_USTRINGPARAM(
459 "'&#x...' too large in ")) +
460 fileUrl_),
461 css::uno::Reference< css::uno::XInterface >());
462 }
463 }
464 } else {
465 p = position;
466 for (;; ++position) {
467 char c = *position;
468 if (c >= '0' && c <= '9') {
469 val = 10 * val + (c - '0');
470 } else {
471 break;
472 }
473 if (val > 0x10FFFF) { // avoid overflow
474 throw css::uno::RuntimeException(
475 (rtl::OUString(
476 RTL_CONSTASCII_USTRINGPARAM(
477 "'&#...' too large in ")) +
478 fileUrl_),
479 css::uno::Reference< css::uno::XInterface >());
480 }
481 }
482 }
483 if (position == p || *position++ != ';') {
484 throw css::uno::RuntimeException(
485 (rtl::OUString(
486 RTL_CONSTASCII_USTRINGPARAM("'&#...' missing ';' in ")) +
487 fileUrl_),
488 css::uno::Reference< css::uno::XInterface >());
489 }
490 OSL_ASSERT(val >= 0 && val <= 0x10FFFF);
491 if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
492 (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
493 {
494 throw css::uno::RuntimeException(
495 (rtl::OUString(
496 RTL_CONSTASCII_USTRINGPARAM(
497 "character reference denoting invalid character in ")) +
498 fileUrl_),
499 css::uno::Reference< css::uno::XInterface >());
500 }
501 char buf[4];
502 sal_Int32 len;
503 if (val < 0x80) {
504 buf[0] = static_cast< char >(val);
505 len = 1;
506 } else if (val < 0x800) {
507 buf[0] = static_cast< char >((val >> 6) | 0xC0);
508 buf[1] = static_cast< char >((val & 0x3F) | 0x80);
509 len = 2;
510 } else if (val < 0x10000) {
511 buf[0] = static_cast< char >((val >> 12) | 0xE0);
512 buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
513 buf[2] = static_cast< char >((val & 0x3F) | 0x80);
514 len = 3;
515 } else {
516 buf[0] = static_cast< char >((val >> 18) | 0xF0);
517 buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
518 buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
519 buf[3] = static_cast< char >((val & 0x3F) | 0x80);
520 len = 4;
521 }
522 pad_.addEphemeral(buf, len);
523 return position;
524 } else {
525 struct EntityRef {
526 char const * inBegin;
527 sal_Int32 inLength;
528 char const * outBegin;
529 sal_Int32 outLength;
530 };
531 static EntityRef const refs[] = {
532 { RTL_CONSTASCII_STRINGPARAM("amp;"),
533 RTL_CONSTASCII_STRINGPARAM("&") },
534 { RTL_CONSTASCII_STRINGPARAM("lt;"),
535 RTL_CONSTASCII_STRINGPARAM("<") },
536 { RTL_CONSTASCII_STRINGPARAM("gt;"),
537 RTL_CONSTASCII_STRINGPARAM(">") },
538 { RTL_CONSTASCII_STRINGPARAM("apos;"),
539 RTL_CONSTASCII_STRINGPARAM("'") },
540 { RTL_CONSTASCII_STRINGPARAM("quot;"),
541 RTL_CONSTASCII_STRINGPARAM("\"") } };
542 for (std::size_t i = 0; i < sizeof refs / sizeof refs[0]; ++i) {
543 if (rtl_str_shortenedCompare_WithLength(
544 position, end - position, refs[i].inBegin, refs[i].inLength,
545 refs[i].inLength) ==
546 0)
547 {
548 position += refs[i].inLength;
549 pad_.add(refs[i].outBegin, refs[i].outLength);
550 return position;
551 }
552 }
553 throw css::uno::RuntimeException(
554 (rtl::OUString(
555 RTL_CONSTASCII_USTRINGPARAM("unknown entity reference in ")) +
556 fileUrl_),
557 css::uno::Reference< css::uno::XInterface >());
558 }
559 }
560
handleAttributeValue(char const * begin,char const * end,bool fullyNormalize)561 Span XmlReader::handleAttributeValue(
562 char const * begin, char const * end, bool fullyNormalize)
563 {
564 pad_.clear();
565 if (fullyNormalize) {
566 while (begin != end && isSpace(*begin)) {
567 ++begin;
568 }
569 while (end != begin && isSpace(end[-1])) {
570 --end;
571 }
572 char const * p = begin;
573 enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
574 // a single true space character can go into the current span,
575 // everything else breaks the span
576 Space space = SPACE_NONE;
577 while (p != end) {
578 switch (*p) {
579 case '\x09':
580 case '\x0A':
581 case '\x0D':
582 switch (space) {
583 case SPACE_NONE:
584 pad_.add(begin, p - begin);
585 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
586 space = SPACE_BREAK;
587 break;
588 case SPACE_SPAN:
589 pad_.add(begin, p - begin);
590 space = SPACE_BREAK;
591 break;
592 case SPACE_BREAK:
593 break;
594 }
595 begin = ++p;
596 break;
597 case ' ':
598 switch (space) {
599 case SPACE_NONE:
600 ++p;
601 space = SPACE_SPAN;
602 break;
603 case SPACE_SPAN:
604 pad_.add(begin, p - begin);
605 begin = ++p;
606 space = SPACE_BREAK;
607 break;
608 case SPACE_BREAK:
609 begin = ++p;
610 break;
611 }
612 break;
613 case '&':
614 pad_.add(begin, p - begin);
615 p = handleReference(p, end);
616 begin = p;
617 space = SPACE_NONE;
618 break;
619 default:
620 ++p;
621 space = SPACE_NONE;
622 break;
623 }
624 }
625 pad_.add(begin, p - begin);
626 } else {
627 char const * p = begin;
628 while (p != end) {
629 switch (*p) {
630 case '\x09':
631 case '\x0A':
632 pad_.add(begin, p - begin);
633 begin = ++p;
634 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
635 break;
636 case '\x0D':
637 pad_.add(begin, p - begin);
638 ++p;
639 if (peek() == '\x0A') {
640 ++p;
641 }
642 begin = p;
643 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
644 break;
645 case '&':
646 pad_.add(begin, p - begin);
647 p = handleReference(p, end);
648 begin = p;
649 break;
650 default:
651 ++p;
652 break;
653 }
654 }
655 pad_.add(begin, p - begin);
656 }
657 return pad_.get();
658 }
659
// Parses a start tag or empty-element tag whose '<' has already been consumed
// (the read position is at the first character of the element name).
//
// Namespace declarations among the attributes are processed right away: a
// plain "xmlns" attribute determines the element's default namespace, an
// "xmlns:prefix" attribute adds a namespace binding.  All other attributes
// are recorded for later retrieval via nextAttribute/getAttributeValue.
//
// On return, *nsId holds the element's namespace ID and *localName its local
// name; the element has been pushed onto the element stack, and the state is
// STATE_EMPTY_ELEMENT_TAG for "<.../>" and STATE_CONTENT otherwise.
//
// Always returns RESULT_BEGIN.  Throws css::uno::RuntimeException on
// malformed input.
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
    OSL_ASSERT(nsId != 0 && localName);
    char const * nameBegin = pos_;
    char const * nameColon = 0;
    if (!scanName(&nameColon)) {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("bad tag name in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    char const * nameEnd = pos_;
    // Remember how many bindings exist before this element adds its own, so
    // that handleElementEnd can drop this element's bindings again:
    NamespaceList::size_type inheritedNamespaces = namespaces_.size();
    bool hasDefaultNs = false;
    int defaultNsId = NAMESPACE_NONE;
    attributes_.clear();
    // One iteration per attribute:
    for (;;) {
        char const * p = pos_;
        skipSpace();
        if (peek() == '/' || peek() == '>') {
            break;
        }
        // No white space was skipped, so the attribute is not separated from
        // what precedes it:
        if (pos_ == p) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "missing whitespace before attribute in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * attrNameBegin = pos_;
        char const * attrNameColon = 0;
        if (!scanName(&attrNameColon)) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute name in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * attrNameEnd = pos_;
        skipSpace();
        if (read() != '=') {
            throw css::uno::RuntimeException(
                (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '=' in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        skipSpace();
        // The value must be enclosed in matching single or double quotes:
        char del = read();
        if (del != '\'' && del != '"') {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("bad attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * valueBegin = pos_;
        sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
        if (i < 0) {
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM(
                        "unterminated attribute value in ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        }
        char const * valueEnd = pos_ + i;
        pos_ += i + 1;
        if (attrNameColon == 0 &&
            Span(attrNameBegin, attrNameEnd - attrNameBegin).equals(
                RTL_CONSTASCII_STRINGPARAM("xmlns")))
        {
            // xmlns="...": default namespace declaration
            hasDefaultNs = true;
            defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
        } else if (attrNameColon != 0 &&
                   Span(attrNameBegin, attrNameColon - attrNameBegin).equals(
                       RTL_CONSTASCII_STRINGPARAM("xmlns")))
        {
            // xmlns:prefix="...": namespace binding for prefix
            namespaces_.push_back(
                NamespaceData(
                    Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
                    scanNamespaceIri(valueBegin, valueEnd)));
        } else {
            // Ordinary attribute; its value is only processed on demand:
            attributes_.push_back(
                AttributeData(
                    attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
                    valueEnd));
        }
    }
    // Without an own declaration, the default namespace is inherited from the
    // enclosing element (if any):
    if (!hasDefaultNs && !elements_.empty()) {
        defaultNsId = elements_.top().defaultNamespaceId;
    }
    firstAttribute_ = true;
    if (peek() == '/') {
        state_ = STATE_EMPTY_ELEMENT_TAG;
        ++pos_;
    } else {
        state_ = STATE_CONTENT;
    }
    if (peek() != '>') {
        throw css::uno::RuntimeException(
            (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
             fileUrl_),
            css::uno::Reference< css::uno::XInterface >());
    }
    ++pos_;
    // The full (possibly prefixed) name is kept for matching the end tag:
    elements_.push(
        ElementData(
            Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
            defaultNsId));
    if (nameColon == 0) {
        *nsId = defaultNsId;
        *localName = Span(nameBegin, nameEnd - nameBegin);
    } else {
        *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
        *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
    }
    return RESULT_BEGIN;
}
778
handleEndTag()779 XmlReader::Result XmlReader::handleEndTag() {
780 if (elements_.empty()) {
781 throw css::uno::RuntimeException(
782 (rtl::OUString(
783 RTL_CONSTASCII_USTRINGPARAM("spurious end tag in ")) +
784 fileUrl_),
785 css::uno::Reference< css::uno::XInterface >());
786 }
787 char const * nameBegin = pos_;
788 char const * nameColon = 0;
789 if (!scanName(&nameColon) ||
790 !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
791 {
792 throw css::uno::RuntimeException(
793 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("tag mismatch in ")) +
794 fileUrl_),
795 css::uno::Reference< css::uno::XInterface >());
796 }
797 handleElementEnd();
798 skipSpace();
799 if (peek() != '>') {
800 throw css::uno::RuntimeException(
801 (rtl::OUString(RTL_CONSTASCII_USTRINGPARAM("missing '>' in ")) +
802 fileUrl_),
803 css::uno::Reference< css::uno::XInterface >());
804 }
805 ++pos_;
806 return RESULT_END;
807 }
808
handleElementEnd()809 void XmlReader::handleElementEnd() {
810 OSL_ASSERT(!elements_.empty());
811 namespaces_.resize(elements_.top().inheritedNamespaces);
812 elements_.pop();
813 state_ = elements_.empty() ? STATE_DONE : STATE_CONTENT;
814 }
815
handleSkippedText(Span * data,int * nsId)816 XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
817 for (;;) {
818 sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, '<');
819 if (i < 0) {
820 throw css::uno::RuntimeException(
821 (rtl::OUString(
822 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
823 fileUrl_),
824 css::uno::Reference< css::uno::XInterface >());
825 }
826 pos_ += i + 1;
827 switch (peek()) {
828 case '!':
829 ++pos_;
830 if (!skipComment() && !scanCdataSection().is()) {
831 skipDocumentTypeDeclaration();
832 }
833 break;
834 case '/':
835 ++pos_;
836 return handleEndTag();
837 case '?':
838 ++pos_;
839 skipProcessingInstruction();
840 break;
841 default:
842 return handleStartTag(nsId, data);
843 }
844 }
845 }
846
// Collects the text content up to the next start or end tag into the pad and
// reports it via *text, without normalizing white space.
//
// Line ends are normalized to LF, references are resolved, the content of
// CDATA sections is included (with line ends normalized), and comments,
// processing instructions, and document type declarations are dropped.
//
// On return the state is STATE_END_TAG (the "</" has been consumed) or
// STATE_START_TAG (the '<' has been consumed).  Always returns RESULT_TEXT.
// Throws css::uno::RuntimeException if the input ends prematurely.
XmlReader::Result XmlReader::handleRawText(Span * text) {
    pad_.clear();
    // begin marks the start of the not yet copied chunk of plain text:
    for (char const * begin = pos_;;) {
        switch (peek()) {
        case '\0': // i.e., EOF
            throw css::uno::RuntimeException(
                (rtl::OUString(
                    RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
                 fileUrl_),
                css::uno::Reference< css::uno::XInterface >());
        case '\x0D':
            // Drop the CR; a lone CR becomes LF, while for CR LF the LF is
            // copied as part of the next chunk:
            pad_.add(begin, pos_ - begin);
            ++pos_;
            if (peek() != '\x0A') {
                pad_.add(RTL_CONSTASCII_STRINGPARAM("\x0A"));
            }
            begin = pos_;
            break;
        case '&':
            pad_.add(begin, pos_ - begin);
            pos_ = handleReference(pos_, end_);
            begin = pos_;
            break;
        case '<':
            // Flush the pending chunk before looking at the markup:
            pad_.add(begin, pos_ - begin);
            ++pos_;
            switch (peek()) {
            case '!':
                ++pos_;
                // Comment, else CDATA section, else document type
                // declaration:
                if (!skipComment()) {
                    Span cdata(scanCdataSection());
                    if (cdata.is()) {
                        normalizeLineEnds(cdata);
                    } else {
                        skipDocumentTypeDeclaration();
                    }
                }
                begin = pos_;
                break;
            case '/':
                *text = pad_.get();
                ++pos_;
                state_ = STATE_END_TAG;
                return RESULT_TEXT;
            case '?':
                ++pos_;
                skipProcessingInstruction();
                begin = pos_;
                break;
            default:
                *text = pad_.get();
                state_ = STATE_START_TAG;
                return RESULT_TEXT;
            }
            break;
        default:
            ++pos_;
            break;
        }
    }
}
908
handleNormalizedText(Span * text)909 XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
910 pad_.clear();
911 char const * flowBegin = pos_;
912 char const * flowEnd = pos_;
913 enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
914 // a single true space character can go into the current flow,
915 // everything else breaks the flow
916 Space space = SPACE_START;
917 for (;;) {
918 switch (peek()) {
919 case '\0': // i.e., EOF
920 throw css::uno::RuntimeException(
921 (rtl::OUString(
922 RTL_CONSTASCII_USTRINGPARAM("premature end of ")) +
923 fileUrl_),
924 css::uno::Reference< css::uno::XInterface >());
925 case '\x09':
926 case '\x0A':
927 case '\x0D':
928 switch (space) {
929 case SPACE_START:
930 case SPACE_BREAK:
931 break;
932 case SPACE_NONE:
933 case SPACE_SPAN:
934 space = SPACE_BREAK;
935 break;
936 }
937 ++pos_;
938 break;
939 case ' ':
940 switch (space) {
941 case SPACE_START:
942 case SPACE_BREAK:
943 break;
944 case SPACE_NONE:
945 space = SPACE_SPAN;
946 break;
947 case SPACE_SPAN:
948 space = SPACE_BREAK;
949 break;
950 }
951 ++pos_;
952 break;
953 case '&':
954 switch (space) {
955 case SPACE_START:
956 break;
957 case SPACE_NONE:
958 case SPACE_SPAN:
959 pad_.add(flowBegin, pos_ - flowBegin);
960 break;
961 case SPACE_BREAK:
962 pad_.add(flowBegin, flowEnd - flowBegin);
963 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
964 break;
965 }
966 pos_ = handleReference(pos_, end_);
967 flowBegin = pos_;
968 flowEnd = pos_;
969 space = SPACE_NONE;
970 break;
971 case '<':
972 ++pos_;
973 switch (peek()) {
974 case '!':
975 ++pos_;
976 if (skipComment()) {
977 space = SPACE_BREAK;
978 } else {
979 Span cdata(scanCdataSection());
980 if (cdata.is()) {
981 // CDATA is not normalized (similar to character
982 // references; it keeps the code simple), but it might
983 // arguably be better to normalize it:
984 switch (space) {
985 case SPACE_START:
986 break;
987 case SPACE_NONE:
988 case SPACE_SPAN:
989 pad_.add(flowBegin, pos_ - flowBegin);
990 break;
991 case SPACE_BREAK:
992 pad_.add(flowBegin, flowEnd - flowBegin);
993 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
994 break;
995 }
996 normalizeLineEnds(cdata);
997 flowBegin = pos_;
998 flowEnd = pos_;
999 space = SPACE_NONE;
1000 } else {
1001 skipDocumentTypeDeclaration();
1002 }
1003 }
1004 break;
1005 case '/':
1006 ++pos_;
1007 pad_.add(flowBegin, flowEnd - flowBegin);
1008 *text = pad_.get();
1009 state_ = STATE_END_TAG;
1010 return RESULT_TEXT;
1011 case '?':
1012 ++pos_;
1013 skipProcessingInstruction();
1014 space = SPACE_BREAK;
1015 break;
1016 default:
1017 pad_.add(flowBegin, flowEnd - flowBegin);
1018 *text = pad_.get();
1019 state_ = STATE_START_TAG;
1020 return RESULT_TEXT;
1021 }
1022 break;
1023 default:
1024 switch (space) {
1025 case SPACE_START:
1026 flowBegin = pos_;
1027 break;
1028 case SPACE_NONE:
1029 case SPACE_SPAN:
1030 break;
1031 case SPACE_BREAK:
1032 pad_.add(flowBegin, flowEnd - flowBegin);
1033 pad_.add(RTL_CONSTASCII_STRINGPARAM(" "));
1034 flowBegin = pos_;
1035 break;
1036 }
1037 flowEnd = ++pos_;
1038 space = SPACE_NONE;
1039 break;
1040 }
1041 }
1042 }
1043
toNamespaceId(NamespaceIris::size_type pos)1044 int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
1045 OSL_ASSERT(pos <= INT_MAX);
1046 return static_cast< int >(pos);
1047 }
1048
1049 }
1050