/src/libreoffice/xmlreader/source/xmlreader.cxx
Line | Count | Source (jump to first uncovered line) |
1 | | /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
2 | | /* |
3 | | * This file is part of the LibreOffice project. |
4 | | * |
5 | | * This Source Code Form is subject to the terms of the Mozilla Public |
6 | | * License, v. 2.0. If a copy of the MPL was not distributed with this |
7 | | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8 | | * |
9 | | * This file incorporates work covered by the following license notice: |
10 | | * |
11 | | * Licensed to the Apache Software Foundation (ASF) under one or more |
12 | | * contributor license agreements. See the NOTICE file distributed |
13 | | * with this work for additional information regarding copyright |
14 | | * ownership. The ASF licenses this file to you under the Apache |
15 | | * License, Version 2.0 (the "License"); you may not use this file |
16 | | * except in compliance with the License. You may obtain a copy of |
17 | | * the License at http://www.apache.org/licenses/LICENSE-2.0 . |
18 | | */ |
19 | | |
20 | | #include <sal/config.h> |
21 | | |
22 | | #include <cassert> |
23 | | #include <climits> |
24 | | |
25 | | #include <com/sun/star/container/NoSuchElementException.hpp> |
26 | | #include <com/sun/star/uno/RuntimeException.hpp> |
27 | | #include <o3tl/numeric.hxx> |
28 | | #include <osl/file.h> |
29 | | #include <rtl/character.hxx> |
30 | | #include <rtl/string.h> |
31 | | #include <rtl/ustring.hxx> |
32 | | #include <sal/log.hxx> |
33 | | #include <sal/types.h> |
34 | | #include <utility> |
35 | | #include <xmlreader/pad.hxx> |
36 | | #include <xmlreader/span.hxx> |
37 | | #include <xmlreader/xmlreader.hxx> |
38 | | |
39 | | namespace xmlreader { |
40 | | |
41 | | namespace { |
42 | | |
// Tells whether c is one of the four white space characters this reader
// recognizes: tab (0x09), line feed (0x0A), carriage return (0x0D), or space.
bool isSpace(char c) {
    return c == '\x09' || c == '\x0A' || c == '\x0D' || c == ' ';
}
54 | | |
55 | | } |
56 | | |
57 | | XmlReader::XmlReader(OUString fileUrl) |
58 | 106 | : fileUrl_(std::move(fileUrl)) |
59 | 106 | , fileHandle_(nullptr) |
60 | 106 | { |
61 | 106 | oslFileError e = osl_openFile( |
62 | 106 | fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read); |
63 | 106 | switch (e) |
64 | 106 | { |
65 | 106 | case osl_File_E_None: |
66 | 106 | break; |
67 | 0 | case osl_File_E_NOENT: |
68 | 0 | throw css::container::NoSuchElementException( fileUrl_ ); |
69 | 0 | default: |
70 | 0 | throw css::uno::RuntimeException( |
71 | 0 | "cannot open " + fileUrl_ + ": " + OUString::number(e)); |
72 | 106 | } |
73 | 106 | e = osl_getFileSize(fileHandle_, &fileSize_); |
74 | 106 | if (e == osl_File_E_None) { |
75 | 106 | e = osl_mapFile( |
76 | 106 | fileHandle_, &fileAddress_, fileSize_, 0, |
77 | 106 | osl_File_MapFlag_WillNeed); |
78 | 106 | } |
79 | 106 | if (e != osl_File_E_None) { |
80 | 0 | oslFileError e2 = osl_closeFile(fileHandle_); |
81 | 0 | if (e2 != osl_File_E_None) { |
82 | 0 | SAL_WARN( |
83 | 0 | "xmlreader", |
84 | 0 | "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2); |
85 | 0 | } |
86 | 0 | throw css::uno::RuntimeException( |
87 | 0 | "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" ); |
88 | 0 | } |
89 | 106 | namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace"); |
90 | 106 | namespaces_.emplace_back(Span("xml"), NAMESPACE_XML); |
91 | 106 | pos_ = static_cast< char * >(fileAddress_); |
92 | 106 | end_ = pos_ + fileSize_; |
93 | 106 | state_ = State::Content; |
94 | 106 | firstAttribute_ = true; |
95 | 106 | } |
96 | | |
97 | 106 | XmlReader::~XmlReader() { |
98 | 106 | if (!fileHandle_) |
99 | 0 | return; |
100 | 106 | oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_); |
101 | 106 | if (e != osl_File_E_None) { |
102 | 0 | SAL_WARN( |
103 | 0 | "xmlreader", |
104 | 0 | "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e); |
105 | 0 | } |
106 | 106 | e = osl_closeFile(fileHandle_); |
107 | 106 | if (e != osl_File_E_None) { |
108 | 0 | SAL_WARN( |
109 | 0 | "xmlreader", |
110 | 0 | "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e); |
111 | 0 | } |
112 | 106 | } |
113 | | |
114 | 106 | int XmlReader::registerNamespaceIri(Span const & iri) { |
115 | 106 | int id = toNamespaceId(namespaceIris_.size()); |
116 | 106 | namespaceIris_.push_back(iri); |
117 | 106 | if (iri == "http://www.w3.org/2001/XMLSchema-instance") { |
118 | | // Old user layer .xcu files used the xsi namespace prefix without |
119 | | // declaring a corresponding namespace binding, see issue 77174; reading |
120 | | // those files during migration would fail without this hack that can be |
121 | | // removed once migration is no longer relevant (see |
122 | | // configmgr::Components::parseModificationLayer): |
123 | 0 | namespaces_.emplace_back(Span("xsi"), id); |
124 | 0 | } |
125 | 106 | return id; |
126 | 106 | } |
127 | | |
128 | | XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId) |
129 | 465k | { |
130 | 465k | switch (state_) { |
131 | 340k | case State::Content: |
132 | 340k | switch (reportText) { |
133 | 340k | case Text::NONE: |
134 | 340k | return handleSkippedText(data, nsId); |
135 | 0 | case Text::Raw: |
136 | 0 | return handleRawText(data); |
137 | 0 | default: // Text::Normalized |
138 | 0 | return handleNormalizedText(data); |
139 | 340k | } |
140 | 0 | case State::StartTag: |
141 | 0 | return handleStartTag(nsId, data); |
142 | 0 | case State::EndTag: |
143 | 0 | return handleEndTag(); |
144 | 125k | case State::EmptyElementTag: |
145 | 125k | handleElementEnd(); |
146 | 125k | return Result::End; |
147 | 106 | default: // State::Done |
148 | 106 | return Result::Done; |
149 | 465k | } |
150 | 465k | } |
151 | | |
152 | 592k | bool XmlReader::nextAttribute(int * nsId, Span * localName) { |
153 | 592k | assert(nsId != nullptr && localName != nullptr); |
154 | 592k | if (firstAttribute_) { |
155 | 232k | currentAttribute_ = attributes_.begin(); |
156 | 232k | firstAttribute_ = false; |
157 | 360k | } else { |
158 | 360k | ++currentAttribute_; |
159 | 360k | } |
160 | 592k | if (currentAttribute_ == attributes_.end()) { |
161 | 232k | return false; |
162 | 232k | } |
163 | 360k | if (currentAttribute_->nameColon == nullptr) { |
164 | 360k | *nsId = NAMESPACE_NONE; |
165 | 360k | *localName = Span( |
166 | 360k | currentAttribute_->nameBegin, |
167 | 360k | currentAttribute_->nameEnd - currentAttribute_->nameBegin); |
168 | 360k | } else { |
169 | 0 | *nsId = getNamespaceId( |
170 | 0 | Span( |
171 | 0 | currentAttribute_->nameBegin, |
172 | 0 | currentAttribute_->nameColon - currentAttribute_->nameBegin)); |
173 | 0 | *localName = Span( |
174 | 0 | currentAttribute_->nameColon + 1, |
175 | 0 | currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1)); |
176 | 0 | } |
177 | 360k | return true; |
178 | 592k | } |
179 | | |
180 | 360k | Span XmlReader::getAttributeValue(bool fullyNormalize) { |
181 | 360k | return handleAttributeValue( |
182 | 360k | currentAttribute_->valueBegin, currentAttribute_->valueEnd, |
183 | 360k | fullyNormalize); |
184 | 360k | } |
185 | | |
186 | 0 | int XmlReader::getNamespaceId(Span const & prefix) const { |
187 | 0 | auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(), |
188 | 0 | [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; }); |
189 | |
|
190 | 0 | if (i != namespaces_.rend()) |
191 | 0 | return i->nsId; |
192 | | |
193 | 0 | return NAMESPACE_UNKNOWN; |
194 | 0 | } |
195 | | |
196 | | |
197 | 0 | void XmlReader::normalizeLineEnds(Span const & text) { |
198 | 0 | char const * p = text.begin; |
199 | 0 | sal_Int32 n = text.length; |
200 | 0 | for (;;) { |
201 | 0 | sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D'); |
202 | 0 | if (i < 0) { |
203 | 0 | break; |
204 | 0 | } |
205 | 0 | pad_.add(p, i); |
206 | 0 | p += i + 1; |
207 | 0 | n -= i + 1; |
208 | 0 | if (n == 0 || *p != '\x0A') { |
209 | 0 | pad_.add("\x0A"); |
210 | 0 | } |
211 | 0 | } |
212 | 0 | pad_.add(p, n); |
213 | 0 | } |
214 | | |
215 | 1.42M | void XmlReader::skipSpace() { |
216 | 1.78M | while (isSpace(peek())) { |
217 | 360k | ++pos_; |
218 | 360k | } |
219 | 1.42M | } |
220 | | |
221 | 0 | bool XmlReader::skipComment() { |
222 | 0 | if (rtl_str_shortenedCompare_WithLength( |
223 | 0 | pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"), |
224 | 0 | RTL_CONSTASCII_LENGTH("--")) != |
225 | 0 | 0) |
226 | 0 | { |
227 | 0 | return false; |
228 | 0 | } |
229 | 0 | pos_ += RTL_CONSTASCII_LENGTH("--"); |
230 | 0 | sal_Int32 i = rtl_str_indexOfStr_WithLength( |
231 | 0 | pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--")); |
232 | 0 | if (i < 0) { |
233 | 0 | throw css::uno::RuntimeException( |
234 | 0 | "premature end (within comment) of " + fileUrl_ ); |
235 | 0 | } |
236 | 0 | pos_ += i + RTL_CONSTASCII_LENGTH("--"); |
237 | 0 | if (read() != '>') { |
238 | 0 | throw css::uno::RuntimeException( |
239 | 0 | "illegal \"--\" within comment in " + fileUrl_ ); |
240 | 0 | } |
241 | 0 | return true; |
242 | 0 | } |
243 | | |
244 | 106 | void XmlReader::skipProcessingInstruction() { |
245 | 106 | sal_Int32 i = rtl_str_indexOfStr_WithLength( |
246 | 106 | pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>")); |
247 | 106 | if (i < 0) { |
248 | 0 | throw css::uno::RuntimeException( |
249 | 0 | "bad '<?' in " + fileUrl_ ); |
250 | 0 | } |
251 | 106 | pos_ += i + RTL_CONSTASCII_LENGTH("?>"); |
252 | 106 | } |
253 | | |
254 | 0 | void XmlReader::skipDocumentTypeDeclaration() { |
255 | | // Neither is it checked that the doctypedecl is at the correct position in |
256 | | // the document, nor that it is well-formed: |
257 | 0 | for (;;) { |
258 | 0 | char c = read(); |
259 | 0 | switch (c) { |
260 | 0 | case '\0': // i.e., EOF |
261 | 0 | throw css::uno::RuntimeException( |
262 | 0 | "premature end (within DTD) of " + fileUrl_ ); |
263 | 0 | case '"': |
264 | 0 | case '\'': |
265 | 0 | { |
266 | 0 | sal_Int32 i = rtl_str_indexOfChar_WithLength( |
267 | 0 | pos_, end_ - pos_, c); |
268 | 0 | if (i < 0) { |
269 | 0 | throw css::uno::RuntimeException( |
270 | 0 | "premature end (within DTD) of " + fileUrl_ ); |
271 | 0 | } |
272 | 0 | pos_ += i + 1; |
273 | 0 | } |
274 | 0 | break; |
275 | 0 | case '>': |
276 | 0 | return; |
277 | 0 | case '[': |
278 | 0 | for (;;) { |
279 | 0 | c = read(); |
280 | 0 | switch (c) { |
281 | 0 | case '\0': // i.e., EOF |
282 | 0 | throw css::uno::RuntimeException( |
283 | 0 | "premature end (within DTD) of " + fileUrl_ ); |
284 | 0 | case '"': |
285 | 0 | case '\'': |
286 | 0 | { |
287 | 0 | sal_Int32 i = rtl_str_indexOfChar_WithLength( |
288 | 0 | pos_, end_ - pos_, c); |
289 | 0 | if (i < 0) { |
290 | 0 | throw css::uno::RuntimeException( |
291 | 0 | "premature end (within DTD) of " + fileUrl_ ); |
292 | 0 | } |
293 | 0 | pos_ += i + 1; |
294 | 0 | } |
295 | 0 | break; |
296 | 0 | case '<': |
297 | 0 | switch (read()) { |
298 | 0 | case '\0': // i.e., EOF |
299 | 0 | throw css::uno::RuntimeException( |
300 | 0 | "premature end (within DTD) of " + fileUrl_ ); |
301 | 0 | case '!': |
302 | 0 | skipComment(); |
303 | 0 | break; |
304 | 0 | case '?': |
305 | 0 | skipProcessingInstruction(); |
306 | 0 | break; |
307 | 0 | default: |
308 | 0 | break; |
309 | 0 | } |
310 | 0 | break; |
311 | 0 | case ']': |
312 | 0 | skipSpace(); |
313 | 0 | if (read() != '>') { |
314 | 0 | throw css::uno::RuntimeException( |
315 | 0 | "missing \">\" of DTD in " + fileUrl_ ); |
316 | 0 | } |
317 | 0 | return; |
318 | 0 | default: |
319 | 0 | break; |
320 | 0 | } |
321 | 0 | } |
322 | 0 | default: |
323 | 0 | break; |
324 | 0 | } |
325 | 0 | } |
326 | 0 | } |
327 | | |
328 | 0 | Span XmlReader::scanCdataSection() { |
329 | 0 | if (rtl_str_shortenedCompare_WithLength( |
330 | 0 | pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["), |
331 | 0 | RTL_CONSTASCII_LENGTH("[CDATA[")) != |
332 | 0 | 0) |
333 | 0 | { |
334 | 0 | return Span(); |
335 | 0 | } |
336 | 0 | pos_ += RTL_CONSTASCII_LENGTH("[CDATA["); |
337 | 0 | char const * begin = pos_; |
338 | 0 | sal_Int32 i = rtl_str_indexOfStr_WithLength( |
339 | 0 | pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>")); |
340 | 0 | if (i < 0) { |
341 | 0 | throw css::uno::RuntimeException( |
342 | 0 | "premature end (within CDATA section) of " + fileUrl_ ); |
343 | 0 | } |
344 | 0 | pos_ += i + RTL_CONSTASCII_LENGTH("]]>"); |
345 | 0 | return Span(begin, i); |
346 | 0 | } |
347 | | |
348 | 700k | bool XmlReader::scanName(char const ** nameColon) { |
349 | 700k | assert(nameColon != nullptr && *nameColon == nullptr); |
350 | 6.71M | for (char const * begin = pos_;; ++pos_) { |
351 | 6.71M | switch (peek()) { |
352 | 0 | case '\0': // i.e., EOF |
353 | 0 | case '\x09': |
354 | 0 | case '\x0A': |
355 | 0 | case '\x0D': |
356 | 232k | case ' ': |
357 | 232k | case '/': |
358 | 592k | case '=': |
359 | 700k | case '>': |
360 | 700k | return pos_ != begin; |
361 | 0 | case ':': |
362 | 0 | *nameColon = pos_; |
363 | 0 | break; |
364 | 6.01M | default: |
365 | 6.01M | break; |
366 | 6.71M | } |
367 | 6.71M | } |
368 | 700k | } |
369 | | |
370 | 106 | int XmlReader::scanNamespaceIri(char const * begin, char const * end) { |
371 | 106 | assert(begin != nullptr && begin <= end); |
372 | 106 | Span iri(handleAttributeValue(begin, end, false)); |
373 | 212 | for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) { |
374 | 212 | if (namespaceIris_[i] == iri) { |
375 | 106 | return toNamespaceId(i); |
376 | 106 | } |
377 | 212 | } |
378 | 0 | return XmlReader::NAMESPACE_UNKNOWN; |
379 | 106 | } |
380 | | |
381 | | char const * XmlReader::handleReference(char const * position, char const * end) |
382 | 0 | { |
383 | 0 | assert(position != nullptr && *position == '&' && position < end); |
384 | 0 | ++position; |
385 | 0 | if (*position == '#') { |
386 | 0 | ++position; |
387 | 0 | sal_uInt32 val = 0; |
388 | 0 | char const * p; |
389 | 0 | if (*position == 'x') { |
390 | 0 | ++position; |
391 | 0 | p = position; |
392 | 0 | for (;; ++position) |
393 | 0 | { |
394 | 0 | val = o3tl::convertToHex<sal_uInt32>(*position); |
395 | 0 | if (val >= 16) |
396 | 0 | break; |
397 | | |
398 | 0 | if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow |
399 | 0 | throw css::uno::RuntimeException( |
400 | 0 | "'&#x...' too large in " + fileUrl_ ); |
401 | 0 | } |
402 | 0 | } |
403 | 0 | } else { |
404 | 0 | p = position; |
405 | 0 | for (;; ++position) { |
406 | 0 | char c = *position; |
407 | 0 | if (c >= '0' && c <= '9') { |
408 | 0 | val = 10 * val + (c - '0'); |
409 | 0 | } else { |
410 | 0 | break; |
411 | 0 | } |
412 | 0 | if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow |
413 | 0 | throw css::uno::RuntimeException( |
414 | 0 | "'&#...' too large in " + fileUrl_ ); |
415 | 0 | } |
416 | 0 | } |
417 | 0 | } |
418 | 0 | if (position == p || *position++ != ';') { |
419 | 0 | throw css::uno::RuntimeException( |
420 | 0 | "'&#...' missing ';' in " + fileUrl_ ); |
421 | 0 | } |
422 | 0 | assert(rtl::isUnicodeCodePoint(val)); |
423 | 0 | if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) || |
424 | 0 | (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF) |
425 | 0 | { |
426 | 0 | throw css::uno::RuntimeException( |
427 | 0 | "character reference denoting invalid character in " + fileUrl_ ); |
428 | 0 | } |
429 | 0 | char buf[4]; |
430 | 0 | sal_Int32 len; |
431 | 0 | if (val < 0x80) { |
432 | 0 | buf[0] = static_cast< char >(val); |
433 | 0 | len = 1; |
434 | 0 | } else if (val < 0x800) { |
435 | 0 | buf[0] = static_cast< char >((val >> 6) | 0xC0); |
436 | 0 | buf[1] = static_cast< char >((val & 0x3F) | 0x80); |
437 | 0 | len = 2; |
438 | 0 | } else if (val < 0x10000) { |
439 | 0 | buf[0] = static_cast< char >((val >> 12) | 0xE0); |
440 | 0 | buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); |
441 | 0 | buf[2] = static_cast< char >((val & 0x3F) | 0x80); |
442 | 0 | len = 3; |
443 | 0 | } else { |
444 | 0 | buf[0] = static_cast< char >((val >> 18) | 0xF0); |
445 | 0 | buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80); |
446 | 0 | buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80); |
447 | 0 | buf[3] = static_cast< char >((val & 0x3F) | 0x80); |
448 | 0 | len = 4; |
449 | 0 | } |
450 | 0 | pad_.addEphemeral(buf, len); |
451 | 0 | return position; |
452 | 0 | } else { |
453 | 0 | struct EntityRef { |
454 | 0 | char const * inBegin; |
455 | 0 | sal_Int32 const inLength; |
456 | 0 | char const * outBegin; |
457 | 0 | sal_Int32 const outLength; |
458 | 0 | }; |
459 | 0 | static EntityRef const refs[] = { |
460 | 0 | { RTL_CONSTASCII_STRINGPARAM("amp;"), |
461 | 0 | RTL_CONSTASCII_STRINGPARAM("&") }, |
462 | 0 | { RTL_CONSTASCII_STRINGPARAM("lt;"), |
463 | 0 | RTL_CONSTASCII_STRINGPARAM("<") }, |
464 | 0 | { RTL_CONSTASCII_STRINGPARAM("gt;"), |
465 | 0 | RTL_CONSTASCII_STRINGPARAM(">") }, |
466 | 0 | { RTL_CONSTASCII_STRINGPARAM("apos;"), |
467 | 0 | RTL_CONSTASCII_STRINGPARAM("'") }, |
468 | 0 | { RTL_CONSTASCII_STRINGPARAM("quot;"), |
469 | 0 | RTL_CONSTASCII_STRINGPARAM("\"") } }; |
470 | 0 | for (const auto & ref : refs) { |
471 | 0 | if (rtl_str_shortenedCompare_WithLength( |
472 | 0 | position, end - position, ref.inBegin, ref.inLength, |
473 | 0 | ref.inLength) == |
474 | 0 | 0) |
475 | 0 | { |
476 | 0 | position += ref.inLength; |
477 | 0 | pad_.add(ref.outBegin, ref.outLength); |
478 | 0 | return position; |
479 | 0 | } |
480 | 0 | } |
481 | 0 | throw css::uno::RuntimeException( |
482 | 0 | "unknown entity reference in " + fileUrl_ ); |
483 | 0 | } |
484 | 0 | } |
485 | | |
486 | | Span XmlReader::handleAttributeValue( |
487 | | char const * begin, char const * end, bool fullyNormalize) |
488 | 360k | { |
489 | 360k | pad_.clear(); |
490 | 360k | if (fullyNormalize) { |
491 | 0 | while (begin != end && isSpace(*begin)) { |
492 | 0 | ++begin; |
493 | 0 | } |
494 | 0 | while (end != begin && isSpace(end[-1])) { |
495 | 0 | --end; |
496 | 0 | } |
497 | 0 | char const * p = begin; |
498 | 0 | enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; |
499 | | // a single true space character can go into the current span, |
500 | | // everything else breaks the span |
501 | 0 | Space space = SPACE_NONE; |
502 | 0 | while (p != end) { |
503 | 0 | switch (*p) { |
504 | 0 | case '\x09': |
505 | 0 | case '\x0A': |
506 | 0 | case '\x0D': |
507 | 0 | switch (space) { |
508 | 0 | case SPACE_NONE: |
509 | 0 | pad_.add(begin, p - begin); |
510 | 0 | pad_.add(" "); |
511 | 0 | space = SPACE_BREAK; |
512 | 0 | break; |
513 | 0 | case SPACE_SPAN: |
514 | 0 | pad_.add(begin, p - begin); |
515 | 0 | space = SPACE_BREAK; |
516 | 0 | break; |
517 | 0 | case SPACE_BREAK: |
518 | 0 | break; |
519 | 0 | } |
520 | 0 | begin = ++p; |
521 | 0 | break; |
522 | 0 | case ' ': |
523 | 0 | switch (space) { |
524 | 0 | case SPACE_NONE: |
525 | 0 | ++p; |
526 | 0 | space = SPACE_SPAN; |
527 | 0 | break; |
528 | 0 | case SPACE_SPAN: |
529 | 0 | pad_.add(begin, p - begin); |
530 | 0 | begin = ++p; |
531 | 0 | space = SPACE_BREAK; |
532 | 0 | break; |
533 | 0 | case SPACE_BREAK: |
534 | 0 | begin = ++p; |
535 | 0 | break; |
536 | 0 | } |
537 | 0 | break; |
538 | 0 | case '&': |
539 | 0 | pad_.add(begin, p - begin); |
540 | 0 | p = handleReference(p, end); |
541 | 0 | begin = p; |
542 | 0 | space = SPACE_NONE; |
543 | 0 | break; |
544 | 0 | default: |
545 | 0 | ++p; |
546 | 0 | space = SPACE_NONE; |
547 | 0 | break; |
548 | 0 | } |
549 | 0 | } |
550 | 0 | pad_.add(begin, p - begin); |
551 | 360k | } else { |
552 | 360k | char const * p = begin; |
553 | 14.5M | while (p != end) { |
554 | 14.1M | switch (*p) { |
555 | 0 | case '\x09': |
556 | 0 | case '\x0A': |
557 | 0 | pad_.add(begin, p - begin); |
558 | 0 | begin = ++p; |
559 | 0 | pad_.add(" "); |
560 | 0 | break; |
561 | 0 | case '\x0D': |
562 | 0 | pad_.add(begin, p - begin); |
563 | 0 | ++p; |
564 | 0 | if (peek() == '\x0A') { |
565 | 0 | ++p; |
566 | 0 | } |
567 | 0 | begin = p; |
568 | 0 | pad_.add(" "); |
569 | 0 | break; |
570 | 0 | case '&': |
571 | 0 | pad_.add(begin, p - begin); |
572 | 0 | p = handleReference(p, end); |
573 | 0 | begin = p; |
574 | 0 | break; |
575 | 14.1M | default: |
576 | 14.1M | ++p; |
577 | 14.1M | break; |
578 | 14.1M | } |
579 | 14.1M | } |
580 | 360k | pad_.add(begin, p - begin); |
581 | 360k | } |
582 | 360k | return pad_.get(); |
583 | 360k | } |
584 | | |
585 | 232k | XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) { |
586 | 232k | assert(nsId != nullptr && localName); |
587 | 232k | char const * nameBegin = pos_; |
588 | 232k | char const * nameColon = nullptr; |
589 | 232k | if (!scanName(&nameColon)) { |
590 | 0 | throw css::uno::RuntimeException( |
591 | 0 | "bad tag name in " + fileUrl_ ); |
592 | 0 | } |
593 | 232k | char const * nameEnd = pos_; |
594 | 232k | NamespaceList::size_type inheritedNamespaces = namespaces_.size(); |
595 | 232k | bool hasDefaultNs = false; |
596 | 232k | int defaultNsId = NAMESPACE_NONE; |
597 | 232k | attributes_.clear(); |
598 | 592k | for (;;) { |
599 | 592k | char const * p = pos_; |
600 | 592k | skipSpace(); |
601 | 592k | if (peek() == '/' || peek() == '>') { |
602 | 232k | break; |
603 | 232k | } |
604 | 360k | if (pos_ == p) { |
605 | 0 | throw css::uno::RuntimeException( |
606 | 0 | "missing whitespace before attribute in " + fileUrl_ ); |
607 | 0 | } |
608 | 360k | char const * attrNameBegin = pos_; |
609 | 360k | char const * attrNameColon = nullptr; |
610 | 360k | if (!scanName(&attrNameColon)) { |
611 | 0 | throw css::uno::RuntimeException( |
612 | 0 | "bad attribute name in " + fileUrl_ ); |
613 | 0 | } |
614 | 360k | char const * attrNameEnd = pos_; |
615 | 360k | skipSpace(); |
616 | 360k | if (read() != '=') { |
617 | 0 | throw css::uno::RuntimeException( |
618 | 0 | "missing '=' in " + fileUrl_ ); |
619 | 0 | } |
620 | 360k | skipSpace(); |
621 | 360k | char del = read(); |
622 | 360k | if (del != '\'' && del != '"') { |
623 | 0 | throw css::uno::RuntimeException( |
624 | 0 | "bad attribute value in " + fileUrl_ ); |
625 | 0 | } |
626 | 360k | char const * valueBegin = pos_; |
627 | 360k | sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del); |
628 | 360k | if (i < 0) { |
629 | 0 | throw css::uno::RuntimeException( |
630 | 0 | "unterminated attribute value in " + fileUrl_ ); |
631 | 0 | } |
632 | 360k | char const * valueEnd = pos_ + i; |
633 | 360k | pos_ += i + 1; |
634 | 360k | if (attrNameColon == nullptr && |
635 | 360k | Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns") |
636 | 106 | { |
637 | 106 | hasDefaultNs = true; |
638 | 106 | defaultNsId = scanNamespaceIri(valueBegin, valueEnd); |
639 | 360k | } else if (attrNameColon != nullptr && |
640 | 360k | Span(attrNameBegin, attrNameColon - attrNameBegin) == |
641 | 0 | "xmlns") |
642 | 0 | { |
643 | 0 | namespaces_.emplace_back( |
644 | 0 | Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)), |
645 | 0 | scanNamespaceIri(valueBegin, valueEnd)); |
646 | 360k | } else { |
647 | 360k | attributes_.emplace_back( |
648 | 360k | attrNameBegin, attrNameEnd, attrNameColon, valueBegin, |
649 | 360k | valueEnd); |
650 | 360k | } |
651 | 360k | } |
652 | 232k | if (!hasDefaultNs && !elements_.empty()) { |
653 | 232k | defaultNsId = elements_.top().defaultNamespaceId; |
654 | 232k | } |
655 | 232k | firstAttribute_ = true; |
656 | 232k | if (peek() == '/') { |
657 | 125k | state_ = State::EmptyElementTag; |
658 | 125k | ++pos_; |
659 | 125k | } else { |
660 | 107k | state_ = State::Content; |
661 | 107k | } |
662 | 232k | if (peek() != '>') { |
663 | 0 | throw css::uno::RuntimeException( |
664 | 0 | "missing '>' in " + fileUrl_ ); |
665 | 0 | } |
666 | 232k | ++pos_; |
667 | 232k | elements_.push( |
668 | 232k | ElementData( |
669 | 232k | Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces, |
670 | 232k | defaultNsId)); |
671 | 232k | if (nameColon == nullptr) { |
672 | 232k | *nsId = defaultNsId; |
673 | 232k | *localName = Span(nameBegin, nameEnd - nameBegin); |
674 | 232k | } else { |
675 | 0 | *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin)); |
676 | 0 | *localName = Span(nameColon + 1, nameEnd - (nameColon + 1)); |
677 | 0 | } |
678 | 232k | return Result::Begin; |
679 | 232k | } |
680 | | |
681 | 107k | XmlReader::Result XmlReader::handleEndTag() { |
682 | 107k | if (elements_.empty()) { |
683 | 0 | throw css::uno::RuntimeException( |
684 | 0 | "spurious end tag in " + fileUrl_ ); |
685 | 0 | } |
686 | 107k | char const * nameBegin = pos_; |
687 | 107k | char const * nameColon = nullptr; |
688 | 107k | if (!scanName(&nameColon) || |
689 | 107k | !elements_.top().name.equals(nameBegin, pos_ - nameBegin)) |
690 | 0 | { |
691 | 0 | throw css::uno::RuntimeException( |
692 | 0 | "tag mismatch in " + fileUrl_ ); |
693 | 0 | } |
694 | 107k | handleElementEnd(); |
695 | 107k | skipSpace(); |
696 | 107k | if (peek() != '>') { |
697 | 0 | throw css::uno::RuntimeException( |
698 | 0 | "missing '>' in " + fileUrl_ ); |
699 | 0 | } |
700 | 107k | ++pos_; |
701 | 107k | return Result::End; |
702 | 107k | } |
703 | | |
704 | 232k | void XmlReader::handleElementEnd() { |
705 | 232k | assert(!elements_.empty()); |
706 | 232k | auto end = elements_.top().inheritedNamespaces; |
707 | 232k | namespaces_.resize(end); |
708 | 232k | elements_.pop(); |
709 | 232k | state_ = elements_.empty() ? State::Done : State::Content; |
710 | 232k | } |
711 | | |
712 | 340k | XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) { |
713 | 340k | for (;;) { |
714 | 340k | auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_)); |
715 | 340k | if (!i) { |
716 | 0 | throw css::uno::RuntimeException( |
717 | 0 | "premature end of " + fileUrl_ ); |
718 | 0 | } |
719 | 340k | pos_ = i + 1; |
720 | 340k | switch (peek()) { |
721 | 0 | case '!': |
722 | 0 | ++pos_; |
723 | 0 | if (!skipComment() && !scanCdataSection().is()) { |
724 | 0 | skipDocumentTypeDeclaration(); |
725 | 0 | } |
726 | 0 | break; |
727 | 107k | case '/': |
728 | 107k | ++pos_; |
729 | 107k | return handleEndTag(); |
730 | 106 | case '?': |
731 | 106 | ++pos_; |
732 | 106 | skipProcessingInstruction(); |
733 | 106 | break; |
734 | 232k | default: |
735 | 232k | return handleStartTag(nsId, data); |
736 | 340k | } |
737 | 340k | } |
738 | 340k | } |
739 | | |
740 | 0 | XmlReader::Result XmlReader::handleRawText(Span * text) { |
741 | 0 | pad_.clear(); |
742 | 0 | for (char const * begin = pos_;;) { |
743 | 0 | switch (peek()) { |
744 | 0 | case '\0': // i.e., EOF |
745 | 0 | throw css::uno::RuntimeException( |
746 | 0 | "premature end of " + fileUrl_ ); |
747 | 0 | case '\x0D': |
748 | 0 | pad_.add(begin, pos_ - begin); |
749 | 0 | ++pos_; |
750 | 0 | if (peek() != '\x0A') { |
751 | 0 | pad_.add("\x0A"); |
752 | 0 | } |
753 | 0 | begin = pos_; |
754 | 0 | break; |
755 | 0 | case '&': |
756 | 0 | pad_.add(begin, pos_ - begin); |
757 | 0 | pos_ = handleReference(pos_, end_); |
758 | 0 | begin = pos_; |
759 | 0 | break; |
760 | 0 | case '<': |
761 | 0 | pad_.add(begin, pos_ - begin); |
762 | 0 | ++pos_; |
763 | 0 | switch (peek()) { |
764 | 0 | case '!': |
765 | 0 | ++pos_; |
766 | 0 | if (!skipComment()) { |
767 | 0 | Span cdata(scanCdataSection()); |
768 | 0 | if (cdata.is()) { |
769 | 0 | normalizeLineEnds(cdata); |
770 | 0 | } else { |
771 | 0 | skipDocumentTypeDeclaration(); |
772 | 0 | } |
773 | 0 | } |
774 | 0 | begin = pos_; |
775 | 0 | break; |
776 | 0 | case '/': |
777 | 0 | *text = pad_.get(); |
778 | 0 | ++pos_; |
779 | 0 | state_ = State::EndTag; |
780 | 0 | return Result::Text; |
781 | 0 | case '?': |
782 | 0 | ++pos_; |
783 | 0 | skipProcessingInstruction(); |
784 | 0 | begin = pos_; |
785 | 0 | break; |
786 | 0 | default: |
787 | 0 | *text = pad_.get(); |
788 | 0 | state_ = State::StartTag; |
789 | 0 | return Result::Text; |
790 | 0 | } |
791 | 0 | break; |
792 | 0 | default: |
793 | 0 | ++pos_; |
794 | 0 | break; |
795 | 0 | } |
796 | 0 | } |
797 | 0 | } |
798 | | |
799 | 0 | XmlReader::Result XmlReader::handleNormalizedText(Span * text) { |
800 | 0 | pad_.clear(); |
801 | 0 | char const * flowBegin = pos_; |
802 | 0 | char const * flowEnd = pos_; |
803 | 0 | enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK }; |
804 | | // a single true space character can go into the current flow, |
805 | | // everything else breaks the flow |
806 | 0 | Space space = SPACE_START; |
807 | 0 | for (;;) { |
808 | 0 | switch (peek()) { |
809 | 0 | case '\0': // i.e., EOF |
810 | 0 | throw css::uno::RuntimeException( |
811 | 0 | "premature end of " + fileUrl_ ); |
812 | 0 | case '\x09': |
813 | 0 | case '\x0A': |
814 | 0 | case '\x0D': |
815 | 0 | switch (space) { |
816 | 0 | case SPACE_START: |
817 | 0 | case SPACE_BREAK: |
818 | 0 | break; |
819 | 0 | case SPACE_NONE: |
820 | 0 | case SPACE_SPAN: |
821 | 0 | space = SPACE_BREAK; |
822 | 0 | break; |
823 | 0 | } |
824 | 0 | ++pos_; |
825 | 0 | break; |
826 | 0 | case ' ': |
827 | 0 | switch (space) { |
828 | 0 | case SPACE_START: |
829 | 0 | case SPACE_BREAK: |
830 | 0 | break; |
831 | 0 | case SPACE_NONE: |
832 | 0 | space = SPACE_SPAN; |
833 | 0 | break; |
834 | 0 | case SPACE_SPAN: |
835 | 0 | space = SPACE_BREAK; |
836 | 0 | break; |
837 | 0 | } |
838 | 0 | ++pos_; |
839 | 0 | break; |
840 | 0 | case '&': |
841 | 0 | switch (space) { |
842 | 0 | case SPACE_START: |
843 | 0 | break; |
844 | 0 | case SPACE_NONE: |
845 | 0 | case SPACE_SPAN: |
846 | 0 | pad_.add(flowBegin, pos_ - flowBegin); |
847 | 0 | break; |
848 | 0 | case SPACE_BREAK: |
849 | 0 | pad_.add(flowBegin, flowEnd - flowBegin); |
850 | 0 | pad_.add(" "); |
851 | 0 | break; |
852 | 0 | } |
853 | 0 | pos_ = handleReference(pos_, end_); |
854 | 0 | flowBegin = pos_; |
855 | 0 | flowEnd = pos_; |
856 | 0 | space = SPACE_NONE; |
857 | 0 | break; |
858 | 0 | case '<': |
859 | 0 | ++pos_; |
860 | 0 | switch (peek()) { |
861 | 0 | case '!': |
862 | 0 | ++pos_; |
863 | 0 | if (skipComment()) { |
864 | 0 | space = SPACE_BREAK; |
865 | 0 | } else { |
866 | 0 | Span cdata(scanCdataSection()); |
867 | 0 | if (cdata.is()) { |
868 | | // CDATA is not normalized (similar to character |
869 | | // references; it keeps the code simple), but it might |
870 | | // arguably be better to normalize it: |
871 | 0 | switch (space) { |
872 | 0 | case SPACE_START: |
873 | 0 | break; |
874 | 0 | case SPACE_NONE: |
875 | 0 | case SPACE_SPAN: |
876 | 0 | pad_.add(flowBegin, pos_ - flowBegin); |
877 | 0 | break; |
878 | 0 | case SPACE_BREAK: |
879 | 0 | pad_.add(flowBegin, flowEnd - flowBegin); |
880 | 0 | pad_.add(" "); |
881 | 0 | break; |
882 | 0 | } |
883 | 0 | normalizeLineEnds(cdata); |
884 | 0 | flowBegin = pos_; |
885 | 0 | flowEnd = pos_; |
886 | 0 | space = SPACE_NONE; |
887 | 0 | } else { |
888 | 0 | skipDocumentTypeDeclaration(); |
889 | 0 | } |
890 | 0 | } |
891 | 0 | break; |
892 | 0 | case '/': |
893 | 0 | ++pos_; |
894 | 0 | pad_.add(flowBegin, flowEnd - flowBegin); |
895 | 0 | *text = pad_.get(); |
896 | 0 | state_ = State::EndTag; |
897 | 0 | return Result::Text; |
898 | 0 | case '?': |
899 | 0 | ++pos_; |
900 | 0 | skipProcessingInstruction(); |
901 | 0 | space = SPACE_BREAK; |
902 | 0 | break; |
903 | 0 | default: |
904 | 0 | pad_.add(flowBegin, flowEnd - flowBegin); |
905 | 0 | *text = pad_.get(); |
906 | 0 | state_ = State::StartTag; |
907 | 0 | return Result::Text; |
908 | 0 | } |
909 | 0 | break; |
910 | 0 | default: |
911 | 0 | switch (space) { |
912 | 0 | case SPACE_START: |
913 | 0 | flowBegin = pos_; |
914 | 0 | break; |
915 | 0 | case SPACE_NONE: |
916 | 0 | case SPACE_SPAN: |
917 | 0 | break; |
918 | 0 | case SPACE_BREAK: |
919 | 0 | pad_.add(flowBegin, flowEnd - flowBegin); |
920 | 0 | pad_.add(" "); |
921 | 0 | flowBegin = pos_; |
922 | 0 | break; |
923 | 0 | } |
924 | 0 | flowEnd = ++pos_; |
925 | 0 | space = SPACE_NONE; |
926 | 0 | break; |
927 | 0 | } |
928 | 0 | } |
929 | 0 | } |
930 | | |
931 | 212 | int XmlReader::toNamespaceId(NamespaceIris::size_type pos) { |
932 | 212 | assert(pos <= INT_MAX); |
933 | 212 | return static_cast< int >(pos); |
934 | 212 | } |
935 | | |
936 | | } |
937 | | |
938 | | /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |