Coverage Report

Created: 2025-07-07 10:01

/src/libreoffice/xmlreader/source/xmlreader.cxx
Line
Count
Source (jump to first uncovered line)
1
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2
/*
3
 * This file is part of the LibreOffice project.
4
 *
5
 * This Source Code Form is subject to the terms of the Mozilla Public
6
 * License, v. 2.0. If a copy of the MPL was not distributed with this
7
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
8
 *
9
 * This file incorporates work covered by the following license notice:
10
 *
11
 *   Licensed to the Apache Software Foundation (ASF) under one or more
12
 *   contributor license agreements. See the NOTICE file distributed
13
 *   with this work for additional information regarding copyright
14
 *   ownership. The ASF licenses this file to you under the Apache
15
 *   License, Version 2.0 (the "License"); you may not use this file
16
 *   except in compliance with the License. You may obtain a copy of
17
 *   the License at http://www.apache.org/licenses/LICENSE-2.0 .
18
 */
19
20
#include <sal/config.h>
21
22
#include <cassert>
23
#include <climits>
24
25
#include <com/sun/star/container/NoSuchElementException.hpp>
26
#include <com/sun/star/uno/RuntimeException.hpp>
27
#include <o3tl/numeric.hxx>
28
#include <osl/file.h>
29
#include <rtl/character.hxx>
30
#include <rtl/string.h>
31
#include <rtl/ustring.hxx>
32
#include <sal/log.hxx>
33
#include <sal/types.h>
34
#include <utility>
35
#include <xmlreader/pad.hxx>
36
#include <xmlreader/span.hxx>
37
#include <xmlreader/xmlreader.hxx>
38
39
namespace xmlreader {
40
41
namespace {
42
43
1.78M
bool isSpace(char c) {
44
1.78M
    switch (c) {
45
0
    case '\x09':
46
0
    case '\x0A':
47
0
    case '\x0D':
48
360k
    case ' ':
49
360k
        return true;
50
1.42M
    default:
51
1.42M
        return false;
52
1.78M
    }
53
1.78M
}
54
55
}
56
57
XmlReader::XmlReader(OUString fileUrl)
58
106
    : fileUrl_(std::move(fileUrl))
59
106
    , fileHandle_(nullptr)
60
106
{
61
106
    oslFileError e = osl_openFile(
62
106
        fileUrl_.pData, &fileHandle_, osl_File_OpenFlag_Read);
63
106
    switch (e)
64
106
    {
65
106
    case osl_File_E_None:
66
106
        break;
67
0
    case osl_File_E_NOENT:
68
0
        throw css::container::NoSuchElementException( fileUrl_ );
69
0
    default:
70
0
        throw css::uno::RuntimeException(
71
0
            "cannot open " + fileUrl_ + ": " + OUString::number(e));
72
106
    }
73
106
    e = osl_getFileSize(fileHandle_, &fileSize_);
74
106
    if (e == osl_File_E_None) {
75
106
        e = osl_mapFile(
76
106
            fileHandle_, &fileAddress_, fileSize_, 0,
77
106
            osl_File_MapFlag_WillNeed);
78
106
    }
79
106
    if (e != osl_File_E_None) {
80
0
        oslFileError e2 = osl_closeFile(fileHandle_);
81
0
        if (e2 != osl_File_E_None) {
82
0
            SAL_WARN(
83
0
                "xmlreader",
84
0
                "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e2);
85
0
        }
86
0
        throw css::uno::RuntimeException(
87
0
            "cannot mmap " + fileUrl_ + " (" + OUString::number(e) + ")" );
88
0
    }
89
106
    namespaceIris_.emplace_back("http://www.w3.org/XML/1998/namespace");
90
106
    namespaces_.emplace_back(Span("xml"), NAMESPACE_XML);
91
106
    pos_ = static_cast< char * >(fileAddress_);
92
106
    end_ = pos_ + fileSize_;
93
106
    state_ = State::Content;
94
106
    firstAttribute_ = true;
95
106
}
96
97
106
XmlReader::~XmlReader() {
98
106
    if (!fileHandle_)
99
0
        return;
100
106
    oslFileError e = osl_unmapMappedFile(fileHandle_, fileAddress_, fileSize_);
101
106
    if (e != osl_File_E_None) {
102
0
        SAL_WARN(
103
0
            "xmlreader",
104
0
            "osl_unmapMappedFile of \"" << fileUrl_ << "\" failed with " << +e);
105
0
    }
106
106
    e = osl_closeFile(fileHandle_);
107
106
    if (e != osl_File_E_None) {
108
0
        SAL_WARN(
109
0
            "xmlreader",
110
0
            "osl_closeFile of \"" << fileUrl_ << "\" failed with " << +e);
111
0
    }
112
106
}
113
114
106
int XmlReader::registerNamespaceIri(Span const & iri) {
115
106
    int id = toNamespaceId(namespaceIris_.size());
116
106
    namespaceIris_.push_back(iri);
117
106
    if (iri == "http://www.w3.org/2001/XMLSchema-instance") {
118
        // Old user layer .xcu files used the xsi namespace prefix without
119
        // declaring a corresponding namespace binding, see issue 77174; reading
120
        // those files during migration would fail without this hack that can be
121
        // removed once migration is no longer relevant (see
122
        // configmgr::Components::parseModificationLayer):
123
0
        namespaces_.emplace_back(Span("xsi"), id);
124
0
    }
125
106
    return id;
126
106
}
127
128
XmlReader::Result XmlReader::nextItem(Text reportText, Span * data, int * nsId)
129
465k
{
130
465k
    switch (state_) {
131
340k
    case State::Content:
132
340k
        switch (reportText) {
133
340k
        case Text::NONE:
134
340k
            return handleSkippedText(data, nsId);
135
0
        case Text::Raw:
136
0
            return handleRawText(data);
137
0
        default: // Text::Normalized
138
0
            return handleNormalizedText(data);
139
340k
        }
140
0
    case State::StartTag:
141
0
        return handleStartTag(nsId, data);
142
0
    case State::EndTag:
143
0
        return handleEndTag();
144
125k
    case State::EmptyElementTag:
145
125k
        handleElementEnd();
146
125k
        return Result::End;
147
106
    default: // State::Done
148
106
        return Result::Done;
149
465k
    }
150
465k
}
151
152
592k
bool XmlReader::nextAttribute(int * nsId, Span * localName) {
153
592k
    assert(nsId != nullptr && localName != nullptr);
154
592k
    if (firstAttribute_) {
155
232k
        currentAttribute_ = attributes_.begin();
156
232k
        firstAttribute_ = false;
157
360k
    } else {
158
360k
        ++currentAttribute_;
159
360k
    }
160
592k
    if (currentAttribute_ == attributes_.end()) {
161
232k
        return false;
162
232k
    }
163
360k
    if (currentAttribute_->nameColon == nullptr) {
164
360k
        *nsId = NAMESPACE_NONE;
165
360k
        *localName = Span(
166
360k
            currentAttribute_->nameBegin,
167
360k
            currentAttribute_->nameEnd - currentAttribute_->nameBegin);
168
360k
    } else {
169
0
        *nsId = getNamespaceId(
170
0
            Span(
171
0
                currentAttribute_->nameBegin,
172
0
                currentAttribute_->nameColon - currentAttribute_->nameBegin));
173
0
        *localName = Span(
174
0
            currentAttribute_->nameColon + 1,
175
0
            currentAttribute_->nameEnd - (currentAttribute_->nameColon + 1));
176
0
    }
177
360k
    return true;
178
592k
}
179
180
360k
Span XmlReader::getAttributeValue(bool fullyNormalize) {
181
360k
    return handleAttributeValue(
182
360k
        currentAttribute_->valueBegin, currentAttribute_->valueEnd,
183
360k
        fullyNormalize);
184
360k
}
185
186
0
int XmlReader::getNamespaceId(Span const & prefix) const {
187
0
    auto i = std::find_if(namespaces_.crbegin(), namespaces_.crend(),
188
0
        [&prefix](const NamespaceData& rNamespaceData) { return prefix == rNamespaceData.prefix; });
189
190
0
    if (i != namespaces_.rend())
191
0
        return i->nsId;
192
193
0
    return NAMESPACE_UNKNOWN;
194
0
}
195
196
197
0
void XmlReader::normalizeLineEnds(Span const & text) {
198
0
    char const * p = text.begin;
199
0
    sal_Int32 n = text.length;
200
0
    for (;;) {
201
0
        sal_Int32 i = rtl_str_indexOfChar_WithLength(p, n, '\x0D');
202
0
        if (i < 0) {
203
0
            break;
204
0
        }
205
0
        pad_.add(p, i);
206
0
        p += i + 1;
207
0
        n -= i + 1;
208
0
        if (n == 0 || *p != '\x0A') {
209
0
            pad_.add("\x0A");
210
0
        }
211
0
    }
212
0
    pad_.add(p, n);
213
0
}
214
215
1.42M
void XmlReader::skipSpace() {
216
1.78M
    while (isSpace(peek())) {
217
360k
        ++pos_;
218
360k
    }
219
1.42M
}
220
221
0
bool XmlReader::skipComment() {
222
0
    if (rtl_str_shortenedCompare_WithLength(
223
0
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"),
224
0
            RTL_CONSTASCII_LENGTH("--")) !=
225
0
        0)
226
0
    {
227
0
        return false;
228
0
    }
229
0
    pos_ += RTL_CONSTASCII_LENGTH("--");
230
0
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
231
0
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("--"));
232
0
    if (i < 0) {
233
0
        throw css::uno::RuntimeException(
234
0
            "premature end (within comment) of " + fileUrl_ );
235
0
    }
236
0
    pos_ += i + RTL_CONSTASCII_LENGTH("--");
237
0
    if (read() != '>') {
238
0
        throw css::uno::RuntimeException(
239
0
            "illegal \"--\" within comment in " + fileUrl_ );
240
0
    }
241
0
    return true;
242
0
}
243
244
106
void XmlReader::skipProcessingInstruction() {
245
106
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
246
106
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("?>"));
247
106
    if (i < 0) {
248
0
        throw css::uno::RuntimeException(
249
0
            "bad '<?' in " + fileUrl_ );
250
0
    }
251
106
    pos_ += i + RTL_CONSTASCII_LENGTH("?>");
252
106
}
253
254
0
void XmlReader::skipDocumentTypeDeclaration() {
255
    // Neither is it checked that the doctypedecl is at the correct position in
256
    // the document, nor that it is well-formed:
257
0
    for (;;) {
258
0
        char c = read();
259
0
        switch (c) {
260
0
        case '\0': // i.e., EOF
261
0
            throw css::uno::RuntimeException(
262
0
                "premature end (within DTD) of " + fileUrl_ );
263
0
        case '"':
264
0
        case '\'':
265
0
            {
266
0
                sal_Int32 i = rtl_str_indexOfChar_WithLength(
267
0
                    pos_, end_ - pos_, c);
268
0
                if (i < 0) {
269
0
                    throw css::uno::RuntimeException(
270
0
                        "premature end (within DTD) of " + fileUrl_ );
271
0
                }
272
0
                pos_ += i + 1;
273
0
            }
274
0
            break;
275
0
        case '>':
276
0
            return;
277
0
        case '[':
278
0
            for (;;) {
279
0
                c = read();
280
0
                switch (c) {
281
0
                case '\0': // i.e., EOF
282
0
                    throw css::uno::RuntimeException(
283
0
                        "premature end (within DTD) of " + fileUrl_ );
284
0
                case '"':
285
0
                case '\'':
286
0
                    {
287
0
                        sal_Int32 i = rtl_str_indexOfChar_WithLength(
288
0
                            pos_, end_ - pos_, c);
289
0
                        if (i < 0) {
290
0
                            throw css::uno::RuntimeException(
291
0
                                "premature end (within DTD) of " + fileUrl_ );
292
0
                        }
293
0
                        pos_ += i + 1;
294
0
                    }
295
0
                    break;
296
0
                case '<':
297
0
                    switch (read()) {
298
0
                    case '\0': // i.e., EOF
299
0
                        throw css::uno::RuntimeException(
300
0
                            "premature end (within DTD) of " + fileUrl_ );
301
0
                    case '!':
302
0
                        skipComment();
303
0
                        break;
304
0
                    case '?':
305
0
                        skipProcessingInstruction();
306
0
                        break;
307
0
                    default:
308
0
                        break;
309
0
                    }
310
0
                    break;
311
0
                case ']':
312
0
                    skipSpace();
313
0
                    if (read() != '>') {
314
0
                        throw css::uno::RuntimeException(
315
0
                            "missing \">\" of DTD in " + fileUrl_ );
316
0
                    }
317
0
                    return;
318
0
                default:
319
0
                    break;
320
0
                }
321
0
            }
322
0
        default:
323
0
            break;
324
0
        }
325
0
    }
326
0
}
327
328
0
Span XmlReader::scanCdataSection() {
329
0
    if (rtl_str_shortenedCompare_WithLength(
330
0
            pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("[CDATA["),
331
0
            RTL_CONSTASCII_LENGTH("[CDATA[")) !=
332
0
        0)
333
0
    {
334
0
        return Span();
335
0
    }
336
0
    pos_ += RTL_CONSTASCII_LENGTH("[CDATA[");
337
0
    char const * begin = pos_;
338
0
    sal_Int32 i = rtl_str_indexOfStr_WithLength(
339
0
        pos_, end_ - pos_, RTL_CONSTASCII_STRINGPARAM("]]>"));
340
0
    if (i < 0) {
341
0
        throw css::uno::RuntimeException(
342
0
            "premature end (within CDATA section) of " + fileUrl_ );
343
0
    }
344
0
    pos_ += i + RTL_CONSTASCII_LENGTH("]]>");
345
0
    return Span(begin, i);
346
0
}
347
348
700k
bool XmlReader::scanName(char const ** nameColon) {
349
700k
    assert(nameColon != nullptr && *nameColon == nullptr);
350
6.71M
    for (char const * begin = pos_;; ++pos_) {
351
6.71M
        switch (peek()) {
352
0
        case '\0': // i.e., EOF
353
0
        case '\x09':
354
0
        case '\x0A':
355
0
        case '\x0D':
356
232k
        case ' ':
357
232k
        case '/':
358
592k
        case '=':
359
700k
        case '>':
360
700k
            return pos_ != begin;
361
0
        case ':':
362
0
            *nameColon = pos_;
363
0
            break;
364
6.01M
        default:
365
6.01M
            break;
366
6.71M
        }
367
6.71M
    }
368
700k
}
369
370
106
int XmlReader::scanNamespaceIri(char const * begin, char const * end) {
371
106
    assert(begin != nullptr && begin <= end);
372
106
    Span iri(handleAttributeValue(begin, end, false));
373
212
    for (NamespaceIris::size_type i = 0; i < namespaceIris_.size(); ++i) {
374
212
        if (namespaceIris_[i] == iri) {
375
106
            return toNamespaceId(i);
376
106
        }
377
212
    }
378
0
    return XmlReader::NAMESPACE_UNKNOWN;
379
106
}
380
381
char const * XmlReader::handleReference(char const * position, char const * end)
382
0
{
383
0
    assert(position != nullptr && *position == '&' && position < end);
384
0
    ++position;
385
0
    if (*position == '#') {
386
0
        ++position;
387
0
        sal_uInt32 val = 0;
388
0
        char const * p;
389
0
        if (*position == 'x') {
390
0
            ++position;
391
0
            p = position;
392
0
            for (;; ++position)
393
0
            {
394
0
                val = o3tl::convertToHex<sal_uInt32>(*position);
395
0
                if (val >= 16)
396
0
                    break;
397
398
0
                if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
399
0
                    throw css::uno::RuntimeException(
400
0
                        "'&#x...' too large in " + fileUrl_ );
401
0
                }
402
0
            }
403
0
        } else {
404
0
            p = position;
405
0
            for (;; ++position) {
406
0
                char c = *position;
407
0
                if (c >= '0' && c <= '9') {
408
0
                    val = 10 * val + (c - '0');
409
0
                } else {
410
0
                    break;
411
0
                }
412
0
                if (!rtl::isUnicodeCodePoint(val)) { // avoid overflow
413
0
                    throw css::uno::RuntimeException(
414
0
                        "'&#...' too large in " + fileUrl_ );
415
0
                }
416
0
            }
417
0
        }
418
0
        if (position == p || *position++ != ';') {
419
0
            throw css::uno::RuntimeException(
420
0
                "'&#...' missing ';' in " + fileUrl_ );
421
0
        }
422
0
        assert(rtl::isUnicodeCodePoint(val));
423
0
        if ((val < 0x20 && val != 0x9 && val != 0xA && val != 0xD) ||
424
0
            (val >= 0xD800 && val <= 0xDFFF) || val == 0xFFFE || val == 0xFFFF)
425
0
        {
426
0
            throw css::uno::RuntimeException(
427
0
                "character reference denoting invalid character in " + fileUrl_ );
428
0
        }
429
0
        char buf[4];
430
0
        sal_Int32 len;
431
0
        if (val < 0x80) {
432
0
            buf[0] = static_cast< char >(val);
433
0
            len = 1;
434
0
        } else if (val < 0x800) {
435
0
            buf[0] = static_cast< char >((val >> 6) | 0xC0);
436
0
            buf[1] = static_cast< char >((val & 0x3F) | 0x80);
437
0
            len = 2;
438
0
        } else if (val < 0x10000) {
439
0
            buf[0] = static_cast< char >((val >> 12) | 0xE0);
440
0
            buf[1] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
441
0
            buf[2] = static_cast< char >((val & 0x3F) | 0x80);
442
0
            len = 3;
443
0
        } else {
444
0
            buf[0] = static_cast< char >((val >> 18) | 0xF0);
445
0
            buf[1] = static_cast< char >(((val >> 12) & 0x3F) | 0x80);
446
0
            buf[2] = static_cast< char >(((val >> 6) & 0x3F) | 0x80);
447
0
            buf[3] = static_cast< char >((val & 0x3F) | 0x80);
448
0
            len = 4;
449
0
        }
450
0
        pad_.addEphemeral(buf, len);
451
0
        return position;
452
0
    } else {
453
0
        struct EntityRef {
454
0
            char const * inBegin;
455
0
            sal_Int32 const inLength;
456
0
            char const * outBegin;
457
0
            sal_Int32 const outLength;
458
0
        };
459
0
        static EntityRef const refs[] = {
460
0
            { RTL_CONSTASCII_STRINGPARAM("amp;"),
461
0
              RTL_CONSTASCII_STRINGPARAM("&") },
462
0
            { RTL_CONSTASCII_STRINGPARAM("lt;"),
463
0
              RTL_CONSTASCII_STRINGPARAM("<") },
464
0
            { RTL_CONSTASCII_STRINGPARAM("gt;"),
465
0
              RTL_CONSTASCII_STRINGPARAM(">") },
466
0
            { RTL_CONSTASCII_STRINGPARAM("apos;"),
467
0
              RTL_CONSTASCII_STRINGPARAM("'") },
468
0
            { RTL_CONSTASCII_STRINGPARAM("quot;"),
469
0
              RTL_CONSTASCII_STRINGPARAM("\"") } };
470
0
        for (const auto & ref : refs) {
471
0
            if (rtl_str_shortenedCompare_WithLength(
472
0
                    position, end - position, ref.inBegin, ref.inLength,
473
0
                    ref.inLength) ==
474
0
                0)
475
0
            {
476
0
                position += ref.inLength;
477
0
                pad_.add(ref.outBegin, ref.outLength);
478
0
                return position;
479
0
            }
480
0
        }
481
0
        throw css::uno::RuntimeException(
482
0
            "unknown entity reference in " + fileUrl_ );
483
0
    }
484
0
}
485
486
Span XmlReader::handleAttributeValue(
487
    char const * begin, char const * end, bool fullyNormalize)
488
360k
{
489
360k
    pad_.clear();
490
360k
    if (fullyNormalize) {
491
0
        while (begin != end && isSpace(*begin)) {
492
0
            ++begin;
493
0
        }
494
0
        while (end != begin && isSpace(end[-1])) {
495
0
            --end;
496
0
        }
497
0
        char const * p = begin;
498
0
        enum Space { SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
499
            // a single true space character can go into the current span,
500
            // everything else breaks the span
501
0
        Space space = SPACE_NONE;
502
0
        while (p != end) {
503
0
            switch (*p) {
504
0
            case '\x09':
505
0
            case '\x0A':
506
0
            case '\x0D':
507
0
                switch (space) {
508
0
                case SPACE_NONE:
509
0
                    pad_.add(begin, p - begin);
510
0
                    pad_.add(" ");
511
0
                    space = SPACE_BREAK;
512
0
                    break;
513
0
                case SPACE_SPAN:
514
0
                    pad_.add(begin, p - begin);
515
0
                    space = SPACE_BREAK;
516
0
                    break;
517
0
                case SPACE_BREAK:
518
0
                    break;
519
0
                }
520
0
                begin = ++p;
521
0
                break;
522
0
            case ' ':
523
0
                switch (space) {
524
0
                case SPACE_NONE:
525
0
                    ++p;
526
0
                    space = SPACE_SPAN;
527
0
                    break;
528
0
                case SPACE_SPAN:
529
0
                    pad_.add(begin, p - begin);
530
0
                    begin = ++p;
531
0
                    space = SPACE_BREAK;
532
0
                    break;
533
0
                case SPACE_BREAK:
534
0
                    begin = ++p;
535
0
                    break;
536
0
                }
537
0
                break;
538
0
            case '&':
539
0
                pad_.add(begin, p - begin);
540
0
                p = handleReference(p, end);
541
0
                begin = p;
542
0
                space = SPACE_NONE;
543
0
                break;
544
0
            default:
545
0
                ++p;
546
0
                space = SPACE_NONE;
547
0
                break;
548
0
            }
549
0
        }
550
0
        pad_.add(begin, p - begin);
551
360k
    } else {
552
360k
        char const * p = begin;
553
14.5M
        while (p != end) {
554
14.1M
            switch (*p) {
555
0
            case '\x09':
556
0
            case '\x0A':
557
0
                pad_.add(begin, p - begin);
558
0
                begin = ++p;
559
0
                pad_.add(" ");
560
0
                break;
561
0
            case '\x0D':
562
0
                pad_.add(begin, p - begin);
563
0
                ++p;
564
0
                if (peek() == '\x0A') {
565
0
                    ++p;
566
0
                }
567
0
                begin = p;
568
0
                pad_.add(" ");
569
0
                break;
570
0
            case '&':
571
0
                pad_.add(begin, p - begin);
572
0
                p = handleReference(p, end);
573
0
                begin = p;
574
0
                break;
575
14.1M
            default:
576
14.1M
                ++p;
577
14.1M
                break;
578
14.1M
            }
579
14.1M
        }
580
360k
        pad_.add(begin, p - begin);
581
360k
    }
582
360k
    return pad_.get();
583
360k
}
584
585
232k
XmlReader::Result XmlReader::handleStartTag(int * nsId, Span * localName) {
586
232k
    assert(nsId != nullptr && localName);
587
232k
    char const * nameBegin = pos_;
588
232k
    char const * nameColon = nullptr;
589
232k
    if (!scanName(&nameColon)) {
590
0
        throw css::uno::RuntimeException(
591
0
            "bad tag name in " + fileUrl_ );
592
0
    }
593
232k
    char const * nameEnd = pos_;
594
232k
    NamespaceList::size_type inheritedNamespaces = namespaces_.size();
595
232k
    bool hasDefaultNs = false;
596
232k
    int defaultNsId = NAMESPACE_NONE;
597
232k
    attributes_.clear();
598
592k
    for (;;) {
599
592k
        char const * p = pos_;
600
592k
        skipSpace();
601
592k
        if (peek() == '/' || peek() == '>') {
602
232k
            break;
603
232k
        }
604
360k
        if (pos_ == p) {
605
0
            throw css::uno::RuntimeException(
606
0
                "missing whitespace before attribute in " + fileUrl_ );
607
0
        }
608
360k
        char const * attrNameBegin = pos_;
609
360k
        char const * attrNameColon = nullptr;
610
360k
        if (!scanName(&attrNameColon)) {
611
0
            throw css::uno::RuntimeException(
612
0
                "bad attribute name in " + fileUrl_ );
613
0
        }
614
360k
        char const * attrNameEnd = pos_;
615
360k
        skipSpace();
616
360k
        if (read() != '=') {
617
0
            throw css::uno::RuntimeException(
618
0
                "missing '=' in " + fileUrl_ );
619
0
        }
620
360k
        skipSpace();
621
360k
        char del = read();
622
360k
        if (del != '\'' && del != '"') {
623
0
            throw css::uno::RuntimeException(
624
0
                "bad attribute value in " + fileUrl_ );
625
0
        }
626
360k
        char const * valueBegin = pos_;
627
360k
        sal_Int32 i = rtl_str_indexOfChar_WithLength(pos_, end_ - pos_, del);
628
360k
        if (i < 0) {
629
0
            throw css::uno::RuntimeException(
630
0
                "unterminated attribute value in " + fileUrl_ );
631
0
        }
632
360k
        char const * valueEnd = pos_ + i;
633
360k
        pos_ += i + 1;
634
360k
        if (attrNameColon == nullptr &&
635
360k
            Span(attrNameBegin, attrNameEnd - attrNameBegin) == "xmlns")
636
106
        {
637
106
            hasDefaultNs = true;
638
106
            defaultNsId = scanNamespaceIri(valueBegin, valueEnd);
639
360k
        } else if (attrNameColon != nullptr &&
640
360k
                   Span(attrNameBegin, attrNameColon - attrNameBegin) ==
641
0
                       "xmlns")
642
0
        {
643
0
            namespaces_.emplace_back(
644
0
                    Span(attrNameColon + 1, attrNameEnd - (attrNameColon + 1)),
645
0
                    scanNamespaceIri(valueBegin, valueEnd));
646
360k
        } else {
647
360k
            attributes_.emplace_back(
648
360k
                    attrNameBegin, attrNameEnd, attrNameColon, valueBegin,
649
360k
                    valueEnd);
650
360k
        }
651
360k
    }
652
232k
    if (!hasDefaultNs && !elements_.empty()) {
653
232k
        defaultNsId = elements_.top().defaultNamespaceId;
654
232k
    }
655
232k
    firstAttribute_ = true;
656
232k
    if (peek() == '/') {
657
125k
        state_ = State::EmptyElementTag;
658
125k
        ++pos_;
659
125k
    } else {
660
107k
        state_ = State::Content;
661
107k
    }
662
232k
    if (peek() != '>') {
663
0
        throw css::uno::RuntimeException(
664
0
            "missing '>' in " + fileUrl_ );
665
0
    }
666
232k
    ++pos_;
667
232k
    elements_.push(
668
232k
        ElementData(
669
232k
            Span(nameBegin, nameEnd - nameBegin), inheritedNamespaces,
670
232k
            defaultNsId));
671
232k
    if (nameColon == nullptr) {
672
232k
        *nsId = defaultNsId;
673
232k
        *localName = Span(nameBegin, nameEnd - nameBegin);
674
232k
    } else {
675
0
        *nsId = getNamespaceId(Span(nameBegin, nameColon - nameBegin));
676
0
        *localName = Span(nameColon + 1, nameEnd - (nameColon + 1));
677
0
    }
678
232k
    return Result::Begin;
679
232k
}
680
681
107k
XmlReader::Result XmlReader::handleEndTag() {
682
107k
    if (elements_.empty()) {
683
0
        throw css::uno::RuntimeException(
684
0
            "spurious end tag in " + fileUrl_ );
685
0
    }
686
107k
    char const * nameBegin = pos_;
687
107k
    char const * nameColon = nullptr;
688
107k
    if (!scanName(&nameColon) ||
689
107k
        !elements_.top().name.equals(nameBegin, pos_ - nameBegin))
690
0
    {
691
0
        throw css::uno::RuntimeException(
692
0
            "tag mismatch in " + fileUrl_ );
693
0
    }
694
107k
    handleElementEnd();
695
107k
    skipSpace();
696
107k
    if (peek() != '>') {
697
0
        throw css::uno::RuntimeException(
698
0
            "missing '>' in " + fileUrl_ );
699
0
    }
700
107k
    ++pos_;
701
107k
    return Result::End;
702
107k
}
703
704
232k
void XmlReader::handleElementEnd() {
705
232k
    assert(!elements_.empty());
706
232k
    auto end = elements_.top().inheritedNamespaces;
707
232k
    namespaces_.resize(end);
708
232k
    elements_.pop();
709
232k
    state_ = elements_.empty() ? State::Done : State::Content;
710
232k
}
711
712
340k
XmlReader::Result XmlReader::handleSkippedText(Span * data, int * nsId) {
713
340k
    for (;;) {
714
340k
        auto i = static_cast<const char*>(std::memchr(pos_, '<', end_ - pos_));
715
340k
        if (!i) {
716
0
            throw css::uno::RuntimeException(
717
0
                "premature end of " + fileUrl_ );
718
0
        }
719
340k
        pos_ = i + 1;
720
340k
        switch (peek()) {
721
0
        case '!':
722
0
            ++pos_;
723
0
            if (!skipComment() && !scanCdataSection().is()) {
724
0
                skipDocumentTypeDeclaration();
725
0
            }
726
0
            break;
727
107k
        case '/':
728
107k
            ++pos_;
729
107k
            return handleEndTag();
730
106
        case '?':
731
106
            ++pos_;
732
106
            skipProcessingInstruction();
733
106
            break;
734
232k
        default:
735
232k
            return handleStartTag(nsId, data);
736
340k
        }
737
340k
    }
738
340k
}
739
740
0
XmlReader::Result XmlReader::handleRawText(Span * text) {
741
0
    pad_.clear();
742
0
    for (char const * begin = pos_;;) {
743
0
        switch (peek()) {
744
0
        case '\0': // i.e., EOF
745
0
            throw css::uno::RuntimeException(
746
0
                "premature end of " + fileUrl_ );
747
0
        case '\x0D':
748
0
            pad_.add(begin, pos_ - begin);
749
0
            ++pos_;
750
0
            if (peek() != '\x0A') {
751
0
                pad_.add("\x0A");
752
0
            }
753
0
            begin = pos_;
754
0
            break;
755
0
        case '&':
756
0
            pad_.add(begin, pos_ - begin);
757
0
            pos_ = handleReference(pos_, end_);
758
0
            begin = pos_;
759
0
            break;
760
0
        case '<':
761
0
            pad_.add(begin, pos_ - begin);
762
0
            ++pos_;
763
0
            switch (peek()) {
764
0
            case '!':
765
0
                ++pos_;
766
0
                if (!skipComment()) {
767
0
                    Span cdata(scanCdataSection());
768
0
                    if (cdata.is()) {
769
0
                        normalizeLineEnds(cdata);
770
0
                    } else {
771
0
                        skipDocumentTypeDeclaration();
772
0
                    }
773
0
                }
774
0
                begin = pos_;
775
0
                break;
776
0
            case '/':
777
0
                *text = pad_.get();
778
0
                ++pos_;
779
0
                state_ = State::EndTag;
780
0
                return Result::Text;
781
0
            case '?':
782
0
                ++pos_;
783
0
                skipProcessingInstruction();
784
0
                begin = pos_;
785
0
                break;
786
0
            default:
787
0
                *text = pad_.get();
788
0
                state_ = State::StartTag;
789
0
                return Result::Text;
790
0
            }
791
0
            break;
792
0
        default:
793
0
            ++pos_;
794
0
            break;
795
0
        }
796
0
    }
797
0
}
798
799
0
XmlReader::Result XmlReader::handleNormalizedText(Span * text) {
800
0
    pad_.clear();
801
0
    char const * flowBegin = pos_;
802
0
    char const * flowEnd = pos_;
803
0
    enum Space { SPACE_START, SPACE_NONE, SPACE_SPAN, SPACE_BREAK };
804
        // a single true space character can go into the current flow,
805
        // everything else breaks the flow
806
0
    Space space = SPACE_START;
807
0
    for (;;) {
808
0
        switch (peek()) {
809
0
        case '\0': // i.e., EOF
810
0
            throw css::uno::RuntimeException(
811
0
                "premature end of " + fileUrl_ );
812
0
        case '\x09':
813
0
        case '\x0A':
814
0
        case '\x0D':
815
0
            switch (space) {
816
0
            case SPACE_START:
817
0
            case SPACE_BREAK:
818
0
                break;
819
0
            case SPACE_NONE:
820
0
            case SPACE_SPAN:
821
0
                space = SPACE_BREAK;
822
0
                break;
823
0
            }
824
0
            ++pos_;
825
0
            break;
826
0
        case ' ':
827
0
            switch (space) {
828
0
            case SPACE_START:
829
0
            case SPACE_BREAK:
830
0
                break;
831
0
            case SPACE_NONE:
832
0
                space = SPACE_SPAN;
833
0
                break;
834
0
            case SPACE_SPAN:
835
0
                space = SPACE_BREAK;
836
0
                break;
837
0
            }
838
0
            ++pos_;
839
0
            break;
840
0
        case '&':
841
0
            switch (space) {
842
0
            case SPACE_START:
843
0
                break;
844
0
            case SPACE_NONE:
845
0
            case SPACE_SPAN:
846
0
                pad_.add(flowBegin, pos_ - flowBegin);
847
0
                break;
848
0
            case SPACE_BREAK:
849
0
                pad_.add(flowBegin, flowEnd - flowBegin);
850
0
                pad_.add(" ");
851
0
                break;
852
0
            }
853
0
            pos_ = handleReference(pos_, end_);
854
0
            flowBegin = pos_;
855
0
            flowEnd = pos_;
856
0
            space = SPACE_NONE;
857
0
            break;
858
0
        case '<':
859
0
            ++pos_;
860
0
            switch (peek()) {
861
0
            case '!':
862
0
                ++pos_;
863
0
                if (skipComment()) {
864
0
                    space = SPACE_BREAK;
865
0
                } else {
866
0
                    Span cdata(scanCdataSection());
867
0
                    if (cdata.is()) {
868
                        // CDATA is not normalized (similar to character
869
                        // references; it keeps the code simple), but it might
870
                        // arguably be better to normalize it:
871
0
                        switch (space) {
872
0
                        case SPACE_START:
873
0
                            break;
874
0
                        case SPACE_NONE:
875
0
                        case SPACE_SPAN:
876
0
                            pad_.add(flowBegin, pos_ - flowBegin);
877
0
                            break;
878
0
                        case SPACE_BREAK:
879
0
                            pad_.add(flowBegin, flowEnd - flowBegin);
880
0
                            pad_.add(" ");
881
0
                            break;
882
0
                        }
883
0
                        normalizeLineEnds(cdata);
884
0
                        flowBegin = pos_;
885
0
                        flowEnd = pos_;
886
0
                        space = SPACE_NONE;
887
0
                    } else {
888
0
                        skipDocumentTypeDeclaration();
889
0
                    }
890
0
                }
891
0
                break;
892
0
            case '/':
893
0
                ++pos_;
894
0
                pad_.add(flowBegin, flowEnd - flowBegin);
895
0
                *text = pad_.get();
896
0
                state_ = State::EndTag;
897
0
                return Result::Text;
898
0
            case '?':
899
0
                ++pos_;
900
0
                skipProcessingInstruction();
901
0
                space = SPACE_BREAK;
902
0
                break;
903
0
            default:
904
0
                pad_.add(flowBegin, flowEnd - flowBegin);
905
0
                *text = pad_.get();
906
0
                state_ = State::StartTag;
907
0
                return Result::Text;
908
0
            }
909
0
            break;
910
0
        default:
911
0
            switch (space) {
912
0
            case SPACE_START:
913
0
                flowBegin = pos_;
914
0
                break;
915
0
            case SPACE_NONE:
916
0
            case SPACE_SPAN:
917
0
                break;
918
0
            case SPACE_BREAK:
919
0
                pad_.add(flowBegin, flowEnd - flowBegin);
920
0
                pad_.add(" ");
921
0
                flowBegin = pos_;
922
0
                break;
923
0
            }
924
0
            flowEnd = ++pos_;
925
0
            space = SPACE_NONE;
926
0
            break;
927
0
        }
928
0
    }
929
0
}
930
931
212
int XmlReader::toNamespaceId(NamespaceIris::size_type pos) {
932
212
    assert(pos <= INT_MAX);
933
212
    return static_cast< int >(pos);
934
212
}
935
936
}
937
938
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */