Coverage Report

Created: 2022-05-20 06:19

/src/serenity/AK/GenericLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/Assertions.h>
8
#include <AK/CharacterTypes.h>
9
#include <AK/GenericLexer.h>
10
#include <AK/StringBuilder.h>
11
12
#ifndef KERNEL
13
#    include <AK/String.h>
14
#    include <AK/Utf16View.h>
15
#endif
16
17
namespace AK {
18
// Consume a number of characters
19
StringView GenericLexer::consume(size_t count)
20
288
{
21
288
    if (count == 0)
22
0
        return {};
23
24
288
    size_t start = m_index;
25
288
    size_t length = min(count, m_input.length() - m_index);
26
288
    m_index += length;
27
28
288
    return m_input.substring_view(start, length);
29
288
}
30
31
// Consume the rest of the input
32
StringView GenericLexer::consume_all()
33
0
{
34
0
    if (is_eof())
35
0
        return {};
36
37
0
    auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
38
0
    m_index = m_input.length();
39
0
    return rest;
40
0
}
41
42
// Consume until a new line is found
43
StringView GenericLexer::consume_line()
44
0
{
45
0
    size_t start = m_index;
46
0
    while (!is_eof() && peek() != '\r' && peek() != '\n')
47
0
        m_index++;
48
0
    size_t length = m_index - start;
49
50
0
    consume_specific('\r');
51
0
    consume_specific('\n');
52
53
0
    if (length == 0)
54
0
        return {};
55
0
    return m_input.substring_view(start, length);
56
0
}
57
58
// Consume and return characters until `stop` is peek'd
59
StringView GenericLexer::consume_until(char stop)
60
0
{
61
0
    size_t start = m_index;
62
0
    while (!is_eof() && peek() != stop)
63
0
        m_index++;
64
0
    size_t length = m_index - start;
65
66
0
    if (length == 0)
67
0
        return {};
68
0
    return m_input.substring_view(start, length);
69
0
}
70
71
// Consume and return characters until the string `stop` is found
72
StringView GenericLexer::consume_until(char const* stop)
73
0
{
74
0
    size_t start = m_index;
75
0
    while (!is_eof() && !next_is(stop))
76
0
        m_index++;
77
0
    size_t length = m_index - start;
78
79
0
    if (length == 0)
80
0
        return {};
81
0
    return m_input.substring_view(start, length);
82
0
}
83
84
// Consume and return characters until the string `stop` is found
85
StringView GenericLexer::consume_until(StringView stop)
86
0
{
87
0
    size_t start = m_index;
88
0
    while (!is_eof() && !next_is(stop))
89
0
        m_index++;
90
0
    size_t length = m_index - start;
91
92
0
    if (length == 0)
93
0
        return {};
94
0
    return m_input.substring_view(start, length);
95
0
}
96
97
/*
98
 * Consume a string surrounded by single or double quotes. The returned
99
 * StringView does not include the quotes. An escape character can be provided
100
 * to capture the enclosing quotes. Please note that the escape character will
101
 * still be in the resulting StringView
102
 */
103
StringView GenericLexer::consume_quoted_string(char escape_char)
104
0
{
105
0
    if (!next_is(is_quote))
106
0
        return {};
107
108
0
    char quote_char = consume();
109
0
    size_t start = m_index;
110
0
    while (!is_eof()) {
111
0
        if (next_is(escape_char))
112
0
            m_index++;
113
0
        else if (next_is(quote_char))
114
0
            break;
115
0
        m_index++;
116
0
    }
117
0
    size_t length = m_index - start;
118
119
0
    if (peek() != quote_char) {
120
        // Restore the index in case the string is unterminated
121
0
        m_index = start - 1;
122
0
        return {};
123
0
    }
124
125
    // Ignore closing quote
126
0
    ignore();
127
128
0
    return m_input.substring_view(start, length);
129
0
}
130
131
#ifndef KERNEL
132
// Consume a quoted string (as consume_quoted_string does) and return its
// contents with every escape sequence resolved by consume_escaped_character.
// Returns a default-constructed String when consume_quoted_string yields a
// null view (no quote is next, or the string is unterminated).
String GenericLexer::consume_and_unescape_string(char escape_char)
{
    auto view = consume_quoted_string(escape_char);
    if (view.is_null())
        return {};

    // This lexer is already positioned after the closing quote, so the
    // unescaping has to run over the quoted contents themselves. Looping
    // until the sub-lexer is exhausted (rather than view.length() times) is
    // required because one escape sequence spans two input characters.
    GenericLexer content_lexer { view };
    StringBuilder builder;
    while (!content_lexer.is_eof())
        builder.append(content_lexer.consume_escaped_character(escape_char));
    return builder.to_string();
}
143
144
// Consume a Unicode escape that starts with "\u" and return the decoded code
// point. The braced form ("\u{...}") is handled by decode_code_point, and the
// fixed-width form by decode_single_or_paired_surrogate. Returns
// MalformedUnicodeEscape when the input does not start with "\u".
auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    if (consume_specific("\\u"sv)) {
        if (next_is('{'))
            return decode_code_point();
        return decode_single_or_paired_surrogate(combine_surrogate_pairs);
    }

    return UnicodeEscapeError::MalformedUnicodeEscape;
}
153
154
// Decode the braced part of a "\u{...}" escape; the lexer must be positioned
// on the '{'. One or more hex digits are accumulated until the closing '}'.
// Returns MalformedUnicodeEscape when a non-hex-digit appears where a digit
// is required, and UnicodeEscapeOverflow when the accumulated value wraps
// around or does not satisfy is_unicode().
auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
{
    bool const has_opening_brace = consume_specific('{');
    VERIFY(has_opening_brace);

    u32 accumulated = 0;
    do {
        if (!next_is(is_ascii_hex_digit))
            return UnicodeEscapeError::MalformedUnicodeEscape;

        // Shift in the next nibble; a result smaller than before means the
        // value wrapped around
        auto const widened = (accumulated << 4u) | parse_ascii_hex_digit(consume());
        if (widened < accumulated)
            return UnicodeEscapeError::UnicodeEscapeOverflow;

        accumulated = widened;
    } while (!consume_specific('}'));

    if (!is_unicode(accumulated))
        return UnicodeEscapeError::UnicodeEscapeOverflow;
    return accumulated;
}
178
179
// Decode the four hex digits that follow an already-consumed "\u".
// If the decoded unit is a high surrogate, `combine_surrogate_pairs` is set,
// and another "\uXXXX" holding a low surrogate follows, both are combined
// into a single code point. Otherwise the first unit is returned as-is.
// Returns MalformedUnicodeEscape when fewer than four hex digits are present.
auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    constexpr size_t hex_digits_per_unit = 4;

    // Reads exactly four hex digits into a 16-bit unit; empty on a non-digit
    auto read_code_unit = [&]() -> Optional<u16> {
        u16 unit = 0;

        size_t remaining = hex_digits_per_unit;
        while (remaining-- > 0) {
            if (!next_is(is_ascii_hex_digit))
                return {};

            unit = (unit << 4u) | parse_ascii_hex_digit(consume());
        }

        return unit;
    };

    auto first_unit = read_code_unit();
    if (!first_unit.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    // Only look for a second escape when the first unit is a high surrogate
    // and pairing was requested; short-circuiting keeps "\u" unconsumed otherwise
    bool const second_escape_follows = Utf16View::is_high_surrogate(*first_unit)
        && combine_surrogate_pairs
        && consume_specific("\\u"sv);
    if (!second_escape_follows)
        return *first_unit;

    auto second_unit = read_code_unit();
    if (!second_unit.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!Utf16View::is_low_surrogate(*second_unit)) {
        // Not a pair: give back the second escape ("\u" plus four hex digits)
        retreat(6);
        return *first_unit;
    }

    return Utf16View::decode_surrogate_pair(*first_unit, *second_unit);
}
213
#endif
214
215
}