Coverage Report

Created: 2025-08-28 06:26

/src/serenity/AK/GenericLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/Assertions.h>
8
#include <AK/CharacterTypes.h>
9
#include <AK/GenericLexer.h>
10
#include <AK/ScopeGuard.h>
11
#include <AK/StringBuilder.h>
12
13
#ifndef KERNEL
14
#    include <AK/ByteString.h>
15
#    include <AK/Utf16View.h>
16
#endif
17
18
namespace AK {
19
// Consume a number of characters
20
StringView GenericLexer::consume(size_t count)
21
6.56M
{
22
6.56M
    size_t start = m_index;
23
6.56M
    size_t length = min(count, m_input.length() - m_index);
24
6.56M
    m_index += length;
25
26
6.56M
    return m_input.substring_view(start, length);
27
6.56M
}
28
29
// Consume the rest of the input
30
StringView GenericLexer::consume_all()
31
1.50k
{
32
1.50k
    auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
33
1.50k
    m_index = m_input.length();
34
1.50k
    return rest;
35
1.50k
}
36
37
// Consume until a new line is found
38
StringView GenericLexer::consume_line()
39
1.05M
{
40
1.05M
    size_t start = m_index;
41
66.8M
    while (!is_eof() && peek() != '\r' && peek() != '\n')
42
65.8M
        m_index++;
43
1.05M
    size_t length = m_index - start;
44
45
1.05M
    consume_specific('\r');
46
1.05M
    consume_specific('\n');
47
48
1.05M
    return m_input.substring_view(start, length);
49
1.05M
}
50
51
// Consume and return characters until `stop` is peek'd
52
StringView GenericLexer::consume_until(char stop)
53
0
{
54
0
    size_t start = m_index;
55
0
    while (!is_eof() && peek() != stop)
56
0
        m_index++;
57
0
    size_t length = m_index - start;
58
59
0
    return m_input.substring_view(start, length);
60
0
}
61
62
// Consume and return characters until the string `stop` is found
63
StringView GenericLexer::consume_until(char const* stop)
64
49.6k
{
65
49.6k
    size_t start = m_index;
66
8.15M
    while (!is_eof() && !next_is(stop))
67
8.11M
        m_index++;
68
49.6k
    size_t length = m_index - start;
69
70
49.6k
    return m_input.substring_view(start, length);
71
49.6k
}
72
73
// Consume and return characters until the string `stop` is found
74
StringView GenericLexer::consume_until(StringView stop)
75
0
{
76
0
    size_t start = m_index;
77
0
    while (!is_eof() && !next_is(stop))
78
0
        m_index++;
79
0
    size_t length = m_index - start;
80
81
0
    return m_input.substring_view(start, length);
82
0
}
83
84
/*
85
 * Consume a string surrounded by single or double quotes. The returned
86
 * StringView does not include the quotes. An escape character can be provided
87
 * to capture the enclosing quotes. Please note that the escape character will
88
 * still be in the resulting StringView
89
 */
90
StringView GenericLexer::consume_quoted_string(char escape_char)
91
725
{
92
725
    if (!next_is(is_quote))
93
0
        return {};
94
95
725
    char quote_char = consume();
96
725
    size_t start = m_index;
97
17.9M
    while (!is_eof()) {
98
17.9M
        if (next_is(escape_char))
99
101k
            m_index++;
100
17.8M
        else if (next_is(quote_char))
101
678
            break;
102
17.9M
        m_index++;
103
17.9M
    }
104
725
    size_t length = m_index - start;
105
106
725
    if (peek() != quote_char) {
107
        // Restore the index in case the string is unterminated
108
47
        m_index = start - 1;
109
47
        return {};
110
47
    }
111
112
    // Ignore closing quote
113
678
    ignore();
114
115
678
    return m_input.substring_view(start, length);
116
725
}
117
118
template<Integral T>
119
ErrorOr<T> GenericLexer::consume_decimal_integer()
120
0
{
121
0
    using UnsignedT = MakeUnsigned<T>;
122
123
0
    ArmedScopeGuard rollback { [&, rollback_position = m_index] {
124
0
        m_index = rollback_position;
125
0
    } };
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEhEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEaEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEtEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEsEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEjEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEiEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEmEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralElEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
126
127
0
    bool has_minus_sign = false;
128
129
0
    if (next_is('+') || next_is('-'))
130
0
        if (consume() == '-')
131
0
            has_minus_sign = true;
132
133
0
    StringView number_view = consume_while(is_ascii_digit);
134
0
    if (number_view.is_empty())
135
0
        return Error::from_errno(EINVAL);
136
137
0
    auto maybe_number = StringUtils::convert_to_uint<UnsignedT>(number_view, TrimWhitespace::No);
138
0
    if (!maybe_number.has_value())
139
0
        return Error::from_errno(ERANGE);
140
0
    auto number = maybe_number.value();
141
142
0
    if (!has_minus_sign) {
143
0
        if (NumericLimits<T>::max() < number) // This is only possible in a signed case.
144
0
            return Error::from_errno(ERANGE);
145
146
0
        rollback.disarm();
147
0
        return number;
148
0
    } else {
149
0
        if constexpr (IsUnsigned<T>) {
150
0
            if (number == 0) {
151
0
                rollback.disarm();
152
0
                return 0;
153
0
            }
154
0
            return Error::from_errno(ERANGE);
155
0
        } else {
156
0
            static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
157
0
            if (number > max_value)
158
0
                return Error::from_errno(ERANGE);
159
0
            rollback.disarm();
160
0
            return -number;
161
0
        }
162
0
    }
163
0
}
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEhEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEaEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEtEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEsEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEjEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEiEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEmEENS_7ErrorOrIT_NS_5ErrorEEEv
Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralElEENS_7ErrorOrIT_NS_5ErrorEEEv
164
165
#if !defined(KERNEL)
166
// Translate an offset into the input into a Position { offset, line, column }.
// Line starts are discovered lazily and cached in m_line_start_positions
// (line start offset -> line number counted from 0), so only the part of the
// input that has not been scanned yet is searched for newlines.
LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const
{
    // Sad case: the cache does not reach `index` yet, so scan ahead.
    while (m_largest_known_line_start_position < index) {
        auto const newline_position = m_input.find('\n', m_largest_known_line_start_position);
        auto const found_newline = newline_position.has_value();

        // With no newline left, the end of the input is recorded as a line
        // start so that later calls do not search again.
        auto const line_start = found_newline ? newline_position.value() + 1 : m_input.length();

        m_line_start_positions->insert(line_start, m_line_start_positions->size());
        m_largest_known_line_start_position = line_start;

        if (!found_newline)
            break;
    }

    // We should always have at least the first line start position.
    auto line_start_it = m_line_start_positions->find_largest_not_above_iterator(index);
    auto line = *line_start_it;
    auto column = index - line_start_it.key();

    // On the first line the column is relative to the configured start position.
    if (line == 0)
        column += m_first_line_start_position.column;
    line += m_first_line_start_position.line;

    return { index, line, column };
}
195
#endif
196
197
template ErrorOr<u8> GenericLexer::consume_decimal_integer<u8>();
198
template ErrorOr<i8> GenericLexer::consume_decimal_integer<i8>();
199
template ErrorOr<u16> GenericLexer::consume_decimal_integer<u16>();
200
template ErrorOr<i16> GenericLexer::consume_decimal_integer<i16>();
201
template ErrorOr<u32> GenericLexer::consume_decimal_integer<u32>();
202
template ErrorOr<i32> GenericLexer::consume_decimal_integer<i32>();
203
template ErrorOr<u64> GenericLexer::consume_decimal_integer<u64>();
204
template ErrorOr<i64> GenericLexer::consume_decimal_integer<i64>();
205
#ifdef AK_OS_MACOS
206
template ErrorOr<size_t> GenericLexer::consume_decimal_integer<size_t>();
207
#endif
208
209
#ifndef KERNEL
210
// Consume a quoted string and return its contents with escape sequences
// resolved by consume_escaped_character(). Returns an empty Optional (and
// consumes nothing) if no properly terminated quoted string starts at the
// current position.
Optional<ByteString> GenericLexer::consume_and_unescape_string(char escape_char)
{
    auto view = consume_quoted_string(escape_char);
    if (view.is_null())
        return {};

    // consume_quoted_string() has already moved this lexer past the closing
    // quote, so the escapes have to be resolved by lexing the returned view;
    // consuming from `this` would eat the input that follows the string.
    // Iterating until the sub-lexer is exhausted (rather than view.length()
    // times) accounts for escape sequences spanning two input characters.
    GenericLexer unescaper { view };
    StringBuilder builder;
    while (!unescaper.is_eof())
        builder.append(unescaper.consume_escaped_character(escape_char));
    return builder.to_byte_string();
}
221
222
// Consume a "\u" escape and return the code point it denotes. Two forms are
// handled: the braced form "\u{...}" and the four-hex-digit form "\uXXXX",
// the latter optionally combined with a following escape into a surrogate
// pair when `combine_surrogate_pairs` is set.
auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    auto const has_prefix = consume_specific("\\u"sv);
    if (!has_prefix)
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!next_is('{'))
        return decode_single_or_paired_surrogate(combine_surrogate_pairs);

    return decode_code_point();
}
231
232
// Decode the "{h...h}" part of a braced unicode escape; the lexer must be
// positioned on the opening brace. At least one hex digit is required.
// Returns MalformedUnicodeEscape when a non-hex character appears before the
// closing brace, and UnicodeEscapeOverflow when the value wraps around or is
// not a valid unicode code point.
auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
{
    bool had_open_brace = consume_specific('{');
    VERIFY(had_open_brace);

    u32 code_point = 0;
    do {
        if (!next_is(is_ascii_hex_digit))
            return UnicodeEscapeError::MalformedUnicodeEscape;

        // Append one more hex digit; a smaller result means the shift wrapped.
        auto const widened = (code_point << 4u) | parse_ascii_hex_digit(consume());
        if (widened < code_point)
            return UnicodeEscapeError::UnicodeEscapeOverflow;

        code_point = widened;
    } while (!consume_specific('}'));

    if (!is_unicode(code_point))
        return UnicodeEscapeError::UnicodeEscapeOverflow;
    return code_point;
}
256
257
// Decode the four hex digits following a "\u" prefix. If they form a high
// surrogate, `combine_surrogate_pairs` is set, and another "\uXXXX" escape
// holding a low surrogate follows, both are combined into one code point.
// Otherwise the first code unit is returned on its own. Fewer than four hex
// digits in either escape yields MalformedUnicodeEscape.
auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    constexpr size_t hex_digits_per_unit = 4;

    // Reads exactly four hex digits as one 16-bit code unit.
    auto read_code_unit = [&]() -> Optional<u16> {
        u16 unit = 0;
        size_t digits_read = 0;

        while (digits_read < hex_digits_per_unit) {
            if (!next_is(is_ascii_hex_digit))
                return {};

            unit = (unit << 4u) | parse_ascii_hex_digit(consume());
            ++digits_read;
        }

        return unit;
    };

    auto first_unit = read_code_unit();
    if (!first_unit.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    // Only a high surrogate can start a pair, and only if the caller asked
    // for pairing and a second "\u" escape follows.
    bool const wants_pairing = Utf16View::is_high_surrogate(*first_unit) && combine_surrogate_pairs;
    if (!wants_pairing || !consume_specific("\\u"sv))
        return *first_unit;

    auto second_unit = read_code_unit();
    if (!second_unit.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!Utf16View::is_low_surrogate(*second_unit)) {
        // Not a pair after all: give back the second "\uXXXX" (2 + 4
        // characters) and report the high surrogate by itself.
        retreat(6);
        return *first_unit;
    }

    return Utf16View::decode_surrogate_pair(*first_unit, *second_unit);
}
291
#endif
292
293
}