Coverage Report

Created: 2025-08-28 06:26

/src/serenity/AK/GenericLexer.h
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#pragma once
8
9
#include <AK/NonnullOwnPtr.h>
10
#include <AK/RedBlackTree.h>
11
#include <AK/Result.h>
12
#include <AK/String.h>
13
#include <AK/StringView.h>
14
15
namespace AK {
16
17
class GenericLexer {
18
public:
19
    // Creates a lexer over `input`. The view is stored in m_input and reading
    // starts at index 0 (the default value of m_index).
    constexpr explicit GenericLexer(StringView input)
20
328M
        : m_input(input)
21
328M
    {
22
328M
    }
23
24
1.49G
    // Returns the current read index into the input.
    constexpr size_t tell() const { return m_index; }
25
817k
    // Returns how many characters lie between the current index and the end of
    // the input (input length minus current index).
    constexpr size_t tell_remaining() const { return m_input.length() - m_index; }
26
27
193
    // Returns the view produced by m_input.substring_view(m_index), i.e. the
    // input starting at the current index (presumably through to its end).
    StringView remaining() const { return m_input.substring_view(m_index); }
28
44.7M
    // Returns the complete input view the lexer was constructed with,
    // regardless of the current index.
    StringView input() const { return m_input; }
29
30
6.31G
    // True once the current index has reached (or passed) the input length.
    constexpr bool is_eof() const { return m_index >= m_input.length(); }
31
32
    // Returns the character `offset` positions past the current index without
    // advancing. If that position is at or beyond the end of the input, '\0'
    // is returned instead, so a '\0' result does not by itself distinguish
    // "end of input" from a literal NUL character in the input.
    constexpr char peek(size_t offset = 0) const
33
13.2G
    {
34
13.2G
        return (m_index + offset < m_input.length()) ? m_input[m_index + offset] : '\0';
35
13.2G
    }
36
37
    // Returns a view of `length` characters starting `offset` positions past
    // the current index, without advancing. Returns an empty Optional when the
    // requested range would extend past the end of the input.
    Optional<StringView> peek_string(size_t length, size_t offset = 0) const
38
0
    {
39
0
        // Compare against the remaining length rather than summing
        // m_index + offset + length: that sum can wrap around size_t for huge
        // arguments and slip past the check. Relies on m_index never exceeding
        // m_input.length(), the same assumption ignore() and tell_remaining() make.
        if (offset > m_input.length() - m_index || length > m_input.length() - m_index - offset)
40
0
            return {};
41
0
        return m_input.substring_view(m_index + offset, length);
42
0
    }
43
44
    // True if the character at the current index equals `expected`.
    // Because peek() yields '\0' at end of input, next_is('\0') is also true
    // when the lexer is at EOF.
    constexpr bool next_is(char expected) const
45
6.22G
    {
46
6.22G
        return peek() == expected;
47
6.22G
    }
48
49
    // True if the upcoming characters match `expected` character by character,
    // starting at the current index. Does not advance. An empty `expected`
    // always matches. Positions past the end of the input compare as '\0'
    // (see peek()).
    constexpr bool next_is(StringView expected) const
50
2.00G
    {
51
2.29G
        for (size_t i = 0; i < expected.length(); ++i)
52
2.22G
            if (peek(i) != expected[i])
53
1.94G
                return false;
54
61.2M
        return true;
55
2.00G
    }
56
57
    // True if the upcoming characters match the NUL-terminated string
    // `expected`. Does not advance. The comparison stops at the terminating
    // '\0' of `expected`, so an empty string always matches.
    constexpr bool next_is(char const* expected) const
58
14.1M
    {
59
14.4M
        for (size_t i = 0; expected[i] != '\0'; ++i)
60
14.2M
            if (peek(i) != expected[i])
61
13.8M
                return false;
62
251k
        return true;
63
14.1M
    }
64
65
    // Moves the current index back by one character.
    // VERIFYs that the index is not already 0.
    constexpr void retreat()
66
1.18M
    {
67
1.18M
        VERIFY(m_index > 0);
68
1.18M
        --m_index;
69
1.18M
    }
70
71
    // Moves the current index back by `count` characters.
    // VERIFYs that at least `count` characters precede the current index.
    constexpr void retreat(size_t count)
72
91.9M
    {
73
91.9M
        VERIFY(m_index >= count);
74
91.9M
        m_index -= count;
75
91.9M
    }
76
77
    // Returns the character at the current index and advances past it.
    // VERIFYs that the lexer is not at EOF, so callers must check is_eof()
    // (or otherwise know a character is available) first.
    constexpr char consume()
78
1.97G
    {
79
1.97G
        VERIFY(!is_eof());
80
1.97G
        return m_input[m_index++];
81
1.97G
    }
82
83
    // If the upcoming input matches `next` (as decided by next_is(next)),
    // advances past it and returns true; otherwise returns false and leaves
    // the index untouched.
    // The distance skipped is next.length() when T provides length() (e.g.
    // StringView), and sizeof(next) otherwise (1 for the char instantiation).
    template<typename T>
84
    constexpr bool consume_specific(T const& next)
85
7.46G
    {
86
7.46G
        if (!next_is(next))
87
6.56G
            return false;
88
89
898M
        if constexpr (requires { next.length(); }) {
90
61.1M
            ignore(next.length());
91
837M
        } else {
92
837M
            ignore(sizeof(next));
93
837M
        }
94
898M
        return true;
95
7.46G
    }
bool AK::GenericLexer::consume_specific<AK::StringView>(AK::StringView const&)
Line
Count
Source
85
1.99G
    {
86
1.99G
        if (!next_is(next))
87
1.93G
            return false;
88
89
61.1M
        if constexpr (requires { next.length(); }) {
90
61.1M
            ignore(next.length());
91
        } else {
92
            ignore(sizeof(next));
93
        }
94
61.1M
        return true;
95
1.99G
    }
bool AK::GenericLexer::consume_specific<char>(char const&)
Line
Count
Source
85
5.47G
    {
86
5.47G
        if (!next_is(next))
87
4.63G
            return false;
88
89
        if constexpr (requires { next.length(); }) {
90
            ignore(next.length());
91
837M
        } else {
92
837M
            ignore(sizeof(next));
93
837M
        }
94
837M
        return true;
95
5.47G
    }
96
97
#ifndef KERNEL
98
    bool consume_specific(ByteString next) = delete;
99
100
    // String overload: forwards to the generic consume_specific() using the
    // string's bytes viewed as a StringView.
    bool consume_specific(String const& next)
101
0
    {
102
0
        return consume_specific(next.bytes_as_string_view());
103
0
    }
104
#endif
105
106
    // C-string overload: wraps the NUL-terminated `next` in a StringView
    // (length taken from __builtin_strlen) and forwards to the generic
    // consume_specific().
    constexpr bool consume_specific(char const* next)
107
0
    {
108
0
        return consume_specific(StringView { next, __builtin_strlen(next) });
109
0
    }
110
111
    // Consumes one possibly-escaped character and returns its value.
    //
    // If the next character is not `escape_char`, it is consumed and returned
    // as-is. Otherwise the escape character is skipped, the character after it
    // is consumed, and it is translated through `escape_map`.
    //
    // `escape_map` is a flat list of (key, replacement) pairs: the character at
    // an even position maps to the character right after it. A character with
    // no entry in the map is returned unchanged.
    //
    // consume() VERIFYs that input is available, so the lexer must not be at
    // EOF on entry, nor directly after the escape character.
    constexpr char consume_escaped_character(char escape_char = '\\', StringView escape_map = "n\nr\rt\tb\bf\f"sv)
112
477k
    {
113
477k
        if (!consume_specific(escape_char))
114
0
            return consume();
115
116
477k
        auto c = consume();
117
118
2.04M
        // Only visit complete pairs: with an odd-length map, a trailing key has
        // no replacement and must not trigger a read one past the end of the map.
        for (size_t i = 0; i + 1 < escape_map.length(); i += 2) {
119
2.00M
            if (c == escape_map[i])
120
431k
                return escape_map[i + 1];
121
2.00M
        }
122
123
45.6k
        return c;
124
477k
    }
125
126
    StringView consume(size_t count);
127
    StringView consume_all();
128
    StringView consume_line();
129
    StringView consume_until(char);
130
    StringView consume_until(char const*);
131
    StringView consume_until(StringView);
132
    StringView consume_quoted_string(char escape_char = 0);
133
#ifndef KERNEL
134
    Optional<ByteString> consume_and_unescape_string(char escape_char = '\\');
135
#endif
136
    template<Integral T>
137
    ErrorOr<T> consume_decimal_integer();
138
139
    // Error values carried by the Result<u32, UnicodeEscapeError> returned from
    // the code point decoding functions declared in this class.
    // NOTE(review): the enumerator meanings below are inferred from their names;
    // confirm against the out-of-line definitions.
    enum class UnicodeEscapeError {
140
        MalformedUnicodeEscape, // presumably: the escape sequence is not well-formed
141
        UnicodeEscapeOverflow, // presumably: the decoded value is out of range
142
    };
143
144
#ifndef KERNEL
145
    Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true);
146
#endif
147
148
    // Advances the current index by `count` characters, clamped so that the
    // index never moves past the end of the input. Ignoring at EOF is a no-op.
    constexpr void ignore(size_t count = 1)
149
1.05G
    {
150
1.05G
        count = min(count, m_input.length() - m_index);
151
1.05G
        m_index += count;
152
1.05G
    }
153
154
    // Advances until the next character equals `stop` or EOF is reached.
    // The `stop` character itself is not consumed.
    constexpr void ignore_until(char stop)
155
0
    {
156
0
        while (!is_eof() && peek() != stop) {
157
0
            ++m_index;
158
0
        }
159
0
    }
160
161
    // Advances one character at a time until the upcoming input matches the
    // NUL-terminated string `stop` or EOF is reached. The `stop` string itself
    // is not consumed.
    constexpr void ignore_until(char const* stop)
162
0
    {
163
0
        while (!is_eof() && !next_is(stop)) {
164
0
            ++m_index;
165
0
        }
166
0
    }
167
168
    /*
169
     * Conditions are used to match arbitrary characters. You can use lambdas,
170
     * ctype functions, or is_any_of() and its derivatives (see below).
171
     * A few examples:
172
     *   - `if (lexer.next_is(isdigit))`
173
     *   - `auto name = lexer.consume_while([](char c) { return isalnum(c) || c == '_'; });`
174
     *   - `lexer.ignore_until(is_any_of("<^>"));`
175
     */
176
177
    // Test the next character against a Condition.
    // Returns whatever `pred` returns for peek(); at EOF, `pred` is therefore
    // called with '\0'. Does not advance.
    template<typename TPredicate>
179
    constexpr bool next_is(TPredicate pred) const
180
1.70G
    {
181
1.70G
        return pred(peek());
182
1.70G
    }
bool AK::GenericLexer::next_is<AK::is_any_of(AK::StringView)::{lambda(auto:1)#1}>(AK::is_any_of(AK::StringView)::{lambda(auto:1)#1}) const
Line
Count
Source
180
1.13G
    {
181
1.13G
        return pred(peek());
182
1.13G
    }
bool AK::GenericLexer::next_is<bool (*)(unsigned int)>(bool (*)(unsigned int)) const
Line
Count
Source
180
548M
    {
181
548M
        return pred(peek());
182
548M
    }
bool AK::GenericLexer::next_is<ranges_for_search<Range{48u, 57u}, Range{97u, 102u}, Range{65u, 70u}> >(ranges_for_search<Range{48u, 57u}, Range{97u, 102u}, Range{65u, 70u}>) const
Line
Count
Source
180
2.09M
    {
181
2.09M
        return pred(peek());
182
2.09M
    }
bool AK::GenericLexer::next_is<ranges_for_search<Range{48u, 57u}> >(ranges_for_search<Range{48u, 57u}>) const
Line
Count
Source
180
1.61M
    {
181
1.61M
        return pred(peek());
182
1.61M
    }
bool AK::GenericLexer::next_is<ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}, Range{45u, 45u}, Range{46u, 46u}, Range{48u, 57u}, Range{183u, 183u}, Range{768u, 879u}, Range{8255u, 8256u}> >(ranges_for_search<Range{58u, 58u}, Range{65u, 90u}, Range{95u, 95u}, Range{97u, 122u}, Range{192u, 214u}, Range{216u, 246u}, Range{248u, 767u}, Range{880u, 893u}, Range{895u, 8191u}, Range{8204u, 8205u}, Range{8304u, 8591u}, Range{11264u, 12271u}, Range{12289u, 55295u}, Range{63744u, 64975u}, Range{65008u, 65533u}, Range{65536u, 983039u}, Range{45u, 45u}, Range{46u, 46u}, Range{48u, 57u}, Range{183u, 183u}, Range{768u, 879u}, Range{8255u, 8256u}>) const
Line
Count
Source
180
4.15M
    {
181
4.15M
        return pred(peek());
182
4.15M
    }
Parser.cpp:bool AK::GenericLexer::next_is<XML::Parser::parse_public_id_literal()::$_0>(XML::Parser::parse_public_id_literal()::$_0) const
Line
Count
Source
180
3.52M
    {
181
3.52M
        return pred(peek());
182
3.52M
    }
bool AK::GenericLexer::next_is<AK::is_not_any_of(AK::StringView)::{lambda(auto:1)#1}>(AK::is_not_any_of(AK::StringView)::{lambda(auto:1)#1}) const
Line
Count
Source
180
18.0M
    {
181
18.0M
        return pred(peek());
182
18.0M
    }
183
184
    // Consume and return characters while `pred` returns true.
    // Stops at the first character for which `pred` is false (left unconsumed)
    // or at EOF. The returned view covers exactly the consumed characters and
    // is empty if the very first character does not satisfy `pred`.
    template<typename TPredicate>
186
    StringView consume_while(TPredicate pred)
187
96.2M
    {
188
96.2M
        size_t start = m_index;
189
1.58G
        while (!is_eof() && pred(peek()))
190
1.49G
            ++m_index;
191
96.2M
        size_t length = m_index - start;
192
193
96.2M
        return m_input.substring_view(start, length);
194
96.2M
    }
AK::StringView AK::GenericLexer::consume_while<bool (*)(unsigned int)>(bool (*)(unsigned int))
Line
Count
Source
187
193
    {
188
193
        size_t start = m_index;
189
208
        while (!is_eof() && pred(peek()))
190
15
            ++m_index;
191
193
        size_t length = m_index - start;
192
193
193
        return m_input.substring_view(start, length);
194
193
    }
AK::StringView AK::GenericLexer::consume_while<AK::is_any_of(AK::StringView)::{lambda(auto:1)#1}>(AK::is_any_of(AK::StringView)::{lambda(auto:1)#1})
Line
Count
Source
187
88.3M
    {
188
88.3M
        size_t start = m_index;
189
123M
        while (!is_eof() && pred(peek()))
190
35.5M
            ++m_index;
191
88.3M
        size_t length = m_index - start;
192
193
88.3M
        return m_input.substring_view(start, length);
194
88.3M
    }
Parser.cpp:AK::StringView AK::GenericLexer::consume_while<XML::Parser::parse_comment()::$_0>(XML::Parser::parse_comment()::$_0)
Line
Count
Source
187
8.70k
    {
188
8.70k
        size_t start = m_index;
189
22.1M
        while (!is_eof() && pred(peek()))
190
22.1M
            ++m_index;
191
8.70k
        size_t length = m_index - start;
192
193
8.70k
        return m_input.substring_view(start, length);
194
8.70k
    }
Parser.cpp:AK::StringView AK::GenericLexer::consume_while<XML::Parser::parse_char_data()::$_0>(XML::Parser::parse_char_data()::$_0)
Line
Count
Source
187
7.66M
    {
188
7.66M
        size_t start = m_index;
189
1.44G
        while (!is_eof() && pred(peek()))
190
1.43G
            ++m_index;
191
7.66M
        size_t length = m_index - start;
192
193
7.66M
        return m_input.substring_view(start, length);
194
7.66M
    }
Unexecuted instantiation: AK::StringView AK::GenericLexer::consume_while<int (*)(int) noexcept>(int (*)(int) noexcept)
Unexecuted instantiation: AK::StringView AK::GenericLexer::consume_while<regex::Regex<regex::PosixBasicParser>::replace(regex::RegexStringView, AK::StringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1)#1}>(regex::Regex<regex::PosixBasicParser>::replace(regex::RegexStringView, AK::StringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1)#1})
Unexecuted instantiation: AK::StringView AK::GenericLexer::consume_while<regex::Regex<regex::PosixExtendedParser>::replace(regex::RegexStringView, AK::StringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1)#1}>(regex::Regex<regex::PosixExtendedParser>::replace(regex::RegexStringView, AK::StringView, AK::Optional<regex::RegexOptions<regex::PosixFlags> >) const::{lambda(auto:1)#1})
Unexecuted instantiation: AK::StringView AK::GenericLexer::consume_while<regex::Regex<regex::ECMA262Parser>::replace(regex::RegexStringView, AK::StringView, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const::{lambda(auto:1)#1}>(regex::Regex<regex::ECMA262Parser>::replace(regex::RegexStringView, AK::StringView, AK::Optional<regex::RegexOptions<regex::ECMAScriptFlags> >) const::{lambda(auto:1)#1})
PosixParser.cpp:AK::StringView AK::GenericLexer::consume_while<Shell::Posix::Parser::perform_expansions(AK::Vector<Shell::Posix::Token, 0ul>)::$_1::operator()(Shell::Posix::ParameterExpansion const&) const::{lambda(char)#1}>(Shell::Posix::Parser::perform_expansions(AK::Vector<Shell::Posix::Token, 0ul>)::$_1::operator()(Shell::Posix::ParameterExpansion const&) const::{lambda(char)#1})
Line
Count
Source
187
207k
    {
188
207k
        size_t start = m_index;
189
1.93M
        while (!is_eof() && pred(peek()))
190
1.73M
            ++m_index;
191
207k
        size_t length = m_index - start;
192
193
207k
        return m_input.substring_view(start, length);
194
207k
    }
Unexecuted instantiation: Parser.cpp:AK::StringView AK::GenericLexer::consume_while<Web::CSS::Parser::Parser::parse_unicode_range(AK::StringView)::$_1>(Web::CSS::Parser::Parser::parse_unicode_range(AK::StringView)::$_1)
Unexecuted instantiation: Document.cpp:AK::StringView AK::GenericLexer::consume_while<Web::DOM::Document::shared_declarative_refresh_steps(AK::StringView, JS::GCPtr<Web::HTML::HTMLMetaElement const>)::$_2>(Web::DOM::Document::shared_declarative_refresh_steps(AK::StringView, JS::GCPtr<Web::HTML::HTMLMetaElement const>)::$_2)
Unexecuted instantiation: ViewBox.cpp:AK::StringView AK::GenericLexer::consume_while<Web::SVG::try_parse_view_box(AK::StringView)::$_0>(Web::SVG::try_parse_view_box(AK::StringView)::$_0)
195
196
    // Consume and return characters until `pred` returns true.
    // The character that satisfies `pred` is left unconsumed. If no character
    // satisfies it, everything up to EOF is consumed. The returned view covers
    // exactly the consumed characters.
    template<typename TPredicate>
198
    StringView consume_until(TPredicate pred)
199
0
    {
200
0
        size_t start = m_index;
201
0
        while (!is_eof() && !pred(peek()))
202
0
            ++m_index;
203
0
        size_t length = m_index - start;
204
205
0
        return m_input.substring_view(start, length);
206
0
    }
Unexecuted instantiation: DateTime.cpp:AK::StringView AK::GenericLexer::consume_until<Core::DateTime::parse(AK::StringView, AK::StringView)::$_3>(Core::DateTime::parse(AK::StringView, AK::StringView)::$_3)
Unexecuted instantiation: AK::StringView AK::GenericLexer::consume_until<AK::is_any_of(AK::StringView)::{lambda(auto:1)#1}>(AK::is_any_of(AK::StringView)::{lambda(auto:1)#1})
Unexecuted instantiation: HTTP.cpp:AK::StringView AK::GenericLexer::consume_until<Web::Fetch::Infrastructure::collect_an_http_quoted_string(AK::GenericLexer&, Web::Fetch::Infrastructure::HttpQuotedStringExtractValue)::$_0>(Web::Fetch::Infrastructure::collect_an_http_quoted_string(AK::GenericLexer&, Web::Fetch::Infrastructure::HttpQuotedStringExtractValue)::$_0)
Unexecuted instantiation: HTMLEncodingDetection.cpp:AK::StringView AK::GenericLexer::consume_until<Web::HTML::extract_character_encoding_from_meta_element(AK::ByteString const&)::$_2>(Web::HTML::extract_character_encoding_from_meta_element(AK::ByteString const&)::$_2)
Unexecuted instantiation: TokenizedFeatures.cpp:AK::StringView AK::GenericLexer::consume_until<Web::HTML::tokenize_open_features(AK::StringView)::$_0>(Web::HTML::tokenize_open_features(AK::StringView)::$_0)
Unexecuted instantiation: MimeType.cpp:AK::StringView AK::GenericLexer::consume_until<Web::MimeSniff::MimeType::parse(AK::StringView)::$_0>(Web::MimeSniff::MimeType::parse(AK::StringView)::$_0)
Unexecuted instantiation: AK::StringView AK::GenericLexer::consume_until<bool (*)(char)>(bool (*)(char))
Unexecuted instantiation: ViewBox.cpp:AK::StringView AK::GenericLexer::consume_until<Web::SVG::try_parse_view_box(AK::StringView)::$_1>(Web::SVG::try_parse_view_box(AK::StringView)::$_1)
207
208
    // Ignore characters while `pred` returns true.
    // Advances past every leading character that satisfies `pred`, stopping at
    // the first one that does not (left unconsumed) or at EOF.
    template<typename TPredicate>
210
    constexpr void ignore_while(TPredicate pred)
211
44.9M
    {
212
45.1M
        while (!is_eof() && pred(peek()))
213
177k
            ++m_index;
214
44.9M
    }
void AK::GenericLexer::ignore_while<bool (*)(int)>(bool (*)(int))
Line
Count
Source
211
44.8M
    {
212
45.0M
        while (!is_eof() && pred(peek()))
213
176k
            ++m_index;
214
44.8M
    }
void AK::GenericLexer::ignore_while<AK::is_any_of(AK::StringView)::{lambda(auto:1)#1}>(AK::is_any_of(AK::StringView)::{lambda(auto:1)#1})
Line
Count
Source
211
133k
    {
212
134k
        while (!is_eof() && pred(peek()))
213
1.45k
            ++m_index;
214
133k
    }
Unexecuted instantiation: void AK::GenericLexer::ignore_while<bool (*)(unsigned int)>(bool (*)(unsigned int))
Unexecuted instantiation: Document.cpp:void AK::GenericLexer::ignore_while<Web::DOM::Document::shared_declarative_refresh_steps(AK::StringView, JS::GCPtr<Web::HTML::HTMLMetaElement const>)::$_0>(Web::DOM::Document::shared_declarative_refresh_steps(AK::StringView, JS::GCPtr<Web::HTML::HTMLMetaElement const>)::$_0)
Unexecuted instantiation: HTMLEncodingDetection.cpp:void AK::GenericLexer::ignore_while<Web::HTML::extract_character_encoding_from_meta_element(AK::ByteString const&)::$_0>(Web::HTML::extract_character_encoding_from_meta_element(AK::ByteString const&)::$_0)
Unexecuted instantiation: HTMLEncodingDetection.cpp:void AK::GenericLexer::ignore_while<Web::HTML::extract_character_encoding_from_meta_element(AK::ByteString const&)::$_1>(Web::HTML::extract_character_encoding_from_meta_element(AK::ByteString const&)::$_1)
Unexecuted instantiation: TokenizedFeatures.cpp:void AK::GenericLexer::ignore_while<Web::HTML::tokenize_open_features(AK::StringView)::$_0>(Web::HTML::tokenize_open_features(AK::StringView)::$_0)
Unexecuted instantiation: TokenizedFeatures.cpp:void AK::GenericLexer::ignore_while<Web::HTML::tokenize_open_features(AK::StringView)::$_1>(Web::HTML::tokenize_open_features(AK::StringView)::$_1)
Unexecuted instantiation: void AK::GenericLexer::ignore_while<bool (*)(char)>(bool (*)(char))
215
216
    // Ignore characters until `pred` returns true.
    // Advances until the next character satisfies `pred` (that character is
    // left unconsumed) or EOF is reached.
    template<typename TPredicate>
218
    constexpr void ignore_until(TPredicate pred)
219
0
    {
220
0
        while (!is_eof() && !pred(peek()))
221
0
            ++m_index;
222
0
    }
223
224
protected:
225
#ifndef KERNEL
226
    Result<u32, UnicodeEscapeError> decode_code_point();
227
    Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs = true);
228
#endif
229
230
    StringView m_input;
231
    size_t m_index { 0 };
232
};
233
234
#if !defined(KERNEL)
235
// A GenericLexer that can additionally translate input offsets into
// (offset, line, column) positions via position_for(). Line start offsets are
// recorded in a tree keyed by offset; the constructor seeds it with the first
// line(s), and position_for() is defined out of line (not visible here).
class LineTrackingLexer : public GenericLexer {
236
public:
237
    // An input location. All fields default to 0.
    struct Position {
238
        size_t offset { 0 };
239
        size_t line { 0 };
240
        size_t column { 0 };
241
    };
242
243
    // Creates a lexer over `input`; `start_position` is stored as the position
    // of the first line's start.
    LineTrackingLexer(StringView input, Position start_position)
244
257k
        : GenericLexer(input)
245
257k
        , m_first_line_start_position(start_position)
246
257k
        , m_line_start_positions(make<RedBlackTree<size_t, size_t>>())
247
257k
    {
248
257k
        // Line index 0 starts at offset 0.
        m_line_start_positions->insert(0, 0);
249
257k
        // Line index 1 starts right after the first '\n'; if the input has no
        // newline, the input length is used as that offset instead.
        auto first_newline = input.find('\n').map([](auto x) { return x + 1; }).value_or(input.length());
250
257k
        m_line_start_positions->insert(first_newline, 1);
251
257k
        m_largest_known_line_start_position = first_newline;
252
257k
    }
253
254
    // Convenience constructor: starts at offset 0, line 1, column 1.
    LineTrackingLexer(StringView input)
255
257k
        : LineTrackingLexer(input, { 0, 1, 1 })
256
257k
    {
257
257k
    }
258
259
    // Maps an input offset to a Position. Defined out of line.
    Position position_for(size_t) const;
260
45.9M
    // Position of the lexer's current index.
    Position current_position() const { return position_for(m_index); }
261
262
protected:
263
    Position m_first_line_start_position;
264
    // NOTE(review): both members below are `mutable`, presumably so the const
    // position_for() can extend the table lazily; confirm in its definition.
    mutable NonnullOwnPtr<RedBlackTree<size_t, size_t>> m_line_start_positions; // offset -> line index
265
    mutable size_t m_largest_known_line_start_position { 0 };
266
};
267
#endif
268
269
// Builds a predicate that returns true for any character contained in
// `values`. The returned lambda holds its own copy of the StringView.
constexpr auto is_any_of(StringView values)
270
1.22G
{
271
1.26G
    return [values](auto c) { return values.contains(c); };
272
1.22G
}
273
274
// Builds a predicate that returns true for any character NOT contained in
// `values` (the negation of is_any_of). The returned lambda holds its own copy
// of the StringView.
constexpr auto is_not_any_of(StringView values)
275
25.2k
{
276
18.0M
    return [values](auto c) { return !values.contains(c); };
277
25.2k
}
278
279
// Ready-made predicates built with is_any_of():
// is_path_separator matches '/' and '\\'; is_quote matches '\'' and '"'.
constexpr auto is_path_separator = is_any_of("/\\"sv);
280
constexpr auto is_quote = is_any_of("'\""sv);
281
282
}
283
284
#if USING_AK_GLOBALLY
285
using AK::GenericLexer;
286
using AK::is_any_of;
287
using AK::is_path_separator;
288
using AK::is_quote;
289
#    if !defined(KERNEL)
290
using AK::LineTrackingLexer;
291
#    endif
292
#endif