/src/serenity/AK/GenericLexer.cpp

Source (jump to first uncovered line)
/*
 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Assertions.h>
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
#include <AK/ScopeGuard.h>
#include <AK/StringBuilder.h>

#ifndef KERNEL
#    include <AK/ByteString.h>
#    include <AK/Utf16View.h>
#endif

namespace AK {
// Consume up to `count` characters (fewer if the input ends first) and return them
StringView GenericLexer::consume(size_t count)
{
    auto const begin = m_index;
    // Clamp the request to what is actually left in the input.
    auto const taken = min(count, m_input.length() - begin);
    m_index = begin + taken;

    return m_input.substring_view(begin, taken);
}

// Consume everything from the current position to the end of the input
StringView GenericLexer::consume_all()
{
    auto const end = m_input.length();
    auto const remainder = m_input.substring_view(m_index, end - m_index);
    m_index = end;
    return remainder;
}

// Consume one line: returns the characters before the line terminator and
// also swallows the terminator itself (an optional '\r' followed by an optional '\n')
StringView GenericLexer::consume_line()
{
    auto const line_start = m_index;
    for (; !is_eof(); ++m_index) {
        auto const current = peek();
        if (current == '\r' || current == '\n')
            break;
    }
    auto const line = m_input.substring_view(line_start, m_index - line_start);

    // The terminator is consumed but is not part of the returned view.
    consume_specific('\r');
    consume_specific('\n');

    return line;
}

// Consume and return every character before the next `stop` character
// (or before the end of the input); `stop` itself is left unconsumed
StringView GenericLexer::consume_until(char stop)
{
    auto const begin = m_index;
    for (; !is_eof(); ++m_index) {
        if (peek() == stop)
            break;
    }

    return m_input.substring_view(begin, m_index - begin);
}

// Consume and return every character before the next occurrence of the
// C string `stop` (or before the end of the input); `stop` is left unconsumed
StringView GenericLexer::consume_until(char const* stop)
{
    auto const begin = m_index;
    for (; !is_eof(); ++m_index) {
        if (next_is(stop))
            break;
    }

    return m_input.substring_view(begin, m_index - begin);
}

// Consume and return every character before the next occurrence of the
// string `stop` (or before the end of the input); `stop` is left unconsumed
StringView GenericLexer::consume_until(StringView stop)
{
    auto const begin = m_index;
    for (; !is_eof(); ++m_index) {
        if (next_is(stop))
            break;
    }

    return m_input.substring_view(begin, m_index - begin);
}

/*
 * Consume a string enclosed in single or double quotes and return its
 * contents without the enclosing quotes. If an escape character is provided,
 * the character following it never terminates the string; note that the
 * escape character itself stays in the returned StringView. If the input
 * does not start with a quote, or the string is unterminated, nothing is
 * consumed and an empty StringView is returned.
 */
StringView GenericLexer::consume_quoted_string(char escape_char)
{
    if (!next_is(is_quote))
        return {};

    char const opening_quote = consume();
    auto const content_start = m_index;

    while (!is_eof()) {
        if (next_is(escape_char)) {
            // Skip the escape character together with the character it protects.
            m_index += 2;
            continue;
        }
        if (next_is(opening_quote))
            break;
        ++m_index;
    }

    if (peek() != opening_quote) {
        // Unterminated string: rewind to the opening quote.
        m_index = content_start - 1;
        return {};
    }

    auto const content_length = m_index - content_start;

    // Step over the closing quote.
    ignore();

    return m_input.substring_view(content_start, content_length);
}

// Consume an optionally signed run of ASCII decimal digits and convert it to T.
// Fails with EINVAL when no digit follows the optional sign, and with ERANGE
// when the value does not fit in T (for unsigned T, "-0" is the only accepted
// negative input). On any failure the lexer position is restored.
template<Integral T>
ErrorOr<T> GenericLexer::consume_decimal_integer()
{
    using UnsignedT = MakeUnsigned<T>;

    // Rewinds the lexer unless a successful path disarms the guard.
    ArmedScopeGuard restore_index { [&, saved_index = m_index] {
        m_index = saved_index;
    } };

    bool is_negative = false;
    if (next_is('+') || next_is('-'))
        is_negative = consume() == '-';

    StringView digits = consume_while(is_ascii_digit);
    if (digits.is_empty())
        return Error::from_errno(EINVAL);

    // The magnitude is parsed as unsigned; the sign is applied afterwards.
    auto maybe_magnitude = StringUtils::convert_to_uint<UnsignedT>(digits, TrimWhitespace::No);
    if (!maybe_magnitude.has_value())
        return Error::from_errno(ERANGE);
    auto magnitude = maybe_magnitude.value();

    if (is_negative) {
        if constexpr (IsUnsigned<T>) {
            if (magnitude != 0)
                return Error::from_errno(ERANGE);
            restore_index.disarm();
            return 0;
        } else {
            // The most negative value has a magnitude one larger than max().
            static constexpr UnsignedT largest_negative_magnitude = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
            if (magnitude > largest_negative_magnitude)
                return Error::from_errno(ERANGE);
            restore_index.disarm();
            return -magnitude;
        }
    }

    if (NumericLimits<T>::max() < magnitude) // Can only trigger when T is signed.
        return Error::from_errno(ERANGE);

    restore_index.disarm();
    return magnitude;
}

#if !defined(KERNEL)
LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const
{
    // Line starts are discovered lazily: keep scanning for newlines until the
    // known line starts reach `index` or the input runs out.
    while (m_largest_known_line_start_position < index) {
        auto const newline_index = m_input.find('\n', m_largest_known_line_start_position);
        if (newline_index.has_value()) {
            auto const following_line_start = newline_index.value() + 1;
            m_line_start_positions->insert(following_line_start, m_line_start_positions->size());
            m_largest_known_line_start_position = following_line_start;
            continue;
        }

        // No newline left: record the end of the input as a final line start so we never rescan.
        m_line_start_positions->insert(m_input.length(), m_line_start_positions->size());
        m_largest_known_line_start_position = m_input.length();
        break;
    }

    // The first line start is expected to always be present, so this lookup finds an entry.
    auto containing_line_it = m_line_start_positions->find_largest_not_above_iterator(index);
    auto line = *containing_line_it;
    auto column = index - containing_line_it.key();

    // Only the first line is shifted by the column the lexer started at.
    if (line == 0)
        column += m_first_line_start_position.column;

    line += m_first_line_start_position.line;
    return { index, line, column };
}
#endif

// consume_decimal_integer() is defined in this .cpp file, so it is explicitly
// instantiated here for each supported integer type.
template ErrorOr<u8> GenericLexer::consume_decimal_integer<u8>();
template ErrorOr<i8> GenericLexer::consume_decimal_integer<i8>();
template ErrorOr<u16> GenericLexer::consume_decimal_integer<u16>();
template ErrorOr<i16> GenericLexer::consume_decimal_integer<i16>();
template ErrorOr<u32> GenericLexer::consume_decimal_integer<u32>();
template ErrorOr<i32> GenericLexer::consume_decimal_integer<i32>();
template ErrorOr<u64> GenericLexer::consume_decimal_integer<u64>();
template ErrorOr<i64> GenericLexer::consume_decimal_integer<i64>();
#ifdef AK_OS_MACOS
// NOTE(review): presumably size_t is a distinct type from the fixed-width aliases above on macOS - confirm.
template ErrorOr<size_t> GenericLexer::consume_decimal_integer<size_t>();
#endif

#ifndef KERNEL
// Consume a quoted string and return its contents with escape sequences
// resolved. Returns an empty Optional (consuming nothing) when
// consume_quoted_string() rejects the input, i.e. when it does not start with
// a quote or the string is unterminated.
Optional<ByteString> GenericLexer::consume_and_unescape_string(char escape_char)
{
    auto view = consume_quoted_string(escape_char);
    if (view.is_null())
        return {};

    // consume_quoted_string() has already moved this lexer past the closing
    // quote, so the escapes must be resolved by lexing the returned view.
    // Consuming from this lexer instead would unescape whatever follows the
    // string and could run past the end of the input.
    GenericLexer content_lexer { view };

    StringBuilder builder;
    // Loop until the view is exhausted rather than view.length() times: an
    // escape sequence spans two input characters but yields a single one.
    while (!content_lexer.is_eof())
        builder.append(content_lexer.consume_escaped_character(escape_char));
    return builder.to_byte_string();
}

// Consume a `\u` escape: the braced `\u{...}` form when a '{' follows,
// otherwise the fixed-width four-hex-digit form, optionally combining a
// surrogate pair. Fails as malformed if the input does not start with `\u`.
auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    if (consume_specific("\\u"sv)) {
        if (!next_is('{'))
            return decode_single_or_paired_surrogate(combine_surrogate_pairs);
        return decode_code_point();
    }

    return UnicodeEscapeError::MalformedUnicodeEscape;
}

// Decode the `{hex digits}` part of a braced unicode escape. The lexer must be
// positioned on the opening '{'. At least one hex digit is required.
auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
{
    bool const has_opening_bracket = consume_specific('{');
    VERIFY(has_opening_bracket);

    u32 decoded = 0;

    // Accumulate one hex digit per iteration until the closing '}' is consumed.
    do {
        if (!next_is(is_ascii_hex_digit))
            return UnicodeEscapeError::MalformedUnicodeEscape;

        auto const widened = (decoded << 4u) | parse_ascii_hex_digit(consume());
        // A value that shrank after appending a digit wrapped around.
        if (widened < decoded)
            return UnicodeEscapeError::UnicodeEscapeOverflow;

        decoded = widened;
    } while (!consume_specific('}'));

    if (!is_unicode(decoded))
        return UnicodeEscapeError::UnicodeEscapeOverflow;
    return decoded;
}

// Decode the four hex digits that follow `\u`. If they form a high surrogate,
// `combine_surrogate_pairs` is set and a second `\uXXXX` escape holding a low
// surrogate follows, both are consumed and combined into one code point.
// Otherwise the first value is returned on its own.
auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    constexpr size_t hex_digits_per_code_unit = 4;

    // Reads exactly four hex digits; yields nothing when a non-hex character interrupts.
    auto read_code_unit = [&]() -> Optional<u16> {
        u16 code_unit = 0;

        for (size_t digits_read = 0; digits_read < hex_digits_per_code_unit; ++digits_read) {
            if (!next_is(is_ascii_hex_digit))
                return {};

            code_unit = (code_unit << 4u) | parse_ascii_hex_digit(consume());
        }

        return code_unit;
    };

    auto leading_unit = read_code_unit();
    if (!leading_unit.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    // Only a high surrogate immediately followed by another `\u` escape can start a pair.
    if (!Utf16View::is_high_surrogate(*leading_unit) || !combine_surrogate_pairs || !consume_specific("\\u"sv))
        return *leading_unit;

    auto trailing_unit = read_code_unit();
    if (!trailing_unit.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!Utf16View::is_low_surrogate(*trailing_unit)) {
        // Not a pair after all: un-consume the second escape (`\u` plus its four digits).
        retreat(2 + hex_digits_per_code_unit);
        return *leading_unit;
    }

    return Utf16View::decode_surrogate_pair(*leading_unit, *trailing_unit);
}
#endif

}

Coverage Report

Created: 2025-08-28 06:26

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
3		*
4		* SPDX-License-Identifier: BSD-2-Clause
5		*/
6
7		#include <AK/Assertions.h>
8		#include <AK/CharacterTypes.h>
9		#include <AK/GenericLexer.h>
10		#include <AK/ScopeGuard.h>
11		#include <AK/StringBuilder.h>
12
13		#ifndef KERNEL
14		# include <AK/ByteString.h>
15		# include <AK/Utf16View.h>
16		#endif
17
18		namespace AK {
19		// Consume a number of characters
20		StringView GenericLexer::consume(size_t count)
21	6.56M	{
22	6.56M	size_t start = m_index;
23	6.56M	size_t length = min(count, m_input.length() - m_index);
24	6.56M	m_index += length;
25
26	6.56M	return m_input.substring_view(start, length);
27	6.56M	}
28
29		// Consume the rest of the input
30		StringView GenericLexer::consume_all()
31	1.50k	{
32	1.50k	auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
33	1.50k	m_index = m_input.length();
34	1.50k	return rest;
35	1.50k	}
36
37		// Consume until a new line is found
38		StringView GenericLexer::consume_line()
39	1.05M	{
40	1.05M	size_t start = m_index;
41	66.8M	while (!is_eof() && peek() != '\r' && peek() != '\n')
42	65.8M	m_index++;
43	1.05M	size_t length = m_index - start;
44
45	1.05M	consume_specific('\r');
46	1.05M	consume_specific('\n');
47
48	1.05M	return m_input.substring_view(start, length);
49	1.05M	}
50
51		// Consume and return characters until `stop` is peek'd
52		StringView GenericLexer::consume_until(char stop)
53	0	{
54	0	size_t start = m_index;
55	0	while (!is_eof() && peek() != stop)
56	0	m_index++;
57	0	size_t length = m_index - start;
58
59	0	return m_input.substring_view(start, length);
60	0	}
61
62		// Consume and return characters until the string `stop` is found
63		StringView GenericLexer::consume_until(char const* stop)
64	49.6k	{
65	49.6k	size_t start = m_index;
66	8.15M	while (!is_eof() && !next_is(stop))
67	8.11M	m_index++;
68	49.6k	size_t length = m_index - start;
69
70	49.6k	return m_input.substring_view(start, length);
71	49.6k	}
72
73		// Consume and return characters until the string `stop` is found
74		StringView GenericLexer::consume_until(StringView stop)
75	0	{
76	0	size_t start = m_index;
77	0	while (!is_eof() && !next_is(stop))
78	0	m_index++;
79	0	size_t length = m_index - start;
80
81	0	return m_input.substring_view(start, length);
82	0	}
83
84		/*
85		* Consume a string surrounded by single or double quotes. The returned
86		* StringView does not include the quotes. An escape character can be provided
87		* to capture the enclosing quotes. Please note that the escape character will
88		* still be in the resulting StringView
89		*/
90		StringView GenericLexer::consume_quoted_string(char escape_char)
91	725	{
92	725	if (!next_is(is_quote))
93	0	return {};
94
95	725	char quote_char = consume();
96	725	size_t start = m_index;
97	17.9M	while (!is_eof()) {
98	17.9M	if (next_is(escape_char))
99	101k	m_index++;
100	17.8M	else if (next_is(quote_char))
101	678	break;
102	17.9M	m_index++;
103	17.9M	}
104	725	size_t length = m_index - start;
105
106	725	if (peek() != quote_char) {
107		// Restore the index in case the string is unterminated
108	47	m_index = start - 1;
109	47	return {};
110	47	}
111
112		// Ignore closing quote
113	678	ignore();
114
115	678	return m_input.substring_view(start, length);
116	725	}
117
118		template<Integral T>
119		ErrorOr<T> GenericLexer::consume_decimal_integer()
120	0	{
121	0	using UnsignedT = MakeUnsigned<T>;
122
123	0	ArmedScopeGuard rollback { [&, rollback_position = m_index] {
124	0	m_index = rollback_position;
125	0	} }; Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEhEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEaEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEtEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEsEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEjEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEiEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEmEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv Unexecuted instantiation: _ZZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralElEENS_7ErrorOrIT_NS_5ErrorEEEvENKUlvE_clEv
126
127	0	bool has_minus_sign = false;
128
129	0	if (next_is('+') \|\| next_is('-'))
130	0	if (consume() == '-')
131	0	has_minus_sign = true;
132
133	0	StringView number_view = consume_while(is_ascii_digit);
134	0	if (number_view.is_empty())
135	0	return Error::from_errno(EINVAL);
136
137	0	auto maybe_number = StringUtils::convert_to_uint<UnsignedT>(number_view, TrimWhitespace::No);
138	0	if (!maybe_number.has_value())
139	0	return Error::from_errno(ERANGE);
140	0	auto number = maybe_number.value();
141
142	0	if (!has_minus_sign) {
143	0	if (NumericLimits<T>::max() < number) // This is only possible in a signed case.
144	0	return Error::from_errno(ERANGE);
145
146	0	rollback.disarm();
147	0	return number;
148	0	} else {
149	0	if constexpr (IsUnsigned<T>) {
150	0	if (number == 0) {
151	0	rollback.disarm();
152	0	return 0;
153	0	}
154	0	return Error::from_errno(ERANGE);
155	0	} else {
156	0	static constexpr UnsignedT max_value = static_cast<UnsignedT>(NumericLimits<T>::max()) + 1;
157	0	if (number > max_value)
158	0	return Error::from_errno(ERANGE);
159	0	rollback.disarm();
160	0	return -number;
161	0	}
162	0	}
163	0	} Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEhEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEaEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEtEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEsEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEjEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEiEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralEmEENS_7ErrorOrIT_NS_5ErrorEEEv Unexecuted instantiation: _ZN2AK12GenericLexer23consume_decimal_integerITkNS_8Concepts8IntegralElEENS_7ErrorOrIT_NS_5ErrorEEEv
164
165		#if !defined(KERNEL)
166		LineTrackingLexer::Position LineTrackingLexer::position_for(size_t index) const
167	60.9M	{
168		// Sad case: we have no idea where the nearest newline is, so we have to
169		// scan ahead a bit.
170	66.8M	while (index > m_largest_known_line_start_position) {
171	5.98M	auto next_newline = m_input.find('\n', m_largest_known_line_start_position);
172	5.98M	if (!next_newline.has_value()) {
173		// No more newlines, add the end of the input as a line start to avoid searching again.
174	15.0k	m_line_start_positions->insert(m_input.length(), m_line_start_positions->size());
175	15.0k	m_largest_known_line_start_position = m_input.length();
176	15.0k	break;
177	15.0k	}
178	5.96M	m_line_start_positions->insert(next_newline.value() + 1, m_line_start_positions->size());
179	5.96M	m_largest_known_line_start_position = next_newline.value() + 1;
180	5.96M	}
181		// We should always have at least the first line start position.
182	60.9M	auto previous_line_it = m_line_start_positions->find_largest_not_above_iterator(index);
183	60.9M	auto previous_line_index = previous_line_it.key();
184
185	60.9M	auto line = *previous_line_it;
186	60.9M	auto column = index - previous_line_index;
187	60.9M	if (line == 0) {
188		// First line, take into account the start position.
189	44.0M	column += m_first_line_start_position.column;
190	44.0M	}
191
192	60.9M	line += m_first_line_start_position.line;
193	60.9M	return { index, line, column };
194	60.9M	}
195		#endif
196
197		template ErrorOr<u8> GenericLexer::consume_decimal_integer<u8>();
198		template ErrorOr<i8> GenericLexer::consume_decimal_integer<i8>();
199		template ErrorOr<u16> GenericLexer::consume_decimal_integer<u16>();
200		template ErrorOr<i16> GenericLexer::consume_decimal_integer<i16>();
201		template ErrorOr<u32> GenericLexer::consume_decimal_integer<u32>();
202		template ErrorOr<i32> GenericLexer::consume_decimal_integer<i32>();
203		template ErrorOr<u64> GenericLexer::consume_decimal_integer<u64>();
204		template ErrorOr<i64> GenericLexer::consume_decimal_integer<i64>();
205		#ifdef AK_OS_MACOS
206		template ErrorOr<size_t> GenericLexer::consume_decimal_integer<size_t>();
207		#endif
208
209		#ifndef KERNEL
210		Optional<ByteString> GenericLexer::consume_and_unescape_string(char escape_char)
211	0	{
212	0	auto view = consume_quoted_string(escape_char);
213	0	if (view.is_null())
214	0	return {};
215
216	0	StringBuilder builder;
217	0	for (size_t i = 0; i < view.length(); ++i)
218	0	builder.append(consume_escaped_character(escape_char));
219	0	return builder.to_byte_string();
220	0	}
221
222		auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
223	3.25M	{
224	3.25M	if (!consume_specific("\\u"sv))
225	365	return UnicodeEscapeError::MalformedUnicodeEscape;
226
227	3.25M	if (next_is('{'))
228	22	return decode_code_point();
229	3.25M	return decode_single_or_paired_surrogate(combine_surrogate_pairs);
230	3.25M	}
231
232		auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
233	22	{
234	22	bool starts_with_open_bracket = consume_specific('{');
235	22	VERIFY(starts_with_open_bracket);
236
237	22	u32 code_point = 0;
238
239	108	while (true) {
240	108	if (!next_is(is_ascii_hex_digit))
241	0	return UnicodeEscapeError::MalformedUnicodeEscape;
242
243	108	auto new_code_point = (code_point << 4u) \| parse_ascii_hex_digit(consume());
244	108	if (new_code_point < code_point)
245	0	return UnicodeEscapeError::UnicodeEscapeOverflow;
246
247	108	code_point = new_code_point;
248	108	if (consume_specific('}'))
249	22	break;
250	108	}
251
252	22	if (is_unicode(code_point))
253	16	return code_point;
254	6	return UnicodeEscapeError::UnicodeEscapeOverflow;
255	22	}
256
257		auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
258	4.21M	{
259	4.21M	constexpr size_t surrogate_length = 4;
260
261	4.37M	auto decode_one_surrogate = [&]() -> Optional<u16> {
262	4.37M	u16 surrogate = 0;
263
264	21.1M	for (size_t i = 0; i < surrogate_length; ++i) {
265	17.1M	if (!next_is(is_ascii_hex_digit))
266	376k	return {};
267
268	16.7M	surrogate = (surrogate << 4u) \| parse_ascii_hex_digit(consume());
269	16.7M	}
270
271	3.99M	return surrogate;
272	4.37M	};
273
274	4.21M	auto high_surrogate = decode_one_surrogate();
275	4.21M	if (!high_surrogate.has_value())
276	375k	return UnicodeEscapeError::MalformedUnicodeEscape;
277	3.84M	if (!Utf16View::is_high_surrogate(*high_surrogate))
278	2.37M	return *high_surrogate;
279	1.46M	if (!combine_surrogate_pairs \|\| !consume_specific("\\u"sv))
280	1.30M	return *high_surrogate;
281
282	158k	auto low_surrogate = decode_one_surrogate();
283	158k	if (!low_surrogate.has_value())
284	70	return UnicodeEscapeError::MalformedUnicodeEscape;
285	158k	if (Utf16View::is_low_surrogate(*low_surrogate))
286	22.8k	return Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate);
287
288	135k	retreat(6);
289	135k	return *high_surrogate;
290	158k	}
291		#endif
292
293		}