/src/serenity/AK/GenericLexer.cpp

Source (jump to first uncovered line)
/*
 * Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Assertions.h>
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
#include <AK/StringBuilder.h>

#ifndef KERNEL
#    include <AK/String.h>
#    include <AK/Utf16View.h>
#endif

namespace AK {
// Consume up to `count` characters and return them as a view into the input.
// The result is shorter than `count` when fewer characters remain. A `count`
// of zero returns a default-constructed view and leaves the cursor untouched.
StringView GenericLexer::consume(size_t count)
{
    if (count == 0)
        return {};

    size_t const begin = m_index;
    size_t const remaining = m_input.length() - begin;
    size_t const taken = min(count, remaining);

    m_index = begin + taken;
    return m_input.substring_view(begin, taken);
}

// Consume everything between the cursor and the end of the input.
// Returns a default-constructed view when is_eof() already holds.
StringView GenericLexer::consume_all()
{
    if (is_eof())
        return {};

    size_t const begin = m_index;
    size_t const end = m_input.length();

    m_index = end;
    return m_input.substring_view(begin, end - begin);
}

// Consume one line: every character up to (but not including) the next '\r'
// or '\n', or up to the end of the input. The terminator itself is consumed
// but is not part of the returned view. An empty line yields a
// default-constructed view.
StringView GenericLexer::consume_line()
{
    size_t const begin = m_index;
    for (; !is_eof(); ++m_index) {
        char const next = peek();
        if (next == '\r' || next == '\n')
            break;
    }
    size_t const end = m_index;

    // Swallow an optional '\r' followed by an optional '\n'.
    consume_specific('\r');
    consume_specific('\n');

    if (end > begin)
        return m_input.substring_view(begin, end - begin);
    return {};
}

// Consume characters until the next character is `stop` or the input ends.
// `stop` itself is left unconsumed. Returns a default-constructed view when
// nothing was consumed.
StringView GenericLexer::consume_until(char stop)
{
    size_t const begin = m_index;
    for (; !is_eof(); ++m_index) {
        if (peek() == stop)
            break;
    }

    size_t const consumed = m_index - begin;
    if (consumed > 0)
        return m_input.substring_view(begin, consumed);
    return {};
}

// Consume characters until the C string `stop` comes next or the input ends.
// `stop` itself is left unconsumed. Returns a default-constructed view when
// nothing was consumed.
StringView GenericLexer::consume_until(char const* stop)
{
    size_t const begin = m_index;
    while (!(is_eof() || next_is(stop)))
        ++m_index;

    size_t const consumed = m_index - begin;
    if (consumed > 0)
        return m_input.substring_view(begin, consumed);
    return {};
}

// Consume characters until the string view `stop` comes next or the input
// ends. `stop` itself is left unconsumed. Returns a default-constructed view
// when nothing was consumed.
StringView GenericLexer::consume_until(StringView stop)
{
    size_t const begin = m_index;
    while (!(is_eof() || next_is(stop)))
        ++m_index;

    size_t const consumed = m_index - begin;
    if (consumed > 0)
        return m_input.substring_view(begin, consumed);
    return {};
}

/*
 * Consume a quoted string: an opening quote accepted by is_quote, the
 * contents, and a matching closing quote. The returned StringView covers the
 * contents only, without either quote.
 *
 * A character preceded by `escape_char` never terminates the string, which
 * allows the quote character to appear inside it. Escape characters are NOT
 * removed from the returned StringView.
 *
 * If the next character is not a quote, or the string is unterminated, a
 * default-constructed StringView is returned and nothing is consumed.
 */
StringView GenericLexer::consume_quoted_string(char escape_char)
{
    if (!next_is(is_quote))
        return {};

    char const delimiter = consume();
    size_t const content_start = m_index;

    while (!is_eof()) {
        if (next_is(escape_char)) {
            // Step over the escape character and the character it escapes.
            m_index += 2;
            continue;
        }
        if (next_is(delimiter))
            break;
        ++m_index;
    }
    size_t const content_length = m_index - content_start;

    if (peek() == delimiter) {
        // Skip the closing quote.
        ignore();
        return m_input.substring_view(content_start, content_length);
    }

    // Unterminated string: rewind to the opening quote.
    m_index = content_start - 1;
    return {};
}

#ifndef KERNEL
// Consume a quoted string (see consume_quoted_string()) and return its
// contents with escape sequences introduced by `escape_char` resolved through
// consume_escaped_character(). Returns a default-constructed String when
// consume_quoted_string() yields a null view.
String GenericLexer::consume_and_unescape_string(char escape_char)
{
    auto view = consume_quoted_string(escape_char);
    if (view.is_null())
        return {};

    // consume_quoted_string() has already advanced this lexer past the closing
    // quote, so the escapes must be decoded from the captured contents with a
    // separate lexer. Decoding from `this` would consume whatever follows the
    // string. Looping until the sub-lexer is exhausted also accounts for an
    // escape sequence taking more than one input character per output one.
    GenericLexer contents(view);
    StringBuilder builder;
    while (!contents.is_eof())
        builder.append(contents.consume_escaped_character(escape_char));
    return builder.to_string();
}

// Consume a `\u` escape at the cursor and decode it. A `{` right after the
// `\u` selects the braced form (decode_code_point()); otherwise the fixed-width
// form is decoded (decode_single_or_paired_surrogate()). Returns
// MalformedUnicodeEscape when the input does not start with `\u`.
auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    if (consume_specific("\\u"sv)) {
        if (next_is('{'))
            return decode_code_point();
        return decode_single_or_paired_surrogate(combine_surrogate_pairs);
    }

    return UnicodeEscapeError::MalformedUnicodeEscape;
}

// Decode the braced part of an escape: `{`, one or more hex digits, `}`.
// The caller must have positioned the cursor on the `{` (this is VERIFY'd).
//
// Returns MalformedUnicodeEscape if a hex digit is expected but not found, and
// UnicodeEscapeOverflow if the accumulated value decreases after a shift or if
// the final value fails is_unicode().
auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
{
    bool const has_opening_brace = consume_specific('{');
    VERIFY(has_opening_brace);

    u32 value = 0;

    // At least one digit is required, so the digit check runs before the
    // closing brace is looked for.
    do {
        if (!next_is(is_ascii_hex_digit))
            return UnicodeEscapeError::MalformedUnicodeEscape;

        auto const widened = (value << 4u) | parse_ascii_hex_digit(consume());
        if (widened < value)
            return UnicodeEscapeError::UnicodeEscapeOverflow;

        value = widened;
    } while (!consume_specific('}'));

    if (!is_unicode(value))
        return UnicodeEscapeError::UnicodeEscapeOverflow;
    return value;
}

// Decode the fixed-width part of an escape: exactly four hex digits forming
// one 16-bit code unit. When that unit is a high surrogate,
// `combine_surrogate_pairs` is set, and a second `\u` escape holding a low
// surrogate follows immediately, both are consumed and the combined code point
// is returned. Otherwise the first code unit is returned on its own.
//
// Returns MalformedUnicodeEscape if either escape has fewer than four hex
// digits.
auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    constexpr size_t hex_digits_per_code_unit = 4;
    // Length of a complete second escape: "\u" plus its four hex digits.
    constexpr size_t escape_sequence_length = 2 + hex_digits_per_code_unit;

    auto read_code_unit = [&]() -> Optional<u16> {
        u16 code_unit = 0;

        size_t digits_left = hex_digits_per_code_unit;
        while (digits_left-- > 0) {
            if (!next_is(is_ascii_hex_digit))
                return {};

            code_unit = (code_unit << 4u) | parse_ascii_hex_digit(consume());
        }

        return code_unit;
    };

    auto lead = read_code_unit();
    if (!lead.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    bool const may_pair = Utf16View::is_high_surrogate(*lead) && combine_surrogate_pairs;
    if (!may_pair || !consume_specific("\\u"sv))
        return *lead;

    auto trail = read_code_unit();
    if (!trail.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!Utf16View::is_low_surrogate(*trail)) {
        // The second escape is not a low surrogate: un-consume it so the
        // caller can parse it separately.
        retreat(escape_sequence_length);
        return *lead;
    }

    return Utf16View::decode_surrogate_pair(*lead, *trail);
}
#endif

}

Coverage Report

Created: 2022-05-20 06:19

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2020, Benoit Lormeau <blormeau@outlook.com>
3		*
4		* SPDX-License-Identifier: BSD-2-Clause
5		*/
6
7		#include <AK/Assertions.h>
8		#include <AK/CharacterTypes.h>
9		#include <AK/GenericLexer.h>
10		#include <AK/StringBuilder.h>
11
12		#ifndef KERNEL
13		# include <AK/String.h>
14		# include <AK/Utf16View.h>
15		#endif
16
17		namespace AK {
18		// Consume a number of characters
19		StringView GenericLexer::consume(size_t count)
20	288	{
21	288	if (count == 0)
22	0	return {};
23
24	288	size_t start = m_index;
25	288	size_t length = min(count, m_input.length() - m_index);
26	288	m_index += length;
27
28	288	return m_input.substring_view(start, length);
29	288	}
30
31		// Consume the rest of the input
32		StringView GenericLexer::consume_all()
33	0	{
34	0	if (is_eof())
35	0	return {};
36
37	0	auto rest = m_input.substring_view(m_index, m_input.length() - m_index);
38	0	m_index = m_input.length();
39	0	return rest;
40	0	}
41
42		// Consume until a new line is found
43		StringView GenericLexer::consume_line()
44	0	{
45	0	size_t start = m_index;
46	0	while (!is_eof() && peek() != '\r' && peek() != '\n')
47	0	m_index++;
48	0	size_t length = m_index - start;
49
50	0	consume_specific('\r');
51	0	consume_specific('\n');
52
53	0	if (length == 0)
54	0	return {};
55	0	return m_input.substring_view(start, length);
56	0	}
57
58		// Consume and return characters until `stop` is peek'd
59		StringView GenericLexer::consume_until(char stop)
60	0	{
61	0	size_t start = m_index;
62	0	while (!is_eof() && peek() != stop)
63	0	m_index++;
64	0	size_t length = m_index - start;
65
66	0	if (length == 0)
67	0	return {};
68	0	return m_input.substring_view(start, length);
69	0	}
70
71		// Consume and return characters until the string `stop` is found
72		StringView GenericLexer::consume_until(char const* stop)
73	0	{
74	0	size_t start = m_index;
75	0	while (!is_eof() && !next_is(stop))
76	0	m_index++;
77	0	size_t length = m_index - start;
78
79	0	if (length == 0)
80	0	return {};
81	0	return m_input.substring_view(start, length);
82	0	}
83
84		// Consume and return characters until the string `stop` is found
85		StringView GenericLexer::consume_until(StringView stop)
86	0	{
87	0	size_t start = m_index;
88	0	while (!is_eof() && !next_is(stop))
89	0	m_index++;
90	0	size_t length = m_index - start;
91
92	0	if (length == 0)
93	0	return {};
94	0	return m_input.substring_view(start, length);
95	0	}
96
97		/*
98		* Consume a string surrounded by single or double quotes. The returned
99		* StringView does not include the quotes. An escape character can be provided
100		* to capture the enclosing quotes. Please note that the escape character will
101		* still be in the resulting StringView
102		*/
103		StringView GenericLexer::consume_quoted_string(char escape_char)
104	0	{
105	0	if (!next_is(is_quote))
106	0	return {};
107
108	0	char quote_char = consume();
109	0	size_t start = m_index;
110	0	while (!is_eof()) {
111	0	if (next_is(escape_char))
112	0	m_index++;
113	0	else if (next_is(quote_char))
114	0	break;
115	0	m_index++;
116	0	}
117	0	size_t length = m_index - start;
118
119	0	if (peek() != quote_char) {
120		// Restore the index in case the string is unterminated
121	0	m_index = start - 1;
122	0	return {};
123	0	}
124
125		// Ignore closing quote
126	0	ignore();
127
128	0	return m_input.substring_view(start, length);
129	0	}
130
131		#ifndef KERNEL
132		String GenericLexer::consume_and_unescape_string(char escape_char)
133	0	{
134	0	auto view = consume_quoted_string(escape_char);
135	0	if (view.is_null())
136	0	return {};
137
138	0	StringBuilder builder;
139	0	for (size_t i = 0; i < view.length(); ++i)
140	0	builder.append(consume_escaped_character(escape_char));
141	0	return builder.to_string();
142	0	}
143
144		auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
145	453	{
146	453	if (!consume_specific("\\u"sv))
147	8	return UnicodeEscapeError::MalformedUnicodeEscape;
148
149	445	if (next_is('{'))
150	0	return decode_code_point();
151	445	return decode_single_or_paired_surrogate(combine_surrogate_pairs);
152	445	}
153
154		auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
155	0	{
156	0	bool starts_with_open_bracket = consume_specific('{');
157	0	VERIFY(starts_with_open_bracket);
158
159	0	u32 code_point = 0;
160
161	0	while (true) {
162	0	if (!next_is(is_ascii_hex_digit))
163	0	return UnicodeEscapeError::MalformedUnicodeEscape;
164
165	0	auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume());
166	0	if (new_code_point < code_point)
167	0	return UnicodeEscapeError::UnicodeEscapeOverflow;
168
169	0	code_point = new_code_point;
170	0	if (consume_specific('}'))
171	0	break;
172	0	}
173
174	0	if (is_unicode(code_point))
175	0	return code_point;
176	0	return UnicodeEscapeError::UnicodeEscapeOverflow;
177	0	}
178
179		auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
180	445	{
181	445	constexpr size_t surrogate_length = 4;
182
183	445	auto decode_one_surrogate = [&]() -> Optional<u16> {
184	445	u16 surrogate = 0;
185
186	2.21k	for (size_t i = 0; i < surrogate_length; ++i) {
187	1.77k	if (!next_is(is_ascii_hex_digit))
188	4	return {};
189
190	1.77k	surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume());
191	1.77k	}
192
193	441	return surrogate;
194	445	};
195
196	445	auto high_surrogate = decode_one_surrogate();
197	445	if (!high_surrogate.has_value())
198	4	return UnicodeEscapeError::MalformedUnicodeEscape;
199	441	if (!Utf16View::is_high_surrogate(*high_surrogate))
200	427	return *high_surrogate;
201	14	if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
202	14	return *high_surrogate;
203
204	0	auto low_surrogate = decode_one_surrogate();
205	0	if (!low_surrogate.has_value())
206	0	return UnicodeEscapeError::MalformedUnicodeEscape;
207	0	if (Utf16View::is_low_surrogate(*low_surrogate))
208	0	return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
209
210	0	retreat(6);
211	0	return *high_surrogate;
212	0	}
213		#endif
214
215		}