/src/serenity/AK/Utf8View.cpp

Source
/*
 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Assertions.h>
#include <AK/Debug.h>
#include <AK/Format.h>
#include <AK/Utf8View.h>

namespace AK {

Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
{
    size_t current_offset = 0;
    for (auto iterator = begin(); !iterator.done(); ++iterator) {
        if (current_offset >= byte_offset)
            return iterator;
        current_offset += iterator.underlying_code_point_length_in_bytes();
    }
    return end();
}

Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_t byte_offset) const
{
    return Utf8CodePointIterator { reinterpret_cast<u8 const*>(m_string.characters_without_null_termination()) + byte_offset, m_string.length() - byte_offset };
}

size_t Utf8View::byte_offset_of(Utf8CodePointIterator const& it) const
{
    VERIFY(it.m_ptr >= begin_ptr());
    VERIFY(it.m_ptr <= end_ptr());

    return it.m_ptr - begin_ptr();
}

size_t Utf8View::byte_offset_of(size_t code_point_offset) const
{
    size_t byte_offset = 0;

    for (auto it = begin(); !it.done(); ++it) {
        if (code_point_offset == 0)
            return byte_offset;

        byte_offset += it.underlying_code_point_length_in_bytes();
        --code_point_offset;
    }

    return byte_offset;
}

Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
{
    if (code_point_length == 0)
        return {};

    size_t code_point_index = 0, offset_in_bytes = 0;
    for (auto iterator = begin(); !iterator.done(); ++iterator) {
        if (code_point_index == code_point_offset)
            offset_in_bytes = byte_offset_of(iterator);
        if (code_point_index == code_point_offset + code_point_length - 1) {
            size_t length_in_bytes = byte_offset_of(++iterator) - offset_in_bytes;
            return substring_view(offset_in_bytes, length_in_bytes);
        }
        ++code_point_index;
    }

    VERIFY_NOT_REACHED();
}

size_t Utf8View::calculate_length() const
{
    size_t length = 0;

    for (size_t i = 0; i < m_string.length(); ++length) {
        auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i]));

        // Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte.
        i += is_valid ? byte_length : 1;
    }

    return length;
}

bool Utf8View::starts_with(Utf8View const& start) const
{
    if (start.is_empty())
        return true;
    if (is_empty())
        return false;
    if (start.length() > length())
        return false;
    if (begin_ptr() == start.begin_ptr())
        return true;

    for (auto k = begin(), l = start.begin(); l != start.end(); ++k, ++l) {
        if (*k != *l)
            return false;
    }
    return true;
}

bool Utf8View::contains(u32 needle) const
{
    if (needle <= 0x7f) {
        // OPTIMIZATION: Fast path for ASCII
        for (u8 code_point : as_string()) {
            if (code_point == needle)
                return true;
        }
    } else {
        for (u32 code_point : *this) {
            if (code_point == needle)
                return true;
        }
    }

    return false;
}

Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
{
    size_t substring_start = 0;
    size_t substring_length = byte_length();

    if (mode == TrimMode::Left || mode == TrimMode::Both) {
        for (auto code_point = begin(); code_point != end(); ++code_point) {
            if (substring_length == 0)
                return {};
            if (!characters.contains(*code_point))
                break;
            substring_start += code_point.underlying_code_point_length_in_bytes();
            substring_length -= code_point.underlying_code_point_length_in_bytes();
        }
    }

    if (mode == TrimMode::Right || mode == TrimMode::Both) {
        size_t seen_whitespace_length = 0;
        for (auto code_point = begin(); code_point != end(); ++code_point) {
            if (characters.contains(*code_point))
                seen_whitespace_length += code_point.underlying_code_point_length_in_bytes();
            else
                seen_whitespace_length = 0;
        }
        if (seen_whitespace_length >= substring_length)
            return {};
        substring_length -= seen_whitespace_length;
    }

    return substring_view(substring_start, substring_length);
}

Utf8CodePointIterator& Utf8CodePointIterator::operator++()
{
    VERIFY(m_length > 0);

    // OPTIMIZATION: Fast path for ASCII characters.
    if (*m_ptr <= 0x7F) {
        m_ptr += 1;
        m_length -= 1;
        return *this;
    }

    size_t code_point_length_in_bytes = underlying_code_point_length_in_bytes();
    if (code_point_length_in_bytes > m_length) {
        // We don't have enough data for the next code point. Skip one character and try again.
        // The rest of the code will output replacement characters as needed for any eventual extension bytes we might encounter afterwards.
        dbgln_if(UTF8_DEBUG, "Expected code point size {} is too big for the remaining length {}. Moving forward one byte.", code_point_length_in_bytes, m_length);
        m_ptr += 1;
        m_length -= 1;
        return *this;
    }

    m_ptr += code_point_length_in_bytes;
    m_length -= code_point_length_in_bytes;
    return *this;
}

size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const
{
    VERIFY(m_length > 0);
    auto [code_point_length_in_bytes, value, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr);

    // If any of these tests fail, we will output a replacement character for this byte and treat it as a code point of size 1.
    if (!first_byte_makes_sense)
        return 1;

    if (code_point_length_in_bytes > m_length)
        return 1;

    for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) {
        if (m_ptr[offset] >> 6 != 2)
            return 1;
    }

    return code_point_length_in_bytes;
}

ReadonlyBytes Utf8CodePointIterator::underlying_code_point_bytes() const
{
    return { m_ptr, underlying_code_point_length_in_bytes() };
}

u32 Utf8CodePointIterator::operator*() const
{
    VERIFY(m_length > 0);

    // OPTIMIZATION: Fast path for ASCII characters.
    if (*m_ptr <= 0x7F)
        return *m_ptr;

    auto [code_point_length_in_bytes, code_point_value_so_far, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr);

    if (!first_byte_makes_sense) {
        // The first byte of the code point doesn't make sense: output a replacement character
        dbgln_if(UTF8_DEBUG, "First byte doesn't make sense: {:#02x}.", m_ptr[0]);
        return 0xFFFD;
    }

    if (code_point_length_in_bytes > m_length) {
        // There is not enough data left for the full code point: output a replacement character
        dbgln_if(UTF8_DEBUG, "Not enough bytes (need {}, have {}), first byte is: {:#02x}.", code_point_length_in_bytes, m_length, m_ptr[0]);
        return 0xFFFD;
    }

    for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) {
        if (m_ptr[offset] >> 6 != 2) {
            // One of the extension bytes of the code point doesn't make sense: output a replacement character
            dbgln_if(UTF8_DEBUG, "Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense.", m_ptr[offset], offset, m_ptr[0]);
            return 0xFFFD;
        }

        code_point_value_so_far <<= 6;
        code_point_value_so_far |= m_ptr[offset] & 63;
    }

    if (code_point_value_so_far > 0x10FFFF) {
        dbgln_if(UTF8_DEBUG, "Multi-byte sequence is otherwise valid, but code point {:#x} is not permissible.", code_point_value_so_far);
        return 0xFFFD;
    }
    return code_point_value_so_far;
}

Optional<u32> Utf8CodePointIterator::peek(size_t offset) const
{
    if (offset == 0) {
        if (this->done())
            return {};
        return this->operator*();
    }

    auto new_iterator = *this;
    for (size_t index = 0; index < offset; ++index) {
        ++new_iterator;
        if (new_iterator.done())
            return {};
    }
    return *new_iterator;
}

ErrorOr<void> Formatter<Utf8View>::format(FormatBuilder& builder, Utf8View const& string)
{
    return Formatter<StringView>::format(builder, string.as_string());
}

}

Coverage Report

Created: 2026-02-14 08:01

Line	Count	Source
1		/*
2		* Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
3		* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
4		*
5		* SPDX-License-Identifier: BSD-2-Clause
6		*/
7
8		#include <AK/Assertions.h>
9		#include <AK/Debug.h>
10		#include <AK/Format.h>
11		#include <AK/Utf8View.h>
12
13		namespace AK {
14
15		Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
16	380k	{
17	380k	size_t current_offset = 0;
18	29.2M	for (auto iterator = begin(); !iterator.done(); ++iterator) {
19	29.2M	if (current_offset >= byte_offset)
20	380k	return iterator;
21	28.8M	current_offset += iterator.underlying_code_point_length_in_bytes();
22	28.8M	}
23	179	return end();
24	380k	}
25
26		Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_t byte_offset) const
27	0	{
28	0	return Utf8CodePointIterator { reinterpret_cast<u8 const*>(m_string.characters_without_null_termination()) + byte_offset, m_string.length() - byte_offset };
29	0	}
30
31		size_t Utf8View::byte_offset_of(Utf8CodePointIterator const& it) const
32	375k	{
33	375k	VERIFY(it.m_ptr >= begin_ptr());
34	375k	VERIFY(it.m_ptr <= end_ptr());
35
36	375k	return it.m_ptr - begin_ptr();
37	375k	}
38
39		size_t Utf8View::byte_offset_of(size_t code_point_offset) const
40	1.71M	{
41	1.71M	size_t byte_offset = 0;
42
43	3.43M	for (auto it = begin(); !it.done(); ++it) {
44	3.43M	if (code_point_offset == 0)
45	1.71M	return byte_offset;
46
47	1.71M	byte_offset += it.underlying_code_point_length_in_bytes();
48	1.71M	--code_point_offset;
49	1.71M	}
50
51	6	return byte_offset;
52	1.71M	}
53
54		Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
55	0	{
56	0	if (code_point_length == 0)
57	0	return {};
58
59	0	size_t code_point_index = 0, offset_in_bytes = 0;
60	0	for (auto iterator = begin(); !iterator.done(); ++iterator) {
61	0	if (code_point_index == code_point_offset)
62	0	offset_in_bytes = byte_offset_of(iterator);
63	0	if (code_point_index == code_point_offset + code_point_length - 1) {
64	0	size_t length_in_bytes = byte_offset_of(++iterator) - offset_in_bytes;
65	0	return substring_view(offset_in_bytes, length_in_bytes);
66	0	}
67	0	++code_point_index;
68	0	}
69
70	0	VERIFY_NOT_REACHED();
71	0	}
72
73		size_t Utf8View::calculate_length() const
74	0	{
75	0	size_t length = 0;
76
77	0	for (size_t i = 0; i < m_string.length(); ++length) {
78	0	auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i]));
79
80		// Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte.
81	0	i += is_valid ? byte_length : 1;
82	0	}
83
84	0	return length;
85	0	}
86
87		bool Utf8View::starts_with(Utf8View const& start) const
88	0	{
89	0	if (start.is_empty())
90	0	return true;
91	0	if (is_empty())
92	0	return false;
93	0	if (start.length() > length())
94	0	return false;
95	0	if (begin_ptr() == start.begin_ptr())
96	0	return true;
97
98	0	for (auto k = begin(), l = start.begin(); l != start.end(); ++k, ++l) {
99	0	if (k != l)
100	0	return false;
101	0	}
102	0	return true;
103	0	}
104
105		bool Utf8View::contains(u32 needle) const
106	0	{
107	0	if (needle <= 0x7f) {
108		// OPTIMIZATION: Fast path for ASCII
109	0	for (u8 code_point : as_string()) {
110	0	if (code_point == needle)
111	0	return true;
112	0	}
113	0	} else {
114	0	for (u32 code_point : *this) {
115	0	if (code_point == needle)
116	0	return true;
117	0	}
118	0	}
119
120	0	return false;
121	0	}
122
123		Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
124	0	{
125	0	size_t substring_start = 0;
126	0	size_t substring_length = byte_length();
127
128	0	if (mode == TrimMode::Left \|\| mode == TrimMode::Both) {
129	0	for (auto code_point = begin(); code_point != end(); ++code_point) {
130	0	if (substring_length == 0)
131	0	return {};
132	0	if (!characters.contains(*code_point))
133	0	break;
134	0	substring_start += code_point.underlying_code_point_length_in_bytes();
135	0	substring_length -= code_point.underlying_code_point_length_in_bytes();
136	0	}
137	0	}
138
139	0	if (mode == TrimMode::Right \|\| mode == TrimMode::Both) {
140	0	size_t seen_whitespace_length = 0;
141	0	for (auto code_point = begin(); code_point != end(); ++code_point) {
142	0	if (characters.contains(*code_point))
143	0	seen_whitespace_length += code_point.underlying_code_point_length_in_bytes();
144	0	else
145	0	seen_whitespace_length = 0;
146	0	}
147	0	if (seen_whitespace_length >= substring_length)
148	0	return {};
149	0	substring_length -= seen_whitespace_length;
150	0	}
151
152	0	return substring_view(substring_start, substring_length);
153	0	}
154
155		Utf8CodePointIterator& Utf8CodePointIterator::operator++()
156	325M	{
157	325M	VERIFY(m_length > 0);
158
159		// OPTIMIZATION: Fast path for ASCII characters.
160	325M	if (*m_ptr <= 0x7F) {
161	213M	m_ptr += 1;
162	213M	m_length -= 1;
163	213M	return *this;
164	213M	}
165
166	112M	size_t code_point_length_in_bytes = underlying_code_point_length_in_bytes();
167	112M	if (code_point_length_in_bytes > m_length) {
168		// We don't have enough data for the next code point. Skip one character and try again.
169		// The rest of the code will output replacement characters as needed for any eventual extension bytes we might encounter afterwards.
170	0	dbgln_if(UTF8_DEBUG, "Expected code point size {} is too big for the remaining length {}. Moving forward one byte.", code_point_length_in_bytes, m_length);
171	0	m_ptr += 1;
172	0	m_length -= 1;
173	0	return *this;
174	0	}
175
176	112M	m_ptr += code_point_length_in_bytes;
177	112M	m_length -= code_point_length_in_bytes;
178	112M	return *this;
179	112M	}
180
181		size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const
182	206M	{
183	206M	VERIFY(m_length > 0);
184	206M	auto [code_point_length_in_bytes, value, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr);
185
186		// If any of these tests fail, we will output a replacement character for this byte and treat it as a code point of size 1.
187	206M	if (!first_byte_makes_sense)
188	77.2M	return 1;
189
190	129M	if (code_point_length_in_bytes > m_length)
191	3.54k	return 1;
192
193	170M	for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) {
194	68.0M	if (m_ptr[offset] >> 6 != 2)
195	27.0M	return 1;
196	68.0M	}
197
198	102M	return code_point_length_in_bytes;
199	129M	}
200
201		ReadonlyBytes Utf8CodePointIterator::underlying_code_point_bytes() const
202	0	{
203	0	return { m_ptr, underlying_code_point_length_in_bytes() };
204	0	}
205
206		u32 Utf8CodePointIterator::operator*() const
207	328M	{
208	328M	VERIFY(m_length > 0);
209
210		// OPTIMIZATION: Fast path for ASCII characters.
211	328M	if (*m_ptr <= 0x7F)
212	226M	return *m_ptr;
213
214	102M	auto [code_point_length_in_bytes, code_point_value_so_far, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr);
215
216	102M	if (!first_byte_makes_sense) {
217		// The first byte of the code point doesn't make sense: output a replacement character
218	53.8M	dbgln_if(UTF8_DEBUG, "First byte doesn't make sense: {:#02x}.", m_ptr[0]);
219	53.8M	return 0xFFFD;
220	53.8M	}
221
222	48.2M	if (code_point_length_in_bytes > m_length) {
223		// There is not enough data left for the full code point: output a replacement character
224	2.32k	dbgln_if(UTF8_DEBUG, "Not enough bytes (need {}, have {}), first byte is: {:#02x}.", code_point_length_in_bytes, m_length, m_ptr[0]);
225	2.32k	return 0xFFFD;
226	2.32k	}
227
228	92.2M	for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) {
229	70.4M	if (m_ptr[offset] >> 6 != 2) {
230		// One of the extension bytes of the code point doesn't make sense: output a replacement character
231	26.4M	dbgln_if(UTF8_DEBUG, "Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense.", m_ptr[offset], offset, m_ptr[0]);
232	26.4M	return 0xFFFD;
233	26.4M	}
234
235	43.9M	code_point_value_so_far <<= 6;
236	43.9M	code_point_value_so_far \|= m_ptr[offset] & 63;
237	43.9M	}
238
239	21.8M	if (code_point_value_so_far > 0x10FFFF) {
240	3.64k	dbgln_if(UTF8_DEBUG, "Multi-byte sequence is otherwise valid, but code point {:#x} is not permissible.", code_point_value_so_far);
241	3.64k	return 0xFFFD;
242	3.64k	}
243	21.8M	return code_point_value_so_far;
244	21.8M	}
245
246		Optional<u32> Utf8CodePointIterator::peek(size_t offset) const
247	379k	{
248	379k	if (offset == 0) {
249	0	if (this->done())
250	0	return {};
251	0	return this->operator*();
252	0	}
253
254	379k	auto new_iterator = *this;
255	918k	for (size_t index = 0; index < offset; ++index) {
256	539k	++new_iterator;
257	539k	if (new_iterator.done())
258	351	return {};
259	539k	}
260	378k	return *new_iterator;
261	379k	}
262
263		ErrorOr<void> Formatter<Utf8View>::format(FormatBuilder& builder, Utf8View const& string)
264	0	{
265	0	return Formatter<StringView>::format(builder, string.as_string());
266	0	}
267
268		}