/src/serenity/AK/Utf8View.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org> |
3 | | * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #include <AK/Assertions.h> |
9 | | #include <AK/Debug.h> |
10 | | #include <AK/Format.h> |
11 | | #include <AK/Utf8View.h> |
12 | | |
13 | | namespace AK { |
14 | | |
15 | | Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const |
16 | 380k | { |
17 | 380k | size_t current_offset = 0; |
18 | 29.2M | for (auto iterator = begin(); !iterator.done(); ++iterator) { |
19 | 29.2M | if (current_offset >= byte_offset) |
20 | 380k | return iterator; |
21 | 28.8M | current_offset += iterator.underlying_code_point_length_in_bytes(); |
22 | 28.8M | } |
23 | 179 | return end(); |
24 | 380k | } |
25 | | |
26 | | Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_t byte_offset) const |
27 | 0 | { |
28 | 0 | return Utf8CodePointIterator { reinterpret_cast<u8 const*>(m_string.characters_without_null_termination()) + byte_offset, m_string.length() - byte_offset }; |
29 | 0 | } |
30 | | |
31 | | size_t Utf8View::byte_offset_of(Utf8CodePointIterator const& it) const |
32 | 375k | { |
33 | 375k | VERIFY(it.m_ptr >= begin_ptr()); |
34 | 375k | VERIFY(it.m_ptr <= end_ptr()); |
35 | | |
36 | 375k | return it.m_ptr - begin_ptr(); |
37 | 375k | } |
38 | | |
39 | | size_t Utf8View::byte_offset_of(size_t code_point_offset) const |
40 | 1.71M | { |
41 | 1.71M | size_t byte_offset = 0; |
42 | | |
43 | 3.43M | for (auto it = begin(); !it.done(); ++it) { |
44 | 3.43M | if (code_point_offset == 0) |
45 | 1.71M | return byte_offset; |
46 | | |
47 | 1.71M | byte_offset += it.underlying_code_point_length_in_bytes(); |
48 | 1.71M | --code_point_offset; |
49 | 1.71M | } |
50 | | |
51 | 6 | return byte_offset; |
52 | 1.71M | } |
53 | | |
54 | | Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const |
55 | 0 | { |
56 | 0 | if (code_point_length == 0) |
57 | 0 | return {}; |
58 | | |
59 | 0 | size_t code_point_index = 0, offset_in_bytes = 0; |
60 | 0 | for (auto iterator = begin(); !iterator.done(); ++iterator) { |
61 | 0 | if (code_point_index == code_point_offset) |
62 | 0 | offset_in_bytes = byte_offset_of(iterator); |
63 | 0 | if (code_point_index == code_point_offset + code_point_length - 1) { |
64 | 0 | size_t length_in_bytes = byte_offset_of(++iterator) - offset_in_bytes; |
65 | 0 | return substring_view(offset_in_bytes, length_in_bytes); |
66 | 0 | } |
67 | 0 | ++code_point_index; |
68 | 0 | } |
69 | | |
70 | 0 | VERIFY_NOT_REACHED(); |
71 | 0 | } |
72 | | |
73 | | size_t Utf8View::calculate_length() const |
74 | 0 | { |
75 | 0 | size_t length = 0; |
76 | |
|
77 | 0 | for (size_t i = 0; i < m_string.length(); ++length) { |
78 | 0 | auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i])); |
79 | | |
80 | | // Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte. |
81 | 0 | i += is_valid ? byte_length : 1; |
82 | 0 | } |
83 | |
|
84 | 0 | return length; |
85 | 0 | } |
86 | | |
87 | | bool Utf8View::starts_with(Utf8View const& start) const |
88 | 0 | { |
89 | 0 | if (start.is_empty()) |
90 | 0 | return true; |
91 | 0 | if (is_empty()) |
92 | 0 | return false; |
93 | 0 | if (start.length() > length()) |
94 | 0 | return false; |
95 | 0 | if (begin_ptr() == start.begin_ptr()) |
96 | 0 | return true; |
97 | | |
98 | 0 | for (auto k = begin(), l = start.begin(); l != start.end(); ++k, ++l) { |
99 | 0 | if (*k != *l) |
100 | 0 | return false; |
101 | 0 | } |
102 | 0 | return true; |
103 | 0 | } |
104 | | |
105 | | bool Utf8View::contains(u32 needle) const |
106 | 0 | { |
107 | 0 | if (needle <= 0x7f) { |
108 | | // OPTIMIZATION: Fast path for ASCII |
109 | 0 | for (u8 code_point : as_string()) { |
110 | 0 | if (code_point == needle) |
111 | 0 | return true; |
112 | 0 | } |
113 | 0 | } else { |
114 | 0 | for (u32 code_point : *this) { |
115 | 0 | if (code_point == needle) |
116 | 0 | return true; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | 0 | return false; |
121 | 0 | } |
122 | | |
123 | | Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const |
124 | 0 | { |
125 | 0 | size_t substring_start = 0; |
126 | 0 | size_t substring_length = byte_length(); |
127 | |
|
128 | 0 | if (mode == TrimMode::Left || mode == TrimMode::Both) { |
129 | 0 | for (auto code_point = begin(); code_point != end(); ++code_point) { |
130 | 0 | if (substring_length == 0) |
131 | 0 | return {}; |
132 | 0 | if (!characters.contains(*code_point)) |
133 | 0 | break; |
134 | 0 | substring_start += code_point.underlying_code_point_length_in_bytes(); |
135 | 0 | substring_length -= code_point.underlying_code_point_length_in_bytes(); |
136 | 0 | } |
137 | 0 | } |
138 | | |
139 | 0 | if (mode == TrimMode::Right || mode == TrimMode::Both) { |
140 | 0 | size_t seen_whitespace_length = 0; |
141 | 0 | for (auto code_point = begin(); code_point != end(); ++code_point) { |
142 | 0 | if (characters.contains(*code_point)) |
143 | 0 | seen_whitespace_length += code_point.underlying_code_point_length_in_bytes(); |
144 | 0 | else |
145 | 0 | seen_whitespace_length = 0; |
146 | 0 | } |
147 | 0 | if (seen_whitespace_length >= substring_length) |
148 | 0 | return {}; |
149 | 0 | substring_length -= seen_whitespace_length; |
150 | 0 | } |
151 | | |
152 | 0 | return substring_view(substring_start, substring_length); |
153 | 0 | } |
154 | | |
155 | | Utf8CodePointIterator& Utf8CodePointIterator::operator++() |
156 | 325M | { |
157 | 325M | VERIFY(m_length > 0); |
158 | | |
159 | | // OPTIMIZATION: Fast path for ASCII characters. |
160 | 325M | if (*m_ptr <= 0x7F) { |
161 | 213M | m_ptr += 1; |
162 | 213M | m_length -= 1; |
163 | 213M | return *this; |
164 | 213M | } |
165 | | |
166 | 112M | size_t code_point_length_in_bytes = underlying_code_point_length_in_bytes(); |
167 | 112M | if (code_point_length_in_bytes > m_length) { |
168 | | // We don't have enough data for the next code point. Skip one character and try again. |
169 | | // The rest of the code will output replacement characters as needed for any eventual extension bytes we might encounter afterwards. |
170 | 0 | dbgln_if(UTF8_DEBUG, "Expected code point size {} is too big for the remaining length {}. Moving forward one byte.", code_point_length_in_bytes, m_length); |
171 | 0 | m_ptr += 1; |
172 | 0 | m_length -= 1; |
173 | 0 | return *this; |
174 | 0 | } |
175 | | |
176 | 112M | m_ptr += code_point_length_in_bytes; |
177 | 112M | m_length -= code_point_length_in_bytes; |
178 | 112M | return *this; |
179 | 112M | } |
180 | | |
181 | | size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const |
182 | 206M | { |
183 | 206M | VERIFY(m_length > 0); |
184 | 206M | auto [code_point_length_in_bytes, value, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr); |
185 | | |
186 | | // If any of these tests fail, we will output a replacement character for this byte and treat it as a code point of size 1. |
187 | 206M | if (!first_byte_makes_sense) |
188 | 77.2M | return 1; |
189 | | |
190 | 129M | if (code_point_length_in_bytes > m_length) |
191 | 3.54k | return 1; |
192 | | |
193 | 170M | for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) { |
194 | 68.0M | if (m_ptr[offset] >> 6 != 2) |
195 | 27.0M | return 1; |
196 | 68.0M | } |
197 | | |
198 | 102M | return code_point_length_in_bytes; |
199 | 129M | } |
200 | | |
201 | | ReadonlyBytes Utf8CodePointIterator::underlying_code_point_bytes() const |
202 | 0 | { |
203 | 0 | return { m_ptr, underlying_code_point_length_in_bytes() }; |
204 | 0 | } |
205 | | |
206 | | u32 Utf8CodePointIterator::operator*() const |
207 | 328M | { |
208 | 328M | VERIFY(m_length > 0); |
209 | | |
210 | | // OPTIMIZATION: Fast path for ASCII characters. |
211 | 328M | if (*m_ptr <= 0x7F) |
212 | 226M | return *m_ptr; |
213 | | |
214 | 102M | auto [code_point_length_in_bytes, code_point_value_so_far, first_byte_makes_sense] = Utf8View::decode_leading_byte(*m_ptr); |
215 | | |
216 | 102M | if (!first_byte_makes_sense) { |
217 | | // The first byte of the code point doesn't make sense: output a replacement character |
218 | 53.8M | dbgln_if(UTF8_DEBUG, "First byte doesn't make sense: {:#02x}.", m_ptr[0]); |
219 | 53.8M | return 0xFFFD; |
220 | 53.8M | } |
221 | | |
222 | 48.2M | if (code_point_length_in_bytes > m_length) { |
223 | | // There is not enough data left for the full code point: output a replacement character |
224 | 2.32k | dbgln_if(UTF8_DEBUG, "Not enough bytes (need {}, have {}), first byte is: {:#02x}.", code_point_length_in_bytes, m_length, m_ptr[0]); |
225 | 2.32k | return 0xFFFD; |
226 | 2.32k | } |
227 | | |
228 | 92.2M | for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) { |
229 | 70.4M | if (m_ptr[offset] >> 6 != 2) { |
230 | | // One of the extension bytes of the code point doesn't make sense: output a replacement character |
231 | 26.4M | dbgln_if(UTF8_DEBUG, "Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense.", m_ptr[offset], offset, m_ptr[0]); |
232 | 26.4M | return 0xFFFD; |
233 | 26.4M | } |
234 | | |
235 | 43.9M | code_point_value_so_far <<= 6; |
236 | 43.9M | code_point_value_so_far |= m_ptr[offset] & 63; |
237 | 43.9M | } |
238 | | |
239 | 21.8M | if (code_point_value_so_far > 0x10FFFF) { |
240 | 3.64k | dbgln_if(UTF8_DEBUG, "Multi-byte sequence is otherwise valid, but code point {:#x} is not permissible.", code_point_value_so_far); |
241 | 3.64k | return 0xFFFD; |
242 | 3.64k | } |
243 | 21.8M | return code_point_value_so_far; |
244 | 21.8M | } |
245 | | |
246 | | Optional<u32> Utf8CodePointIterator::peek(size_t offset) const |
247 | 379k | { |
248 | 379k | if (offset == 0) { |
249 | 0 | if (this->done()) |
250 | 0 | return {}; |
251 | 0 | return this->operator*(); |
252 | 0 | } |
253 | | |
254 | 379k | auto new_iterator = *this; |
255 | 918k | for (size_t index = 0; index < offset; ++index) { |
256 | 539k | ++new_iterator; |
257 | 539k | if (new_iterator.done()) |
258 | 351 | return {}; |
259 | 539k | } |
260 | 378k | return *new_iterator; |
261 | 379k | } |
262 | | |
263 | | ErrorOr<void> Formatter<Utf8View>::format(FormatBuilder& builder, Utf8View const& string) |
264 | 0 | { |
265 | 0 | return Formatter<StringView>::format(builder, string.as_string()); |
266 | 0 | } |
267 | | |
268 | | } |