/src/serenity/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org> |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/Platform.h> |
8 | | #include <AK/String.h> |
9 | | #include <AK/StringBuilder.h> |
10 | | #include <AK/Types.h> |
11 | | #include <LibUnicode/CharacterTypes.h> |
12 | | #include <LibUnicode/Segmentation.h> |
13 | | #include <LibUnicode/UnicodeUtils.h> |
14 | | |
15 | | #if ENABLE_UNICODE_DATA |
16 | | # include <LibUnicode/UnicodeData.h> |
17 | | #endif |
18 | | |
19 | | // For details on the algorithms used here, see Section 3.13 Default Case Algorithms |
20 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf |
21 | | |
22 | | namespace Unicode::Detail { |
23 | | |
24 | | #if ENABLE_UNICODE_DATA |
25 | | |
26 | | static bool is_after_uppercase_i(Utf8View const& string, size_t index) |
27 | 0 | { |
28 | | // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0. |
29 | 0 | auto preceding_view = string.substring_view(0, index); |
30 | 0 | bool found_uppercase_i = false; |
31 | | |
32 | | // FIXME: Would be better if Utf8View supported reverse iteration. |
33 | 0 | for (auto code_point : preceding_view) { |
34 | 0 | if (code_point == 'I') { |
35 | 0 | found_uppercase_i = true; |
36 | 0 | continue; |
37 | 0 | } |
38 | | |
39 | 0 | auto combining_class = canonical_combining_class(code_point); |
40 | 0 | if (combining_class == 0 || combining_class == 230) |
41 | 0 | found_uppercase_i = false; |
42 | 0 | } |
43 | |
|
44 | 0 | return found_uppercase_i; |
45 | 0 | } |
46 | | |
47 | | static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index) |
48 | 0 | { |
49 | | // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above). |
50 | 0 | auto preceding_view = string.substring_view(0, index); |
51 | 0 | bool found_soft_dotted_code_point = false; |
52 | | |
53 | | // FIXME: Would be better if Utf8View supported reverse iteration. |
54 | 0 | for (auto code_point : preceding_view) { |
55 | 0 | if (code_point_has_property(code_point, Property::Soft_Dotted)) { |
56 | 0 | found_soft_dotted_code_point = true; |
57 | 0 | continue; |
58 | 0 | } |
59 | | |
60 | 0 | auto combining_class = canonical_combining_class(code_point); |
61 | 0 | if (combining_class == 0 || combining_class == 230) |
62 | 0 | found_soft_dotted_code_point = false; |
63 | 0 | } |
64 | |
|
65 | 0 | return found_soft_dotted_code_point; |
66 | 0 | } |
67 | | |
68 | | static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) |
69 | 84 | { |
70 | | // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable |
71 | | // characters, and C is not followed by a sequence consisting of zero or more case-ignorable |
72 | | // characters and then a cased letter. |
73 | 84 | auto preceding_view = string.substring_view(0, index); |
74 | 84 | auto following_view = ((index + byte_length) < string.byte_length()) |
75 | 84 | ? string.substring_view(index + byte_length) |
76 | 84 | : Utf8View {}; |
77 | | |
78 | 84 | size_t cased_letter_count = 0; |
79 | | |
80 | 84 | for (auto code_point : preceding_view) { |
81 | 84 | bool is_cased = code_point_has_property(code_point, Property::Cased); |
82 | 84 | bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); |
83 | | |
84 | 84 | if (is_cased && !is_case_ignorable) |
85 | 42 | ++cased_letter_count; |
86 | 42 | else if (!is_case_ignorable) |
87 | 42 | cased_letter_count = 0; |
88 | 84 | } |
89 | | |
90 | 84 | if (cased_letter_count == 0) |
91 | 84 | return false; |
92 | | |
93 | 0 | for (auto code_point : following_view) { |
94 | 0 | bool is_cased = code_point_has_property(code_point, Property::Cased); |
95 | 0 | bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); |
96 | |
|
97 | 0 | if (is_case_ignorable) |
98 | 0 | continue; |
99 | 0 | if (is_cased) |
100 | 0 | return false; |
101 | | |
102 | 0 | break; |
103 | 0 | } |
104 | | |
105 | 0 | return true; |
106 | 0 | } |
107 | | |
108 | | static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length) |
109 | 0 | { |
110 | | // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above). |
111 | 0 | auto following_view = ((index + byte_length) < string.byte_length()) |
112 | 0 | ? string.substring_view(index + byte_length) |
113 | 0 | : Utf8View {}; |
114 | |
|
115 | 0 | for (auto code_point : following_view) { |
116 | 0 | u32 combining_class = canonical_combining_class(code_point); |
117 | |
|
118 | 0 | if (combining_class == 0) |
119 | 0 | return false; |
120 | 0 | if (combining_class == 230) |
121 | 0 | return true; |
122 | 0 | } |
123 | | |
124 | 0 | return false; |
125 | 0 | } |
126 | | |
127 | | static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length) |
128 | 0 | { |
129 | | // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may |
130 | | // intervene between the current character and the combining dot above. |
131 | 0 | auto following_view = ((index + byte_length) < string.byte_length()) |
132 | 0 | ? string.substring_view(index + byte_length) |
133 | 0 | : Utf8View {}; |
134 | |
|
135 | 0 | for (auto code_point : following_view) { |
136 | 0 | if (code_point == 0x307) |
137 | 0 | return true; |
138 | | |
139 | 0 | u32 combining_class = canonical_combining_class(code_point); |
140 | |
|
141 | 0 | if (combining_class == 0) |
142 | 0 | return false; |
143 | 0 | if (combining_class == 230) |
144 | 0 | return false; |
145 | 0 | } |
146 | | |
147 | 0 | return false; |
148 | 0 | } |
149 | | |
150 | | static Optional<SpecialCasing const&> find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length) |
151 | 23.3k | { |
152 | 23.3k | auto requested_locale = Locale::None; |
153 | | |
154 | 23.3k | if (locale.has_value()) { |
155 | 0 | if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value()) |
156 | 0 | requested_locale = *maybe_locale; |
157 | 0 | } |
158 | | |
159 | 23.3k | auto special_casings = special_case_mapping(code_point); |
160 | | |
161 | 23.3k | for (auto const& special_casing : special_casings) { |
162 | 19.6k | if (special_casing.locale != Locale::None && special_casing.locale != requested_locale) |
163 | 19.0k | continue; |
164 | | |
165 | 536 | switch (special_casing.condition) { |
166 | 452 | case Condition::None: |
167 | 452 | return special_casing; |
168 | | |
169 | 0 | case Condition::AfterI: |
170 | 0 | if (is_after_uppercase_i(string, index)) |
171 | 0 | return special_casing; |
172 | 0 | break; |
173 | | |
174 | 0 | case Condition::AfterSoftDotted: |
175 | 0 | if (is_after_soft_dotted_code_point(string, index)) |
176 | 0 | return special_casing; |
177 | 0 | break; |
178 | | |
179 | 84 | case Condition::FinalSigma: |
180 | 84 | if (is_final_code_point(string, index, byte_length)) |
181 | 0 | return special_casing; |
182 | 84 | break; |
183 | | |
184 | 84 | case Condition::MoreAbove: |
185 | 0 | if (is_followed_by_combining_class_above(string, index, byte_length)) |
186 | 0 | return special_casing; |
187 | 0 | break; |
188 | | |
189 | 0 | case Condition::NotBeforeDot: |
190 | 0 | if (!is_followed_by_combining_dot_above(string, index, byte_length)) |
191 | 0 | return special_casing; |
192 | 0 | break; |
193 | 536 | } |
194 | 536 | } |
195 | | |
196 | 22.8k | return {}; |
197 | 23.3k | } |
198 | | |
199 | | template<CaseFoldingStatus... StatusFilter> |
200 | | static Optional<CaseFolding const&> find_matching_case_folding(u32 code_point) |
201 | 0 | { |
202 | 0 | auto case_foldings = case_folding_mapping(code_point); |
203 | |
|
204 | 0 | for (auto const& case_folding : case_foldings) { |
205 | 0 | if (((case_folding.status == StatusFilter) || ...)) |
206 | 0 | return case_folding; |
207 | 0 | } |
208 | | |
209 | 0 | return {}; |
210 | 0 | } |
211 | | |
212 | | #endif |
213 | | |
214 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 |
215 | | ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) |
216 | 0 | { |
217 | 0 | #if ENABLE_UNICODE_DATA |
218 | 0 | size_t index = 0; |
219 | 0 | size_t byte_length = 0; |
220 | |
|
221 | 0 | for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { |
222 | 0 | u32 code_point = *it; |
223 | 0 | byte_length = it.underlying_code_point_length_in_bytes(); |
224 | |
|
225 | 0 | auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); |
226 | 0 | if (!special_casing.has_value()) { |
227 | 0 | TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); |
228 | 0 | continue; |
229 | 0 | } |
230 | | |
231 | 0 | for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i) |
232 | 0 | TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i])); |
233 | 0 | } |
234 | | |
235 | 0 | return {}; |
236 | | #else |
237 | | return Error::from_string_literal("Unicode data has been disabled"); |
238 | | #endif |
239 | 0 | } |
240 | | |
241 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 |
242 | | ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale) |
243 | 2.60k | { |
244 | 2.60k | #if ENABLE_UNICODE_DATA |
245 | 2.60k | size_t index = 0; |
246 | 2.60k | size_t byte_length = 0; |
247 | | |
248 | 25.9k | for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { |
249 | 23.3k | u32 code_point = *it; |
250 | 23.3k | byte_length = it.underlying_code_point_length_in_bytes(); |
251 | | |
252 | 23.3k | auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); |
253 | 23.3k | if (!special_casing.has_value()) { |
254 | 22.8k | TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); |
255 | 0 | continue; |
256 | 22.8k | } |
257 | | |
258 | 1.35k | for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i) |
259 | 904 | TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i])); |
260 | 452 | } |
261 | | |
262 | 2.60k | return {}; |
263 | | #else |
264 | | return Error::from_string_literal("Unicode data has been disabled"); |
265 | | #endif |
266 | 2.60k | } |
267 | | |
268 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078 |
269 | | ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale, [[maybe_unused]] TrailingCodePointTransformation trailing_code_point_transformation) |
270 | 0 | { |
271 | 0 | #if ENABLE_UNICODE_DATA |
272 | | // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29, |
273 | | // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following |
274 | | // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between |
275 | | // F and the following word boundary to Lowercase_Mapping(C). |
276 | |
|
277 | 0 | auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> { |
278 | 0 | auto it = code_points.iterator_at_byte_offset_without_validation(boundary); |
279 | 0 | auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary); |
280 | |
|
281 | 0 | for (; it != end; ++it) { |
282 | 0 | if (code_point_has_property(*it, Property::Cased)) |
283 | 0 | return it; |
284 | 0 | } |
285 | | |
286 | 0 | return {}; |
287 | 0 | }; |
288 | |
|
289 | 0 | auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> { |
290 | 0 | auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length); |
291 | 0 | if (!special_casing.has_value()) { |
292 | 0 | TRY(builder.try_append_code_point(to_unicode_titlecase(code_point))); |
293 | 0 | return {}; |
294 | 0 | } |
295 | | |
296 | 0 | for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i) |
297 | 0 | TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i])); |
298 | 0 | return {}; |
299 | 0 | }; |
300 | |
|
301 | 0 | size_t boundary = 0; |
302 | |
|
303 | 0 | while (true) { |
304 | 0 | auto next_boundary = next_word_segmentation_boundary(code_points, boundary); |
305 | 0 | if (!next_boundary.has_value()) |
306 | 0 | break; |
307 | | |
308 | 0 | if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) { |
309 | 0 | auto code_point = *it.value(); |
310 | 0 | auto code_point_offset = code_points.byte_offset_of(*it); |
311 | 0 | auto code_point_length = it->underlying_code_point_length_in_bytes(); |
312 | |
|
313 | 0 | auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary); |
314 | 0 | TRY(builder.try_append(caseless_code_points.as_string())); |
315 | | |
316 | 0 | TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length)); |
317 | 0 | boundary = code_point_offset + code_point_length; |
318 | 0 | } |
319 | | |
320 | 0 | auto remaining_code_points = code_points.substring_view(boundary, *next_boundary - boundary); |
321 | 0 | switch (trailing_code_point_transformation) { |
322 | 0 | case TrailingCodePointTransformation::Lowercase: |
323 | 0 | TRY(build_lowercase_string(remaining_code_points, builder, locale)); |
324 | 0 | break; |
325 | 0 | case TrailingCodePointTransformation::PreserveExisting: |
326 | 0 | TRY(builder.try_append(remaining_code_points.as_string())); |
327 | 0 | break; |
328 | 0 | } |
329 | | |
330 | 0 | boundary = *next_boundary; |
331 | 0 | } |
332 | | |
333 | 0 | return {}; |
334 | | #else |
335 | | return Error::from_string_literal("Unicode data has been disabled"); |
336 | | #endif |
337 | 0 | } |
338 | | |
339 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 |
340 | | ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder) |
341 | 0 | { |
342 | | // toCasefold(X): Map each character C in X to Case_Folding(C). |
343 | 0 | for (auto code_point : code_points) { |
344 | 0 | auto case_folding = casefold_code_point(code_point); |
345 | 0 | TRY(builder.try_append(case_folding)); |
346 | 0 | } |
347 | | |
348 | 0 | return {}; |
349 | 0 | } |
350 | | |
351 | | // https://www.unicode.org/reports/tr44/#CaseFolding.txt |
352 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 |
353 | | Utf32View casefold_code_point(u32 const& code_point) |
354 | 0 | { |
355 | 0 | #if ENABLE_UNICODE_DATA |
356 | | // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file |
357 | | // CaseFolding.txt in the Unicode Character Database. |
358 | 0 | using enum CaseFoldingStatus; |
359 | |
|
360 | 0 | if (auto case_folding = find_matching_case_folding<Common, Full>(code_point); case_folding.has_value()) |
361 | 0 | return Utf32View { case_folding->mapping, case_folding->mapping_size }; |
362 | 0 | #endif |
363 | | |
364 | | // The case foldings are omitted in the data file if they are the same as the code point itself. |
365 | 0 | return Utf32View { &code_point, 1 }; |
366 | 0 | } |
367 | | |
368 | | } |