/src/serenity/Userland/Libraries/LibUnicode/Normalize.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2022, mat |
3 | | * |
4 | | * SPDX-License-Identifier: BSD-2-Clause |
5 | | */ |
6 | | |
7 | | #include <AK/Find.h> |
8 | | #include <AK/QuickSort.h> |
9 | | #include <AK/Utf8View.h> |
10 | | #include <AK/Vector.h> |
11 | | #include <LibUnicode/CharacterTypes.h> |
12 | | #include <LibUnicode/Normalize.h> |
13 | | |
14 | | #if ENABLE_UNICODE_DATA |
15 | | # include <LibUnicode/UnicodeData.h> |
16 | | #else |
17 | | struct Unicode::CodePointDecomposition { }; |
18 | | #endif |
19 | | |
20 | | namespace Unicode { |
21 | | |
22 | 0 | Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; } |
23 | 0 | Optional<u32> __attribute__((weak)) code_point_composition(u32, u32) { return {}; } |
24 | | |
25 | | NormalizationForm normalization_form_from_string(StringView form) |
26 | 0 | { |
27 | 0 | if (form == "NFD"sv) |
28 | 0 | return NormalizationForm::NFD; |
29 | 0 | if (form == "NFC"sv) |
30 | 0 | return NormalizationForm::NFC; |
31 | 0 | if (form == "NFKD"sv) |
32 | 0 | return NormalizationForm::NFKD; |
33 | 0 | if (form == "NFKC"sv) |
34 | 0 | return NormalizationForm::NFKC; |
35 | 0 | VERIFY_NOT_REACHED(); |
36 | 0 | } |
37 | | |
38 | | StringView normalization_form_to_string(NormalizationForm form) |
39 | 0 | { |
40 | 0 | switch (form) { |
41 | 0 | case NormalizationForm::NFD: |
42 | 0 | return "NFD"sv; |
43 | 0 | case NormalizationForm::NFC: |
44 | 0 | return "NFC"sv; |
45 | 0 | case NormalizationForm::NFKD: |
46 | 0 | return "NFKD"sv; |
47 | 0 | case NormalizationForm::NFKC: |
48 | 0 | return "NFKC"sv; |
49 | 0 | } |
50 | 0 | VERIFY_NOT_REACHED(); |
51 | 0 | } |
52 | | |
53 | | ALWAYS_INLINE static bool is_starter(u32 code_point) |
54 | 0 | { |
55 | 0 | return Unicode::canonical_combining_class(code_point) == 0; |
56 | 0 | } |
57 | | |
58 | | // From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669 |
59 | | static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00; |
60 | | static constexpr u32 HANGUL_LEADING_BASE = 0x1100; |
61 | | static constexpr u32 HANGUL_VOWEL_BASE = 0x1161; |
62 | | static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7; |
63 | | static constexpr u32 HANGUL_LEADING_COUNT = 19; |
64 | | static constexpr u32 HANGUL_VOWEL_COUNT = 21; |
65 | | static constexpr u32 HANGUL_TRAILING_COUNT = 28; |
66 | | // NCount in the standard. |
67 | | static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT; |
68 | | static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT; |
69 | | |
70 | | ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point) |
71 | 0 | { |
72 | 0 | return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT; |
73 | 0 | } |
74 | | |
75 | | ALWAYS_INLINE static bool is_hangul_leading(u32 code_point) |
76 | 0 | { |
77 | 0 | return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT; |
78 | 0 | } |
79 | | |
80 | | ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point) |
81 | 0 | { |
82 | 0 | return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT; |
83 | 0 | } |
84 | | |
85 | | ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point) |
86 | 0 | { |
87 | 0 | return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT; |
88 | 0 | } |
89 | | |
90 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669 |
91 | | static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output) |
92 | 0 | { |
93 | 0 | auto const index = code_point - HANGUL_SYLLABLE_BASE; |
94 | |
|
95 | 0 | auto const leading_index = index / HANGUL_BLOCK_COUNT; |
96 | 0 | auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT; |
97 | 0 | auto const trailing_index = index % HANGUL_TRAILING_COUNT; |
98 | |
|
99 | 0 | auto const leading_part = HANGUL_LEADING_BASE + leading_index; |
100 | 0 | auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index; |
101 | 0 | auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index; |
102 | |
|
103 | 0 | code_points_output.append(leading_part); |
104 | 0 | code_points_output.append(vowel_part); |
105 | 0 | if (trailing_index != 0) |
106 | 0 | code_points_output.append(trailing_part); |
107 | 0 | } |
108 | | |
109 | | // L, V and LV, T Hangul Syllable Composition |
110 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688 |
111 | | static u32 combine_hangul_code_points(u32 a, u32 b) |
112 | 0 | { |
113 | 0 | if (is_hangul_leading(a) && is_hangul_vowel(b)) { |
114 | 0 | auto const leading_index = a - HANGUL_LEADING_BASE; |
115 | 0 | auto const vowel_index = b - HANGUL_VOWEL_BASE; |
116 | 0 | auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT; |
117 | 0 | return HANGUL_SYLLABLE_BASE + leading_vowel_index; |
118 | 0 | } |
119 | | // LV characters are the first in each "T block", so use this check to avoid combining LVT with T. |
120 | 0 | if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) { |
121 | 0 | return a + b - HANGUL_TRAILING_BASE; |
122 | 0 | } |
123 | 0 | return 0; |
124 | 0 | } |
125 | | |
126 | | static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b) |
127 | 0 | { |
128 | 0 | #if ENABLE_UNICODE_DATA |
129 | 0 | auto composition = code_point_composition(a, b); |
130 | 0 | if (composition.has_value()) |
131 | 0 | return composition.value(); |
132 | 0 | #endif |
133 | | |
134 | 0 | return 0; |
135 | 0 | } |
136 | | |
137 | | enum class UseCompatibility { |
138 | | Yes, |
139 | | No |
140 | | }; |
141 | | |
142 | | static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility) |
143 | 0 | { |
144 | 0 | if (is_hangul_code_point(code_point)) |
145 | 0 | return decompose_hangul_code_point(code_point, code_points_output); |
146 | | |
147 | 0 | #if ENABLE_UNICODE_DATA |
148 | 0 | auto const mapping = Unicode::code_point_decomposition(code_point); |
149 | 0 | if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) { |
150 | 0 | for (auto code_point : mapping->decomposition) { |
151 | 0 | decompose_code_point(code_point, code_points_output, use_compatibility); |
152 | 0 | } |
153 | 0 | } else { |
154 | 0 | code_points_output.append(code_point); |
155 | 0 | } |
156 | 0 | #endif |
157 | 0 | } |
158 | | |
159 | | // This can be any sorting algorithm that maintains order (like std::stable_sort), |
160 | | // however bubble sort is easier to implement, so go with it (for now). |
161 | | template<typename T, typename LessThan> |
162 | | void bubble_sort(Span<T> span, LessThan less_than) |
163 | 0 | { |
164 | 0 | for (size_t i = 0; i < span.size() - 1; ++i) { |
165 | 0 | for (size_t j = 0; j < span.size() - 1 - i; ++j) { |
166 | 0 | if (!less_than(span[j], span[j + 1])) |
167 | 0 | swap(span[j], span[j + 1]); |
168 | 0 | } |
169 | 0 | } |
170 | 0 | } |
171 | | |
172 | | // The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard. |
173 | | // See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15 |
174 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591 |
175 | | static void canonical_ordering_algorithm(Span<u32> code_points) |
176 | 367 | { |
177 | 367 | for (size_t i = 0; i < code_points.size(); ++i) { |
178 | 0 | if (!is_starter(code_points[i])) { |
179 | 0 | auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter); |
180 | 0 | auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i)); |
181 | | // Nothing to reorder, so continue. |
182 | 0 | if (span_size <= 1) |
183 | 0 | continue; |
184 | 0 | Span<u32> const span { code_points.data() + i, span_size }; |
185 | |
|
186 | 0 | bubble_sort(span, [](u32 a, u32 b) { |
187 | | // Use <= to keep ordering. |
188 | 0 | return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b); |
189 | 0 | }); |
190 | | |
191 | | // Skip over span we just sorted. |
192 | 0 | i += span_size - 1; |
193 | 0 | } |
194 | 0 | } |
195 | 367 | } |
196 | | |
197 | | // See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard. |
198 | | static bool is_blocked(Span<u32> code_points, size_t a, size_t c) |
199 | 0 | { |
200 | 0 | if (a == c - 1) |
201 | 0 | return false; |
202 | 0 | auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]); |
203 | 0 | auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]); |
204 | 0 | return b_combining_class >= c_combining_class; |
205 | 0 | } |
206 | | |
207 | | // The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard. |
208 | | // See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15 |
209 | | // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628 |
210 | | static void canonical_composition_algorithm(Vector<u32>& code_points) |
211 | 367 | { |
212 | 367 | if (code_points.size() <= 1) |
213 | 367 | return; |
214 | 0 | ssize_t last_starter = is_starter(code_points[0]) ? 0 : -1; |
215 | 0 | for (size_t i = 1; i < code_points.size(); ++i) { |
216 | 0 | auto const current_character = code_points[i]; |
217 | | // R1. Seek back (left) to find the last Starter L preceding C in the character sequence |
218 | 0 | if (last_starter == -1) { |
219 | 0 | if (is_starter(current_character)) |
220 | 0 | last_starter = i; |
221 | 0 | continue; |
222 | 0 | } |
223 | | // R2. If there is such an L, and C is not blocked from L, |
224 | | // and there exists a Primary Composite P which is canonically equivalent to <L, C>, |
225 | | // then replace L by P in the sequence and delete C from the sequence. |
226 | 0 | if (is_blocked(code_points.span(), last_starter, i)) { |
227 | 0 | if (is_starter(current_character)) |
228 | 0 | last_starter = i; |
229 | 0 | continue; |
230 | 0 | } |
231 | | |
232 | 0 | auto composite = combine_hangul_code_points(code_points[last_starter], current_character); |
233 | |
|
234 | 0 | if (composite == 0) |
235 | 0 | composite = combine_code_points(code_points[last_starter], current_character); |
236 | |
|
237 | 0 | if (composite == 0) { |
238 | 0 | if (is_starter(current_character)) |
239 | 0 | last_starter = i; |
240 | 0 | continue; |
241 | 0 | } |
242 | | |
243 | 0 | code_points[last_starter] = composite; |
244 | 0 | code_points.remove(i); |
245 | 0 | --i; |
246 | 0 | } |
247 | 0 | } |
248 | | |
249 | | static Vector<u32> normalize_nfd(Utf8View string) |
250 | 367 | { |
251 | 367 | Vector<u32> result; |
252 | 367 | for (auto const code_point : string) |
253 | 0 | decompose_code_point(code_point, result, UseCompatibility::No); |
254 | | |
255 | 367 | canonical_ordering_algorithm(result); |
256 | 367 | return result; |
257 | 367 | } |
258 | | |
259 | | static Vector<u32> normalize_nfc(Utf8View string) |
260 | 367 | { |
261 | 367 | auto result = normalize_nfd(string); |
262 | 367 | canonical_composition_algorithm(result); |
263 | | |
264 | 367 | return result; |
265 | 367 | } |
266 | | |
267 | | static Vector<u32> normalize_nfkd(Utf8View string) |
268 | 0 | { |
269 | 0 | Vector<u32> result; |
270 | 0 | for (auto const code_point : string) |
271 | 0 | decompose_code_point(code_point, result, UseCompatibility::Yes); |
272 | |
|
273 | 0 | canonical_ordering_algorithm(result); |
274 | 0 | return result; |
275 | 0 | } |
276 | | |
277 | | static Vector<u32> normalize_nfkc(Utf8View string) |
278 | 0 | { |
279 | 0 | auto result = normalize_nfkd(string); |
280 | 0 | canonical_composition_algorithm(result); |
281 | |
|
282 | 0 | return result; |
283 | 0 | } |
284 | | |
285 | | static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form) |
286 | 367 | { |
287 | 367 | switch (form) { |
288 | 0 | case NormalizationForm::NFD: |
289 | 0 | return normalize_nfd(string); |
290 | 367 | case NormalizationForm::NFC: |
291 | 367 | return normalize_nfc(string); |
292 | 0 | case NormalizationForm::NFKD: |
293 | 0 | return normalize_nfkd(string); |
294 | 0 | case NormalizationForm::NFKC: |
295 | 0 | return normalize_nfkc(string); |
296 | 367 | } |
297 | 0 | VERIFY_NOT_REACHED(); |
298 | 0 | } |
299 | | |
300 | | String normalize(StringView string, NormalizationForm form) |
301 | 367 | { |
302 | 367 | auto const code_points = normalize_implementation(Utf8View { string }, form); |
303 | | |
304 | 367 | StringBuilder builder; |
305 | 367 | for (auto code_point : code_points) |
306 | 0 | builder.append_code_point(code_point); |
307 | | |
308 | 367 | return MUST(builder.to_string()); |
309 | 367 | } |
310 | | |
311 | | } |