Coverage Report

Created: 2025-09-05 06:52

/src/serenity/Userland/Libraries/LibUnicode/Normalize.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2022, mat
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/Find.h>
8
#include <AK/QuickSort.h>
9
#include <AK/Utf8View.h>
10
#include <AK/Vector.h>
11
#include <LibUnicode/CharacterTypes.h>
12
#include <LibUnicode/Normalize.h>
13
14
#if ENABLE_UNICODE_DATA
15
#    include <LibUnicode/UnicodeData.h>
16
#else
17
struct Unicode::CodePointDecomposition { };
18
#endif
19
20
namespace Unicode {
21
22
0
Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
23
0
Optional<u32> __attribute__((weak)) code_point_composition(u32, u32) { return {}; }
24
25
NormalizationForm normalization_form_from_string(StringView form)
26
0
{
27
0
    if (form == "NFD"sv)
28
0
        return NormalizationForm::NFD;
29
0
    if (form == "NFC"sv)
30
0
        return NormalizationForm::NFC;
31
0
    if (form == "NFKD"sv)
32
0
        return NormalizationForm::NFKD;
33
0
    if (form == "NFKC"sv)
34
0
        return NormalizationForm::NFKC;
35
0
    VERIFY_NOT_REACHED();
36
0
}
37
38
StringView normalization_form_to_string(NormalizationForm form)
39
0
{
40
0
    switch (form) {
41
0
    case NormalizationForm::NFD:
42
0
        return "NFD"sv;
43
0
    case NormalizationForm::NFC:
44
0
        return "NFC"sv;
45
0
    case NormalizationForm::NFKD:
46
0
        return "NFKD"sv;
47
0
    case NormalizationForm::NFKC:
48
0
        return "NFKC"sv;
49
0
    }
50
0
    VERIFY_NOT_REACHED();
51
0
}
52
53
ALWAYS_INLINE static bool is_starter(u32 code_point)
54
0
{
55
0
    return Unicode::canonical_combining_class(code_point) == 0;
56
0
}
57
58
// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
59
static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
60
static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
61
static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
62
static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
63
static constexpr u32 HANGUL_LEADING_COUNT = 19;
64
static constexpr u32 HANGUL_VOWEL_COUNT = 21;
65
static constexpr u32 HANGUL_TRAILING_COUNT = 28;
66
// NCount in the standard.
67
static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
68
static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
69
70
ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
71
0
{
72
0
    return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
73
0
}
74
75
ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
76
0
{
77
0
    return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
78
0
}
79
80
ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
81
0
{
82
0
    return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
83
0
}
84
85
ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
86
0
{
87
0
    return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
88
0
}
89
90
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
91
static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
92
0
{
93
0
    auto const index = code_point - HANGUL_SYLLABLE_BASE;
94
95
0
    auto const leading_index = index / HANGUL_BLOCK_COUNT;
96
0
    auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
97
0
    auto const trailing_index = index % HANGUL_TRAILING_COUNT;
98
99
0
    auto const leading_part = HANGUL_LEADING_BASE + leading_index;
100
0
    auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
101
0
    auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
102
103
0
    code_points_output.append(leading_part);
104
0
    code_points_output.append(vowel_part);
105
0
    if (trailing_index != 0)
106
0
        code_points_output.append(trailing_part);
107
0
}
108
109
// L, V and LV, T Hangul Syllable Composition
110
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
111
static u32 combine_hangul_code_points(u32 a, u32 b)
112
0
{
113
0
    if (is_hangul_leading(a) && is_hangul_vowel(b)) {
114
0
        auto const leading_index = a - HANGUL_LEADING_BASE;
115
0
        auto const vowel_index = b - HANGUL_VOWEL_BASE;
116
0
        auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
117
0
        return HANGUL_SYLLABLE_BASE + leading_vowel_index;
118
0
    }
119
    // LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
120
0
    if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
121
0
        return a + b - HANGUL_TRAILING_BASE;
122
0
    }
123
0
    return 0;
124
0
}
125
126
static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
127
0
{
128
0
#if ENABLE_UNICODE_DATA
129
0
    auto composition = code_point_composition(a, b);
130
0
    if (composition.has_value())
131
0
        return composition.value();
132
0
#endif
133
134
0
    return 0;
135
0
}
136
137
enum class UseCompatibility {
138
    Yes,
139
    No
140
};
141
142
static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
143
0
{
144
0
    if (is_hangul_code_point(code_point))
145
0
        return decompose_hangul_code_point(code_point, code_points_output);
146
147
0
#if ENABLE_UNICODE_DATA
148
0
    auto const mapping = Unicode::code_point_decomposition(code_point);
149
0
    if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
150
0
        for (auto code_point : mapping->decomposition) {
151
0
            decompose_code_point(code_point, code_points_output, use_compatibility);
152
0
        }
153
0
    } else {
154
0
        code_points_output.append(code_point);
155
0
    }
156
0
#endif
157
0
}
158
159
// This can be any sorting algorithm that maintains order (like std::stable_sort),
160
// however bubble sort is easier to implement, so go with it (for now).
161
template<typename T, typename LessThan>
162
void bubble_sort(Span<T> span, LessThan less_than)
163
0
{
164
0
    for (size_t i = 0; i < span.size() - 1; ++i) {
165
0
        for (size_t j = 0; j < span.size() - 1 - i; ++j) {
166
0
            if (!less_than(span[j], span[j + 1]))
167
0
                swap(span[j], span[j + 1]);
168
0
        }
169
0
    }
170
0
}
171
172
// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
173
// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
174
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
175
static void canonical_ordering_algorithm(Span<u32> code_points)
176
367
{
177
367
    for (size_t i = 0; i < code_points.size(); ++i) {
178
0
        if (!is_starter(code_points[i])) {
179
0
            auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
180
0
            auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
181
            // Nothing to reorder, so continue.
182
0
            if (span_size <= 1)
183
0
                continue;
184
0
            Span<u32> const span { code_points.data() + i, span_size };
185
186
0
            bubble_sort(span, [](u32 a, u32 b) {
187
                // Use <= to keep ordering.
188
0
                return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
189
0
            });
190
191
            // Skip over span we just sorted.
192
0
            i += span_size - 1;
193
0
        }
194
0
    }
195
367
}
196
197
// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
198
static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
199
0
{
200
0
    if (a == c - 1)
201
0
        return false;
202
0
    auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
203
0
    auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
204
0
    return b_combining_class >= c_combining_class;
205
0
}
206
207
// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
208
// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
209
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
210
static void canonical_composition_algorithm(Vector<u32>& code_points)
211
367
{
212
367
    if (code_points.size() <= 1)
213
367
        return;
214
0
    ssize_t last_starter = is_starter(code_points[0]) ? 0 : -1;
215
0
    for (size_t i = 1; i < code_points.size(); ++i) {
216
0
        auto const current_character = code_points[i];
217
        // R1. Seek back (left) to find the last Starter L preceding C in the character sequence
218
0
        if (last_starter == -1) {
219
0
            if (is_starter(current_character))
220
0
                last_starter = i;
221
0
            continue;
222
0
        }
223
        // R2. If there is such an L, and C is not blocked from L,
224
        //     and there exists a Primary Composite P which is canonically equivalent to <L, C>,
225
        //     then replace L by P in the sequence and delete C from the sequence.
226
0
        if (is_blocked(code_points.span(), last_starter, i)) {
227
0
            if (is_starter(current_character))
228
0
                last_starter = i;
229
0
            continue;
230
0
        }
231
232
0
        auto composite = combine_hangul_code_points(code_points[last_starter], current_character);
233
234
0
        if (composite == 0)
235
0
            composite = combine_code_points(code_points[last_starter], current_character);
236
237
0
        if (composite == 0) {
238
0
            if (is_starter(current_character))
239
0
                last_starter = i;
240
0
            continue;
241
0
        }
242
243
0
        code_points[last_starter] = composite;
244
0
        code_points.remove(i);
245
0
        --i;
246
0
    }
247
0
}
248
249
static Vector<u32> normalize_nfd(Utf8View string)
250
367
{
251
367
    Vector<u32> result;
252
367
    for (auto const code_point : string)
253
0
        decompose_code_point(code_point, result, UseCompatibility::No);
254
255
367
    canonical_ordering_algorithm(result);
256
367
    return result;
257
367
}
258
259
static Vector<u32> normalize_nfc(Utf8View string)
260
367
{
261
367
    auto result = normalize_nfd(string);
262
367
    canonical_composition_algorithm(result);
263
264
367
    return result;
265
367
}
266
267
static Vector<u32> normalize_nfkd(Utf8View string)
268
0
{
269
0
    Vector<u32> result;
270
0
    for (auto const code_point : string)
271
0
        decompose_code_point(code_point, result, UseCompatibility::Yes);
272
273
0
    canonical_ordering_algorithm(result);
274
0
    return result;
275
0
}
276
277
static Vector<u32> normalize_nfkc(Utf8View string)
278
0
{
279
0
    auto result = normalize_nfkd(string);
280
0
    canonical_composition_algorithm(result);
281
282
0
    return result;
283
0
}
284
285
static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
286
367
{
287
367
    switch (form) {
288
0
    case NormalizationForm::NFD:
289
0
        return normalize_nfd(string);
290
367
    case NormalizationForm::NFC:
291
367
        return normalize_nfc(string);
292
0
    case NormalizationForm::NFKD:
293
0
        return normalize_nfkd(string);
294
0
    case NormalizationForm::NFKC:
295
0
        return normalize_nfkc(string);
296
367
    }
297
0
    VERIFY_NOT_REACHED();
298
0
}
299
300
String normalize(StringView string, NormalizationForm form)
301
367
{
302
367
    auto const code_points = normalize_implementation(Utf8View { string }, form);
303
304
367
    StringBuilder builder;
305
367
    for (auto code_point : code_points)
306
0
        builder.append_code_point(code_point);
307
308
367
    return MUST(builder.to_string());
309
367
}
310
311
}