/src/serenity/Userland/Libraries/LibUnicode/Normalize.cpp

Source (jump to first uncovered line)
/*
 * Copyright (c) 2022, mat
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Find.h>
#include <AK/QuickSort.h>
#include <AK/Utf8View.h>
#include <AK/Vector.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/Normalize.h>

#if ENABLE_UNICODE_DATA
#    include <LibUnicode/UnicodeData.h>
#else
struct Unicode::CodePointDecomposition { };
#endif

namespace Unicode {

Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
Optional<u32> __attribute__((weak)) code_point_composition(u32, u32) { return {}; }

NormalizationForm normalization_form_from_string(StringView form)
{
    if (form == "NFD"sv)
        return NormalizationForm::NFD;
    if (form == "NFC"sv)
        return NormalizationForm::NFC;
    if (form == "NFKD"sv)
        return NormalizationForm::NFKD;
    if (form == "NFKC"sv)
        return NormalizationForm::NFKC;
    VERIFY_NOT_REACHED();
}

StringView normalization_form_to_string(NormalizationForm form)
{
    switch (form) {
    case NormalizationForm::NFD:
        return "NFD"sv;
    case NormalizationForm::NFC:
        return "NFC"sv;
    case NormalizationForm::NFKD:
        return "NFKD"sv;
    case NormalizationForm::NFKC:
        return "NFKC"sv;
    }
    VERIFY_NOT_REACHED();
}

ALWAYS_INLINE static bool is_starter(u32 code_point)
{
    return Unicode::canonical_combining_class(code_point) == 0;
}

// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
static constexpr u32 HANGUL_LEADING_COUNT = 19;
static constexpr u32 HANGUL_VOWEL_COUNT = 21;
static constexpr u32 HANGUL_TRAILING_COUNT = 28;
// NCount in the standard.
static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;

ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
{
    return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
}

ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
{
    return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
}

ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
{
    return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
}

ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
{
    return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
}

// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
{
    auto const index = code_point - HANGUL_SYLLABLE_BASE;

    auto const leading_index = index / HANGUL_BLOCK_COUNT;
    auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
    auto const trailing_index = index % HANGUL_TRAILING_COUNT;

    auto const leading_part = HANGUL_LEADING_BASE + leading_index;
    auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
    auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;

    code_points_output.append(leading_part);
    code_points_output.append(vowel_part);
    if (trailing_index != 0)
        code_points_output.append(trailing_part);
}

// L, V and LV, T Hangul Syllable Composition
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
static u32 combine_hangul_code_points(u32 a, u32 b)
{
    if (is_hangul_leading(a) && is_hangul_vowel(b)) {
        auto const leading_index = a - HANGUL_LEADING_BASE;
        auto const vowel_index = b - HANGUL_VOWEL_BASE;
        auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
        return HANGUL_SYLLABLE_BASE + leading_vowel_index;
    }
    // LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
    if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
        return a + b - HANGUL_TRAILING_BASE;
    }
    return 0;
}

static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
{
#if ENABLE_UNICODE_DATA
    auto composition = code_point_composition(a, b);
    if (composition.has_value())
        return composition.value();
#endif

    return 0;
}

enum class UseCompatibility {
    Yes,
    No
};

static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
{
    if (is_hangul_code_point(code_point))
        return decompose_hangul_code_point(code_point, code_points_output);

#if ENABLE_UNICODE_DATA
    auto const mapping = Unicode::code_point_decomposition(code_point);
    if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
        for (auto code_point : mapping->decomposition) {
            decompose_code_point(code_point, code_points_output, use_compatibility);
        }
    } else {
        code_points_output.append(code_point);
    }
#endif
}

// This can be any sorting algorithm that maintains order (like std::stable_sort),
// however bubble sort is easier to implement, so go with it (for now).
template<typename T, typename LessThan>
void bubble_sort(Span<T> span, LessThan less_than)
{
    for (size_t i = 0; i < span.size() - 1; ++i) {
        for (size_t j = 0; j < span.size() - 1 - i; ++j) {
            if (!less_than(span[j], span[j + 1]))
                swap(span[j], span[j + 1]);
        }
    }
}

// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
static void canonical_ordering_algorithm(Span<u32> code_points)
{
    for (size_t i = 0; i < code_points.size(); ++i) {
        if (!is_starter(code_points[i])) {
            auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
            auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
            // Nothing to reorder, so continue.
            if (span_size <= 1)
                continue;
            Span<u32> const span { code_points.data() + i, span_size };

            bubble_sort(span, [](u32 a, u32 b) {
                // Use <= to keep ordering.
                return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
            });

            // Skip over span we just sorted.
            i += span_size - 1;
        }
    }
}

// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
{
    if (a == c - 1)
        return false;
    auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
    auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
    return b_combining_class >= c_combining_class;
}

// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
static void canonical_composition_algorithm(Vector<u32>& code_points)
{
    if (code_points.size() <= 1)
        return;
    ssize_t last_starter = is_starter(code_points[0]) ? 0 : -1;
    for (size_t i = 1; i < code_points.size(); ++i) {
        auto const current_character = code_points[i];
        // R1. Seek back (left) to find the last Starter L preceding C in the character sequence
        if (last_starter == -1) {
            if (is_starter(current_character))
                last_starter = i;
            continue;
        }
        // R2. If there is such an L, and C is not blocked from L,
        //     and there exists a Primary Composite P which is canonically equivalent to <L, C>,
        //     then replace L by P in the sequence and delete C from the sequence.
        if (is_blocked(code_points.span(), last_starter, i)) {
            if (is_starter(current_character))
                last_starter = i;
            continue;
        }

        auto composite = combine_hangul_code_points(code_points[last_starter], current_character);

        if (composite == 0)
            composite = combine_code_points(code_points[last_starter], current_character);

        if (composite == 0) {
            if (is_starter(current_character))
                last_starter = i;
            continue;
        }

        code_points[last_starter] = composite;
        code_points.remove(i);
        --i;
    }
}

static Vector<u32> normalize_nfd(Utf8View string)
{
    Vector<u32> result;
    for (auto const code_point : string)
        decompose_code_point(code_point, result, UseCompatibility::No);

    canonical_ordering_algorithm(result);
    return result;
}

static Vector<u32> normalize_nfc(Utf8View string)
{
    auto result = normalize_nfd(string);
    canonical_composition_algorithm(result);

    return result;
}

static Vector<u32> normalize_nfkd(Utf8View string)
{
    Vector<u32> result;
    for (auto const code_point : string)
        decompose_code_point(code_point, result, UseCompatibility::Yes);

    canonical_ordering_algorithm(result);
    return result;
}

static Vector<u32> normalize_nfkc(Utf8View string)
{
    auto result = normalize_nfkd(string);
    canonical_composition_algorithm(result);

    return result;
}

static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
{
    switch (form) {
    case NormalizationForm::NFD:
        return normalize_nfd(string);
    case NormalizationForm::NFC:
        return normalize_nfc(string);
    case NormalizationForm::NFKD:
        return normalize_nfkd(string);
    case NormalizationForm::NFKC:
        return normalize_nfkc(string);
    }
    VERIFY_NOT_REACHED();
}

String normalize(StringView string, NormalizationForm form)
{
    auto const code_points = normalize_implementation(Utf8View { string }, form);

    StringBuilder builder;
    for (auto code_point : code_points)
        builder.append_code_point(code_point);

    return MUST(builder.to_string());
}

}

Coverage Report

Created: 2025-09-05 06:52

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2022, mat
3		*
4		* SPDX-License-Identifier: BSD-2-Clause
5		*/
6
7		#include <AK/Find.h>
8		#include <AK/QuickSort.h>
9		#include <AK/Utf8View.h>
10		#include <AK/Vector.h>
11		#include <LibUnicode/CharacterTypes.h>
12		#include <LibUnicode/Normalize.h>
13
14		#if ENABLE_UNICODE_DATA
15		# include <LibUnicode/UnicodeData.h>
16		#else
17		struct Unicode::CodePointDecomposition { };
18		#endif
19
20		namespace Unicode {
21
22	0	Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
23	0	Optional<u32> __attribute__((weak)) code_point_composition(u32, u32) { return {}; }
24
25		NormalizationForm normalization_form_from_string(StringView form)
26	0	{
27	0	if (form == "NFD"sv)
28	0	return NormalizationForm::NFD;
29	0	if (form == "NFC"sv)
30	0	return NormalizationForm::NFC;
31	0	if (form == "NFKD"sv)
32	0	return NormalizationForm::NFKD;
33	0	if (form == "NFKC"sv)
34	0	return NormalizationForm::NFKC;
35	0	VERIFY_NOT_REACHED();
36	0	}
37
38		StringView normalization_form_to_string(NormalizationForm form)
39	0	{
40	0	switch (form) {
41	0	case NormalizationForm::NFD:
42	0	return "NFD"sv;
43	0	case NormalizationForm::NFC:
44	0	return "NFC"sv;
45	0	case NormalizationForm::NFKD:
46	0	return "NFKD"sv;
47	0	case NormalizationForm::NFKC:
48	0	return "NFKC"sv;
49	0	}
50	0	VERIFY_NOT_REACHED();
51	0	}
52
53		ALWAYS_INLINE static bool is_starter(u32 code_point)
54	0	{
55	0	return Unicode::canonical_combining_class(code_point) == 0;
56	0	}
57
58		// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
59		static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
60		static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
61		static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
62		static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
63		static constexpr u32 HANGUL_LEADING_COUNT = 19;
64		static constexpr u32 HANGUL_VOWEL_COUNT = 21;
65		static constexpr u32 HANGUL_TRAILING_COUNT = 28;
66		// NCount in the standard.
67		static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
68		static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
69
70		ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
71	0	{
72	0	return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
73	0	}
74
75		ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
76	0	{
77	0	return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
78	0	}
79
80		ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
81	0	{
82	0	return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
83	0	}
84
85		ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
86	0	{
87	0	return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
88	0	}
89
90		// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
91		static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
92	0	{
93	0	auto const index = code_point - HANGUL_SYLLABLE_BASE;
94
95	0	auto const leading_index = index / HANGUL_BLOCK_COUNT;
96	0	auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
97	0	auto const trailing_index = index % HANGUL_TRAILING_COUNT;
98
99	0	auto const leading_part = HANGUL_LEADING_BASE + leading_index;
100	0	auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
101	0	auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
102
103	0	code_points_output.append(leading_part);
104	0	code_points_output.append(vowel_part);
105	0	if (trailing_index != 0)
106	0	code_points_output.append(trailing_part);
107	0	}
108
109		// L, V and LV, T Hangul Syllable Composition
110		// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
111		static u32 combine_hangul_code_points(u32 a, u32 b)
112	0	{
113	0	if (is_hangul_leading(a) && is_hangul_vowel(b)) {
114	0	auto const leading_index = a - HANGUL_LEADING_BASE;
115	0	auto const vowel_index = b - HANGUL_VOWEL_BASE;
116	0	auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
117	0	return HANGUL_SYLLABLE_BASE + leading_vowel_index;
118	0	}
119		// LV characters are the first in each "T block", so use this check to avoid combining LVT with T.
120	0	if (is_hangul_code_point(a) && (a - HANGUL_SYLLABLE_BASE) % HANGUL_TRAILING_COUNT == 0 && is_hangul_trailing(b)) {
121	0	return a + b - HANGUL_TRAILING_BASE;
122	0	}
123	0	return 0;
124	0	}
125
126		static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b)
127	0	{
128	0	#if ENABLE_UNICODE_DATA
129	0	auto composition = code_point_composition(a, b);
130	0	if (composition.has_value())
131	0	return composition.value();
132	0	#endif
133
134	0	return 0;
135	0	}
136
137		enum class UseCompatibility {
138		Yes,
139		No
140		};
141
142		static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, [[maybe_unused]] UseCompatibility use_compatibility)
143	0	{
144	0	if (is_hangul_code_point(code_point))
145	0	return decompose_hangul_code_point(code_point, code_points_output);
146
147	0	#if ENABLE_UNICODE_DATA
148	0	auto const mapping = Unicode::code_point_decomposition(code_point);
149	0	if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical \|\| use_compatibility == UseCompatibility::Yes)) {
150	0	for (auto code_point : mapping->decomposition) {
151	0	decompose_code_point(code_point, code_points_output, use_compatibility);
152	0	}
153	0	} else {
154	0	code_points_output.append(code_point);
155	0	}
156	0	#endif
157	0	}
158
159		// This can be any sorting algorithm that maintains order (like std::stable_sort),
160		// however bubble sort is easier to implement, so go with it (for now).
161		template<typename T, typename LessThan>
162		void bubble_sort(Span<T> span, LessThan less_than)
163	0	{
164	0	for (size_t i = 0; i < span.size() - 1; ++i) {
165	0	for (size_t j = 0; j < span.size() - 1 - i; ++j) {
166	0	if (!less_than(span[j], span[j + 1]))
167	0	swap(span[j], span[j + 1]);
168	0	}
169	0	}
170	0	}
171
172		// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
173		// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
174		// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
175		static void canonical_ordering_algorithm(Span<u32> code_points)
176	367	{
177	367	for (size_t i = 0; i < code_points.size(); ++i) {
178	0	if (!is_starter(code_points[i])) {
179	0	auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
180	0	auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
181		// Nothing to reorder, so continue.
182	0	if (span_size <= 1)
183	0	continue;
184	0	Span<u32> const span { code_points.data() + i, span_size };
185
186	0	bubble_sort(span, [](u32 a, u32 b) {
187		// Use <= to keep ordering.
188	0	return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
189	0	});
190
191		// Skip over span we just sorted.
192	0	i += span_size - 1;
193	0	}
194	0	}
195	367	}
196
197		// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
198		static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
199	0	{
200	0	if (a == c - 1)
201	0	return false;
202	0	auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
203	0	auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
204	0	return b_combining_class >= c_combining_class;
205	0	}
206
207		// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
208		// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
209		// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
210		static void canonical_composition_algorithm(Vector<u32>& code_points)
211	367	{
212	367	if (code_points.size() <= 1)
213	367	return;
214	0	ssize_t last_starter = is_starter(code_points[0]) ? 0 : -1;
215	0	for (size_t i = 1; i < code_points.size(); ++i) {
216	0	auto const current_character = code_points[i];
217		// R1. Seek back (left) to find the last Starter L preceding C in the character sequence
218	0	if (last_starter == -1) {
219	0	if (is_starter(current_character))
220	0	last_starter = i;
221	0	continue;
222	0	}
223		// R2. If there is such an L, and C is not blocked from L,
224		// and there exists a Primary Composite P which is canonically equivalent to <L, C>,
225		// then replace L by P in the sequence and delete C from the sequence.
226	0	if (is_blocked(code_points.span(), last_starter, i)) {
227	0	if (is_starter(current_character))
228	0	last_starter = i;
229	0	continue;
230	0	}
231
232	0	auto composite = combine_hangul_code_points(code_points[last_starter], current_character);
233
234	0	if (composite == 0)
235	0	composite = combine_code_points(code_points[last_starter], current_character);
236
237	0	if (composite == 0) {
238	0	if (is_starter(current_character))
239	0	last_starter = i;
240	0	continue;
241	0	}
242
243	0	code_points[last_starter] = composite;
244	0	code_points.remove(i);
245	0	--i;
246	0	}
247	0	}
248
249		static Vector<u32> normalize_nfd(Utf8View string)
250	367	{
251	367	Vector<u32> result;
252	367	for (auto const code_point : string)
253	0	decompose_code_point(code_point, result, UseCompatibility::No);
254
255	367	canonical_ordering_algorithm(result);
256	367	return result;
257	367	}
258
259		static Vector<u32> normalize_nfc(Utf8View string)
260	367	{
261	367	auto result = normalize_nfd(string);
262	367	canonical_composition_algorithm(result);
263
264	367	return result;
265	367	}
266
267		static Vector<u32> normalize_nfkd(Utf8View string)
268	0	{
269	0	Vector<u32> result;
270	0	for (auto const code_point : string)
271	0	decompose_code_point(code_point, result, UseCompatibility::Yes);
272
273	0	canonical_ordering_algorithm(result);
274	0	return result;
275	0	}
276
277		static Vector<u32> normalize_nfkc(Utf8View string)
278	0	{
279	0	auto result = normalize_nfkd(string);
280	0	canonical_composition_algorithm(result);
281
282	0	return result;
283	0	}
284
285		static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
286	367	{
287	367	switch (form) {
288	0	case NormalizationForm::NFD:
289	0	return normalize_nfd(string);
290	367	case NormalizationForm::NFC:
291	367	return normalize_nfc(string);
292	0	case NormalizationForm::NFKD:
293	0	return normalize_nfkd(string);
294	0	case NormalizationForm::NFKC:
295	0	return normalize_nfkc(string);
296	367	}
297	0	VERIFY_NOT_REACHED();
298	0	}
299
300		String normalize(StringView string, NormalizationForm form)
301	367	{
302	367	auto const code_points = normalize_implementation(Utf8View { string }, form);
303
304	367	StringBuilder builder;
305	367	for (auto code_point : code_points)
306	0	builder.append_code_point(code_point);
307
308	367	return MUST(builder.to_string());
309	367	}
310
311		}