Coverage Report

Created: 2025-03-04 07:22

/src/serenity/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
3
 *
4
 * SPDX-License-Identifier: BSD-2-Clause
5
 */
6
7
#include <AK/Platform.h>
8
#include <AK/String.h>
9
#include <AK/StringBuilder.h>
10
#include <AK/Types.h>
11
#include <LibUnicode/CharacterTypes.h>
12
#include <LibUnicode/Segmentation.h>
13
#include <LibUnicode/UnicodeUtils.h>
14
15
#if ENABLE_UNICODE_DATA
16
#    include <LibUnicode/UnicodeData.h>
17
#endif
18
19
// For details on the algorithms used here, see Section 3.13 Default Case Algorithms
20
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
21
22
namespace Unicode::Detail {
23
24
#if ENABLE_UNICODE_DATA
25
26
static bool is_after_uppercase_i(Utf8View const& string, size_t index)
27
0
{
28
    // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0.
29
0
    auto preceding_view = string.substring_view(0, index);
30
0
    bool found_uppercase_i = false;
31
32
    // FIXME: Would be better if Utf8View supported reverse iteration.
33
0
    for (auto code_point : preceding_view) {
34
0
        if (code_point == 'I') {
35
0
            found_uppercase_i = true;
36
0
            continue;
37
0
        }
38
39
0
        auto combining_class = canonical_combining_class(code_point);
40
0
        if (combining_class == 0 || combining_class == 230)
41
0
            found_uppercase_i = false;
42
0
    }
43
44
0
    return found_uppercase_i;
45
0
}
46
47
static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index)
48
0
{
49
    // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above).
50
0
    auto preceding_view = string.substring_view(0, index);
51
0
    bool found_soft_dotted_code_point = false;
52
53
    // FIXME: Would be better if Utf8View supported reverse iteration.
54
0
    for (auto code_point : preceding_view) {
55
0
        if (code_point_has_property(code_point, Property::Soft_Dotted)) {
56
0
            found_soft_dotted_code_point = true;
57
0
            continue;
58
0
        }
59
60
0
        auto combining_class = canonical_combining_class(code_point);
61
0
        if (combining_class == 0 || combining_class == 230)
62
0
            found_soft_dotted_code_point = false;
63
0
    }
64
65
0
    return found_soft_dotted_code_point;
66
0
}
67
68
static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length)
69
84
{
70
    // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable
71
    // characters, and C is not followed by a sequence consisting of zero or more case-ignorable
72
    // characters and then a cased letter.
73
84
    auto preceding_view = string.substring_view(0, index);
74
84
    auto following_view = ((index + byte_length) < string.byte_length())
75
84
        ? string.substring_view(index + byte_length)
76
84
        : Utf8View {};
77
78
84
    size_t cased_letter_count = 0;
79
80
84
    for (auto code_point : preceding_view) {
81
84
        bool is_cased = code_point_has_property(code_point, Property::Cased);
82
84
        bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
83
84
84
        if (is_cased && !is_case_ignorable)
85
42
            ++cased_letter_count;
86
42
        else if (!is_case_ignorable)
87
42
            cased_letter_count = 0;
88
84
    }
89
90
84
    if (cased_letter_count == 0)
91
84
        return false;
92
93
0
    for (auto code_point : following_view) {
94
0
        bool is_cased = code_point_has_property(code_point, Property::Cased);
95
0
        bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable);
96
97
0
        if (is_case_ignorable)
98
0
            continue;
99
0
        if (is_cased)
100
0
            return false;
101
102
0
        break;
103
0
    }
104
105
0
    return true;
106
0
}
107
108
static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length)
109
0
{
110
    // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above).
111
0
    auto following_view = ((index + byte_length) < string.byte_length())
112
0
        ? string.substring_view(index + byte_length)
113
0
        : Utf8View {};
114
115
0
    for (auto code_point : following_view) {
116
0
        u32 combining_class = canonical_combining_class(code_point);
117
118
0
        if (combining_class == 0)
119
0
            return false;
120
0
        if (combining_class == 230)
121
0
            return true;
122
0
    }
123
124
0
    return false;
125
0
}
126
127
static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length)
128
0
{
129
    // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may
130
    // intervene between the current character and the combining dot above.
131
0
    auto following_view = ((index + byte_length) < string.byte_length())
132
0
        ? string.substring_view(index + byte_length)
133
0
        : Utf8View {};
134
135
0
    for (auto code_point : following_view) {
136
0
        if (code_point == 0x307)
137
0
            return true;
138
139
0
        u32 combining_class = canonical_combining_class(code_point);
140
141
0
        if (combining_class == 0)
142
0
            return false;
143
0
        if (combining_class == 230)
144
0
            return false;
145
0
    }
146
147
0
    return false;
148
0
}
149
150
static Optional<SpecialCasing const&> find_matching_special_case(u32 code_point, Utf8View const& string, Optional<StringView> locale, size_t index, size_t byte_length)
151
23.3k
{
152
23.3k
    auto requested_locale = Locale::None;
153
154
23.3k
    if (locale.has_value()) {
155
0
        if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value())
156
0
            requested_locale = *maybe_locale;
157
0
    }
158
159
23.3k
    auto special_casings = special_case_mapping(code_point);
160
161
23.3k
    for (auto const& special_casing : special_casings) {
162
19.6k
        if (special_casing.locale != Locale::None && special_casing.locale != requested_locale)
163
19.0k
            continue;
164
165
536
        switch (special_casing.condition) {
166
452
        case Condition::None:
167
452
            return special_casing;
168
169
0
        case Condition::AfterI:
170
0
            if (is_after_uppercase_i(string, index))
171
0
                return special_casing;
172
0
            break;
173
174
0
        case Condition::AfterSoftDotted:
175
0
            if (is_after_soft_dotted_code_point(string, index))
176
0
                return special_casing;
177
0
            break;
178
179
84
        case Condition::FinalSigma:
180
84
            if (is_final_code_point(string, index, byte_length))
181
0
                return special_casing;
182
84
            break;
183
184
84
        case Condition::MoreAbove:
185
0
            if (is_followed_by_combining_class_above(string, index, byte_length))
186
0
                return special_casing;
187
0
            break;
188
189
0
        case Condition::NotBeforeDot:
190
0
            if (!is_followed_by_combining_dot_above(string, index, byte_length))
191
0
                return special_casing;
192
0
            break;
193
536
        }
194
536
    }
195
196
22.8k
    return {};
197
23.3k
}
198
199
template<CaseFoldingStatus... StatusFilter>
200
static Optional<CaseFolding const&> find_matching_case_folding(u32 code_point)
201
0
{
202
0
    auto case_foldings = case_folding_mapping(code_point);
203
204
0
    for (auto const& case_folding : case_foldings) {
205
0
        if (((case_folding.status == StatusFilter) || ...))
206
0
            return case_folding;
207
0
    }
208
209
0
    return {};
210
0
}
211
212
#endif
213
214
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
215
ErrorOr<void> build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
216
0
{
217
0
#if ENABLE_UNICODE_DATA
218
0
    size_t index = 0;
219
0
    size_t byte_length = 0;
220
221
0
    for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
222
0
        u32 code_point = *it;
223
0
        byte_length = it.underlying_code_point_length_in_bytes();
224
225
0
        auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
226
0
        if (!special_casing.has_value()) {
227
0
            TRY(builder.try_append_code_point(to_unicode_lowercase(code_point)));
228
0
            continue;
229
0
        }
230
231
0
        for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i)
232
0
            TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i]));
233
0
    }
234
235
0
    return {};
236
#else
237
    return Error::from_string_literal("Unicode data has been disabled");
238
#endif
239
0
}
240
241
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
242
ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
243
2.60k
{
244
2.60k
#if ENABLE_UNICODE_DATA
245
2.60k
    size_t index = 0;
246
2.60k
    size_t byte_length = 0;
247
248
25.9k
    for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) {
249
23.3k
        u32 code_point = *it;
250
23.3k
        byte_length = it.underlying_code_point_length_in_bytes();
251
252
23.3k
        auto special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length);
253
23.3k
        if (!special_casing.has_value()) {
254
22.8k
            TRY(builder.try_append_code_point(to_unicode_uppercase(code_point)));
255
0
            continue;
256
22.8k
        }
257
258
1.35k
        for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i)
259
904
            TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i]));
260
452
    }
261
262
2.60k
    return {};
263
#else
264
    return Error::from_string_literal("Unicode data has been disabled");
265
#endif
266
2.60k
}
267
268
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34078
269
ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale, [[maybe_unused]] TrailingCodePointTransformation trailing_code_point_transformation)
270
0
{
271
0
#if ENABLE_UNICODE_DATA
272
    // toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29,
273
    // “Unicode Text Segmentation.” For each word boundary, find the first cased character F following
274
    // the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between
275
    // F and the following word boundary to Lowercase_Mapping(C).
276
277
0
    auto first_cased_code_point_after_boundary = [&](auto boundary, auto next_boundary) -> Optional<Utf8CodePointIterator> {
278
0
        auto it = code_points.iterator_at_byte_offset_without_validation(boundary);
279
0
        auto end = code_points.iterator_at_byte_offset_without_validation(next_boundary);
280
281
0
        for (; it != end; ++it) {
282
0
            if (code_point_has_property(*it, Property::Cased))
283
0
                return it;
284
0
        }
285
286
0
        return {};
287
0
    };
288
289
0
    auto append_code_point_as_titlecase = [&](auto code_point, auto code_point_offset, auto code_point_length) -> ErrorOr<void> {
290
0
        auto special_casing = find_matching_special_case(code_point, code_points, locale, code_point_offset, code_point_length);
291
0
        if (!special_casing.has_value()) {
292
0
            TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
293
0
            return {};
294
0
        }
295
296
0
        for (size_t i = 0; i < special_casing->titlecase_mapping_size; ++i)
297
0
            TRY(builder.try_append_code_point(special_casing->titlecase_mapping[i]));
298
0
        return {};
299
0
    };
300
301
0
    size_t boundary = 0;
302
303
0
    while (true) {
304
0
        auto next_boundary = next_word_segmentation_boundary(code_points, boundary);
305
0
        if (!next_boundary.has_value())
306
0
            break;
307
308
0
        if (auto it = first_cased_code_point_after_boundary(boundary, *next_boundary); it.has_value()) {
309
0
            auto code_point = *it.value();
310
0
            auto code_point_offset = code_points.byte_offset_of(*it);
311
0
            auto code_point_length = it->underlying_code_point_length_in_bytes();
312
313
0
            auto caseless_code_points = code_points.substring_view(boundary, code_point_offset - boundary);
314
0
            TRY(builder.try_append(caseless_code_points.as_string()));
315
316
0
            TRY(append_code_point_as_titlecase(code_point, code_point_offset, code_point_length));
317
0
            boundary = code_point_offset + code_point_length;
318
0
        }
319
320
0
        auto remaining_code_points = code_points.substring_view(boundary, *next_boundary - boundary);
321
0
        switch (trailing_code_point_transformation) {
322
0
        case TrailingCodePointTransformation::Lowercase:
323
0
            TRY(build_lowercase_string(remaining_code_points, builder, locale));
324
0
            break;
325
0
        case TrailingCodePointTransformation::PreserveExisting:
326
0
            TRY(builder.try_append(remaining_code_points.as_string()));
327
0
            break;
328
0
        }
329
330
0
        boundary = *next_boundary;
331
0
    }
332
333
0
    return {};
334
#else
335
    return Error::from_string_literal("Unicode data has been disabled");
336
#endif
337
0
}
338
339
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
340
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
341
0
{
342
    // toCasefold(X): Map each character C in X to Case_Folding(C).
343
0
    for (auto code_point : code_points) {
344
0
        auto case_folding = casefold_code_point(code_point);
345
0
        TRY(builder.try_append(case_folding));
346
0
    }
347
348
0
    return {};
349
0
}
350
351
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
352
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
353
Utf32View casefold_code_point(u32 const& code_point)
354
0
{
355
0
#if ENABLE_UNICODE_DATA
356
    // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
357
    // CaseFolding.txt in the Unicode Character Database.
358
0
    using enum CaseFoldingStatus;
359
360
0
    if (auto case_folding = find_matching_case_folding<Common, Full>(code_point); case_folding.has_value())
361
0
        return Utf32View { case_folding->mapping, case_folding->mapping_size };
362
0
#endif
363
364
    // The case foldings are omitted in the data file if they are the same as the code point itself.
365
0
    return Utf32View { &code_point, 1 };
366
0
}
367
368
}