Coverage Report

Created: 2025-11-16 07:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/serenity/Userland/Libraries/LibUnicode/Segmentation.cpp
Line
Count
Source
1
/*
2
 * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
3
 * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
4
 *
5
 * SPDX-License-Identifier: BSD-2-Clause
6
 */
7
8
#include <AK/Utf16View.h>
9
#include <AK/Utf32View.h>
10
#include <AK/Utf8View.h>
11
#include <LibUnicode/CharacterTypes.h>
12
#include <LibUnicode/Segmentation.h>
13
14
#if ENABLE_UNICODE_DATA
15
#    include <LibUnicode/UnicodeData.h>
16
#endif
17
18
namespace Unicode {
19
20
template<typename ViewType>
21
static size_t code_unit_length(ViewType const& view)
22
0
{
23
    if constexpr (IsSame<ViewType, Utf8View>)
24
0
        return view.byte_length();
25
    else if constexpr (IsSame<ViewType, Utf16View>)
26
0
        return view.length_in_code_units();
27
    else if constexpr (IsSame<ViewType, Utf32View>)
28
0
        return view.length();
29
    else
30
        static_assert(DependentFalse<ViewType>);
31
0
}
Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_length<AK::Utf8View>(AK::Utf8View const&)
Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_length<AK::Utf16View>(AK::Utf16View const&)
Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_length<AK::Utf32View>(AK::Utf32View const&)
32
33
template<typename ViewType, typename CodeUnitIterator>
34
static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it)
35
0
{
36
    if constexpr (IsSame<ViewType, Utf8View>)
37
0
        return view.byte_offset_of(it);
38
    else if constexpr (IsSame<ViewType, Utf16View>)
39
0
        return view.code_unit_offset_of(it);
40
    else if constexpr (IsSame<ViewType, Utf32View>)
41
0
        return view.iterator_offset(it);
42
    else
43
        static_assert(DependentFalse<ViewType>);
44
0
}
Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_offset_of<AK::Utf8View, AK::Utf8CodePointIterator>(AK::Utf8View const&, AK::Utf8CodePointIterator const&)
Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_offset_of<AK::Utf16View, AK::Utf16CodePointIterator>(AK::Utf16View const&, AK::Utf16CodePointIterator const&)
Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_offset_of<AK::Utf32View, AK::Utf32CodePointIterator>(AK::Utf32View const&, AK::Utf32CodePointIterator const&)
45
46
template<typename ViewType>
47
static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
48
0
{
49
0
#if ENABLE_UNICODE_DATA
50
0
    using GBP = GraphemeBreakProperty;
51
52
    // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
53
0
    if (view.is_empty())
54
0
        return;
55
56
0
    auto has_any_gbp = [](u32 code_point, auto&&... properties) {
57
0
        return (code_point_has_grapheme_break_property(code_point, properties) || ...);
58
0
    };
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_SF_EEEDajSC_
59
60
0
    auto skip_incb_extend_linker_sequence = [&](auto& it) {
61
0
        while (true) {
62
0
            if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend))
63
0
                return;
64
65
0
            auto next_it = it;
66
0
            ++next_it;
67
68
0
            if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker))
69
0
                return;
70
71
0
            it = next_it;
72
0
            ++it;
73
0
        }
74
0
    };
Unexecuted instantiation: Segmentation.cpp:auto Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>)::{lambda(auto:1&)#1}::operator()<AK::Utf8CodePointIterator>(AK::Utf8CodePointIterator&) const
Unexecuted instantiation: Segmentation.cpp:auto Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>)::{lambda(auto:1&)#1}::operator()<AK::Utf16CodePointIterator>(AK::Utf16CodePointIterator&) const
Unexecuted instantiation: Segmentation.cpp:auto Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>)::{lambda(auto:1&)#1}::operator()<AK::Utf32CodePointIterator>(AK::Utf32CodePointIterator&) const
75
76
    // GB1
77
0
    if (callback(0) == IterationDecision::Break)
78
0
        return;
79
80
0
    if (code_unit_length(view) > 1) {
81
0
        auto it = view.begin();
82
0
        auto code_point = *it;
83
0
        u32 next_code_point = 0;
84
0
        auto current_ri_chain = 0;
85
86
0
        for (++it; it != view.end(); ++it, code_point = next_code_point) {
87
0
            next_code_point = *it;
88
89
            // GB9c
90
0
            if (code_point_has_property(code_point, Property::InCB_Consonant)) {
91
0
                auto it_copy = it;
92
0
                skip_incb_extend_linker_sequence(it_copy);
93
94
0
                if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) {
95
0
                    ++it_copy;
96
0
                    skip_incb_extend_linker_sequence(it_copy);
97
98
0
                    if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) {
99
0
                        next_code_point = *it_copy;
100
0
                        it = it_copy;
101
0
                        continue;
102
0
                    }
103
0
                }
104
0
            }
105
106
            // GB11
107
0
            if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) {
108
0
                auto it_copy = it;
109
110
0
                while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend))
111
0
                    ++it_copy;
112
113
0
                if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) {
114
0
                    ++it_copy;
115
116
0
                    if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) {
117
0
                        next_code_point = *it_copy;
118
0
                        it = it_copy;
119
0
                        continue;
120
0
                    }
121
0
                }
122
0
            }
123
124
0
            auto code_point_is_cr = has_any_gbp(code_point, GBP::CR);
125
0
            auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF);
126
127
            // GB3
128
0
            if (code_point_is_cr && next_code_point_is_lf)
129
0
                continue;
130
            // GB4, GB5
131
0
            if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) {
132
0
                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
133
0
                    return;
134
0
                continue;
135
0
            }
136
137
0
            auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V);
138
0
            auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T);
139
140
            // GB6
141
0
            if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT)))
142
0
                continue;
143
            // GB7
144
0
            if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V))
145
0
                continue;
146
            // GB8
147
0
            if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T))
148
0
                continue;
149
150
            // GB9
151
0
            if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ))
152
0
                continue;
153
            // GB9a
154
0
            if (has_any_gbp(next_code_point, GBP::SpacingMark))
155
0
                continue;
156
            // GB9b
157
0
            if (has_any_gbp(code_point, GBP::Prepend))
158
0
                continue;
159
160
0
            auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator);
161
0
            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
162
163
            // GB12, GB13
164
0
            if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1)
165
0
                continue;
166
167
            // GB999
168
0
            if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
169
0
                return;
170
0
        }
171
0
    }
172
173
    // GB2
174
0
    callback(code_unit_length(view));
175
0
#endif
176
0
}
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>)
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>)
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>)
177
178
// UTF-8 entry point: forwards `view` and `callback` to the shared grapheme boundary implementation.
void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_grapheme_segmentation_boundary_impl<Utf8View>(view, move(callback));
}
182
183
// UTF-16 entry point: forwards `view` and `callback` to the shared grapheme boundary implementation.
void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_grapheme_segmentation_boundary_impl<Utf16View>(view, move(callback));
}
187
188
// UTF-32 entry point: forwards `view` and `callback` to the shared grapheme boundary implementation.
void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_grapheme_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
192
193
template<typename ViewType>
194
static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
195
0
{
196
0
#if ENABLE_UNICODE_DATA
197
0
    using WBP = WordBreakProperty;
198
199
    // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules
200
0
    if (view.is_empty())
201
0
        return;
202
203
0
    auto has_any_wbp = [](u32 code_point, auto&&... properties) {
204
0
        return (code_point_has_word_break_property(code_point, properties) || ...);
205
0
    };
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_SF_EEEDajSC_
206
207
    // WB1
208
0
    if (callback(0) == IterationDecision::Break)
209
0
        return;
210
211
0
    if (code_unit_length(view) > 1) {
212
0
        auto it = view.begin();
213
0
        auto code_point = *it;
214
0
        u32 next_code_point;
215
0
        Optional<u32> previous_code_point;
216
0
        auto current_ri_chain = 0;
217
218
0
        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
219
0
            next_code_point = *it;
220
221
0
            auto code_point_is_cr = has_any_wbp(code_point, WBP::CR);
222
0
            auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF);
223
224
            // WB3
225
0
            if (code_point_is_cr && next_code_point_is_lf)
226
0
                continue;
227
            // WB3a, WB3b
228
0
            if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) {
229
0
                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
230
0
                    return;
231
0
                continue;
232
0
            }
233
            // WB3c
234
0
            if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic))
235
0
                continue;
236
            // WB3d
237
0
            if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace))
238
0
                continue;
239
240
            // WB4
241
0
            if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ))
242
0
                continue;
243
244
0
            auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter);
245
0
            auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter);
246
0
            auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter);
247
0
            auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter);
248
249
            // WB5
250
0
            if (code_point_is_ah_letter && next_code_point_is_ah_letter)
251
0
                continue;
252
253
0
            Optional<u32> next_next_code_point;
254
0
            if (it != view.end()) {
255
0
                auto it_copy = it;
256
0
                ++it_copy;
257
0
                if (it_copy != view.end())
258
0
                    next_next_code_point = *it_copy;
259
0
            }
260
0
            bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter);
261
0
            bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter));
262
263
0
            auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote);
264
265
            // WB6
266
0
            if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter)))
267
0
                continue;
268
269
0
            auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote);
270
0
            auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter);
271
0
            auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter));
272
273
            // WB7
274
0
            if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter)))
275
0
                continue;
276
            // WB7a
277
0
            if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote))
278
0
                continue;
279
            // WB7b
280
0
            if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote))
281
0
                continue;
282
            // WB7c
283
0
            if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote))
284
0
                continue;
285
286
0
            auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric);
287
0
            auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric);
288
289
            // WB8
290
0
            if (code_point_is_numeric && next_code_point_is_numeric)
291
0
                continue;
292
            // WB9
293
0
            if (code_point_is_ah_letter && next_code_point_is_numeric)
294
0
                continue;
295
            // WB10
296
0
            if (code_point_is_numeric && next_code_point_is_ah_letter)
297
0
                continue;
298
299
0
            auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric);
300
301
            // WB11
302
0
            if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum)))
303
0
                continue;
304
305
0
            bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric);
306
307
            // WB12
308
0
            if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum)))
309
0
                continue;
310
311
0
            auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana);
312
0
            auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana);
313
314
            // WB13
315
0
            if (code_point_is_katakana && next_code_point_is_katakana)
316
0
                continue;
317
318
0
            auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet);
319
320
            // WB13a
321
0
            if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet))
322
0
                continue;
323
            // WB13b
324
0
            if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana))
325
0
                continue;
326
327
0
            auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator);
328
0
            current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0;
329
330
            // WB15, WB16
331
0
            if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1)
332
0
                continue;
333
334
            // WB999
335
0
            if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
336
0
                return;
337
0
        }
338
0
    }
339
340
    // WB2
341
0
    callback(code_unit_length(view));
342
0
#endif
343
0
}
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_word_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>)
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_word_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>)
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_word_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>)
344
345
// UTF-8 entry point: forwards `view` and `callback` to the shared word boundary implementation.
void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf8View>(view, move(callback));
}
349
350
// UTF-16 entry point: forwards `view` and `callback` to the shared word boundary implementation.
void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf16View>(view, move(callback));
}
354
355
// UTF-32 entry point: forwards `view` and `callback` to the shared word boundary implementation.
void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_word_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
359
360
template<typename ViewType>
361
static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback)
362
0
{
363
0
#if ENABLE_UNICODE_DATA
364
0
    using SBP = SentenceBreakProperty;
365
366
    // https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules
367
0
    if (view.is_empty())
368
0
        return;
369
370
0
    auto has_any_sbp = [](u32 code_point, auto&&... properties) {
371
0
        return (code_point_has_sentence_break_property(code_point, properties) || ...);
372
0
    };
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_SF_SF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_SF_SF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyEEEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_SF_SF_SF_EEEDajSC_
Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_EEEDajSC_
373
374
    // SB1
375
0
    if (callback(0) == IterationDecision::Break)
376
0
        return;
377
378
0
    if (code_unit_length(view) > 1) {
379
0
        auto it = view.begin();
380
0
        auto code_point = *it;
381
0
        u32 next_code_point;
382
0
        Optional<u32> previous_code_point;
383
0
        enum class TerminatorSequenceState {
384
0
            None,
385
0
            Term,
386
0
            Close,
387
0
            Sp
388
0
        } terminator_sequence_state { TerminatorSequenceState::None };
389
0
        auto term_was_a_term = false;
390
391
0
        for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) {
392
0
            next_code_point = *it;
393
394
0
            auto code_point_is_cr = has_any_sbp(code_point, SBP::CR);
395
0
            auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF);
396
397
            // SB3
398
0
            if (code_point_is_cr && next_code_point_is_lf)
399
0
                continue;
400
401
0
            auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep);
402
403
            // SB4
404
0
            if (code_point_is_para_sep) {
405
0
                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
406
0
                    return;
407
0
                continue;
408
0
            }
409
410
            // SB5
411
0
            if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend))
412
0
                continue;
413
414
0
            auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm);
415
416
            // SB6
417
0
            if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric))
418
0
                continue;
419
            // SB7
420
0
            if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper))
421
0
                continue;
422
423
0
            if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) {
424
0
                terminator_sequence_state = TerminatorSequenceState::Term;
425
0
                term_was_a_term = code_point_is_a_term;
426
0
            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) {
427
0
                terminator_sequence_state = TerminatorSequenceState::Close;
428
0
            } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) {
429
0
                terminator_sequence_state = TerminatorSequenceState::Sp;
430
0
            } else {
431
0
                terminator_sequence_state = TerminatorSequenceState::None;
432
0
            }
433
434
            // SB8
435
0
            if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) {
436
0
                auto it_copy = it;
437
0
                bool illegal_sequence = false;
438
0
                for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) {
439
0
                    if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend))
440
0
                        continue;
441
0
                    illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower);
442
0
                }
443
0
                if (illegal_sequence)
444
0
                    continue;
445
0
            }
446
447
            // SB8a
448
0
            if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm)))
449
0
                continue;
450
451
0
            auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp);
452
0
            auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF);
453
454
            // SB9
455
0
            if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close)))
456
0
                continue;
457
458
            // SB10
459
0
            if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep))
460
0
                continue;
461
462
            // SB11
463
0
            if (terminator_sequence_state >= TerminatorSequenceState::Term)
464
0
                if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break)
465
0
                    return;
466
467
            // SB998
468
0
        }
469
0
    }
470
471
    // SB2
472
0
    callback(code_unit_length(view));
473
0
#endif
474
0
}
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_sentence_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>)
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_sentence_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>)
Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_sentence_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>)
475
476
// UTF-8 entry point: forwards `view` and `callback` to the shared sentence boundary implementation.
void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf8View>(view, move(callback));
}
480
481
// UTF-16 entry point: forwards `view` and `callback` to the shared sentence boundary implementation.
void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf16View>(view, move(callback));
}
485
486
// UTF-32 entry point: forwards `view` and `callback` to the shared sentence boundary implementation.
void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback)
{
    for_each_sentence_segmentation_boundary_impl<Utf32View>(view, move(callback));
}
490
491
}