/src/serenity/Userland/Libraries/LibUnicode/Segmentation.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org> |
3 | | * Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org> |
4 | | * |
5 | | * SPDX-License-Identifier: BSD-2-Clause |
6 | | */ |
7 | | |
8 | | #include <AK/Utf16View.h> |
9 | | #include <AK/Utf32View.h> |
10 | | #include <AK/Utf8View.h> |
11 | | #include <LibUnicode/CharacterTypes.h> |
12 | | #include <LibUnicode/Segmentation.h> |
13 | | |
14 | | #if ENABLE_UNICODE_DATA |
15 | | # include <LibUnicode/UnicodeData.h> |
16 | | #endif |
17 | | |
18 | | namespace Unicode { |
19 | | |
20 | | template<typename ViewType> |
21 | | static size_t code_unit_length(ViewType const& view) |
22 | 0 | { |
23 | | if constexpr (IsSame<ViewType, Utf8View>) |
24 | 0 | return view.byte_length(); |
25 | | else if constexpr (IsSame<ViewType, Utf16View>) |
26 | 0 | return view.length_in_code_units(); |
27 | | else if constexpr (IsSame<ViewType, Utf32View>) |
28 | 0 | return view.length(); |
29 | | else |
30 | | static_assert(DependentFalse<ViewType>); |
31 | 0 | } Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_length<AK::Utf8View>(AK::Utf8View const&) Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_length<AK::Utf16View>(AK::Utf16View const&) Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_length<AK::Utf32View>(AK::Utf32View const&) |
32 | | |
33 | | template<typename ViewType, typename CodeUnitIterator> |
34 | | static size_t code_unit_offset_of(ViewType const& view, CodeUnitIterator const& it) |
35 | 0 | { |
36 | | if constexpr (IsSame<ViewType, Utf8View>) |
37 | 0 | return view.byte_offset_of(it); |
38 | | else if constexpr (IsSame<ViewType, Utf16View>) |
39 | 0 | return view.code_unit_offset_of(it); |
40 | | else if constexpr (IsSame<ViewType, Utf32View>) |
41 | 0 | return view.iterator_offset(it); |
42 | | else |
43 | | static_assert(DependentFalse<ViewType>); |
44 | 0 | } Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_offset_of<AK::Utf8View, AK::Utf8CodePointIterator>(AK::Utf8View const&, AK::Utf8CodePointIterator const&) Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_offset_of<AK::Utf16View, AK::Utf16CodePointIterator>(AK::Utf16View const&, AK::Utf16CodePointIterator const&) Unexecuted instantiation: Segmentation.cpp:unsigned long Unicode::code_unit_offset_of<AK::Utf32View, AK::Utf32CodePointIterator>(AK::Utf32View const&, AK::Utf32CodePointIterator const&) |
45 | | |
46 | | template<typename ViewType> |
47 | | static void for_each_grapheme_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) |
48 | 0 | { |
49 | 0 | #if ENABLE_UNICODE_DATA |
50 | 0 | using GBP = GraphemeBreakProperty; |
51 | | |
52 | | // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules |
53 | 0 | if (view.is_empty()) |
54 | 0 | return; |
55 | | |
56 | 0 | auto has_any_gbp = [](u32 code_point, auto&&... properties) { |
57 | 0 | return (code_point_has_grapheme_break_property(code_point, properties) || ...); |
58 | 0 | }; Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_grapheme_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21GraphemeBreakPropertyESF_SF_EEEDajSC_ |
59 | |
|
60 | 0 | auto skip_incb_extend_linker_sequence = [&](auto& it) { |
61 | 0 | while (true) { |
62 | 0 | if (it == view.end() || !code_point_has_property(*it, Property::InCB_Extend)) |
63 | 0 | return; |
64 | | |
65 | 0 | auto next_it = it; |
66 | 0 | ++next_it; |
67 | |
|
68 | 0 | if (next_it == view.end() || !code_point_has_property(*next_it, Property::InCB_Linker)) |
69 | 0 | return; |
70 | | |
71 | 0 | it = next_it; |
72 | 0 | ++it; |
73 | 0 | } |
74 | 0 | }; Unexecuted instantiation: Segmentation.cpp:auto Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>)::{lambda(auto:1&)#1}::operator()<AK::Utf8CodePointIterator>(AK::Utf8CodePointIterator&) constUnexecuted instantiation: Segmentation.cpp:auto Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>)::{lambda(auto:1&)#1}::operator()<AK::Utf16CodePointIterator>(AK::Utf16CodePointIterator&) constUnexecuted instantiation: Segmentation.cpp:auto Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>)::{lambda(auto:1&)#1}::operator()<AK::Utf32CodePointIterator>(AK::Utf32CodePointIterator&) const |
75 | | |
76 | | // GB1 |
77 | 0 | if (callback(0) == IterationDecision::Break) |
78 | 0 | return; |
79 | | |
80 | 0 | if (code_unit_length(view) > 1) { |
81 | 0 | auto it = view.begin(); |
82 | 0 | auto code_point = *it; |
83 | 0 | u32 next_code_point = 0; |
84 | 0 | auto current_ri_chain = 0; |
85 | |
|
86 | 0 | for (++it; it != view.end(); ++it, code_point = next_code_point) { |
87 | 0 | next_code_point = *it; |
88 | | |
89 | | // GB9c |
90 | 0 | if (code_point_has_property(code_point, Property::InCB_Consonant)) { |
91 | 0 | auto it_copy = it; |
92 | 0 | skip_incb_extend_linker_sequence(it_copy); |
93 | |
|
94 | 0 | if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Linker)) { |
95 | 0 | ++it_copy; |
96 | 0 | skip_incb_extend_linker_sequence(it_copy); |
97 | |
|
98 | 0 | if (it_copy != view.end() && code_point_has_property(*it_copy, Property::InCB_Consonant)) { |
99 | 0 | next_code_point = *it_copy; |
100 | 0 | it = it_copy; |
101 | 0 | continue; |
102 | 0 | } |
103 | 0 | } |
104 | 0 | } |
105 | | |
106 | | // GB11 |
107 | 0 | if (code_point_has_property(code_point, Property::Extended_Pictographic) && has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) { |
108 | 0 | auto it_copy = it; |
109 | |
|
110 | 0 | while (it_copy != view.end() && has_any_gbp(*it_copy, GBP::Extend)) |
111 | 0 | ++it_copy; |
112 | |
|
113 | 0 | if (it_copy != view.end() && has_any_gbp(*it_copy, GBP::ZWJ)) { |
114 | 0 | ++it_copy; |
115 | |
|
116 | 0 | if (it_copy != view.end() && code_point_has_property(*it_copy, Property::Extended_Pictographic)) { |
117 | 0 | next_code_point = *it_copy; |
118 | 0 | it = it_copy; |
119 | 0 | continue; |
120 | 0 | } |
121 | 0 | } |
122 | 0 | } |
123 | | |
124 | 0 | auto code_point_is_cr = has_any_gbp(code_point, GBP::CR); |
125 | 0 | auto next_code_point_is_lf = has_any_gbp(next_code_point, GBP::LF); |
126 | | |
127 | | // GB3 |
128 | 0 | if (code_point_is_cr && next_code_point_is_lf) |
129 | 0 | continue; |
130 | | // GB4, GB5 |
131 | 0 | if (code_point_is_cr || next_code_point_is_lf || has_any_gbp(next_code_point, GBP::CR, GBP::Control) || has_any_gbp(code_point, GBP::LF, GBP::Control)) { |
132 | 0 | if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) |
133 | 0 | return; |
134 | 0 | continue; |
135 | 0 | } |
136 | | |
137 | 0 | auto next_code_point_is_v = has_any_gbp(next_code_point, GBP::V); |
138 | 0 | auto next_code_point_is_t = has_any_gbp(next_code_point, GBP::T); |
139 | | |
140 | | // GB6 |
141 | 0 | if (has_any_gbp(code_point, GBP::L) && (next_code_point_is_v || has_any_gbp(next_code_point, GBP::L, GBP::LV, GBP::LVT))) |
142 | 0 | continue; |
143 | | // GB7 |
144 | 0 | if ((next_code_point_is_v || next_code_point_is_t) && has_any_gbp(code_point, GBP::LV, GBP::V)) |
145 | 0 | continue; |
146 | | // GB8 |
147 | 0 | if (next_code_point_is_t && has_any_gbp(code_point, GBP::LVT, GBP::T)) |
148 | 0 | continue; |
149 | | |
150 | | // GB9 |
151 | 0 | if (has_any_gbp(next_code_point, GBP::Extend, GBP::ZWJ)) |
152 | 0 | continue; |
153 | | // GB9a |
154 | 0 | if (has_any_gbp(next_code_point, GBP::SpacingMark)) |
155 | 0 | continue; |
156 | | // GB9b |
157 | 0 | if (has_any_gbp(code_point, GBP::Prepend)) |
158 | 0 | continue; |
159 | | |
160 | 0 | auto code_point_is_ri = has_any_gbp(code_point, GBP::Regional_Indicator); |
161 | 0 | current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; |
162 | | |
163 | | // GB12, GB13 |
164 | 0 | if (code_point_is_ri && has_any_gbp(next_code_point, GBP::Regional_Indicator) && current_ri_chain % 2 == 1) |
165 | 0 | continue; |
166 | | |
167 | | // GB999 |
168 | 0 | if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) |
169 | 0 | return; |
170 | 0 | } |
171 | 0 | } |
172 | | |
173 | | // GB2 |
174 | 0 | callback(code_unit_length(view)); |
175 | 0 | #endif |
176 | 0 | } Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>) Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>) Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_grapheme_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>) |
177 | | |
178 | | void for_each_grapheme_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) |
179 | 0 | { |
180 | 0 | for_each_grapheme_segmentation_boundary_impl(view, move(callback)); |
181 | 0 | } |
182 | | |
183 | | void for_each_grapheme_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) |
184 | 0 | { |
185 | 0 | for_each_grapheme_segmentation_boundary_impl(view, move(callback)); |
186 | 0 | } |
187 | | |
188 | | void for_each_grapheme_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) |
189 | 0 | { |
190 | 0 | for_each_grapheme_segmentation_boundary_impl(view, move(callback)); |
191 | 0 | } |
192 | | |
193 | | template<typename ViewType> |
194 | | static void for_each_word_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) |
195 | 0 | { |
196 | 0 | #if ENABLE_UNICODE_DATA |
197 | 0 | using WBP = WordBreakProperty; |
198 | | |
199 | | // https://www.unicode.org/reports/tr29/#Word_Boundary_Rules |
200 | 0 | if (view.is_empty()) |
201 | 0 | return; |
202 | | |
203 | 0 | auto has_any_wbp = [](u32 code_point, auto&&... properties) { |
204 | 0 | return (code_point_has_word_break_property(code_point, properties) || ...); |
205 | 0 | }; Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL40for_each_word_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_17WordBreakPropertyESF_SF_EEEDajSC_ |
206 | | |
207 | | // WB1 |
208 | 0 | if (callback(0) == IterationDecision::Break) |
209 | 0 | return; |
210 | | |
211 | 0 | if (code_unit_length(view) > 1) { |
212 | 0 | auto it = view.begin(); |
213 | 0 | auto code_point = *it; |
214 | 0 | u32 next_code_point; |
215 | 0 | Optional<u32> previous_code_point; |
216 | 0 | auto current_ri_chain = 0; |
217 | |
|
218 | 0 | for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { |
219 | 0 | next_code_point = *it; |
220 | |
|
221 | 0 | auto code_point_is_cr = has_any_wbp(code_point, WBP::CR); |
222 | 0 | auto next_code_point_is_lf = has_any_wbp(next_code_point, WBP::LF); |
223 | | |
224 | | // WB3 |
225 | 0 | if (code_point_is_cr && next_code_point_is_lf) |
226 | 0 | continue; |
227 | | // WB3a, WB3b |
228 | 0 | if (code_point_is_cr || next_code_point_is_lf || has_any_wbp(next_code_point, WBP::CR, WBP::Newline) || has_any_wbp(code_point, WBP::LF, WBP::Newline)) { |
229 | 0 | if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) |
230 | 0 | return; |
231 | 0 | continue; |
232 | 0 | } |
233 | | // WB3c |
234 | 0 | if (has_any_wbp(code_point, WBP::ZWJ) && code_point_has_property(next_code_point, Property::Extended_Pictographic)) |
235 | 0 | continue; |
236 | | // WB3d |
237 | 0 | if (has_any_wbp(code_point, WBP::WSegSpace) && has_any_wbp(next_code_point, WBP::WSegSpace)) |
238 | 0 | continue; |
239 | | |
240 | | // WB4 |
241 | 0 | if (has_any_wbp(next_code_point, WBP::Format, WBP::Extend, WBP::ZWJ)) |
242 | 0 | continue; |
243 | | |
244 | 0 | auto code_point_is_hebrew_letter = has_any_wbp(code_point, WBP::Hebrew_Letter); |
245 | 0 | auto code_point_is_ah_letter = code_point_is_hebrew_letter || has_any_wbp(code_point, WBP::ALetter); |
246 | 0 | auto next_code_point_is_hebrew_letter = has_any_wbp(next_code_point, WBP::Hebrew_Letter); |
247 | 0 | auto next_code_point_is_ah_letter = next_code_point_is_hebrew_letter || has_any_wbp(next_code_point, WBP::ALetter); |
248 | | |
249 | | // WB5 |
250 | 0 | if (code_point_is_ah_letter && next_code_point_is_ah_letter) |
251 | 0 | continue; |
252 | | |
253 | 0 | Optional<u32> next_next_code_point; |
254 | 0 | if (it != view.end()) { |
255 | 0 | auto it_copy = it; |
256 | 0 | ++it_copy; |
257 | 0 | if (it_copy != view.end()) |
258 | 0 | next_next_code_point = *it_copy; |
259 | 0 | } |
260 | 0 | bool next_next_code_point_is_hebrew_letter = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Hebrew_Letter); |
261 | 0 | bool next_next_code_point_is_ah_letter = next_next_code_point_is_hebrew_letter || (next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::ALetter)); |
262 | |
|
263 | 0 | auto next_code_point_is_mid_num_let_q = has_any_wbp(next_code_point, WBP::MidNumLet, WBP::Single_Quote); |
264 | | |
265 | | // WB6 |
266 | 0 | if (code_point_is_ah_letter && next_next_code_point_is_ah_letter && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidLetter))) |
267 | 0 | continue; |
268 | | |
269 | 0 | auto code_point_is_mid_num_let_q = has_any_wbp(code_point, WBP::MidNumLet, WBP::Single_Quote); |
270 | 0 | auto previous_code_point_is_hebrew_letter = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Hebrew_Letter); |
271 | 0 | auto previous_code_point_is_ah_letter = previous_code_point_is_hebrew_letter || (previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::ALetter)); |
272 | | |
273 | | // WB7 |
274 | 0 | if (previous_code_point_is_ah_letter && next_code_point_is_ah_letter && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidLetter))) |
275 | 0 | continue; |
276 | | // WB7a |
277 | 0 | if (code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Single_Quote)) |
278 | 0 | continue; |
279 | | // WB7b |
280 | 0 | if (code_point_is_hebrew_letter && next_next_code_point_is_hebrew_letter && has_any_wbp(next_code_point, WBP::Double_Quote)) |
281 | 0 | continue; |
282 | | // WB7c |
283 | 0 | if (previous_code_point_is_hebrew_letter && next_code_point_is_hebrew_letter && has_any_wbp(code_point, WBP::Double_Quote)) |
284 | 0 | continue; |
285 | | |
286 | 0 | auto code_point_is_numeric = has_any_wbp(code_point, WBP::Numeric); |
287 | 0 | auto next_code_point_is_numeric = has_any_wbp(next_code_point, WBP::Numeric); |
288 | | |
289 | | // WB8 |
290 | 0 | if (code_point_is_numeric && next_code_point_is_numeric) |
291 | 0 | continue; |
292 | | // WB9 |
293 | 0 | if (code_point_is_ah_letter && next_code_point_is_numeric) |
294 | 0 | continue; |
295 | | // WB10 |
296 | 0 | if (code_point_is_numeric && next_code_point_is_ah_letter) |
297 | 0 | continue; |
298 | | |
299 | 0 | auto previous_code_point_is_numeric = previous_code_point.has_value() && has_any_wbp(*previous_code_point, WBP::Numeric); |
300 | | |
301 | | // WB11 |
302 | 0 | if (previous_code_point_is_numeric && next_code_point_is_numeric && (code_point_is_mid_num_let_q || has_any_wbp(code_point, WBP::MidNum))) |
303 | 0 | continue; |
304 | | |
305 | 0 | bool next_next_code_point_is_numeric = next_next_code_point.has_value() && has_any_wbp(*next_next_code_point, WBP::Numeric); |
306 | | |
307 | | // WB12 |
308 | 0 | if (code_point_is_numeric && next_next_code_point_is_numeric && (next_code_point_is_mid_num_let_q || has_any_wbp(next_code_point, WBP::MidNum))) |
309 | 0 | continue; |
310 | | |
311 | 0 | auto code_point_is_katakana = has_any_wbp(code_point, WBP::Katakana); |
312 | 0 | auto next_code_point_is_katakana = has_any_wbp(next_code_point, WBP::Katakana); |
313 | | |
314 | | // WB13 |
315 | 0 | if (code_point_is_katakana && next_code_point_is_katakana) |
316 | 0 | continue; |
317 | | |
318 | 0 | auto code_point_is_extend_num_let = has_any_wbp(code_point, WBP::ExtendNumLet); |
319 | | |
320 | | // WB13a |
321 | 0 | if ((code_point_is_ah_letter || code_point_is_numeric || code_point_is_katakana || code_point_is_extend_num_let) && has_any_wbp(next_code_point, WBP::ExtendNumLet)) |
322 | 0 | continue; |
323 | | // WB13b |
324 | 0 | if (code_point_is_extend_num_let && (next_code_point_is_ah_letter || next_code_point_is_numeric || next_code_point_is_katakana)) |
325 | 0 | continue; |
326 | | |
327 | 0 | auto code_point_is_ri = has_any_wbp(code_point, WBP::Regional_Indicator); |
328 | 0 | current_ri_chain = code_point_is_ri ? current_ri_chain + 1 : 0; |
329 | | |
330 | | // WB15, WB16 |
331 | 0 | if (code_point_is_ri && has_any_wbp(next_code_point, WBP::Regional_Indicator) && current_ri_chain % 2 == 1) |
332 | 0 | continue; |
333 | | |
334 | | // WB999 |
335 | 0 | if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) |
336 | 0 | return; |
337 | 0 | } |
338 | 0 | } |
339 | | |
340 | | // WB2 |
341 | 0 | callback(code_unit_length(view)); |
342 | 0 | #endif |
343 | 0 | } Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_word_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>) Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_word_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>) Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_word_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>) |
344 | | |
345 | | void for_each_word_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) |
346 | 0 | { |
347 | 0 | for_each_word_segmentation_boundary_impl(view, move(callback)); |
348 | 0 | } |
349 | | |
350 | | void for_each_word_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) |
351 | 0 | { |
352 | 0 | for_each_word_segmentation_boundary_impl(view, move(callback)); |
353 | 0 | } |
354 | | |
355 | | void for_each_word_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) |
356 | 0 | { |
357 | 0 | for_each_word_segmentation_boundary_impl(view, move(callback)); |
358 | 0 | } |
359 | | |
360 | | template<typename ViewType> |
361 | | static void for_each_sentence_segmentation_boundary_impl([[maybe_unused]] ViewType const& view, [[maybe_unused]] SegmentationCallback callback) |
362 | 0 | { |
363 | 0 | #if ENABLE_UNICODE_DATA |
364 | 0 | using SBP = SentenceBreakProperty; |
365 | | |
366 | | // https://www.unicode.org/reports/tr29/#Sentence_Boundary_Rules |
367 | 0 | if (view.is_empty()) |
368 | 0 | return; |
369 | | |
370 | 0 | auto has_any_sbp = [](u32 code_point, auto&&... properties) { |
371 | 0 | return (code_point_has_sentence_break_property(code_point, properties) || ...); |
372 | 0 | }; Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_SF_SF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK8Utf8ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_SF_SF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf16ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyEEEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_SF_SF_SF_EEEDajSC_ Unexecuted instantiation: Segmentation.cpp:_ZZN7UnicodeL44for_each_sentence_segmentation_boundary_implIN2AK9Utf32ViewEEEvRKT_NS1_8FunctionIFNS1_17IterationDecisionEmEEEENKUljDpOT_E_clIJNS_21SentenceBreakPropertyESF_SF_EEEDajSC_ |
373 | | |
374 | | // SB1 |
375 | 0 | if (callback(0) == IterationDecision::Break) |
376 | 0 | return; |
377 | | |
378 | 0 | if (code_unit_length(view) > 1) { |
379 | 0 | auto it = view.begin(); |
380 | 0 | auto code_point = *it; |
381 | 0 | u32 next_code_point; |
382 | 0 | Optional<u32> previous_code_point; |
383 | 0 | enum class TerminatorSequenceState { |
384 | 0 | None, |
385 | 0 | Term, |
386 | 0 | Close, |
387 | 0 | Sp |
388 | 0 | } terminator_sequence_state { TerminatorSequenceState::None }; |
389 | 0 | auto term_was_a_term = false; |
390 | |
|
391 | 0 | for (++it; it != view.end(); ++it, previous_code_point = code_point, code_point = next_code_point) { |
392 | 0 | next_code_point = *it; |
393 | |
|
394 | 0 | auto code_point_is_cr = has_any_sbp(code_point, SBP::CR); |
395 | 0 | auto next_code_point_is_lf = has_any_sbp(next_code_point, SBP::LF); |
396 | | |
397 | | // SB3 |
398 | 0 | if (code_point_is_cr && next_code_point_is_lf) |
399 | 0 | continue; |
400 | | |
401 | 0 | auto code_point_is_para_sep = code_point_is_cr || has_any_sbp(code_point, SBP::LF, SBP::Sep); |
402 | | |
403 | | // SB4 |
404 | 0 | if (code_point_is_para_sep) { |
405 | 0 | if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) |
406 | 0 | return; |
407 | 0 | continue; |
408 | 0 | } |
409 | | |
410 | | // SB5 |
411 | 0 | if (has_any_sbp(next_code_point, SBP::Format, SBP::Extend)) |
412 | 0 | continue; |
413 | | |
414 | 0 | auto code_point_is_a_term = has_any_sbp(code_point, SBP::ATerm); |
415 | | |
416 | | // SB6 |
417 | 0 | if (code_point_is_a_term && has_any_sbp(next_code_point, SBP::Numeric)) |
418 | 0 | continue; |
419 | | // SB7 |
420 | 0 | if (code_point_is_a_term && previous_code_point.has_value() && has_any_sbp(*previous_code_point, SBP::Upper, SBP::Lower) && has_any_sbp(next_code_point, SBP::Upper)) |
421 | 0 | continue; |
422 | | |
423 | 0 | if (code_point_is_a_term || has_any_sbp(code_point, SBP::STerm)) { |
424 | 0 | terminator_sequence_state = TerminatorSequenceState::Term; |
425 | 0 | term_was_a_term = code_point_is_a_term; |
426 | 0 | } else if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && has_any_sbp(code_point, SBP::Close)) { |
427 | 0 | terminator_sequence_state = TerminatorSequenceState::Close; |
428 | 0 | } else if (terminator_sequence_state >= TerminatorSequenceState::Term && has_any_sbp(code_point, SBP::Sp)) { |
429 | 0 | terminator_sequence_state = TerminatorSequenceState::Sp; |
430 | 0 | } else { |
431 | 0 | terminator_sequence_state = TerminatorSequenceState::None; |
432 | 0 | } |
433 | | |
434 | | // SB8 |
435 | 0 | if (terminator_sequence_state >= TerminatorSequenceState::Term && term_was_a_term) { |
436 | 0 | auto it_copy = it; |
437 | 0 | bool illegal_sequence = false; |
438 | 0 | for (auto sequence_code_point = *it_copy; it_copy != view.end(); ++it_copy) { |
439 | 0 | if (has_any_sbp(sequence_code_point, SBP::Close, SBP::SContinue, SBP::Numeric, SBP::Sp, SBP::Format, SBP::Extend)) |
440 | 0 | continue; |
441 | 0 | illegal_sequence = has_any_sbp(sequence_code_point, SBP::Lower); |
442 | 0 | } |
443 | 0 | if (illegal_sequence) |
444 | 0 | continue; |
445 | 0 | } |
446 | | |
447 | | // SB8a |
448 | 0 | if (terminator_sequence_state >= TerminatorSequenceState::Term && (has_any_sbp(next_code_point, SBP::SContinue, SBP::STerm, SBP::ATerm))) |
449 | 0 | continue; |
450 | | |
451 | 0 | auto next_code_point_is_sp = has_any_sbp(next_code_point, SBP::Sp); |
452 | 0 | auto next_code_point_is_para_sep = has_any_sbp(next_code_point, SBP::Sep, SBP::CR, SBP::LF); |
453 | | |
454 | | // SB9 |
455 | 0 | if (terminator_sequence_state >= TerminatorSequenceState::Term && terminator_sequence_state <= TerminatorSequenceState::Close && (next_code_point_is_sp || next_code_point_is_para_sep || has_any_sbp(next_code_point, SBP::Close))) |
456 | 0 | continue; |
457 | | |
458 | | // SB10 |
459 | 0 | if (terminator_sequence_state >= TerminatorSequenceState::Term && (next_code_point_is_sp || next_code_point_is_para_sep)) |
460 | 0 | continue; |
461 | | |
462 | | // SB11 |
463 | 0 | if (terminator_sequence_state >= TerminatorSequenceState::Term) |
464 | 0 | if (callback(code_unit_offset_of(view, it)) == IterationDecision::Break) |
465 | 0 | return; |
466 | | |
467 | | // SB998 |
468 | 0 | } |
469 | 0 | } |
470 | | |
471 | | // SB2 |
472 | 0 | callback(code_unit_length(view)); |
473 | 0 | #endif |
474 | 0 | } Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_sentence_segmentation_boundary_impl<AK::Utf8View>(AK::Utf8View const&, AK::Function<AK::IterationDecision (unsigned long)>) Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_sentence_segmentation_boundary_impl<AK::Utf16View>(AK::Utf16View const&, AK::Function<AK::IterationDecision (unsigned long)>) Unexecuted instantiation: Segmentation.cpp:void Unicode::for_each_sentence_segmentation_boundary_impl<AK::Utf32View>(AK::Utf32View const&, AK::Function<AK::IterationDecision (unsigned long)>) |
475 | | |
476 | | void for_each_sentence_segmentation_boundary(Utf8View const& view, SegmentationCallback callback) |
477 | 0 | { |
478 | 0 | for_each_sentence_segmentation_boundary_impl(view, move(callback)); |
479 | 0 | } |
480 | | |
481 | | void for_each_sentence_segmentation_boundary(Utf16View const& view, SegmentationCallback callback) |
482 | 0 | { |
483 | 0 | for_each_sentence_segmentation_boundary_impl(view, move(callback)); |
484 | 0 | } |
485 | | |
486 | | void for_each_sentence_segmentation_boundary(Utf32View const& view, SegmentationCallback callback) |
487 | 0 | { |
488 | 0 | for_each_sentence_segmentation_boundary_impl(view, move(callback)); |
489 | 0 | } |
490 | | |
491 | | } |