/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_segmenter-1.5.0/src/line.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | use crate::complex::*; |
6 | | use crate::indices::*; |
7 | | use crate::provider::*; |
8 | | use crate::SegmenterError; |
9 | | use alloc::string::String; |
10 | | use alloc::vec; |
11 | | use alloc::vec::Vec; |
12 | | use core::char; |
13 | | use core::str::CharIndices; |
14 | | use icu_provider::prelude::*; |
15 | | use utf8_iter::Utf8CharIndices; |
16 | | |
17 | | // TODO(#1637): These constants should be data driven. |
18 | | #[allow(dead_code)] |
19 | | const UNKNOWN: u8 = 0; |
20 | | #[allow(dead_code)] |
21 | | const AI: u8 = 1; |
22 | | #[allow(dead_code)] |
23 | | const AL: u8 = 2; |
24 | | #[allow(dead_code)] |
25 | | const B2: u8 = 3; |
26 | | #[allow(dead_code)] |
27 | | const BA: u8 = 4; |
28 | | #[allow(dead_code)] |
29 | | const BB: u8 = 5; |
30 | | #[allow(dead_code)] |
31 | | const BK: u8 = 6; |
32 | | #[allow(dead_code)] |
33 | | const CB: u8 = 7; |
34 | | #[allow(dead_code)] |
35 | | const CJ: u8 = 8; |
36 | | #[allow(dead_code)] |
37 | | const CL: u8 = 9; |
38 | | #[allow(dead_code)] |
39 | | const CM: u8 = 10; |
40 | | #[allow(dead_code)] |
41 | | const CP: u8 = 11; |
42 | | #[allow(dead_code)] |
43 | | const CR: u8 = 12; |
44 | | #[allow(dead_code)] |
45 | | const EB: u8 = 13; |
46 | | #[allow(dead_code)] |
47 | | const EM: u8 = 14; |
48 | | #[allow(dead_code)] |
49 | | const EX: u8 = 15; |
50 | | #[allow(dead_code)] |
51 | | const GL: u8 = 16; |
52 | | #[allow(dead_code)] |
53 | | const H2: u8 = 17; |
54 | | #[allow(dead_code)] |
55 | | const H3: u8 = 18; |
56 | | #[allow(dead_code)] |
57 | | const HL: u8 = 19; |
58 | | #[allow(dead_code)] |
59 | | const HY: u8 = 20; |
60 | | #[allow(dead_code)] |
61 | | const ID: u8 = 21; |
62 | | #[allow(dead_code)] |
63 | | const ID_CN: u8 = 22; |
64 | | #[allow(dead_code)] |
65 | | const IN: u8 = 23; |
66 | | #[allow(dead_code)] |
67 | | const IS: u8 = 24; |
68 | | #[allow(dead_code)] |
69 | | const JL: u8 = 25; |
70 | | #[allow(dead_code)] |
71 | | const JT: u8 = 26; |
72 | | #[allow(dead_code)] |
73 | | const JV: u8 = 27; |
74 | | #[allow(dead_code)] |
75 | | const LF: u8 = 28; |
76 | | #[allow(dead_code)] |
77 | | const NL: u8 = 29; |
78 | | #[allow(dead_code)] |
79 | | const NS: u8 = 30; |
80 | | #[allow(dead_code)] |
81 | | const NU: u8 = 31; |
82 | | #[allow(dead_code)] |
83 | | const OP_EA: u8 = 32; |
84 | | #[allow(dead_code)] |
85 | | const OP_OP30: u8 = 33; |
86 | | #[allow(dead_code)] |
87 | | const PO: u8 = 34; |
88 | | #[allow(dead_code)] |
89 | | const PO_EAW: u8 = 35; |
90 | | #[allow(dead_code)] |
91 | | const PR: u8 = 36; |
92 | | #[allow(dead_code)] |
93 | | const PR_EAW: u8 = 37; |
94 | | #[allow(dead_code)] |
95 | | const QU: u8 = 38; |
96 | | #[allow(dead_code)] |
97 | | const RI: u8 = 39; |
98 | | #[allow(dead_code)] |
99 | | const SA: u8 = 40; |
100 | | #[allow(dead_code)] |
101 | | const SG: u8 = 41; |
102 | | #[allow(dead_code)] |
103 | | const SP: u8 = 42; |
104 | | #[allow(dead_code)] |
105 | | const SY: u8 = 43; |
106 | | #[allow(dead_code)] |
107 | | const WJ: u8 = 44; |
108 | | #[allow(dead_code)] |
109 | | const XX: u8 = 45; |
110 | | #[allow(dead_code)] |
111 | | const ZW: u8 = 46; |
112 | | #[allow(dead_code)] |
113 | | const ZWJ: u8 = 47; |
114 | | |
115 | | /// An enum that specifies the strictness of line-breaking rules. It can be passed as |
116 | | /// an argument when creating a line segmenter. |
117 | | /// |
118 | | /// Each enum value has the same meaning with respect to the `line-break` |
119 | | /// property values in the CSS Text spec. See the details in |
120 | | /// <https://drafts.csswg.org/css-text-3/#line-break-property>. |
121 | | #[non_exhaustive] |
122 | | #[derive(Copy, Clone, PartialEq, Eq, Debug)] |
123 | | pub enum LineBreakStrictness { |
124 | | /// Breaks text using the least restrictive set of line-breaking rules. |
125 | | /// Typically used for short lines, such as in newspapers. |
126 | | /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose> |
127 | | Loose, |
128 | | |
129 | | /// Breaks text using the most common set of line-breaking rules. |
130 | | /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal> |
131 | | Normal, |
132 | | |
133 | | /// Breaks text using the most stringent set of line-breaking rules. |
134 | | /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict> |
135 | | /// |
136 | | /// This is the default behaviour of the Unicode Line Breaking Algorithm, |
137 | | /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to |
138 | | /// [NS](https://www.unicode.org/reports/tr14/#NS); |
139 | | /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1). |
140 | | Strict, |
141 | | |
142 | | /// Breaks text assuming there is a soft wrap opportunity around every |
143 | | /// typographic character unit, disregarding any prohibition against line |
144 | | /// breaks. See more details in |
145 | | /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>. |
146 | | Anywhere, |
147 | | } |
148 | | |
149 | | /// An enum that specifies the line break opportunities between letters. It can be |
150 | | /// passed as an argument when creating a line segmenter. |
151 | | /// |
152 | | /// Each enum value has the same meaning with respect to the `word-break` |
153 | | /// property values in the CSS Text spec. See the details in |
154 | | /// <https://drafts.csswg.org/css-text-3/#word-break-property> |
155 | | #[non_exhaustive] |
156 | | #[derive(Copy, Clone, PartialEq, Eq, Debug)] |
157 | | pub enum LineBreakWordOption { |
158 | | /// Words break according to their customary rules. See the details in |
159 | | /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>. |
160 | | Normal, |
161 | | |
162 | | /// Breaking is allowed within "words". |
163 | | /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all> |
164 | | BreakAll, |
165 | | |
166 | | /// Breaking is forbidden within "words". |
167 | | /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all> |
168 | | KeepAll, |
169 | | } |
170 | | |
171 | | /// Options to tailor line-breaking behavior. |
172 | | #[non_exhaustive] |
173 | | #[derive(Copy, Clone, PartialEq, Eq, Debug)] |
174 | | pub struct LineBreakOptions { |
175 | | /// Strictness of line-breaking rules. See [`LineBreakStrictness`]. |
176 | | pub strictness: LineBreakStrictness, |
177 | | |
178 | | /// Line break opportunities between letters. See [`LineBreakWordOption`]. |
179 | | pub word_option: LineBreakWordOption, |
180 | | |
181 | | /// Use `true` as a hint to the line segmenter that the writing |
182 | | /// system is Chinese or Japanese. This allows more break opportunities when |
183 | | /// `LineBreakStrictness` is `Normal` or `Loose`. See |
184 | | /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details. |
185 | | /// |
186 | | /// This option has no effect in Latin-1 mode. |
187 | | pub ja_zh: bool, |
188 | | } |
189 | | |
190 | | impl Default for LineBreakOptions { |
191 | 0 | fn default() -> Self { |
192 | 0 | Self { |
193 | 0 | strictness: LineBreakStrictness::Strict, |
194 | 0 | word_option: LineBreakWordOption::Normal, |
195 | 0 | ja_zh: false, |
196 | 0 | } |
197 | 0 | } |
198 | | } |
199 | | |
200 | | /// Line break iterator for an `str` (a UTF-8 string). |
201 | | /// |
202 | | /// For examples of use, see [`LineSegmenter`]. |
203 | | pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>; |
204 | | |
205 | | /// Line break iterator for a potentially invalid UTF-8 string. |
206 | | /// |
207 | | /// For examples of use, see [`LineSegmenter`]. |
208 | | pub type LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> = |
209 | | LineBreakIterator<'l, 's, LineBreakTypePotentiallyIllFormedUtf8>; |
210 | | |
211 | | /// Line break iterator for a Latin-1 (8-bit) string. |
212 | | /// |
213 | | /// For examples of use, see [`LineSegmenter`]. |
214 | | pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>; |
215 | | |
216 | | /// Line break iterator for a UTF-16 string. |
217 | | /// |
218 | | /// For examples of use, see [`LineSegmenter`]. |
219 | | pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>; |
220 | | |
221 | | /// Supports loading line break data, and creating line break iterators for different string |
222 | | /// encodings. |
223 | | /// |
224 | | /// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of |
225 | | /// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as |
226 | | /// line break opportunities ([definition LD3][LD3]). |
227 | | /// It does not distinguish them. Callers requiring that distinction can check |
228 | | /// the Line_Break property of the code point preceding the break against those |
229 | | /// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text |
230 | | /// according to [LB3][LB3]. |
231 | | /// |
232 | | /// For consistency with the grapheme, word, and sentence segmenters, there is |
233 | | /// always a breakpoint returned at index 0, but this breakpoint is not a |
234 | | /// meaningful line break opportunity. |
235 | | /// |
236 | | /// [LD3]: https://www.unicode.org/reports/tr14/#LD3 |
237 | | /// [LD7]: https://www.unicode.org/reports/tr14/#LD7 |
238 | | /// [LB3]: https://www.unicode.org/reports/tr14/#LB3 |
239 | | /// [LB4]: https://www.unicode.org/reports/tr14/#LB4 |
240 | | /// [LB5]: https://www.unicode.org/reports/tr14/#LB5 |
241 | | /// |
242 | | /// ```rust |
243 | | /// # use icu::segmenter::LineSegmenter; |
244 | | /// # |
245 | | /// # let segmenter = LineSegmenter::new_auto(); |
246 | | /// # |
247 | | /// let text = "Summary\r\nThis annex…"; |
248 | | /// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect(); |
249 | | /// // 9 and 22 are mandatory breaks, 14 is a line break opportunity. |
250 | | /// assert_eq!(&breakpoints, &[0, 9, 14, 22]); |
251 | | /// |
252 | | /// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️🌈. |
253 | | /// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈"; |
254 | | /// let possible_first_lines: Vec<&str> = |
255 | | /// segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect(); |
256 | | /// assert_eq!( |
257 | | /// &possible_first_lines, |
258 | | /// &[ |
259 | | /// "🏳️", |
260 | | /// "🏳️➕", |
261 | | /// "🏳️➕🌈", |
262 | | /// "🏳️➕🌈🟰", |
263 | | /// "🏳️➕🌈🟰🏳️🌈" |
264 | | /// ] |
265 | | /// ); |
266 | | /// ``` |
267 | | /// |
268 | | /// # Examples |
269 | | /// |
270 | | /// Segment a string with default options: |
271 | | /// |
272 | | /// ```rust |
273 | | /// use icu::segmenter::LineSegmenter; |
274 | | /// |
275 | | /// let segmenter = LineSegmenter::new_auto(); |
276 | | /// |
277 | | /// let breakpoints: Vec<usize> = |
278 | | /// segmenter.segment_str("Hello World").collect(); |
279 | | /// assert_eq!(&breakpoints, &[0, 6, 11]); |
280 | | /// ``` |
281 | | /// |
282 | | /// Segment a string with CSS option overrides: |
283 | | /// |
284 | | /// ```rust |
285 | | /// use icu::segmenter::{ |
286 | | /// LineBreakOptions, LineBreakStrictness, LineBreakWordOption, |
287 | | /// LineSegmenter, |
288 | | /// }; |
289 | | /// |
290 | | /// let mut options = LineBreakOptions::default(); |
291 | | /// options.strictness = LineBreakStrictness::Strict; |
292 | | /// options.word_option = LineBreakWordOption::BreakAll; |
293 | | /// options.ja_zh = false; |
294 | | /// let segmenter = LineSegmenter::new_auto_with_options(options); |
295 | | /// |
296 | | /// let breakpoints: Vec<usize> = |
297 | | /// segmenter.segment_str("Hello World").collect(); |
298 | | /// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]); |
299 | | /// ``` |
300 | | /// |
301 | | /// Segment a Latin1 byte string: |
302 | | /// |
303 | | /// ```rust |
304 | | /// use icu::segmenter::LineSegmenter; |
305 | | /// |
306 | | /// let segmenter = LineSegmenter::new_auto(); |
307 | | /// |
308 | | /// let breakpoints: Vec<usize> = |
309 | | /// segmenter.segment_latin1(b"Hello World").collect(); |
310 | | /// assert_eq!(&breakpoints, &[0, 6, 11]); |
311 | | /// ``` |
312 | | /// |
313 | | /// Separate mandatory breaks from the break opportunities: |
314 | | /// |
315 | | /// ```rust |
316 | | /// use icu::properties::{maps, LineBreak}; |
317 | | /// use icu::segmenter::LineSegmenter; |
318 | | /// |
319 | | /// # let segmenter = LineSegmenter::new_auto(); |
320 | | /// # |
321 | | /// let text = "Summary\r\nThis annex…"; |
322 | | /// |
323 | | /// let mandatory_breaks: Vec<usize> = segmenter |
324 | | /// .segment_str(text) |
325 | | /// .into_iter() |
326 | | /// .filter(|&i| { |
327 | | /// text[..i].chars().next_back().map_or(false, |c| { |
328 | | /// matches!( |
329 | | /// maps::line_break().get(c), |
330 | | /// LineBreak::MandatoryBreak |
331 | | /// | LineBreak::CarriageReturn |
332 | | /// | LineBreak::LineFeed |
333 | | /// | LineBreak::NextLine |
334 | | /// ) || i == text.len() |
335 | | /// }) |
336 | | /// }) |
337 | | /// .collect(); |
338 | | /// assert_eq!(&mandatory_breaks, &[9, 22]); |
339 | | /// ``` |
340 | | #[derive(Debug)] |
341 | | pub struct LineSegmenter { |
342 | | options: LineBreakOptions, |
343 | | payload: DataPayload<LineBreakDataV1Marker>, |
344 | | complex: ComplexPayloads, |
345 | | } |
346 | | |
347 | | impl LineSegmenter { |
348 | | /// Constructs a [`LineSegmenter`] with an invariant locale and the best available compiled data for |
349 | | /// complex scripts (Khmer, Lao, Myanmar, and Thai). |
350 | | /// |
351 | | /// The current behavior, which is subject to change, is to use the LSTM model when available. |
352 | | /// |
353 | | /// See also [`Self::new_auto_with_options`]. |
354 | | /// |
355 | | /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.* |
356 | | /// |
357 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
358 | | #[cfg(feature = "compiled_data")] |
359 | | #[cfg(feature = "auto")] |
360 | 0 | pub fn new_auto() -> Self { |
361 | 0 | Self::new_auto_with_options(Default::default()) |
362 | 0 | } |
363 | | |
364 | | #[cfg(feature = "auto")] |
365 | | icu_provider::gen_any_buffer_data_constructors!( |
366 | | locale: skip, |
367 | | options: skip, |
368 | | error: SegmenterError, |
369 | | #[cfg(skip)] |
370 | | functions: [ |
371 | | new_auto, |
372 | | try_new_auto_with_any_provider, |
373 | | try_new_auto_with_buffer_provider, |
374 | | try_new_auto_unstable, |
375 | | Self, |
376 | | ] |
377 | | ); |
378 | | |
379 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)] |
380 | | #[cfg(feature = "auto")] |
381 | 0 | pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError> |
382 | 0 | where |
383 | 0 | D: DataProvider<LineBreakDataV1Marker> |
384 | 0 | + DataProvider<LstmForWordLineAutoV1Marker> |
385 | 0 | + DataProvider<GraphemeClusterBreakDataV1Marker> |
386 | 0 | + ?Sized, |
387 | 0 | { |
388 | 0 | Self::try_new_auto_with_options_unstable(provider, Default::default()) |
389 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_unstable::<_> |
390 | | |
391 | | /// Constructs a [`LineSegmenter`] with an invariant locale and compiled LSTM data for |
392 | | /// complex scripts (Khmer, Lao, Myanmar, and Thai). |
393 | | /// |
394 | | /// The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than |
395 | | /// the full dictionary but more expensive during segmentation (inference). |
396 | | /// |
397 | | /// See also [`Self::new_lstm_with_options`]. |
398 | | /// |
399 | | /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.* |
400 | | /// |
401 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
402 | | #[cfg(feature = "compiled_data")] |
403 | | #[cfg(feature = "lstm")] |
404 | 0 | pub fn new_lstm() -> Self { |
405 | 0 | Self::new_lstm_with_options(Default::default()) |
406 | 0 | } |
407 | | |
408 | | #[cfg(feature = "lstm")] |
409 | | icu_provider::gen_any_buffer_data_constructors!( |
410 | | locale: skip, |
411 | | options: skip, |
412 | | error: SegmenterError, |
413 | | #[cfg(skip)] |
414 | | functions: [ |
415 | | new_lstm, |
416 | | try_new_lstm_with_any_provider, |
417 | | try_new_lstm_with_buffer_provider, |
418 | | try_new_lstm_unstable, |
419 | | Self, |
420 | | ] |
421 | | ); |
422 | | |
423 | | #[cfg(feature = "lstm")] |
424 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)] |
425 | 0 | pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError> |
426 | 0 | where |
427 | 0 | D: DataProvider<LineBreakDataV1Marker> |
428 | 0 | + DataProvider<LstmForWordLineAutoV1Marker> |
429 | 0 | + DataProvider<GraphemeClusterBreakDataV1Marker> |
430 | 0 | + ?Sized, |
431 | 0 | { |
432 | 0 | Self::try_new_lstm_with_options_unstable(provider, Default::default()) |
433 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_unstable::<_> |
434 | | |
435 | | /// Constructs a [`LineSegmenter`] with an invariant locale and compiled dictionary data for |
436 | | /// complex scripts (Khmer, Lao, Myanmar, and Thai). |
437 | | /// |
438 | | /// The dictionary model uses a list of words to determine appropriate breakpoints. It is |
439 | | /// faster than the LSTM model but requires more data. |
440 | | /// |
441 | | /// See also [`Self::new_dictionary_with_options`]. |
442 | | /// |
443 | | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
444 | | /// |
445 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
446 | | #[cfg(feature = "compiled_data")] |
447 | 0 | pub fn new_dictionary() -> Self { |
448 | 0 | Self::new_dictionary_with_options(Default::default()) |
449 | 0 | } |
450 | | |
451 | | icu_provider::gen_any_buffer_data_constructors!( |
452 | | locale: skip, |
453 | | options: skip, |
454 | | error: SegmenterError, |
455 | | #[cfg(skip)] |
456 | | functions: [ |
457 | | new_dictionary, |
458 | | try_new_dictionary_with_any_provider, |
459 | | try_new_dictionary_with_buffer_provider, |
460 | | try_new_dictionary_unstable, |
461 | | Self, |
462 | | ] |
463 | | ); |
464 | | |
465 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)] |
466 | 0 | pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError> |
467 | 0 | where |
468 | 0 | D: DataProvider<LineBreakDataV1Marker> |
469 | 0 | + DataProvider<DictionaryForWordLineExtendedV1Marker> |
470 | 0 | + DataProvider<GraphemeClusterBreakDataV1Marker> |
471 | 0 | + ?Sized, |
472 | 0 | { |
473 | 0 | Self::try_new_dictionary_with_options_unstable(provider, Default::default()) |
474 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_unstable::<_> |
475 | | |
476 | | /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and |
477 | | /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai). |
478 | | /// |
479 | | /// The current behavior, which is subject to change, is to use the LSTM model when available. |
480 | | /// |
481 | | /// See also [`Self::new_auto`]. |
482 | | /// |
483 | | /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.* |
484 | | /// |
485 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
486 | | #[cfg(feature = "auto")] |
487 | | #[cfg(feature = "compiled_data")] |
488 | 0 | pub fn new_auto_with_options(options: LineBreakOptions) -> Self { |
489 | 0 | Self::new_lstm_with_options(options) |
490 | 0 | } |
491 | | |
492 | | #[cfg(feature = "auto")] |
493 | | icu_provider::gen_any_buffer_data_constructors!( |
494 | | locale: skip, |
495 | | options: LineBreakOptions, |
496 | | error: SegmenterError, |
497 | | #[cfg(skip)] |
498 | | functions: [ |
499 | | new_auto_with_options, |
500 | | try_new_auto_with_options_with_any_provider, |
501 | | try_new_auto_with_options_with_buffer_provider, |
502 | | try_new_auto_with_options_unstable, |
503 | | Self, |
504 | | ] |
505 | | ); |
506 | | |
507 | | #[cfg(feature = "auto")] |
508 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto_with_options)] |
509 | 0 | pub fn try_new_auto_with_options_unstable<D>( |
510 | 0 | provider: &D, |
511 | 0 | options: LineBreakOptions, |
512 | 0 | ) -> Result<Self, SegmenterError> |
513 | 0 | where |
514 | 0 | D: DataProvider<LineBreakDataV1Marker> |
515 | 0 | + DataProvider<LstmForWordLineAutoV1Marker> |
516 | 0 | + DataProvider<GraphemeClusterBreakDataV1Marker> |
517 | 0 | + ?Sized, |
518 | 0 | { |
519 | 0 | Self::try_new_lstm_with_options_unstable(provider, options) |
520 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_with_options_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_with_options_unstable::<_> |
521 | | |
522 | | /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and |
523 | | /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai). |
524 | | /// |
525 | | /// The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than |
526 | | /// the full dictionary but more expensive during segmentation (inference). |
527 | | /// |
528 | | /// See also [`Self::new_lstm`]. |
529 | | /// |
530 | | /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.* |
531 | | /// |
532 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
533 | | #[cfg(feature = "lstm")] |
534 | | #[cfg(feature = "compiled_data")] |
535 | 0 | pub fn new_lstm_with_options(options: LineBreakOptions) -> Self { |
536 | 0 | Self { |
537 | 0 | options, |
538 | 0 | payload: DataPayload::from_static_ref( |
539 | 0 | crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1, |
540 | 0 | ), |
541 | 0 | complex: ComplexPayloads::new_lstm(), |
542 | 0 | } |
543 | 0 | } |
544 | | |
545 | | #[cfg(feature = "lstm")] |
546 | | icu_provider::gen_any_buffer_data_constructors!( |
547 | | locale: skip, |
548 | | options: LineBreakOptions, |
549 | | error: SegmenterError, |
550 | | #[cfg(skip)] |
551 | | functions: [ |
552 | | try_new_lstm_with_options, |
553 | | try_new_lstm_with_options_with_any_provider, |
554 | | try_new_lstm_with_options_with_buffer_provider, |
555 | | try_new_lstm_with_options_unstable, |
556 | | Self, |
557 | | ] |
558 | | ); |
559 | | |
560 | | #[cfg(feature = "lstm")] |
561 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm_with_options)] |
562 | 0 | pub fn try_new_lstm_with_options_unstable<D>( |
563 | 0 | provider: &D, |
564 | 0 | options: LineBreakOptions, |
565 | 0 | ) -> Result<Self, SegmenterError> |
566 | 0 | where |
567 | 0 | D: DataProvider<LineBreakDataV1Marker> |
568 | 0 | + DataProvider<LstmForWordLineAutoV1Marker> |
569 | 0 | + DataProvider<GraphemeClusterBreakDataV1Marker> |
570 | 0 | + ?Sized, |
571 | 0 | { |
572 | 0 | Ok(Self { |
573 | 0 | options, |
574 | 0 | payload: provider.load(Default::default())?.take_payload()?, |
575 | 0 | complex: ComplexPayloads::try_new_lstm(provider)?, |
576 | | }) |
577 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_with_options_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_with_options_unstable::<_> |
578 | | |
579 | | /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and |
580 | | /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai). |
581 | | /// |
582 | | /// The dictionary model uses a list of words to determine appropriate breakpoints. It is |
583 | | /// faster than the LSTM model but requires more data. |
584 | | /// |
585 | | /// See also [`Self::new_dictionary`]. |
586 | | /// |
587 | | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
588 | | /// |
589 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
590 | | #[cfg(feature = "compiled_data")] |
591 | 0 | pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self { |
592 | 0 | Self { |
593 | 0 | options, |
594 | 0 | payload: DataPayload::from_static_ref( |
595 | 0 | crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1, |
596 | 0 | ), |
597 | 0 | // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handle CJK |
598 | 0 | // characters [1]. Southeast Asian languages however require complex context analysis |
599 | 0 | // [2]. |
600 | 0 | // |
601 | 0 | // [1]: https://www.unicode.org/reports/tr14/#ID |
602 | 0 | // [2]: https://www.unicode.org/reports/tr14/#SA |
603 | 0 | complex: ComplexPayloads::new_southeast_asian(), |
604 | 0 | } |
605 | 0 | } |
606 | | |
607 | | icu_provider::gen_any_buffer_data_constructors!( |
608 | | locale: skip, |
609 | | options: LineBreakOptions, |
610 | | error: SegmenterError, |
611 | | #[cfg(skip)] |
612 | | functions: [ |
613 | | new_dictionary_with_options, |
614 | | try_new_dictionary_with_options_with_any_provider, |
615 | | try_new_dictionary_with_options_with_buffer_provider, |
616 | | try_new_dictionary_with_options_unstable, |
617 | | Self, |
618 | | ] |
619 | | ); |
620 | | |
621 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)] |
622 | 0 | pub fn try_new_dictionary_with_options_unstable<D>( |
623 | 0 | provider: &D, |
624 | 0 | options: LineBreakOptions, |
625 | 0 | ) -> Result<Self, SegmenterError> |
626 | 0 | where |
627 | 0 | D: DataProvider<LineBreakDataV1Marker> |
628 | 0 | + DataProvider<DictionaryForWordLineExtendedV1Marker> |
629 | 0 | + DataProvider<GraphemeClusterBreakDataV1Marker> |
630 | 0 | + ?Sized, |
631 | 0 | { |
632 | 0 | Ok(Self { |
633 | 0 | options, |
634 | 0 | payload: provider.load(Default::default())?.take_payload()?, |
635 | | // Line segmenter doesn't need to load CJ dictionary because UAX 14 rules handle CJK |
636 | | // characters [1]. Southeast Asian languages however require complex context analysis |
637 | | // [2]. |
638 | | // |
639 | | // [1]: https://www.unicode.org/reports/tr14/#ID |
640 | | // [2]: https://www.unicode.org/reports/tr14/#SA |
641 | 0 | complex: ComplexPayloads::try_new_southeast_asian(provider)?, |
642 | | }) |
643 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_with_options_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_with_options_unstable::<_> |
644 | | |
645 | | /// Creates a line break iterator for an `str` (a UTF-8 string). |
646 | | /// |
647 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
648 | 0 | pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> { |
649 | 0 | LineBreakIterator { |
650 | 0 | iter: input.char_indices(), |
651 | 0 | len: input.len(), |
652 | 0 | current_pos_data: None, |
653 | 0 | result_cache: Vec::new(), |
654 | 0 | data: self.payload.get(), |
655 | 0 | options: &self.options, |
656 | 0 | complex: &self.complex, |
657 | 0 | } |
658 | 0 | } |
659 | | /// Creates a line break iterator for a potentially ill-formed UTF-8 string. |
660 | | /// |
661 | | /// Invalid characters are treated as REPLACEMENT CHARACTER |
662 | | /// |
663 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
664 | 0 | pub fn segment_utf8<'l, 's>( |
665 | 0 | &'l self, |
666 | 0 | input: &'s [u8], |
667 | 0 | ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> { |
668 | 0 | LineBreakIterator { |
669 | 0 | iter: Utf8CharIndices::new(input), |
670 | 0 | len: input.len(), |
671 | 0 | current_pos_data: None, |
672 | 0 | result_cache: Vec::new(), |
673 | 0 | data: self.payload.get(), |
674 | 0 | options: &self.options, |
675 | 0 | complex: &self.complex, |
676 | 0 | } |
677 | 0 | } |
678 | | /// Creates a line break iterator for a Latin-1 (8-bit) string. |
679 | | /// |
680 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
681 | 0 | pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> { |
682 | 0 | LineBreakIterator { |
683 | 0 | iter: Latin1Indices::new(input), |
684 | 0 | len: input.len(), |
685 | 0 | current_pos_data: None, |
686 | 0 | result_cache: Vec::new(), |
687 | 0 | data: self.payload.get(), |
688 | 0 | options: &self.options, |
689 | 0 | complex: &self.complex, |
690 | 0 | } |
691 | 0 | } |
692 | | |
693 | | /// Creates a line break iterator for a UTF-16 string. |
694 | | /// |
695 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
696 | 0 | pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> { |
697 | 0 | LineBreakIterator { |
698 | 0 | iter: Utf16Indices::new(input), |
699 | 0 | len: input.len(), |
700 | 0 | current_pos_data: None, |
701 | 0 | result_cache: Vec::new(), |
702 | 0 | data: self.payload.get(), |
703 | 0 | options: &self.options, |
704 | 0 | complex: &self.complex, |
705 | 0 | } |
706 | 0 | } |
707 | | } |
708 | | |
709 | | impl RuleBreakDataV1<'_> { |
710 | 0 | fn get_linebreak_property_utf32_with_rule( |
711 | 0 | &self, |
712 | 0 | codepoint: u32, |
713 | 0 | strictness: LineBreakStrictness, |
714 | 0 | word_option: LineBreakWordOption, |
715 | 0 | ) -> u8 { |
716 | 0 | // Note: Default value is 0 == UNKNOWN |
717 | 0 | let prop = self.property_table.get32(codepoint); |
718 | 0 |
|
719 | 0 | if word_option == LineBreakWordOption::BreakAll |
720 | 0 | || strictness == LineBreakStrictness::Loose |
721 | 0 | || strictness == LineBreakStrictness::Normal |
722 | | { |
723 | 0 | return match prop { |
724 | 0 | CJ => ID, // All CJ's General_Category is Other_Letter (Lo). |
725 | 0 | _ => prop, |
726 | | }; |
727 | 0 | } |
728 | 0 |
|
729 | 0 | // CJ is treated as NS by default, yielding strict line breaking. |
730 | 0 | // https://www.unicode.org/reports/tr14/#CJ |
731 | 0 | prop |
732 | 0 | } |
733 | | |
734 | | #[inline] |
735 | 0 | fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState { |
736 | 0 | let idx = (left as usize) * (self.property_count as usize) + (right as usize); |
737 | 0 | // We use unwrap_or to fall back to the base case and prevent panics on bad data. |
738 | 0 | self.break_state_table.get(idx).unwrap_or(BreakState::Keep) |
739 | 0 | } Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::get_break_state_from_table Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::get_break_state_from_table |
740 | | |
741 | | #[inline] |
742 | 0 | fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool { |
743 | 0 | let line_break_property = self.get_linebreak_property_utf32_with_rule( |
744 | 0 | codepoint, |
745 | 0 | LineBreakStrictness::Strict, |
746 | 0 | LineBreakWordOption::Normal, |
747 | 0 | ); |
748 | 0 |
|
749 | 0 | line_break_property == SA |
750 | 0 | } Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::use_complex_breaking_utf32 Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::use_complex_breaking_utf32 |
751 | | } |
752 | | |
753 | | #[inline] |
754 | 0 | fn is_break_utf32_by_loose( |
755 | 0 | right_codepoint: u32, |
756 | 0 | left_prop: u8, |
757 | 0 | right_prop: u8, |
758 | 0 | ja_zh: bool, |
759 | 0 | ) -> Option<bool> { |
760 | 0 | // breaks before hyphens |
761 | 0 | if right_prop == BA { |
762 | 0 | if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) { |
763 | 0 | return Some(true); |
764 | 0 | } |
765 | 0 | } else if right_prop == NS { |
766 | | // breaks before certain CJK hyphen-like characters |
767 | 0 | if right_codepoint == 0x301C || right_codepoint == 0x30A0 { |
768 | 0 | return Some(ja_zh); |
769 | 0 | } |
770 | 0 |
|
771 | 0 | // breaks before iteration marks |
772 | 0 | if right_codepoint == 0x3005 |
773 | 0 | || right_codepoint == 0x303B |
774 | 0 | || right_codepoint == 0x309D |
775 | 0 | || right_codepoint == 0x309E |
776 | 0 | || right_codepoint == 0x30FD |
777 | 0 | || right_codepoint == 0x30FE |
778 | | { |
779 | 0 | return Some(true); |
780 | 0 | } |
781 | 0 |
|
782 | 0 | // breaks before certain centered punctuation marks: |
783 | 0 | if right_codepoint == 0x30FB |
784 | 0 | || right_codepoint == 0xFF1A |
785 | 0 | || right_codepoint == 0xFF1B |
786 | 0 | || right_codepoint == 0xFF65 |
787 | 0 | || right_codepoint == 0x203C |
788 | 0 | || (0x2047..=0x2049).contains(&right_codepoint) |
789 | | { |
790 | 0 | return Some(ja_zh); |
791 | 0 | } |
792 | 0 | } else if right_prop == IN { |
793 | | // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN |
794 | 0 | return Some(true); |
795 | 0 | } else if right_prop == EX { |
796 | | // breaks before certain centered punctuation marks: |
797 | 0 | if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F { |
798 | 0 | return Some(ja_zh); |
799 | 0 | } |
800 | 0 | } |
801 | | |
802 | | // breaks before suffixes: |
803 | | // Characters with the Unicode Line Break property PO and the East Asian Width property |
804 | 0 | if right_prop == PO_EAW { |
805 | 0 | return Some(ja_zh); |
806 | 0 | } |
807 | 0 | // breaks after prefixes: |
808 | 0 | // Characters with the Unicode Line Break property PR and the East Asian Width property |
809 | 0 | if left_prop == PR_EAW { |
810 | 0 | return Some(ja_zh); |
811 | 0 | } |
812 | 0 | None |
813 | 0 | } Unexecuted instantiation: icu_segmenter::line::is_break_utf32_by_loose Unexecuted instantiation: icu_segmenter::line::is_break_utf32_by_loose |
814 | | |
/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
///
/// This is implemented by ICU4X for several common string types.
pub trait LineBreakType<'l, 's> {
    /// The iterator over characters.
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;

    /// The character type.
    type CharType: Copy + Into<u32>;

    /// Returns whether `c` must be segmented by the complex-script breaker
    /// instead of the rule table.
    fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool;

    /// Returns the line break property value of `c` as seen by `iterator`.
    fn get_linebreak_property_with_rule(
        iterator: &LineBreakIterator<'l, 's, Self>,
        c: Self::CharType,
    ) -> u8;

    /// Returns the length, in code units, of the character at the iterator's
    /// current position.
    fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize;

    /// Handles a run of complex-script characters that starts with
    /// `left_codepoint` and continues at the iterator's current position.
    ///
    /// Returns the position of the next break, or `None` when this step
    /// produced no break.
    fn handle_complex_language(
        iterator: &mut LineBreakIterator<'l, 's, Self>,
        left_codepoint: Self::CharType,
    ) -> Option<usize>;
}
839 | | |
/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
///
/// Lifetimes:
///
/// - `'l` = lifetime of the [`LineSegmenter`] object from which this iterator was created
/// - `'s` = lifetime of the string being segmented
///
/// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit
/// _after_ the break (for a break at the end of text, this index is the length
/// of the [`str`] or array of code units).
///
/// For examples of use, see [`LineSegmenter`].
#[derive(Debug)]
pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
    // Underlying iterator over `(index, character)` pairs of the input.
    iter: Y::IterAttr,
    // Length of the input; returned as the break position at the end of the text.
    // `check_eof` also overwrites it for empty input to remember that the single
    // empty-string break has already been emitted.
    len: usize,
    // The `(index, character)` pair under the cursor, or `None` before the first
    // advance and once the input is exhausted.
    current_pos_data: Option<(usize, Y::CharType)>,
    // Break offsets produced by the complex-script segmenter that have not been
    // returned yet. `next` drains these before doing any rule-based work.
    result_cache: Vec<usize>,
    // Property and break-state tables.
    data: &'l RuleBreakDataV1<'l>,
    // Strictness, word option and `ja_zh` settings.
    options: &'l LineBreakOptions,
    // Payloads handed to the complex-script segmenter.
    complex: &'l ComplexPayloads,
}
862 | | |
impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
    type Item = usize;

    /// Returns the next line break opportunity.
    ///
    /// The first call yields `Some(0)`. Later calls yield the index after each
    /// break, ending with the input length, and then `None`.
    fn next(&mut self) -> Option<Self::Item> {
        match self.check_eof() {
            StringBoundaryPosType::Start => return Some(0),
            StringBoundaryPosType::End => return None,
            _ => (),
        }

        // If a previous run of the complex-script segmenter left break points in
        // the cache, walk forward to the first cached offset and return it.
        if let Some(&first_pos) = self.result_cache.first() {
            // Number of code units consumed since the cursor position the cached
            // offsets are relative to.
            let mut i = 0;
            loop {
                if i == first_pos {
                    // Drop the offset being returned and rebase the remaining ones
                    // onto the new cursor position.
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
                    return self.get_current_position();
                }
                i += Y::get_current_position_character_len(self);
                self.advance_iter();
                if self.is_eof() {
                    self.result_cache.clear();
                    return Some(self.len);
                }
            }
        }

        // Each iteration examines the pair (character under the cursor, next
        // character) and either returns a break between them or moves on.
        'a: loop {
            debug_assert!(!self.is_eof());
            let left_codepoint = self.get_current_codepoint()?;
            let mut left_prop = self.get_linebreak_property(left_codepoint);
            self.advance_iter();

            // No right-hand character: the only remaining break is the end of the text.
            let Some(right_codepoint) = self.get_current_codepoint() else {
                return Some(self.len);
            };
            let right_prop = self.get_linebreak_property(right_codepoint);

            // CSS word-break property handling
            match (self.options.word_option, left_prop, right_prop) {
                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
                    left_prop = ID;
                }
                // Typographic letter units must not be broken apart.
                (
                    LineBreakWordOption::KeepAll,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
                ) => {
                    continue;
                }
                _ => (),
            }

            // CSS line-break property handling
            match self.options.strictness {
                LineBreakStrictness::Normal => {
                    if self.is_break_by_normal(right_codepoint) {
                        return self.get_current_position();
                    }
                }
                LineBreakStrictness::Loose => {
                    // `Some(_)` means a loose rule decided this pair; `None` falls
                    // through to the regular processing below.
                    if let Some(breakable) = is_break_utf32_by_loose(
                        right_codepoint.into(),
                        left_prop,
                        right_prop,
                        self.options.ja_zh,
                    ) {
                        if breakable {
                            return self.get_current_position();
                        }
                        continue;
                    }
                }
                LineBreakStrictness::Anywhere => {
                    return self.get_current_position();
                }
                _ => (),
            };

            // UAX14 doesn't have Thai etc, so use another way.
            if self.options.word_option != LineBreakWordOption::BreakAll
                && Y::use_complex_breaking(self, left_codepoint)
                && Y::use_complex_breaking(self, right_codepoint)
            {
                let result = Y::handle_complex_language(self, left_codepoint);
                if result.is_some() {
                    return result;
                }
                // I may have to fetch text until non-SA character?.
            }

            // An `Index`/`Intermediate` state is not a final answer: it names a
            // table row to continue from with the following characters.
            let mut index = match self.data.get_break_state_from_table(left_prop, right_prop) {
                BreakState::Index(index) => index,
                // Line break uses more than 64 states, so they spill over into the intermediate range,
                // and we cannot change that at the moment
                BreakState::Intermediate(index) => index + 64,
                BreakState::Break | BreakState::NoMatch => return self.get_current_position(),
                BreakState::Keep => continue,
            };

            // Cursor state to restore if the multi-character rule ends in `NoMatch`.
            let mut previous_iter = self.iter.clone();
            let mut previous_pos_data = self.current_pos_data;

            loop {
                self.advance_iter();

                let Some(prop) = self.get_current_linebreak_property() else {
                    // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
                    let break_state = self
                        .data
                        .get_break_state_from_table(index, self.data.eot_property);
                    if break_state == BreakState::NoMatch {
                        self.iter = previous_iter;
                        self.current_pos_data = previous_pos_data;
                        return self.get_current_position();
                    }
                    // EOF
                    return Some(self.len);
                };

                match self.data.get_break_state_from_table(index, prop) {
                    // No break here: resume pairwise processing from the cursor.
                    BreakState::Keep => continue 'a,
                    BreakState::NoMatch => {
                        // The rule did not match: break at the saved position.
                        self.iter = previous_iter;
                        self.current_pos_data = previous_pos_data;
                        return self.get_current_position();
                    }
                    BreakState::Break => return self.get_current_position(),
                    BreakState::Index(i) => {
                        index = i;
                        previous_iter = self.iter.clone();
                        previous_pos_data = self.current_pos_data;
                    }
                    BreakState::Intermediate(i) => {
                        index = i + 64;
                        previous_iter = self.iter.clone();
                        previous_pos_data = self.current_pos_data;
                    }
                }
            }
        }
    }
}
1008 | | |
/// Where the iterator's cursor stands relative to the input, as reported by
/// `LineBreakIterator::check_eof`.
enum StringBoundaryPosType {
    /// The cursor has just been moved onto the first character, or the input is
    /// empty and its single break has not been reported yet.
    Start,
    /// The cursor is on a character.
    Middle,
    /// The input is exhausted.
    End,
}
1014 | | |
1015 | | impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> { |
1016 | 0 | fn advance_iter(&mut self) { |
1017 | 0 | self.current_pos_data = self.iter.next(); |
1018 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::advance_iter Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::advance_iter Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::advance_iter Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::advance_iter |
1019 | | |
1020 | 0 | fn is_eof(&self) -> bool { |
1021 | 0 | self.current_pos_data.is_none() |
1022 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::is_eof Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::is_eof Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::is_eof Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::is_eof |
1023 | | |
1024 | | #[inline] |
1025 | 0 | fn check_eof(&mut self) -> StringBoundaryPosType { |
1026 | 0 | if self.is_eof() { |
1027 | 0 | self.advance_iter(); |
1028 | 0 | if self.is_eof() { |
1029 | 0 | if self.len == 0 { |
1030 | | // Empty string. Since `self.current_pos_data` is always going to be empty, |
1031 | | // we never read `self.len` except for here, so we can use it to mark that |
1032 | | // we have already returned the single empty-string breakpoint. |
1033 | 0 | self.len = 1; |
1034 | 0 | StringBoundaryPosType::Start |
1035 | | } else { |
1036 | 0 | StringBoundaryPosType::End |
1037 | | } |
1038 | | } else { |
1039 | 0 | StringBoundaryPosType::Start |
1040 | | } |
1041 | | } else { |
1042 | 0 | StringBoundaryPosType::Middle |
1043 | | } |
1044 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::check_eof Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::check_eof Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::check_eof Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::check_eof |
1045 | | |
1046 | 0 | fn get_current_position(&self) -> Option<usize> { |
1047 | 0 | self.current_pos_data.map(|(pos, _)| pos) Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_position::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_position::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_position::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_position::{closure#0} |
1048 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_position Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_position Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_position Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_position |
1049 | | |
1050 | 0 | fn get_current_codepoint(&self) -> Option<Y::CharType> { |
1051 | 0 | self.current_pos_data.map(|(_, codepoint)| codepoint) Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_codepoint::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_codepoint::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_codepoint::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_codepoint::{closure#0} |
1052 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_codepoint Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_codepoint Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_codepoint Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_codepoint |
1053 | | |
1054 | 0 | fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 { |
1055 | 0 | Y::get_linebreak_property_with_rule(self, codepoint) |
1056 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_linebreak_property Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_linebreak_property Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_linebreak_property Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::get_linebreak_property |
1057 | | |
1058 | 0 | fn get_current_linebreak_property(&self) -> Option<u8> { |
1059 | 0 | self.get_current_codepoint() |
1060 | 0 | .map(|c| self.get_linebreak_property(c)) Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_linebreak_property::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_linebreak_property::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_linebreak_property::{closure#0} Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::get_current_linebreak_property::{closure#0} |
1061 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_linebreak_property Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_linebreak_property Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_linebreak_property Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::get_current_linebreak_property |
1062 | | |
1063 | 0 | fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool { |
1064 | 0 | match codepoint.into() { |
1065 | 0 | 0x301C | 0x30A0 => self.options.ja_zh, |
1066 | 0 | _ => false, |
1067 | | } |
1068 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::is_break_by_normal Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::is_break_by_normal Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::is_break_by_normal Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::is_break_by_normal |
1069 | | } |
1070 | | |
1071 | | #[derive(Debug)] |
1072 | | pub struct LineBreakTypeUtf8; |
1073 | | |
1074 | | impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 { |
1075 | | type IterAttr = CharIndices<'s>; |
1076 | | type CharType = char; |
1077 | | |
1078 | 0 | fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 { |
1079 | 0 | iterator.data.get_linebreak_property_utf32_with_rule( |
1080 | 0 | c as u32, |
1081 | 0 | iterator.options.strictness, |
1082 | 0 | iterator.options.word_option, |
1083 | 0 | ) |
1084 | 0 | } |
1085 | | |
1086 | | #[inline] |
1087 | 0 | fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool { |
1088 | 0 | iterator.data.use_complex_breaking_utf32(c as u32) |
1089 | 0 | } |
1090 | | |
1091 | 0 | fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize { |
1092 | 0 | iterator.get_current_codepoint().map_or(0, |c| c.len_utf8()) |
1093 | 0 | } |
1094 | | |
1095 | 0 | fn handle_complex_language( |
1096 | 0 | iter: &mut LineBreakIterator<'l, 's, Self>, |
1097 | 0 | left_codepoint: char, |
1098 | 0 | ) -> Option<usize> { |
1099 | 0 | handle_complex_language_utf8(iter, left_codepoint) |
1100 | 0 | } |
1101 | | } |
1102 | | |
1103 | | #[derive(Debug)] |
1104 | | pub struct LineBreakTypePotentiallyIllFormedUtf8; |
1105 | | |
1106 | | impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypePotentiallyIllFormedUtf8 { |
1107 | | type IterAttr = Utf8CharIndices<'s>; |
1108 | | type CharType = char; |
1109 | | |
1110 | 0 | fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 { |
1111 | 0 | iterator.data.get_linebreak_property_utf32_with_rule( |
1112 | 0 | c as u32, |
1113 | 0 | iterator.options.strictness, |
1114 | 0 | iterator.options.word_option, |
1115 | 0 | ) |
1116 | 0 | } |
1117 | | |
1118 | | #[inline] |
1119 | 0 | fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool { |
1120 | 0 | iterator.data.use_complex_breaking_utf32(c as u32) |
1121 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8 as icu_segmenter::line::LineBreakType>::use_complex_breaking Unexecuted instantiation: <icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8 as icu_segmenter::line::LineBreakType>::use_complex_breaking |
1122 | | |
1123 | 0 | fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize { |
1124 | 0 | iterator.get_current_codepoint().map_or(0, |c| c.len_utf8()) |
1125 | 0 | } |
1126 | | |
1127 | 0 | fn handle_complex_language( |
1128 | 0 | iter: &mut LineBreakIterator<'l, 's, Self>, |
1129 | 0 | left_codepoint: char, |
1130 | 0 | ) -> Option<usize> { |
1131 | 0 | handle_complex_language_utf8(iter, left_codepoint) |
1132 | 0 | } |
1133 | | } |
1134 | | /// handle_complex_language impl for UTF8 iterators |
1135 | 0 | fn handle_complex_language_utf8<'l, 's, T>( |
1136 | 0 | iter: &mut LineBreakIterator<'l, 's, T>, |
1137 | 0 | left_codepoint: char, |
1138 | 0 | ) -> Option<usize> |
1139 | 0 | where |
1140 | 0 | T: LineBreakType<'l, 's, CharType = char>, |
1141 | 0 | { |
1142 | 0 | // word segmenter doesn't define break rules for some languages such as Thai. |
1143 | 0 | let start_iter = iter.iter.clone(); |
1144 | 0 | let start_point = iter.current_pos_data; |
1145 | 0 | let mut s = String::new(); |
1146 | 0 | s.push(left_codepoint); |
1147 | | loop { |
1148 | 0 | debug_assert!(!iter.is_eof()); |
1149 | 0 | s.push(iter.get_current_codepoint()?); |
1150 | 0 | iter.advance_iter(); |
1151 | 0 | if let Some(current_codepoint) = iter.get_current_codepoint() { |
1152 | 0 | if !T::use_complex_breaking(iter, current_codepoint) { |
1153 | 0 | break; |
1154 | 0 | } |
1155 | | } else { |
1156 | | // EOF |
1157 | 0 | break; |
1158 | | } |
1159 | | } |
1160 | | |
1161 | | // Restore iterator to move to head of complex string |
1162 | 0 | iter.iter = start_iter; |
1163 | 0 | iter.current_pos_data = start_point; |
1164 | 0 | let breaks = complex_language_segment_str(iter.complex, &s); |
1165 | 0 | iter.result_cache = breaks; |
1166 | 0 | let first_pos = *iter.result_cache.first()?; |
1167 | 0 | let mut i = left_codepoint.len_utf8(); |
1168 | | loop { |
1169 | 0 | if i == first_pos { |
1170 | | // Re-calculate breaking offset |
1171 | 0 | iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect(); Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypeUtf8>::{closure#0} Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>::{closure#0} |
1172 | 0 | return iter.get_current_position(); |
1173 | 0 | } |
1174 | 0 | debug_assert!( |
1175 | 0 | i < first_pos, |
1176 | 0 | "we should always arrive at first_pos: near index {:?}", |
1177 | 0 | iter.get_current_position() |
1178 | | ); |
1179 | 0 | i += T::get_current_position_character_len(iter); |
1180 | 0 | iter.advance_iter(); |
1181 | 0 | if iter.is_eof() { |
1182 | 0 | iter.result_cache.clear(); |
1183 | 0 | return Some(iter.len); |
1184 | 0 | } |
1185 | | } |
1186 | 0 | } Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypeUtf8> Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8> |
1187 | | |
1188 | | #[derive(Debug)] |
1189 | | pub struct LineBreakTypeLatin1; |
1190 | | |
1191 | | impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 { |
1192 | | type IterAttr = Latin1Indices<'s>; |
1193 | | type CharType = u8; |
1194 | | |
1195 | 0 | fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 { |
1196 | 0 | // No CJ on Latin1 |
1197 | 0 | // Note: Default value is 0 == UNKNOWN |
1198 | 0 | iterator.data.property_table.get32(c as u32) |
1199 | 0 | } |
1200 | | |
1201 | | #[inline] |
1202 | 0 | fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool { |
1203 | 0 | false |
1204 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeLatin1 as icu_segmenter::line::LineBreakType>::use_complex_breaking Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeLatin1 as icu_segmenter::line::LineBreakType>::use_complex_breaking |
1205 | | |
1206 | 0 | fn get_current_position_character_len(_: &LineBreakIterator<Self>) -> usize { |
1207 | 0 | unreachable!() |
1208 | | } |
1209 | | |
1210 | 0 | fn handle_complex_language( |
1211 | 0 | _: &mut LineBreakIterator<Self>, |
1212 | 0 | _: Self::CharType, |
1213 | 0 | ) -> Option<usize> { |
1214 | 0 | unreachable!() |
1215 | | } |
1216 | | } |
1217 | | |
1218 | | #[derive(Debug)] |
1219 | | pub struct LineBreakTypeUtf16; |
1220 | | |
1221 | | impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 { |
1222 | | type IterAttr = Utf16Indices<'s>; |
1223 | | type CharType = u32; |
1224 | | |
1225 | 0 | fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 { |
1226 | 0 | iterator.data.get_linebreak_property_utf32_with_rule( |
1227 | 0 | c, |
1228 | 0 | iterator.options.strictness, |
1229 | 0 | iterator.options.word_option, |
1230 | 0 | ) |
1231 | 0 | } |
1232 | | |
1233 | | #[inline] |
1234 | 0 | fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool { |
1235 | 0 | iterator.data.use_complex_breaking_utf32(c) |
1236 | 0 | } Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeUtf16 as icu_segmenter::line::LineBreakType>::use_complex_breaking Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeUtf16 as icu_segmenter::line::LineBreakType>::use_complex_breaking |
1237 | | |
1238 | 0 | fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize { |
1239 | 0 | match iterator.get_current_codepoint() { |
1240 | 0 | None => 0, |
1241 | 0 | Some(ch) if ch >= 0x10000 => 2, |
1242 | 0 | _ => 1, |
1243 | | } |
1244 | 0 | } |
1245 | | |
1246 | 0 | fn handle_complex_language( |
1247 | 0 | iterator: &mut LineBreakIterator<Self>, |
1248 | 0 | left_codepoint: Self::CharType, |
1249 | 0 | ) -> Option<usize> { |
1250 | 0 | // word segmenter doesn't define break rules for some languages such as Thai. |
1251 | 0 | let start_iter = iterator.iter.clone(); |
1252 | 0 | let start_point = iterator.current_pos_data; |
1253 | 0 | let mut s = vec![left_codepoint as u16]; |
1254 | | loop { |
1255 | 0 | debug_assert!(!iterator.is_eof()); |
1256 | 0 | s.push(iterator.get_current_codepoint()? as u16); |
1257 | 0 | iterator.advance_iter(); |
1258 | 0 | if let Some(current_codepoint) = iterator.get_current_codepoint() { |
1259 | 0 | if !Self::use_complex_breaking(iterator, current_codepoint) { |
1260 | 0 | break; |
1261 | 0 | } |
1262 | | } else { |
1263 | | // EOF |
1264 | 0 | break; |
1265 | | } |
1266 | | } |
1267 | | |
1268 | | // Restore iterator to move to head of complex string |
1269 | 0 | iterator.iter = start_iter; |
1270 | 0 | iterator.current_pos_data = start_point; |
1271 | 0 | let breaks = complex_language_segment_utf16(iterator.complex, &s); |
1272 | 0 | iterator.result_cache = breaks; |
1273 | | // result_cache vector is utf-16 index that is in BMP. |
1274 | 0 | let first_pos = *iterator.result_cache.first()?; |
1275 | 0 | let mut i = 1; |
1276 | | loop { |
1277 | 0 | if i == first_pos { |
1278 | | // Re-calculate breaking offset |
1279 | 0 | iterator.result_cache = iterator |
1280 | 0 | .result_cache |
1281 | 0 | .iter() |
1282 | 0 | .skip(1) |
1283 | 0 | .map(|r| r - i) |
1284 | 0 | .collect(); |
1285 | 0 | return iterator.get_current_position(); |
1286 | 0 | } |
1287 | 0 | debug_assert!( |
1288 | 0 | i < first_pos, |
1289 | 0 | "we should always arrive at first_pos: near index {:?}", |
1290 | 0 | iterator.get_current_position() |
1291 | | ); |
1292 | 0 | i += 1; |
1293 | 0 | iterator.advance_iter(); |
1294 | 0 | if iterator.is_eof() { |
1295 | 0 | iterator.result_cache.clear(); |
1296 | 0 | return Some(iterator.len); |
1297 | 0 | } |
1298 | | } |
1299 | 0 | } |
1300 | | } |
1301 | | |
1302 | | #[cfg(test)] |
1303 | | #[cfg(feature = "serde")] |
1304 | | mod tests { |
1305 | | use super::*; |
1306 | | use crate::LineSegmenter; |
1307 | | |
1308 | | #[test] |
1309 | | fn linebreak_property() { |
1310 | | let payload = DataProvider::<LineBreakDataV1Marker>::load( |
1311 | | &crate::provider::Baked, |
1312 | | Default::default(), |
1313 | | ) |
1314 | | .expect("Loading should succeed!") |
1315 | | .take_payload() |
1316 | | .expect("Data should be present!"); |
1317 | | |
1318 | | let get_linebreak_property = |codepoint| { |
1319 | | payload.get().get_linebreak_property_utf32_with_rule( |
1320 | | codepoint as u32, |
1321 | | LineBreakStrictness::Strict, |
1322 | | LineBreakWordOption::Normal, |
1323 | | ) |
1324 | | }; |
1325 | | |
1326 | | assert_eq!(get_linebreak_property('\u{0020}'), SP); |
1327 | | assert_eq!(get_linebreak_property('\u{0022}'), QU); |
1328 | | assert_eq!(get_linebreak_property('('), OP_OP30); |
1329 | | assert_eq!(get_linebreak_property('\u{0030}'), NU); |
1330 | | assert_eq!(get_linebreak_property('['), OP_OP30); |
1331 | | assert_eq!(get_linebreak_property('\u{1f3fb}'), EM); |
1332 | | assert_eq!(get_linebreak_property('\u{20000}'), ID); |
1333 | | assert_eq!(get_linebreak_property('\u{e0020}'), CM); |
1334 | | assert_eq!(get_linebreak_property('\u{3041}'), CJ); |
1335 | | assert_eq!(get_linebreak_property('\u{0025}'), PO); |
1336 | | assert_eq!(get_linebreak_property('\u{00A7}'), AI); |
1337 | | assert_eq!(get_linebreak_property('\u{50005}'), XX); |
1338 | | assert_eq!(get_linebreak_property('\u{17D6}'), NS); |
1339 | | assert_eq!(get_linebreak_property('\u{2014}'), B2); |
1340 | | } |
1341 | | |
/// Spot-checks the pair lookup table of the baked line-break data.
///
/// Every case is `(rule, left, right, expect_break)`, where `rule` is the
/// UAX #14 rule label the case is filed under and `left`/`right` are the
/// line-break property constants defined at the top of this file. The label
/// is included in the failure message so a regression names the rule.
#[test]
fn break_rule() {
    let payload = DataProvider::<LineBreakDataV1Marker>::load(
        &crate::provider::Baked,
        Default::default(),
    )
    .expect("Loading should succeed!")
    .take_payload()
    .expect("Data should be present!");
    let lb_data: &RuleBreakDataV1 = payload.get();

    // For the purpose of this test both `Break` and `NoMatch` count as a break;
    // every other table state counts as "no break".
    let is_break = |left: u8, right: u8| {
        matches!(
            lb_data.get_break_state_from_table(left, right),
            BreakState::Break | BreakState::NoMatch
        )
    };

    let cases: &[(&str, u8, u8, bool)] = &[
        ("LB4", BK, AL, true),
        ("LB5", CR, LF, false),
        ("LB5", CR, AL, true),
        ("LB5", LF, AL, true),
        ("LB5", NL, AL, true),
        ("LB6", AL, BK, false),
        ("LB6", AL, CR, false),
        ("LB6", AL, LF, false),
        ("LB6", AL, NL, false),
        ("LB7", AL, SP, false),
        ("LB7", AL, ZW, false),
        // LB8: no cases listed.
        ("LB8a", ZWJ, AL, false),
        ("LB9", AL, ZWJ, false),
        ("LB9", AL, CM, false),
        ("LB9", ID, ZWJ, false),
        ("LB10", ZWJ, SP, false),
        ("LB10", SP, CM, true),
        ("LB11", AL, WJ, false),
        ("LB11", WJ, AL, false),
        ("LB12", GL, AL, false),
        ("LB12a", AL, GL, false),
        ("LB12a", SP, GL, true),
        ("LB13", AL, CL, false),
        ("LB13", AL, CP, false),
        ("LB13", AL, EX, false),
        ("LB13", AL, IS, false),
        ("LB13", AL, SY, false),
        ("LB18", SP, AL, true),
        ("LB19", AL, QU, false),
        ("LB19", QU, AL, false),
        ("LB20", AL, CB, true),
        ("LB20", CB, AL, true),
        // LB21: no break before BA / HY / NS, nor after BB.
        ("LB21", AL, BA, false),
        ("LB21", AL, HY, false),
        ("LB21", AL, NS, false),
        ("LB21", BB, AL, false),
        ("LB21", ID, BA, false),
        ("LB21", ID, NS, false),
        // LB21a: no cases listed.
        ("LB21b", SY, HL, false),
        ("LB22", AL, IN, false),
        ("LB23", AL, NU, false),
        ("LB23", HL, NU, false),
        ("LB23a", PR, ID, false),
        ("LB23a", PR, EB, false),
        ("LB23a", PR, EM, false),
        ("LB23a", ID, PO, false),
        ("LB23a", EB, PO, false),
        ("LB23a", EM, PO, false),
        ("LB26", JL, JL, false),
        ("LB26", JL, JV, false),
        ("LB26", JL, H2, false),
        ("LB27", JL, IN, false),
        ("LB27", JL, PO, false),
        ("LB27", PR, JL, false),
        ("LB28", AL, AL, false),
        ("LB28", HL, AL, false),
        ("LB29", IS, AL, false),
        ("LB29", IS, HL, false),
        ("LB30b", EB, EM, false),
        ("LB31", ID, ID, true),
    ];

    for &(rule, left, right, expected) in cases {
        assert_eq!(
            is_break(left, right),
            expected,
            "{}: is_break({}, {})",
            rule,
            left,
            right
        );
    }
}
1451 | | |
/// End-to-end break offsets from the dictionary-backed segmenter.
///
/// Each input is segmented and the complete list of reported offsets is
/// compared with the expected list. Where a case exists in several encodings
/// (UTF-8 `&str`, Latin-1 bytes, UTF-16 units), the variants encode the same
/// characters, so they are kept next to each other.
///
/// Runs of two spaces are written as `\u{0020}\u{0020}` so that the number of
/// spaces, which the expected offsets depend on, cannot be altered by tools
/// that normalize whitespace.
#[test]
fn linebreak() {
    let segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked)
        .expect("Data exists");

    // Collect every offset the iterator yields for one input.
    let str_breaks = |input: &str| segmenter.segment_str(input).collect::<Vec<usize>>();
    let latin1_breaks = |input: &[u8]| segmenter.segment_latin1(input).collect::<Vec<usize>>();
    let utf16_breaks = |input: &[u16]| segmenter.segment_utf16(input).collect::<Vec<usize>>();

    assert_eq!(str_breaks("hello world"), [0, 6, 11]);
    assert_eq!(str_breaks("$10 $10"), [0, 4, 7]);

    // LB14: "[", two spaces, "abc def" (10 units in every encoding).
    assert_eq!(str_breaks("[\u{0020}\u{0020}abc def"), [0, 7, 10]);
    let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    assert_eq!(latin1_breaks(&input), [0, 7, 10]);
    let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
    assert_eq!(utf16_breaks(&input), [0, 7, 10]);

    // LB15: `abc"`, two spaces, "(def" (10 units in every encoding).
    assert_eq!(str_breaks("abc\u{0022}\u{0020}\u{0020}(def"), [0, 10]);
    let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    assert_eq!(latin1_breaks(&input), [0, 10]);
    let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
    assert_eq!(utf16_breaks(&input), [0, 10]);

    // LB16: U+0029 followed by U+203C, directly and with two spaces between.
    assert_eq!(str_breaks("\u{0029}\u{203C}"), [0, 4]);
    assert_eq!(str_breaks("\u{0029}\u{0020}\u{0020}\u{203C}"), [0, 6]);
    let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
    assert_eq!(utf16_breaks(&input), [0, 4]);

    // LB17: U+2014 pairs, directly and with two spaces between.
    assert_eq!(str_breaks("\u{2014}\u{2014}aa"), [0, 6, 8]);
    assert_eq!(str_breaks("\u{2014}\u{0020}\u{0020}\u{2014}aa"), [0, 8, 10]);
    assert_eq!(
        str_breaks("\u{2014}\u{2014}\u{0020}\u{0020}\u{2014}\u{2014}123 abc"),
        [0, 14, 18, 21]
    );
    // Same characters as the previous string, as UTF-16 code units.
    let input: [u16; 13] = [
        0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
    ];
    assert_eq!(utf16_breaks(&input), [0, 6, 10, 13]);

    // LB25
    assert_eq!(str_breaks("(0,1)+(2,3)"), [0, 11]);
    let input: [u16; 11] = [
        0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
    ];
    assert_eq!(utf16_breaks(&input), [0, 11]);

    // Two U+1F3FB characters separated by a single space.
    assert_eq!(str_breaks("\u{1F3FB} \u{1F3FB}"), [0, 5, 9]);
}
1574 | | |
/// Thai text segmented by the LSTM-backed segmenter, as UTF-8 and as UTF-16.
#[test]
#[cfg(feature = "lstm")]
fn thai_line_break() {
    let text = "ภาษาไทยภาษาไทย";
    let lstm = LineSegmenter::new_lstm();

    // UTF-8 input; the last expected break is the byte length of the string.
    let byte_breaks = lstm.segment_str(text).collect::<Vec<usize>>();
    assert_eq!(byte_breaks, [0, 12, 21, 33, text.len()], "Thai test");

    // The same text re-encoded as UTF-16; the last expected break is the unit count.
    let units = text.encode_utf16().collect::<Vec<u16>>();
    let unit_breaks = lstm.segment_utf16(&units).collect::<Vec<usize>>();
    assert_eq!(unit_breaks, [0, 4, 7, 11, units.len()], "Thai test");

    // A four-unit input for which only the start and the end are expected.
    let short: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
    let short_breaks = lstm.segment_utf16(&short).collect::<Vec<usize>>();
    assert_eq!(short_breaks, [0, 4], "Thai test");
}
1592 | | |
/// Burmese text ("Burmese Language" in Burmese) segmented by the LSTM-backed
/// segmenter, as UTF-8 and as UTF-16.
#[test]
#[cfg(feature = "lstm")]
fn burmese_line_break() {
    let text = "မြန်မာဘာသာစကား";
    let lstm = LineSegmenter::new_lstm();

    // The LSTM model reports more breaks than desired here; returning only [30]
    // would be the better result.
    let byte_breaks = lstm.segment_str(text).collect::<Vec<usize>>();
    assert_eq!(byte_breaks, [0, 12, 18, 30, text.len()], "Burmese test");

    // Same remark for UTF-16: returning only [10] would be the better result.
    let units = text.encode_utf16().collect::<Vec<u16>>();
    let unit_breaks = lstm.segment_utf16(&units).collect::<Vec<usize>>();
    assert_eq!(unit_breaks, [0, 4, 6, 10, units.len()], "Burmese utf-16 test");
}
1609 | | |
/// Khmer text segmented by the LSTM-backed segmenter, as UTF-8 and as UTF-16.
#[test]
#[cfg(feature = "lstm")]
fn khmer_line_break() {
    let text = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";
    let lstm = LineSegmenter::new_lstm();

    // Note: for this small sample the result matches the ICU dictionary segmenter.
    let byte_breaks = lstm.segment_str(text).collect::<Vec<usize>>();
    assert_eq!(byte_breaks, [0, 39, 48, 54, 72, text.len()], "Khmer test");

    // The same text re-encoded as UTF-16.
    let units = text.encode_utf16().collect::<Vec<u16>>();
    let unit_breaks = lstm.segment_utf16(&units).collect::<Vec<usize>>();
    let expected_units = [0, 13, 16, 18, 24, units.len()];
    assert_eq!(unit_breaks, expected_units, "Khmer utf-16 test");
}
1628 | | |
/// Lao text segmented by the LSTM-backed segmenter, as UTF-8 and as UTF-16.
#[test]
#[cfg(feature = "lstm")]
fn lao_line_break() {
    let text = "ກ່ຽວກັບສິດຂອງມະນຸດ";
    let lstm = LineSegmenter::new_lstm();

    // Note: the LSTM reports a break at 12 that the dictionary does not find.
    let byte_breaks = lstm.segment_str(text).collect::<Vec<usize>>();
    assert_eq!(byte_breaks, [0, 12, 21, 30, 39, text.len()], "Lao test");

    // The same text re-encoded as UTF-16.
    let units = text.encode_utf16().collect::<Vec<u16>>();
    let unit_breaks = lstm.segment_utf16(&units).collect::<Vec<usize>>();
    assert_eq!(unit_breaks, [0, 4, 7, 10, 13, units.len()], "Lao utf-16 test");
}
1643 | | |
/// Segmenting the empty string is expected to yield exactly one offset, `0`.
#[test]
fn empty_string() {
    let auto = LineSegmenter::new_auto();
    let offsets = auto.segment_str("").collect::<Vec<usize>>();
    assert_eq!(offsets, [0]);
}
1650 | | } |