Coverage Report

Created: 2025-08-26 06:41

/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_segmenter-1.5.0/src/line.rs
Line
Count
Source (jump to first uncovered line)
1
// This file is part of ICU4X. For terms of use, please see the file
2
// called LICENSE at the top level of the ICU4X source tree
3
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5
use crate::complex::*;
6
use crate::indices::*;
7
use crate::provider::*;
8
use crate::SegmenterError;
9
use alloc::string::String;
10
use alloc::vec;
11
use alloc::vec::Vec;
12
use core::char;
13
use core::str::CharIndices;
14
use icu_provider::prelude::*;
15
use utf8_iter::Utf8CharIndices;
16
17
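// Numeric identifiers for line-break property values, named after (or derived from) the
// UAX #14 Line_Break classes. They are compared against values read from the property table.
// TODO(#1637): These constants should be data driven.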
18
#[allow(dead_code)]
19
const UNKNOWN: u8 = 0;
20
#[allow(dead_code)]
21
const AI: u8 = 1;
22
#[allow(dead_code)]
23
const AL: u8 = 2;
24
#[allow(dead_code)]
25
const B2: u8 = 3;
26
#[allow(dead_code)]
27
const BA: u8 = 4;
28
#[allow(dead_code)]
29
const BB: u8 = 5;
30
#[allow(dead_code)]
31
const BK: u8 = 6;
32
#[allow(dead_code)]
33
const CB: u8 = 7;
34
#[allow(dead_code)]
35
const CJ: u8 = 8;
36
#[allow(dead_code)]
37
const CL: u8 = 9;
38
#[allow(dead_code)]
39
const CM: u8 = 10;
40
#[allow(dead_code)]
41
const CP: u8 = 11;
42
#[allow(dead_code)]
43
const CR: u8 = 12;
44
#[allow(dead_code)]
45
const EB: u8 = 13;
46
#[allow(dead_code)]
47
const EM: u8 = 14;
48
#[allow(dead_code)]
49
const EX: u8 = 15;
50
#[allow(dead_code)]
51
const GL: u8 = 16;
52
#[allow(dead_code)]
53
const H2: u8 = 17;
54
#[allow(dead_code)]
55
const H3: u8 = 18;
56
#[allow(dead_code)]
57
const HL: u8 = 19;
58
#[allow(dead_code)]
59
const HY: u8 = 20;
60
#[allow(dead_code)]
61
const ID: u8 = 21;
62
#[allow(dead_code)]
63
const ID_CN: u8 = 22;
64
#[allow(dead_code)]
65
const IN: u8 = 23;
66
#[allow(dead_code)]
67
const IS: u8 = 24;
68
#[allow(dead_code)]
69
const JL: u8 = 25;
70
#[allow(dead_code)]
71
const JT: u8 = 26;
72
#[allow(dead_code)]
73
const JV: u8 = 27;
74
#[allow(dead_code)]
75
const LF: u8 = 28;
76
#[allow(dead_code)]
77
const NL: u8 = 29;
78
#[allow(dead_code)]
79
const NS: u8 = 30;
80
#[allow(dead_code)]
81
const NU: u8 = 31;
82
#[allow(dead_code)]
83
const OP_EA: u8 = 32;
84
#[allow(dead_code)]
85
const OP_OP30: u8 = 33;
86
#[allow(dead_code)]
87
const PO: u8 = 34;
88
#[allow(dead_code)]
89
const PO_EAW: u8 = 35;
90
#[allow(dead_code)]
91
const PR: u8 = 36;
92
#[allow(dead_code)]
93
const PR_EAW: u8 = 37;
94
#[allow(dead_code)]
95
const QU: u8 = 38;
96
#[allow(dead_code)]
97
const RI: u8 = 39;
98
#[allow(dead_code)]
99
const SA: u8 = 40;
100
#[allow(dead_code)]
101
const SG: u8 = 41;
102
#[allow(dead_code)]
103
const SP: u8 = 42;
104
#[allow(dead_code)]
105
const SY: u8 = 43;
106
#[allow(dead_code)]
107
const WJ: u8 = 44;
108
#[allow(dead_code)]
109
const XX: u8 = 45;
110
#[allow(dead_code)]
111
const ZW: u8 = 46;
112
#[allow(dead_code)]
113
const ZWJ: u8 = 47;
114
115
/// An enum that specifies the strictness of line-breaking rules. It can be passed as
116
/// an argument when creating a line segmenter.
117
///
118
/// Each enum value has the same meaning with respect to the `line-break`
119
/// property values in the CSS Text spec. See the details in
120
/// <https://drafts.csswg.org/css-text-3/#line-break-property>.
121
#[non_exhaustive]
122
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
123
pub enum LineBreakStrictness {
124
    /// Breaks text using the least restrictive set of line-breaking rules.
125
    /// Typically used for short lines, such as in newspapers.
126
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-loose>
127
    Loose,
128
129
    /// Breaks text using the most common set of line-breaking rules.
130
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-normal>
131
    Normal,
132
133
    /// Breaks text using the most stringent set of line-breaking rules.
134
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-strict>
135
    ///
136
    /// This is the default behaviour of the Unicode Line Breaking Algorithm,
137
    /// resolving class [CJ](https://www.unicode.org/reports/tr14/#CJ) to
138
    /// [NS](https://www.unicode.org/reports/tr14/#NS);
139
    /// see rule [LB1](https://www.unicode.org/reports/tr14/#LB1).
140
    Strict,
141
142
    /// Breaks text assuming there is a soft wrap opportunity around every
143
    /// typographic character unit, disregarding any prohibition against line
144
    /// breaks. See more details in
145
    /// <https://drafts.csswg.org/css-text-3/#valdef-line-break-anywhere>.
146
    Anywhere,
147
}
148
149
/// An enum that specifies the line break opportunities between letters. It can be
150
/// passed as an argument when creating a line segmenter.
151
///
152
/// Each enum value has the same meaning with respect to the `word-break`
153
/// property values in the CSS Text spec. See the details in
154
/// <https://drafts.csswg.org/css-text-3/#word-break-property>
155
#[non_exhaustive]
156
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
157
pub enum LineBreakWordOption {
158
    /// Words break according to their customary rules. See the details in
159
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-normal>.
160
    Normal,
161
162
    /// Breaking is allowed within "words".
163
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-break-all>
164
    BreakAll,
165
166
    /// Breaking is forbidden within "words".
167
    /// <https://drafts.csswg.org/css-text-3/#valdef-word-break-keep-all>
168
    KeepAll,
169
}
170
171
/// Options to tailor line-breaking behavior.
172
#[non_exhaustive]
173
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
174
pub struct LineBreakOptions {
175
    /// Strictness of line-breaking rules. See [`LineBreakStrictness`].
176
    pub strictness: LineBreakStrictness,
177
178
    /// Line break opportunities between letters. See [`LineBreakWordOption`].
179
    pub word_option: LineBreakWordOption,
180
181
    /// Use `true` as a hint to the line segmenter that the writing
182
    /// system is Chinese or Japanese. This allows more break opportunities when
183
    /// `LineBreakStrictness` is `Normal` or `Loose`. See
184
    /// <https://drafts.csswg.org/css-text-3/#line-break-property> for details.
185
    ///
186
    /// This option has no effect in Latin-1 mode.
187
    pub ja_zh: bool,
188
}
189
190
impl Default for LineBreakOptions {
191
0
    /// Returns the default options: [`LineBreakStrictness::Strict`],
    /// [`LineBreakWordOption::Normal`], and the `ja_zh` hint disabled.
    fn default() -> Self {
192
0
        Self {
193
0
            strictness: LineBreakStrictness::Strict,
194
0
            word_option: LineBreakWordOption::Normal,
195
0
            ja_zh: false,
196
0
        }
197
0
    }
198
}
199
200
/// Line break iterator for an `str` (a UTF-8 string).
201
///
202
/// For examples of use, see [`LineSegmenter`].
203
pub type LineBreakIteratorUtf8<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf8>;
204
205
/// Line break iterator for a potentially invalid UTF-8 string.
206
///
207
/// For examples of use, see [`LineSegmenter`].
208
pub type LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> =
209
    LineBreakIterator<'l, 's, LineBreakTypePotentiallyIllFormedUtf8>;
210
211
/// Line break iterator for a Latin-1 (8-bit) string.
212
///
213
/// For examples of use, see [`LineSegmenter`].
214
pub type LineBreakIteratorLatin1<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeLatin1>;
215
216
/// Line break iterator for a UTF-16 string.
217
///
218
/// For examples of use, see [`LineSegmenter`].
219
pub type LineBreakIteratorUtf16<'l, 's> = LineBreakIterator<'l, 's, LineBreakTypeUtf16>;
220
221
/// Supports loading line break data, and creating line break iterators for different string
222
/// encodings.
223
///
224
/// The segmenter returns mandatory breaks (as defined by [definition LD7][LD7] of
225
/// Unicode Standard Annex #14, _Unicode Line Breaking Algorithm_) as well as
226
/// line break opportunities ([definition LD3][LD3]).
227
/// It does not distinguish them.  Callers requiring that distinction can check
228
/// the Line_Break property of the code point preceding the break against those
229
/// listed in rules [LB4][LB4] and [LB5][LB5], special-casing the end of text
230
/// according to [LB3][LB3].
231
///
232
/// For consistency with the grapheme, word, and sentence segmenters, there is
233
/// always a breakpoint returned at index 0, but this breakpoint is not a
234
/// meaningful line break opportunity.
235
///
236
/// [LD3]: https://www.unicode.org/reports/tr14/#LD3
237
/// [LD7]: https://www.unicode.org/reports/tr14/#LD7
238
/// [LB3]: https://www.unicode.org/reports/tr14/#LB3
239
/// [LB4]: https://www.unicode.org/reports/tr14/#LB4
240
/// [LB5]: https://www.unicode.org/reports/tr14/#LB5
241
///
242
/// ```rust
243
/// # use icu::segmenter::LineSegmenter;
244
/// #
245
/// # let segmenter = LineSegmenter::new_auto();
246
/// #
247
/// let text = "Summary\r\nThis annex…";
248
/// let breakpoints: Vec<usize> = segmenter.segment_str(text).collect();
249
/// // 9 and 22 are mandatory breaks, 14 is a line break opportunity.
250
/// assert_eq!(&breakpoints, &[0, 9, 14, 22]);
251
///
252
/// // There is a break opportunity between emoji, but not within the ZWJ sequence 🏳️‍🌈.
253
/// let flag_equation = "🏳️➕🌈🟰🏳️\u{200D}🌈";
254
/// let possible_first_lines: Vec<&str> =
255
///     segmenter.segment_str(flag_equation).skip(1).map(|i| &flag_equation[..i]).collect();
256
/// assert_eq!(
257
///     &possible_first_lines,
258
///     &[
259
///         "🏳️",
260
///         "🏳️➕",
261
///         "🏳️➕🌈",
262
///         "🏳️➕🌈🟰",
263
///         "🏳️➕🌈🟰🏳️‍🌈"
264
///     ]
265
/// );
266
/// ```
267
///
268
/// # Examples
269
///
270
/// Segment a string with default options:
271
///
272
/// ```rust
273
/// use icu::segmenter::LineSegmenter;
274
///
275
/// let segmenter = LineSegmenter::new_auto();
276
///
277
/// let breakpoints: Vec<usize> =
278
///     segmenter.segment_str("Hello World").collect();
279
/// assert_eq!(&breakpoints, &[0, 6, 11]);
280
/// ```
281
///
282
/// Segment a string with CSS option overrides:
283
///
284
/// ```rust
285
/// use icu::segmenter::{
286
///     LineBreakOptions, LineBreakStrictness, LineBreakWordOption,
287
///     LineSegmenter,
288
/// };
289
///
290
/// let mut options = LineBreakOptions::default();
291
/// options.strictness = LineBreakStrictness::Strict;
292
/// options.word_option = LineBreakWordOption::BreakAll;
293
/// options.ja_zh = false;
294
/// let segmenter = LineSegmenter::new_auto_with_options(options);
295
///
296
/// let breakpoints: Vec<usize> =
297
///     segmenter.segment_str("Hello World").collect();
298
/// assert_eq!(&breakpoints, &[0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11]);
299
/// ```
300
///
301
/// Segment a Latin1 byte string:
302
///
303
/// ```rust
304
/// use icu::segmenter::LineSegmenter;
305
///
306
/// let segmenter = LineSegmenter::new_auto();
307
///
308
/// let breakpoints: Vec<usize> =
309
///     segmenter.segment_latin1(b"Hello World").collect();
310
/// assert_eq!(&breakpoints, &[0, 6, 11]);
311
/// ```
312
///
313
/// Separate mandatory breaks from the break opportunities:
314
///
315
/// ```rust
316
/// use icu::properties::{maps, LineBreak};
317
/// use icu::segmenter::LineSegmenter;
318
///
319
/// # let segmenter = LineSegmenter::new_auto();
320
/// #
321
/// let text = "Summary\r\nThis annex…";
322
///
323
/// let mandatory_breaks: Vec<usize> = segmenter
324
///     .segment_str(text)
325
///     .into_iter()
326
///     .filter(|&i| {
327
///         text[..i].chars().next_back().map_or(false, |c| {
328
///             matches!(
329
///                 maps::line_break().get(c),
330
///                 LineBreak::MandatoryBreak
331
///                     | LineBreak::CarriageReturn
332
///                     | LineBreak::LineFeed
333
///                     | LineBreak::NextLine
334
///             ) || i == text.len()
335
///         })
336
///     })
337
///     .collect();
338
/// assert_eq!(&mandatory_breaks, &[9, 22]);
339
/// ```
340
#[derive(Debug)]
341
pub struct LineSegmenter {
342
    options: LineBreakOptions,
343
    payload: DataPayload<LineBreakDataV1Marker>,
344
    complex: ComplexPayloads,
345
}
346
347
impl LineSegmenter {
348
    /// Constructs a [`LineSegmenter`] with an invariant locale and the best available compiled data for
349
    /// complex scripts (Khmer, Lao, Myanmar, and Thai).
350
    ///
351
    /// The current behavior, which is subject to change, is to use the LSTM model when available.
352
    ///
353
    /// See also [`Self::new_auto_with_options`].
354
    ///
355
    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
356
    ///
357
    /// [📚 Help choosing a constructor](icu_provider::constructors)
358
    #[cfg(feature = "compiled_data")]
359
    #[cfg(feature = "auto")]
360
0
    pub fn new_auto() -> Self {
361
0
        Self::new_auto_with_options(Default::default())
362
0
    }
363
364
    #[cfg(feature = "auto")]
365
    icu_provider::gen_any_buffer_data_constructors!(
366
        locale: skip,
367
        options: skip,
368
        error: SegmenterError,
369
        #[cfg(skip)]
370
        functions: [
371
            new_auto,
372
            try_new_auto_with_any_provider,
373
            try_new_auto_with_buffer_provider,
374
            try_new_auto_unstable,
375
            Self,
376
        ]
377
    );
378
379
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto)]
380
    #[cfg(feature = "auto")]
381
0
    pub fn try_new_auto_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
382
0
    where
383
0
        D: DataProvider<LineBreakDataV1Marker>
384
0
            + DataProvider<LstmForWordLineAutoV1Marker>
385
0
            + DataProvider<GraphemeClusterBreakDataV1Marker>
386
0
            + ?Sized,
387
0
    {
388
0
        Self::try_new_auto_with_options_unstable(provider, Default::default())
389
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_unstable::<_>
390
391
    /// Constructs a [`LineSegmenter`] with an invariant locale and compiled LSTM data for
392
    /// complex scripts (Khmer, Lao, Myanmar, and Thai).
393
    ///
394
    /// The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than
395
    /// the full dictionary but more expensive during segmentation (inference).
396
    ///
397
    /// See also [`Self::new_lstm_with_options`].
398
    ///
399
    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
400
    ///
401
    /// [📚 Help choosing a constructor](icu_provider::constructors)
402
    #[cfg(feature = "compiled_data")]
403
    #[cfg(feature = "lstm")]
404
0
    pub fn new_lstm() -> Self {
405
0
        Self::new_lstm_with_options(Default::default())
406
0
    }
407
408
    #[cfg(feature = "lstm")]
409
    icu_provider::gen_any_buffer_data_constructors!(
410
        locale: skip,
411
        options: skip,
412
        error: SegmenterError,
413
        #[cfg(skip)]
414
        functions: [
415
            new_lstm,
416
            try_new_lstm_with_any_provider,
417
            try_new_lstm_with_buffer_provider,
418
            try_new_lstm_unstable,
419
            Self,
420
        ]
421
    );
422
423
    #[cfg(feature = "lstm")]
424
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm)]
425
0
    pub fn try_new_lstm_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
426
0
    where
427
0
        D: DataProvider<LineBreakDataV1Marker>
428
0
            + DataProvider<LstmForWordLineAutoV1Marker>
429
0
            + DataProvider<GraphemeClusterBreakDataV1Marker>
430
0
            + ?Sized,
431
0
    {
432
0
        Self::try_new_lstm_with_options_unstable(provider, Default::default())
433
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_unstable::<_>
434
435
    /// Constructs a [`LineSegmenter`] with an invariant locale and compiled dictionary data for
436
    /// complex scripts (Khmer, Lao, Myanmar, and Thai).
437
    ///
438
    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
439
    /// faster than the LSTM model but requires more data.
440
    ///
441
    /// See also [`Self::new_dictionary_with_options`].
442
    ///
443
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
444
    ///
445
    /// [📚 Help choosing a constructor](icu_provider::constructors)
446
    #[cfg(feature = "compiled_data")]
447
0
    pub fn new_dictionary() -> Self {
448
0
        Self::new_dictionary_with_options(Default::default())
449
0
    }
450
451
    icu_provider::gen_any_buffer_data_constructors!(
452
        locale: skip,
453
        options: skip,
454
        error: SegmenterError,
455
        #[cfg(skip)]
456
        functions: [
457
            new_dictionary,
458
            try_new_dictionary_with_any_provider,
459
            try_new_dictionary_with_buffer_provider,
460
            try_new_dictionary_unstable,
461
            Self,
462
        ]
463
    );
464
465
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary)]
466
0
    pub fn try_new_dictionary_unstable<D>(provider: &D) -> Result<Self, SegmenterError>
467
0
    where
468
0
        D: DataProvider<LineBreakDataV1Marker>
469
0
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
470
0
            + DataProvider<GraphemeClusterBreakDataV1Marker>
471
0
            + ?Sized,
472
0
    {
473
0
        Self::try_new_dictionary_with_options_unstable(provider, Default::default())
474
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_unstable::<_>
475
476
    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
477
    /// the best available compiled data for complex scripts (Khmer, Lao, Myanmar, and Thai).
478
    ///
479
    /// The current behavior, which is subject to change, is to use the LSTM model when available.
480
    ///
481
    /// See also [`Self::new_auto`].
482
    ///
483
    /// ✨ *Enabled with the `compiled_data` and `auto` Cargo features.*
484
    ///
485
    /// [📚 Help choosing a constructor](icu_provider::constructors)
486
    #[cfg(feature = "auto")]
487
    #[cfg(feature = "compiled_data")]
488
0
    pub fn new_auto_with_options(options: LineBreakOptions) -> Self {
489
0
        // "auto" currently delegates to the LSTM-backed constructor, passing `options` through.
        Self::new_lstm_with_options(options)
490
0
    }
491
492
    #[cfg(feature = "auto")]
493
    icu_provider::gen_any_buffer_data_constructors!(
494
        locale: skip,
495
        options: LineBreakOptions,
496
        error: SegmenterError,
497
        #[cfg(skip)]
498
        functions: [
499
            new_auto_with_options,
500
            try_new_auto_with_options_with_any_provider,
501
            try_new_auto_with_options_with_buffer_provider,
502
            try_new_auto_with_options_unstable,
503
            Self,
504
        ]
505
    );
506
507
    #[cfg(feature = "auto")]
508
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_auto_with_options)]
509
0
    pub fn try_new_auto_with_options_unstable<D>(
510
0
        provider: &D,
511
0
        options: LineBreakOptions,
512
0
    ) -> Result<Self, SegmenterError>
513
0
    where
514
0
        D: DataProvider<LineBreakDataV1Marker>
515
0
            + DataProvider<LstmForWordLineAutoV1Marker>
516
0
            + DataProvider<GraphemeClusterBreakDataV1Marker>
517
0
            + ?Sized,
518
0
    {
519
0
        Self::try_new_lstm_with_options_unstable(provider, options)
520
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_with_options_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_auto_with_options_unstable::<_>
521
522
    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
523
    /// compiled LSTM data for complex scripts (Khmer, Lao, Myanmar, and Thai).
524
    ///
525
    /// The LSTM, or Long Short-Term Memory, is a machine learning model. It is smaller than
526
    /// the full dictionary but more expensive during segmentation (inference).
527
    ///
528
    /// See also [`Self::new_lstm`].
529
    ///
530
    /// ✨ *Enabled with the `compiled_data` and `lstm` Cargo features.*
531
    ///
532
    /// [📚 Help choosing a constructor](icu_provider::constructors)
533
    #[cfg(feature = "lstm")]
534
    #[cfg(feature = "compiled_data")]
535
0
    pub fn new_lstm_with_options(options: LineBreakOptions) -> Self {
536
0
        Self {
537
0
            options,
538
0
            // Line-break rule data is borrowed from the baked (compiled-in) static singleton.
            payload: DataPayload::from_static_ref(
539
0
                crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
540
0
            ),
541
0
            // Complex-script payloads are built by the LSTM constructor.
            complex: ComplexPayloads::new_lstm(),
542
0
        }
543
0
    }
544
545
    // Generates the `_with_any_provider` / `_with_buffer_provider` constructors that wrap
    // `try_new_lstm_with_options_unstable`. The first entry names the compiled-data constructor
    // these wrap (here `new_lstm_with_options`), matching the sibling invocations in this impl.
    #[cfg(feature = "lstm")]
546
    icu_provider::gen_any_buffer_data_constructors!(
547
        locale: skip,
548
        options: LineBreakOptions,
549
        error: SegmenterError,
550
        #[cfg(skip)]
551
        functions: [
552
            new_lstm_with_options,
553
            try_new_lstm_with_options_with_any_provider,
554
            try_new_lstm_with_options_with_buffer_provider,
555
            try_new_lstm_with_options_unstable,
556
            Self,
557
        ]
558
    );
559
560
    #[cfg(feature = "lstm")]
561
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_lstm_with_options)]
562
0
    pub fn try_new_lstm_with_options_unstable<D>(
563
0
        provider: &D,
564
0
        options: LineBreakOptions,
565
0
    ) -> Result<Self, SegmenterError>
566
0
    where
567
0
        D: DataProvider<LineBreakDataV1Marker>
568
0
            + DataProvider<LstmForWordLineAutoV1Marker>
569
0
            + DataProvider<GraphemeClusterBreakDataV1Marker>
570
0
            + ?Sized,
571
0
    {
572
0
        Ok(Self {
573
0
            options,
574
0
            // Load the line-break rule data with a default request; a failed load or a
            // missing payload is returned to the caller through `?`.
            payload: provider.load(Default::default())?.take_payload()?,
575
0
            // Complex-script (LSTM) payloads come from the same provider; errors propagate too.
            complex: ComplexPayloads::try_new_lstm(provider)?,
576
        })
577
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_with_options_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_lstm_with_options_unstable::<_>
578
579
    /// Constructs a [`LineSegmenter`] with an invariant locale, custom [`LineBreakOptions`], and
580
    /// compiled dictionary data for complex scripts (Khmer, Lao, Myanmar, and Thai).
581
    ///
582
    /// The dictionary model uses a list of words to determine appropriate breakpoints. It is
583
    /// faster than the LSTM model but requires more data.
584
    ///
585
    /// See also [`Self::new_dictionary`].
586
    ///
587
    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
588
    ///
589
    /// [📚 Help choosing a constructor](icu_provider::constructors)
590
    #[cfg(feature = "compiled_data")]
591
0
    pub fn new_dictionary_with_options(options: LineBreakOptions) -> Self {
592
0
        Self {
593
0
            options,
594
0
            payload: DataPayload::from_static_ref(
595
0
                crate::provider::Baked::SINGLETON_SEGMENTER_LINE_V1,
596
0
            ),
597
0
            // Line segmenter doesn't need to load CJ dictionary because UAX #14 rules handle CJK
598
0
            // characters [1]. Southeast Asian languages however require complex context analysis
599
0
            // [2].
600
0
            //
601
0
            // [1]: https://www.unicode.org/reports/tr14/#ID
602
0
            // [2]: https://www.unicode.org/reports/tr14/#SA
603
0
            complex: ComplexPayloads::new_southeast_asian(),
604
0
        }
605
0
    }
606
607
    icu_provider::gen_any_buffer_data_constructors!(
608
        locale: skip,
609
        options: LineBreakOptions,
610
        error: SegmenterError,
611
        #[cfg(skip)]
612
        functions: [
613
            new_dictionary_with_options,
614
            try_new_dictionary_with_options_with_any_provider,
615
            try_new_dictionary_with_options_with_buffer_provider,
616
            try_new_dictionary_with_options_unstable,
617
            Self,
618
        ]
619
    );
620
621
    #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new_dictionary_with_options)]
622
0
    pub fn try_new_dictionary_with_options_unstable<D>(
623
0
        provider: &D,
624
0
        options: LineBreakOptions,
625
0
    ) -> Result<Self, SegmenterError>
626
0
    where
627
0
        D: DataProvider<LineBreakDataV1Marker>
628
0
            + DataProvider<DictionaryForWordLineExtendedV1Marker>
629
0
            + DataProvider<GraphemeClusterBreakDataV1Marker>
630
0
            + ?Sized,
631
0
    {
632
0
        Ok(Self {
633
0
            options,
634
0
            payload: provider.load(Default::default())?.take_payload()?,
635
            // Line segmenter doesn't need to load CJ dictionary because UAX #14 rules handle CJK
636
            // characters [1]. Southeast Asian languages however require complex context analysis
637
            // [2].
638
            //
639
            // [1]: https://www.unicode.org/reports/tr14/#ID
640
            // [2]: https://www.unicode.org/reports/tr14/#SA
641
0
            complex: ComplexPayloads::try_new_southeast_asian(provider)?,
642
        })
643
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_with_options_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>>
Unexecuted instantiation: <icu_segmenter::line::LineSegmenter>::try_new_dictionary_with_options_unstable::<_>
644
645
    /// Creates a line break iterator for an `str` (a UTF-8 string).
646
    ///
647
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
648
0
    pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> LineBreakIteratorUtf8<'l, 's> {
649
0
        LineBreakIterator {
650
0
            iter: input.char_indices(),
651
0
            len: input.len(),
652
0
            current_pos_data: None,
653
0
            result_cache: Vec::new(),
654
0
            data: self.payload.get(),
655
0
            options: &self.options,
656
0
            complex: &self.complex,
657
0
        }
658
0
    }
659
    /// Creates a line break iterator for a potentially ill-formed UTF-8 string.
660
    ///
661
    /// Invalid characters are treated as REPLACEMENT CHARACTER
662
    ///
663
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
664
0
    pub fn segment_utf8<'l, 's>(
665
0
        &'l self,
666
0
        input: &'s [u8],
667
0
    ) -> LineBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
668
0
        LineBreakIterator {
669
0
            iter: Utf8CharIndices::new(input),
670
0
            len: input.len(),
671
0
            current_pos_data: None,
672
0
            result_cache: Vec::new(),
673
0
            data: self.payload.get(),
674
0
            options: &self.options,
675
0
            complex: &self.complex,
676
0
        }
677
0
    }
678
    /// Creates a line break iterator for a Latin-1 (8-bit) string.
679
    ///
680
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
681
0
    pub fn segment_latin1<'l, 's>(&'l self, input: &'s [u8]) -> LineBreakIteratorLatin1<'l, 's> {
682
0
        LineBreakIterator {
683
0
            iter: Latin1Indices::new(input),
684
0
            len: input.len(),
685
0
            current_pos_data: None,
686
0
            result_cache: Vec::new(),
687
0
            data: self.payload.get(),
688
0
            options: &self.options,
689
0
            complex: &self.complex,
690
0
        }
691
0
    }
692
693
    /// Creates a line break iterator for a UTF-16 string.
694
    ///
695
    /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
696
0
    pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> LineBreakIteratorUtf16<'l, 's> {
697
0
        LineBreakIterator {
698
0
            iter: Utf16Indices::new(input),
699
0
            len: input.len(),
700
0
            current_pos_data: None,
701
0
            result_cache: Vec::new(),
702
0
            data: self.payload.get(),
703
0
            options: &self.options,
704
0
            complex: &self.complex,
705
0
        }
706
0
    }
707
}
708
709
// Line-break helpers on the rule data: property lookup, break-state table lookup, and the
// test for code points that need complex (SA) breaking.
impl RuleBreakDataV1<'_> {
710
0
    /// Looks up the line-break property of `codepoint` in the property table.
    ///
    /// When `word_option` is `BreakAll`, or `strictness` is `Loose` or `Normal`, a `CJ`
    /// result is remapped to `ID`. In every other case the table value is returned unchanged.
    fn get_linebreak_property_utf32_with_rule(
711
0
        &self,
712
0
        codepoint: u32,
713
0
        strictness: LineBreakStrictness,
714
0
        word_option: LineBreakWordOption,
715
0
    ) -> u8 {
716
0
        // Note: Default value is 0 == UNKNOWN
717
0
        let prop = self.property_table.get32(codepoint);
718
0
719
0
        if word_option == LineBreakWordOption::BreakAll
720
0
            || strictness == LineBreakStrictness::Loose
721
0
            || strictness == LineBreakStrictness::Normal
722
        {
723
0
            return match prop {
724
0
                CJ => ID, // All CJ's General_Category is Other_Letter (Lo).
725
0
                _ => prop,
726
            };
727
0
        }
728
0
729
0
        // CJ is treated as NS by default, yielding strict line breaking.
730
0
        // https://www.unicode.org/reports/tr14/#CJ
        // NOTE(review): no CJ -> NS remap happens in this function; presumably the break
        // state table treats CJ like NS. Confirm against the rule data.
731
0
        prop
732
0
    }
733
734
    /// Returns the break state for the property pair (`left`, `right`).
    ///
    /// The table is indexed with `left` as the row and `right` as the column, using
    /// `property_count` columns per row. An out-of-range index yields `BreakState::Keep`.
    #[inline]
735
0
    fn get_break_state_from_table(&self, left: u8, right: u8) -> BreakState {
736
0
        let idx = (left as usize) * (self.property_count as usize) + (right as usize);
737
0
        // We use unwrap_or to fall back to the base case and prevent panics on bad data.
738
0
        self.break_state_table.get(idx).unwrap_or(BreakState::Keep)
739
0
    }
Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::get_break_state_from_table
Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::get_break_state_from_table
740
741
    /// Returns `true` when the line-break property of `codepoint` is `SA`.
    ///
    /// The lookup uses `Strict` / `Normal`, so the `CJ` to `ID` remap is never applied here.
    #[inline]
742
0
    fn use_complex_breaking_utf32(&self, codepoint: u32) -> bool {
743
0
        let line_break_property = self.get_linebreak_property_utf32_with_rule(
744
0
            codepoint,
745
0
            LineBreakStrictness::Strict,
746
0
            LineBreakWordOption::Normal,
747
0
        );
748
0
749
0
        line_break_property == SA
750
0
    }
Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::use_complex_breaking_utf32
Unexecuted instantiation: <icu_segmenter::provider::RuleBreakDataV1>::use_complex_breaking_utf32
751
}
752
753
#[inline]
754
0
fn is_break_utf32_by_loose(
755
0
    right_codepoint: u32,
756
0
    left_prop: u8,
757
0
    right_prop: u8,
758
0
    ja_zh: bool,
759
0
) -> Option<bool> {
760
0
    // breaks before hyphens
761
0
    if right_prop == BA {
762
0
        if left_prop == ID && (right_codepoint == 0x2010 || right_codepoint == 0x2013) {
763
0
            return Some(true);
764
0
        }
765
0
    } else if right_prop == NS {
766
        // breaks before certain CJK hyphen-like characters
767
0
        if right_codepoint == 0x301C || right_codepoint == 0x30A0 {
768
0
            return Some(ja_zh);
769
0
        }
770
0
771
0
        // breaks before iteration marks
772
0
        if right_codepoint == 0x3005
773
0
            || right_codepoint == 0x303B
774
0
            || right_codepoint == 0x309D
775
0
            || right_codepoint == 0x309E
776
0
            || right_codepoint == 0x30FD
777
0
            || right_codepoint == 0x30FE
778
        {
779
0
            return Some(true);
780
0
        }
781
0
782
0
        // breaks before certain centered punctuation marks:
783
0
        if right_codepoint == 0x30FB
784
0
            || right_codepoint == 0xFF1A
785
0
            || right_codepoint == 0xFF1B
786
0
            || right_codepoint == 0xFF65
787
0
            || right_codepoint == 0x203C
788
0
            || (0x2047..=0x2049).contains(&right_codepoint)
789
        {
790
0
            return Some(ja_zh);
791
0
        }
792
0
    } else if right_prop == IN {
793
        // breaks between inseparable characters such as U+2025, U+2026 i.e. characters with the Unicode Line Break property IN
794
0
        return Some(true);
795
0
    } else if right_prop == EX {
796
        // breaks before certain centered punctuation marks:
797
0
        if right_codepoint == 0xFF01 || right_codepoint == 0xFF1F {
798
0
            return Some(ja_zh);
799
0
        }
800
0
    }
801
802
    // breaks before suffixes:
803
    // Characters with the Unicode Line Break property PO and the East Asian Width property
804
0
    if right_prop == PO_EAW {
805
0
        return Some(ja_zh);
806
0
    }
807
0
    // breaks after prefixes:
808
0
    // Characters with the Unicode Line Break property PR and the East Asian Width property
809
0
    if left_prop == PR_EAW {
810
0
        return Some(ja_zh);
811
0
    }
812
0
    None
813
0
}
Unexecuted instantiation: icu_segmenter::line::is_break_utf32_by_loose
Unexecuted instantiation: icu_segmenter::line::is_break_utf32_by_loose
814
815
/// A trait allowing for LineBreakIterator to be generalized to multiple string iteration methods.
816
///
817
/// This is implemented by ICU4X for several common string types.
818
pub trait LineBreakType<'l, 's> {
819
    /// The iterator over characters.
820
    type IterAttr: Iterator<Item = (usize, Self::CharType)> + Clone;
821
822
    /// The character type.
823
    type CharType: Copy + Into<u32>;
824
825
    fn use_complex_breaking(iterator: &LineBreakIterator<'l, 's, Self>, c: Self::CharType) -> bool;
826
827
    fn get_linebreak_property_with_rule(
828
        iterator: &LineBreakIterator<'l, 's, Self>,
829
        c: Self::CharType,
830
    ) -> u8;
831
832
    fn get_current_position_character_len(iterator: &LineBreakIterator<'l, 's, Self>) -> usize;
833
834
    fn handle_complex_language(
835
        iterator: &mut LineBreakIterator<'l, 's, Self>,
836
        left_codepoint: Self::CharType,
837
    ) -> Option<usize>;
838
}
839
840
/// Implements the [`Iterator`] trait over the line break opportunities of the given string.
841
///
842
/// Lifetimes:
843
///
844
/// - `'l` = lifetime of the [`LineSegmenter`] object from which this iterator was created
845
/// - `'s` = lifetime of the string being segmented
846
///
847
/// The [`Iterator::Item`] is an [`usize`] representing index of a code unit
848
/// _after_ the break (for a break at the end of text, this index is the length
849
/// of the [`str`] or array of code units).
850
///
851
/// For examples of use, see [`LineSegmenter`].
852
#[derive(Debug)]
853
pub struct LineBreakIterator<'l, 's, Y: LineBreakType<'l, 's> + ?Sized> {
854
    iter: Y::IterAttr,
855
    len: usize,
856
    current_pos_data: Option<(usize, Y::CharType)>,
857
    result_cache: Vec<usize>,
858
    data: &'l RuleBreakDataV1<'l>,
859
    options: &'l LineBreakOptions,
860
    complex: &'l ComplexPayloads,
861
}
862
863
impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y> {
864
    type Item = usize;
865
866
0
    fn next(&mut self) -> Option<Self::Item> {
867
0
        match self.check_eof() {
868
0
            StringBoundaryPosType::Start => return Some(0),
869
0
            StringBoundaryPosType::End => return None,
870
0
            _ => (),
871
        }
872
873
        // If we have break point cache by previous run, return this result
874
0
        if let Some(&first_pos) = self.result_cache.first() {
875
0
            let mut i = 0;
876
            loop {
877
0
                if i == first_pos {
878
0
                    self.result_cache = self.result_cache.iter().skip(1).map(|r| r - i).collect();
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16> as core::iter::traits::iterator::Iterator>::next::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1> as core::iter::traits::iterator::Iterator>::next::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8> as core::iter::traits::iterator::Iterator>::next::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_> as core::iter::traits::iterator::Iterator>::next::{closure#0}
879
0
                    return self.get_current_position();
880
0
                }
881
0
                i += Y::get_current_position_character_len(self);
882
0
                self.advance_iter();
883
0
                if self.is_eof() {
884
0
                    self.result_cache.clear();
885
0
                    return Some(self.len);
886
0
                }
887
            }
888
0
        }
889
890
        'a: loop {
891
0
            debug_assert!(!self.is_eof());
892
0
            let left_codepoint = self.get_current_codepoint()?;
893
0
            let mut left_prop = self.get_linebreak_property(left_codepoint);
894
0
            self.advance_iter();
895
896
0
            let Some(right_codepoint) = self.get_current_codepoint() else {
897
0
                return Some(self.len);
898
            };
899
0
            let right_prop = self.get_linebreak_property(right_codepoint);
900
0
901
0
            // CSS word-break property handling
902
0
            match (self.options.word_option, left_prop, right_prop) {
903
0
                (LineBreakWordOption::BreakAll, AL | NU | SA, _) => {
904
0
                    left_prop = ID;
905
0
                }
906
                //  typographic letter units shouldn't be break
907
                (
908
                    LineBreakWordOption::KeepAll,
909
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
910
                    AI | AL | ID | NU | HY | H2 | H3 | JL | JV | JT | CJ,
911
                ) => {
912
0
                    continue;
913
                }
914
0
                _ => (),
915
            }
916
917
            // CSS line-break property handling
918
0
            match self.options.strictness {
919
                LineBreakStrictness::Normal => {
920
0
                    if self.is_break_by_normal(right_codepoint) {
921
0
                        return self.get_current_position();
922
0
                    }
923
                }
924
                LineBreakStrictness::Loose => {
925
0
                    if let Some(breakable) = is_break_utf32_by_loose(
926
0
                        right_codepoint.into(),
927
0
                        left_prop,
928
0
                        right_prop,
929
0
                        self.options.ja_zh,
930
0
                    ) {
931
0
                        if breakable {
932
0
                            return self.get_current_position();
933
0
                        }
934
0
                        continue;
935
0
                    }
936
                }
937
                LineBreakStrictness::Anywhere => {
938
0
                    return self.get_current_position();
939
                }
940
0
                _ => (),
941
            };
942
943
            // UAX14 doesn't have Thai etc, so use another way.
944
0
            if self.options.word_option != LineBreakWordOption::BreakAll
945
0
                && Y::use_complex_breaking(self, left_codepoint)
946
0
                && Y::use_complex_breaking(self, right_codepoint)
947
            {
948
0
                let result = Y::handle_complex_language(self, left_codepoint);
949
0
                if result.is_some() {
950
0
                    return result;
951
0
                }
952
                // I may have to fetch text until non-SA character?.
953
0
            }
954
955
            // If break_state is equals or grater than 0, it is alias of property.
956
0
            let mut index = match self.data.get_break_state_from_table(left_prop, right_prop) {
957
0
                BreakState::Index(index) => index,
958
                // Line break uses more that 64 states, so they spill over into the intermediate range,
959
                // and we cannot change that at the moment
960
0
                BreakState::Intermediate(index) => index + 64,
961
0
                BreakState::Break | BreakState::NoMatch => return self.get_current_position(),
962
0
                BreakState::Keep => continue,
963
            };
964
965
0
            let mut previous_iter = self.iter.clone();
966
0
            let mut previous_pos_data = self.current_pos_data;
967
968
            loop {
969
0
                self.advance_iter();
970
971
0
                let Some(prop) = self.get_current_linebreak_property() else {
972
                    // Reached EOF. But we are analyzing multiple characters now, so next break may be previous point.
973
0
                    let break_state = self
974
0
                        .data
975
0
                        .get_break_state_from_table(index, self.data.eot_property);
976
0
                    if break_state == BreakState::NoMatch {
977
0
                        self.iter = previous_iter;
978
0
                        self.current_pos_data = previous_pos_data;
979
0
                        return self.get_current_position();
980
0
                    }
981
0
                    // EOF
982
0
                    return Some(self.len);
983
                };
984
985
0
                match self.data.get_break_state_from_table(index, prop) {
986
0
                    BreakState::Keep => continue 'a,
987
                    BreakState::NoMatch => {
988
0
                        self.iter = previous_iter;
989
0
                        self.current_pos_data = previous_pos_data;
990
0
                        return self.get_current_position();
991
                    }
992
0
                    BreakState::Break => return self.get_current_position(),
993
0
                    BreakState::Index(i) => {
994
0
                        index = i;
995
0
                        previous_iter = self.iter.clone();
996
0
                        previous_pos_data = self.current_pos_data;
997
0
                    }
998
0
                    BreakState::Intermediate(i) => {
999
0
                        index = i + 64;
1000
0
                        previous_iter = self.iter.clone();
1001
0
                        previous_pos_data = self.current_pos_data;
1002
0
                    }
1003
                }
1004
            }
1005
        }
1006
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16> as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1> as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8> as core::iter::traits::iterator::Iterator>::next
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_> as core::iter::traits::iterator::Iterator>::next
1007
}
1008
1009
/// Position of the iterator relative to the text, as classified by `check_eof`.
enum StringBoundaryPosType {
    /// No character had been read yet; the break at offset 0 is due.
    Start,
    /// A current character is available; regular scanning continues.
    Middle,
    /// The text is exhausted and nothing further is to be reported.
    End,
}
1014
1015
impl<'l, 's, Y: LineBreakType<'l, 's>> LineBreakIterator<'l, 's, Y> {
1016
0
    fn advance_iter(&mut self) {
1017
0
        self.current_pos_data = self.iter.next();
1018
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::advance_iter
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::advance_iter
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::advance_iter
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::advance_iter
1019
1020
0
    fn is_eof(&self) -> bool {
1021
0
        self.current_pos_data.is_none()
1022
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::is_eof
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::is_eof
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::is_eof
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::is_eof
1023
1024
    #[inline]
1025
0
    fn check_eof(&mut self) -> StringBoundaryPosType {
1026
0
        if self.is_eof() {
1027
0
            self.advance_iter();
1028
0
            if self.is_eof() {
1029
0
                if self.len == 0 {
1030
                    // Empty string. Since `self.current_pos_data` is always going to be empty,
1031
                    // we never read `self.len` except for here, so we can use it to mark that
1032
                    // we have already returned the single empty-string breakpoint.
1033
0
                    self.len = 1;
1034
0
                    StringBoundaryPosType::Start
1035
                } else {
1036
0
                    StringBoundaryPosType::End
1037
                }
1038
            } else {
1039
0
                StringBoundaryPosType::Start
1040
            }
1041
        } else {
1042
0
            StringBoundaryPosType::Middle
1043
        }
1044
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::check_eof
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::check_eof
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::check_eof
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::check_eof
1045
1046
0
    fn get_current_position(&self) -> Option<usize> {
1047
0
        self.current_pos_data.map(|(pos, _)| pos)
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_position::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_position::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_position::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_position::{closure#0}
1048
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_position
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_position
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_position
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_position
1049
1050
0
    fn get_current_codepoint(&self) -> Option<Y::CharType> {
1051
0
        self.current_pos_data.map(|(_, codepoint)| codepoint)
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_codepoint::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_codepoint::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_codepoint::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_codepoint::{closure#0}
1052
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_codepoint
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf8>>::get_current_codepoint
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_codepoint
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_codepoint
1053
1054
0
    fn get_linebreak_property(&self, codepoint: Y::CharType) -> u8 {
1055
0
        Y::get_linebreak_property_with_rule(self, codepoint)
1056
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_linebreak_property
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_linebreak_property
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_linebreak_property
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::get_linebreak_property
1057
1058
0
    fn get_current_linebreak_property(&self) -> Option<u8> {
1059
0
        self.get_current_codepoint()
1060
0
            .map(|c| self.get_linebreak_property(c))
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_linebreak_property::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_linebreak_property::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_linebreak_property::{closure#0}
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::get_current_linebreak_property::{closure#0}
1061
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::get_current_linebreak_property
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::get_current_linebreak_property
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::get_current_linebreak_property
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::get_current_linebreak_property
1062
1063
0
    fn is_break_by_normal(&self, codepoint: Y::CharType) -> bool {
1064
0
        match codepoint.into() {
1065
0
            0x301C | 0x30A0 => self.options.ja_zh,
1066
0
            _ => false,
1067
        }
1068
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeUtf16>>::is_break_by_normal
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypeLatin1>>::is_break_by_normal
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>>::is_break_by_normal
Unexecuted instantiation: <icu_segmenter::line::LineBreakIterator<_>>::is_break_by_normal
1069
}
1070
1071
#[derive(Debug)]
1072
pub struct LineBreakTypeUtf8;
1073
1074
impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf8 {
1075
    type IterAttr = CharIndices<'s>;
1076
    type CharType = char;
1077
1078
0
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1079
0
        iterator.data.get_linebreak_property_utf32_with_rule(
1080
0
            c as u32,
1081
0
            iterator.options.strictness,
1082
0
            iterator.options.word_option,
1083
0
        )
1084
0
    }
1085
1086
    #[inline]
1087
0
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1088
0
        iterator.data.use_complex_breaking_utf32(c as u32)
1089
0
    }
1090
1091
0
    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1092
0
        iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
1093
0
    }
1094
1095
0
    fn handle_complex_language(
1096
0
        iter: &mut LineBreakIterator<'l, 's, Self>,
1097
0
        left_codepoint: char,
1098
0
    ) -> Option<usize> {
1099
0
        handle_complex_language_utf8(iter, left_codepoint)
1100
0
    }
1101
}
1102
1103
#[derive(Debug)]
1104
pub struct LineBreakTypePotentiallyIllFormedUtf8;
1105
1106
impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypePotentiallyIllFormedUtf8 {
1107
    type IterAttr = Utf8CharIndices<'s>;
1108
    type CharType = char;
1109
1110
0
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: char) -> u8 {
1111
0
        iterator.data.get_linebreak_property_utf32_with_rule(
1112
0
            c as u32,
1113
0
            iterator.options.strictness,
1114
0
            iterator.options.word_option,
1115
0
        )
1116
0
    }
1117
1118
    #[inline]
1119
0
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: char) -> bool {
1120
0
        iterator.data.use_complex_breaking_utf32(c as u32)
1121
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8 as icu_segmenter::line::LineBreakType>::use_complex_breaking
Unexecuted instantiation: <icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8 as icu_segmenter::line::LineBreakType>::use_complex_breaking
1122
1123
0
    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1124
0
        iterator.get_current_codepoint().map_or(0, |c| c.len_utf8())
1125
0
    }
1126
1127
0
    fn handle_complex_language(
1128
0
        iter: &mut LineBreakIterator<'l, 's, Self>,
1129
0
        left_codepoint: char,
1130
0
    ) -> Option<usize> {
1131
0
        handle_complex_language_utf8(iter, left_codepoint)
1132
0
    }
1133
}
1134
/// handle_complex_language impl for UTF8 iterators
1135
0
fn handle_complex_language_utf8<'l, 's, T>(
1136
0
    iter: &mut LineBreakIterator<'l, 's, T>,
1137
0
    left_codepoint: char,
1138
0
) -> Option<usize>
1139
0
where
1140
0
    T: LineBreakType<'l, 's, CharType = char>,
1141
0
{
1142
0
    // word segmenter doesn't define break rules for some languages such as Thai.
1143
0
    let start_iter = iter.iter.clone();
1144
0
    let start_point = iter.current_pos_data;
1145
0
    let mut s = String::new();
1146
0
    s.push(left_codepoint);
1147
    loop {
1148
0
        debug_assert!(!iter.is_eof());
1149
0
        s.push(iter.get_current_codepoint()?);
1150
0
        iter.advance_iter();
1151
0
        if let Some(current_codepoint) = iter.get_current_codepoint() {
1152
0
            if !T::use_complex_breaking(iter, current_codepoint) {
1153
0
                break;
1154
0
            }
1155
        } else {
1156
            // EOF
1157
0
            break;
1158
        }
1159
    }
1160
1161
    // Restore iterator to move to head of complex string
1162
0
    iter.iter = start_iter;
1163
0
    iter.current_pos_data = start_point;
1164
0
    let breaks = complex_language_segment_str(iter.complex, &s);
1165
0
    iter.result_cache = breaks;
1166
0
    let first_pos = *iter.result_cache.first()?;
1167
0
    let mut i = left_codepoint.len_utf8();
1168
    loop {
1169
0
        if i == first_pos {
1170
            // Re-calculate breaking offset
1171
0
            iter.result_cache = iter.result_cache.iter().skip(1).map(|r| r - i).collect();
Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypeUtf8>::{closure#0}
Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>::{closure#0}
1172
0
            return iter.get_current_position();
1173
0
        }
1174
0
        debug_assert!(
1175
0
            i < first_pos,
1176
0
            "we should always arrive at first_pos: near index {:?}",
1177
0
            iter.get_current_position()
1178
        );
1179
0
        i += T::get_current_position_character_len(iter);
1180
0
        iter.advance_iter();
1181
0
        if iter.is_eof() {
1182
0
            iter.result_cache.clear();
1183
0
            return Some(iter.len);
1184
0
        }
1185
    }
1186
0
}
Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypeUtf8>
Unexecuted instantiation: icu_segmenter::line::handle_complex_language_utf8::<icu_segmenter::line::LineBreakTypePotentiallyIllFormedUtf8>
1187
1188
#[derive(Debug)]
1189
pub struct LineBreakTypeLatin1;
1190
1191
impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeLatin1 {
1192
    type IterAttr = Latin1Indices<'s>;
1193
    type CharType = u8;
1194
1195
0
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u8) -> u8 {
1196
0
        // No CJ on Latin1
1197
0
        // Note: Default value is 0 == UNKNOWN
1198
0
        iterator.data.property_table.get32(c as u32)
1199
0
    }
1200
1201
    #[inline]
1202
0
    fn use_complex_breaking(_iterator: &LineBreakIterator<Self>, _c: u8) -> bool {
1203
0
        false
1204
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeLatin1 as icu_segmenter::line::LineBreakType>::use_complex_breaking
Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeLatin1 as icu_segmenter::line::LineBreakType>::use_complex_breaking
1205
1206
0
    fn get_current_position_character_len(_: &LineBreakIterator<Self>) -> usize {
1207
0
        unreachable!()
1208
    }
1209
1210
0
    fn handle_complex_language(
1211
0
        _: &mut LineBreakIterator<Self>,
1212
0
        _: Self::CharType,
1213
0
    ) -> Option<usize> {
1214
0
        unreachable!()
1215
    }
1216
}
1217
1218
#[derive(Debug)]
1219
pub struct LineBreakTypeUtf16;
1220
1221
impl<'l, 's> LineBreakType<'l, 's> for LineBreakTypeUtf16 {
1222
    type IterAttr = Utf16Indices<'s>;
1223
    type CharType = u32;
1224
1225
0
    fn get_linebreak_property_with_rule(iterator: &LineBreakIterator<Self>, c: u32) -> u8 {
1226
0
        iterator.data.get_linebreak_property_utf32_with_rule(
1227
0
            c,
1228
0
            iterator.options.strictness,
1229
0
            iterator.options.word_option,
1230
0
        )
1231
0
    }
1232
1233
    #[inline]
1234
0
    fn use_complex_breaking(iterator: &LineBreakIterator<Self>, c: u32) -> bool {
1235
0
        iterator.data.use_complex_breaking_utf32(c)
1236
0
    }
Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeUtf16 as icu_segmenter::line::LineBreakType>::use_complex_breaking
Unexecuted instantiation: <icu_segmenter::line::LineBreakTypeUtf16 as icu_segmenter::line::LineBreakType>::use_complex_breaking
1237
1238
0
    fn get_current_position_character_len(iterator: &LineBreakIterator<Self>) -> usize {
1239
0
        match iterator.get_current_codepoint() {
1240
0
            None => 0,
1241
0
            Some(ch) if ch >= 0x10000 => 2,
1242
0
            _ => 1,
1243
        }
1244
0
    }
1245
1246
0
    fn handle_complex_language(
1247
0
        iterator: &mut LineBreakIterator<Self>,
1248
0
        left_codepoint: Self::CharType,
1249
0
    ) -> Option<usize> {
1250
0
        // word segmenter doesn't define break rules for some languages such as Thai.
1251
0
        let start_iter = iterator.iter.clone();
1252
0
        let start_point = iterator.current_pos_data;
1253
0
        let mut s = vec![left_codepoint as u16];
1254
        loop {
1255
0
            debug_assert!(!iterator.is_eof());
1256
0
            s.push(iterator.get_current_codepoint()? as u16);
1257
0
            iterator.advance_iter();
1258
0
            if let Some(current_codepoint) = iterator.get_current_codepoint() {
1259
0
                if !Self::use_complex_breaking(iterator, current_codepoint) {
1260
0
                    break;
1261
0
                }
1262
            } else {
1263
                // EOF
1264
0
                break;
1265
            }
1266
        }
1267
1268
        // Restore iterator to move to head of complex string
1269
0
        iterator.iter = start_iter;
1270
0
        iterator.current_pos_data = start_point;
1271
0
        let breaks = complex_language_segment_utf16(iterator.complex, &s);
1272
0
        iterator.result_cache = breaks;
1273
        // result_cache vector is utf-16 index that is in BMP.
1274
0
        let first_pos = *iterator.result_cache.first()?;
1275
0
        let mut i = 1;
1276
        loop {
1277
0
            if i == first_pos {
1278
                // Re-calculate breaking offset
1279
0
                iterator.result_cache = iterator
1280
0
                    .result_cache
1281
0
                    .iter()
1282
0
                    .skip(1)
1283
0
                    .map(|r| r - i)
1284
0
                    .collect();
1285
0
                return iterator.get_current_position();
1286
0
            }
1287
0
            debug_assert!(
1288
0
                i < first_pos,
1289
0
                "we should always arrive at first_pos: near index {:?}",
1290
0
                iterator.get_current_position()
1291
            );
1292
0
            i += 1;
1293
0
            iterator.advance_iter();
1294
0
            if iterator.is_eof() {
1295
0
                iterator.result_cache.clear();
1296
0
                return Some(iterator.len);
1297
0
            }
1298
        }
1299
0
    }
1300
}
1301
1302
#[cfg(test)]
1303
#[cfg(feature = "serde")]
1304
mod tests {
1305
    use super::*;
1306
    use crate::LineSegmenter;
1307
1308
    /// Spot-checks the line-break property value that the baked line-break data
    /// reports for a handful of individual code points.
    #[test]
1309
    fn linebreak_property() {
1310
        // Load the line-break payload directly from the baked provider; either
        // step failing aborts the test through `expect`.
        let payload = DataProvider::<LineBreakDataV1Marker>::load(
1311
            &crate::provider::Baked,
1312
            Default::default(),
1313
        )
1314
        .expect("Loading should succeed!")
1315
        .take_payload()
1316
        .expect("Data should be present!");
1317
1318
        // Every lookup below uses the same rule settings: strict strictness and
        // the normal word option. The closure is called with `char` values and
        // converts them to `u32` for the lookup.
        let get_linebreak_property = |codepoint| {
1319
            payload.get().get_linebreak_property_utf32_with_rule(
1320
                codepoint as u32,
1321
                LineBreakStrictness::Strict,
1322
                LineBreakWordOption::Normal,
1323
            )
1324
        };
1325
1326
        // The expected values are the property constants defined in this file.
        assert_eq!(get_linebreak_property('\u{0020}'), SP);
1327
        assert_eq!(get_linebreak_property('\u{0022}'), QU);
1328
        assert_eq!(get_linebreak_property('('), OP_OP30);
1329
        assert_eq!(get_linebreak_property('\u{0030}'), NU);
1330
        assert_eq!(get_linebreak_property('['), OP_OP30);
1331
        assert_eq!(get_linebreak_property('\u{1f3fb}'), EM);
1332
        assert_eq!(get_linebreak_property('\u{20000}'), ID);
1333
        assert_eq!(get_linebreak_property('\u{e0020}'), CM);
1334
        assert_eq!(get_linebreak_property('\u{3041}'), CJ);
1335
        assert_eq!(get_linebreak_property('\u{0025}'), PO);
1336
        assert_eq!(get_linebreak_property('\u{00A7}'), AI);
1337
        assert_eq!(get_linebreak_property('\u{50005}'), XX);
1338
        assert_eq!(get_linebreak_property('\u{17D6}'), NS);
1339
        assert_eq!(get_linebreak_property('\u{2014}'), B2);
1340
    }
1341
1342
    /// Checks the pair table of the baked line-break data: for a (left, right)
    /// pair of line-break properties, is a break reported between them?
    ///
    /// The `LBnn` headings name the UAX #14 line-breaking rule that each group
    /// of assertions exercises.
    #[test]
1343
    #[allow(clippy::bool_assert_comparison)] // clearer when we're testing bools directly
1344
    fn break_rule() {
1345
        let payload = DataProvider::<LineBreakDataV1Marker>::load(
1346
            &crate::provider::Baked,
1347
            Default::default(),
1348
        )
1349
        .expect("Loading should succeed!")
1350
        .take_payload()
1351
        .expect("Data should be present!");
1352
        let lb_data: &RuleBreakDataV1 = payload.get();
1353
1354
        // A pair counts as a break when the table lookup yields either `Break`
        // or `NoMatch`; any other state is treated as "no break".
        let is_break = |left, right| {
1355
            matches!(
1356
                lb_data.get_break_state_from_table(left, right),
1357
                BreakState::Break | BreakState::NoMatch
1358
            )
1359
        };
1360
1361
        // LB4
1362
        assert_eq!(is_break(BK, AL), true);
1363
        // LB5
1364
        assert_eq!(is_break(CR, LF), false);
1365
        assert_eq!(is_break(CR, AL), true);
1366
        assert_eq!(is_break(LF, AL), true);
1367
        assert_eq!(is_break(NL, AL), true);
1368
        // LB6
1369
        assert_eq!(is_break(AL, BK), false);
1370
        assert_eq!(is_break(AL, CR), false);
1371
        assert_eq!(is_break(AL, LF), false);
1372
        assert_eq!(is_break(AL, NL), false);
1373
        // LB7
1374
        assert_eq!(is_break(AL, SP), false);
1375
        assert_eq!(is_break(AL, ZW), false);
1376
        // LB8 (no assertion in this test)
1377
        // LB8a
1378
        assert_eq!(is_break(ZWJ, AL), false);
1379
        // LB9
1380
        assert_eq!(is_break(AL, ZWJ), false);
1381
        assert_eq!(is_break(AL, CM), false);
1382
        assert_eq!(is_break(ID, ZWJ), false);
1383
        // LB10
1384
        assert_eq!(is_break(ZWJ, SP), false);
1385
        assert_eq!(is_break(SP, CM), true);
1386
        // LB11
1387
        assert_eq!(is_break(AL, WJ), false);
1388
        assert_eq!(is_break(WJ, AL), false);
1389
        // LB12
1390
        assert_eq!(is_break(GL, AL), false);
1391
        // LB12a
1392
        assert_eq!(is_break(AL, GL), false);
1393
        assert_eq!(is_break(SP, GL), true);
1394
        // LB13
1395
        assert_eq!(is_break(AL, CL), false);
1396
        assert_eq!(is_break(AL, CP), false);
1397
        assert_eq!(is_break(AL, EX), false);
1398
        assert_eq!(is_break(AL, IS), false);
1399
        assert_eq!(is_break(AL, SY), false);
1400
        // LB18
1401
        assert_eq!(is_break(SP, AL), true);
1402
        // LB19
1403
        assert_eq!(is_break(AL, QU), false);
1404
        assert_eq!(is_break(QU, AL), false);
1405
        // LB20
1406
        assert_eq!(is_break(AL, CB), true);
1407
        assert_eq!(is_break(CB, AL), true);
1408
        // LB21 (no break before BA, HY or NS)
1409
        assert_eq!(is_break(AL, BA), false);
1410
        assert_eq!(is_break(AL, HY), false);
1411
        assert_eq!(is_break(AL, NS), false);
1412
        // LB21 (further pairs, including no break after BB)
1413
        assert_eq!(is_break(AL, BA), false);
1414
        assert_eq!(is_break(BB, AL), false);
1415
        assert_eq!(is_break(ID, BA), false);
1416
        assert_eq!(is_break(ID, NS), false);
1417
        // LB21a (no assertion in this test)
1418
        // LB21b
1419
        assert_eq!(is_break(SY, HL), false);
1420
        // LB22
1421
        assert_eq!(is_break(AL, IN), false);
1422
        // LB23
1423
        assert_eq!(is_break(AL, NU), false);
1424
        assert_eq!(is_break(HL, NU), false);
1425
        // LB23a
1426
        assert_eq!(is_break(PR, ID), false);
1427
        assert_eq!(is_break(PR, EB), false);
1428
        assert_eq!(is_break(PR, EM), false);
1429
        assert_eq!(is_break(ID, PO), false);
1430
        assert_eq!(is_break(EB, PO), false);
1431
        assert_eq!(is_break(EM, PO), false);
1432
        // LB26
1433
        assert_eq!(is_break(JL, JL), false);
1434
        assert_eq!(is_break(JL, JV), false);
1435
        assert_eq!(is_break(JL, H2), false);
1436
        // LB27
1437
        assert_eq!(is_break(JL, IN), false);
1438
        assert_eq!(is_break(JL, PO), false);
1439
        assert_eq!(is_break(PR, JL), false);
1440
        // LB28
1441
        assert_eq!(is_break(AL, AL), false);
1442
        assert_eq!(is_break(HL, AL), false);
1443
        // LB29
1444
        assert_eq!(is_break(IS, AL), false);
1445
        assert_eq!(is_break(IS, HL), false);
1446
        // LB30b
1447
        assert_eq!(is_break(EB, EM), false);
1448
        // LB31
1449
        assert_eq!(is_break(ID, ID), true);
1450
    }
1451
1452
    /// End-to-end break positions from a `LineSegmenter` built with
    /// `try_new_dictionary_unstable` on the baked provider.
    ///
    /// The same inputs are fed through `segment_str`, `segment_latin1` and
    /// `segment_utf16`. The expected positions are indices into the respective
    /// input: bytes for `&str` and Latin-1, `u16` code units for UTF-16.
    #[test]
1453
    fn linebreak() {
1454
        let segmenter = LineSegmenter::try_new_dictionary_unstable(&crate::provider::Baked)
1455
            .expect("Data exists");
1456
1457
        // Every iterator below yields 0 first and the input length last.
        let mut iter = segmenter.segment_str("hello world");
1458
        assert_eq!(Some(0), iter.next());
1459
        assert_eq!(Some(6), iter.next());
1460
        assert_eq!(Some(11), iter.next());
1461
        assert_eq!(None, iter.next());
1462
1463
        iter = segmenter.segment_str("$10 $10");
1464
        assert_eq!(Some(0), iter.next());
1465
        assert_eq!(Some(4), iter.next());
1466
        assert_eq!(Some(7), iter.next());
1467
        assert_eq!(None, iter.next());
1468
1469
        // LB10 (no case in this test)
1470
1471
        // LB14
1472
        iter = segmenter.segment_str("[  abc def");
1473
        assert_eq!(Some(0), iter.next());
1474
        assert_eq!(Some(7), iter.next());
1475
        assert_eq!(Some(10), iter.next());
1476
        assert_eq!(None, iter.next());
1477
1478
        // Same text as above ("[  abc def"), as Latin-1 bytes.
        let input: [u8; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1479
        let mut iter_u8 = segmenter.segment_latin1(&input);
1480
        assert_eq!(Some(0), iter_u8.next());
1481
        assert_eq!(Some(7), iter_u8.next());
1482
        assert_eq!(Some(10), iter_u8.next());
1483
        assert_eq!(None, iter_u8.next());
1484
1485
        // Same text again, as UTF-16 code units.
        let input: [u16; 10] = [0x5B, 0x20, 0x20, 0x61, 0x62, 0x63, 0x20, 0x64, 0x65, 0x66];
1486
        let mut iter_u16 = segmenter.segment_utf16(&input);
1487
        assert_eq!(Some(0), iter_u16.next());
1488
        assert_eq!(Some(7), iter_u16.next());
1489
        assert_eq!(Some(10), iter_u16.next());
1490
        assert_eq!(None, iter_u16.next());
1491
1492
        // LB15
1493
        iter = segmenter.segment_str("abc\u{0022}  (def");
1494
        assert_eq!(Some(0), iter.next());
1495
        assert_eq!(Some(10), iter.next());
1496
        assert_eq!(None, iter.next());
1497
1498
        // Same text as above, as Latin-1 bytes.
        let input: [u8; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1499
        let mut iter_u8 = segmenter.segment_latin1(&input);
1500
        assert_eq!(Some(0), iter_u8.next());
1501
        assert_eq!(Some(10), iter_u8.next());
1502
        assert_eq!(None, iter_u8.next());
1503
1504
        // Same text again, as UTF-16 code units.
        let input: [u16; 10] = [0x61, 0x62, 0x63, 0x22, 0x20, 0x20, 0x28, 0x64, 0x65, 0x66];
1505
        let mut iter_u16 = segmenter.segment_utf16(&input);
1506
        assert_eq!(Some(0), iter_u16.next());
1507
        assert_eq!(Some(10), iter_u16.next());
1508
        assert_eq!(None, iter_u16.next());
1509
1510
        // LB16
        // U+0029 is 1 byte and U+203C is 3 bytes in UTF-8, hence the end at 4.
1511
        iter = segmenter.segment_str("\u{0029}\u{203C}");
1512
        assert_eq!(Some(0), iter.next());
1513
        assert_eq!(Some(4), iter.next());
1514
        assert_eq!(None, iter.next());
1515
        iter = segmenter.segment_str("\u{0029}  \u{203C}");
1516
        assert_eq!(Some(0), iter.next());
1517
        assert_eq!(Some(6), iter.next());
1518
        assert_eq!(None, iter.next());
1519
1520
        // UTF-16 form of the spaced input above: 4 code units in total.
        let input: [u16; 4] = [0x29, 0x20, 0x20, 0x203c];
1521
        let mut iter_u16 = segmenter.segment_utf16(&input);
1522
        assert_eq!(Some(0), iter_u16.next());
1523
        assert_eq!(Some(4), iter_u16.next());
1524
        assert_eq!(None, iter_u16.next());
1525
1526
        // LB17
        // U+2014 is 3 bytes in UTF-8, so two of them end at byte 6.
1527
        iter = segmenter.segment_str("\u{2014}\u{2014}aa");
1528
        assert_eq!(Some(0), iter.next());
1529
        assert_eq!(Some(6), iter.next());
1530
        assert_eq!(Some(8), iter.next());
1531
        assert_eq!(None, iter.next());
1532
        iter = segmenter.segment_str("\u{2014}  \u{2014}aa");
1533
        assert_eq!(Some(0), iter.next());
1534
        assert_eq!(Some(8), iter.next());
1535
        assert_eq!(Some(10), iter.next());
1536
        assert_eq!(None, iter.next());
1537
1538
        iter = segmenter.segment_str("\u{2014}\u{2014}  \u{2014}\u{2014}123 abc");
1539
        assert_eq!(Some(0), iter.next());
1540
        assert_eq!(Some(14), iter.next());
1541
        assert_eq!(Some(18), iter.next());
1542
        assert_eq!(Some(21), iter.next());
1543
        assert_eq!(None, iter.next());
1544
1545
        // LB25
1546
        let mut iter = segmenter.segment_str("(0,1)+(2,3)");
1547
        assert_eq!(Some(0), iter.next());
1548
        assert_eq!(Some(11), iter.next());
1549
        assert_eq!(None, iter.next());
1550
        // Same text as above, as UTF-16 code units.
        let input: [u16; 11] = [
1551
            0x28, 0x30, 0x2C, 0x31, 0x29, 0x2B, 0x28, 0x32, 0x2C, 0x33, 0x29,
1552
        ];
1553
        let mut iter_u16 = segmenter.segment_utf16(&input);
1554
        assert_eq!(Some(0), iter_u16.next());
1555
        assert_eq!(Some(11), iter_u16.next());
1556
        assert_eq!(None, iter_u16.next());
1557
1558
        // UTF-16 form of the last LB17 input above (two em dashes, two spaces,
        // two em dashes, "123 abc"); positions are in code units here.
        let input: [u16; 13] = [
1559
            0x2014, 0x2014, 0x20, 0x20, 0x2014, 0x2014, 0x31, 0x32, 0x33, 0x20, 0x61, 0x62, 0x63,
1560
        ];
1561
        let mut iter_u16 = segmenter.segment_utf16(&input);
1562
        assert_eq!(Some(0), iter_u16.next());
1563
        assert_eq!(Some(6), iter_u16.next());
1564
        assert_eq!(Some(10), iter_u16.next());
1565
        assert_eq!(Some(13), iter_u16.next());
1566
        assert_eq!(None, iter_u16.next());
1567
1568
        // Two U+1F3FB code points (4 UTF-8 bytes each) separated by one space.
        iter = segmenter.segment_str("\u{1F3FB} \u{1F3FB}");
1569
        assert_eq!(Some(0), iter.next());
1570
        assert_eq!(Some(5), iter.next());
1571
        assert_eq!(Some(9), iter.next());
1572
        assert_eq!(None, iter.next());
1573
    }
1574
1575
    /// Break positions for a Thai sample using the segmenter from
    /// `LineSegmenter::new_lstm()`; only compiled with the `lstm` feature.
    #[test]
1576
    #[cfg(feature = "lstm")]
1577
    fn thai_line_break() {
1578
        const TEST_STR: &str = "ภาษาไทยภาษาไทย";
1579
1580
        let segmenter = LineSegmenter::new_lstm();
1581
        // UTF-8 input: expected positions are byte offsets into TEST_STR.
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1582
        assert_eq!(breaks, [0, 12, 21, 33, TEST_STR.len()], "Thai test");
1583
1584
        // UTF-16 input: the same boundaries expressed in code units (each byte
        // offset above divided by 3).
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1585
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1586
        assert_eq!(breaks, [0, 4, 7, 11, utf16.len()], "Thai test");
1587
1588
        // A four-unit input on its own: only the start and end are expected.
        let utf16: [u16; 4] = [0x0e20, 0x0e32, 0x0e29, 0x0e32];
1589
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1590
        assert_eq!(breaks, [0, 4], "Thai test");
1591
    }
1592
1593
    /// Break positions for a Burmese sample using the segmenter from
    /// `LineSegmenter::new_lstm()`; only compiled with the `lstm` feature.
    #[test]
1594
    #[cfg(feature = "lstm")]
1595
    fn burmese_line_break() {
1596
        // "Burmese Language" in Burmese
1597
        const TEST_STR: &str = "မြန်မာဘာသာစကား";
1598
1599
        let segmenter = LineSegmenter::new_lstm();
1600
        // UTF-8 input: expected positions are byte offsets into TEST_STR.
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1601
        // LSTM model breaks more characters, but it is better to return [30].
1602
        assert_eq!(breaks, [0, 12, 18, 30, TEST_STR.len()], "Burmese test");
1603
1604
        // UTF-16 input: the same boundaries expressed in code units (each byte
        // offset above divided by 3).
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1605
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1606
        // LSTM model breaks more characters, but it is better to return [10].
1607
        assert_eq!(breaks, [0, 4, 6, 10, utf16.len()], "Burmese utf-16 test");
1608
    }
1609
1610
    /// Break positions for a Khmer sample using the segmenter from
    /// `LineSegmenter::new_lstm()`; only compiled with the `lstm` feature.
    #[test]
1611
    #[cfg(feature = "lstm")]
1612
    fn khmer_line_break() {
1613
        const TEST_STR: &str = "សេចក្ដីប្រកាសជាសកលស្ដីពីសិទ្ធិមនុស្ស";
1614
1615
        let segmenter = LineSegmenter::new_lstm();
1616
        // UTF-8 input: expected positions are byte offsets into TEST_STR.
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1617
        // Note: This small sample matches the ICU dictionary segmenter
1618
        assert_eq!(breaks, [0, 39, 48, 54, 72, TEST_STR.len()], "Khmer test");
1619
1620
        // UTF-16 input: the same boundaries expressed in code units (each byte
        // offset above divided by 3).
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1621
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1622
        assert_eq!(
1623
            breaks,
1624
            [0, 13, 16, 18, 24, utf16.len()],
1625
            "Khmer utf-16 test"
1626
        );
1627
    }
1628
1629
    /// Break positions for a Lao sample using the segmenter from
    /// `LineSegmenter::new_lstm()`; only compiled with the `lstm` feature.
    #[test]
1630
    #[cfg(feature = "lstm")]
1631
    fn lao_line_break() {
1632
        const TEST_STR: &str = "ກ່ຽວກັບສິດຂອງມະນຸດ";
1633
1634
        let segmenter = LineSegmenter::new_lstm();
1635
        // UTF-8 input: expected positions are byte offsets into TEST_STR.
        let breaks: Vec<usize> = segmenter.segment_str(TEST_STR).collect();
1636
        // Note: LSTM finds a break at '12' that the dictionary does not find
1637
        assert_eq!(breaks, [0, 12, 21, 30, 39, TEST_STR.len()], "Lao test");
1638
1639
        // UTF-16 input: the same boundaries expressed in code units (each byte
        // offset above divided by 3).
        let utf16: Vec<u16> = TEST_STR.encode_utf16().collect();
1640
        let breaks: Vec<usize> = segmenter.segment_utf16(&utf16).collect();
1641
        assert_eq!(breaks, [0, 4, 7, 10, 13, utf16.len()], "Lao utf-16 test");
1642
    }
1643
1644
    /// Segmenting an empty string with the `new_auto` segmenter is expected to
    /// yield exactly one position, 0.
    #[test]
1645
    fn empty_string() {
1646
        let segmenter = LineSegmenter::new_auto();
1647
        let breaks: Vec<usize> = segmenter.segment_str("").collect();
1648
        // Not an empty list: the start position is still reported.
        assert_eq!(breaks, [0]);
1649
    }
1650
}