/rust/registry/src/index.crates.io-1949cf8c6b5b557f/icu_segmenter-1.5.0/src/sentence.rs
Line | Count | Source |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
5 | | use alloc::vec::Vec; |
6 | | use icu_provider::prelude::*; |
7 | | |
8 | | use crate::indices::{Latin1Indices, Utf16Indices}; |
9 | | use crate::iterator_helpers::derive_usize_iterator_with_type; |
10 | | use crate::rule_segmenter::*; |
11 | | use crate::{provider::*, SegmenterError}; |
12 | | use utf8_iter::Utf8CharIndices; |
13 | | |
14 | | /// Implements the [`Iterator`] trait over the sentence boundaries of the given string. |
15 | | /// |
16 | | /// Lifetimes: |
17 | | /// |
18 | | /// - `'l` = lifetime of the segmenter object from which this iterator was created |
19 | | /// - `'s` = lifetime of the string being segmented |
20 | | /// |
21 | | /// The [`Iterator::Item`] is a [`usize`] representing the index of a code unit |
22 | | /// _after_ the boundary (for a boundary at the end of text, this index is the length |
23 | | /// of the [`str`] or array of code units). |
24 | | /// |
25 | | /// For examples of use, see [`SentenceSegmenter`]. |
26 | | #[derive(Debug)] |
27 | | pub struct SentenceBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized>( |
28 | | RuleBreakIterator<'l, 's, Y>, |
29 | | ); |
30 | | |
31 | | derive_usize_iterator_with_type!(SentenceBreakIterator); |
32 | | |
33 | | /// Sentence break iterator for an `str` (a UTF-8 string). |
34 | | /// |
35 | | /// For examples of use, see [`SentenceSegmenter`]. |
36 | | pub type SentenceBreakIteratorUtf8<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf8>; |
37 | | |
38 | | /// Sentence break iterator for a potentially invalid UTF-8 string. |
39 | | /// |
40 | | /// For examples of use, see [`SentenceSegmenter`]. |
41 | | pub type SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> = |
42 | | SentenceBreakIterator<'l, 's, RuleBreakTypePotentiallyIllFormedUtf8>; |
43 | | |
44 | | /// Sentence break iterator for a Latin-1 (8-bit) string. |
45 | | /// |
46 | | /// For examples of use, see [`SentenceSegmenter`]. |
47 | | pub type SentenceBreakIteratorLatin1<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeLatin1>; |
48 | | |
49 | | /// Sentence break iterator for a UTF-16 string. |
50 | | /// |
51 | | /// For examples of use, see [`SentenceSegmenter`]. |
52 | | pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, RuleBreakTypeUtf16>; |
53 | | |
54 | | /// Supports loading sentence break data, and creating sentence break iterators for different string |
55 | | /// encodings. |
56 | | /// |
57 | | /// # Examples |
58 | | /// |
59 | | /// Segment a string: |
60 | | /// |
61 | | /// ```rust |
62 | | /// use icu::segmenter::SentenceSegmenter; |
63 | | /// let segmenter = SentenceSegmenter::new(); |
64 | | /// |
65 | | /// let breakpoints: Vec<usize> = |
66 | | /// segmenter.segment_str("Hello World").collect(); |
67 | | /// assert_eq!(&breakpoints, &[0, 11]); |
68 | | /// ``` |
69 | | /// |
70 | | /// Segment a Latin-1 byte string: |
71 | | /// |
72 | | /// ```rust |
73 | | /// use icu::segmenter::SentenceSegmenter; |
74 | | /// let segmenter = SentenceSegmenter::new(); |
75 | | /// |
76 | | /// let breakpoints: Vec<usize> = |
77 | | /// segmenter.segment_latin1(b"Hello World").collect(); |
78 | | /// assert_eq!(&breakpoints, &[0, 11]); |
79 | | /// ``` |
80 | | /// |
81 | | /// Successive boundaries can be used to retrieve the sentences. |
82 | | /// In particular, the first boundary is always 0, and the last one is the |
83 | | /// length of the segmented text in code units. |
84 | | /// |
85 | | /// ```rust |
86 | | /// # use icu::segmenter::SentenceSegmenter; |
87 | | /// # let segmenter = SentenceSegmenter::new(); |
88 | | /// use itertools::Itertools; |
89 | | /// let text = "Ceci tuera cela. Le livre tuera l’édifice."; |
90 | | /// let sentences: Vec<&str> = segmenter |
91 | | /// .segment_str(text) |
92 | | /// .tuple_windows() |
93 | | /// .map(|(i, j)| &text[i..j]) |
94 | | /// .collect(); |
95 | | /// assert_eq!( |
96 | | /// &sentences, |
97 | | /// &["Ceci tuera cela. ", "Le livre tuera l’édifice."] |
98 | | /// ); |
99 | | /// ``` |
100 | | #[derive(Debug)] |
101 | | pub struct SentenceSegmenter { |
102 | | payload: DataPayload<SentenceBreakDataV1Marker>, |
103 | | } |
104 | | |
105 | | #[cfg(feature = "compiled_data")] |
106 | | impl Default for SentenceSegmenter { |
107 | 0 | fn default() -> Self { |
108 | 0 | Self::new() |
109 | 0 | } |
110 | | } |
111 | | |
112 | | impl SentenceSegmenter { |
113 | | /// Constructs a [`SentenceSegmenter`] with an invariant locale and compiled data. |
114 | | /// |
115 | | /// ✨ *Enabled with the `compiled_data` Cargo feature.* |
116 | | /// |
117 | | /// [📚 Help choosing a constructor](icu_provider::constructors) |
118 | | #[cfg(feature = "compiled_data")] |
119 | 0 | pub fn new() -> Self { |
120 | 0 | Self { |
121 | 0 | payload: DataPayload::from_static_ref( |
122 | 0 | crate::provider::Baked::SINGLETON_SEGMENTER_SENTENCE_V1, |
123 | 0 | ), |
124 | 0 | } |
125 | 0 | } |
126 | | |
127 | | icu_provider::gen_any_buffer_data_constructors!(locale: skip, options: skip, error: SegmenterError, |
128 | | #[cfg(skip)] |
129 | | functions: [ |
130 | | new, |
131 | | try_new_with_any_provider, |
132 | | try_new_with_buffer_provider, |
133 | | try_new_unstable, |
134 | | Self, |
135 | | ] |
136 | | ); |
137 | | |
138 | | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] |
139 | 0 | pub fn try_new_unstable<D>(provider: &D) -> Result<Self, SegmenterError> |
140 | 0 | where |
141 | 0 | D: DataProvider<SentenceBreakDataV1Marker> + ?Sized, |
142 | | { |
143 | 0 | let payload = provider.load(Default::default())?.take_payload()?; |
144 | 0 | Ok(Self { payload }) |
145 | 0 | } Unexecuted instantiation: <icu_segmenter::sentence::SentenceSegmenter>::try_new_unstable::<icu_provider::any::DowncastingAnyProvider<icu_provider_adapters::empty::EmptyDataProvider>> Unexecuted instantiation: <icu_segmenter::sentence::SentenceSegmenter>::try_new_unstable::<_> |
146 | | |
147 | | /// Creates a sentence break iterator for an `str` (a UTF-8 string). |
148 | | /// |
149 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
150 | 0 | pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> { |
151 | 0 | SentenceBreakIterator(RuleBreakIterator { |
152 | 0 | iter: input.char_indices(), |
153 | 0 | len: input.len(), |
154 | 0 | current_pos_data: None, |
155 | 0 | result_cache: Vec::new(), |
156 | 0 | data: self.payload.get(), |
157 | 0 | complex: None, |
158 | 0 | boundary_property: 0, |
159 | 0 | }) |
160 | 0 | } |
161 | | /// Creates a sentence break iterator for a potentially ill-formed UTF-8 string. |
162 | | /// |
163 | | /// Invalid characters are treated as U+FFFD REPLACEMENT CHARACTER. |
164 | | /// |
165 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
166 | 0 | pub fn segment_utf8<'l, 's>( |
167 | 0 | &'l self, |
168 | 0 | input: &'s [u8], |
169 | 0 | ) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> { |
170 | 0 | SentenceBreakIterator(RuleBreakIterator { |
171 | 0 | iter: Utf8CharIndices::new(input), |
172 | 0 | len: input.len(), |
173 | 0 | current_pos_data: None, |
174 | 0 | result_cache: Vec::new(), |
175 | 0 | data: self.payload.get(), |
176 | 0 | complex: None, |
177 | 0 | boundary_property: 0, |
178 | 0 | }) |
179 | 0 | } |
180 | | /// Creates a sentence break iterator for a Latin-1 (8-bit) string. |
181 | | /// |
182 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
183 | 0 | pub fn segment_latin1<'l, 's>( |
184 | 0 | &'l self, |
185 | 0 | input: &'s [u8], |
186 | 0 | ) -> SentenceBreakIteratorLatin1<'l, 's> { |
187 | 0 | SentenceBreakIterator(RuleBreakIterator { |
188 | 0 | iter: Latin1Indices::new(input), |
189 | 0 | len: input.len(), |
190 | 0 | current_pos_data: None, |
191 | 0 | result_cache: Vec::new(), |
192 | 0 | data: self.payload.get(), |
193 | 0 | complex: None, |
194 | 0 | boundary_property: 0, |
195 | 0 | }) |
196 | 0 | } |
197 | | |
198 | | /// Creates a sentence break iterator for a UTF-16 string. |
199 | | /// |
200 | | /// There are always breakpoints at 0 and the string length, or only at 0 for the empty string. |
201 | 0 | pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> { |
202 | 0 | SentenceBreakIterator(RuleBreakIterator { |
203 | 0 | iter: Utf16Indices::new(input), |
204 | 0 | len: input.len(), |
205 | 0 | current_pos_data: None, |
206 | 0 | result_cache: Vec::new(), |
207 | 0 | data: self.payload.get(), |
208 | 0 | complex: None, |
209 | 0 | boundary_property: 0, |
210 | 0 | }) |
211 | 0 | } |
212 | | } |
213 | | |
214 | | #[cfg(all(test, feature = "serde"))] |
215 | | #[test] |
216 | | fn empty_string() { |
217 | | let segmenter = SentenceSegmenter::new(); |
218 | | let breaks: Vec<usize> = segmenter.segment_str("").collect(); |
219 | | assert_eq!(breaks, [0]); |
220 | | } |