/rust/registry/src/index.crates.io-1949cf8c6b5b557f/bstr-1.12.0/src/unicode/sentence.rs
Line | Count | Source |
1 | | use regex_automata::{dfa::Automaton, Anchored, Input}; |
2 | | |
3 | | use crate::{ |
4 | | ext_slice::ByteSlice, |
5 | | unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8, |
6 | | }; |
7 | | |
8 | | /// An iterator over sentences in a byte string. |
9 | | /// |
10 | | /// This iterator is typically constructed by |
11 | | /// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences). |
12 | | /// |
13 | | /// Sentences typically include their trailing punctuation and whitespace. |
14 | | /// |
15 | | /// Since sentences are made up of one or more codepoints, this iterator yields |
16 | | /// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints |
17 | | /// are [substituted](index.html#handling-of-invalid-utf-8). |
18 | | /// |
19 | | /// This iterator yields sentences in accordance with the default sentence boundary |
20 | | /// rules specified in |
21 | | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). |
22 | | #[derive(Clone, Debug)] |
23 | | pub struct Sentences<'a> { |
24 | | bs: &'a [u8], |
25 | | } |
26 | | |
27 | | impl<'a> Sentences<'a> { |
28 | 0 | pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> { |
29 | 0 | Sentences { bs } |
30 | 0 | } |
31 | | |
32 | | /// View the remaining, not-yet-yielded data as a subslice of the original data. |
33 | | /// |
34 | | /// The slice returned has the same lifetime as the original slice, and so |
35 | | /// the iterator can continue to be used while this exists. |
36 | | /// |
37 | | /// # Examples |
38 | | /// |
39 | | /// ``` |
40 | | /// use bstr::ByteSlice; |
41 | | /// |
42 | | /// let mut it = b"I want this. Not that. Right now.".sentences(); |
43 | | /// |
44 | | /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); |
45 | | /// it.next(); |
46 | | /// assert_eq!(b"Not that. Right now.", it.as_bytes()); |
47 | | /// it.next(); |
48 | | /// it.next(); |
49 | | /// assert_eq!(b"", it.as_bytes()); |
50 | | /// ``` |
51 | | #[inline] |
52 | 0 | pub fn as_bytes(&self) -> &'a [u8] { |
53 | 0 | self.bs |
54 | 0 | } |
55 | | } |
56 | | |
57 | | impl<'a> Iterator for Sentences<'a> { |
58 | | type Item = &'a str; |
59 | | |
60 | | #[inline] |
61 | 0 | fn next(&mut self) -> Option<&'a str> { |
62 | 0 | let (sentence, size) = decode_sentence(self.bs); |
63 | 0 | if size == 0 { |
64 | 0 | return None; |
65 | 0 | } |
66 | 0 | self.bs = &self.bs[size..]; |
67 | 0 | Some(sentence) |
68 | 0 | } |
69 | | } |
70 | | |
71 | | /// An iterator over sentences in a byte string, along with their byte offsets. |
72 | | /// |
73 | | /// This iterator is typically constructed by |
74 | | /// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices). |
75 | | /// |
76 | | /// Sentences typically include their trailing punctuation and whitespace. |
77 | | /// |
78 | | /// Since sentences are made up of one or more codepoints, this iterator |
79 | | /// yields `&str` elements (along with their start and end byte offsets). |
80 | | /// When invalid UTF-8 is encountered, replacement codepoints are |
81 | | /// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the |
82 | | /// indices yielded by this iterator may not correspond to the length of the |
83 | | /// sentence yielded with those indices. For example, when this iterator |
84 | | /// encounters `\xFF` in the byte string, then it will yield a pair of indices |
85 | | /// ranging over a single byte, but will provide an `&str` equivalent to |
86 | | /// `"\u{FFFD}"`, which is three bytes in length. However, when given only |
87 | | /// valid UTF-8, then all indices are in exact correspondence with their paired |
88 | | /// sentence. |
89 | | /// |
90 | | /// This iterator yields sentences in accordance with the default sentence boundary |
91 | | /// rules specified in |
92 | | /// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries). |
93 | | #[derive(Clone, Debug)] |
94 | | pub struct SentenceIndices<'a> { |
95 | | bs: &'a [u8], |
96 | | forward_index: usize, |
97 | | } |
98 | | |
99 | | impl<'a> SentenceIndices<'a> { |
100 | 0 | pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { |
101 | 0 | SentenceIndices { bs, forward_index: 0 } |
102 | 0 | } |
103 | | |
104 | | /// View the remaining, not-yet-yielded data as a subslice of the original data. |
105 | | /// |
106 | | /// The slice returned has the same lifetime as the original slice, and so |
107 | | /// the iterator can continue to be used while this exists. |
108 | | /// |
109 | | /// # Examples |
110 | | /// |
111 | | /// ``` |
112 | | /// use bstr::ByteSlice; |
113 | | /// |
114 | | /// let mut it = b"I want this. Not that. Right now.".sentence_indices(); |
115 | | /// |
116 | | /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes()); |
117 | | /// it.next(); |
118 | | /// assert_eq!(b"Not that. Right now.", it.as_bytes()); |
119 | | /// it.next(); |
120 | | /// it.next(); |
121 | | /// assert_eq!(b"", it.as_bytes()); |
122 | | /// ``` |
123 | | #[inline] |
124 | 0 | pub fn as_bytes(&self) -> &'a [u8] { |
125 | 0 | self.bs |
126 | 0 | } |
127 | | } |
128 | | |
129 | | impl<'a> Iterator for SentenceIndices<'a> { |
130 | | type Item = (usize, usize, &'a str); |
131 | | |
132 | | #[inline] |
133 | 0 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { |
134 | 0 | let index = self.forward_index; |
135 | 0 | let (word, size) = decode_sentence(self.bs); |
136 | 0 | if size == 0 { |
137 | 0 | return None; |
138 | 0 | } |
139 | 0 | self.bs = &self.bs[size..]; |
140 | 0 | self.forward_index += size; |
141 | 0 | Some((index, index + size, word)) |
142 | 0 | } |
143 | | } |
144 | | |
145 | 0 | fn decode_sentence(bs: &[u8]) -> (&str, usize) { |
146 | 0 | if bs.is_empty() { |
147 | 0 | ("", 0) |
148 | 0 | } else if let Some(hm) = { |
149 | 0 | let input = Input::new(bs).anchored(Anchored::Yes); |
150 | 0 | SENTENCE_BREAK_FWD.try_search_fwd(&input).unwrap() |
151 | 0 | } { |
152 | | // SAFETY: a match can only occur for valid UTF-8, so `bs[..hm.offset()]` is valid UTF-8. |
153 | 0 | let sentence = unsafe { bs[..hm.offset()].to_str_unchecked() }; |
154 | 0 | (sentence, sentence.len()) |
155 | | } else { |
156 | | const INVALID: &str = "\u{FFFD}"; |
157 | | // No match on non-empty bytes implies we found invalid UTF-8. |
158 | 0 | let (_, size) = utf8::decode_lossy(bs); |
159 | 0 | (INVALID, size) |
160 | | } |
161 | 0 | } |
162 | | |
163 | | #[cfg(all(test, feature = "std"))] |
164 | | mod tests { |
165 | | use alloc::{vec, vec::Vec}; |
166 | | |
167 | | #[cfg(not(miri))] |
168 | | use ucd_parse::SentenceBreakTest; |
169 | | |
170 | | use crate::ext_slice::ByteSlice; |
171 | | |
172 | | #[test] |
173 | | #[cfg(not(miri))] |
174 | | fn forward_ucd() { |
175 | | for (i, test) in ucdtests().into_iter().enumerate() { |
176 | | let given = test.sentences.concat(); |
177 | | let got = sentences(given.as_bytes()); |
178 | | assert_eq!( |
179 | | test.sentences, |
180 | | got, |
181 | | "\n\nsentence forward break test {} failed:\n\ |
182 | | given: {:?}\n\ |
183 | | expected: {:?}\n\ |
184 | | got: {:?}\n", |
185 | | i, |
186 | | given, |
187 | | strs_to_bstrs(&test.sentences), |
188 | | strs_to_bstrs(&got), |
189 | | ); |
190 | | } |
191 | | } |
192 | | |
193 | | // Some additional tests that don't seem to be covered by the UCD tests. |
194 | | #[test] |
195 | | fn forward_additional() { |
196 | | assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A")); |
197 | | assert_eq!(vec!["a.. a"], sentences(b"a.. a")); |
198 | | |
199 | | assert_eq!(vec!["a... ", "A"], sentences(b"a... A")); |
200 | | assert_eq!(vec!["a... a"], sentences(b"a... a")); |
201 | | |
202 | | assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a")); |
203 | | } |
204 | | |
205 | | fn sentences(bytes: &[u8]) -> Vec<&str> { |
206 | | bytes.sentences().collect() |
207 | | } |
208 | | |
209 | | #[cfg(not(miri))] |
210 | | fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> { |
211 | | strs.iter().map(|s| s.as_ref().as_bytes()).collect() |
212 | | } |
213 | | |
214 | | /// Return all UCD sentence break tests, skipping `#` comment lines and lines containing "surrogate". |
215 | | #[cfg(not(miri))] |
216 | | fn ucdtests() -> Vec<SentenceBreakTest> { |
217 | | const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt"); |
218 | | |
219 | | let mut tests = vec![]; |
220 | | for mut line in TESTDATA.lines() { |
221 | | line = line.trim(); |
222 | | if line.starts_with("#") || line.contains("surrogate") { |
223 | | continue; |
224 | | } |
225 | | tests.push(line.parse().unwrap()); |
226 | | } |
227 | | tests |
228 | | } |
229 | | } |