/src/unicode-segmentation/src/sentence.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution and at |
3 | | // http://rust-lang.org/COPYRIGHT. |
4 | | // |
5 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | | // option. This file may not be copied, modified, or distributed |
9 | | // except according to those terms. |
10 | | |
11 | | use core::cmp; |
12 | | use core::iter::Filter; |
13 | | |
14 | | // All of the logic for forward iteration over sentences |
15 | | mod fwd { |
16 | | use crate::tables::sentence::SentenceCat; |
17 | | use core::cmp; |
18 | | |
// Describe a parsed part of source string as described in this table:
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
//
// Each value summarises one already-consumed character (or a collapsed run of
// characters) in the history window kept by `SentenceBreaksState`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StatePart {
    // Start of text: fills every slot of `INITIAL_STATE`.
    Sot,
    // End of text: pushed by `SentenceBreaksState::end`.
    Eot,
    // Any category that `SentenceBreaksState::next` does not map explicitly.
    Other,
    // Recorded for `SC_CR`.
    CR,
    // Recorded for `SC_LF`.
    LF,
    // Recorded for `SC_Sep`.
    Sep,
    // Recorded for `SC_ATerm`.
    ATerm,
    // Recorded for either `SC_Upper` or `SC_Lower`.
    UpperLower,
    // One or more consecutive `SC_Close` characters (runs are collapsed).
    ClosePlus,
    // One or more consecutive `SC_Sp` characters (runs are collapsed).
    SpPlus,
    // Recorded for `SC_STerm`.
    STerm,
}
35 | | |
// History window of the four most recently recorded `StatePart`s.
// Index 3 holds the newest part; `next` and `end` shift older parts towards
// index 0 and drop the oldest one.
#[derive(Debug, Clone, PartialEq, Eq)]
struct SentenceBreaksState(pub [StatePart; 4]);

// State used before any character has been consumed: every slot is
// start-of-text.
const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
    StatePart::Sot,
    StatePart::Sot,
    StatePart::Sot,
    StatePart::Sot,
]);
45 | | |
// Iterator over the byte offsets of the sentence breaks in `string`.
#[derive(Debug, Clone)]
pub struct SentenceBreaks<'a> {
    // The text being segmented.
    pub string: &'a str,
    // Byte offset into `string` of the next character to examine.
    pos: usize,
    // Summary of the characters consumed so far.
    state: SentenceBreaksState,
}
52 | | |
53 | | impl SentenceBreaksState { |
54 | | // Attempt to advance the internal state by one part |
55 | | // Whitespace and some punctutation will be collapsed |
56 | | fn next(&self, cat: SentenceCat) -> SentenceBreaksState { |
57 | | let &SentenceBreaksState(parts) = self; |
58 | | let parts = match (parts[3], cat) { |
59 | | (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, |
60 | | (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, |
61 | | _ => [ |
62 | | parts[1], |
63 | | parts[2], |
64 | | parts[3], |
65 | | match cat { |
66 | | SentenceCat::SC_CR => StatePart::CR, |
67 | | SentenceCat::SC_LF => StatePart::LF, |
68 | | SentenceCat::SC_Sep => StatePart::Sep, |
69 | | SentenceCat::SC_ATerm => StatePart::ATerm, |
70 | | SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower, |
71 | | SentenceCat::SC_Close => StatePart::ClosePlus, |
72 | | SentenceCat::SC_Sp => StatePart::SpPlus, |
73 | | SentenceCat::SC_STerm => StatePart::STerm, |
74 | | _ => StatePart::Other, |
75 | | }, |
76 | | ], |
77 | | }; |
78 | | SentenceBreaksState(parts) |
79 | | } |
80 | | |
81 | | fn end(&self) -> SentenceBreaksState { |
82 | | let &SentenceBreaksState(parts) = self; |
83 | | SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot]) |
84 | | } |
85 | | |
86 | | // Helper function to check if state head matches a single `StatePart` |
87 | | fn match1(&self, part: StatePart) -> bool { |
88 | | let &SentenceBreaksState(parts) = self; |
89 | | part == parts[3] |
90 | | } |
91 | | |
92 | | // Helper function to check if first two `StateParts` in state match |
93 | | // the given two |
94 | | fn match2(&self, part1: StatePart, part2: StatePart) -> bool { |
95 | | let &SentenceBreaksState(parts) = self; |
96 | | part1 == parts[2] && part2 == parts[3] |
97 | | } |
98 | | } |
99 | | |
100 | | // https://unicode.org/reports/tr29/#SB8 |
101 | | // TODO cache this, it is currently quadratic |
102 | | fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { |
103 | | let &SentenceBreaksState(parts) = state; |
104 | | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
105 | | if parts[idx] == StatePart::ClosePlus { |
106 | | idx -= 1 |
107 | | } |
108 | | |
109 | | if parts[idx] == StatePart::ATerm { |
110 | | use crate::tables::sentence as se; |
111 | | |
112 | | for next_char in ahead.chars() { |
113 | | //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower |
114 | | match se::sentence_category(next_char).2 { |
115 | | se::SC_Lower => return true, |
116 | | se::SC_OLetter |
117 | | | se::SC_Upper |
118 | | | se::SC_Sep |
119 | | | se::SC_CR |
120 | | | se::SC_LF |
121 | | | se::SC_STerm |
122 | | | se::SC_ATerm => return false, |
123 | | _ => continue, |
124 | | } |
125 | | } |
126 | | } |
127 | | |
128 | | false |
129 | | } |
130 | | |
131 | | // https://unicode.org/reports/tr29/#SB8a |
132 | | fn match_sb8a(state: &SentenceBreaksState) -> bool { |
133 | | // SATerm Close* Sp* |
134 | | let &SentenceBreaksState(parts) = state; |
135 | | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
136 | | if parts[idx] == StatePart::ClosePlus { |
137 | | idx -= 1 |
138 | | } |
139 | | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
140 | | } |
141 | | |
142 | | // https://unicode.org/reports/tr29/#SB9 |
143 | | fn match_sb9(state: &SentenceBreaksState) -> bool { |
144 | | // SATerm Close* |
145 | | let &SentenceBreaksState(parts) = state; |
146 | | let idx = if parts[3] == StatePart::ClosePlus { |
147 | | 2 |
148 | | } else { |
149 | | 3 |
150 | | }; |
151 | | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
152 | | } |
153 | | |
154 | | // https://unicode.org/reports/tr29/#SB11 |
155 | | fn match_sb11(state: &SentenceBreaksState) -> bool { |
156 | | // SATerm Close* Sp* ParaSep? |
157 | | let &SentenceBreaksState(parts) = state; |
158 | | let mut idx = match parts[3] { |
159 | | StatePart::Sep | StatePart::CR | StatePart::LF => 2, |
160 | | _ => 3, |
161 | | }; |
162 | | |
163 | | if parts[idx] == StatePart::SpPlus { |
164 | | idx -= 1 |
165 | | } |
166 | | if parts[idx] == StatePart::ClosePlus { |
167 | | idx -= 1 |
168 | | } |
169 | | |
170 | | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
171 | | } |
172 | | |
// Walks `string` one character at a time and yields the byte offset of each
// sentence break, applying the UAX #29 sentence rules in order. For
// non-empty input the first offset yielded is 0 and the last one is
// `string.len()`.
impl<'a> Iterator for SentenceBreaks<'a> {
    // Returns the index of the character which follows a break
    type Item = usize;

    // NOTE: the hint is computed from the full string length only; `pos` and
    // `state` are not consulted, so the values do not shrink as iteration
    // proceeds.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let slen = self.string.len();
        // A sentence could be one character
        (cmp::min(slen, 2), Some(slen + 1))
    }

    #[inline]
    fn next(&mut self) -> Option<usize> {
        use crate::tables::sentence as se;

        for next_char in self.string[self.pos..].chars() {
            // Snapshot taken before the character is consumed: the rules
            // below test what came *before* `next_char`, and a break is
            // reported at the offset where `next_char` starts.
            let position_before = self.pos;
            let state_before = self.state.clone();

            let next_cat = se::sentence_category(next_char).2;

            // Consume the character up front; every arm below either
            // returns, continues, or (SB5) rolls the state back.
            self.pos += next_char.len_utf8();
            self.state = self.state.next(next_cat);

            // The arms are tried top to bottom, so their order encodes the
            // rule precedence.
            match next_cat {
                // SB1 https://unicode.org/reports/tr29/#SB1
                _ if state_before.match1(StatePart::Sot) => return Some(position_before),

                // SB2 is handled when inner iterator (chars) is finished

                // SB3 https://unicode.org/reports/tr29/#SB3
                SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,

                // SB4 https://unicode.org/reports/tr29/#SB4
                _ if state_before.match1(StatePart::Sep)
                    || state_before.match1(StatePart::CR)
                    || state_before.match1(StatePart::LF) =>
                {
                    return Some(position_before)
                }

                // SB5 https://unicode.org/reports/tr29/#SB5
                // Restore the previous state so the character leaves no trace
                // in the history window; `pos` stays advanced.
                SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,

                // SB6 https://unicode.org/reports/tr29/#SB6
                SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,

                // SB7 https://unicode.org/reports/tr29/#SB7
                SentenceCat::SC_Upper
                    if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
                {
                    continue
                }

                // SB8 https://unicode.org/reports/tr29/#SB8
                // The look-ahead slice starts at `next_char` itself.
                _ if match_sb8(&state_before, &self.string[position_before..]) => continue,

                // SB8a https://unicode.org/reports/tr29/#SB8a
                SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
                    if match_sb8a(&state_before) =>
                {
                    continue
                }

                // SB9 https://unicode.org/reports/tr29/#SB9
                SentenceCat::SC_Close
                | SentenceCat::SC_Sp
                | SentenceCat::SC_Sep
                | SentenceCat::SC_CR
                | SentenceCat::SC_LF
                    if match_sb9(&state_before) =>
                {
                    continue
                }

                // SB10 https://unicode.org/reports/tr29/#SB10
                // Shares its left-hand side (`SATerm Close* Sp*`) with SB8a.
                SentenceCat::SC_Sp
                | SentenceCat::SC_Sep
                | SentenceCat::SC_CR
                | SentenceCat::SC_LF
                    if match_sb8a(&state_before) =>
                {
                    continue
                }

                // SB11 https://unicode.org/reports/tr29/#SB11
                _ if match_sb11(&state_before) => return Some(position_before),

                // SB998 https://unicode.org/reports/tr29/#SB998
                _ => continue,
            }
        }

        // SB2 https://unicode.org/reports/tr29/#SB2
        // All characters are consumed. A state still at start-of-text means
        // nothing was consumed; end-of-text means the final break was
        // already reported. Otherwise report it once and mark the state.
        if self.state.match1(StatePart::Sot) || self.state.match1(StatePart::Eot) {
            None
        } else {
            self.state = self.state.end();
            Some(self.pos)
        }
    }
}
275 | | |
276 | | pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> { |
277 | | SentenceBreaks { |
278 | | string: source, |
279 | | pos: 0, |
280 | | state: INITIAL_STATE, |
281 | | } |
282 | | } |
283 | | } |
284 | | |
/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct UnicodeSentences<'a> {
    // Sentence-bound iterator filtered through a plain function pointer, so
    // the full iterator type can be named in this field.
    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}
301 | | |
/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct USentenceBounds<'a> {
    // Underlying iterator over break offsets; also gives access to the text.
    iter: fwd::SentenceBreaks<'a>,
    // Byte offset at which the next sentence starts. `None` until the first
    // break has been pulled from `iter`.
    sentence_start: Option<usize>,
}
315 | | |
/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Debug, Clone)]
pub struct USentenceBoundIndices<'a> {
    // Address of the first byte of the source string, stored as an integer.
    // Subtracted from each yielded slice's address to obtain its byte offset.
    start_offset: usize,
    // Iterator producing the sentence slices themselves.
    iter: USentenceBounds<'a>,
}
328 | | |
329 | | #[inline] |
330 | 0 | pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> { |
331 | 0 | USentenceBounds { |
332 | 0 | iter: fwd::new_sentence_breaks(source), |
333 | 0 | sentence_start: None, |
334 | 0 | } |
335 | 0 | } |
336 | | |
337 | | #[inline] |
338 | 0 | pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> { |
339 | 0 | USentenceBoundIndices { |
340 | 0 | start_offset: source.as_ptr() as usize, |
341 | 0 | iter: new_sentence_bounds(source), |
342 | 0 | } |
343 | 0 | } |
344 | | |
345 | | #[inline] |
346 | 0 | pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> { |
347 | | use super::UnicodeSegmentation; |
348 | | use crate::tables::util::is_alphanumeric; |
349 | | |
350 | | fn has_alphanumeric(s: &&str) -> bool { |
351 | | s.chars().any(is_alphanumeric) |
352 | | } |
353 | 0 | let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer |
354 | 0 |
|
355 | 0 | UnicodeSentences { |
356 | 0 | inner: s.split_sentence_bounds().filter(has_alphanumeric), |
357 | 0 | } |
358 | 0 | } |
359 | | |
// Pure delegation to the wrapped, filtered sentence-bound iterator.
impl<'a> Iterator for UnicodeSentences<'a> {
    type Item = &'a str;

    // Yield the next item produced by the wrapped iterator.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        self.inner.next()
    }

    // Forward the wrapped iterator's size hint unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
373 | | |
374 | | impl<'a> Iterator for USentenceBounds<'a> { |
375 | | type Item = &'a str; |
376 | | |
377 | | #[inline] |
378 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
379 | 0 | let (lower, upper) = self.iter.size_hint(); |
380 | 0 | (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) |
381 | 0 | } |
382 | | |
383 | | #[inline] |
384 | 0 | fn next(&mut self) -> Option<&'a str> { |
385 | 0 | if self.sentence_start.is_none() { |
386 | 0 | if let Some(start_pos) = self.iter.next() { |
387 | 0 | self.sentence_start = Some(start_pos) |
388 | | } else { |
389 | 0 | return None; |
390 | | } |
391 | 0 | } |
392 | | |
393 | 0 | if let Some(break_pos) = self.iter.next() { |
394 | 0 | let start_pos = self.sentence_start.unwrap(); |
395 | 0 | let sentence = &self.iter.string[start_pos..break_pos]; |
396 | 0 | self.sentence_start = Some(break_pos); |
397 | 0 | Some(sentence) |
398 | | } else { |
399 | 0 | None |
400 | | } |
401 | 0 | } |
402 | | } |
403 | | |
404 | | impl<'a> Iterator for USentenceBoundIndices<'a> { |
405 | | type Item = (usize, &'a str); |
406 | | |
407 | | #[inline] |
408 | 0 | fn next(&mut self) -> Option<(usize, &'a str)> { |
409 | 0 | self.iter |
410 | 0 | .next() |
411 | 0 | .map(|s| (s.as_ptr() as usize - self.start_offset, s)) |
412 | 0 | } |
413 | | |
414 | | #[inline] |
415 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
416 | 0 | self.iter.size_hint() |
417 | 0 | } |
418 | | } |