/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_segmenter-1.5.0/src/indices.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // This file is part of ICU4X. For terms of use, please see the file |
2 | | // called LICENSE at the top level of the ICU4X source tree |
3 | | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | | |
/// Similar to [`core::str::CharIndices`] for Latin-1 strings, represented as `[u8]`.
///
/// Contrary to [`core::str::CharIndices`], the second element of the
/// [`Iterator::Item`] is a [`u8`], representing a Unicode scalar value in the
/// range U+0000–U+00FF.
#[derive(Clone, Debug)]
pub struct Latin1Indices<'a> {
    /// Position in `iter` of the next byte to yield.
    front_offset: usize,
    /// The complete input. It is never re-sliced, so every yielded index is
    /// relative to the start of the slice passed to [`Latin1Indices::new`].
    iter: &'a [u8],
}

impl<'a> Latin1Indices<'a> {
    /// Creates an iterator positioned at the first byte of `input`.
    pub fn new(input: &'a [u8]) -> Self {
        Self {
            front_offset: 0,
            iter: input,
        }
    }
}

impl<'a> Iterator for Latin1Indices<'a> {
    type Item = (usize, u8);

    /// Yields `(index, byte)` for the next byte, or `None` once the input is
    /// exhausted. The offset only advances when a byte is actually produced.
    #[inline]
    fn next(&mut self) -> Option<(usize, u8)> {
        let index = self.front_offset;
        let ch = *self.iter.get(index)?;
        self.front_offset = index + 1;
        Some((index, ch))
    }

    /// Reports the exact number of remaining items: every byte produces
    /// exactly one item, so both bounds equal the count of unread bytes.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `front_offset` never passes `len()`; saturate anyway so a broken
        // invariant cannot turn into an arithmetic panic here.
        let remaining = self.iter.len().saturating_sub(self.front_offset);
        (remaining, Some(remaining))
    }
}
36 | | |
/// Similar to [`core::str::CharIndices`] for UTF-16 strings, represented as `[u16]`.
///
/// Contrary to [`core::str::CharIndices`], the second element of the
/// [`Iterator::Item`] is a Unicode code point represented by a [`u32`],
/// rather than a Unicode scalar value represented by a [`char`], because this
/// iterator preserves unpaired surrogates.
#[derive(Clone, Debug)]
pub struct Utf16Indices<'a> {
    /// Position in `iter` of the next code unit to read.
    front_offset: usize,
    /// The complete input. It is never re-sliced, so every yielded index is
    /// a code-unit offset from the start of the slice passed to
    /// [`Utf16Indices::new`].
    iter: &'a [u16],
}

impl<'a> Utf16Indices<'a> {
    /// Creates an iterator positioned at the first code unit of `input`.
    pub fn new(input: &'a [u16]) -> Self {
        Self {
            front_offset: 0,
            iter: input,
        }
    }
}

impl<'a> Iterator for Utf16Indices<'a> {
    type Item = (usize, u32);

    /// Yields `(index, code_point)` where `index` is the offset of the first
    /// code unit of the code point.
    ///
    /// A high surrogate immediately followed by a low surrogate is decoded
    /// into one supplementary code point and consumes both units. Any other
    /// unit, including an unpaired surrogate, is yielded unchanged.
    #[inline]
    fn next(&mut self) -> Option<(usize, u32)> {
        let index = self.front_offset;
        let lead = u32::from(*self.iter.get(index)?);
        self.front_offset = index + 1;

        // Only a high (leading) surrogate, 0xD800..=0xDBFF, can start a pair.
        if (lead & 0xfc00) != 0xd800 {
            return Some((index, lead));
        }

        if let Some(&trail) = self.iter.get(self.front_offset) {
            let trail = u32::from(trail);
            if (trail & 0xfc00) == 0xdc00 {
                // Combine the high and low surrogates into one code point:
                // ten payload bits from each, offset by 0x10000.
                self.front_offset += 1;
                return Some((index, ((lead & 0x3ff) << 10) + (trail & 0x3ff) + 0x10000));
            }
        }

        // Unpaired high surrogate: pass it through and leave the following
        // unit (if any) for the next call.
        Some((index, lead))
    }

    /// Bounds the number of remaining items: each item consumes one or two
    /// code units, so there are between `ceil(remaining / 2)` and `remaining`
    /// items left.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `front_offset` never passes `len()`; saturate anyway so a broken
        // invariant cannot turn into an arithmetic panic here.
        let remaining = self.iter.len().saturating_sub(self.front_offset);
        (remaining / 2 + remaining % 2, Some(remaining))
    }
}
84 | | |
#[cfg(test)]
mod tests {
    use crate::indices::*;

    /// Each Latin-1 byte is reported with its own offset, then the iterator ends.
    #[test]
    fn latin1_indices() {
        let latin1 = [0x30, 0x31, 0x32];
        let mut indices = Latin1Indices::new(&latin1);

        assert_eq!(indices.next(), Some((0, 0x30)));
        assert_eq!(indices.next(), Some((1, 0x31)));
        assert_eq!(indices.next(), Some((2, 0x32)));
        assert_eq!(indices.next(), None);
    }

    /// Well-formed surrogate pairs are merged into one code point reported at
    /// the offset of the high surrogate; an unpaired surrogate is passed through.
    #[test]
    fn utf16_indices() {
        let utf16 = [0xd83d, 0xde03, 0x0020, 0xd83c, 0xdf00, 0xd800, 0x0020];
        let mut indices = Utf16Indices::new(&utf16);

        assert_eq!(indices.next(), Some((0, 0x1f603)));
        assert_eq!(indices.next(), Some((2, 0x20)));
        assert_eq!(indices.next(), Some((3, 0x1f300)));
        // 0xd800 is a high surrogate with no low surrogate after it, so it is
        // not a valid pair and must come out unchanged.
        assert_eq!(indices.next(), Some((5, 0xd800)));
        assert_eq!(indices.next(), Some((6, 0x0020)));
        assert_eq!(indices.next(), None);
    }
}