/rust/registry/src/index.crates.io-6f17d22bba15001f/lexicmp-0.1.0/src/iter.rs
Line | Count | Source (jump to first uncovered line) |
1 | | //! Iterators to transliterate Unicode to ASCII. Note that only alphanumeric |
2 | | //! characters are transliterated, and not all of them are supported. |
3 | | //! |
4 | | //! Characters can be transliterated to multiple ASCII characters. For example, |
5 | | //! `æ` is converted to `ae`, and `½` is converted to `1/2`. |
6 | | //! |
7 | | //! The iterators don't allocate memory on the heap. I haven't benchmarked it, |
8 | | //! but I believe that it's quite efficient. |
9 | | |
10 | | use any_ascii::any_ascii_char; |
11 | | use core::iter::FusedIterator; |
12 | | |
/// An iterator over one `char`, converted to lowercase
/// and transliterated to ASCII, if it is an alphanumeric character.
///
/// A single input character can expand to zero, one or several output
/// characters, so the iterator holds either one `char` or a static byte
/// slice containing the ASCII transliteration.
///
/// This iterator can be created by calling `iterate_lexical_char()` or
/// `iterate_lexical_char_only_alnum()`.
#[derive(Debug, Clone)]
pub struct LexicalChar(CharOrSlice);

impl LexicalChar {
    /// Creates an iterator that yields exactly `c`, unmodified.
    #[inline]
    fn from_char(c: char) -> Self {
        LexicalChar(CharOrSlice::Char(c))
    }

    /// Creates an iterator over the bytes of `s`; each byte is yielded as an
    /// ASCII-lowercased `char`.
    #[inline]
    fn from_slice(s: &'static [u8]) -> Self {
        LexicalChar(CharOrSlice::Slice(s))
    }

    /// Creates an iterator that yields nothing.
    #[inline]
    fn empty() -> Self {
        LexicalChar(CharOrSlice::Slice(&[]))
    }

    /// Shared access to the wrapped state.
    #[inline]
    fn inner(&self) -> &CharOrSlice {
        &self.0
    }

    /// Exclusive access to the wrapped state.
    #[inline]
    fn inner_mut(&mut self) -> &mut CharOrSlice {
        &mut self.0
    }
}

/// Remaining items of a [`LexicalChar`].
///
/// An exhausted iterator is represented by `Slice(&[])`.
#[derive(Debug, Clone)]
enum CharOrSlice {
    /// A single character that has not been yielded yet.
    Char(char),
    /// Bytes that have not been yielded yet.
    Slice(&'static [u8]),
}

impl Iterator for LexicalChar {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                *self = LexicalChar::empty();
                Some(c)
            }
            CharOrSlice::Slice(slice) => {
                // Copy the `'static` slice out so the remainder can be stored back.
                let bytes: &'static [u8] = *slice;
                let (&first, rest) = bytes.split_first()?;
                *slice = rest;
                Some((first as char).to_ascii_lowercase())
            }
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self.inner() {
            CharOrSlice::Char(_) => (1, Some(1)),
            CharOrSlice::Slice(s) => (s.len(), Some(s.len())),
        }
    }

    /// Skips `n` items and returns the one after them.
    ///
    /// As required by `Iterator::nth`, the skipped items *and* the returned
    /// item are consumed; if fewer than `n + 1` items remain, the iterator is
    /// exhausted and `None` is returned.
    #[inline]
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
        if n == 0 {
            return self.next();
        }
        match self.inner_mut() {
            &mut CharOrSlice::Char(_) => {
                // Skipping past the only item exhausts the iterator.
                *self = LexicalChar::empty();
                None
            }
            CharOrSlice::Slice(slice) => {
                let bytes: &'static [u8] = *slice;
                match bytes.get(n) {
                    Some(&byte) => {
                        // `n < bytes.len()`, so `n + 1` cannot overflow.
                        *slice = &bytes[n + 1..];
                        Some((byte as char).to_ascii_lowercase())
                    }
                    None => {
                        *slice = &[];
                        None
                    }
                }
            }
        }
    }
}

impl FusedIterator for LexicalChar {}

impl ExactSizeIterator for LexicalChar {}

impl DoubleEndedIterator for LexicalChar {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                *self = LexicalChar::empty();
                Some(c)
            }
            CharOrSlice::Slice(slice) => {
                // Read the last byte *before* shrinking the slice; indexing
                // the shortened slice at the old last position would panic.
                let bytes: &'static [u8] = *slice;
                let (&last, rest) = bytes.split_last()?;
                *slice = rest;
                Some((last as char).to_ascii_lowercase())
            }
        }
    }
}
122 | | |
123 | | /// Returns an iterator over one `char`, converted to lowercase |
124 | | /// and transliterated to ASCII, if it is alphanumeric |
125 | | #[inline] |
126 | 0 | pub fn iterate_lexical_char(c: char) -> LexicalChar { |
127 | 0 | if c.is_ascii() { |
128 | 0 | LexicalChar::from_char(c.to_ascii_lowercase()) |
129 | 0 | } else if c.is_alphanumeric() { |
130 | 0 | match any_ascii_char(c) { |
131 | 0 | s if s.is_empty() => LexicalChar::from_char(c), |
132 | 0 | s => LexicalChar::from_slice(s.as_bytes()), |
133 | | } |
134 | 0 | } else if combining_diacritical(&c) { |
135 | 0 | LexicalChar::empty() |
136 | | } else { |
137 | 0 | LexicalChar::from_char(c) |
138 | | } |
139 | 0 | } |
140 | | |
141 | | /// Returns an iterator over one `char`, converted to lowercase |
142 | | /// and transliterated to ASCII, if it is alphanumeric |
143 | | #[inline] |
144 | 0 | pub fn iterate_lexical_char_only_alnum(c: char) -> LexicalChar { |
145 | 0 | if c.is_ascii() { |
146 | 0 | if c.is_ascii_alphanumeric() { |
147 | 0 | LexicalChar::from_char(c.to_ascii_lowercase()) |
148 | | } else { |
149 | 0 | LexicalChar::empty() |
150 | | } |
151 | 0 | } else if c.is_alphanumeric() { |
152 | 0 | match any_ascii_char(c) { |
153 | 0 | s if s.is_empty() => LexicalChar::from_char(c), |
154 | 0 | s => LexicalChar::from_slice(s.as_bytes()), |
155 | | } |
156 | | } else { |
157 | 0 | LexicalChar::empty() |
158 | | } |
159 | 0 | } |
160 | | |
/// Returns `true` if `c` lies in the range `U+0300..=U+036F`
/// (combining diacritical marks).
#[inline]
fn combining_diacritical(&c: &char) -> bool {
    matches!(c, '\u{300}'..='\u{36F}')
}
166 | | |
167 | | /// Returns an iterator over the characters of a string, converted to lowercase |
168 | | /// and transliterated to ASCII, if they're alphanumeric |
169 | 0 | pub fn iterate_lexical(s: &'_ str) -> impl Iterator<Item = char> + '_ { |
170 | 0 | s.chars().flat_map(iterate_lexical_char) |
171 | 0 | } |
172 | | |
173 | | /// Returns an iterator over the characters of a string, converted to lowercase |
174 | | /// and transliterated to ASCII. Non-alphanumeric characters are skipped |
175 | 0 | pub fn iterate_lexical_only_alnum(s: &'_ str) -> impl Iterator<Item = char> + '_ { |
176 | 0 | s.chars().flat_map(iterate_lexical_char_only_alnum) |
177 | 0 | } |
178 | | |
#[test]
fn test_iteration() {
    // (input, expected output of `iterate_lexical`)
    const CASES: &[(&str, &str)] = &[
        ("Hello, world!", "hello, world!"),
        ("Ω A æ b ö ß é", "o a ae b o ss e"),
        ("3½/⅝ £ → € ®™", "31/2/5/8 £ → € ®™"),
        ("»@« 15% ¡¹!", "»@« 15% ¡1!"),
        ("🎉🦄☣", "🎉🦄☣"),
        ("北亰", "beijing"),
        ("ΣΣΣ", "sss"),
        // 'a' with combining diacritical mark '\u{300}'
        ("à", "a"),
    ];

    for &(input, expected) in CASES {
        let actual: String = iterate_lexical(input).collect();
        assert_eq!(&actual, expected);
    }
}
194 | | |
#[test]
fn test_iteration_only_alnum() {
    // (input, expected output of `iterate_lexical_only_alnum`)
    const CASES: &[(&str, &str)] = &[
        ("Hello, world!", "helloworld"),
        ("Ω A æ b ö ß é", "oaaebosse"),
        ("3½/⅝ £ → € ®™", "31/25/8"),
        ("»@« 15% ¡¹!", "151"),
        ("🎉🦄☣", ""),
        ("北亰", "beijing"),
        ("ΣΣΣ", "sss"),
        // 'a' with combining diacritical mark '\u{300}'
        ("à", "a"),
    ];

    for &(input, expected) in CASES {
        let actual: String = iterate_lexical_only_alnum(input).collect();
        assert_eq!(&actual, expected);
    }
}