Coverage Report

Created: 2025-02-21 07:11

/rust/registry/src/index.crates.io-6f17d22bba15001f/lexicmp-0.1.0/src/iter.rs
Line
Count
Source (jump to first uncovered line)
1
//! Iterators to transliterate Unicode to ASCII. Note that only alphanumeric
2
//! characters are transliterated, and not all of them are supported.
3
//!
4
//! Characters can be transliterated to multiple ASCII characters. For example,
5
//! `æ` is converted to `ae`, and `½` is converted to `1/2`.
6
//!
7
//! The iterators don't allocate memory on the heap. I haven't benchmarked it,
8
//! but I believe that it's quite efficient.
9
10
use any_ascii::any_ascii_char;
11
use core::iter::FusedIterator;
12
13
/// Iterator over the lexical form of a single `char`: lowercase and, for
/// alphanumeric characters, transliterated to ASCII.
///
/// Values of this type are produced by `iterate_lexical_char()` and
/// `iterate_lexical_char_only_alnum()`.
pub struct LexicalChar(CharOrSlice);

/// Internal state of a [`LexicalChar`].
enum CharOrSlice {
    /// A single pending `char`, handed out exactly as stored.
    Char(char),
    /// The bytes that are still pending; an empty slice means "exhausted".
    Slice(&'static [u8]),
}

impl LexicalChar {
    /// Builds an iterator that yields exactly the given `char`.
    #[inline]
    fn from_char(c: char) -> Self {
        Self(CharOrSlice::Char(c))
    }

    /// Builds an iterator over the bytes of a static slice.
    #[inline]
    fn from_slice(s: &'static [u8]) -> Self {
        Self(CharOrSlice::Slice(s))
    }

    /// Builds an iterator that yields nothing (an empty slice).
    #[inline]
    fn empty() -> Self {
        Self::from_slice(&[])
    }

    /// Shared access to the wrapped state.
    #[inline]
    fn inner(&self) -> &CharOrSlice {
        let Self(state) = self;
        state
    }

    /// Exclusive access to the wrapped state.
    #[inline]
    fn inner_mut(&mut self) -> &mut CharOrSlice {
        let Self(state) = self;
        state
    }
}
51
52
impl Iterator for LexicalChar {
53
    type Item = char;
54
55
    #[inline]
56
0
    fn next(&mut self) -> Option<Self::Item> {
57
0
        match self.inner_mut() {
58
0
            &mut CharOrSlice::Char(c) => {
59
0
                *self = LexicalChar::empty();
60
0
                Some(c)
61
            }
62
0
            CharOrSlice::Slice(slice) => match slice.get(0_usize) {
63
0
                Some(&next) => {
64
0
                    *slice = &slice[1..];
65
0
                    Some((next as char).to_ascii_lowercase())
66
                }
67
0
                None => None,
68
            },
69
        }
70
0
    }
71
72
    #[inline]
73
0
    fn size_hint(&self) -> (usize, Option<usize>) {
74
0
        match self.inner() {
75
0
            CharOrSlice::Char(_) => (1, Some(1)),
76
0
            CharOrSlice::Slice(s) => (s.len(), Some(s.len())),
77
        }
78
0
    }
79
80
    #[inline]
81
0
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
82
0
        if n == 0 {
83
0
            self.next()
84
0
        } else if let CharOrSlice::Slice(slice) = self.inner_mut() {
85
0
            match slice.get(n) {
86
0
                Some(&next) => {
87
0
                    *slice = &slice[1..];
88
0
                    Some((next as char).to_ascii_lowercase())
89
                }
90
0
                None => None,
91
            }
92
        } else {
93
0
            None
94
        }
95
0
    }
96
}
97
98
// Fused: once the remaining slice is empty, `next()` returns `None` on every later call.
impl FusedIterator for LexicalChar {}
99
100
// Exact size: `size_hint()` returns equal lower and upper bounds
// (1 for a pending char, the remaining slice length otherwise).
impl ExactSizeIterator for LexicalChar {}
101
102
impl DoubleEndedIterator for LexicalChar {
103
    #[inline]
104
0
    fn next_back(&mut self) -> Option<Self::Item> {
105
0
        match self.inner_mut() {
106
0
            &mut CharOrSlice::Char(c) => {
107
0
                *self = LexicalChar::empty();
108
0
                Some(c)
109
            }
110
0
            CharOrSlice::Slice(slice) => {
111
0
                if slice.len() > 0 {
112
0
                    let ix = slice.len() - 1;
113
0
                    *slice = &slice[..ix];
114
0
                    Some((slice[ix] as char).to_ascii_lowercase())
115
                } else {
116
0
                    None
117
                }
118
            }
119
        }
120
0
    }
121
}
122
123
/// Returns an iterator over one `char`, converted to lowercase
124
/// and transliterated to ASCII, if it is alphanumeric
125
#[inline]
126
0
pub fn iterate_lexical_char(c: char) -> LexicalChar {
127
0
    if c.is_ascii() {
128
0
        LexicalChar::from_char(c.to_ascii_lowercase())
129
0
    } else if c.is_alphanumeric() {
130
0
        match any_ascii_char(c) {
131
0
            s if s.is_empty() => LexicalChar::from_char(c),
132
0
            s => LexicalChar::from_slice(s.as_bytes()),
133
        }
134
0
    } else if combining_diacritical(&c) {
135
0
        LexicalChar::empty()
136
    } else {
137
0
        LexicalChar::from_char(c)
138
    }
139
0
}
140
141
/// Returns an iterator over one `char`, converted to lowercase
142
/// and transliterated to ASCII, if it is alphanumeric
143
#[inline]
144
0
pub fn iterate_lexical_char_only_alnum(c: char) -> LexicalChar {
145
0
    if c.is_ascii() {
146
0
        if c.is_ascii_alphanumeric() {
147
0
            LexicalChar::from_char(c.to_ascii_lowercase())
148
        } else {
149
0
            LexicalChar::empty()
150
        }
151
0
    } else if c.is_alphanumeric() {
152
0
        match any_ascii_char(c) {
153
0
            s if s.is_empty() => LexicalChar::from_char(c),
154
0
            s => LexicalChar::from_slice(s.as_bytes()),
155
        }
156
    } else {
157
0
        LexicalChar::empty()
158
    }
159
0
}
160
161
/// Tells whether `c` lies in the range `U+0300..=U+036F`, which this module
/// treats as the combining diacritical marks.
#[inline]
fn combining_diacritical(&c: &char) -> bool {
    matches!(c, '\u{300}'..='\u{36F}')
}
166
167
/// Returns an iterator over the characters of a string, converted to lowercase
168
/// and transliterated to ASCII, if they're alphanumeric
169
0
pub fn iterate_lexical(s: &'_ str) -> impl Iterator<Item = char> + '_ {
170
0
    s.chars().flat_map(iterate_lexical_char)
171
0
}
172
173
/// Returns an iterator over the characters of a string, converted to lowercase
174
/// and transliterated to ASCII. Non-alphanumeric characters are skipped
175
0
pub fn iterate_lexical_only_alnum(s: &'_ str) -> impl Iterator<Item = char> + '_ {
176
0
    s.chars().flat_map(iterate_lexical_char_only_alnum)
177
0
}
178
179
#[test]
fn test_iteration() {
    // (input, expected output of `iterate_lexical`)
    const CASES: &[(&str, &str)] = &[
        ("Hello, world!", "hello, world!"),
        ("Ω A æ b ö ß é", "o a ae b o ss e"),
        ("3½/⅝ £ → € ®™", "31/2/5/8 £ → € ®™"),
        ("»@« 15% ¡¹!", "»@« 15% ¡1!"),
        ("🎉🦄☣", "🎉🦄☣"),
        ("北亰", "beijing"),
        ("ΣΣΣ", "sss"),
        // 'a' with combining diacritical mark '\u{300}'
        ("a\u{300}", "a"),
    ];

    for &(input, expected) in CASES {
        let actual: String = iterate_lexical(input).collect();
        assert_eq!(&actual, expected);
    }
}
194
195
#[test]
fn test_iteration_only_alnum() {
    // (input, expected output of `iterate_lexical_only_alnum`)
    const CASES: &[(&str, &str)] = &[
        ("Hello, world!", "helloworld"),
        ("Ω A æ b ö ß é", "oaaebosse"),
        ("3½/⅝ £ → € ®™", "31/25/8"),
        ("»@« 15% ¡¹!", "151"),
        ("🎉🦄☣", ""),
        ("北亰", "beijing"),
        ("ΣΣΣ", "sss"),
        // 'a' with combining diacritical mark '\u{300}'
        ("a\u{300}", "a"),
    ];

    for &(input, expected) in CASES {
        let actual: String = iterate_lexical_only_alnum(input).collect();
        assert_eq!(&actual, expected);
    }
}