/rust/registry/src/index.crates.io-6f17d22bba15001f/lexicmp-0.1.0/src/iter.rs

Source (jump to first uncovered line)
//! Iterators to transliterate Unicode to ASCII. Note that only alphanumeric
//! characters are transliterated, and not all of them are supported.
//!
//! Characters can be transliterated to multiple ASCII characters. For example,
//! `æ` is converted to `ae`, and `½` is converted to `1/2`.
//!
//! The iterators don't allocate memory on the heap. I haven't benchmarked it,
//! but I believe that it's quite efficient.

use any_ascii::any_ascii_char;
use core::iter::FusedIterator;

/// An iterator over the output produced for one `char`: either the `char`
/// itself, or the bytes of its ASCII transliteration yielded as lowercase
/// ASCII characters.
///
/// This iterator can be created by calling `iterate_lexical_char()` or
/// `iterate_lexical_char_only_alnum()`
#[derive(Debug, Clone)]
pub struct LexicalChar(CharOrSlice);

impl LexicalChar {
    /// Creates an iterator that yields `c` exactly once, unchanged.
    #[inline]
    fn from_char(c: char) -> Self {
        LexicalChar(CharOrSlice::Char(c))
    }

    /// Creates an iterator over the bytes of `s`. Each byte is yielded as a
    /// `char` after ASCII lowercasing.
    #[inline]
    fn from_slice(s: &'static [u8]) -> Self {
        LexicalChar(CharOrSlice::Slice(s))
    }

    /// Creates an iterator that yields nothing.
    #[inline]
    fn empty() -> Self {
        LexicalChar(CharOrSlice::Slice(&[]))
    }

    /// Shared access to the wrapped state.
    #[inline]
    fn inner(&self) -> &CharOrSlice {
        &self.0
    }

    /// Mutable access to the wrapped state.
    #[inline]
    fn inner_mut(&mut self) -> &mut CharOrSlice {
        &mut self.0
    }

    /// Converts one byte of a slice into the `char` that is yielded for it.
    #[inline]
    fn lower(byte: u8) -> char {
        (byte as char).to_ascii_lowercase()
    }
}

/// The remaining output of a `LexicalChar`.
#[derive(Debug, Clone)]
enum CharOrSlice {
    /// A single `char` that has not been yielded yet.
    Char(char),
    /// The bytes that have not been yielded yet (possibly none).
    Slice(&'static [u8]),
}

impl Iterator for LexicalChar {
    type Item = char;

    /// Yields the pending `char`, or the first remaining byte of the slice.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                // A single char is exhausted after one step.
                *self = LexicalChar::empty();
                Some(c)
            }
            CharOrSlice::Slice(slice) => {
                let bytes: &'static [u8] = *slice;
                let (&first, rest) = bytes.split_first()?;
                *slice = rest;
                Some(Self::lower(first))
            }
        }
    }

    /// The number of remaining items is always known exactly.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        match self.inner() {
            CharOrSlice::Char(_) => (1, Some(1)),
            CharOrSlice::Slice(s) => (s.len(), Some(s.len())),
        }
    }

    /// Skips `n` items and yields the one after them.
    ///
    /// All skipped items and the returned item are consumed. If fewer than
    /// `n + 1` items remain, the iterator is exhausted and `None` is returned.
    #[inline]
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                // The single char is consumed whether it is returned or skipped.
                *self = LexicalChar::empty();
                if n == 0 {
                    Some(c)
                } else {
                    None
                }
            }
            CharOrSlice::Slice(slice) => {
                let bytes: &'static [u8] = *slice;
                match bytes.get(n) {
                    Some(&byte) => {
                        // `n < bytes.len()` here, so `n + 1` cannot overflow
                        // and is a valid start index.
                        *slice = &bytes[n + 1..];
                        Some(Self::lower(byte))
                    }
                    None => {
                        *slice = &[];
                        None
                    }
                }
            }
        }
    }
}

impl FusedIterator for LexicalChar {}

impl ExactSizeIterator for LexicalChar {}

impl DoubleEndedIterator for LexicalChar {
    /// Yields the pending `char`, or the last remaining byte of the slice.
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                *self = LexicalChar::empty();
                Some(c)
            }
            CharOrSlice::Slice(slice) => {
                // Read the last byte *before* shrinking the slice; indexing
                // the shrunk slice at the old last index would be out of bounds.
                let bytes: &'static [u8] = *slice;
                let (&last, rest) = bytes.split_last()?;
                *slice = rest;
                Some(Self::lower(last))
            }
        }
    }
}

/// Returns an iterator over the lexical form of a single `char`.
///
/// - ASCII characters are yielded once, converted to ASCII lowercase.
/// - Non-ASCII alphanumeric characters are replaced by the result of
///   `any_ascii_char`; if that result is empty, the character is yielded
///   unchanged.
/// - Combining diacritical marks (see `combining_diacritical`) yield nothing.
/// - Every other character is yielded unchanged.
#[inline]
pub fn iterate_lexical_char(c: char) -> LexicalChar {
    if c.is_ascii() {
        return LexicalChar::from_char(c.to_ascii_lowercase());
    }

    if c.is_alphanumeric() {
        let ascii = any_ascii_char(c);
        return if ascii.is_empty() {
            // No transliteration available: keep the original character.
            LexicalChar::from_char(c)
        } else {
            LexicalChar::from_slice(ascii.as_bytes())
        };
    }

    if combining_diacritical(&c) {
        LexicalChar::empty()
    } else {
        LexicalChar::from_char(c)
    }
}

/// Returns an iterator over the lexical form of a single `char`, skipping
/// everything that is not alphanumeric.
///
/// - ASCII letters and digits are yielded once, converted to ASCII lowercase.
/// - Non-ASCII alphanumeric characters are replaced by the result of
///   `any_ascii_char`; if that result is empty, the character is yielded
///   unchanged.
/// - All other characters yield nothing.
#[inline]
pub fn iterate_lexical_char_only_alnum(c: char) -> LexicalChar {
    if c.is_ascii_alphanumeric() {
        return LexicalChar::from_char(c.to_ascii_lowercase());
    }

    // Remaining ASCII characters and non-alphanumeric Unicode are dropped.
    if c.is_ascii() || !c.is_alphanumeric() {
        return LexicalChar::empty();
    }

    let ascii = any_ascii_char(c);
    if ascii.is_empty() {
        // No transliteration available: keep the original character.
        LexicalChar::from_char(c)
    } else {
        LexicalChar::from_slice(ascii.as_bytes())
    }
}

/// Returns `true` if `c` lies in the range `U+0300..=U+036F`
/// (combining diacritical marks), and `false` otherwise.
#[inline]
fn combining_diacritical(&c: &char) -> bool {
    matches!(c, '\u{300}'..='\u{36F}')
}

/// Returns an iterator over the lexical form of a string: every `char` of
/// `s` is passed through `iterate_lexical_char` and the results are
/// concatenated in order.
pub fn iterate_lexical(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    let chars = s.chars();
    chars.flat_map(iterate_lexical_char)
}

/// Returns an iterator over the lexical form of a string with
/// non-alphanumeric characters skipped: every `char` of `s` is passed through
/// `iterate_lexical_char_only_alnum` and the results are concatenated in order.
pub fn iterate_lexical_only_alnum(s: &'_ str) -> impl Iterator<Item = char> + '_ {
    let chars = s.chars();
    chars.flat_map(iterate_lexical_char_only_alnum)
}

/// Checks `iterate_lexical` against a table of `(input, expected)` pairs.
#[test]
fn test_iteration() {
    let cases: &[(&str, &str)] = &[
        ("Hello, world!", "hello, world!"),
        ("Ω A æ b ö ß é", "o a ae b o ss e"),
        ("3½/⅝ £ → € ®™", "31/2/5/8 £ → € ®™"),
        ("»@« 15% ¡¹!", "»@« 15% ¡1!"),
        ("🎉🦄☣", "🎉🦄☣"),
        ("北亰", "beijing"),
        ("ΣΣΣ", "sss"),
        // 'a' with combining diacritical mark '\u{300}'
        ("à", "a"),
    ];

    for &(input, expected) in cases {
        let actual: String = iterate_lexical(input).collect();
        assert_eq!(&actual, expected);
    }
}

/// Checks `iterate_lexical_only_alnum` against a table of
/// `(input, expected)` pairs.
#[test]
fn test_iteration_only_alnum() {
    let cases: &[(&str, &str)] = &[
        ("Hello, world!", "helloworld"),
        ("Ω A æ b ö ß é", "oaaebosse"),
        ("3½/⅝ £ → € ®™", "31/25/8"),
        ("»@« 15% ¡¹!", "151"),
        ("🎉🦄☣", ""),
        ("北亰", "beijing"),
        ("ΣΣΣ", "sss"),
        // 'a' with combining diacritical mark '\u{300}'
        ("à", "a"),
    ];

    for &(input, expected) in cases {
        let actual: String = iterate_lexical_only_alnum(input).collect();
        assert_eq!(&actual, expected);
    }
}

Coverage Report

Created: 2025-02-21 07:11

Line	Count	Source (jump to first uncovered line)
1		//! Iterators to transliterate Unicode to ASCII. Note that only alphanumeric
2		//! characters are transliterated, and not all of them are supported.
3		//!
4		//! Characters can be transliterated to multiple ASCII characters. For example,
5		//! `æ` is converted to `ae`, and `½` is converted to `1/2`.
6		//!
7		//! The iterators don't allocate memory on the heap. I haven't benchmarked it,
8		//! but I believe that it's quite efficient.
9
10		use any_ascii::any_ascii_char;
11		use core::iter::FusedIterator;
12
13		/// An iterator over one `char`, converted to lowercase
14		/// and transliterated to ASCII, if it is an alphanumeric character
15		///
16		/// This iterator can be created by calling `iterate_lexical_char()` or
17		/// `iterate_lexical_char_only_alnum()`
18		pub struct LexicalChar(CharOrSlice);
19
20		impl LexicalChar {
21		#[inline]
22	0	fn from_char(c: char) -> Self {
23	0	LexicalChar(CharOrSlice::Char(c))
24	0	}
25
26		#[inline]
27	0	fn from_slice(s: &'static [u8]) -> Self {
28	0	LexicalChar(CharOrSlice::Slice(s))
29	0	}
30
31		#[inline]
32	0	fn empty() -> Self {
33	0	LexicalChar(CharOrSlice::Slice(&[]))
34	0	}
35
36		#[inline]
37	0	fn inner(&self) -> &CharOrSlice {
38	0	&self.0
39	0	}
40
41		#[inline]
42	0	fn inner_mut(&mut self) -> &mut CharOrSlice {
43	0	&mut self.0
44	0	}
45		}
46
47		enum CharOrSlice {
48		Char(char),
49		Slice(&'static [u8]),
50		}
51
52		impl Iterator for LexicalChar {
53		type Item = char;
54
55		#[inline]
56	0	fn next(&mut self) -> Option<Self::Item> {
57	0	match self.inner_mut() {
58	0	&mut CharOrSlice::Char(c) => {
59	0	*self = LexicalChar::empty();
60	0	Some(c)
61		}
62	0	CharOrSlice::Slice(slice) => match slice.get(0_usize) {
63	0	Some(&next) => {
64	0	*slice = &slice[1..];
65	0	Some((next as char).to_ascii_lowercase())
66		}
67	0	None => None,
68		},
69		}
70	0	}
71
72		#[inline]
73	0	fn size_hint(&self) -> (usize, Option<usize>) {
74	0	match self.inner() {
75	0	CharOrSlice::Char(_) => (1, Some(1)),
76	0	CharOrSlice::Slice(s) => (s.len(), Some(s.len())),
77		}
78	0	}
79
80		#[inline]
81	0	fn nth(&mut self, n: usize) -> Option<Self::Item> {
82	0	if n == 0 {
83	0	self.next()
84	0	} else if let CharOrSlice::Slice(slice) = self.inner_mut() {
85	0	match slice.get(n) {
86	0	Some(&next) => {
87	0	*slice = &slice[1..];
88	0	Some((next as char).to_ascii_lowercase())
89		}
90	0	None => None,
91		}
92		} else {
93	0	None
94		}
95	0	}
96		}
97
98		impl FusedIterator for LexicalChar {}
99
100		impl ExactSizeIterator for LexicalChar {}
101
102		impl DoubleEndedIterator for LexicalChar {
103		#[inline]
104	0	fn next_back(&mut self) -> Option<Self::Item> {
105	0	match self.inner_mut() {
106	0	&mut CharOrSlice::Char(c) => {
107	0	*self = LexicalChar::empty();
108	0	Some(c)
109		}
110	0	CharOrSlice::Slice(slice) => {
111	0	if slice.len() > 0 {
112	0	let ix = slice.len() - 1;
113	0	*slice = &slice[..ix];
114	0	Some((slice[ix] as char).to_ascii_lowercase())
115		} else {
116	0	None
117		}
118		}
119		}
120	0	}
121		}
122
123		/// Returns an iterator over one `char`, converted to lowercase
124		/// and transliterated to ASCII, if it is alphanumeric
125		#[inline]
126	0	pub fn iterate_lexical_char(c: char) -> LexicalChar {
127	0	if c.is_ascii() {
128	0	LexicalChar::from_char(c.to_ascii_lowercase())
129	0	} else if c.is_alphanumeric() {
130	0	match any_ascii_char(c) {
131	0	s if s.is_empty() => LexicalChar::from_char(c),
132	0	s => LexicalChar::from_slice(s.as_bytes()),
133		}
134	0	} else if combining_diacritical(&c) {
135	0	LexicalChar::empty()
136		} else {
137	0	LexicalChar::from_char(c)
138		}
139	0	}
140
141		/// Returns an iterator over one `char`, converted to lowercase
142		/// and transliterated to ASCII, if it is alphanumeric
143		#[inline]
144	0	pub fn iterate_lexical_char_only_alnum(c: char) -> LexicalChar {
145	0	if c.is_ascii() {
146	0	if c.is_ascii_alphanumeric() {
147	0	LexicalChar::from_char(c.to_ascii_lowercase())
148		} else {
149	0	LexicalChar::empty()
150		}
151	0	} else if c.is_alphanumeric() {
152	0	match any_ascii_char(c) {
153	0	s if s.is_empty() => LexicalChar::from_char(c),
154	0	s => LexicalChar::from_slice(s.as_bytes()),
155		}
156		} else {
157	0	LexicalChar::empty()
158		}
159	0	}
160
161		/// returns `true` for combining diacritical marks
162		#[inline]
163	0	fn combining_diacritical(&c: &char) -> bool {
164	0	c >= '\u{300}' && c <= '\u{36F}'
165	0	}
166
167		/// Returns an iterator over the characters of a string, converted to lowercase
168		/// and transliterated to ASCII, if they're alphanumeric
169	0	pub fn iterate_lexical(s: &'_ str) -> impl Iterator<Item = char> + '_ {
170	0	s.chars().flat_map(iterate_lexical_char)
171	0	}
172
173		/// Returns an iterator over the characters of a string, converted to lowercase
174		/// and transliterated to ASCII. Non-alphanumeric characters are skipped
175	0	pub fn iterate_lexical_only_alnum(s: &'_ str) -> impl Iterator<Item = char> + '_ {
176	0	s.chars().flat_map(iterate_lexical_char_only_alnum)
177	0	}
178
179		#[test]
180		fn test_iteration() {
181		fn it(s: &'static str) -> String {
182		iterate_lexical(s).collect()
183		}
184
185		assert_eq!(&it("Hello, world!"), "hello, world!");
186		assert_eq!(&it("Ω A æ b ö ß é"), "o a ae b o ss e");
187		assert_eq!(&it("3½/⅝ £ → € ®™"), "31/2/5/8 £ → € ®™");
188		assert_eq!(&it("»@« 15% ¡¹!"), "»@« 15% ¡1!");
189		assert_eq!(&it("🎉🦄☣"), "🎉🦄☣");
190		assert_eq!(&it("北亰"), "beijing");
191		assert_eq!(&it("ΣΣΣ"), "sss");
192		assert_eq!(&it("à"), "a"); // 'a' with combining diacritical mark '\u{300}'
193		}
194
195		#[test]
196		fn test_iteration_only_alnum() {
197		fn it(s: &'static str) -> String {
198		iterate_lexical_only_alnum(s).collect()
199		}
200
201		assert_eq!(&it("Hello, world!"), "helloworld");
202		assert_eq!(&it("Ω A æ b ö ß é"), "oaaebosse");
203		assert_eq!(&it("3½/⅝ £ → € ®™"), "31/25/8");
204		assert_eq!(&it("»@« 15% ¡¹!"), "151");
205		assert_eq!(&it("🎉🦄☣"), "");
206		assert_eq!(&it("北亰"), "beijing");
207		assert_eq!(&it("ΣΣΣ"), "sss");
208		assert_eq!(&it("à"), "a"); // 'a' with combining diacritical mark '\u{300}'
209		}