/rust/registry/src/index.crates.io-6f17d22bba15001f/icu_segmenter-1.5.0/src/indices.rs

Source (jump to first uncovered line)
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

/// An index-tracking iterator over a Latin-1 string stored as `[u8]`,
/// modelled on [`core::str::CharIndices`].
///
/// Unlike [`core::str::CharIndices`], each [`Iterator::Item`] pairs the
/// offset with a raw [`u8`] rather than a [`char`]; the byte stands for the
/// Unicode scalar value of the same number (U+0000–U+00FF).
#[derive(Clone, Debug)]
pub struct Latin1Indices<'a> {
    /// Offset of the next byte to be yielded.
    front_offset: usize,
    /// The complete input; it is never re-sliced, only indexed.
    iter: &'a [u8],
}

impl<'a> Latin1Indices<'a> {
    /// Creates an iterator positioned at the first byte of `input`.
    pub fn new(input: &'a [u8]) -> Self {
        Self {
            iter: input,
            front_offset: 0,
        }
    }
}

impl<'a> Iterator for Latin1Indices<'a> {
    type Item = (usize, u8);

    /// Yields `(offset, byte)` for the current position and advances by one,
    /// or returns `None` (leaving the position untouched) once the input is
    /// exhausted.
    #[inline]
    fn next(&mut self) -> Option<(usize, u8)> {
        let index = self.front_offset;
        let byte = *self.iter.get(index)?;
        self.front_offset = index + 1;
        Some((index, byte))
    }
}

/// An index-tracking iterator over a UTF-16 string stored as `[u16]`,
/// modelled on [`core::str::CharIndices`].
///
/// Unlike [`core::str::CharIndices`], each [`Iterator::Item`] carries a
/// Unicode code point as a [`u32`] instead of a scalar value as a [`char`]:
/// unpaired surrogates are passed through unchanged, and those cannot be
/// represented by [`char`].
#[derive(Clone, Debug)]
pub struct Utf16Indices<'a> {
    /// Offset (in code units) of the next unit to be read.
    front_offset: usize,
    /// The complete input; it is never re-sliced, only indexed.
    iter: &'a [u16],
}

impl<'a> Utf16Indices<'a> {
    /// Creates an iterator positioned at the first code unit of `input`.
    pub fn new(input: &'a [u16]) -> Self {
        Self {
            iter: input,
            front_offset: 0,
        }
    }
}

impl<'a> Iterator for Utf16Indices<'a> {
    type Item = (usize, u32);

    /// Yields `(offset, code_point)` where `offset` is the position of the
    /// first code unit consumed.
    ///
    /// A high surrogate immediately followed by a low surrogate is decoded
    /// into one supplementary code point and consumes two units. Every other
    /// unit, including a surrogate without a partner, is returned as-is and
    /// consumes one unit. Returns `None` at the end of the input.
    #[inline]
    fn next(&mut self) -> Option<(usize, u32)> {
        let index = self.front_offset;
        let lead = u32::from(*self.iter.get(index)?);
        self.front_offset = index + 1;

        // Only a high (leading) surrogate can start a pair; everything else
        // is already a complete code point.
        if !matches!(lead, 0xd800..=0xdbff) {
            return Some((index, lead));
        }

        let code_point = match self.iter.get(self.front_offset).copied().map(u32::from) {
            Some(trail @ 0xdc00..=0xdfff) => {
                // Well-formed pair: swallow the trailing unit and merge the
                // two 10-bit payloads into a supplementary code point.
                self.front_offset += 1;
                0x10000 + ((lead - 0xd800) << 10) + (trail - 0xdc00)
            }
            // Unpaired high surrogate (end of input, or next unit is not a
            // low surrogate): hand it back untouched.
            _ => lead,
        };
        Some((index, code_point))
    }
}

#[cfg(test)]
mod tests {
    use crate::indices::*;

    /// Every Latin-1 byte is reported with its own offset, then the iterator ends.
    #[test]
    fn latin1_indices() {
        let latin1 = [0x30, 0x31, 0x32];
        let expected = [(0, 0x30), (1, 0x31), (2, 0x32)];

        let mut indices = Latin1Indices::new(&latin1);
        for &(offset, byte) in &expected {
            let n = indices.next().unwrap();
            assert_eq!(n.0, offset);
            assert_eq!(n.1, byte);
        }
        assert_eq!(indices.next(), None);
    }

    /// Surrogate pairs are merged into one code point reported at the offset
    /// of the leading unit; an unpaired surrogate is passed through as-is.
    #[test]
    fn utf16_indices() {
        let utf16 = [0xd83d, 0xde03, 0x0020, 0xd83c, 0xdf00, 0xd800, 0x0020];
        let expected = [
            (0, 0x1f603),
            (2, 0x20),
            (3, 0x1f300),
            // 0xd800 is followed by a space, so it is an unpaired high surrogate.
            (5, 0xd800),
            (6, 0x0020),
        ];

        let mut indices = Utf16Indices::new(&utf16);
        for &(offset, code_point) in &expected {
            let n = indices.next().unwrap();
            assert_eq!(n.0, offset);
            assert_eq!(n.1, code_point);
        }
        assert_eq!(indices.next(), None);
    }
}

Coverage Report

Created: 2025-08-12 06:35

Line	Count	Source (jump to first uncovered line)
1		// This file is part of ICU4X. For terms of use, please see the file
2		// called LICENSE at the top level of the ICU4X source tree
3		// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5		/// Similar to [`core::str::CharIndices`] for Latin-1 strings, represented as `[u8]`.
6		///
7		/// Contrary to [`core::str::CharIndices`], the second element of the
8		/// [`Iterator::Item`] is a [`u8`], representing a Unicode scalar value in the
9		/// range U+0000–U+00FF.
10		#[derive(Clone, Debug)]
11		pub struct Latin1Indices<'a> {
12		front_offset: usize,
13		iter: &'a [u8],
14		}
15
16		impl<'a> Latin1Indices<'a> {
17	0	pub fn new(input: &'a [u8]) -> Self {
18	0	Self {
19	0	front_offset: 0,
20	0	iter: input,
21	0	}
22	0	}
23		}
24
25		impl<'a> Iterator for Latin1Indices<'a> {
26		type Item = (usize, u8);
27
28		#[inline]
29	0	fn next(&mut self) -> Option<(usize, u8)> {
30	0	self.iter.get(self.front_offset).map(\|ch\| {
31	0	self.front_offset += 1;
32	0	(self.front_offset - 1, *ch)
33	0	}) Unexecuted instantiation: <icu_segmenter::indices::Latin1Indices as core::iter::traits::iterator::Iterator>::next::{closure#0} Unexecuted instantiation: <icu_segmenter::indices::Latin1Indices as core::iter::traits::iterator::Iterator>::next::{closure#0}
34	0	} Unexecuted instantiation: <icu_segmenter::indices::Latin1Indices as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <icu_segmenter::indices::Latin1Indices as core::iter::traits::iterator::Iterator>::next
35		}
36
37		/// Similar to [`core::str::CharIndices`] for UTF-16 strings, represented as `[u16]`.
38		///
39		/// Contrary to [`core::str::CharIndices`], the second element of the
40		/// [`Iterator::Item`] is a Unicode code point represented by a [`u32`],
41		/// rather than a Unicode scalar value represented by a [`char`], because this
42		/// iterator preserves unpaired surrogates.
43		#[derive(Clone, Debug)]
44		pub struct Utf16Indices<'a> {
45		front_offset: usize,
46		iter: &'a [u16],
47		}
48
49		impl<'a> Utf16Indices<'a> {
50	0	pub fn new(input: &'a [u16]) -> Self {
51	0	Self {
52	0	front_offset: 0,
53	0	iter: input,
54	0	}
55	0	}
56		}
57
58		impl<'a> Iterator for Utf16Indices<'a> {
59		type Item = (usize, u32);
60
61		#[inline]
62	0	fn next(&mut self) -> Option<(usize, u32)> {
63	0	let (index, ch) = self.iter.get(self.front_offset).map(\|ch\| {
64	0	self.front_offset += 1;
65	0	(self.front_offset - 1, *ch)
66	0	})?;
67
68	0	let mut ch = ch as u32;
69	0	if (ch & 0xfc00) != 0xd800 {
70	0	return Some((index, ch));
71	0	}
72
73	0	if let Some(next) = self.iter.get(self.front_offset) {
74	0	let next = *next as u32;
75	0	if (next & 0xfc00) == 0xdc00 {
76	0	// Combine low and high surrogates to UTF-32 code point.
77	0	ch = ((ch & 0x3ff) << 10) + (next & 0x3ff) + 0x10000;
78	0	self.front_offset += 1;
79	0	}
80	0	}
81	0	Some((index, ch))
82	0	}
83		}
84
85		#[cfg(test)]
86		mod tests {
87		use crate::indices::*;
88
89		#[test]
90		fn latin1_indices() {
91		let latin1 = [0x30, 0x31, 0x32];
92		let mut indices = Latin1Indices::new(&latin1);
93		let n = indices.next().unwrap();
94		assert_eq!(n.0, 0);
95		assert_eq!(n.1, 0x30);
96		let n = indices.next().unwrap();
97		assert_eq!(n.0, 1);
98		assert_eq!(n.1, 0x31);
99		let n = indices.next().unwrap();
100		assert_eq!(n.0, 2);
101		assert_eq!(n.1, 0x32);
102		let n = indices.next();
103		assert_eq!(n, None);
104		}
105
106		#[test]
107		fn utf16_indices() {
108		let utf16 = [0xd83d, 0xde03, 0x0020, 0xd83c, 0xdf00, 0xd800, 0x0020];
109		let mut indices = Utf16Indices::new(&utf16);
110		let n = indices.next().unwrap();
111		assert_eq!(n.0, 0);
112		assert_eq!(n.1, 0x1f603);
113		let n = indices.next().unwrap();
114		assert_eq!(n.0, 2);
115		assert_eq!(n.1, 0x20);
116		let n = indices.next().unwrap();
117		assert_eq!(n.0, 3);
118		assert_eq!(n.1, 0x1f300);
119		// This is invalid surrogate pair.
120		let n = indices.next().unwrap();
121		assert_eq!(n.0, 5);
122		assert_eq!(n.1, 0xd800);
123		let n = indices.next().unwrap();
124		assert_eq!(n.0, 6);
125		assert_eq!(n.1, 0x0020);
126		let n = indices.next();
127		assert_eq!(n, None);
128		}
129		}