/rust/registry/src/index.crates.io-6f17d22bba15001f/ucd-trie-0.1.6/src/owned.rs
use std::borrow::Borrow;
use std::collections::HashMap;
use std::error;
use std::fmt;
use std::io;
use std::result;

use super::{TrieSetSlice, CHUNK_SIZE};

// This implementation was pretty much cribbed from raphlinus' contribution
// to the standard library: https://github.com/rust-lang/rust/pull/33098/files
//
// The fundamental principle guiding this implementation is to take advantage
// of the fact that similar Unicode codepoints are often grouped together, and
// that most boolean Unicode properties are quite sparse over the entire space
// of Unicode codepoints.
//
// To do this, we represent sets using something like a trie (which gives us
// prefix compression). The "final" states of the trie are embedded in leaves
// or "chunks," where each chunk is a 64 bit integer. Each bit position of the
// integer corresponds to whether a particular codepoint is in the set or not.
// These chunks are not just a compact representation of the final states of
// the trie, but are also a form of suffix compression. In particular, if
// multiple ranges of 64 contiguous codepoints have the same pattern of set
// membership, then they all map to the exact same chunk in the trie.
//
// We organize this structure by partitioning the space of Unicode codepoints
// into three disjoint sets. The first set corresponds to codepoints
// [0, 0x800), the second [0x800, 0x10000) and the third [0x10000, 0x110000).
// These partitions conveniently correspond to the space of 1 or 2 byte UTF-8
// encoded codepoints, 3 byte UTF-8 encoded codepoints and 4 byte UTF-8 encoded
// codepoints, respectively.
//
// Each partition has its own tree with its own root. The first partition is
// the simplest, since the tree is completely flat. In particular, to determine
// the set membership of a Unicode codepoint (that is less than `0x800`), we
// do the following (where `cp` is the codepoint we're testing):
//
//   let chunk_address = cp >> 6;
//   let chunk_bit = cp & 0b111111;
//   let chunk = tree1[chunk_address];
//   let is_member = 1 == ((chunk >> chunk_bit) & 1);
//
// We do something similar for the second partition:
//
//   // we subtract 0x20 since (0x800 >> 6) == 0x20.
//   let child_address = (cp >> 6) - 0x20;
//   let chunk_address = tree2_level1[child_address];
//   let chunk_bit = cp & 0b111111;
//   let chunk = tree2_level2[chunk_address];
//   let is_member = 1 == ((chunk >> chunk_bit) & 1);
//
// And so on for the third partition.
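// A sketch of that lookup, with illustrative variable names (it has one more
// level of indirection than the second partition):
//
//   // we subtract 0x10 since (0x10000 >> 12) == 0x10.
//   let child_address = (cp >> 12) - 0x10;
//   let child = tree3_level1[child_address];
//   let leaf_address = child * 64 + ((cp >> 6) & 0b111111);
//   let chunk_address = tree3_level2[leaf_address];
//   let chunk_bit = cp & 0b111111;
//   let chunk = tree3_level3[chunk_address];
//   let is_member = 1 == ((chunk >> chunk_bit) & 1);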
//
// Note that as a special case, if the second or third partitions are empty,
// then the trie will store empty slices for those levels. The `contains`
// check knows to return `false` in those cases.

const CHUNKS: usize = 0x110000 / CHUNK_SIZE;

/// A type alias that maps to `std::result::Result<T, ucd_trie::Error>`.
pub type Result<T> = result::Result<T, Error>;

/// An error that can occur during construction of a trie.
#[derive(Clone, Debug)]
pub enum Error {
    /// This error is returned when an invalid codepoint is given to
    /// `TrieSetOwned::from_codepoints`. An invalid codepoint is a `u32` that
    /// is greater than `0x10FFFF`.
    InvalidCodepoint(u32),
    /// This error is returned when a set of Unicode codepoints could not be
    /// sufficiently compressed into the trie provided by this crate. There is
    /// no work-around for this error at this time.
    GaveUp,
}

impl error::Error for Error {}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            Error::InvalidCodepoint(cp) => write!(
                f,
                "could not construct trie set containing an \
                 invalid Unicode codepoint: 0x{:X}",
                cp
            ),
            Error::GaveUp => {
                write!(f, "could not compress codepoint set into a trie")
            }
        }
    }
}

impl From<Error> for io::Error {
    fn from(err: Error) -> io::Error {
        io::Error::new(io::ErrorKind::Other, err)
    }
}

/// An owned trie set.
#[derive(Clone)]
pub struct TrieSetOwned {
    tree1_level1: Vec<u64>,
    tree2_level1: Vec<u8>,
    tree2_level2: Vec<u64>,
    tree3_level1: Vec<u8>,
    tree3_level2: Vec<u8>,
    tree3_level3: Vec<u64>,
}

impl fmt::Debug for TrieSetOwned {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "TrieSetOwned(...)")
    }
}

impl TrieSetOwned {
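    // Builds the trie from a dense bitmap over all 0x110000 codepoints: the
    // bitmap is first packed into 64-bit chunks, and then each partition's
    // chunks are deduplicated by the `compress_postfix_*` helpers below.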
    fn new(all: &[bool]) -> Result<TrieSetOwned> {
        let mut bitvectors = Vec::with_capacity(CHUNKS);
        for i in 0..CHUNKS {
            let mut bitvector = 0u64;
            for j in 0..CHUNK_SIZE {
                if all[i * CHUNK_SIZE + j] {
                    bitvector |= 1 << j;
                }
            }
            bitvectors.push(bitvector);
        }

        let tree1_level1 =
            bitvectors.iter().cloned().take(0x800 / CHUNK_SIZE).collect();

        let (mut tree2_level1, mut tree2_level2) = compress_postfix_leaves(
            &bitvectors[0x800 / CHUNK_SIZE..0x10000 / CHUNK_SIZE],
        )?;
        if tree2_level2.len() == 1 && tree2_level2[0] == 0 {
            tree2_level1.clear();
            tree2_level2.clear();
        }

        let (mid, mut tree3_level3) = compress_postfix_leaves(
            &bitvectors[0x10000 / CHUNK_SIZE..0x110000 / CHUNK_SIZE],
        )?;
        let (mut tree3_level1, mut tree3_level2) =
            compress_postfix_mid(&mid, 64)?;
        if tree3_level3.len() == 1 && tree3_level3[0] == 0 {
            tree3_level1.clear();
            tree3_level2.clear();
            tree3_level3.clear();
        }

        Ok(TrieSetOwned {
            tree1_level1,
            tree2_level1,
            tree2_level2,
            tree3_level1,
            tree3_level2,
            tree3_level3,
        })
    }

    /// Create a new trie set from a set of Unicode scalar values.
    ///
    /// This returns an error if a set could not be sufficiently compressed to
    /// fit into a trie.
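    ///
    /// # Example
    ///
    /// A small usage sketch (added here for illustration; it assumes the
    /// crate-root re-export `ucd_trie::TrieSetOwned`):
    ///
    /// ```
    /// use ucd_trie::TrieSetOwned;
    ///
    /// let set = TrieSetOwned::from_scalars(['a', 'β', '☃'].iter()).unwrap();
    /// assert!(set.contains_char('β'));
    /// assert!(!set.contains_char('b'));
    /// ```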
    pub fn from_scalars<I, C>(scalars: I) -> Result<TrieSetOwned>
    where
        I: IntoIterator<Item = C>,
        C: Borrow<char>,
    {
        let mut all = vec![false; 0x110000];
        for s in scalars {
            all[*s.borrow() as usize] = true;
        }
        TrieSetOwned::new(&all)
    }

    /// Create a new trie set from a set of Unicode codepoints.
    ///
    /// This returns an error if a set could not be sufficiently compressed to
    /// fit into a trie. This also returns an error if any of the given
    /// codepoints are greater than `0x10FFFF`.
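    ///
    /// # Example
    ///
    /// A small usage sketch (added here for illustration):
    ///
    /// ```
    /// use ucd_trie::TrieSetOwned;
    ///
    /// let set = TrieSetOwned::from_codepoints(vec![0x61u32]).unwrap();
    /// assert!(set.contains_u32(0x61));
    ///
    /// // Anything above 0x10FFFF is rejected with `Error::InvalidCodepoint`.
    /// assert!(TrieSetOwned::from_codepoints(vec![0x110000u32]).is_err());
    /// ```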
    pub fn from_codepoints<I, C>(codepoints: I) -> Result<TrieSetOwned>
    where
        I: IntoIterator<Item = C>,
        C: Borrow<u32>,
    {
        let mut all = vec![false; 0x110000];
        for cp in codepoints {
            let cp = *cp.borrow();
            if cp > 0x10FFFF {
                return Err(Error::InvalidCodepoint(cp));
            }
            all[cp as usize] = true;
        }
        TrieSetOwned::new(&all)
    }

    /// Return this set as a slice.
    #[inline(always)]
    pub fn as_slice(&self) -> TrieSetSlice<'_> {
        TrieSetSlice {
            tree1_level1: &self.tree1_level1,
            tree2_level1: &self.tree2_level1,
            tree2_level2: &self.tree2_level2,
            tree3_level1: &self.tree3_level1,
            tree3_level2: &self.tree3_level2,
            tree3_level3: &self.tree3_level3,
        }
    }

    /// Returns true if and only if the given Unicode scalar value is in this
    /// set.
    pub fn contains_char(&self, c: char) -> bool {
        self.as_slice().contains_char(c)
    }

    /// Returns true if and only if the given codepoint is in this set.
    ///
    /// If the given value exceeds the codepoint range (i.e., it's greater
    /// than `0x10FFFF`), then this returns false.
    pub fn contains_u32(&self, cp: u32) -> bool {
        self.as_slice().contains_u32(cp)
    }
}

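// Deduplicates a partition's 64-bit leaf chunks. Returns a "root" vector
// mapping each input position to the index of its unique chunk, along with
// the vector of unique chunks. Fails with `Error::GaveUp` once more than 256
// distinct chunks are needed, since each index must fit in a `u8`.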
fn compress_postfix_leaves(chunks: &[u64]) -> Result<(Vec<u8>, Vec<u64>)> {
    let mut root = vec![];
    let mut children = vec![];
    let mut bychild = HashMap::new();
    for &chunk in chunks {
        if !bychild.contains_key(&chunk) {
            let start = bychild.len();
            if start > ::std::u8::MAX as usize {
                return Err(Error::GaveUp);
            }
            bychild.insert(chunk, start as u8);
            children.push(chunk);
        }
        root.push(bychild[&chunk]);
    }
    Ok((root, children))
}

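// Same idea as `compress_postfix_leaves`, but one level up: deduplicates
// fixed-size blocks (of `chunk_size` entries) of level-2 indices, producing
// a level-1 index vector plus the concatenated unique blocks.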
fn compress_postfix_mid(
    chunks: &[u8],
    chunk_size: usize,
) -> Result<(Vec<u8>, Vec<u8>)> {
    let mut root = vec![];
    let mut children = vec![];
    let mut bychild = HashMap::new();
    for i in 0..(chunks.len() / chunk_size) {
        let chunk = &chunks[i * chunk_size..(i + 1) * chunk_size];
        if !bychild.contains_key(chunk) {
            let start = bychild.len();
            if start > ::std::u8::MAX as usize {
                return Err(Error::GaveUp);
            }
            bychild.insert(chunk, start as u8);
            children.extend(chunk);
        }
        root.push(bychild[chunk]);
    }
    Ok((root, children))
}

#[cfg(test)]
mod tests {
    use super::TrieSetOwned;
    use crate::general_category;
    use std::collections::HashSet;

    fn mk(scalars: &[char]) -> TrieSetOwned {
        TrieSetOwned::from_scalars(scalars).unwrap()
    }

    fn ranges_to_set(ranges: &[(u32, u32)]) -> Vec<u32> {
        let mut set = vec![];
        for &(start, end) in ranges {
            for cp in start..end + 1 {
                set.push(cp);
            }
        }
        set
    }

    #[test]
    fn set1() {
        let set = mk(&['a']);
        assert!(set.contains_char('a'));
        assert!(!set.contains_char('b'));
        assert!(!set.contains_char('β'));
        assert!(!set.contains_char('☃'));
        assert!(!set.contains_char('😼'));
    }

    #[test]
    fn set_combined() {
        let set = mk(&['a', 'b', 'β', '☃', '😼']);
        assert!(set.contains_char('a'));
        assert!(set.contains_char('b'));
        assert!(set.contains_char('β'));
        assert!(set.contains_char('☃'));
        assert!(set.contains_char('😼'));

        assert!(!set.contains_char('c'));
        assert!(!set.contains_char('θ'));
        assert!(!set.contains_char('⛇'));
        assert!(!set.contains_char('🐲'));
    }

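    // An extra sanity check, added for illustration: invalid codepoints are
    // rejected at construction time, and out-of-range lookups return false.
    #[test]
    fn invalid_codepoint_and_out_of_range() {
        let err = TrieSetOwned::from_codepoints(vec![0x110000u32]).unwrap_err();
        assert!(matches!(err, super::Error::InvalidCodepoint(0x110000)));

        let set = mk(&['a']);
        assert!(!set.contains_u32(0x110000));
    }
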
    // Basic tests on all of the general category sets. We check that
    // membership is correct on every Unicode codepoint... because we can.

    macro_rules! category_test {
        ($name:ident, $ranges:ident) => {
            #[test]
            fn $name() {
                let set = ranges_to_set(general_category::$ranges);
                let hashset: HashSet<u32> = set.iter().cloned().collect();
                let trie = TrieSetOwned::from_codepoints(&set).unwrap();
                for cp in 0..0x110000 {
                    assert!(trie.contains_u32(cp) == hashset.contains(&cp));
                }
                // Test that an invalid codepoint is treated correctly.
                assert!(!trie.contains_u32(0x110000));
                assert!(!hashset.contains(&0x110000));
            }
        };
    }

    category_test!(gencat_cased_letter, CASED_LETTER);
    category_test!(gencat_close_punctuation, CLOSE_PUNCTUATION);
    category_test!(gencat_connector_punctuation, CONNECTOR_PUNCTUATION);
    category_test!(gencat_control, CONTROL);
    category_test!(gencat_currency_symbol, CURRENCY_SYMBOL);
    category_test!(gencat_dash_punctuation, DASH_PUNCTUATION);
    category_test!(gencat_decimal_number, DECIMAL_NUMBER);
    category_test!(gencat_enclosing_mark, ENCLOSING_MARK);
    category_test!(gencat_final_punctuation, FINAL_PUNCTUATION);
    category_test!(gencat_format, FORMAT);
    category_test!(gencat_initial_punctuation, INITIAL_PUNCTUATION);
    category_test!(gencat_letter, LETTER);
    category_test!(gencat_letter_number, LETTER_NUMBER);
    category_test!(gencat_line_separator, LINE_SEPARATOR);
    category_test!(gencat_lowercase_letter, LOWERCASE_LETTER);
    category_test!(gencat_math_symbol, MATH_SYMBOL);
    category_test!(gencat_mark, MARK);
    category_test!(gencat_modifier_letter, MODIFIER_LETTER);
    category_test!(gencat_modifier_symbol, MODIFIER_SYMBOL);
    category_test!(gencat_nonspacing_mark, NONSPACING_MARK);
    category_test!(gencat_number, NUMBER);
    category_test!(gencat_open_punctuation, OPEN_PUNCTUATION);
    category_test!(gencat_other, OTHER);
    category_test!(gencat_other_letter, OTHER_LETTER);
    category_test!(gencat_other_number, OTHER_NUMBER);
    category_test!(gencat_other_punctuation, OTHER_PUNCTUATION);
    category_test!(gencat_other_symbol, OTHER_SYMBOL);
    category_test!(gencat_paragraph_separator, PARAGRAPH_SEPARATOR);
    category_test!(gencat_private_use, PRIVATE_USE);
    category_test!(gencat_punctuation, PUNCTUATION);
    category_test!(gencat_separator, SEPARATOR);
    category_test!(gencat_space_separator, SPACE_SEPARATOR);
    category_test!(gencat_spacing_mark, SPACING_MARK);
    category_test!(gencat_surrogate, SURROGATE);
    category_test!(gencat_symbol, SYMBOL);
    category_test!(gencat_titlecase_letter, TITLECASE_LETTER);
    category_test!(gencat_unassigned, UNASSIGNED);
    category_test!(gencat_uppercase_letter, UPPERCASE_LETTER);
}