/src/rust-url/percent_encoding/src/ascii_set.rs
Line | Count | Source |
1 | | // Copyright 2013-2016 The rust-url developers. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | | // option. This file may not be copied, modified, or distributed |
7 | | // except according to those terms. |
8 | | |
9 | | use core::{mem, ops}; |
10 | | |
11 | | /// Represents a set of characters or bytes in the ASCII range. |
12 | | /// |
13 | | /// This is used in [`percent_encode`] and [`utf8_percent_encode`]. |
14 | | /// This is similar to [percent-encode sets](https://url.spec.whatwg.org/#percent-encoded-bytes). |
15 | | /// |
16 | | /// Use the `add` method of an existing set to define a new set. For example: |
17 | | /// |
18 | | /// [`percent_encode`]: crate::percent_encode |
19 | | /// [`utf8_percent_encode`]: crate::utf8_percent_encode |
20 | | /// |
21 | | /// ``` |
22 | | /// use percent_encoding::{AsciiSet, CONTROLS}; |
23 | | /// |
24 | | /// /// https://url.spec.whatwg.org/#fragment-percent-encode-set |
25 | | /// const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`'); |
26 | | /// ``` |
27 | | #[derive(Debug, PartialEq, Eq)] |
28 | | pub struct AsciiSet { |
29 | | mask: [Chunk; ASCII_RANGE_LEN / BITS_PER_CHUNK], |
30 | | } |
31 | | |
/// Number of distinct values in the ASCII range (`0x00..=0x7F`).
const ASCII_RANGE_LEN: usize = 0x80;

/// Integer type used for each word of the `AsciiSet` bit mask.
type Chunk = u32;

/// Number of membership bits that fit in a single `Chunk`.
const BITS_PER_CHUNK: usize = mem::size_of::<Chunk>() * 8;
37 | | |
38 | | impl AsciiSet { |
39 | | /// An empty set. |
40 | | pub const EMPTY: Self = Self { |
41 | | mask: [0; ASCII_RANGE_LEN / BITS_PER_CHUNK], |
42 | | }; |
43 | | |
44 | | /// Called with UTF-8 bytes rather than code points. |
45 | | /// Not used for non-ASCII bytes. |
46 | 812M | pub(crate) const fn contains(&self, byte: u8) -> bool { |
47 | 812M | let chunk = self.mask[byte as usize / BITS_PER_CHUNK]; |
48 | 812M | let mask = 1 << (byte as usize % BITS_PER_CHUNK); |
49 | 812M | (chunk & mask) != 0 |
50 | 812M | } |
51 | | |
52 | 843M | pub(crate) fn should_percent_encode(&self, byte: u8) -> bool { |
53 | 843M | !byte.is_ascii() || self.contains(byte) |
54 | 843M | } |
55 | | |
56 | 0 | pub const fn add(&self, byte: u8) -> Self { |
57 | 0 | let mut mask = self.mask; |
58 | 0 | mask[byte as usize / BITS_PER_CHUNK] |= 1 << (byte as usize % BITS_PER_CHUNK); |
59 | 0 | Self { mask } |
60 | 0 | } |
61 | | |
62 | 0 | pub const fn remove(&self, byte: u8) -> Self { |
63 | 0 | let mut mask = self.mask; |
64 | 0 | mask[byte as usize / BITS_PER_CHUNK] &= !(1 << (byte as usize % BITS_PER_CHUNK)); |
65 | 0 | Self { mask } |
66 | 0 | } |
67 | | |
68 | | /// Return the union of two sets. |
69 | 0 | pub const fn union(&self, other: Self) -> Self { |
70 | 0 | let mask = [ |
71 | 0 | self.mask[0] | other.mask[0], |
72 | 0 | self.mask[1] | other.mask[1], |
73 | 0 | self.mask[2] | other.mask[2], |
74 | 0 | self.mask[3] | other.mask[3], |
75 | 0 | ]; |
76 | 0 | Self { mask } |
77 | 0 | } |
78 | | |
79 | | /// Return the negation of the set. |
80 | 0 | pub const fn complement(&self) -> Self { |
81 | 0 | let mask = [!self.mask[0], !self.mask[1], !self.mask[2], !self.mask[3]]; |
82 | 0 | Self { mask } |
83 | 0 | } |
84 | | } |
85 | | |
86 | | impl ops::Add for AsciiSet { |
87 | | type Output = Self; |
88 | | |
89 | 0 | fn add(self, other: Self) -> Self { |
90 | 0 | self.union(other) |
91 | 0 | } |
92 | | } |
93 | | |
94 | | impl ops::Not for AsciiSet { |
95 | | type Output = Self; |
96 | | |
97 | 0 | fn not(self) -> Self { |
98 | 0 | self.complement() |
99 | 0 | } |
100 | | } |
101 | | |
102 | | /// The set of 0x00 to 0x1F (C0 controls), and 0x7F (DEL). |
103 | | /// |
104 | | /// Note that this includes the newline and tab characters, but not the space 0x20. |
105 | | /// |
106 | | /// <https://url.spec.whatwg.org/#c0-control-percent-encode-set> |
107 | | pub const CONTROLS: &AsciiSet = &AsciiSet { |
108 | | mask: [ |
109 | | !0_u32, // C0: 0x00 to 0x1F (32 bits set) |
110 | | 0, |
111 | | 0, |
112 | | 1 << (0x7F_u32 % 32), // DEL: 0x7F (one bit set) |
113 | | ], |
114 | | }; |
115 | | |
// Compile-time assertion helper.
//
// Each condition is cast to `usize` and used as the length of a `[u8; N]` array that is
// named as the source type of a `transmute` to `u8`. A `true` condition gives a one-byte
// array, which matches the size of `u8`; a `false` condition gives a zero-byte array,
// which does not, so compilation fails. The conditions must be usable as array lengths,
// i.e. they must be constant expressions.
macro_rules! static_assert {
    ($( $condition: expr, )+) => {
        fn _static_assert() {
            $(
                let _ = mem::transmute::<[u8; $condition as usize], u8>;
            )+
        }
    }
}
125 | | |
126 | | static_assert! { |
127 | | CONTROLS.contains(0x00), |
128 | | CONTROLS.contains(0x1F), |
129 | | !CONTROLS.contains(0x20), |
130 | | !CONTROLS.contains(0x7E), |
131 | | CONTROLS.contains(0x7F), |
132 | | } |
133 | | |
134 | | /// Everything that is not an ASCII letter or digit. |
135 | | /// |
136 | | /// This is probably more eager than necessary in any context. |
137 | | pub const NON_ALPHANUMERIC: &AsciiSet = &CONTROLS |
138 | | .add(b' ') |
139 | | .add(b'!') |
140 | | .add(b'"') |
141 | | .add(b'#') |
142 | | .add(b'$') |
143 | | .add(b'%') |
144 | | .add(b'&') |
145 | | .add(b'\'') |
146 | | .add(b'(') |
147 | | .add(b')') |
148 | | .add(b'*') |
149 | | .add(b'+') |
150 | | .add(b',') |
151 | | .add(b'-') |
152 | | .add(b'.') |
153 | | .add(b'/') |
154 | | .add(b':') |
155 | | .add(b';') |
156 | | .add(b'<') |
157 | | .add(b'=') |
158 | | .add(b'>') |
159 | | .add(b'?') |
160 | | .add(b'@') |
161 | | .add(b'[') |
162 | | .add(b'\\') |
163 | | .add(b']') |
164 | | .add(b'^') |
165 | | .add(b'_') |
166 | | .add(b'`') |
167 | | .add(b'{') |
168 | | .add(b'|') |
169 | | .add(b'}') |
170 | | .add(b'~'); |
171 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// The `+` operator yields the union of its operands.
    #[test]
    fn add_op() {
        let sum = AsciiSet::EMPTY.add(b'A') + AsciiSet::EMPTY.add(b'B');
        assert_eq!(sum, AsciiSet::EMPTY.add(b'A').add(b'B'));
    }

    /// The `!` operator yields the complement of its operand.
    #[test]
    fn not_op() {
        let inverted = !(AsciiSet::EMPTY.add(b'A').add(b'B'));
        assert!(!inverted.contains(b'A'));
        assert!(inverted.contains(b'C'));
    }

    /// The union of two sets can be computed as a constant value, which is useful for
    /// defining sets in a modular way.
    #[test]
    fn union() {
        const ONLY_A: AsciiSet = AsciiSet::EMPTY.add(b'A');
        const ONLY_B: AsciiSet = AsciiSet::EMPTY.add(b'B');
        const A_AND_B: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
        const MERGED: AsciiSet = ONLY_A.union(ONLY_B);
        assert_eq!(MERGED, A_AND_B);
    }

    /// The complement of a set can be computed as a constant value, which is useful for
    /// defining sets in a modular way.
    #[test]
    fn complement() {
        const A_AND_B: AsciiSet = AsciiSet::EMPTY.add(b'A').add(b'B');
        const INVERTED: AsciiSet = A_AND_B.complement();
        // Both members of the original set are excluded; any other ASCII byte is included.
        assert!(!INVERTED.contains(b'A'));
        assert!(!INVERTED.contains(b'B'));
        assert!(INVERTED.contains(b'C'));
    }
}