/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-syntax-0.8.6/src/utf8.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. |
3 | | |
4 | | This is sub-module is useful for constructing byte based automatons that need |
5 | | to embed UTF-8 decoding. The most common use of this module is in conjunction |
6 | | with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type. |
7 | | |
8 | | See the documentation on the `Utf8Sequences` iterator for more details and |
9 | | an example. |
10 | | |
11 | | # Wait, what is this? |
12 | | |
13 | | This is simplest to explain with an example. Let's say you wanted to test |
14 | | whether a particular byte sequence was a Cyrillic character. One possible |
15 | | scalar value range is `[0400-04FF]`. The set of allowed bytes for this |
16 | | range can be expressed as a sequence of byte ranges: |
17 | | |
18 | | ```text |
19 | | [D0-D3][80-BF] |
20 | | ``` |
21 | | |
22 | | This is simple enough: simply encode the boundaries, `0400` encodes to |
23 | | `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each |
24 | | corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. |
25 | | |
26 | | However, what if you wanted to add the Cyrillic Supplementary characters to |
27 | | your range? Your range might then become `[0400-052F]`. The same procedure |
28 | | as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges |
29 | | you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, |
30 | | this isn't quite correct because this range doesn't capture many characters, |
31 | | for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). |
32 | | |
33 | | Instead, you need multiple sequences of byte ranges: |
34 | | |
35 | | ```text |
36 | | [D0-D3][80-BF] # matches codepoints 0400-04FF |
37 | | [D4][80-AF] # matches codepoints 0500-052F |
38 | | ``` |
39 | | |
40 | | This gets even more complicated if you want bigger ranges, particularly if |
41 | | they naively contain surrogate codepoints. For example, the sequence of byte |
42 | | ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: |
43 | | |
44 | | ```text |
45 | | [0-7F] |
46 | | [C2-DF][80-BF] |
47 | | [E0][A0-BF][80-BF] |
48 | | [E1-EC][80-BF][80-BF] |
49 | | [ED][80-9F][80-BF] |
50 | | [EE-EF][80-BF][80-BF] |
51 | | ``` |
52 | | |
53 | | Note that the byte ranges above will *not* match any erroneous encoding of |
54 | | UTF-8, including encodings of surrogate codepoints. |
55 | | |
56 | | And, of course, for all of Unicode (`[000000-10FFFF]`): |
57 | | |
58 | | ```text |
59 | | [0-7F] |
60 | | [C2-DF][80-BF] |
61 | | [E0][A0-BF][80-BF] |
62 | | [E1-EC][80-BF][80-BF] |
63 | | [ED][80-9F][80-BF] |
64 | | [EE-EF][80-BF][80-BF] |
65 | | [F0][90-BF][80-BF][80-BF] |
66 | | [F1-F3][80-BF][80-BF][80-BF] |
67 | | [F4][80-8F][80-BF][80-BF] |
68 | | ``` |
69 | | |
70 | | This module automates the process of creating these byte ranges from ranges of |
71 | | Unicode scalar values. |
72 | | |
73 | | # Lineage |
74 | | |
75 | | I got the idea and general implementation strategy from Russ Cox in his |
76 | | [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. |
77 | | Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). |
78 | | I also got the idea from |
79 | | [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), |
80 | | which uses it for executing automata on their term index. |
81 | | */ |
82 | | |
83 | | use core::{char, fmt, iter::FusedIterator, slice}; |
84 | | |
85 | | use alloc::{vec, vec::Vec}; |
86 | | |
87 | | const MAX_UTF8_BYTES: usize = 4; |
88 | | |
89 | | /// Utf8Sequence represents a sequence of byte ranges. |
90 | | /// |
91 | | /// To match a Utf8Sequence, a candidate byte sequence must match each |
92 | | /// successive range. |
93 | | /// |
94 | | /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte |
95 | | /// sequence `\xDD\x61` would not match because `0x61 < 0x80`. |
96 | | #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] |
97 | | pub enum Utf8Sequence { |
98 | | /// One byte range. |
99 | | One(Utf8Range), |
100 | | /// Two successive byte ranges. |
101 | | Two([Utf8Range; 2]), |
102 | | /// Three successive byte ranges. |
103 | | Three([Utf8Range; 3]), |
104 | | /// Four successive byte ranges. |
105 | | Four([Utf8Range; 4]), |
106 | | } |
107 | | |
108 | | impl Utf8Sequence { |
109 | | /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value |
110 | | /// range. |
111 | | /// |
112 | | /// This assumes that `start` and `end` have the same length. |
113 | 7.88k | fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { |
114 | 7.88k | assert_eq!(start.len(), end.len()); |
115 | 7.88k | match start.len() { |
116 | 488 | 2 => Utf8Sequence::Two([ |
117 | 488 | Utf8Range::new(start[0], end[0]), |
118 | 488 | Utf8Range::new(start[1], end[1]), |
119 | 488 | ]), |
120 | 3.71k | 3 => Utf8Sequence::Three([ |
121 | 3.71k | Utf8Range::new(start[0], end[0]), |
122 | 3.71k | Utf8Range::new(start[1], end[1]), |
123 | 3.71k | Utf8Range::new(start[2], end[2]), |
124 | 3.71k | ]), |
125 | 3.68k | 4 => Utf8Sequence::Four([ |
126 | 3.68k | Utf8Range::new(start[0], end[0]), |
127 | 3.68k | Utf8Range::new(start[1], end[1]), |
128 | 3.68k | Utf8Range::new(start[2], end[2]), |
129 | 3.68k | Utf8Range::new(start[3], end[3]), |
130 | 3.68k | ]), |
131 | 0 | n => unreachable!("invalid encoded length: {}", n), |
132 | | } |
133 | 7.88k | } |
134 | | |
135 | | /// Returns the underlying sequence of byte ranges as a slice. |
136 | 7.95k | pub fn as_slice(&self) -> &[Utf8Range] { |
137 | | use self::Utf8Sequence::*; |
138 | 7.95k | match *self { |
139 | 74 | One(ref r) => slice::from_ref(r), |
140 | 488 | Two(ref r) => &r[..], |
141 | 3.71k | Three(ref r) => &r[..], |
142 | 3.68k | Four(ref r) => &r[..], |
143 | | } |
144 | 7.95k | } |
145 | | |
146 | | /// Returns the number of byte ranges in this sequence. |
147 | | /// |
148 | | /// The length is guaranteed to be in the closed interval `[1, 4]`. |
149 | 0 | pub fn len(&self) -> usize { |
150 | 0 | self.as_slice().len() |
151 | 0 | } |
152 | | |
153 | | /// Reverses the ranges in this sequence. |
154 | | /// |
155 | | /// For example, if this corresponds to the following sequence: |
156 | | /// |
157 | | /// ```text |
158 | | /// [D0-D3][80-BF] |
159 | | /// ``` |
160 | | /// |
161 | | /// Then after reversal, it will be |
162 | | /// |
163 | | /// ```text |
164 | | /// [80-BF][D0-D3] |
165 | | /// ``` |
166 | | /// |
167 | | /// This is useful when one is constructing a UTF-8 automaton to match |
168 | | /// character classes in reverse. |
169 | 0 | pub fn reverse(&mut self) { |
170 | 0 | match *self { |
171 | 0 | Utf8Sequence::One(_) => {} |
172 | 0 | Utf8Sequence::Two(ref mut x) => x.reverse(), |
173 | 0 | Utf8Sequence::Three(ref mut x) => x.reverse(), |
174 | 0 | Utf8Sequence::Four(ref mut x) => x.reverse(), |
175 | | } |
176 | 0 | } |
177 | | |
178 | | /// Returns true if and only if a prefix of `bytes` matches this sequence |
179 | | /// of byte ranges. |
180 | 0 | pub fn matches(&self, bytes: &[u8]) -> bool { |
181 | 0 | if bytes.len() < self.len() { |
182 | 0 | return false; |
183 | 0 | } |
184 | 0 | for (&b, r) in bytes.iter().zip(self) { |
185 | 0 | if !r.matches(b) { |
186 | 0 | return false; |
187 | 0 | } |
188 | | } |
189 | 0 | true |
190 | 0 | } |
191 | | } |
192 | | |
193 | | impl<'a> IntoIterator for &'a Utf8Sequence { |
194 | | type IntoIter = slice::Iter<'a, Utf8Range>; |
195 | | type Item = &'a Utf8Range; |
196 | | |
197 | 0 | fn into_iter(self) -> Self::IntoIter { |
198 | 0 | self.as_slice().iter() |
199 | 0 | } |
200 | | } |
201 | | |
202 | | impl fmt::Debug for Utf8Sequence { |
203 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
204 | | use self::Utf8Sequence::*; |
205 | 0 | match *self { |
206 | 0 | One(ref r) => write!(f, "{:?}", r), |
207 | 0 | Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]), |
208 | 0 | Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]), |
209 | 0 | Four(ref r) => { |
210 | 0 | write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3]) |
211 | | } |
212 | | } |
213 | 0 | } |
214 | | } |
215 | | |
216 | | /// A single inclusive range of UTF-8 bytes. |
217 | | #[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] |
218 | | pub struct Utf8Range { |
219 | | /// Start of byte range (inclusive). |
220 | | pub start: u8, |
221 | | /// End of byte range (inclusive). |
222 | | pub end: u8, |
223 | | } |
224 | | |
225 | | impl Utf8Range { |
226 | 26.9k | fn new(start: u8, end: u8) -> Self { |
227 | 26.9k | Utf8Range { start, end } |
228 | 26.9k | } |
229 | | |
230 | | /// Returns true if and only if the given byte is in this range. |
231 | 0 | pub fn matches(&self, b: u8) -> bool { |
232 | 0 | self.start <= b && b <= self.end |
233 | 0 | } |
234 | | } |
235 | | |
236 | | impl fmt::Debug for Utf8Range { |
237 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
238 | 0 | if self.start == self.end { |
239 | 0 | write!(f, "[{:X}]", self.start) |
240 | | } else { |
241 | 0 | write!(f, "[{:X}-{:X}]", self.start, self.end) |
242 | | } |
243 | 0 | } |
244 | | } |
245 | | |
246 | | /// An iterator over ranges of matching UTF-8 byte sequences. |
247 | | /// |
248 | | /// The iteration represents an alternation of comprehensive byte sequences |
249 | | /// that match precisely the set of UTF-8 encoded scalar values. |
250 | | /// |
251 | | /// A byte sequence corresponds to one of the scalar values in the range given |
252 | | /// if and only if it completely matches exactly one of the sequences of byte |
253 | | /// ranges produced by this iterator. |
254 | | /// |
255 | | /// Each sequence of byte ranges matches a unique set of bytes. That is, no two |
256 | | /// sequences will match the same bytes. |
257 | | /// |
258 | | /// # Example |
259 | | /// |
260 | | /// This shows how to match an arbitrary byte sequence against a range of |
261 | | /// scalar values. |
262 | | /// |
263 | | /// ```rust |
264 | | /// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence}; |
265 | | /// |
266 | | /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { |
267 | | /// for range in seqs { |
268 | | /// if range.matches(bytes) { |
269 | | /// return true; |
270 | | /// } |
271 | | /// } |
272 | | /// false |
273 | | /// } |
274 | | /// |
275 | | /// // Test the basic multilingual plane. |
276 | | /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect(); |
277 | | /// |
278 | | /// // UTF-8 encoding of 'a'. |
279 | | /// assert!(matches(&seqs, &[0x61])); |
280 | | /// // UTF-8 encoding of '☃' (`\u{2603}`). |
281 | | /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); |
282 | | /// // UTF-8 encoding of `\u{10348}` (outside the BMP). |
283 | | /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); |
284 | | /// // Tries to match against a UTF-8 encoding of a surrogate codepoint, |
285 | | /// // which is invalid UTF-8, and therefore fails, despite the fact that |
286 | | /// // the corresponding codepoint (0xD800) falls in the range given. |
287 | | /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); |
288 | | /// // And fails against plain old invalid UTF-8. |
289 | | /// assert!(!matches(&seqs, &[0xFF, 0xFF])); |
290 | | /// ``` |
291 | | /// |
292 | | /// If this example seems circuitous, that's because it is! It's meant to be |
293 | | /// illustrative. In practice, you could just try to decode your byte sequence |
294 | | /// and compare it with the scalar value range directly. However, this is not |
295 | | /// always possible (for example, in a byte based automaton). |
296 | | #[derive(Debug)] |
297 | | pub struct Utf8Sequences { |
298 | | range_stack: Vec<ScalarRange>, |
299 | | } |
300 | | |
301 | | impl Utf8Sequences { |
302 | | /// Create a new iterator over UTF-8 byte ranges for the scalar value range |
303 | | /// given. |
304 | 6.31k | pub fn new(start: char, end: char) -> Self { |
305 | 6.31k | let range = |
306 | 6.31k | ScalarRange { start: u32::from(start), end: u32::from(end) }; |
307 | 6.31k | Utf8Sequences { range_stack: vec![range] } |
308 | 6.31k | } |
309 | | |
310 | | /// reset resets the scalar value range. |
311 | | /// Any existing state is cleared, but resources may be reused. |
312 | | /// |
313 | | /// N.B. Benchmarks say that this method is dubious. |
314 | | #[doc(hidden)] |
315 | 0 | pub fn reset(&mut self, start: char, end: char) { |
316 | 0 | self.range_stack.clear(); |
317 | 0 | self.push(u32::from(start), u32::from(end)); |
318 | 0 | } |
319 | | |
320 | 1.64k | fn push(&mut self, start: u32, end: u32) { |
321 | 1.64k | self.range_stack.push(ScalarRange { start, end }); |
322 | 1.64k | } |
323 | | } |
324 | | |
325 | | struct ScalarRange { |
326 | | start: u32, |
327 | | end: u32, |
328 | | } |
329 | | |
330 | | impl fmt::Debug for ScalarRange { |
331 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
332 | 0 | write!(f, "ScalarRange({:X}, {:X})", self.start, self.end) |
333 | 0 | } |
334 | | } |
335 | | |
336 | | impl Iterator for Utf8Sequences { |
337 | | type Item = Utf8Sequence; |
338 | | |
339 | 14.2k | fn next(&mut self) -> Option<Self::Item> { |
340 | 14.2k | 'TOP: while let Some(mut r) = self.range_stack.pop() { |
341 | | 'INNER: loop { |
342 | 9.59k | if let Some((r1, r2)) = r.split() { |
343 | 8 | self.push(r2.start, r2.end); |
344 | 8 | r.start = r1.start; |
345 | 8 | r.end = r1.end; |
346 | 8 | continue 'INNER; |
347 | 9.59k | } |
348 | 9.59k | if !r.is_valid() { |
349 | 0 | continue 'TOP; |
350 | 9.59k | } |
351 | 38.3k | for i in 1..MAX_UTF8_BYTES { |
352 | 28.7k | let max = max_scalar_value(i); |
353 | 28.7k | if r.start <= max && max < r.end { |
354 | 24 | self.push(max + 1, r.end); |
355 | 24 | r.end = max; |
356 | 24 | continue 'INNER; |
357 | 28.7k | } |
358 | | } |
359 | 9.56k | if let Some(ascii_range) = r.as_ascii() { |
360 | 74 | return Some(Utf8Sequence::One(ascii_range)); |
361 | 9.49k | } |
362 | 33.2k | for i in 1..MAX_UTF8_BYTES { |
363 | 25.4k | let m = (1 << (6 * i)) - 1; |
364 | 25.4k | if (r.start & !m) != (r.end & !m) { |
365 | 2.30k | if (r.start & m) != 0 { |
366 | 1.04k | self.push((r.start | m) + 1, r.end); |
367 | 1.04k | r.end = r.start | m; |
368 | 1.04k | continue 'INNER; |
369 | 1.26k | } |
370 | 1.26k | if (r.end & m) != m { |
371 | 568 | self.push(r.end & !m, r.end); |
372 | 568 | r.end = (r.end & !m) - 1; |
373 | 568 | continue 'INNER; |
374 | 696 | } |
375 | 23.1k | } |
376 | | } |
377 | 7.88k | let mut start = [0; MAX_UTF8_BYTES]; |
378 | 7.88k | let mut end = [0; MAX_UTF8_BYTES]; |
379 | 7.88k | let n = r.encode(&mut start, &mut end); |
380 | 7.88k | return Some(Utf8Sequence::from_encoded_range( |
381 | 7.88k | &start[0..n], |
382 | 7.88k | &end[0..n], |
383 | 7.88k | )); |
384 | | } |
385 | | } |
386 | 6.31k | None |
387 | 14.2k | } |
388 | | } |
389 | | |
390 | | impl FusedIterator for Utf8Sequences {} |
391 | | |
392 | | impl ScalarRange { |
393 | | /// split splits this range if it overlaps with a surrogate codepoint. |
394 | | /// |
395 | | /// Either or both ranges may be invalid. |
396 | 9.59k | fn split(&self) -> Option<(ScalarRange, ScalarRange)> { |
397 | 9.59k | if self.start < 0xE000 && self.end > 0xD7FF { |
398 | 8 | Some(( |
399 | 8 | ScalarRange { start: self.start, end: 0xD7FF }, |
400 | 8 | ScalarRange { start: 0xE000, end: self.end }, |
401 | 8 | )) |
402 | | } else { |
403 | 9.59k | None |
404 | | } |
405 | 9.59k | } |
406 | | |
407 | | /// is_valid returns true if and only if start <= end. |
408 | 19.1k | fn is_valid(&self) -> bool { |
409 | 19.1k | self.start <= self.end |
410 | 19.1k | } |
411 | | |
412 | | /// as_ascii returns this range as a Utf8Range if and only if all scalar |
413 | | /// values in this range can be encoded as a single byte. |
414 | 9.56k | fn as_ascii(&self) -> Option<Utf8Range> { |
415 | 9.56k | if self.is_ascii() { |
416 | 74 | let start = u8::try_from(self.start).unwrap(); |
417 | 74 | let end = u8::try_from(self.end).unwrap(); |
418 | 74 | Some(Utf8Range::new(start, end)) |
419 | | } else { |
420 | 9.49k | None |
421 | | } |
422 | 9.56k | } |
423 | | |
424 | | /// is_ascii returns true if the range is ASCII only (i.e., takes a single |
425 | | /// byte to encode any scalar value). |
426 | 9.56k | fn is_ascii(&self) -> bool { |
427 | 9.56k | self.is_valid() && self.end <= 0x7f |
428 | 9.56k | } |
429 | | |
430 | | /// encode writes the UTF-8 encoding of the start and end of this range |
431 | | /// to the corresponding destination slices, and returns the number of |
432 | | /// bytes written. |
433 | | /// |
434 | | /// The slices should have room for at least `MAX_UTF8_BYTES`. |
435 | 7.88k | fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize { |
436 | 7.88k | let cs = char::from_u32(self.start).unwrap(); |
437 | 7.88k | let ce = char::from_u32(self.end).unwrap(); |
438 | 7.88k | let ss = cs.encode_utf8(start); |
439 | 7.88k | let se = ce.encode_utf8(end); |
440 | 7.88k | assert_eq!(ss.len(), se.len()); |
441 | 7.88k | ss.len() |
442 | 7.88k | } |
443 | | } |
444 | | |
445 | 28.7k | fn max_scalar_value(nbytes: usize) -> u32 { |
446 | 28.7k | match nbytes { |
447 | 9.59k | 1 => 0x007F, |
448 | 9.58k | 2 => 0x07FF, |
449 | 9.57k | 3 => 0xFFFF, |
450 | 0 | 4 => 0x0010_FFFF, |
451 | 0 | _ => unreachable!("invalid UTF-8 byte sequence size"), |
452 | | } |
453 | 28.7k | } |
454 | | |
455 | | #[cfg(test)] |
456 | | mod tests { |
457 | | use core::char; |
458 | | |
459 | | use alloc::{vec, vec::Vec}; |
460 | | |
461 | | use crate::utf8::{Utf8Range, Utf8Sequences}; |
462 | | |
463 | | fn rutf8(s: u8, e: u8) -> Utf8Range { |
464 | | Utf8Range::new(s, e) |
465 | | } |
466 | | |
467 | | fn never_accepts_surrogate_codepoints(start: char, end: char) { |
468 | | for cp in 0xD800..0xE000 { |
469 | | let buf = encode_surrogate(cp); |
470 | | for r in Utf8Sequences::new(start, end) { |
471 | | if r.matches(&buf) { |
472 | | panic!( |
473 | | "Sequence ({:X}, {:X}) contains range {:?}, \ |
474 | | which matches surrogate code point {:X} \ |
475 | | with encoded bytes {:?}", |
476 | | u32::from(start), |
477 | | u32::from(end), |
478 | | r, |
479 | | cp, |
480 | | buf, |
481 | | ); |
482 | | } |
483 | | } |
484 | | } |
485 | | } |
486 | | |
487 | | #[test] |
488 | | fn codepoints_no_surrogates() { |
489 | | never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}'); |
490 | | never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}'); |
491 | | never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}'); |
492 | | never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}'); |
493 | | never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}'); |
494 | | } |
495 | | |
496 | | #[test] |
497 | | fn single_codepoint_one_sequence() { |
498 | | // Tests that every range of scalar values that contains a single |
499 | | // scalar value is recognized by one sequence of byte ranges. |
500 | | for i in 0x0..=0x0010_FFFF { |
501 | | let c = match char::from_u32(i) { |
502 | | None => continue, |
503 | | Some(c) => c, |
504 | | }; |
505 | | let seqs: Vec<_> = Utf8Sequences::new(c, c).collect(); |
506 | | assert_eq!(seqs.len(), 1); |
507 | | } |
508 | | } |
509 | | |
510 | | #[test] |
511 | | fn bmp() { |
512 | | use crate::utf8::Utf8Sequence::*; |
513 | | |
514 | | let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>(); |
515 | | assert_eq!( |
516 | | seqs, |
517 | | vec![ |
518 | | One(rutf8(0x0, 0x7F)), |
519 | | Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]), |
520 | | Three([ |
521 | | rutf8(0xE0, 0xE0), |
522 | | rutf8(0xA0, 0xBF), |
523 | | rutf8(0x80, 0xBF) |
524 | | ]), |
525 | | Three([ |
526 | | rutf8(0xE1, 0xEC), |
527 | | rutf8(0x80, 0xBF), |
528 | | rutf8(0x80, 0xBF) |
529 | | ]), |
530 | | Three([ |
531 | | rutf8(0xED, 0xED), |
532 | | rutf8(0x80, 0x9F), |
533 | | rutf8(0x80, 0xBF) |
534 | | ]), |
535 | | Three([ |
536 | | rutf8(0xEE, 0xEF), |
537 | | rutf8(0x80, 0xBF), |
538 | | rutf8(0x80, 0xBF) |
539 | | ]), |
540 | | ] |
541 | | ); |
542 | | } |
543 | | |
544 | | #[test] |
545 | | fn reverse() { |
546 | | use crate::utf8::Utf8Sequence::*; |
547 | | |
548 | | let mut s = One(rutf8(0xA, 0xB)); |
549 | | s.reverse(); |
550 | | assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]); |
551 | | |
552 | | let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]); |
553 | | s.reverse(); |
554 | | assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]); |
555 | | |
556 | | let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]); |
557 | | s.reverse(); |
558 | | assert_eq!( |
559 | | s.as_slice(), |
560 | | &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)] |
561 | | ); |
562 | | |
563 | | let mut s = Four([ |
564 | | rutf8(0xA, 0xB), |
565 | | rutf8(0xB, 0xC), |
566 | | rutf8(0xC, 0xD), |
567 | | rutf8(0xD, 0xE), |
568 | | ]); |
569 | | s.reverse(); |
570 | | assert_eq!( |
571 | | s.as_slice(), |
572 | | &[ |
573 | | rutf8(0xD, 0xE), |
574 | | rutf8(0xC, 0xD), |
575 | | rutf8(0xB, 0xC), |
576 | | rutf8(0xA, 0xB) |
577 | | ] |
578 | | ); |
579 | | } |
580 | | |
581 | | fn encode_surrogate(cp: u32) -> [u8; 3] { |
582 | | const TAG_CONT: u8 = 0b1000_0000; |
583 | | const TAG_THREE_B: u8 = 0b1110_0000; |
584 | | |
585 | | assert!(0xD800 <= cp && cp < 0xE000); |
586 | | let mut dst = [0; 3]; |
587 | | dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B; |
588 | | dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT; |
589 | | dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT; |
590 | | dst |
591 | | } |
592 | | } |