/rust/registry/src/index.crates.io-6f17d22bba15001f/jiff-0.1.8/src/util/escape.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Provides convenience routines for escaping raw bytes. |
3 | | |
4 | | This was copied from `regex-automata` with a few light edits. |
5 | | */ |
6 | | |
/// Provides a convenient `Debug` implementation for a `u8`.
///
/// The byte is treated as ASCII and rendered in a human readable form:
/// printable ASCII is emitted as-is, the usual short escapes (`\n`, `\t`,
/// `\r`, `\\`, `\'`, `\"`) are used where they exist, and every other byte
/// is emitted as an upper-case hex escape such as `\xAB`.
///
/// `Display` writes the bare representation; `Debug` wraps it in double
/// quotes.
#[derive(Clone, Copy)]
pub struct Byte(pub u8);

impl core::fmt::Display for Byte {
    /// Writes the escaped form of the byte without surrounding quotes.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // `escape_default` yields only ASCII, so every item converts to a
        // `char` losslessly. A space is already emitted verbatim by it, so
        // no special case is required.
        for (i, b) in core::ascii::escape_default(self.0).enumerate() {
            // Only `\xNN` escapes are longer than two bytes, so anything at
            // index 2 or later is a hex digit: capitalize \xab to \xAB.
            let b = if i >= 2 { b.to_ascii_uppercase() } else { b };
            write!(f, "{}", char::from(b))?;
        }
        Ok(())
    }
}

impl core::fmt::Debug for Byte {
    /// Writes the `Display` form of the byte wrapped in double quotes.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "\"")?;
        core::fmt::Display::fmt(self, f)?;
        write!(f, "\"")
    }
}
43 | | |
44 | | /// Provides a convenient `Debug` implementation for `&[u8]`. |
45 | | /// |
46 | | /// This generally works best when the bytes are presumed to be mostly UTF-8, |
47 | | /// but will work for anything. For any bytes that aren't UTF-8, they are |
48 | | /// emitted as hex escape sequences. |
49 | | pub struct Bytes<'a>(pub &'a [u8]); |
50 | | |
51 | | impl<'a> core::fmt::Display for Bytes<'a> { |
52 | 23.0k | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
53 | 23.0k | // This is a sad re-implementation of a similar impl found in bstr. |
54 | 23.0k | let mut bytes = self.0; |
55 | 17.8M | while let Some(result) = utf8_decode(bytes) { |
56 | 17.8M | let ch = match result { |
57 | 17.8M | Ok(ch) => ch, |
58 | 1.27k | Err(byte) => { |
59 | 1.27k | write!(f, r"\x{:02x}", byte)?; |
60 | 1.27k | bytes = &bytes[1..]; |
61 | 1.27k | continue; |
62 | | } |
63 | | }; |
64 | 17.8M | bytes = &bytes[ch.len_utf8()..]; |
65 | 17.8M | match ch { |
66 | 5.21M | '\0' => write!(f, "\\0")?, |
67 | | // ASCII control characters except \0, \n, \r, \t |
68 | 9.11M | '\x01'..='\x08' |
69 | | | '\x0b' |
70 | | | '\x0c' |
71 | 9.00M | | '\x0e'..='\x19' |
72 | | | '\x7f' => { |
73 | 1.07M | write!(f, "\\x{:02x}", u32::from(ch))?; |
74 | | } |
75 | | '\n' | '\r' | '\t' | _ => { |
76 | 11.5M | write!(f, "{}", ch.escape_debug())?; |
77 | | } |
78 | | } |
79 | | } |
80 | 23.0k | Ok(()) |
81 | 23.0k | } |
82 | | } |
83 | | |
84 | | impl<'a> core::fmt::Debug for Bytes<'a> { |
85 | 22.8k | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
86 | 22.8k | write!(f, "\"")?; |
87 | 22.8k | core::fmt::Display::fmt(self, f)?; |
88 | 22.8k | write!(f, "\"")?; |
89 | 22.8k | Ok(()) |
90 | 22.8k | } |
91 | | } |
92 | | |
93 | | /// Decodes the next UTF-8 encoded codepoint from the given byte slice. |
94 | | /// |
95 | | /// If no valid encoding of a codepoint exists at the beginning of the given |
96 | | /// byte slice, then the first byte is returned instead. |
97 | | /// |
98 | | /// This returns `None` if and only if `bytes` is empty. |
99 | | /// |
100 | | /// This never panics. |
101 | | /// |
102 | | /// *WARNING*: This is not designed for performance. If you're looking for a |
103 | | /// fast UTF-8 decoder, this is not it. If you feel like you need one in this |
104 | | /// crate, then please file an issue and discuss your use case. |
105 | 17.8M | fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> { |
106 | 17.8M | if bytes.is_empty() { |
107 | 23.0k | return None; |
108 | 17.8M | } |
109 | 17.8M | let len = match utf8_len(bytes[0]) { |
110 | 560 | None => return Some(Err(bytes[0])), |
111 | 17.8M | Some(len) if len > bytes.len() => return Some(Err(bytes[0])), |
112 | 17.8M | Some(1) => return Some(Ok(char::from(bytes[0]))), |
113 | 15.0k | Some(len) => len, |
114 | 15.0k | }; |
115 | 15.0k | match core::str::from_utf8(&bytes[..len]) { |
116 | 15.0k | Ok(s) => Some(Ok(s.chars().next().unwrap())), |
117 | 0 | Err(_) => Some(Err(bytes[0])), |
118 | | } |
119 | 17.8M | } |
120 | | |
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`. That covers continuation bytes (`0x80..=0xBF`), the bytes
/// `0xC0` and `0xC1` (which could only begin overlong encodings), and
/// `0xF5..=0xFF` (which could only begin codepoints above U+10FFFF).
///
/// A `Some` result only describes the length implied by the leading byte;
/// the bytes that follow still need to be validated by the caller.
fn utf8_len(byte: u8) -> Option<usize> {
    match byte {
        0x00..=0x7F => Some(1),
        0xC2..=0xDF => Some(2),
        0xE0..=0xEF => Some(3),
        0xF0..=0xF4 => Some(4),
        _ => None,
    }
}