/rust/registry/src/index.crates.io-1949cf8c6b5b557f/futf-0.1.5/src/lib.rs
Line | Count | Source |
1 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
2 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
3 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
4 | | // option. This file may not be copied, modified, or distributed |
5 | | // except according to those terms. |
6 | | |
7 | | #![cfg_attr(test, feature(test))] |
8 | | |
9 | | #[macro_use] |
10 | | extern crate debug_unreachable; |
11 | | |
12 | | #[macro_use] |
13 | | extern crate mac; |
14 | | |
15 | | #[cfg(test)] |
16 | | extern crate test as std_test; |
17 | | |
18 | | use std::{slice, char}; |
19 | | |
/// What a complete or partial UTF-8 sequence turned out to represent.
///
/// Validation is lazy: a sequence reported as `Prefix` or `Suffix` is not
/// guaranteed to have any valid completion.
#[derive(Debug, Copy, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum Meaning {
    /// A complete, valid Unicode codepoint.
    Whole(char),

    /// Not a valid Unicode codepoint, but the encoding of what *would* be a
    /// UTF-16 leading surrogate code unit (`U+D800` - `U+DBFF`).
    ///
    /// The payload is the code unit's 10-bit offset from the start of that
    /// range.
    ///
    /// Such sequences occur in UTF-8 variants like CESU-8 and WTF-8.
    LeadSurrogate(u16),

    /// Not a valid Unicode codepoint, but the encoding of what *would* be a
    /// UTF-16 trailing surrogate code unit (`U+DC00` - `U+DFFF`).
    ///
    /// The payload is the code unit's 10-bit offset from the start of that
    /// range.
    ///
    /// Such sequences occur in UTF-8 variants like CESU-8 and WTF-8.
    TrailSurrogate(u16),

    /// The buffer ended part-way through a codepoint, so only its leading
    /// bytes are available.
    ///
    /// The payload is how many more bytes are required to complete it.
    Prefix(usize),

    /// The scan ran off the start of the buffer, so only trailing bytes of a
    /// codepoint are available.
    ///
    /// As many as 3 preceding bytes may be missing.
    Suffix,
}
58 | | |
59 | | /// Represents a complete or partial UTF-8 codepoint. |
60 | | #[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)] |
61 | | pub struct Codepoint<'a> { |
62 | | /// The bytes that make up the partial or full codepoint. |
63 | | /// |
64 | | /// For a `Suffix` this depends on `idx`. We don't scan forward |
65 | | /// for additional continuation bytes after the reverse scan |
66 | | /// failed to locate a multibyte sequence start. |
67 | | pub bytes: &'a [u8], |
68 | | |
69 | | /// Start of the codepoint in the buffer, expressed as an offset |
70 | | /// back from `idx`. |
71 | | pub rewind: usize, |
72 | | |
73 | | /// Meaning of the partial or full codepoint. |
74 | | pub meaning: Meaning, |
75 | | } |
76 | | |
/// Structural role of a single byte within UTF-8.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// `0xxxxxxx`: a one-byte (ASCII) codepoint.
    Ascii,
    /// Lead byte of a multibyte sequence; the payload is the total length of
    /// that sequence in bytes (2, 3 or 4).
    Start(usize),
    /// `10xxxxxx`: a continuation byte.
    Cont,
}

impl Byte {
    /// Classifies `x` by its high bits.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which match none of the recognised
    /// bit patterns.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        // The classes occupy consecutive, non-overlapping ranges of byte
        // values, so ordered upper bounds select the same class as testing
        // the leading bit patterns.
        if x < 0x80 {
            // 0xxxxxxx
            Some(Byte::Ascii)
        } else if x < 0xC0 {
            // 10xxxxxx
            Some(Byte::Cont)
        } else if x < 0xE0 {
            // 110xxxxx
            Some(Byte::Start(2))
        } else if x < 0xF0 {
            // 1110xxxx
            Some(Byte::Start(3))
        } else if x < 0xF8 {
            // 11110xxx
            Some(Byte::Start(4))
        } else {
            // 11111xxx
            None
        }
    }
}
99 | | |
100 | | #[inline(always)] |
101 | 0 | fn all_cont(buf: &[u8]) -> bool { |
102 | 0 | buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont))) |
103 | 0 | } |
104 | | |
105 | | // NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence: |
106 | | // a starting byte followed by the correct number of continuation bytes. |
107 | | #[inline(always)] |
108 | 0 | unsafe fn decode(buf: &[u8]) -> Option<Meaning> { |
109 | 0 | debug_assert!(buf.len() >= 2); |
110 | 0 | debug_assert!(buf.len() <= 4); |
111 | | let n; |
112 | 0 | match buf.len() { |
113 | | 2 => { |
114 | 0 | n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6 |
115 | 0 | | ((*buf.get_unchecked(1) & 0x3F) as u32); |
116 | 0 | if n < 0x80 { return None } // Overlong |
117 | | } |
118 | | 3 => { |
119 | 0 | n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12 |
120 | 0 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6 |
121 | 0 | | ((*buf.get_unchecked(2) & 0x3F) as u32); |
122 | 0 | match n { |
123 | 0 | 0x0000 ... 0x07FF => return None, // Overlong |
124 | 0 | 0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)), |
125 | 0 | 0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)), |
126 | 0 | _ => {} |
127 | | } |
128 | | } |
129 | | 4 => { |
130 | 0 | n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18 |
131 | 0 | | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12 |
132 | 0 | | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6 |
133 | 0 | | ((*buf.get_unchecked(3) & 0x3F) as u32); |
134 | 0 | if n < 0x1_0000 { return None } // Overlong |
135 | | } |
136 | 0 | _ => debug_unreachable!(), |
137 | | } |
138 | | |
139 | 0 | char::from_u32(n).map(Meaning::Whole) |
140 | 0 | } |
141 | | |
/// Borrows the `new_len` bytes of `buf` that begin at offset `start`,
/// without bounds checking in release builds.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`; both are verified only by debug
/// assertions.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    let end = start + new_len;
    buf.get_unchecked(start..end)
}
148 | | |
// Unwraps an `Option`, or makes the enclosing function return `None`.
macro_rules! otry {
    ($x:expr) => {
        match $x {
            Some(value) => value,
            None => return None,
        }
    };
}
152 | | |
153 | | /// Describes the UTF-8 codepoint containing the byte at index `idx` within |
154 | | /// `buf`. |
155 | | /// |
156 | | /// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8 |
157 | | /// in the vicinity of `idx`. |
158 | | #[inline] |
159 | 0 | pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> { |
160 | 0 | if idx >= buf.len() { |
161 | 0 | return None; |
162 | 0 | } |
163 | | |
164 | | unsafe { |
165 | 0 | let x = *buf.get_unchecked(idx); |
166 | 0 | match otry!(Byte::classify(x)) { |
167 | 0 | Byte::Ascii => Some(Codepoint { |
168 | 0 | bytes: unsafe_slice(buf, idx, 1), |
169 | 0 | rewind: 0, |
170 | 0 | meaning: Meaning::Whole(x as char), |
171 | 0 | }), |
172 | 0 | Byte::Start(n) => { |
173 | 0 | let avail = buf.len() - idx; |
174 | 0 | if avail >= n { |
175 | 0 | let bytes = unsafe_slice(buf, idx, n); |
176 | 0 | if !all_cont(unsafe_slice(bytes, 1, n-1)) { |
177 | 0 | return None; |
178 | 0 | } |
179 | 0 | let meaning = otry!(decode(bytes)); |
180 | 0 | Some(Codepoint { |
181 | 0 | bytes: bytes, |
182 | 0 | rewind: 0, |
183 | 0 | meaning: meaning, |
184 | 0 | }) |
185 | | } else { |
186 | 0 | Some(Codepoint { |
187 | 0 | bytes: unsafe_slice(buf, idx, avail), |
188 | 0 | rewind: 0, |
189 | 0 | meaning: Meaning::Prefix(n - avail), |
190 | 0 | }) |
191 | | } |
192 | | }, |
193 | | Byte::Cont => { |
194 | 0 | let mut start = idx; |
195 | 0 | let mut checked = 0; |
196 | | loop { |
197 | 0 | if start == 0 { |
198 | | // Whoops, fell off the beginning. |
199 | 0 | return Some(Codepoint { |
200 | 0 | bytes: unsafe_slice(buf, 0, idx + 1), |
201 | 0 | rewind: idx, |
202 | 0 | meaning: Meaning::Suffix, |
203 | 0 | }); |
204 | 0 | } |
205 | | |
206 | 0 | start -= 1; |
207 | 0 | checked += 1; |
208 | 0 | match otry!(Byte::classify(*buf.get_unchecked(start))) { |
209 | 0 | Byte::Cont => (), |
210 | 0 | Byte::Start(n) => { |
211 | 0 | let avail = buf.len() - start; |
212 | 0 | if avail >= n { |
213 | 0 | let bytes = unsafe_slice(buf, start, n); |
214 | 0 | if checked < n { |
215 | 0 | if !all_cont(unsafe_slice(bytes, checked, n-checked)) { |
216 | 0 | return None; |
217 | 0 | } |
218 | 0 | } |
219 | 0 | let meaning = otry!(decode(bytes)); |
220 | 0 | return Some(Codepoint { |
221 | 0 | bytes: bytes, |
222 | 0 | rewind: idx - start, |
223 | 0 | meaning: meaning, |
224 | 0 | }); |
225 | | } else { |
226 | 0 | return Some(Codepoint { |
227 | 0 | bytes: unsafe_slice(buf, start, avail), |
228 | 0 | rewind: idx - start, |
229 | 0 | meaning: Meaning::Prefix(n - avail), |
230 | 0 | }); |
231 | | } |
232 | | } |
233 | 0 | _ => return None, |
234 | | } |
235 | | |
236 | 0 | if idx - start >= 3 { |
237 | | // We looked at 3 bytes before a continuation byte |
238 | | // and didn't find a start byte. |
239 | 0 | return None; |
240 | 0 | } |
241 | | } |
242 | | } |
243 | | } |
244 | | } |
245 | 0 | } |
246 | | |
247 | | #[cfg(test)] |
248 | | mod test; |