Coverage Report

Created: 2026-01-13 06:57

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/html5ever/tendril/src/futf.rs
Line
Count
Source
1
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4
// option. This file may not be copied, modified, or distributed
5
// except according to those terms.
6
7
use debug_unreachable::debug_unreachable;
8
use std::{char, slice};
9
10
/// Meaning of a complete or partial UTF-8 codepoint.
11
///
12
/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or
13
/// `Suffix` may in reality have no valid completion.
14
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
15
pub enum Meaning {
16
    /// We found a whole codepoint.
17
    Whole(char),
18
19
    /// We found something that isn't a valid Unicode codepoint, but
20
    /// it *would* correspond to a UTF-16 leading surrogate code unit,
21
    /// i.e. a value in the range `U+D800` - `U+DBFF`.
22
    ///
23
    /// The argument is the code unit's 10-bit index within that range.
24
    ///
25
    /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
26
    LeadSurrogate(u16),
27
28
    /// We found something that isn't a valid Unicode codepoint, but
29
    /// it *would* correspond to a UTF-16 trailing surrogate code unit,
30
    /// i.e. a value in the range `U+DC00` - `U+DFFF`.
31
    ///
32
    /// The argument is the code unit's 10-bit index within that range.
33
    ///
34
    /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
35
    TrailSurrogate(u16),
36
37
    /// We found only a prefix of a codepoint before the buffer ended.
38
    ///
39
    /// Includes the number of additional bytes needed.
40
    Prefix(usize),
41
42
    /// We found only a suffix of a codepoint before running off the
43
    /// start of the buffer.
44
    ///
45
    /// Up to 3 more bytes may be needed.
46
    Suffix,
47
}
48
49
/// Represents a complete or partial UTF-8 codepoint.
50
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
51
pub struct Codepoint<'a> {
52
    /// The bytes that make up the partial or full codepoint.
53
    ///
54
    /// For a `Suffix` this depends on `idx`. We don't scan forward
55
    /// for additional continuation bytes after the reverse scan
56
    /// failed to locate a multibyte sequence start.
57
    pub bytes: &'a [u8],
58
59
    /// Start of the codepoint in the buffer, expressed as an offset
60
    /// back from `idx`.
61
    pub rewind: usize,
62
63
    /// Meaning of the partial or full codepoint.
64
    pub meaning: Meaning,
65
}
66
67
#[derive(Debug, PartialEq, Eq)]
68
enum Byte {
69
    Ascii,
70
    Start(usize),
71
    Cont,
72
}
73
74
impl Byte {
75
    #[inline(always)]
76
12.3M
    fn classify(x: u8) -> Option<Byte> {
77
12.3M
        match x & 0xC0 {
78
2.12M
            0xC0 => match x {
79
2.12M
                x if x & 0b11111_000 == 0b11110_000 => Some(Byte::Start(4)),
80
2.11M
                x if x & 0b1111_0000 == 0b1110_0000 => Some(Byte::Start(3)),
81
23.5k
                x if x & 0b111_00000 == 0b110_00000 => Some(Byte::Start(2)),
82
0
                _ => None,
83
            },
84
5.29M
            0x80 => Some(Byte::Cont),
85
4.97M
            _ => Some(Byte::Ascii),
86
        }
87
12.3M
    }
88
}
89
90
#[inline(always)]
91
2.12M
fn all_cont(buf: &[u8]) -> bool {
92
2.12M
    buf.iter()
93
3.15M
        .all(|&b| matches!(Byte::classify(b), Some(Byte::Cont)))
tendril::futf::all_cont::{closure#0}
Line
Count
Source
93
3.15M
        .all(|&b| matches!(Byte::classify(b), Some(Byte::Cont)))
Unexecuted instantiation: tendril::futf::all_cont::{closure#0}
94
2.12M
}
95
96
// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
97
// a starting byte followed by the correct number of continuation bytes.
98
#[inline(always)]
99
2.12M
unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
100
2.12M
    debug_assert!(buf.len() >= 2);
101
2.12M
    debug_assert!(buf.len() <= 4);
102
    let n;
103
2.12M
    match buf.len() {
104
        2 => {
105
23.5k
            n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
106
23.5k
                | ((*buf.get_unchecked(1) & 0x3F) as u32);
107
23.5k
            if n < 0x80 {
108
0
                return None;
109
23.5k
            } // Overlong
110
        },
111
        3 => {
112
2.09M
            n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
113
2.09M
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
114
2.09M
                | ((*buf.get_unchecked(2) & 0x3F) as u32);
115
2.09M
            match n {
116
2.09M
                0x0000..=0x07FF => return None, // Overlong
117
2.09M
                0xD800..=0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
118
2.09M
                0xDC00..=0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
119
2.09M
                _ => {},
120
            }
121
        },
122
        4 => {
123
4.43k
            n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
124
4.43k
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
125
4.43k
                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
126
4.43k
                | ((*buf.get_unchecked(3) & 0x3F) as u32);
127
4.43k
            if n < 0x1_0000 {
128
0
                return None;
129
4.43k
            } // Overlong
130
        },
131
0
        _ => debug_unreachable!(),
132
    }
133
134
2.12M
    char::from_u32(n).map(Meaning::Whole)
135
2.12M
}
136
137
#[inline(always)]
138
9.21M
unsafe fn unsafe_slice(buf: &[u8], start: usize, new_len: usize) -> &[u8] {
139
9.21M
    debug_assert!(start <= buf.len());
140
9.21M
    debug_assert!(new_len <= (buf.len() - start));
141
9.21M
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
142
9.21M
}
143
144
/// Describes the UTF-8 codepoint containing the byte at index `idx` within
145
/// `buf`.
146
///
147
/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
148
/// in the vicinity of `idx`.
149
#[inline]
150
7.09M
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
151
7.09M
    if idx >= buf.len() {
152
0
        return None;
153
7.09M
    }
154
155
    unsafe {
156
7.09M
        let x = *buf.get_unchecked(idx);
157
7.09M
        match Byte::classify(x)? {
158
4.97M
            Byte::Ascii => Some(Codepoint {
159
4.97M
                bytes: unsafe_slice(buf, idx, 1),
160
4.97M
                rewind: 0,
161
4.97M
                meaning: Meaning::Whole(x as char),
162
4.97M
            }),
163
1.04M
            Byte::Start(n) => {
164
1.04M
                let avail = buf.len() - idx;
165
1.04M
                if avail >= n {
166
1.04M
                    let bytes = unsafe_slice(buf, idx, n);
167
1.04M
                    if !all_cont(unsafe_slice(bytes, 1, n - 1)) {
168
0
                        return None;
169
1.04M
                    }
170
1.04M
                    let meaning = decode(bytes)?;
171
1.04M
                    Some(Codepoint {
172
1.04M
                        bytes,
173
1.04M
                        rewind: 0,
174
1.04M
                        meaning,
175
1.04M
                    })
176
                } else {
177
0
                    Some(Codepoint {
178
0
                        bytes: unsafe_slice(buf, idx, avail),
179
0
                        rewind: 0,
180
0
                        meaning: Meaning::Prefix(n - avail),
181
0
                    })
182
                }
183
            },
184
            Byte::Cont => {
185
1.07M
                let mut start = idx;
186
1.07M
                let mut checked = 0;
187
                loop {
188
2.14M
                    if start == 0 {
189
                        // Whoops, fell off the beginning.
190
0
                        return Some(Codepoint {
191
0
                            bytes: unsafe_slice(buf, 0, idx + 1),
192
0
                            rewind: idx,
193
0
                            meaning: Meaning::Suffix,
194
0
                        });
195
2.14M
                    }
196
197
2.14M
                    start -= 1;
198
2.14M
                    checked += 1;
199
2.14M
                    match Byte::classify(*buf.get_unchecked(start))? {
200
1.06M
                        Byte::Cont => (),
201
1.07M
                        Byte::Start(n) => {
202
1.07M
                            let avail = buf.len() - start;
203
1.07M
                            if avail >= n {
204
1.07M
                                let bytes = unsafe_slice(buf, start, n);
205
1.07M
                                if checked < n {
206
1.07M
                                    if !all_cont(unsafe_slice(bytes, checked, n - checked)) {
207
0
                                        return None;
208
1.07M
                                    }
209
0
                                }
210
1.07M
                                let meaning = decode(bytes)?;
211
1.07M
                                return Some(Codepoint {
212
1.07M
                                    bytes,
213
1.07M
                                    rewind: idx - start,
214
1.07M
                                    meaning,
215
1.07M
                                });
216
                            } else {
217
0
                                return Some(Codepoint {
218
0
                                    bytes: unsafe_slice(buf, start, avail),
219
0
                                    rewind: idx - start,
220
0
                                    meaning: Meaning::Prefix(n - avail),
221
0
                                });
222
                            }
223
                        },
224
0
                        _ => return None,
225
                    }
226
227
1.06M
                    if idx - start >= 3 {
228
                        // We looked at 3 bytes before a continuation byte
229
                        // and didn't find a start byte.
230
0
                        return None;
231
1.06M
                    }
232
                }
233
            },
234
        }
235
    }
236
7.09M
}
tendril::futf::classify
Line
Count
Source
150
314k
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
151
314k
    if idx >= buf.len() {
152
0
        return None;
153
314k
    }
154
155
    unsafe {
156
314k
        let x = *buf.get_unchecked(idx);
157
314k
        match Byte::classify(x)? {
158
310k
            Byte::Ascii => Some(Codepoint {
159
310k
                bytes: unsafe_slice(buf, idx, 1),
160
310k
                rewind: 0,
161
310k
                meaning: Meaning::Whole(x as char),
162
310k
            }),
163
3.61k
            Byte::Start(n) => {
164
3.61k
                let avail = buf.len() - idx;
165
3.61k
                if avail >= n {
166
3.61k
                    let bytes = unsafe_slice(buf, idx, n);
167
3.61k
                    if !all_cont(unsafe_slice(bytes, 1, n - 1)) {
168
0
                        return None;
169
3.61k
                    }
170
3.61k
                    let meaning = decode(bytes)?;
171
3.61k
                    Some(Codepoint {
172
3.61k
                        bytes,
173
3.61k
                        rewind: 0,
174
3.61k
                        meaning,
175
3.61k
                    })
176
                } else {
177
0
                    Some(Codepoint {
178
0
                        bytes: unsafe_slice(buf, idx, avail),
179
0
                        rewind: 0,
180
0
                        meaning: Meaning::Prefix(n - avail),
181
0
                    })
182
                }
183
            },
184
            Byte::Cont => {
185
0
                let mut start = idx;
186
0
                let mut checked = 0;
187
                loop {
188
0
                    if start == 0 {
189
                        // Whoops, fell off the beginning.
190
0
                        return Some(Codepoint {
191
0
                            bytes: unsafe_slice(buf, 0, idx + 1),
192
0
                            rewind: idx,
193
0
                            meaning: Meaning::Suffix,
194
0
                        });
195
0
                    }
196
197
0
                    start -= 1;
198
0
                    checked += 1;
199
0
                    match Byte::classify(*buf.get_unchecked(start))? {
200
0
                        Byte::Cont => (),
201
0
                        Byte::Start(n) => {
202
0
                            let avail = buf.len() - start;
203
0
                            if avail >= n {
204
0
                                let bytes = unsafe_slice(buf, start, n);
205
0
                                if checked < n {
206
0
                                    if !all_cont(unsafe_slice(bytes, checked, n - checked)) {
207
0
                                        return None;
208
0
                                    }
209
0
                                }
210
0
                                let meaning = decode(bytes)?;
211
0
                                return Some(Codepoint {
212
0
                                    bytes,
213
0
                                    rewind: idx - start,
214
0
                                    meaning,
215
0
                                });
216
                            } else {
217
0
                                return Some(Codepoint {
218
0
                                    bytes: unsafe_slice(buf, start, avail),
219
0
                                    rewind: idx - start,
220
0
                                    meaning: Meaning::Prefix(n - avail),
221
0
                                });
222
                            }
223
                        },
224
0
                        _ => return None,
225
                    }
226
227
0
                    if idx - start >= 3 {
228
                        // We looked at 3 bytes before a continuation byte
229
                        // and didn't find a start byte.
230
0
                        return None;
231
0
                    }
232
                }
233
            },
234
        }
235
    }
236
314k
}
tendril::futf::classify
Line
Count
Source
150
6.78M
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
151
6.78M
    if idx >= buf.len() {
152
0
        return None;
153
6.78M
    }
154
155
    unsafe {
156
6.78M
        let x = *buf.get_unchecked(idx);
157
6.78M
        match Byte::classify(x)? {
158
4.66M
            Byte::Ascii => Some(Codepoint {
159
4.66M
                bytes: unsafe_slice(buf, idx, 1),
160
4.66M
                rewind: 0,
161
4.66M
                meaning: Meaning::Whole(x as char),
162
4.66M
            }),
163
1.03M
            Byte::Start(n) => {
164
1.03M
                let avail = buf.len() - idx;
165
1.03M
                if avail >= n {
166
1.03M
                    let bytes = unsafe_slice(buf, idx, n);
167
1.03M
                    if !all_cont(unsafe_slice(bytes, 1, n - 1)) {
168
0
                        return None;
169
1.03M
                    }
170
1.03M
                    let meaning = decode(bytes)?;
171
1.03M
                    Some(Codepoint {
172
1.03M
                        bytes,
173
1.03M
                        rewind: 0,
174
1.03M
                        meaning,
175
1.03M
                    })
176
                } else {
177
0
                    Some(Codepoint {
178
0
                        bytes: unsafe_slice(buf, idx, avail),
179
0
                        rewind: 0,
180
0
                        meaning: Meaning::Prefix(n - avail),
181
0
                    })
182
                }
183
            },
184
            Byte::Cont => {
185
1.07M
                let mut start = idx;
186
1.07M
                let mut checked = 0;
187
                loop {
188
2.14M
                    if start == 0 {
189
                        // Whoops, fell off the beginning.
190
0
                        return Some(Codepoint {
191
0
                            bytes: unsafe_slice(buf, 0, idx + 1),
192
0
                            rewind: idx,
193
0
                            meaning: Meaning::Suffix,
194
0
                        });
195
2.14M
                    }
196
197
2.14M
                    start -= 1;
198
2.14M
                    checked += 1;
199
2.14M
                    match Byte::classify(*buf.get_unchecked(start))? {
200
1.06M
                        Byte::Cont => (),
201
1.07M
                        Byte::Start(n) => {
202
1.07M
                            let avail = buf.len() - start;
203
1.07M
                            if avail >= n {
204
1.07M
                                let bytes = unsafe_slice(buf, start, n);
205
1.07M
                                if checked < n {
206
1.07M
                                    if !all_cont(unsafe_slice(bytes, checked, n - checked)) {
207
0
                                        return None;
208
1.07M
                                    }
209
0
                                }
210
1.07M
                                let meaning = decode(bytes)?;
211
1.07M
                                return Some(Codepoint {
212
1.07M
                                    bytes,
213
1.07M
                                    rewind: idx - start,
214
1.07M
                                    meaning,
215
1.07M
                                });
216
                            } else {
217
0
                                return Some(Codepoint {
218
0
                                    bytes: unsafe_slice(buf, start, avail),
219
0
                                    rewind: idx - start,
220
0
                                    meaning: Meaning::Prefix(n - avail),
221
0
                                });
222
                            }
223
                        },
224
0
                        _ => return None,
225
                    }
226
227
1.06M
                    if idx - start >= 3 {
228
                        // We looked at 3 bytes before a continuation byte
229
                        // and didn't find a start byte.
230
0
                        return None;
231
1.06M
                    }
232
                }
233
            },
234
        }
235
    }
236
6.78M
}
Unexecuted instantiation: tendril::futf::classify
237
238
#[cfg(test)]
239
mod tests {
240
    use super::{all_cont, classify, decode, Byte, Meaning};
241
    use std::borrow::ToOwned;
242
    use std::io::Write;
243
244
    #[test]
245
    fn classify_all_bytes() {
246
        for n in 0x00..0x80 {
247
            assert_eq!(Byte::classify(n), Some(Byte::Ascii));
248
        }
249
        for n in 0x80..0xC0 {
250
            assert_eq!(Byte::classify(n), Some(Byte::Cont));
251
        }
252
        for n in 0xC0..0xE0 {
253
            assert_eq!(Byte::classify(n), Some(Byte::Start(2)));
254
        }
255
        for n in 0xE0..0xF0 {
256
            assert_eq!(Byte::classify(n), Some(Byte::Start(3)));
257
        }
258
        for n in 0xF0..0xF8 {
259
            assert_eq!(Byte::classify(n), Some(Byte::Start(4)));
260
        }
261
        for n in 0xF8..0xFF {
262
            assert_eq!(Byte::classify(n), None);
263
        }
264
        assert_eq!(Byte::classify(0xFF), None);
265
    }
266
267
    #[test]
268
    fn test_all_cont() {
269
        assert!(all_cont(b""));
270
        assert!(all_cont(b"\x80"));
271
        assert!(all_cont(b"\xBF"));
272
        assert!(all_cont(b"\x80\xBF\x80\xBF"));
273
274
        assert!(!all_cont(b"z"));
275
        assert!(!all_cont(b"\xC0\xBF"));
276
        assert!(!all_cont(b"\xFF"));
277
        assert!(!all_cont(b"\x80\xBFz\x80\xBF"));
278
        assert!(!all_cont(b"\x80\xBF\xC0\x80\xBF"));
279
        assert!(!all_cont(b"\x80\xBF\xFF\x80\xBF"));
280
        assert!(!all_cont(b"\x80\xBF\x80\xBFz"));
281
        assert!(!all_cont(b"\x80\xBF\x80\xBF\xC0"));
282
        assert!(!all_cont(b"z\x80\xBF\x80\xBF"));
283
        assert!(!all_cont(b"\xC0\x80\xBF\x80\xBF"));
284
    }
285
286
    #[test]
287
    fn test_decode() {
288
        unsafe {
289
            assert_eq!(Some(Meaning::Whole('ő')), decode(b"\xC5\x91"));
290
            assert_eq!(Some(Meaning::Whole('\u{a66e}')), decode(b"\xEA\x99\xAE"));
291
            assert_eq!(
292
                Some(Meaning::Whole('\u{1f4a9}')),
293
                decode(b"\xF0\x9F\x92\xA9")
294
            );
295
            assert_eq!(
296
                Some(Meaning::Whole('\u{10ffff}')),
297
                decode(b"\xF4\x8F\xBF\xBF")
298
            );
299
300
            assert_eq!(
301
                Some(Meaning::LeadSurrogate(0x0000)),
302
                decode(b"\xED\xA0\x80")
303
            );
304
            assert_eq!(
305
                Some(Meaning::LeadSurrogate(0x0001)),
306
                decode(b"\xED\xA0\x81")
307
            );
308
            assert_eq!(
309
                Some(Meaning::LeadSurrogate(0x03FE)),
310
                decode(b"\xED\xAF\xBE")
311
            );
312
            assert_eq!(
313
                Some(Meaning::LeadSurrogate(0x03FF)),
314
                decode(b"\xED\xAF\xBF")
315
            );
316
317
            assert_eq!(
318
                Some(Meaning::TrailSurrogate(0x0000)),
319
                decode(b"\xED\xB0\x80")
320
            );
321
            assert_eq!(
322
                Some(Meaning::TrailSurrogate(0x0001)),
323
                decode(b"\xED\xB0\x81")
324
            );
325
            assert_eq!(
326
                Some(Meaning::TrailSurrogate(0x03FE)),
327
                decode(b"\xED\xBF\xBE")
328
            );
329
            assert_eq!(
330
                Some(Meaning::TrailSurrogate(0x03FF)),
331
                decode(b"\xED\xBF\xBF")
332
            );
333
334
            // The last 4-byte UTF-8 sequence. This would be U+1FFFFF, which is out of
335
            // range.
336
            assert_eq!(None, decode(b"\xF7\xBF\xBF\xBF"));
337
338
            // First otherwise-valid sequence (would be U+110000) that is out of range
339
            assert_eq!(None, decode(b"\xF4\x90\x80\x80"));
340
341
            // Overlong sequences
342
            assert_eq!(None, decode(b"\xC0\x80"));
343
            assert_eq!(None, decode(b"\xC1\xBF"));
344
            assert_eq!(None, decode(b"\xE0\x80\x80"));
345
            assert_eq!(None, decode(b"\xE0\x9F\xBF"));
346
            assert_eq!(None, decode(b"\xF0\x80\x80\x80"));
347
            assert_eq!(None, decode(b"\xF0\x8F\xBF\xBF"));
348
349
            // For not-overlong sequence for each sequence length
350
            assert_eq!(Some(Meaning::Whole('\u{80}')), decode(b"\xC2\x80"));
351
            assert_eq!(Some(Meaning::Whole('\u{800}')), decode(b"\xE0\xA0\x80"));
352
            assert_eq!(
353
                Some(Meaning::Whole('\u{10000}')),
354
                decode(b"\xF0\x90\x80\x80")
355
            );
356
        }
357
    }
358
359
    static JUNK: &[u8] = b"\
360
        \xf8\x0d\x07\x25\xa6\x7b\x95\xeb\x47\x01\x7f\xee\
361
        \x3b\x00\x60\x57\x1d\x9e\x5d\x0a\x0b\x0a\x7c\x75\
362
        \x13\xa1\x82\x46\x27\x34\xe9\x52\x61\x0d\xec\x10\
363
        \x54\x49\x6e\x54\xdf\x7b\xe1\x31\x8c\x06\x21\x83\
364
        \x0f\xb5\x1f\x4c\x6a\x71\x52\x42\x74\xe7\x7b\x50\
365
        \x59\x1f\x6a\xd4\xff\x06\x92\x33\xc4\x34\x97\xff\
366
        \xcc\xb5\xc4\x00\x7b\xc3\x4a\x7f\x7e\x63\x96\x58\
367
        \x51\x63\x21\x54\x53\x2f\x03\x8a\x7d\x41\x79\x98\
368
        \x5b\xcb\xb8\x94\x6b\x73\xf3\x0c\x5a\xd7\xc4\x12\
369
        \x7a\x2b\x9a\x2e\x67\x62\x2a\x00\x45\x2c\xfe\x7d\
370
        \x8d\xd6\x51\x4e\x59\x36\x72\x1b\xae\xaa\x06\xe8\
371
        \x71\x1b\x85\xd3\x35\xb5\xbe\x9e\x16\x96\x72\xd8\
372
        \x1a\x48\xba\x4d\x55\x4f\x1b\xa2\x77\xfa\x8f\x71\
373
        \x58\x7d\x03\x93\xa2\x3a\x76\x51\xda\x48\xe2\x3f\
374
        \xeb\x8d\xda\x89\xae\xf7\xbd\x3d\xb6\x37\x97\xca\
375
        \x99\xcc\x4a\x8d\x62\x89\x97\xe3\xc0\xd1\x8d\xc1\
376
        \x26\x11\xbb\x8d\x53\x61\x4f\x76\x03\x00\x30\xd3\
377
        \x5f\x86\x19\x52\x9c\x3e\x99\x8c\xb7\x21\x48\x1c\
378
        \x85\xae\xad\xd5\x74\x00\x6c\x3e\xd0\x17\xff\x76\
379
        \x5c\x32\xc3\xfb\x24\x99\xd4\x4c\xa4\x1f\x66\x46\
380
        \xe7\x2d\x44\x56\x7d\x14\xd9\x76\x91\x37\x2f\xb7\
381
        \xcc\x1b\xd3\xc2";
382
383
    #[test]
384
    fn classify_whole() {
385
        assert_eq!(JUNK.len(), 256);
386
387
        for &c in &[
388
            '\0',
389
            '\x01',
390
            'o',
391
            'z',
392
            'ő',
393
            '\u{2764}',
394
            '\u{a66e}',
395
            '\u{1f4a9}',
396
            '\u{1f685}',
397
        ] {
398
            for idx in 0..JUNK.len() - 3 {
399
                let mut buf = JUNK.to_owned();
400
                let ch = format!("{}", c).into_bytes();
401
                (&mut buf[idx..]).write_all(&ch).unwrap();
402
403
                for j in 0..ch.len() {
404
                    let class = classify(&buf, idx + j).unwrap();
405
                    assert_eq!(class.bytes, &*ch);
406
                    assert_eq!(class.rewind, j);
407
                    assert_eq!(class.meaning, Meaning::Whole(c));
408
                }
409
            }
410
        }
411
    }
412
413
    #[test]
414
    fn classify_surrogates() {
415
        for &(s, b) in &[
416
            (Meaning::LeadSurrogate(0x0000), b"\xED\xA0\x80"),
417
            (Meaning::LeadSurrogate(0x0001), b"\xED\xA0\x81"),
418
            (Meaning::LeadSurrogate(0x03FE), b"\xED\xAF\xBE"),
419
            (Meaning::LeadSurrogate(0x03FF), b"\xED\xAF\xBF"),
420
            (Meaning::TrailSurrogate(0x0000), b"\xED\xB0\x80"),
421
            (Meaning::TrailSurrogate(0x0001), b"\xED\xB0\x81"),
422
            (Meaning::TrailSurrogate(0x03FE), b"\xED\xBF\xBE"),
423
            (Meaning::TrailSurrogate(0x03FF), b"\xED\xBF\xBF"),
424
        ] {
425
            for idx in 0..JUNK.len() - 2 {
426
                let mut buf = JUNK.to_owned();
427
                (&mut buf[idx..]).write_all(b).unwrap();
428
429
                let class = classify(&buf, idx).unwrap();
430
                assert_eq!(class.bytes, b);
431
                assert_eq!(class.rewind, 0);
432
                assert_eq!(class.meaning, s);
433
            }
434
        }
435
    }
436
437
    #[test]
438
    fn classify_prefix_suffix() {
439
        for &c in &['ő', '\u{a66e}', '\u{1f4a9}'] {
440
            let ch = format!("{}", c).into_bytes();
441
            for pfx in 1..ch.len() - 1 {
442
                let mut buf = JUNK.to_owned();
443
                let buflen = buf.len();
444
                (&mut buf[buflen - pfx..buflen])
445
                    .write_all(&ch[..pfx])
446
                    .unwrap();
447
                for j in 0..pfx {
448
                    let idx = buflen - 1 - j;
449
                    let class = classify(&buf, idx).unwrap();
450
                    assert_eq!(class.bytes, &ch[..pfx]);
451
                    assert_eq!(class.rewind, pfx - 1 - j);
452
                    assert_eq!(class.meaning, Meaning::Prefix(ch.len() - pfx));
453
                }
454
            }
455
            for sfx in 1..ch.len() - 1 {
456
                let ch_bytes = &ch[ch.len() - sfx..];
457
                let mut buf = JUNK.to_owned();
458
                (&mut *buf).write_all(ch_bytes).unwrap();
459
                for j in 0..sfx {
460
                    let class = classify(&buf, j).unwrap();
461
                    assert!(ch_bytes.starts_with(class.bytes));
462
                    assert_eq!(class.rewind, j);
463
                    assert_eq!(class.meaning, Meaning::Suffix);
464
                }
465
            }
466
        }
467
    }
468
469
    #[test]
470
    fn out_of_bounds() {
471
        assert!(classify(b"", 0).is_none());
472
        assert!(classify(b"", 7).is_none());
473
        assert!(classify(b"aaaaaaa", 7).is_none());
474
    }
475
476
    #[test]
477
    fn malformed() {
478
        assert_eq!(None, classify(b"\xFF", 0));
479
        assert_eq!(None, classify(b"\xC5\xC5", 0));
480
        assert_eq!(None, classify(b"x\x91", 1));
481
        assert_eq!(None, classify(b"\x91\x91\x91\x91", 3));
482
        assert_eq!(None, classify(b"\x91\x91\x91\x91\x91", 4));
483
        assert_eq!(None, classify(b"\xEA\x91\xFF", 1));
484
        assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 0));
485
        assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 1));
486
        assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 2));
487
488
        for i in 0..4 {
489
            // out of range: U+110000
490
            assert_eq!(None, classify(b"\xF4\x90\x80\x80", i));
491
492
            // out of range: U+1FFFFF
493
            assert_eq!(None, classify(b"\xF7\xBF\xBF\xBF", i));
494
495
            // Overlong sequences
496
            assert_eq!(None, classify(b"\xC0\x80", i));
497
            assert_eq!(None, classify(b"\xC1\xBF", i));
498
            assert_eq!(None, classify(b"\xE0\x80\x80", i));
499
            assert_eq!(None, classify(b"\xE0\x9F\xBF", i));
500
            assert_eq!(None, classify(b"\xF0\x80\x80\x80", i));
501
            assert_eq!(None, classify(b"\xF0\x8F\xBF\xBF", i));
502
        }
503
    }
504
}