Coverage Report

Created: 2025-10-10 07:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/futf-0.1.5/src/lib.rs
Line
Count
Source
1
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
4
// option. This file may not be copied, modified, or distributed
5
// except according to those terms.
6
7
#![cfg_attr(test, feature(test))]
8
9
#[macro_use]
10
extern crate debug_unreachable;
11
12
#[macro_use]
13
extern crate mac;
14
15
#[cfg(test)]
16
extern crate test as std_test;
17
18
use std::{slice, char};
19
20
/// What a run of bytes means when read as UTF-8: a complete codepoint, a
/// surrogate, or a fragment of a codepoint.
///
/// Validation is lazy: a `Prefix` or `Suffix` is reported from the bytes seen
/// so far and may turn out to have no valid completion at all.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
pub enum Meaning {
    /// A complete, valid codepoint.
    Whole(char),

    /// Not a valid Unicode codepoint, but the value a UTF-16 leading
    /// surrogate code unit would have, i.e. something in the range
    /// `U+D800` - `U+DBFF`.
    ///
    /// The payload is the code unit's 10-bit index within that range.
    ///
    /// Such values occur in UTF-8 variants such as CESU-8 and WTF-8.
    LeadSurrogate(u16),

    /// Not a valid Unicode codepoint, but the value a UTF-16 trailing
    /// surrogate code unit would have, i.e. something in the range
    /// `U+DC00` - `U+DFFF`.
    ///
    /// The payload is the code unit's 10-bit index within that range.
    ///
    /// Such values occur in UTF-8 variants such as CESU-8 and WTF-8.
    TrailSurrogate(u16),

    /// The buffer ended after only the beginning of a codepoint.
    ///
    /// The payload is the number of additional bytes needed.
    Prefix(usize),

    /// Only the tail of a codepoint was found before running off the start
    /// of the buffer.
    ///
    /// Up to 3 more bytes may be needed.
    Suffix,
}
58
59
/// Represents a complete or partial UTF-8 codepoint.
60
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
61
pub struct Codepoint<'a> {
62
    /// The bytes that make up the partial or full codepoint.
63
    ///
64
    /// For a `Suffix` this depends on `idx`. We don't scan forward
65
    /// for additional continuation bytes after the reverse scan
66
    /// failed to locate a multibyte sequence start.
67
    pub bytes: &'a [u8],
68
69
    /// Start of the codepoint in the buffer, expressed as an offset
70
    /// back from `idx`.
71
    pub rewind: usize,
72
73
    /// Meaning of the partial or full codepoint.
74
    pub meaning: Meaning,
75
}
76
77
/// Role of a single byte in UTF-8, judged from its high bits alone.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// `0xxxxxxx`: a one-byte codepoint.
    Ascii,
    /// First byte of a multi-byte sequence; the payload is the total length
    /// of that sequence (2, 3 or 4).
    Start(usize),
    /// `10xxxxxx`: a continuation byte.
    Cont,
}

impl Byte {
    /// Classifies `x` by its leading bit pattern.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which match no UTF-8 pattern.
    /// Only the pattern is inspected: start bytes that can never begin a
    /// well-formed sequence (such as `0xC0`) are still reported as `Start`.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        let role = match x {
            0x00..=0x7F => Byte::Ascii,
            0x80..=0xBF => Byte::Cont,
            0xC0..=0xDF => Byte::Start(2),
            0xE0..=0xEF => Byte::Start(3),
            0xF0..=0xF7 => Byte::Start(4),
            _ => return None,
        };
        Some(role)
    }
}
99
100
#[inline(always)]
101
0
fn all_cont(buf: &[u8]) -> bool {
102
0
    buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont)))
103
0
}
104
105
// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
106
// a starting byte followed by the correct number of continuation bytes.
107
#[inline(always)]
108
0
unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
109
0
    debug_assert!(buf.len() >= 2);
110
0
    debug_assert!(buf.len() <= 4);
111
    let n;
112
0
    match buf.len() {
113
        2 => {
114
0
            n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
115
0
                | ((*buf.get_unchecked(1) & 0x3F) as u32);
116
0
            if n < 0x80 { return None }  // Overlong
117
        }
118
        3 => {
119
0
            n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
120
0
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
121
0
                | ((*buf.get_unchecked(2) & 0x3F) as u32);
122
0
            match n {
123
0
                0x0000 ... 0x07FF => return None,  // Overlong
124
0
                0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
125
0
                0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
126
0
                _ => {}
127
            }
128
        }
129
        4 => {
130
0
            n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
131
0
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
132
0
                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
133
0
                | ((*buf.get_unchecked(3) & 0x3F) as u32);
134
0
            if n < 0x1_0000 { return None }  // Overlong
135
        }
136
0
        _ => debug_unreachable!(),
137
    }
138
139
0
    char::from_u32(n).map(Meaning::Whole)
140
0
}
141
142
/// Returns the `new_len` bytes of `buf` beginning at `start`, without bounds
/// checks in release builds.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Both conditions are only verified by
/// `debug_assert!`.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    // SAFETY: per the contract above, `start..start + new_len` lies inside
    // `buf`, so the pointer stays within (or one past the end of) the same
    // allocation and the resulting slice only covers initialized bytes that
    // live as long as `'a`. `add` takes the `usize` offset directly, so no
    // `as isize` cast is needed.
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
}
148
149
// Shorthand for `unwrap_or_return!(<expr>, None)`.
// NOTE(review): `unwrap_or_return!` is defined outside this file; from its
// use here it presumably yields the value inside a `Some` and otherwise
// returns `None` from the enclosing function -- confirm against its docs.
macro_rules! otry {
    ($opt:expr) => {
        unwrap_or_return!($opt, None)
    };
}
152
153
/// Describes the UTF-8 codepoint containing the byte at index `idx` within
154
/// `buf`.
155
///
156
/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
157
/// in the vicinity of `idx`.
158
#[inline]
159
0
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
160
0
    if idx >= buf.len() {
161
0
        return None;
162
0
    }
163
164
    unsafe {
165
0
        let x = *buf.get_unchecked(idx);
166
0
        match otry!(Byte::classify(x)) {
167
0
            Byte::Ascii => Some(Codepoint {
168
0
                bytes: unsafe_slice(buf, idx, 1),
169
0
                rewind: 0,
170
0
                meaning: Meaning::Whole(x as char),
171
0
            }),
172
0
            Byte::Start(n) => {
173
0
                let avail = buf.len() - idx;
174
0
                if avail >= n {
175
0
                    let bytes = unsafe_slice(buf, idx, n);
176
0
                    if !all_cont(unsafe_slice(bytes, 1, n-1)) {
177
0
                        return None;
178
0
                    }
179
0
                    let meaning = otry!(decode(bytes));
180
0
                    Some(Codepoint {
181
0
                        bytes: bytes,
182
0
                        rewind: 0,
183
0
                        meaning: meaning,
184
0
                    })
185
                } else {
186
0
                    Some(Codepoint {
187
0
                        bytes: unsafe_slice(buf, idx, avail),
188
0
                        rewind: 0,
189
0
                        meaning: Meaning::Prefix(n - avail),
190
0
                    })
191
                }
192
            },
193
            Byte::Cont => {
194
0
                let mut start = idx;
195
0
                let mut checked = 0;
196
                loop {
197
0
                    if start == 0 {
198
                        // Whoops, fell off the beginning.
199
0
                        return Some(Codepoint {
200
0
                            bytes: unsafe_slice(buf, 0, idx + 1),
201
0
                            rewind: idx,
202
0
                            meaning: Meaning::Suffix,
203
0
                        });
204
0
                    }
205
206
0
                    start -= 1;
207
0
                    checked += 1;
208
0
                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
209
0
                        Byte::Cont => (),
210
0
                        Byte::Start(n) => {
211
0
                            let avail = buf.len() - start;
212
0
                            if avail >= n {
213
0
                                let bytes = unsafe_slice(buf, start, n);
214
0
                                if checked < n {
215
0
                                    if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
216
0
                                        return None;
217
0
                                    }
218
0
                                }
219
0
                                let meaning = otry!(decode(bytes));
220
0
                                return Some(Codepoint {
221
0
                                    bytes: bytes,
222
0
                                    rewind: idx - start,
223
0
                                    meaning: meaning,
224
0
                                });
225
                            } else {
226
0
                                return Some(Codepoint {
227
0
                                    bytes: unsafe_slice(buf, start, avail),
228
0
                                    rewind: idx - start,
229
0
                                    meaning: Meaning::Prefix(n - avail),
230
0
                                });
231
                            }
232
                        }
233
0
                        _ => return None,
234
                    }
235
236
0
                    if idx - start >= 3 {
237
                        // We looked at 3 bytes before a continuation byte
238
                        // and didn't find a start byte.
239
0
                        return None;
240
0
                    }
241
                }
242
            }
243
        }
244
    }
245
0
}
246
247
#[cfg(test)]
248
mod test;