/rust/registry/src/index.crates.io-1949cf8c6b5b557f/futf-0.1.5/src/lib.rs

Source
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

#![cfg_attr(test, feature(test))]

#[macro_use]
extern crate debug_unreachable;

#[macro_use]
extern crate mac;

#[cfg(test)]
extern crate test as std_test;

use std::{slice, char};

/// What a run of bytes, read as (part of) a UTF-8 sequence, stands for.
///
/// Validation is partly lazy: a `Prefix` or `Suffix` is reported without
/// proof that any completion of it would actually be valid.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Meaning {
    /// A complete, valid Unicode scalar value.
    Whole(char),

    /// Not a valid Unicode codepoint, but the value that *would* be a
    /// UTF-16 leading (high) surrogate code unit, i.e. something in
    /// `U+D800` - `U+DBFF`.
    ///
    /// The payload is the 10-bit offset of the code unit inside that
    /// range.
    ///
    /// UTF-8 variants such as CESU-8 and WTF-8 contain these.
    LeadSurrogate(u16),

    /// Not a valid Unicode codepoint, but the value that *would* be a
    /// UTF-16 trailing (low) surrogate code unit, i.e. something in
    /// `U+DC00` - `U+DFFF`.
    ///
    /// The payload is the 10-bit offset of the code unit inside that
    /// range.
    ///
    /// UTF-8 variants such as CESU-8 and WTF-8 contain these.
    TrailSurrogate(u16),

    /// The buffer ended part-way through a codepoint, so only its
    /// leading bytes are present.
    ///
    /// The payload is how many more bytes are required to finish it.
    Prefix(usize),

    /// Only trailing bytes of a codepoint are present: scanning
    /// backwards ran off the start of the buffer before a start byte
    /// was found.
    ///
    /// As many as 3 further bytes may be missing.
    Suffix,
}

/// A whole or partial UTF-8 codepoint located inside a byte buffer.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Codepoint<'a> {
    /// The slice of the buffer holding the full or partial codepoint.
    ///
    /// When the meaning is `Suffix`, the extent of this slice is
    /// determined by `idx`: once the backwards scan has failed to find
    /// the start of a multibyte sequence, no forward scan is made for
    /// further continuation bytes.
    pub bytes: &'a [u8],

    /// How far back from `idx` the codepoint begins in the buffer.
    pub rewind: usize,

    /// What the full or partial codepoint stands for.
    pub meaning: Meaning,
}

/// Role a single byte plays inside a UTF-8 stream.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// High bit clear: a one-byte (ASCII) codepoint.
    Ascii,
    /// First byte of a multi-byte sequence; the payload is the total
    /// length of that sequence in bytes (2, 3 or 4).
    Start(usize),
    /// Continuation byte (`10xxxxxx`).
    Cont,
}

impl Byte {
    /// Classifies `x` by its leading bit pattern.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which match none of the
    /// recognised start-byte patterns.
    #[inline(always)]
    fn classify(x: u8) -> Option<Byte> {
        let class = match x {
            // 0xxxxxxx
            0x00..=0x7F => Byte::Ascii,
            // 10xxxxxx
            0x80..=0xBF => Byte::Cont,
            // 110xxxxx
            0xC0..=0xDF => Byte::Start(2),
            // 1110xxxx
            0xE0..=0xEF => Byte::Start(3),
            // 11110xxx
            0xF0..=0xF7 => Byte::Start(4),
            // 11111xxx
            _ => return None,
        };
        Some(class)
    }
}

/// Reports whether every byte of `buf` is a UTF-8 continuation byte.
///
/// An empty slice yields `true`.
#[inline(always)]
fn all_cont(buf: &[u8]) -> bool {
    for &byte in buf {
        if Byte::classify(byte) != Some(Byte::Cont) {
            // First non-continuation byte settles the answer.
            return false;
        }
    }
    true
}

/// Decodes one multi-byte UTF-8 sequence into its `Meaning`.
///
/// Returns `None` when the sequence is an overlong encoding or when the
/// decoded value is rejected by `char::from_u32`. A three-byte sequence
/// whose value lies in the UTF-16 surrogate range is not rejected; it is
/// reported as `LeadSurrogate` / `TrailSurrogate` carrying the offset of
/// the value within its surrogate range.
///
/// # Safety
///
/// `buf` must be exactly 2, 3 or 4 bytes long. The length is verified only
/// by `debug_assert!`; the bytes are read with `get_unchecked`, and any
/// other length reaches `debug_unreachable!()`.
///
/// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8
/// sequence: a starting byte followed by the correct number of
/// continuation bytes. The marker bits are masked off without being
/// inspected.
#[inline(always)]
unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
    debug_assert!(buf.len() >= 2);
    debug_assert!(buf.len() <= 4);
    let n;
    match buf.len() {
        2 => {
            // 110xxxxx 10xxxxxx
            n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
                | ((*buf.get_unchecked(1) & 0x3F) as u32);
            if n < 0x80 { return None }  // Overlong
        }
        3 => {
            // 1110xxxx 10xxxxxx 10xxxxxx
            n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
                | ((*buf.get_unchecked(2) & 0x3F) as u32);
            match n {
                0x0000..=0x07FF => return None,  // Overlong
                // Surrogates fit in 16 bits, so the narrowing cast is lossless.
                0xD800..=0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
                0xDC00..=0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
                _ => {}
            }
        }
        4 => {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
                | ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
                | ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
                | ((*buf.get_unchecked(3) & 0x3F) as u32);
            if n < 0x1_0000 { return None }  // Overlong
        }
        _ => debug_unreachable!(),
    }

    // `from_u32` rejects anything that is not a Unicode scalar value,
    // which covers four-byte values above U+10FFFF.
    char::from_u32(n).map(Meaning::Whole)
}

/// Returns the `new_len`-byte sub-slice of `buf` that begins at offset
/// `start`, without bounds checks in release builds.
///
/// # Safety
///
/// The caller must guarantee `start <= buf.len()` and
/// `new_len <= buf.len() - start`. Both conditions are verified only by
/// `debug_assert!`; if they do not hold, the returned slice refers to
/// memory outside `buf`.
#[inline(always)]
unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
    debug_assert!(start <= buf.len());
    debug_assert!(new_len <= (buf.len() - start));
    // `add` takes the element offset as a `usize`, so no `as isize` cast
    // is needed.
    slice::from_raw_parts(buf.as_ptr().add(start), new_len)
}

// `Option` counterpart of `try!`: forwards `$x` to `unwrap_or_return!` with
// `None` as the value to return.
//
// At its call sites in this file the argument is an `Option` and the macro's
// result is used as the unwrapped inner value, inside functions that
// themselves return `Option`.
//
// NOTE(review): `unwrap_or_return!` is not defined in this file (presumably
// it comes from the `mac` crate imported with `#[macro_use]`) — confirm its
// exact expansion there.
macro_rules! otry {
    ($x:expr) => { unwrap_or_return!($x, None) }
}

/// Describes the UTF-8 codepoint containing the byte at index `idx` within
/// `buf`.
///
/// On success the returned `Codepoint` holds:
///
/// * `bytes`: the slice of `buf` covering the (possibly partial) codepoint,
/// * `rewind`: how many bytes before `idx` that slice starts,
/// * `meaning`: the decoded value, or `Prefix(k)` when the buffer ends `k`
///   bytes before the sequence is complete, or `Suffix` when `idx` is a
///   continuation byte and the backwards scan reached the start of `buf`
///   without finding a start byte (in which case `bytes` is `buf[..=idx]`).
///
/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
/// in the vicinity of `idx`. That includes a continuation byte at `idx` that
/// lies beyond the end of the sequence announced by the nearest preceding
/// start byte: such a byte belongs to no codepoint.
#[inline]
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
    if idx >= buf.len() {
        return None;
    }

    // SAFETY: `idx < buf.len()` was checked above and `start` only ever
    // decreases from `idx`, so every `get_unchecked` index is in bounds.
    // Every `unsafe_slice` call is guarded so that `start + new_len` does not
    // exceed the length of the slice it is applied to, and `decode` is only
    // handed 2..=4 bytes whose trailing bytes were verified to be
    // continuation bytes.
    unsafe {
        let x = *buf.get_unchecked(idx);
        match otry!(Byte::classify(x)) {
            Byte::Ascii => Some(Codepoint {
                bytes: unsafe_slice(buf, idx, 1),
                rewind: 0,
                meaning: Meaning::Whole(x as char),
            }),
            Byte::Start(n) => {
                let avail = buf.len() - idx;
                if avail >= n {
                    let bytes = unsafe_slice(buf, idx, n);
                    if !all_cont(unsafe_slice(bytes, 1, n - 1)) {
                        return None;
                    }
                    let meaning = otry!(decode(bytes));
                    Some(Codepoint {
                        bytes: bytes,
                        rewind: 0,
                        meaning: meaning,
                    })
                } else {
                    // The buffer ends before the sequence does.
                    Some(Codepoint {
                        bytes: unsafe_slice(buf, idx, avail),
                        rewind: 0,
                        meaning: Meaning::Prefix(n - avail),
                    })
                }
            },
            Byte::Cont => {
                // Walk backwards over continuation bytes looking for the
                // start byte of the sequence that `idx` belongs to.
                let mut start = idx;
                loop {
                    if start == 0 {
                        // Whoops, fell off the beginning.
                        return Some(Codepoint {
                            bytes: unsafe_slice(buf, 0, idx + 1),
                            rewind: idx,
                            meaning: Meaning::Suffix,
                        });
                    }

                    start -= 1;
                    // Every byte in `start + 1 ..= idx` is already known to
                    // be a continuation byte.
                    let rewind = idx - start;
                    match otry!(Byte::classify(*buf.get_unchecked(start))) {
                        Byte::Cont => (),
                        Byte::Start(n) => {
                            if rewind >= n {
                                // The sequence beginning at `start` ends
                                // before `idx`, so the byte at `idx` is a
                                // stray continuation byte and is not part
                                // of any codepoint.
                                return None;
                            }

                            let avail = buf.len() - start;
                            if avail >= n {
                                let bytes = unsafe_slice(buf, start, n);
                                // Only the bytes from `idx` onwards still
                                // need checking.
                                if !all_cont(unsafe_slice(bytes, rewind, n - rewind)) {
                                    return None;
                                }
                                let meaning = otry!(decode(bytes));
                                return Some(Codepoint {
                                    bytes: bytes,
                                    rewind: rewind,
                                    meaning: meaning,
                                });
                            } else {
                                return Some(Codepoint {
                                    bytes: unsafe_slice(buf, start, avail),
                                    rewind: rewind,
                                    meaning: Meaning::Prefix(n - avail),
                                });
                            }
                        }
                        // A continuation byte cannot follow an ASCII byte.
                        Byte::Ascii => return None,
                    }

                    if rewind >= 3 {
                        // We looked at 3 bytes before a continuation byte
                        // and didn't find a start byte.
                        return None;
                    }
                }
            }
        }
    }
}

#[cfg(test)]
mod test;

Coverage Report

Created: 2025-10-10 07:11

Line	Count	Source
1		// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2		// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3		// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
4		// option. This file may not be copied, modified, or distributed
5		// except according to those terms.
6
7		#![cfg_attr(test, feature(test))]
8
9		#[macro_use]
10		extern crate debug_unreachable;
11
12		#[macro_use]
13		extern crate mac;
14
15		#[cfg(test)]
16		extern crate test as std_test;
17
18		use std::{slice, char};
19
20		/// Meaning of a complete or partial UTF-8 codepoint.
21		///
22		/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or
23		/// `Suffix` may in reality have no valid completion.
24		#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
25		pub enum Meaning {
26		/// We found a whole codepoint.
27		Whole(char),
28
29		/// We found something that isn't a valid Unicode codepoint, but
30		/// it would correspond to a UTF-16 leading surrogate code unit,
31		/// i.e. a value in the range `U+D800` - `U+DBFF`.
32		///
33		/// The argument is the code unit's 10-bit index within that range.
34		///
35		/// These are found in UTF-8 variants such as CESU-8 and WTF-8.
36		LeadSurrogate(u16),
37
38		/// We found something that isn't a valid Unicode codepoint, but
39		/// it would correspond to a UTF-16 trailing surrogate code unit,
40		/// i.e. a value in the range `U+DC00` - `U+DFFF`.
41		///
42		/// The argument is the code unit's 10-bit index within that range.
43		///
44		/// These are found in UTF-8 variants such as CESU-8 and WTF-8.
45		TrailSurrogate(u16),
46
47		/// We found only a prefix of a codepoint before the buffer ended.
48		///
49		/// Includes the number of additional bytes needed.
50		Prefix(usize),
51
52		/// We found only a suffix of a codepoint before running off the
53		/// start of the buffer.
54		///
55		/// Up to 3 more bytes may be needed.
56		Suffix,
57		}
58
59		/// Represents a complete or partial UTF-8 codepoint.
60		#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
61		pub struct Codepoint<'a> {
62		/// The bytes that make up the partial or full codepoint.
63		///
64		/// For a `Suffix` this depends on `idx`. We don't scan forward
65		/// for additional continuation bytes after the reverse scan
66		/// failed to locate a multibyte sequence start.
67		pub bytes: &'a [u8],
68
69		/// Start of the codepoint in the buffer, expressed as an offset
70		/// back from `idx`.
71		pub rewind: usize,
72
73		/// Meaning of the partial or full codepoint.
74		pub meaning: Meaning,
75		}
76
77		#[derive(Debug, PartialEq, Eq)]
78		enum Byte {
79		Ascii,
80		Start(usize),
81		Cont,
82		}
83
84		impl Byte {
85		#[inline(always)]
86	0	fn classify(x: u8) -> Option<Byte> {
87	0	match x & 0xC0 {
88	0	0xC0 => match x {
89	0	x if x & 0b11111_000 == 0b11110_000 => Some(Byte::Start(4)),
90	0	x if x & 0b1111_0000 == 0b1110_0000 => Some(Byte::Start(3)),
91	0	x if x & 0b111_00000 == 0b110_00000 => Some(Byte::Start(2)),
92	0	_ => None,
93		},
94	0	0x80 => Some(Byte::Cont),
95	0	_ => Some(Byte::Ascii),
96		}
97	0	}
98		}
99
100		#[inline(always)]
101	0	fn all_cont(buf: &[u8]) -> bool {
102	0	buf.iter().all(|&b| matches!(Byte::classify(b), Some(Byte::Cont)))
103	0	}
104
105		// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
106		// a starting byte followed by the correct number of continuation bytes.
107		#[inline(always)]
108	0	unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
109	0	debug_assert!(buf.len() >= 2);
110	0	debug_assert!(buf.len() <= 4);
111		let n;
112	0	match buf.len() {
113		2 => {
114	0	n = ((*buf.get_unchecked(0) & 0b11111) as u32) << 6
115	0	| ((*buf.get_unchecked(1) & 0x3F) as u32);
116	0	if n < 0x80 { return None } // Overlong
117		}
118		3 => {
119	0	n = ((*buf.get_unchecked(0) & 0b1111) as u32) << 12
120	0	| ((*buf.get_unchecked(1) & 0x3F) as u32) << 6
121	0	| ((*buf.get_unchecked(2) & 0x3F) as u32);
122	0	match n {
123	0	0x0000 ... 0x07FF => return None, // Overlong
124	0	0xD800 ... 0xDBFF => return Some(Meaning::LeadSurrogate(n as u16 - 0xD800)),
125	0	0xDC00 ... 0xDFFF => return Some(Meaning::TrailSurrogate(n as u16 - 0xDC00)),
126	0	_ => {}
127		}
128		}
129		4 => {
130	0	n = ((*buf.get_unchecked(0) & 0b111) as u32) << 18
131	0	| ((*buf.get_unchecked(1) & 0x3F) as u32) << 12
132	0	| ((*buf.get_unchecked(2) & 0x3F) as u32) << 6
133	0	| ((*buf.get_unchecked(3) & 0x3F) as u32);
134	0	if n < 0x1_0000 { return None } // Overlong
135		}
136	0	_ => debug_unreachable!(),
137		}
138
139	0	char::from_u32(n).map(Meaning::Whole)
140	0	}
141
142		#[inline(always)]
143	0	unsafe fn unsafe_slice<'a>(buf: &'a [u8], start: usize, new_len: usize) -> &'a [u8] {
144	0	debug_assert!(start <= buf.len());
145	0	debug_assert!(new_len <= (buf.len() - start));
146	0	slice::from_raw_parts(buf.as_ptr().offset(start as isize), new_len)
147	0	}
148
149		macro_rules! otry {
150		($x:expr) => { unwrap_or_return!($x, None) }
151		}
152
153		/// Describes the UTF-8 codepoint containing the byte at index `idx` within
154		/// `buf`.
155		///
156		/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
157		/// in the vicinity of `idx`.
158		#[inline]
159	0	pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
160	0	if idx >= buf.len() {
161	0	return None;
162	0	}
163
164		unsafe {
165	0	let x = *buf.get_unchecked(idx);
166	0	match otry!(Byte::classify(x)) {
167	0	Byte::Ascii => Some(Codepoint {
168	0	bytes: unsafe_slice(buf, idx, 1),
169	0	rewind: 0,
170	0	meaning: Meaning::Whole(x as char),
171	0	}),
172	0	Byte::Start(n) => {
173	0	let avail = buf.len() - idx;
174	0	if avail >= n {
175	0	let bytes = unsafe_slice(buf, idx, n);
176	0	if !all_cont(unsafe_slice(bytes, 1, n-1)) {
177	0	return None;
178	0	}
179	0	let meaning = otry!(decode(bytes));
180	0	Some(Codepoint {
181	0	bytes: bytes,
182	0	rewind: 0,
183	0	meaning: meaning,
184	0	})
185		} else {
186	0	Some(Codepoint {
187	0	bytes: unsafe_slice(buf, idx, avail),
188	0	rewind: 0,
189	0	meaning: Meaning::Prefix(n - avail),
190	0	})
191		}
192		},
193		Byte::Cont => {
194	0	let mut start = idx;
195	0	let mut checked = 0;
196		loop {
197	0	if start == 0 {
198		// Whoops, fell off the beginning.
199	0	return Some(Codepoint {
200	0	bytes: unsafe_slice(buf, 0, idx + 1),
201	0	rewind: idx,
202	0	meaning: Meaning::Suffix,
203	0	});
204	0	}
205
206	0	start -= 1;
207	0	checked += 1;
208	0	match otry!(Byte::classify(*buf.get_unchecked(start))) {
209	0	Byte::Cont => (),
210	0	Byte::Start(n) => {
211	0	let avail = buf.len() - start;
212	0	if avail >= n {
213	0	let bytes = unsafe_slice(buf, start, n);
214	0	if checked < n {
215	0	if !all_cont(unsafe_slice(bytes, checked, n-checked)) {
216	0	return None;
217	0	}
218	0	}
219	0	let meaning = otry!(decode(bytes));
220	0	return Some(Codepoint {
221	0	bytes: bytes,
222	0	rewind: idx - start,
223	0	meaning: meaning,
224	0	});
225		} else {
226	0	return Some(Codepoint {
227	0	bytes: unsafe_slice(buf, start, avail),
228	0	rewind: idx - start,
229	0	meaning: Meaning::Prefix(n - avail),
230	0	});
231		}
232		}
233	0	_ => return None,
234		}
235
236	0	if idx - start >= 3 {
237		// We looked at 3 bytes before a continuation byte
238		// and didn't find a start byte.
239	0	return None;
240	0	}
241		}
242		}
243		}
244		}
245	0	}
246
247		#[cfg(test)]
248		mod test;