/src/html5ever/tendril/src/futf.rs

Source
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use debug_unreachable::debug_unreachable;
use std::{char, slice};

/// Meaning of a complete or partial UTF-8 codepoint.
///
/// Not all checking is performed eagerly. That is, a codepoint `Prefix` or
/// `Suffix` may in reality have no valid completion.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
pub enum Meaning {
    /// We found a whole codepoint, i.e. a complete sequence that decodes to
    /// a valid `char`.
    Whole(char),

    /// We found something that isn't a valid Unicode codepoint, but
    /// it *would* correspond to a UTF-16 leading surrogate code unit,
    /// i.e. a value in the range `U+D800` - `U+DBFF`.
    ///
    /// The argument is the code unit's 10-bit index within that range,
    /// that is, the decoded value minus `0xD800`.
    ///
    /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
    LeadSurrogate(u16),

    /// We found something that isn't a valid Unicode codepoint, but
    /// it *would* correspond to a UTF-16 trailing surrogate code unit,
    /// i.e. a value in the range `U+DC00` - `U+DFFF`.
    ///
    /// The argument is the code unit's 10-bit index within that range,
    /// that is, the decoded value minus `0xDC00`.
    ///
    /// These are found in UTF-8 variants such as CESU-8 and WTF-8.
    TrailSurrogate(u16),

    /// We found only a prefix of a codepoint before the buffer ended.
    ///
    /// Includes the number of additional bytes needed: the length announced
    /// by the start byte minus the number of bytes available in the buffer.
    Prefix(usize),

    /// We found only a suffix of a codepoint before running off the
    /// start of the buffer, i.e. the backwards scan reached index 0 while
    /// still seeing continuation bytes.
    ///
    /// Up to 3 more bytes may be needed.
    Suffix,
}

/// Represents a complete or partial UTF-8 codepoint.
///
/// The value borrows from the buffer it was classified in, hence the
/// lifetime parameter.
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug, Hash)]
pub struct Codepoint<'a> {
    /// The bytes that make up the partial or full codepoint.
    ///
    /// For a `Suffix` this depends on `idx`: the slice runs from the start
    /// of the buffer up to and including the byte at `idx`. We don't scan
    /// forward for additional continuation bytes after the reverse scan
    /// failed to locate a multibyte sequence start.
    pub bytes: &'a [u8],

    /// Start of the codepoint in the buffer, expressed as an offset
    /// back from `idx`.
    ///
    /// It is `0` when `idx` itself is the first byte of `bytes`.
    pub rewind: usize,

    /// Meaning of the partial or full codepoint.
    pub meaning: Meaning,
}

/// Role a single byte can play inside a UTF-8 byte stream.
#[derive(Debug, PartialEq, Eq)]
enum Byte {
    /// A single-byte (7-bit) codepoint.
    Ascii,
    /// First byte of a multi-byte sequence; the payload is the total length
    /// of the sequence (2, 3 or 4) as announced by the byte's high bits.
    Start(usize),
    /// A continuation byte (`0b10xx_xxxx`).
    Cont,
}

impl Byte {
    /// Classifies `byte` purely by its high bits.
    ///
    /// Returns `None` for `0xF8..=0xFF`, which can never appear in UTF-8.
    /// Start bytes that can only begin overlong or out-of-range sequences
    /// (such as `0xC0` or `0xF5`) are still reported as `Start`; rejecting
    /// those is left to the decoding step.
    #[inline(always)]
    fn classify(byte: u8) -> Option<Byte> {
        match byte {
            0x00..=0x7F => Some(Byte::Ascii),
            0x80..=0xBF => Some(Byte::Cont),
            0xC0..=0xDF => Some(Byte::Start(2)),
            0xE0..=0xEF => Some(Byte::Start(3)),
            0xF0..=0xF7 => Some(Byte::Start(4)),
            0xF8..=0xFF => None,
        }
    }
}

/// Returns `true` when every byte of `buf` is a UTF-8 continuation byte.
///
/// An empty slice trivially satisfies the condition.
#[inline(always)]
fn all_cont(buf: &[u8]) -> bool {
    for &byte in buf {
        if Byte::classify(byte) != Some(Byte::Cont) {
            return false;
        }
    }
    true
}

// Decodes one multi-byte UTF-8 sequence into its `Meaning`.
//
// NOTE: Assumes the buffer is a syntactically valid multi-byte UTF-8 sequence:
// a starting byte followed by the correct number of continuation bytes.
// The marker bits are masked off, not verified.
//
// Returns `None` for overlong encodings and for values that are neither a
// valid `char` nor a surrogate. Surrogates are reported through the dedicated
// `Meaning` variants rather than rejected.
//
// Safety: `buf.len()` must be 2, 3 or 4. Any other length hits
// `debug_unreachable!`.
#[inline(always)]
unsafe fn decode(buf: &[u8]) -> Option<Meaning> {
    debug_assert!(buf.len() >= 2);
    debug_assert!(buf.len() <= 4);

    let scalar = match *buf {
        [lead, c1] => {
            let value = u32::from(lead & 0b11111) << 6 | u32::from(c1 & 0x3F);
            if value < 0x80 {
                // Overlong: fits in a single byte.
                return None;
            }
            value
        },
        [lead, c1, c2] => {
            let value = u32::from(lead & 0b1111) << 12
                | u32::from(c1 & 0x3F) << 6
                | u32::from(c2 & 0x3F);
            match value {
                // Overlong: fits in at most two bytes.
                0x0000..=0x07FF => return None,
                0xD800..=0xDBFF => {
                    return Some(Meaning::LeadSurrogate((value - 0xD800) as u16));
                },
                0xDC00..=0xDFFF => {
                    return Some(Meaning::TrailSurrogate((value - 0xDC00) as u16));
                },
                _ => value,
            }
        },
        [lead, c1, c2, c3] => {
            let value = u32::from(lead & 0b111) << 18
                | u32::from(c1 & 0x3F) << 12
                | u32::from(c2 & 0x3F) << 6
                | u32::from(c3 & 0x3F);
            if value < 0x1_0000 {
                // Overlong: fits in at most three bytes.
                return None;
            }
            value
        },
        _ => debug_unreachable!(),
    };

    // `from_u32` rejects anything above U+10FFFF.
    char::from_u32(scalar).map(Meaning::Whole)
}

// Returns the `new_len` bytes of `buf` beginning at offset `start`, without
// bounds checks in release builds.
//
// Safety: the caller must guarantee `start <= buf.len()` and
// `new_len <= buf.len() - start`. Both are only checked by debug assertions.
#[inline(always)]
unsafe fn unsafe_slice(buf: &[u8], start: usize, new_len: usize) -> &[u8] {
    let total = buf.len();
    debug_assert!(start <= total);
    debug_assert!(new_len <= (total - start));

    let first = buf.as_ptr().add(start);
    slice::from_raw_parts(first, new_len)
}

/// Describes the UTF-8 codepoint containing the byte at index `idx` within
/// `buf`.
///
/// `idx` may point at any byte of the sequence. The returned
/// `Codepoint::rewind` tells how far back from `idx` the sequence starts.
///
/// A sequence that is cut off by the end of `buf` is reported as
/// `Meaning::Prefix`. Continuation bytes at the very start of `buf` with no
/// start byte before them are reported as `Meaning::Suffix`. Neither of these
/// is fully validated.
///
/// Returns `None` if `idx` is out of range, or if `buf` contains invalid UTF-8
/// in the vicinity of `idx`. That includes a continuation byte at `idx` which
/// lies past the end of the sequence announced by the nearest preceding start
/// byte (for example index 2 of `b"\xC5\x91\x91"`).
#[inline]
pub fn classify<'a>(buf: &'a [u8], idx: usize) -> Option<Codepoint<'a>> {
    if idx >= buf.len() {
        return None;
    }

    // SAFETY: `idx < buf.len()` was checked above. Every unchecked access
    // below uses an offset that is either `idx` or some `start <= idx`, and a
    // length that is bounded by the number of bytes remaining after that
    // offset. `decode` is only reached with a start byte followed by exactly
    // the announced number of bytes, all verified to be continuation bytes.
    unsafe {
        let x = *buf.get_unchecked(idx);
        match Byte::classify(x)? {
            Byte::Ascii => Some(Codepoint {
                bytes: unsafe_slice(buf, idx, 1),
                rewind: 0,
                meaning: Meaning::Whole(x as char),
            }),
            Byte::Start(n) => {
                let avail = buf.len() - idx;
                if avail >= n {
                    let bytes = unsafe_slice(buf, idx, n);
                    if !all_cont(unsafe_slice(bytes, 1, n - 1)) {
                        return None;
                    }
                    let meaning = decode(bytes)?;
                    Some(Codepoint {
                        bytes,
                        rewind: 0,
                        meaning,
                    })
                } else {
                    // The buffer ends before the sequence does.
                    Some(Codepoint {
                        bytes: unsafe_slice(buf, idx, avail),
                        rewind: 0,
                        meaning: Meaning::Prefix(n - avail),
                    })
                }
            },
            Byte::Cont => {
                // Walk backwards looking for the start byte of the sequence
                // that `idx` belongs to. `checked` counts the bytes stepped
                // over, so it always equals `idx - start`.
                let mut start = idx;
                let mut checked = 0;
                loop {
                    if start == 0 {
                        // Whoops, fell off the beginning.
                        return Some(Codepoint {
                            bytes: unsafe_slice(buf, 0, idx + 1),
                            rewind: idx,
                            meaning: Meaning::Suffix,
                        });
                    }

                    start -= 1;
                    checked += 1;
                    match Byte::classify(*buf.get_unchecked(start))? {
                        Byte::Cont => (),
                        Byte::Start(n) => {
                            let avail = buf.len() - start;
                            if avail >= n {
                                if checked >= n {
                                    // The sequence starting at `start` ends
                                    // before `idx`, so the byte at `idx` is a
                                    // stray continuation byte that belongs to
                                    // no codepoint.
                                    return None;
                                }
                                let bytes = unsafe_slice(buf, start, n);
                                // Bytes `1..checked` were already seen to be
                                // continuation bytes during the scan; verify
                                // the rest of the sequence.
                                if !all_cont(unsafe_slice(bytes, checked, n - checked)) {
                                    return None;
                                }
                                let meaning = decode(bytes)?;
                                return Some(Codepoint {
                                    bytes,
                                    rewind: idx - start,
                                    meaning,
                                });
                            } else {
                                // `avail < n` together with `idx < buf.len()`
                                // means `idx` is inside the truncated
                                // sequence.
                                return Some(Codepoint {
                                    bytes: unsafe_slice(buf, start, avail),
                                    rewind: idx - start,
                                    meaning: Meaning::Prefix(n - avail),
                                });
                            }
                        },
                        // A continuation byte cannot follow an ASCII byte.
                        Byte::Ascii => return None,
                    }

                    if idx - start >= 3 {
                        // We looked at 3 bytes before a continuation byte
                        // and didn't find a start byte.
                        return None;
                    }
                }
            },
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{all_cont, classify, decode, Byte, Meaning};
    use std::borrow::ToOwned;
    use std::io::Write;

    /// Every possible byte value falls into the expected class.
    #[test]
    fn classify_all_bytes() {
        for n in 0x00..0x80 {
            assert_eq!(Byte::classify(n), Some(Byte::Ascii));
        }
        for n in 0x80..0xC0 {
            assert_eq!(Byte::classify(n), Some(Byte::Cont));
        }
        for n in 0xC0..0xE0 {
            assert_eq!(Byte::classify(n), Some(Byte::Start(2)));
        }
        for n in 0xE0..0xF0 {
            assert_eq!(Byte::classify(n), Some(Byte::Start(3)));
        }
        for n in 0xF0..0xF8 {
            assert_eq!(Byte::classify(n), Some(Byte::Start(4)));
        }
        for n in 0xF8..=0xFF {
            assert_eq!(Byte::classify(n), None);
        }
    }

    /// `all_cont` accepts only slices made entirely of continuation bytes.
    #[test]
    fn test_all_cont() {
        assert!(all_cont(b""));
        assert!(all_cont(b"\x80"));
        assert!(all_cont(b"\xBF"));
        assert!(all_cont(b"\x80\xBF\x80\xBF"));

        assert!(!all_cont(b"z"));
        assert!(!all_cont(b"\xC0\xBF"));
        assert!(!all_cont(b"\xFF"));
        assert!(!all_cont(b"\x80\xBFz\x80\xBF"));
        assert!(!all_cont(b"\x80\xBF\xC0\x80\xBF"));
        assert!(!all_cont(b"\x80\xBF\xFF\x80\xBF"));
        assert!(!all_cont(b"\x80\xBF\x80\xBFz"));
        assert!(!all_cont(b"\x80\xBF\x80\xBF\xC0"));
        assert!(!all_cont(b"z\x80\xBF\x80\xBF"));
        assert!(!all_cont(b"\xC0\x80\xBF\x80\xBF"));
    }

    /// `decode` handles whole codepoints, surrogates, out-of-range values
    /// and overlong encodings.
    #[test]
    fn test_decode() {
        unsafe {
            assert_eq!(Some(Meaning::Whole('ő')), decode(b"\xC5\x91"));
            assert_eq!(Some(Meaning::Whole('\u{a66e}')), decode(b"\xEA\x99\xAE"));
            assert_eq!(
                Some(Meaning::Whole('\u{1f4a9}')),
                decode(b"\xF0\x9F\x92\xA9")
            );
            assert_eq!(
                Some(Meaning::Whole('\u{10ffff}')),
                decode(b"\xF4\x8F\xBF\xBF")
            );

            assert_eq!(
                Some(Meaning::LeadSurrogate(0x0000)),
                decode(b"\xED\xA0\x80")
            );
            assert_eq!(
                Some(Meaning::LeadSurrogate(0x0001)),
                decode(b"\xED\xA0\x81")
            );
            assert_eq!(
                Some(Meaning::LeadSurrogate(0x03FE)),
                decode(b"\xED\xAF\xBE")
            );
            assert_eq!(
                Some(Meaning::LeadSurrogate(0x03FF)),
                decode(b"\xED\xAF\xBF")
            );

            assert_eq!(
                Some(Meaning::TrailSurrogate(0x0000)),
                decode(b"\xED\xB0\x80")
            );
            assert_eq!(
                Some(Meaning::TrailSurrogate(0x0001)),
                decode(b"\xED\xB0\x81")
            );
            assert_eq!(
                Some(Meaning::TrailSurrogate(0x03FE)),
                decode(b"\xED\xBF\xBE")
            );
            assert_eq!(
                Some(Meaning::TrailSurrogate(0x03FF)),
                decode(b"\xED\xBF\xBF")
            );

            // The last 4-byte UTF-8 sequence. This would be U+1FFFFF, which is out of
            // range.
            assert_eq!(None, decode(b"\xF7\xBF\xBF\xBF"));

            // First otherwise-valid sequence (would be U+110000) that is out of range
            assert_eq!(None, decode(b"\xF4\x90\x80\x80"));

            // Overlong sequences
            assert_eq!(None, decode(b"\xC0\x80"));
            assert_eq!(None, decode(b"\xC1\xBF"));
            assert_eq!(None, decode(b"\xE0\x80\x80"));
            assert_eq!(None, decode(b"\xE0\x9F\xBF"));
            assert_eq!(None, decode(b"\xF0\x80\x80\x80"));
            assert_eq!(None, decode(b"\xF0\x8F\xBF\xBF"));

            // First not-overlong sequence for each sequence length
            assert_eq!(Some(Meaning::Whole('\u{80}')), decode(b"\xC2\x80"));
            assert_eq!(Some(Meaning::Whole('\u{800}')), decode(b"\xE0\xA0\x80"));
            assert_eq!(
                Some(Meaning::Whole('\u{10000}')),
                decode(b"\xF0\x90\x80\x80")
            );
        }
    }

    // 256 bytes of arbitrary (mostly invalid) data used as surroundings for
    // the sequences under test.
    static JUNK: &[u8] = b"\
        \xf8\x0d\x07\x25\xa6\x7b\x95\xeb\x47\x01\x7f\xee\
        \x3b\x00\x60\x57\x1d\x9e\x5d\x0a\x0b\x0a\x7c\x75\
        \x13\xa1\x82\x46\x27\x34\xe9\x52\x61\x0d\xec\x10\
        \x54\x49\x6e\x54\xdf\x7b\xe1\x31\x8c\x06\x21\x83\
        \x0f\xb5\x1f\x4c\x6a\x71\x52\x42\x74\xe7\x7b\x50\
        \x59\x1f\x6a\xd4\xff\x06\x92\x33\xc4\x34\x97\xff\
        \xcc\xb5\xc4\x00\x7b\xc3\x4a\x7f\x7e\x63\x96\x58\
        \x51\x63\x21\x54\x53\x2f\x03\x8a\x7d\x41\x79\x98\
        \x5b\xcb\xb8\x94\x6b\x73\xf3\x0c\x5a\xd7\xc4\x12\
        \x7a\x2b\x9a\x2e\x67\x62\x2a\x00\x45\x2c\xfe\x7d\
        \x8d\xd6\x51\x4e\x59\x36\x72\x1b\xae\xaa\x06\xe8\
        \x71\x1b\x85\xd3\x35\xb5\xbe\x9e\x16\x96\x72\xd8\
        \x1a\x48\xba\x4d\x55\x4f\x1b\xa2\x77\xfa\x8f\x71\
        \x58\x7d\x03\x93\xa2\x3a\x76\x51\xda\x48\xe2\x3f\
        \xeb\x8d\xda\x89\xae\xf7\xbd\x3d\xb6\x37\x97\xca\
        \x99\xcc\x4a\x8d\x62\x89\x97\xe3\xc0\xd1\x8d\xc1\
        \x26\x11\xbb\x8d\x53\x61\x4f\x76\x03\x00\x30\xd3\
        \x5f\x86\x19\x52\x9c\x3e\x99\x8c\xb7\x21\x48\x1c\
        \x85\xae\xad\xd5\x74\x00\x6c\x3e\xd0\x17\xff\x76\
        \x5c\x32\xc3\xfb\x24\x99\xd4\x4c\xa4\x1f\x66\x46\
        \xe7\x2d\x44\x56\x7d\x14\xd9\x76\x91\x37\x2f\xb7\
        \xcc\x1b\xd3\xc2";

    /// A whole codepoint embedded anywhere in junk is found from each of its
    /// bytes, with the matching `rewind`.
    #[test]
    fn classify_whole() {
        assert_eq!(JUNK.len(), 256);

        for &c in &[
            '\0',
            '\x01',
            'o',
            'z',
            'ő',
            '\u{2764}',
            '\u{a66e}',
            '\u{1f4a9}',
            '\u{1f685}',
        ] {
            for idx in 0..JUNK.len() - 3 {
                let mut buf = JUNK.to_owned();
                let ch = c.to_string().into_bytes();
                (&mut buf[idx..]).write_all(&ch).unwrap();

                for j in 0..ch.len() {
                    let class = classify(&buf, idx + j).unwrap();
                    assert_eq!(class.bytes, &*ch);
                    assert_eq!(class.rewind, j);
                    assert_eq!(class.meaning, Meaning::Whole(c));
                }
            }
        }
    }

    /// Encoded surrogates embedded anywhere in junk are reported as such.
    #[test]
    fn classify_surrogates() {
        for &(s, b) in &[
            (Meaning::LeadSurrogate(0x0000), b"\xED\xA0\x80"),
            (Meaning::LeadSurrogate(0x0001), b"\xED\xA0\x81"),
            (Meaning::LeadSurrogate(0x03FE), b"\xED\xAF\xBE"),
            (Meaning::LeadSurrogate(0x03FF), b"\xED\xAF\xBF"),
            (Meaning::TrailSurrogate(0x0000), b"\xED\xB0\x80"),
            (Meaning::TrailSurrogate(0x0001), b"\xED\xB0\x81"),
            (Meaning::TrailSurrogate(0x03FE), b"\xED\xBF\xBE"),
            (Meaning::TrailSurrogate(0x03FF), b"\xED\xBF\xBF"),
        ] {
            for idx in 0..JUNK.len() - 2 {
                let mut buf = JUNK.to_owned();
                (&mut buf[idx..]).write_all(b).unwrap();

                let class = classify(&buf, idx).unwrap();
                assert_eq!(class.bytes, b);
                assert_eq!(class.rewind, 0);
                assert_eq!(class.meaning, s);
            }
        }
    }

    /// Truncated sequences at the end of the buffer are `Prefix`es, and
    /// leftover continuation bytes at the start of the buffer are `Suffix`es.
    #[test]
    fn classify_prefix_suffix() {
        for &c in &['ő', '\u{a66e}', '\u{1f4a9}'] {
            let ch = c.to_string().into_bytes();
            // Every proper, non-empty prefix: 1 up to `len - 1` bytes. The
            // upper bound is exclusive, so it has to be `ch.len()`; with
            // `ch.len() - 1` the two-byte case would not run at all.
            for pfx in 1..ch.len() {
                let mut buf = JUNK.to_owned();
                let buflen = buf.len();
                (&mut buf[buflen - pfx..buflen])
                    .write_all(&ch[..pfx])
                    .unwrap();
                for j in 0..pfx {
                    let idx = buflen - 1 - j;
                    let class = classify(&buf, idx).unwrap();
                    assert_eq!(class.bytes, &ch[..pfx]);
                    assert_eq!(class.rewind, pfx - 1 - j);
                    assert_eq!(class.meaning, Meaning::Prefix(ch.len() - pfx));
                }
            }
            // Every proper, non-empty suffix; these consist of continuation
            // bytes only.
            for sfx in 1..ch.len() {
                let ch_bytes = &ch[ch.len() - sfx..];
                let mut buf = JUNK.to_owned();
                (&mut *buf).write_all(ch_bytes).unwrap();
                for j in 0..sfx {
                    let class = classify(&buf, j).unwrap();
                    assert!(ch_bytes.starts_with(class.bytes));
                    assert_eq!(class.rewind, j);
                    assert_eq!(class.meaning, Meaning::Suffix);
                }
            }
        }
    }

    /// Indices past the end of the buffer yield `None`.
    #[test]
    fn out_of_bounds() {
        assert!(classify(b"", 0).is_none());
        assert!(classify(b"", 7).is_none());
        assert!(classify(b"aaaaaaa", 7).is_none());
    }

    /// Assorted invalid inputs yield `None`.
    #[test]
    fn malformed() {
        assert_eq!(None, classify(b"\xFF", 0));
        assert_eq!(None, classify(b"\xC5\xC5", 0));
        assert_eq!(None, classify(b"x\x91", 1));
        assert_eq!(None, classify(b"\x91\x91\x91\x91", 3));
        assert_eq!(None, classify(b"\x91\x91\x91\x91\x91", 4));
        assert_eq!(None, classify(b"\xEA\x91\xFF", 1));
        assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 0));
        assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 1));
        assert_eq!(None, classify(b"\xF0\x90\x90\xF0", 2));

        // Indices past the end of a shorter literal are simply out of range.
        for i in 0..4 {
            // out of range: U+110000
            assert_eq!(None, classify(b"\xF4\x90\x80\x80", i));

            // out of range: U+1FFFFF
            assert_eq!(None, classify(b"\xF7\xBF\xBF\xBF", i));

            // Overlong sequences
            assert_eq!(None, classify(b"\xC0\x80", i));
            assert_eq!(None, classify(b"\xC1\xBF", i));
            assert_eq!(None, classify(b"\xE0\x80\x80", i));
            assert_eq!(None, classify(b"\xE0\x9F\xBF", i));
            assert_eq!(None, classify(b"\xF0\x80\x80\x80", i));
            assert_eq!(None, classify(b"\xF0\x8F\xBF\xBF", i));
        }
    }
}

Coverage Report

Created: 2026-01-13 06:57