/rust/registry/src/index.crates.io-1949cf8c6b5b557f/utf8parse-0.2.2/src/lib.rs
Line | Count | Source |
1 | | //! A table-driven UTF-8 Parser |
2 | | //! |
3 | | //! This module implements a table-driven UTF-8 parser which should |
4 | | //! theoretically contain the minimal number of branches (1). The only branch is |
5 | | //! on the `Action` returned from unpacking a transition. |
6 | | #![deny(clippy::all, clippy::if_not_else, clippy::enum_glob_use)] |
7 | | #![cfg_attr(all(feature = "nightly", test), feature(test))] |
8 | | #![no_std] |
9 | | |
10 | | use core::char; |
11 | | |
12 | | mod types; |
13 | | |
14 | | use types::{Action, State}; |
15 | | |
/// Sink for the events produced while a byte stream is being parsed.
///
/// An implementor is handed every successfully decoded codepoint and is
/// notified about every invalid sequence the parser runs into.
pub trait Receiver {
    /// Invoked with each codepoint that was parsed successfully.
    fn codepoint(&mut self, c: char);

    /// Invoked when an invalid sequence is detected.
    fn invalid_sequence(&mut self);
}
24 | | |
25 | | /// A parser for Utf8 Characters |
26 | | /// |
27 | | /// Repeatedly call `advance` with bytes to emit Utf8 characters |
28 | | #[derive(Clone, Default, PartialEq, Eq, Debug)] |
29 | | pub struct Parser { |
30 | | point: u32, |
31 | | state: State, |
32 | | } |
33 | | |
/// Mask applied to continuation bytes: keeps the low six bits
/// (`0b0011_1111`) and clears the two high bits.
const CONTINUATION_MASK: u8 = 0x3F;
36 | | |
37 | | impl Parser { |
38 | | /// Create a new Parser |
39 | | pub fn new() -> Parser { |
40 | | Parser { point: 0, state: State::Ground } |
41 | | } |
42 | | |
43 | | /// Advance the parser |
44 | | /// |
45 | | /// The provider receiver will be called whenever a codepoint is completed or an invalid |
46 | | /// sequence is detected. |
47 | 0 | pub fn advance<R>(&mut self, receiver: &mut R, byte: u8) |
48 | 0 | where |
49 | 0 | R: Receiver, |
50 | | { |
51 | 0 | let (state, action) = self.state.advance(byte); |
52 | 0 | self.perform_action(receiver, byte, action); |
53 | 0 | self.state = state; |
54 | 0 | } Unexecuted instantiation: <utf8parse::Parser>::advance::<anstream::adapter::strip::VtUtf8Receiver> Unexecuted instantiation: <utf8parse::Parser>::advance::<anstyle_parse::VtUtf8Receiver> |
55 | | |
56 | 0 | fn perform_action<R>(&mut self, receiver: &mut R, byte: u8, action: Action) |
57 | 0 | where |
58 | 0 | R: Receiver, |
59 | | { |
60 | 0 | match action { |
61 | 0 | Action::InvalidSequence => { |
62 | 0 | self.point = 0; |
63 | 0 | receiver.invalid_sequence(); |
64 | 0 | }, |
65 | 0 | Action::EmitByte => { |
66 | 0 | receiver.codepoint(byte as char); |
67 | 0 | }, |
68 | 0 | Action::SetByte1 => { |
69 | 0 | let point = self.point | ((byte & CONTINUATION_MASK) as u32); |
70 | 0 | let c = unsafe { char::from_u32_unchecked(point) }; |
71 | 0 | self.point = 0; |
72 | 0 |
|
73 | 0 | receiver.codepoint(c); |
74 | 0 | }, |
75 | 0 | Action::SetByte2 => { |
76 | 0 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 6; |
77 | 0 | }, |
78 | 0 | Action::SetByte2Top => { |
79 | 0 | self.point |= ((byte & 0b0001_1111) as u32) << 6; |
80 | 0 | }, |
81 | 0 | Action::SetByte3 => { |
82 | 0 | self.point |= ((byte & CONTINUATION_MASK) as u32) << 12; |
83 | 0 | }, |
84 | 0 | Action::SetByte3Top => { |
85 | 0 | self.point |= ((byte & 0b0000_1111) as u32) << 12; |
86 | 0 | }, |
87 | 0 | Action::SetByte4 => { |
88 | 0 | self.point |= ((byte & 0b0000_0111) as u32) << 18; |
89 | 0 | }, |
90 | | } |
91 | 0 | } Unexecuted instantiation: <utf8parse::Parser>::perform_action::<anstream::adapter::strip::VtUtf8Receiver> Unexecuted instantiation: <utf8parse::Parser>::perform_action::<anstyle_parse::VtUtf8Receiver> |
92 | | } |
93 | | |
#[cfg(all(feature = "nightly", test))]
mod benches {
    extern crate std;
    extern crate test;

    use self::test::{black_box, Bencher};
    use super::{Parser, Receiver};

    // Sample document fed to both benchmarks.
    static UTF8_DEMO: &[u8] = include_bytes!("../tests/UTF-8-demo.txt");

    // Receiver that passes every codepoint to `black_box` and ignores invalid sequences.
    impl Receiver for () {
        fn codepoint(&mut self, c: char) {
            black_box(c);
        }

        fn invalid_sequence(&mut self) {}
    }

    // Feeds the whole demo document through `Parser`, one byte at a time.
    #[bench]
    fn parse_bench_utf8_demo(b: &mut Bencher) {
        let mut parser = Parser::new();

        b.iter(|| {
            UTF8_DEMO.iter().for_each(|&byte| parser.advance(&mut (), byte));
        })
    }

    // Decodes the same document with `std::str::from_utf8` and `chars` for comparison.
    #[bench]
    fn std_string_parse_utf8(b: &mut Bencher) {
        b.iter(|| {
            let text = std::str::from_utf8(UTF8_DEMO).unwrap();
            text.chars().for_each(|c| {
                black_box(c);
            });
        });
    }
}
132 | | } |