/rust/registry/src/index.crates.io-1949cf8c6b5b557f/utf-8-0.7.6/src/lib.rs
Line | Count | Source |
1 | | mod lossy; |
2 | | mod read; |
3 | | |
4 | | pub use lossy::LossyDecoder; |
5 | | pub use read::{BufReadDecoder, BufReadDecoderError}; |
6 | | |
7 | | use std::cmp; |
8 | | use std::error::Error; |
9 | | use std::fmt; |
10 | | use std::str; |
11 | | |
/// The replacement character, U+FFFD, as a one-character string.
///
/// In lossy decoding, insert it once for every decoding error.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
14 | | |
15 | | #[derive(Debug, Copy, Clone)] |
16 | | pub enum DecodeError<'a> { |
17 | | /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, |
18 | | /// then call `decode()` again with `remaining_input`. |
19 | | Invalid { |
20 | | valid_prefix: &'a str, |
21 | | invalid_sequence: &'a [u8], |
22 | | remaining_input: &'a [u8], |
23 | | }, |
24 | | |
25 | | /// Call the `incomplete_suffix.try_complete` method with more input when available. |
26 | | /// If no more input is available, this is an invalid byte sequence. |
27 | | Incomplete { |
28 | | valid_prefix: &'a str, |
29 | | incomplete_suffix: Incomplete, |
30 | | }, |
31 | | } |
32 | | |
33 | | impl<'a> fmt::Display for DecodeError<'a> { |
34 | 0 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
35 | 0 | match *self { |
36 | | DecodeError::Invalid { |
37 | 0 | valid_prefix, |
38 | 0 | invalid_sequence, |
39 | 0 | remaining_input, |
40 | 0 | } => write!( |
41 | 0 | f, |
42 | 0 | "found invalid byte sequence {invalid_sequence:02x?} after \ |
43 | 0 | {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \ |
44 | 0 | unprocessed bytes", |
45 | | invalid_sequence = invalid_sequence, |
46 | 0 | valid_byte_count = valid_prefix.len(), |
47 | 0 | unprocessed_byte_count = remaining_input.len() |
48 | | ), |
49 | | DecodeError::Incomplete { |
50 | 0 | valid_prefix, |
51 | 0 | incomplete_suffix, |
52 | 0 | } => write!( |
53 | 0 | f, |
54 | 0 | "found incomplete byte sequence {incomplete_suffix:02x?} after \ |
55 | 0 | {valid_byte_count} bytes", |
56 | | incomplete_suffix = incomplete_suffix, |
57 | 0 | valid_byte_count = valid_prefix.len() |
58 | | ), |
59 | | } |
60 | 0 | } |
61 | | } |
62 | | |
63 | | impl<'a> Error for DecodeError<'a> {} |
64 | | |
/// Bytes of a not-yet-complete byte sequence, kept until more input arrives.
///
/// The bytes live in a fixed four-byte array; `buffer_len` says how many of
/// them are in use.
#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
    /// Backing storage. Only the first `buffer_len` bytes are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use.
    pub buffer_len: u8,
}
70 | | |
71 | 386k | pub fn decode(input: &[u8]) -> Result<&str, DecodeError> { |
72 | 386k | let error = match str::from_utf8(input) { |
73 | 184k | Ok(valid) => return Ok(valid), |
74 | 201k | Err(error) => error, |
75 | | }; |
76 | | |
77 | | // FIXME: separate function from here to guide inlining? |
78 | 201k | let (valid, after_valid) = input.split_at(error.valid_up_to()); |
79 | 201k | let valid = unsafe { |
80 | 201k | str::from_utf8_unchecked(valid) |
81 | | }; |
82 | | |
83 | 201k | match error.error_len() { |
84 | 313 | Some(invalid_sequence_length) => { |
85 | 313 | let (invalid, rest) = after_valid.split_at(invalid_sequence_length); |
86 | 313 | Err(DecodeError::Invalid { |
87 | 313 | valid_prefix: valid, |
88 | 313 | invalid_sequence: invalid, |
89 | 313 | remaining_input: rest |
90 | 313 | }) |
91 | | } |
92 | | None => { |
93 | 201k | Err(DecodeError::Incomplete { |
94 | 201k | valid_prefix: valid, |
95 | 201k | incomplete_suffix: Incomplete::new(after_valid), |
96 | 201k | }) |
97 | | } |
98 | | } |
99 | 386k | } |
100 | | |
101 | | impl Incomplete { |
102 | 0 | pub fn empty() -> Self { |
103 | 0 | Incomplete { |
104 | 0 | buffer: [0, 0, 0, 0], |
105 | 0 | buffer_len: 0, |
106 | 0 | } |
107 | 0 | } |
108 | | |
109 | 0 | pub fn is_empty(&self) -> bool { |
110 | 0 | self.buffer_len == 0 |
111 | 0 | } |
112 | | |
113 | 201k | pub fn new(bytes: &[u8]) -> Self { |
114 | 201k | let mut buffer = [0, 0, 0, 0]; |
115 | 201k | let len = bytes.len(); |
116 | 201k | buffer[..len].copy_from_slice(bytes); |
117 | 201k | Incomplete { |
118 | 201k | buffer: buffer, |
119 | 201k | buffer_len: len as u8, |
120 | 201k | } |
121 | 201k | } |
122 | | |
123 | | /// * `None`: still incomplete, call `try_complete` again with more input. |
124 | | /// If no more input is available, this is invalid byte sequence. |
125 | | /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`. |
126 | | /// To keep decoding, pass `remaining_input` to `decode()`. |
127 | 201k | pub fn try_complete<'input>(&mut self, input: &'input [u8]) |
128 | 201k | -> Option<(Result<&str, &[u8]>, &'input [u8])> { |
129 | 201k | let (consumed, opt_result) = self.try_complete_offsets(input); |
130 | 201k | let result = opt_result?; |
131 | 200k | let remaining_input = &input[consumed..]; |
132 | 200k | let result_bytes = self.take_buffer(); |
133 | 200k | let result = match result { |
134 | 200k | Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }), |
135 | 41 | Err(()) => Err(result_bytes), |
136 | | }; |
137 | 200k | Some((result, remaining_input)) |
138 | 201k | } |
139 | | |
140 | 200k | fn take_buffer(&mut self) -> &[u8] { |
141 | 200k | let len = self.buffer_len as usize; |
142 | 200k | self.buffer_len = 0; |
143 | 200k | &self.buffer[..len as usize] |
144 | 200k | } |
145 | | |
146 | | /// (consumed_from_input, None): not enough input |
147 | | /// (consumed_from_input, Some(Err(()))): error bytes in buffer |
148 | | /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer |
149 | 201k | fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) { |
150 | 201k | let initial_buffer_len = self.buffer_len as usize; |
151 | | let copied_from_input; |
152 | 201k | { |
153 | 201k | let unwritten = &mut self.buffer[initial_buffer_len..]; |
154 | 201k | copied_from_input = cmp::min(unwritten.len(), input.len()); |
155 | 201k | unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); |
156 | 201k | } |
157 | 201k | let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; |
158 | 201k | match str::from_utf8(spliced) { |
159 | | Ok(_) => { |
160 | 16.8k | self.buffer_len = spliced.len() as u8; |
161 | 16.8k | (copied_from_input, Some(Ok(()))) |
162 | | } |
163 | 184k | Err(error) => { |
164 | 184k | let valid_up_to = error.valid_up_to(); |
165 | 184k | if valid_up_to > 0 { |
166 | 183k | let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); |
167 | 183k | self.buffer_len = valid_up_to as u8; |
168 | 183k | (consumed, Some(Ok(()))) |
169 | | } else { |
170 | 622 | match error.error_len() { |
171 | 41 | Some(invalid_sequence_length) => { |
172 | 41 | let consumed = invalid_sequence_length |
173 | 41 | .checked_sub(initial_buffer_len).unwrap(); |
174 | 41 | self.buffer_len = invalid_sequence_length as u8; |
175 | 41 | (consumed, Some(Err(()))) |
176 | | } |
177 | | None => { |
178 | 581 | self.buffer_len = spliced.len() as u8; |
179 | 581 | (copied_from_input, None) |
180 | | } |
181 | | } |
182 | | } |
183 | | } |
184 | | } |
185 | 201k | } |
186 | | } |