/rust/registry/src/index.crates.io-1949cf8c6b5b557f/unicode-bom-2.0.3/src/lib.rs
Line | Count | Source |
1 | | // Copyright © 2018 Phil Booth |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); you may |
4 | | // not use this file except in compliance with the License. You may obtain |
5 | | // a copy of the License at: |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or |
12 | | // implied. See the License for the specific language governing |
13 | | // permissions and limitations under the License. |
14 | | |
15 | | //! Detects and classifies |
16 | | //! [Unicode byte-order marks](https://en.wikipedia.org/wiki/Byte_order_mark). |
17 | | //! |
18 | | //! ## Usage |
19 | | //! |
20 | | //! ``` |
21 | | //! use unicode_bom::Bom; |
22 | | //! |
23 | | //! // Detect the UTF-32 (little-endian) BOM in a file on disk |
24 | | //! let bom: Bom = "fixtures/utf32-le.txt".parse().unwrap(); |
25 | | //! assert_eq!(bom, Bom::Utf32Le); |
26 | | //! assert_eq!(bom.len(), 4); |
27 | | //! |
28 | | //! // Detect the UTF-16 (little-endian) BOM in a file on disk |
29 | | //! let bom: Bom = "fixtures/utf16-le.txt".parse().unwrap(); |
30 | | //! assert_eq!(bom, Bom::Utf16Le); |
31 | | //! assert_eq!(bom.len(), 2); |
32 | | //! |
33 | | //! // Detect no BOM in a file on disk |
34 | | //! let bom: Bom = "fixtures/ascii.txt".parse().unwrap(); |
35 | | //! assert_eq!(bom, Bom::Null); |
36 | | //! assert_eq!(bom.len(), 0); |
37 | | //! |
38 | | //! // Detect the BOM in a byte array |
39 | | //! let bytes = [0u8, 0u8, 0xfeu8, 0xffu8]; |
40 | | //! assert_eq!(Bom::from(&bytes[0..]), Bom::Utf32Be); |
41 | | //! ``` |
42 | | |
43 | | use std::fmt::{self, Display, Formatter}; |
44 | | use std::fs::File; |
45 | | use std::io::{Error, ErrorKind, Read}; |
46 | | use std::str::FromStr; |
47 | | |
48 | | #[cfg(test)] |
49 | | mod test; |
50 | | |
/// Unicode byte-order mark (BOM) abstraction.
///
/// Each variant names the encoding whose BOM signature was recognised.
/// `Bom::Null` stands for "no BOM detected".
///
/// The type is a plain fieldless enum, so it is cheap to copy and, thanks to
/// the `Hash` derive, can be used as a key in hashed collections.
#[derive(Clone, Copy, Debug, Hash, PartialEq)]
pub enum Bom {
    /// Indicates no BOM was detected.
    Null,

    /// Indicates [BOCU-1](https://www.unicode.org/notes/tn6/) BOM was detected.
    Bocu1,

    /// Indicates [GB 18030](https://en.wikipedia.org/wiki/GB_18030) BOM was detected.
    Gb18030,

    /// Indicates [SCSU](https://www.unicode.org/reports/tr6/) BOM was detected.
    Scsu,

    /// Indicates [UTF-EBCDIC](https://www.unicode.org/reports/tr16/) BOM was detected.
    UtfEbcdic,

    /// Indicates [UTF-1](https://en.wikipedia.org/wiki/UTF-1) BOM was detected.
    Utf1,

    /// Indicates [UTF-7](https://tools.ietf.org/html/rfc2152) BOM was detected.
    Utf7,

    /// Indicates [UTF-8](https://tools.ietf.org/html/rfc3629) BOM was detected.
    Utf8,

    /// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (big-endian) BOM was detected.
    Utf16Be,

    /// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (little-endian) BOM was detected.
    Utf16Le,

    /// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (big-endian) BOM was detected.
    Utf32Be,

    /// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (little-endian) BOM was detected.
    Utf32Le,
}
90 | | |
91 | | impl Bom { |
92 | | /// Returns the size in bytes of the BOM. |
93 | 31.0k | pub fn len(&self) -> usize { |
94 | 31.0k | match *self { |
95 | 30.9k | Bom::Null => 0, |
96 | 3 | Bom::Bocu1 => 3, |
97 | 2 | Bom::Gb18030 => 4, |
98 | 3 | Bom::Scsu => 3, |
99 | 3 | Bom::UtfEbcdic => 4, |
100 | 2 | Bom::Utf1 => 3, |
101 | 44 | Bom::Utf7 => 4, |
102 | 49 | Bom::Utf8 => 3, |
103 | 20 | Bom::Utf16Be => 2, |
104 | 46 | Bom::Utf16Le => 2, |
105 | 5 | Bom::Utf32Be => 4, |
106 | 6 | Bom::Utf32Le => 4, |
107 | | } |
108 | 31.0k | } |
109 | | } |
110 | | |
111 | | impl AsRef<str> for Bom { |
112 | | /// Returns a `&str` representation of the BOM type. |
113 | 0 | fn as_ref(&self) -> &str { |
114 | 0 | match *self { |
115 | 0 | Bom::Null => "[not set]", |
116 | 0 | Bom::Bocu1 => "BOCU-1", |
117 | 0 | Bom::Gb18030 => "GB 18030", |
118 | 0 | Bom::Scsu => "SCSU", |
119 | 0 | Bom::UtfEbcdic => "UTF-EBCDIC", |
120 | 0 | Bom::Utf1 => "UTF-1", |
121 | 0 | Bom::Utf7 => "UTF-7", |
122 | 0 | Bom::Utf8 => "UTF-8", |
123 | 0 | Bom::Utf16Be => "UTF-16 (big-endian)", |
124 | 0 | Bom::Utf16Le => "UTF-16 (little-endian)", |
125 | 0 | Bom::Utf32Be => "UTF-32 (big-endian)", |
126 | 0 | Bom::Utf32Le => "UTF-32 (little-endian)", |
127 | | } |
128 | 0 | } |
129 | | } |
130 | | |
131 | | impl AsRef<[u8]> for Bom { |
132 | | /// Returns the BOM byte-array literal. |
133 | | /// |
134 | | /// Note that for UTF-7, |
135 | | /// only the first three bytes of the BOM are returned. |
136 | | /// That's because the last two bits of the fourth byte |
137 | | /// belong to the following character, |
138 | | /// so it's impossible to return the fourth byte |
139 | | /// without further context. |
140 | | /// Possible values for the missing fourth byte |
141 | | /// are `0x38`, `0x39`, `0x2a` and `0x2b`. |
142 | 0 | fn as_ref(&self) -> &[u8] { |
143 | 0 | match *self { |
144 | 0 | Bom::Null => &[], |
145 | 0 | Bom::Bocu1 => &[0xfb, 0xee, 0x28], |
146 | 0 | Bom::Gb18030 => &[0x84, 0x31, 0x95, 0x33], |
147 | 0 | Bom::Scsu => &[0x0e, 0xfe, 0xff], |
148 | 0 | Bom::UtfEbcdic => &[0xdd, 0x73, 0x66, 0x73], |
149 | 0 | Bom::Utf1 => &[0xf7, 0x64, 0x4c], |
150 | 0 | Bom::Utf7 => &[0x2b, 0x2f, 0x76], |
151 | 0 | Bom::Utf8 => &[0xef, 0xbb, 0xbf], |
152 | 0 | Bom::Utf16Be => &[0xfe, 0xff], |
153 | 0 | Bom::Utf16Le => &[0xff, 0xfe], |
154 | 0 | Bom::Utf32Be => &[0, 0, 0xfe, 0xff], |
155 | 0 | Bom::Utf32Le => &[0xff, 0xfe, 0, 0], |
156 | | } |
157 | 0 | } |
158 | | } |
159 | | |
160 | | impl Default for Bom { |
161 | | /// Returns the default/empty BOM type, `Bom::Null`. |
162 | 0 | fn default() -> Self { |
163 | 0 | Bom::Null |
164 | 0 | } |
165 | | } |
166 | | |
167 | | impl Display for Bom { |
168 | | /// Formats the BOM type as a `String`. |
169 | 0 | fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { |
170 | 0 | write!(formatter, "{}", AsRef::<str>::as_ref(self)) |
171 | 0 | } |
172 | | } |
173 | | |
// Marker impl: `PartialEq` is derived on the fieldless `Bom` enum, so equality
// is reflexive and the stronger `Eq` guarantee holds.
impl Eq for Bom {}
175 | | |
// Tests whether `$slice` contains `$bytes` starting at offset `$from`.
//
// Forms:
//   compare_tail!(slice, bytes)            - compare from offset 1
//   compare_tail!(slice, bytes, from)      - compare from offset `from`
//   compare_tail!(slice, len, bytes, from) - additionally require the slice
//                                            to hold at least `len` bytes
//
// The comparison goes through `get(..)` and `starts_with`, so the macro never
// panics, even if an explicit `len` is smaller than `from + bytes.len()` or
// `from` lies beyond the end of the slice; it just evaluates to `false`.
macro_rules! compare_tail {
    ($slice:ident, $bytes:expr) => {
        compare_tail!($slice, $bytes, 1)
    };

    ($slice:ident, $bytes:expr, $from:expr) => {
        compare_tail!($slice, $bytes.len() + $from, $bytes, $from)
    };

    ($slice:ident, $len:expr, $bytes:expr, $from:expr) => {
        $slice.len() >= $len
            && $slice
                .get($from..)
                .map_or(false, |tail| tail.starts_with(&$bytes))
    };
}
189 | | |
190 | | impl From<&[u8]> for Bom { |
191 | | /// Detect the BOM type from a byte array. |
192 | 31.0k | fn from(slice: &[u8]) -> Self { |
193 | 31.0k | if slice.len() >= 2 { |
194 | 24.2k | match slice[0] { |
195 | | 0 => { |
196 | 214 | if compare_tail!(slice, [0, 0xfe, 0xff]) { |
197 | 5 | return Bom::Utf32Be; |
198 | 209 | } |
199 | | } |
200 | | 0x0e => { |
201 | 130 | if compare_tail!(slice, [0xfe, 0xff]) { |
202 | 3 | return Bom::Scsu; |
203 | 127 | } |
204 | | } |
205 | | 0x2b => { |
206 | 220 | if compare_tail!(slice, 4, [0x2f, 0x76], 1) |
207 | 61 | && (slice[3] == 0x38 |
208 | 50 | || slice[3] == 0x39 |
209 | 39 | || slice[3] == 0x2b |
210 | 27 | || slice[3] == 0x2f) |
211 | | { |
212 | 44 | return Bom::Utf7; |
213 | 176 | } |
214 | | } |
215 | | 0x84 => { |
216 | 21 | if compare_tail!(slice, [0x31, 0x95, 0x33]) { |
217 | 2 | return Bom::Gb18030; |
218 | 19 | } |
219 | | } |
220 | | 0xdd => { |
221 | 52 | if compare_tail!(slice, [0x73, 0x66, 0x73]) { |
222 | 3 | return Bom::UtfEbcdic; |
223 | 49 | } |
224 | | } |
225 | | 0xef => { |
226 | 139 | if compare_tail!(slice, [0xbb, 0xbf]) { |
227 | 49 | return Bom::Utf8; |
228 | 90 | } |
229 | | } |
230 | | 0xf7 => { |
231 | 72 | if compare_tail!(slice, [0x64, 0x4c]) { |
232 | 2 | return Bom::Utf1; |
233 | 70 | } |
234 | | } |
235 | | 0xfb => { |
236 | 79 | if compare_tail!(slice, [0xee, 0x28]) { |
237 | 3 | return Bom::Bocu1; |
238 | 76 | } |
239 | | } |
240 | | 0xfe => { |
241 | 47 | if slice[1] == 0xff { |
242 | 20 | return Bom::Utf16Be; |
243 | 27 | } |
244 | | } |
245 | | 0xff => { |
246 | 78 | if slice[1] == 0xfe { |
247 | 52 | if compare_tail!(slice, [0, 0], 2) { |
248 | 6 | return Bom::Utf32Le; |
249 | 46 | } |
250 | | |
251 | 46 | return Bom::Utf16Le; |
252 | 26 | } |
253 | | } |
254 | 23.2k | _ => {} |
255 | | } |
256 | 6.81k | } |
257 | | |
258 | 30.9k | Bom::Null |
259 | 31.0k | } |
260 | | } |
261 | | |
262 | | impl From<&mut File> for Bom { |
263 | | /// Detect the BOM type from a `File` instance. |
264 | | /// |
265 | | /// Note that I/O errors are swallowed by this method. |
266 | | /// Instead the default type, `Bom::Null`, |
267 | | /// will be returned. |
268 | 0 | fn from(file: &mut File) -> Self { |
269 | 0 | let mut data = [0u8; 4]; |
270 | 0 | let mut result = file.read_exact(&mut data); |
271 | | |
272 | 0 | if let Err(ref error) = result { |
273 | 0 | if error.kind() == ErrorKind::UnexpectedEof { |
274 | 0 | let short_data = [0u8; 3]; |
275 | 0 | result = file.read_exact(&mut data); |
276 | | |
277 | 0 | if let Err(ref error) = result { |
278 | 0 | if error.kind() == ErrorKind::UnexpectedEof { |
279 | 0 | let short_data = [0u8; 2]; |
280 | 0 | result = file.read_exact(&mut data); |
281 | 0 | data[0] = short_data[0]; |
282 | 0 | data[1] = short_data[1]; |
283 | 0 | } |
284 | 0 | } else { |
285 | 0 | data[0] = short_data[0]; |
286 | 0 | data[1] = short_data[1]; |
287 | 0 | data[2] = short_data[2]; |
288 | 0 | } |
289 | 0 | } |
290 | 0 | } |
291 | | |
292 | 0 | if result.is_ok() { |
293 | 0 | Bom::from(&data[0..]) |
294 | | } else { |
295 | 0 | Bom::Null |
296 | | } |
297 | 0 | } |
298 | | } |
299 | | |
300 | | impl FromStr for Bom { |
301 | | /// A `std::io::Error` instance returned by `std::fs::File::open`. |
302 | | type Err = Error; |
303 | | |
304 | | /// Parse the BOM type from the file located at `path`. |
305 | 0 | fn from_str(path: &str) -> Result<Self, Self::Err> { |
306 | 0 | let mut file = File::open(path)?; |
307 | 0 | Ok(Bom::from(&mut file)) |
308 | 0 | } |
309 | | } |