Coverage Report

Created: 2025-12-31 07:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/unicode-bom-2.0.3/src/lib.rs
Line
Count
Source
1
// Copyright © 2018 Phil Booth
2
//
3
// Licensed under the Apache License, Version 2.0 (the "License"); you may
4
// not use this file except in compliance with the License. You may obtain
5
// a copy of the License at:
6
//
7
// https://www.apache.org/licenses/LICENSE-2.0
8
//
9
// Unless required by applicable law or agreed to in writing, software
10
// distributed under the License is distributed on an "AS IS" BASIS,
11
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
12
// implied. See the License for the specific language governing
13
// permissions and limitations under the License.
14
15
//! Detects and classifies
16
//! [Unicode byte-order marks](https://en.wikipedia.org/wiki/Byte_order_mark).
17
//!
18
//! ## Usage
19
//!
20
//! ```
21
//! use unicode_bom::Bom;
22
//!
23
//! // Detect the UTF-32 (little-endian) BOM in a file on disk
24
//! let bom: Bom = "fixtures/utf32-le.txt".parse().unwrap();
25
//! assert_eq!(bom, Bom::Utf32Le);
26
//! assert_eq!(bom.len(), 4);
27
//!
28
//! // Detect the UTF-16 (little-endian) BOM in a file on disk
29
//! let bom: Bom = "fixtures/utf16-le.txt".parse().unwrap();
30
//! assert_eq!(bom, Bom::Utf16Le);
31
//! assert_eq!(bom.len(), 2);
32
//!
33
//! // Detect no BOM in a file on disk
34
//! let bom: Bom = "fixtures/ascii.txt".parse().unwrap();
35
//! assert_eq!(bom, Bom::Null);
36
//! assert_eq!(bom.len(), 0);
37
//!
38
//! // Detect the BOM in a byte array
39
//! let bytes = [0u8, 0u8, 0xfeu8, 0xffu8];
40
//! assert_eq!(Bom::from(&bytes[0..]), Bom::Utf32Be);
41
//! ```
42
43
use std::fmt::{self, Display, Formatter};
44
use std::fs::File;
45
use std::io::{Error, ErrorKind, Read};
46
use std::str::FromStr;
47
48
#[cfg(test)]
49
mod test;
50
51
/// Unicode byte-order mark (BOM) abstraction.
///
/// Each variant names the encoding whose BOM was recognised at the start of
/// the inspected data; `Bom::Null` means that no known BOM was found.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum Bom {
    /// Indicates no BOM was detected.
    Null,

    /// Indicates [BOCU-1](https://www.unicode.org/notes/tn6/) BOM was detected.
    Bocu1,

    /// Indicates [GB 18030](https://en.wikipedia.org/wiki/GB_18030) BOM was detected.
    Gb18030,

    /// Indicates [SCSU](https://www.unicode.org/reports/tr6/) BOM was detected.
    Scsu,

    /// Indicates [UTF-EBCDIC](https://www.unicode.org/reports/tr16/) BOM was detected.
    UtfEbcdic,

    /// Indicates [UTF-1](https://en.wikipedia.org/wiki/UTF-1) BOM was detected.
    Utf1,

    /// Indicates [UTF-7](https://tools.ietf.org/html/rfc2152) BOM was detected.
    Utf7,

    /// Indicates [UTF-8](https://tools.ietf.org/html/rfc3629) BOM was detected.
    Utf8,

    /// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (big-endian) BOM was detected.
    Utf16Be,

    /// Indicates [UTF-16](https://tools.ietf.org/html/rfc2781) (little-endian) BOM was detected.
    Utf16Le,

    /// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (big-endian) BOM was detected.
    Utf32Be,

    /// Indicates [UTF-32](https://www.unicode.org/reports/tr19/) (little-endian) BOM was detected.
    Utf32Le,
}
90
91
impl Bom {
92
    /// Returns the size in bytes of the BOM.
93
31.0k
    pub fn len(&self) -> usize {
94
31.0k
        match *self {
95
30.9k
            Bom::Null => 0,
96
3
            Bom::Bocu1 => 3,
97
2
            Bom::Gb18030 => 4,
98
3
            Bom::Scsu => 3,
99
3
            Bom::UtfEbcdic => 4,
100
2
            Bom::Utf1 => 3,
101
44
            Bom::Utf7 => 4,
102
49
            Bom::Utf8 => 3,
103
20
            Bom::Utf16Be => 2,
104
46
            Bom::Utf16Le => 2,
105
5
            Bom::Utf32Be => 4,
106
6
            Bom::Utf32Le => 4,
107
        }
108
31.0k
    }
109
}
110
111
impl AsRef<str> for Bom {
112
    /// Returns a `&str` representation of the BOM type.
113
0
    fn as_ref(&self) -> &str {
114
0
        match *self {
115
0
            Bom::Null => "[not set]",
116
0
            Bom::Bocu1 => "BOCU-1",
117
0
            Bom::Gb18030 => "GB 18030",
118
0
            Bom::Scsu => "SCSU",
119
0
            Bom::UtfEbcdic => "UTF-EBCDIC",
120
0
            Bom::Utf1 => "UTF-1",
121
0
            Bom::Utf7 => "UTF-7",
122
0
            Bom::Utf8 => "UTF-8",
123
0
            Bom::Utf16Be => "UTF-16 (big-endian)",
124
0
            Bom::Utf16Le => "UTF-16 (little-endian)",
125
0
            Bom::Utf32Be => "UTF-32 (big-endian)",
126
0
            Bom::Utf32Le => "UTF-32 (little-endian)",
127
        }
128
0
    }
129
}
130
131
impl AsRef<[u8]> for Bom {
132
    /// Returns the BOM byte-array literal.
133
    ///
134
    /// Note that for UTF-7,
135
    /// only the first three bytes of the BOM are returned.
136
    /// That's because the last two bits of the fourth byte
137
    /// belong to the following character,
138
    /// so it's impossible to return the fourth byte
139
    /// without further context.
140
    /// Possible values for the missing fourth byte
141
    /// are `0x38`, `0x39`, `0x2a` and `0x2b`.
142
0
    fn as_ref(&self) -> &[u8] {
143
0
        match *self {
144
0
            Bom::Null => &[],
145
0
            Bom::Bocu1 => &[0xfb, 0xee, 0x28],
146
0
            Bom::Gb18030 => &[0x84, 0x31, 0x95, 0x33],
147
0
            Bom::Scsu => &[0x0e, 0xfe, 0xff],
148
0
            Bom::UtfEbcdic => &[0xdd, 0x73, 0x66, 0x73],
149
0
            Bom::Utf1 => &[0xf7, 0x64, 0x4c],
150
0
            Bom::Utf7 => &[0x2b, 0x2f, 0x76],
151
0
            Bom::Utf8 => &[0xef, 0xbb, 0xbf],
152
0
            Bom::Utf16Be => &[0xfe, 0xff],
153
0
            Bom::Utf16Le => &[0xff, 0xfe],
154
0
            Bom::Utf32Be => &[0, 0, 0xfe, 0xff],
155
0
            Bom::Utf32Le => &[0xff, 0xfe, 0, 0],
156
        }
157
0
    }
158
}
159
160
impl Default for Bom {
161
    /// Returns the default/empty BOM type, `Bom::Null`.
162
0
    fn default() -> Self {
163
0
        Bom::Null
164
0
    }
165
}
166
167
impl Display for Bom {
168
    /// Formats the BOM type as a `String`.
169
0
    fn fmt(&self, formatter: &mut Formatter) -> fmt::Result {
170
0
        write!(formatter, "{}", AsRef::<str>::as_ref(self))
171
0
    }
172
}
173
174
// `PartialEq` is derived on the enum declaration; this empty marker impl
// additionally declares `Eq` for the field-less variants.
impl Eq for Bom {}
175
176
macro_rules! compare_tail {
177
    ($slice:ident, $bytes:expr) => {
178
        compare_tail!($slice, $bytes, 1)
179
    };
180
181
    ($slice:ident, $bytes:expr, $from:expr) => {
182
        compare_tail!($slice, $bytes.len() + $from, $bytes, $from)
183
    };
184
185
    ($slice:ident, $len:expr, $bytes:expr, $from:expr) => {
186
        $slice.len() >= $len && $slice[$from..$from + $bytes.len()] == $bytes
187
    };
188
}
189
190
impl From<&[u8]> for Bom {
191
    /// Detect the BOM type from a byte array.
192
31.0k
    fn from(slice: &[u8]) -> Self {
193
31.0k
        if slice.len() >= 2 {
194
24.2k
            match slice[0] {
195
                0 => {
196
214
                    if compare_tail!(slice, [0, 0xfe, 0xff]) {
197
5
                        return Bom::Utf32Be;
198
209
                    }
199
                }
200
                0x0e => {
201
130
                    if compare_tail!(slice, [0xfe, 0xff]) {
202
3
                        return Bom::Scsu;
203
127
                    }
204
                }
205
                0x2b => {
206
220
                    if compare_tail!(slice, 4, [0x2f, 0x76], 1)
207
61
                        && (slice[3] == 0x38
208
50
                            || slice[3] == 0x39
209
39
                            || slice[3] == 0x2b
210
27
                            || slice[3] == 0x2f)
211
                    {
212
44
                        return Bom::Utf7;
213
176
                    }
214
                }
215
                0x84 => {
216
21
                    if compare_tail!(slice, [0x31, 0x95, 0x33]) {
217
2
                        return Bom::Gb18030;
218
19
                    }
219
                }
220
                0xdd => {
221
52
                    if compare_tail!(slice, [0x73, 0x66, 0x73]) {
222
3
                        return Bom::UtfEbcdic;
223
49
                    }
224
                }
225
                0xef => {
226
139
                    if compare_tail!(slice, [0xbb, 0xbf]) {
227
49
                        return Bom::Utf8;
228
90
                    }
229
                }
230
                0xf7 => {
231
72
                    if compare_tail!(slice, [0x64, 0x4c]) {
232
2
                        return Bom::Utf1;
233
70
                    }
234
                }
235
                0xfb => {
236
79
                    if compare_tail!(slice, [0xee, 0x28]) {
237
3
                        return Bom::Bocu1;
238
76
                    }
239
                }
240
                0xfe => {
241
47
                    if slice[1] == 0xff {
242
20
                        return Bom::Utf16Be;
243
27
                    }
244
                }
245
                0xff => {
246
78
                    if slice[1] == 0xfe {
247
52
                        if compare_tail!(slice, [0, 0], 2) {
248
6
                            return Bom::Utf32Le;
249
46
                        }
250
251
46
                        return Bom::Utf16Le;
252
26
                    }
253
                }
254
23.2k
                _ => {}
255
            }
256
6.81k
        }
257
258
30.9k
        Bom::Null
259
31.0k
    }
260
}
261
262
impl From<&mut File> for Bom {
263
    /// Detect the BOM type from a `File` instance.
264
    ///
265
    /// Note that I/O errors are swallowed by this method.
266
    /// Instead the default type, `Bom::Null`,
267
    /// will be returned.
268
0
    fn from(file: &mut File) -> Self {
269
0
        let mut data = [0u8; 4];
270
0
        let mut result = file.read_exact(&mut data);
271
272
0
        if let Err(ref error) = result {
273
0
            if error.kind() == ErrorKind::UnexpectedEof {
274
0
                let short_data = [0u8; 3];
275
0
                result = file.read_exact(&mut data);
276
277
0
                if let Err(ref error) = result {
278
0
                    if error.kind() == ErrorKind::UnexpectedEof {
279
0
                        let short_data = [0u8; 2];
280
0
                        result = file.read_exact(&mut data);
281
0
                        data[0] = short_data[0];
282
0
                        data[1] = short_data[1];
283
0
                    }
284
0
                } else {
285
0
                    data[0] = short_data[0];
286
0
                    data[1] = short_data[1];
287
0
                    data[2] = short_data[2];
288
0
                }
289
0
            }
290
0
        }
291
292
0
        if result.is_ok() {
293
0
            Bom::from(&data[0..])
294
        } else {
295
0
            Bom::Null
296
        }
297
0
    }
298
}
299
300
impl FromStr for Bom {
301
    /// A `std::io::Error` instance returned by `std::fs::File::open`.
302
    type Err = Error;
303
304
    /// Parse the BOM type from the file located at `path`.
305
0
    fn from_str(path: &str) -> Result<Self, Self::Err> {
306
0
        let mut file = File::open(path)?;
307
0
        Ok(Bom::from(&mut file))
308
0
    }
309
}