Coverage Report

Created: 2025-12-14 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/encoding_rs-0.8.35/src/lib.rs
Line
Count
Source
1
// Copyright Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
// The above license applies to code in this file. The label data in
11
// this file is generated from WHATWG's encodings.json, which came under
12
// the following license:
13
14
// Copyright © WHATWG (Apple, Google, Mozilla, Microsoft).
15
//
16
// Redistribution and use in source and binary forms, with or without
17
// modification, are permitted provided that the following conditions are met:
18
//
19
// 1. Redistributions of source code must retain the above copyright notice, this
20
//    list of conditions and the following disclaimer.
21
//
22
// 2. Redistributions in binary form must reproduce the above copyright notice,
23
//    this list of conditions and the following disclaimer in the documentation
24
//    and/or other materials provided with the distribution.
25
//
26
// 3. Neither the name of the copyright holder nor the names of its
27
//    contributors may be used to endorse or promote products derived from
28
//    this software without specific prior written permission.
29
//
30
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
31
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
33
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
34
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
36
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
37
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
38
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40
41
#![cfg_attr(
42
    feature = "cargo-clippy",
43
    allow(doc_markdown, inline_always, new_ret_no_self)
44
)]
45
46
//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
47
//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
48
//! Gecko-oriented means that converting to and from UTF-16 is supported in
49
//! addition to converting to and from UTF-8, that the performance and
50
//! streamability goals are browser-oriented, and that FFI-friendliness is a
51
//! goal.
52
//!
53
//! Additionally, the `mem` module provides functions that are useful for
54
//! applications that need to be able to deal with legacy in-memory
55
//! representations of Unicode.
56
//!
57
//! For expectation setting, please be sure to read the sections
58
//! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
59
//! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
60
//!
61
//! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
62
//! design and internals of the crate.
63
//!
64
//! # Availability
65
//!
66
//! The code is available under the
67
//! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
68
//! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
69
//! See the
70
//! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
71
//! file for details.
72
//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
73
//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
74
//!
75
//! # Integration with `std::io`
76
//!
77
//! This crate doesn't implement traits from `std::io`. However, the case of
78
//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
79
//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
80
//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
81
//!
82
//! # Examples
83
//!
84
//! Example programs:
85
//!
86
//! * [Rust](https://github.com/hsivonen/recode_rs)
87
//! * [C](https://github.com/hsivonen/recode_c)
88
//! * [C++](https://github.com/hsivonen/recode_cpp)
89
//!
90
//! Decode using the non-streaming API:
91
//!
92
//! ```
93
//! #[cfg(feature = "alloc")] {
94
//! use encoding_rs::*;
95
//!
96
//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
97
//! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
98
//!
99
//! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
100
//! assert_eq!(&cow[..], expectation);
101
//! assert_eq!(encoding_used, SHIFT_JIS);
102
//! assert!(!had_errors);
103
//! }
104
//! ```
105
//!
106
//! Decode using the streaming API with minimal `unsafe`:
107
//!
108
//! ```
109
//! use encoding_rs::*;
110
//!
111
//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
112
//!
113
//! // Use an array of byte slices to demonstrate content arriving piece by
114
//! // piece from the network.
115
//! let bytes: [&'static [u8]; 4] = [b"\x83",
116
//!                                  b"n\x83\x8D\x81",
117
//!                                  b"[\x81E\x83\x8F\x81[\x83",
118
//!                                  b"\x8B\x83h"];
119
//!
120
//! // Very short output buffer to demonstrate the output buffer getting full.
121
//! // Normally, you'd use something like `[0u8; 2048]`.
122
//! let mut buffer_bytes = [0u8; 8];
123
//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
124
//!
125
//! // How many bytes in the buffer currently hold significant data.
126
//! let mut bytes_in_buffer = 0usize;
127
//!
128
//! // Collect the output to a string for demonstration purposes.
129
//! let mut output = String::new();
130
//!
131
//! // The `Decoder`
132
//! let mut decoder = SHIFT_JIS.new_decoder();
133
//!
134
//! // Track whether we see errors.
135
//! let mut total_had_errors = false;
136
//!
137
//! // Decode using a fixed-size intermediate buffer (for demonstrating the
138
//! // use of a fixed-size buffer; normally when the output of an incremental
139
//! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
140
//! // avoid the intermediate buffer).
141
//! for input in &bytes[..] {
142
//!     // The number of bytes already read from current `input` in total.
143
//!     let mut total_read_from_current_input = 0usize;
144
//!
145
//!     loop {
146
//!         let (result, read, written, had_errors) =
147
//!             decoder.decode_to_str(&input[total_read_from_current_input..],
148
//!                                   &mut buffer[bytes_in_buffer..],
149
//!                                   false);
150
//!         total_read_from_current_input += read;
151
//!         bytes_in_buffer += written;
152
//!         total_had_errors |= had_errors;
153
//!         match result {
154
//!             CoderResult::InputEmpty => {
155
//!                 // We have consumed the current input buffer. Break out of
156
//!                 // the inner loop to get the next input buffer from the
157
//!                 // outer loop.
158
//!                 break;
159
//!             },
160
//!             CoderResult::OutputFull => {
161
//!                 // Write the current buffer out and consider the buffer
162
//!                 // empty.
163
//!                 output.push_str(&buffer[..bytes_in_buffer]);
164
//!                 bytes_in_buffer = 0usize;
165
//!                 continue;
166
//!             }
167
//!         }
168
//!     }
169
//! }
170
//!
171
//! // Process EOF
172
//! loop {
173
//!     let (result, _, written, had_errors) =
174
//!         decoder.decode_to_str(b"",
175
//!                               &mut buffer[bytes_in_buffer..],
176
//!                               true);
177
//!     bytes_in_buffer += written;
178
//!     total_had_errors |= had_errors;
179
//!     // Write the current buffer out and consider the buffer empty.
180
//!     // Need to do this here for both `match` arms, because we exit the
181
//!     // loop on `CoderResult::InputEmpty`.
182
//!     output.push_str(&buffer[..bytes_in_buffer]);
183
//!     bytes_in_buffer = 0usize;
184
//!     match result {
185
//!         CoderResult::InputEmpty => {
186
//!             // Done!
187
//!             break;
188
//!         },
189
//!         CoderResult::OutputFull => {
190
//!             continue;
191
//!         }
192
//!     }
193
//! }
194
//!
195
//! assert_eq!(&output[..], expectation);
196
//! assert!(!total_had_errors);
197
//! ```
198
//!
199
//! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
200
//!
201
//! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
202
//! __so this crate does not provide encoders for those encodings__!
203
//! Along with the replacement encoding, their _output encoding_ (i.e. the
204
//! encoding used for form submission and error handling in the query string
205
//! of URLs) is UTF-8, so you get a UTF-8 encoder if you request an encoder
206
//! for them.
207
//!
208
//! Additionally, the Encoding Standard factors BOM handling into wrapper
209
//! algorithms so that BOM handling isn't part of the definition of the
210
//! encodings themselves. The Unicode _encoding schemes_ in the Unicode
211
//! Standard define BOM handling or lack thereof as part of the encoding
212
//! scheme.
213
//!
214
//! When used with the `_without_bom_handling` entry points, the UTF-16LE
215
//! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
216
//! the Unicode Standard.
217
//!
218
//! When used with the `_with_bom_removal` entry points, the UTF-8
219
//! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
220
//! Standard.
221
//!
222
//! This crate does not provide a mode that matches the UTF-16 _encoding
223
//! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
224
//! the entry points without `_bom_` qualifiers is the closest match,
225
//! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
226
//! not part of the behavior of the UTF-16 _encoding scheme_ per the
227
//! Unicode Standard.
228
//!
229
//! The UTF-32 family of Unicode encoding schemes is not supported
230
//! by this crate. The Encoding Standard doesn't define any UTF-32
231
//! family encodings, since they aren't necessary for consuming Web
232
//! content.
233
//!
234
//! While gb18030 is capable of representing U+FEFF, the Encoding
235
//! Standard does not treat the gb18030 byte representation of U+FEFF
236
//! as a BOM, so neither does this crate.
237
//!
238
//! ## ISO-8859-1
239
//!
240
//! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
241
//! the Encoding Standard. Therefore, an encoding that maps the unsigned
242
//! byte value to the same Unicode scalar value is not available via
243
//! `Encoding` in this crate.
244
//!
245
//! However, the functions whose name starts with `convert` and contains
246
//! `latin1` in the `mem` module support such conversions, which are known as
247
//! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
248
//! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
249
//! in the [Infra Standard](https://infra.spec.whatwg.org/).
250
//!
251
//! ## Web / Browser Focus
252
//!
253
//! Both in terms of scope and performance, the focus is on the Web. For scope,
254
//! this means that encoding_rs implements the Encoding Standard fully and
255
//! doesn't implement encodings that are not specified in the Encoding
256
//! Standard. For performance, this means that decoding performance is
257
//! important as well as performance for encoding into UTF-8 or encoding the
258
//! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
259
//! be encoded into legacy encodings in only two places in the Web platform: in
260
//! the query part of URLs, in which case it's a matter of relatively rare
261
//! error handling, and in form submission, in which case the user action and
262
//! networking tend to hide the performance of the encoder.
263
//!
264
//! Deemphasizing performance of encoding non-Basic Latin text into legacy
265
//! encodings enables smaller code size thanks to the encoder side using the
266
//! decode-optimized data tables without having encode-optimized data tables at
267
//! all. Even in decoders, smaller lookup table size is preferred over avoiding
268
//! multiplication operations.
269
//!
270
//! Additionally, performance is a non-goal for the ASCII-incompatible
271
//! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
272
//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
273
//! of implementation.
274
//!
275
//! Despite the browser focus, the hope is that non-browser applications
276
//! that wish to consume Web content or submit Web forms in a Web-compatible
277
//! way will find encoding_rs useful. While encoding_rs does not try to match
278
//! Windows behavior, many of the encodings are close enough to legacy
279
//! encodings implemented by Windows that applications that need to consume
280
//! data in legacy Windows encodings may find encoding_rs useful. The
281
//! [codepage](https://crates.io/crates/codepage) crate maps from Windows
282
//! code page identifiers onto encoding_rs `Encoding`s and vice versa.
283
//!
284
//! For decoding email, UTF-7 support is needed (unfortunately) in addition
285
//! to the encodings defined in the Encoding Standard. The
286
//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
287
//! UTF-7 decoding for email purposes.
288
//!
289
//! For single-byte DOS encodings beyond the ones supported by the Encoding
290
//! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
291
//!
292
//! # Preparing Text for the Encoders
293
//!
294
//! Normalizing text into Unicode Normalization Form C prior to encoding text
295
//! into a legacy encoding minimizes unmappable characters. Text can be
296
//! normalized to Unicode Normalization Form C using the
297
//! [`icu_normalizer`](https://crates.io/crates/icu_normalizer) crate, which
298
//! is part of [ICU4X](https://icu4x.unicode.org/).
299
//!
300
//! The exception is windows-1258, which after normalizing to Unicode
301
//! Normalization Form C requires tone marks to be decomposed in order to
302
//! minimize unmappable characters. Vietnamese tone marks can be decomposed
303
//! using the [`detone`](https://crates.io/crates/detone) crate.
304
//!
305
//! # Streaming & Non-Streaming; Rust & C/C++
306
//!
307
//! The API in Rust has two modes of operation: streaming and non-streaming.
308
//! The streaming API is the foundation of the implementation and should be
309
//! used when processing data that arrives piecemeal from an i/o stream. The
310
//! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
311
//! to C callers. The non-streaming part of the API is for Rust callers only and
312
//! is smart about borrowing instead of copying when possible. When
313
//! streamability is not needed, the non-streaming API should be preferred in
314
//! order to avoid copying data when a borrow suffices.
315
//!
316
//! There is no analogous C API exposed via FFI, mainly because C doesn't have
317
//! standard types for growable byte buffers and Unicode strings that know
318
//! their length.
319
//!
320
//! The C API (header file generated at `target/include/encoding_rs.h` when
321
//! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
322
//! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
323
//! The C binding comes with a [C++17 wrapper][2] that uses standard library +
324
//! [GSL][3] types and that recreates the non-streaming API in C++ on top of
325
//! the streaming API. A C++ wrapper with XPCOM/MFBT types is available as
326
//! [`mozilla::Encoding`][4].
327
//!
328
//! The `Encoding` type is common to both the streaming and non-streaming
329
//! modes. In the streaming mode, decoding operations are performed with a
330
//! `Decoder` and encoding operations with an `Encoder` object obtained via
331
//! `Encoding`. In the non-streaming mode, decoding and encoding operations are
332
//! performed using methods on `Encoding` objects themselves, so the `Decoder`
333
//! and `Encoder` objects are not used at all.
334
//!
335
//! [1]: https://github.com/hsivonen/encoding_c
336
//! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
337
//! [3]: https://github.com/Microsoft/GSL/
338
//! [4]: https://searchfox.org/mozilla-central/source/intl/Encoding.h
339
//!
340
//! # Memory management
341
//!
342
//! The streaming mode never performs heap allocations (even the methods
343
//! that write into a `Vec<u8>` or a `String` by taking them as arguments do
344
//! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
345
//! is, the streaming mode uses caller-allocated buffers exclusively.
346
//!
347
//! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
348
//! perform heap allocations but only to allocate the backing buffer of the
349
//! `Vec<u8>` or the `String`.
350
//!
351
//! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
352
//! `Drop` cleanup.
353
//!
354
//! # Buffer reading and writing behavior
355
//!
356
//! Based on experience gained with the `java.nio.charset` encoding converter
357
//! API and with the Gecko uconv encoding converter API, the buffer reading
358
//! and writing behaviors of encoding_rs are asymmetric: input buffers are
359
//! fully drained but output buffers are not always fully filled.
360
//!
361
//! When reading from an input buffer, encoding_rs always consumes all input
362
//! up to the next error or to the end of the buffer. In particular, when
363
//! decoding, even if the input buffer ends in the middle of a byte sequence
364
//! for a character, the decoder consumes all input. This has the benefit that
365
//! the caller of the API can always fill the next buffer from the start from
366
//! whatever source the bytes come from and never has to first copy the last
367
//! bytes of the previous buffer to the start of the next buffer. However, when
368
//! encoding, the UTF-8 input buffers have to end at a character boundary, which
369
//! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
370
//! boundaries falling in the middle of a surrogate pair result in both
371
//! surrogates being treated individually as unpaired surrogates.
372
//!
373
//! Additionally, decoders guarantee that they can be fed even one byte at a
374
//! time and encoders guarantee that they can be fed even one code point at a
375
//! time. This has the benefit of not placing restrictions on the size of
376
//! chunks in which the content arrives, e.g. from the network.
377
//!
378
//! When writing into an output buffer, encoding_rs makes sure that the code
379
//! unit sequence for a character is never split across output buffer
380
//! boundaries. This may result in wasted space at the end of an output buffer,
381
//! but the advantages are that the output side of both decoders and encoders
382
//! is greatly simplified compared to designs that attempt to fill output
383
//! buffers exactly even when that entails splitting a code unit sequence and
384
//! when encoding_rs methods return to the caller, the output produced thus
385
//! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
386
//! the output needs to be considered as a whole, because the latest output
387
//! buffer might not be valid taken alone if the transition away
388
//! from the ASCII state occurred in an earlier output buffer. However, since
389
//! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
390
//! state as being in error despite the encoder generating a transition to the
391
//! ASCII state at the end, the claim about the partial output taken as a whole
392
//! being valid is true even for ISO-2022-JP.)
393
//!
394
//! # Error Reporting
395
//!
396
//! Based on experience gained with the `java.nio.charset` encoding converter
397
//! API and with the Gecko uconv encoding converter API, the error reporting
398
//! behaviors of encoding_rs are asymmetric: decoder errors include offsets
399
//! that leave it up to the caller to extract the erroneous bytes from the
400
//! input stream if the caller wishes to do so but encoder errors provide the
401
//! code point associated with the error without requiring the caller to
402
//! extract it from the input on its own.
403
//!
404
//! On the encoder side, an error is always triggered by the most recently
405
//! pushed Unicode scalar, which makes it simple to pass the `char` to the
406
//! caller. Also, it's very typical for the caller to wish to do something with
407
//! this data: generate a numeric escape for the character. Additionally, the
408
//! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
409
//! certain cases, so requiring the caller to extract the character from the
410
//! input buffer would require the caller to handle ISO-2022-JP details.
411
//! Furthermore, requiring the caller to extract the character from the input
412
//! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
413
//! the job of an encoding conversion library.
414
//!
415
//! On the decoder side, errors are triggered in more complex ways. For
416
//! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
417
//! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
418
//! the buffer boundary when processing 'A'. Thus, the bytes in error might not
419
//! be the ones most recently pushed to the decoder and the error might not even
420
//! be in the current buffer.
421
//!
422
//! Some encoding conversion APIs address the problem by not acknowledging
423
//! trailing bytes of an input buffer as consumed if it's still possible for
424
//! future bytes to cause the trailing bytes to be in error. This way, error
425
//! reporting can always refer to the most recently pushed buffer. This has the
426
//! problem that the caller of the API has to copy the unconsumed trailing
427
//! bytes to the start of the next buffer before being able to fill the rest
428
//! of the next buffer. This is annoying, error-prone and inefficient.
429
//!
430
//! A possible solution would be making the decoder remember recently consumed
431
//! bytes in order to be able to include a copy of the erroneous bytes when
432
//! reporting an error. This has two problems: First, callers are rarely
433
//! interested in the erroneous bytes, so attempts to identify them are most
434
//! often just overhead anyway. Second, the rare applications that are
435
//! interested typically care about the location of the error in the input
436
//! stream.
437
//!
438
//! To keep the API convenient for common uses and the overhead low while making
439
//! it possible to develop applications, such as HTML validators, that care
440
//! about which bytes were in error, encoding_rs reports the length of the
441
//! erroneous sequence and the number of bytes consumed after the erroneous
442
//! sequence. As long as the caller doesn't discard the 6 most recent bytes,
443
//! this makes it possible for callers that care about the erroneous bytes to
444
//! locate them.
445
//!
446
//! # No Convenience API for Custom Replacements
447
//!
448
//! The Web Platform and, therefore, the Encoding Standard supports only one
449
//! error recovery mode for decoders and only one error recovery mode for
450
//! encoders. The supported error recovery mode for decoders is emitting the
451
//! REPLACEMENT CHARACTER on error. The supported error recovery mode for
452
//! encoders is emitting an HTML decimal numeric character reference for
453
//! unmappable characters.
454
//!
455
//! Since encoding_rs is Web-focused, these are the only error recovery modes
456
//! for which convenient support is provided. Moreover, on the decoder side,
457
//! there aren't really good alternatives for emitting the REPLACEMENT CHARACTER
458
//! on error (other than treating errors as fatal). In particular, simply
459
//! ignoring errors is a
460
//! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
461
//! so it would be a bad idea for encoding_rs to provide a mode that encouraged
462
//! callers to ignore errors.
463
//!
464
//! On the encoder side, there are plausible alternatives for HTML decimal
465
//! numeric character references. For example, when outputting CSS, CSS-style
466
//! escapes would seem to make sense. However, instead of facilitating the
467
//! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
468
//! position that you shouldn't generate output in encodings other than UTF-8,
469
//! except where backward compatibility with interacting with the legacy Web
470
//! requires it. The legacy Web requires it only when parsing the query strings
471
//! of URLs and when submitting forms, and those two both use HTML decimal
472
//! numeric character references.
473
//!
474
//! While encoding_rs doesn't make encoder replacements other than HTML decimal
475
//! numeric character references easy, it does make them _possible_.
476
//! `encode_from_utf8()`, which emits HTML decimal numeric character references
477
//! for unmappable characters, is implemented on top of
478
//! `encode_from_utf8_without_replacement()`. Applications that really, really
479
//! want other replacement schemes for unmappable characters can likewise
480
//! implement them on top of `encode_from_utf8_without_replacement()`.
481
//!
482
//! # No Extensibility by Design
483
//!
484
//! The set of encodings supported by encoding_rs is not extensible by design.
485
//! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
486
//! rather than `trait`s. encoding_rs takes the design position that all future
487
//! text interchange should be done using UTF-8, which can represent all of
488
//! Unicode. (It is, in fact, the only encoding supported by the Encoding
489
//! Standard and encoding_rs that can represent all of Unicode and that has
490
//! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
491
//! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
492
//! legacy compatibility and not due to non-UTF-8 encodings having benefits
493
//! other than being able to consume legacy content.
494
//!
495
//! Considering that UTF-8 can represent all of Unicode and is already supported
496
//! by all Web browsers, introducing a new encoding wouldn't add to the
497
//! expressiveness but would add to compatibility problems. In that sense,
498
//! adding new encodings to the Web Platform doesn't make sense, and, in fact,
499
//! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
500
//! the Web Platform. On the other hand, the set of legacy encodings that must
501
//! be supported for a Web browser to be able to be successful is not going to
502
//! expand. Empirically, the set of encodings specified in the Encoding Standard
503
//! is already sufficient and the set of legacy encodings won't grow
504
//! retroactively.
505
//!
506
//! Since extensibility doesn't make sense considering the Web focus of
507
//! encoding_rs and adding encodings to Web clients would be actively harmful,
508
//! it makes sense to make the set of encodings that encoding_rs supports
509
//! non-extensible and to take the (admittedly small) benefits arising from
510
//! that, such as the size of `Decoder` and `Encoder` objects being known ahead
511
//! of time, which enables stack allocation thereof.
512
//!
513
//! This does have downsides for applications that might want to put encoding_rs
514
//! to non-Web uses if those non-Web uses involve legacy encodings that aren't
515
//! needed for Web uses. The needs of such applications should not complicate
516
//! encoding_rs itself, though. It is up to those applications to provide a
517
//! framework that delegates the operations with encodings that encoding_rs
518
//! supports to encoding_rs and operations with other encodings to something
519
//! else (as opposed to encoding_rs itself providing an extensibility
520
//! framework).
521
//!
522
//! # Panics
523
//!
524
//! Methods in encoding_rs can panic if the API is used against the requirements
525
//! stated in the documentation, if a state that's supposed to be impossible
526
//! is reached due to an internal bug or on integer overflow. When used
527
//! according to documentation with buffer sizes that stay below integer
528
//! overflow, in the absence of internal bugs, encoding_rs does not panic.
529
//!
530
//! Panics arising from API misuse aren't documented beyond this on individual
531
//! methods.
532
//!
533
//! # At-Risk Parts of the API
534
//!
535
//! The foreseeable source of partially backward-incompatible API change is the
536
//! way the instances of `Encoding` are made available.
537
//!
538
//! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
539
//! initialized with `static`s of type `&'static Encoding`, the non-reference
540
//! `FOO_INIT` public `Encoding` instances will be removed from the public API.
541
//!
542
//! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
543
//! unique when the constant is used in different crates, the reference-typed
544
//! `static`s for the encoding instances will be changed from `static` to
545
//! `const` and the non-reference-typed `_INIT` instances will be removed.
546
//!
547
//! # Mapping Spec Concepts onto the API
548
//!
549
//! <table>
550
//! <thead>
551
//! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
552
//! </thead>
553
//! <tbody>
554
//! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
555
//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
556
//! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
557
//! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
558
//! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
559
//! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
560
//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
561
//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
562
//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
563
//! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
564
//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
565
//! </tbody>
566
//! </table>
567
//!
568
//! # Compatibility with the rust-encoding API
569
//!
570
//! The crate
571
//! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
572
//! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
573
//! the API of rust-encoding 0.2.32 on top of encoding_rs.
574
//!
575
//! # Mapping rust-encoding concepts to encoding_rs concepts
576
//!
577
//! The following table provides a mapping from rust-encoding constructs to
578
//! encoding_rs ones.
579
//!
580
//! <table>
581
//! <thead>
582
//! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
583
//! </thead>
584
//! <tbody>
585
//! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
586
//! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
587
//! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
588
//! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
589
//! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
590
//! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
591
//! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
592
//! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
593
//! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
594
//! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
595
//! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
596
//! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
597
//! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
598
//! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
599
//! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
600
//! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
601
//! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
602
//! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
603
//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst</var>, true)</code></td></tr>
604
//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst</var>, true)</code></td></tr>
605
//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
606
//! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
607
//! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
608
//! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
609
//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
610
//! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
611
//! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
612
//! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
613
//! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
614
//! </tbody>
615
//! </table>
616
//!
617
//! # Relationship with Windows Code Pages
618
//!
619
//! Despite the Web and browser focus, the encodings defined by the Encoding
620
//! Standard and implemented by this crate may be useful for decoding legacy
621
//! data that uses Windows code pages. The following table names the
622
//! encodings
623
//! that have a closely related Windows code page, the number of the closest
624
//! code page, a column indicating whether Windows maps unassigned code points
625
//! to the Unicode Private Use Area instead of U+FFFD and a remark number
626
//! indicating remarks in the list after the table.
627
//!
628
//! <table>
629
//! <thead>
630
//! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
631
//! </thead>
632
//! <tbody>
633
//! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
634
//! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
635
//! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
636
//! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
637
//! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
638
//! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
639
//! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
640
//! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
641
//! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
642
//! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
643
//! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
644
//! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
645
//! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
646
//! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
647
//! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
648
//! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
649
//! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
650
//! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
651
//! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
652
//! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
653
//! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
654
//! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
655
//! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
656
//! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
657
//! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
658
//! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
659
//! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
660
//! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
661
//! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
662
//! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
663
//! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
664
//! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
665
//! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
666
//! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
667
//! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
668
//! </tbody>
669
//! </table>
670
//!
671
//! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
672
//! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
673
//! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
674
//!    which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
675
//!    decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
676
//!    LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
677
//!    instead of U+2019 RIGHT SINGLE QUOTATION MARK.
678
//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to PUA instead
679
//!    of LRM and RLM.
680
//! 5. Remarks from the previous item apply.
681
//!
682
//! The differences between this crate and Windows in the case of multibyte encodings
683
//! are not yet fully documented here. The lack of remarks above should not be taken
684
//! as indication of lack of differences.
685
//!
686
//! # Notable Differences from IANA Naming
687
//!
688
//! In some cases, the Encoding Standard specifies the popular unextended encoding
689
//! name where in IANA terms one of the other labels would be more precise considering
690
//! the extensions that the Encoding Standard has unified into the encoding.
691
//!
692
//! <table>
693
//! <thead>
694
//! <tr><th>Encoding</th><th>IANA</th></tr>
695
//! </thead>
696
//! <tbody>
697
//! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
698
//! <tr><td>EUC-KR</td><td>windows-949</td></tr>
699
//! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
700
//! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
701
//! </tbody>
702
//! </table>
703
//!
704
//! In other cases where the Encoding Standard unifies unextended and extended
705
//! variants of an encoding, the encoding gets the name of the extended
706
//! variant.
707
//!
708
//! <table>
709
//! <thead>
710
//! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
711
//! </thead>
712
//! <tbody>
713
//! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
714
//! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
715
//! <tr><td>TIS-620</td><td>windows-874</td></tr>
716
//! </tbody>
717
//! </table>
718
//!
719
//! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
720
//! for discussion about the UTF-16 family.
721
722
#![no_std]
723
#![cfg_attr(feature = "simd-accel", feature(core_intrinsics, portable_simd))]
724
725
#[cfg(feature = "alloc")]
726
#[cfg_attr(test, macro_use)]
727
extern crate alloc;
728
729
extern crate core;
730
#[macro_use]
731
extern crate cfg_if;
732
733
#[cfg(feature = "serde")]
734
extern crate serde;
735
736
#[cfg(all(test, feature = "serde"))]
737
extern crate bincode;
738
#[cfg(all(test, feature = "serde"))]
739
#[macro_use]
740
extern crate serde_derive;
741
#[cfg(all(test, feature = "serde"))]
742
extern crate serde_json;
743
744
#[macro_use]
745
mod macros;
746
747
#[cfg(all(
748
    feature = "simd-accel",
749
    any(
750
        target_feature = "sse2",
751
        all(target_endian = "little", target_arch = "aarch64"),
752
        all(target_endian = "little", target_feature = "neon")
753
    )
754
))]
755
mod simd_funcs;
756
757
#[cfg(all(test, feature = "alloc"))]
758
mod testing;
759
760
mod big5;
761
mod euc_jp;
762
mod euc_kr;
763
mod gb18030;
764
mod gb18030_2022;
765
mod iso_2022_jp;
766
mod replacement;
767
mod shift_jis;
768
mod single_byte;
769
mod utf_16;
770
mod utf_8;
771
mod x_user_defined;
772
773
mod ascii;
774
mod data;
775
mod handles;
776
mod variant;
777
778
pub mod mem;
779
780
use crate::ascii::ascii_valid_up_to;
781
use crate::ascii::iso_2022_jp_ascii_valid_up_to;
782
use crate::utf_8::utf8_valid_up_to;
783
use crate::variant::*;
784
785
#[cfg(feature = "alloc")]
786
use alloc::borrow::Cow;
787
#[cfg(feature = "alloc")]
788
use alloc::string::String;
789
#[cfg(feature = "alloc")]
790
use alloc::vec::Vec;
791
use core::cmp::Ordering;
792
use core::hash::Hash;
793
use core::hash::Hasher;
794
795
#[cfg(feature = "serde")]
796
use serde::de::Visitor;
797
#[cfg(feature = "serde")]
798
use serde::{Deserialize, Deserializer, Serialize, Serializer};
799
800
/// This has to be the max length of an NCR instead of max
801
/// minus one, because we can't rely on getting the minus
802
/// one from the space reserved for the current unmappable,
803
/// because the ISO-2022-JP encoder can fill up that space
804
/// with a state transition escape.
805
const NCR_EXTRA: usize = 10; // &#1114111;
806
807
// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
808
// Instead, please regenerate using generate-encoding-data.py
809
810
const LONGEST_LABEL_LENGTH: usize = 19; // cseucpkdfmtjapanese
811
812
/// The initializer for the [Big5](static.BIG5.html) encoding.
813
///
814
/// For use only for taking the address of this form when
815
/// Rust prohibits the use of the non-`_INIT` form directly,
816
/// such as in initializers of other `static`s. If in doubt,
817
/// use the corresponding non-`_INIT` reference-typed `static`.
818
///
819
/// This part of the public API will go away if Rust changes
820
/// to make the referent of `pub const FOO: &'static Encoding`
821
/// unique cross-crate or if Rust starts allowing static arrays
822
/// to be initialized with `pub static FOO: &'static Encoding`
823
/// items.
824
pub static BIG5_INIT: Encoding = Encoding {
825
    name: "Big5",
826
    variant: VariantEncoding::Big5,
827
};
828
829
/// The Big5 encoding.
830
///
831
/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
832
/// instead of the Private Use Area code points that have been used historically.
833
/// It is believed to be able to decode existing Web content in a way that makes
834
/// sense.
835
///
836
/// To avoid form submissions generating data that Web servers don't understand,
837
/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
838
/// Big5 in the lexical order.
839
///
840
/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
841
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
842
///
843
/// This encoding is designed to be suited for decoding the Windows code page 950
844
/// and its HKSCS patched "951" variant such that the text makes sense, given
845
/// assignments that Unicode has made after those encodings used Private Use
846
/// Area characters.
847
///
848
/// This will change from `static` to `const` if Rust changes
849
/// to make the referent of `pub const FOO: &'static Encoding`
850
/// unique cross-crate, so don't take the address of this
851
/// `static`.
852
pub static BIG5: &'static Encoding = &BIG5_INIT;
853
854
/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
855
///
856
/// For use only for taking the address of this form when
857
/// Rust prohibits the use of the non-`_INIT` form directly,
858
/// such as in initializers of other `static`s. If in doubt,
859
/// use the corresponding non-`_INIT` reference-typed `static`.
860
///
861
/// This part of the public API will go away if Rust changes
862
/// to make the referent of `pub const FOO: &'static Encoding`
863
/// unique cross-crate or if Rust starts allowing static arrays
864
/// to be initialized with `pub static FOO: &'static Encoding`
865
/// items.
866
pub static EUC_JP_INIT: Encoding = Encoding {
867
    name: "EUC-JP",
868
    variant: VariantEncoding::EucJp,
869
};
870
871
/// The EUC-JP encoding.
872
///
873
/// This is the legacy Unix encoding for Japanese.
874
///
875
/// For compatibility with Web servers that don't expect three-byte sequences
876
/// in form submissions, the encoder doesn't generate three-byte sequences.
877
/// That is, the JIS X 0212 support is decode-only.
878
///
879
/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
880
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
881
///
882
/// This encoding roughly matches the Windows code page 20932. There are error
883
/// handling differences and a handful of 2-byte sequences that decode differently.
884
/// Additionally, Windows doesn't support 3-byte sequences.
885
///
886
/// This will change from `static` to `const` if Rust changes
887
/// to make the referent of `pub const FOO: &'static Encoding`
888
/// unique cross-crate, so don't take the address of this
889
/// `static`.
890
pub static EUC_JP: &'static Encoding = &EUC_JP_INIT;
891
892
/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
893
///
894
/// For use only for taking the address of this form when
895
/// Rust prohibits the use of the non-`_INIT` form directly,
896
/// such as in initializers of other `static`s. If in doubt,
897
/// use the corresponding non-`_INIT` reference-typed `static`.
898
///
899
/// This part of the public API will go away if Rust changes
900
/// to make the referent of `pub const FOO: &'static Encoding`
901
/// unique cross-crate or if Rust starts allowing static arrays
902
/// to be initialized with `pub static FOO: &'static Encoding`
903
/// items.
904
pub static EUC_KR_INIT: Encoding = Encoding {
905
    name: "EUC-KR",
906
    variant: VariantEncoding::EucKr,
907
};
908
909
/// The EUC-KR encoding.
910
///
911
/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
912
/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
913
/// Classic), with all the characters from the Hangul Syllables block of Unicode.
914
///
915
/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
916
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
917
///
918
/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
919
/// to U+0080 and some byte sequences that are error per the Encoding Standard to
920
/// the question mark or the Private Use Area.
921
///
922
/// This will change from `static` to `const` if Rust changes
923
/// to make the referent of `pub const FOO: &'static Encoding`
924
/// unique cross-crate, so don't take the address of this
925
/// `static`.
926
pub static EUC_KR: &'static Encoding = &EUC_KR_INIT;
927
928
/// The initializer for the [GBK](static.GBK.html) encoding.
929
///
930
/// For use only for taking the address of this form when
931
/// Rust prohibits the use of the non-`_INIT` form directly,
932
/// such as in initializers of other `static`s. If in doubt,
933
/// use the corresponding non-`_INIT` reference-typed `static`.
934
///
935
/// This part of the public API will go away if Rust changes
936
/// to make the referent of `pub const FOO: &'static Encoding`
937
/// unique cross-crate or if Rust starts allowing static arrays
938
/// to be initialized with `pub static FOO: &'static Encoding`
939
/// items.
940
pub static GBK_INIT: Encoding = Encoding {
941
    name: "GBK",
942
    variant: VariantEncoding::Gbk,
943
};
944
945
/// The GBK encoding.
946
///
947
/// The decoder for this encoding is the same as the decoder for gb18030.
948
/// The encoder side of this encoding is GBK with Windows code page 936 euro
949
/// sign behavior and with the changes to two-byte sequences made in GB18030-2022.
950
/// GBK extends GB2312-80 to cover the CJK Unified Ideographs Unicode block as
951
/// well as a handful of ideographs from the CJK Unified Ideographs Extension A
952
/// and CJK Compatibility Ideographs blocks.
953
///
954
/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, GBK encoder wasn't
955
/// unified with the gb18030 encoder in the Encoding Standard out of concern
956
/// that servers that expect GBK form submissions might not be able to handle
957
/// the four-byte sequences.
958
///
959
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
960
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
961
///
962
/// The encoder of this encoding roughly matches the Windows code page 936.
963
/// The decoder side is a superset.
964
///
965
/// This will change from `static` to `const` if Rust changes
966
/// to make the referent of `pub const FOO: &'static Encoding`
967
/// unique cross-crate, so don't take the address of this
968
/// `static`.
969
pub static GBK: &'static Encoding = &GBK_INIT;
970
971
/// The initializer for the [IBM866](static.IBM866.html) encoding.
972
///
973
/// For use only for taking the address of this form when
974
/// Rust prohibits the use of the non-`_INIT` form directly,
975
/// such as in initializers of other `static`s. If in doubt,
976
/// use the corresponding non-`_INIT` reference-typed `static`.
977
///
978
/// This part of the public API will go away if Rust changes
979
/// to make the referent of `pub const FOO: &'static Encoding`
980
/// unique cross-crate or if Rust starts allowing static arrays
981
/// to be initialized with `pub static FOO: &'static Encoding`
982
/// items.
983
pub static IBM866_INIT: Encoding = Encoding {
984
    name: "IBM866",
985
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16),
986
};
987
988
/// The IBM866 encoding.
989
///
990
/// This is the most notable one of the DOS Cyrillic code pages. It has the same
991
/// box drawing characters as code page 437, so it can be used for decoding
992
/// DOS-era ASCII + box drawing data.
993
///
994
/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
995
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
996
///
997
/// This encoding matches the Windows code page 866.
998
///
999
/// This will change from `static` to `const` if Rust changes
1000
/// to make the referent of `pub const FOO: &'static Encoding`
1001
/// unique cross-crate, so don't take the address of this
1002
/// `static`.
1003
pub static IBM866: &'static Encoding = &IBM866_INIT;
1004
1005
/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
1006
///
1007
/// For use only for taking the address of this form when
1008
/// Rust prohibits the use of the non-`_INIT` form directly,
1009
/// such as in initializers of other `static`s. If in doubt,
1010
/// use the corresponding non-`_INIT` reference-typed `static`.
1011
///
1012
/// This part of the public API will go away if Rust changes
1013
/// to make the referent of `pub const FOO: &'static Encoding`
1014
/// unique cross-crate or if Rust starts allowing static arrays
1015
/// to be initialized with `pub static FOO: &'static Encoding`
1016
/// items.
1017
pub static ISO_2022_JP_INIT: Encoding = Encoding {
1018
    name: "ISO-2022-JP",
1019
    variant: VariantEncoding::Iso2022Jp,
1020
};
1021
1022
/// The ISO-2022-JP encoding.
1023
///
1024
/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
1025
/// byte range to encode non-Basic Latin characters. It's the only encoding
1026
/// supported by this crate whose encoder is stateful.
1027
///
1028
/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
1029
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
1030
///
1031
/// This encoding roughly matches the Windows code page 50220. Notably, Windows
1032
/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
1033
/// error handling.
1034
///
1035
/// This will change from `static` to `const` if Rust changes
1036
/// to make the referent of `pub const FOO: &'static Encoding`
1037
/// unique cross-crate, so don't take the address of this
1038
/// `static`.
1039
pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT;
1040
1041
/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1042
///
1043
/// For use only for taking the address of this form when
1044
/// Rust prohibits the use of the non-`_INIT` form directly,
1045
/// such as in initializers of other `static`s. If in doubt,
1046
/// use the corresponding non-`_INIT` reference-typed `static`.
1047
///
1048
/// This part of the public API will go away if Rust changes
1049
/// to make the referent of `pub const FOO: &'static Encoding`
1050
/// unique cross-crate or if Rust starts allowing static arrays
1051
/// to be initialized with `pub static FOO: &'static Encoding`
1052
/// items.
1053
pub static ISO_8859_10_INIT: Encoding = Encoding {
1054
    name: "ISO-8859-10",
1055
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6),
1056
};
1057
1058
/// The ISO-8859-10 encoding.
1059
///
1060
/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1061
/// is also known as Latin 6.
1062
///
1063
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1064
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1065
///
1066
/// The Windows code page number for this encoding is 28600, but kernel32.dll
1067
/// does not support this encoding.
1068
///
1069
/// This will change from `static` to `const` if Rust changes
1070
/// to make the referent of `pub const FOO: &'static Encoding`
1071
/// unique cross-crate, so don't take the address of this
1072
/// `static`.
1073
pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT;
1074
1075
/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1076
///
1077
/// For use only for taking the address of this form when
1078
/// Rust prohibits the use of the non-`_INIT` form directly,
1079
/// such as in initializers of other `static`s. If in doubt,
1080
/// use the corresponding non-`_INIT` reference-typed `static`.
1081
///
1082
/// This part of the public API will go away if Rust changes
1083
/// to make the referent of `pub const FOO: &'static Encoding`
1084
/// unique cross-crate or if Rust starts allowing static arrays
1085
/// to be initialized with `pub static FOO: &'static Encoding`
1086
/// items.
1087
pub static ISO_8859_13_INIT: Encoding = Encoding {
1088
    name: "ISO-8859-13",
1089
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1),
1090
};
1091
1092
/// The ISO-8859-13 encoding.
1093
///
1094
/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1095
/// is also known as Latin 7.
1096
///
1097
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1098
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1099
///
1100
/// This encoding matches the Windows code page 28603, except Windows decodes
1101
/// unassigned code points to the Private Use Area of Unicode.
1102
///
1103
/// This will change from `static` to `const` if Rust changes
1104
/// to make the referent of `pub const FOO: &'static Encoding`
1105
/// unique cross-crate, so don't take the address of this
1106
/// `static`.
1107
pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT;
1108
1109
/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1110
///
1111
/// For use only for taking the address of this form when
1112
/// Rust prohibits the use of the non-`_INIT` form directly,
1113
/// such as in initializers of other `static`s. If in doubt,
1114
/// use the corresponding non-`_INIT` reference-typed `static`.
1115
///
1116
/// This part of the public API will go away if Rust changes
1117
/// to make the referent of `pub const FOO: &'static Encoding`
1118
/// unique cross-crate or if Rust starts allowing static arrays
1119
/// to be initialized with `pub static FOO: &'static Encoding`
1120
/// items.
1121
pub static ISO_8859_14_INIT: Encoding = Encoding {
1122
    name: "ISO-8859-14",
1123
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17),
1124
};
1125
1126
/// The ISO-8859-14 encoding.
1127
///
1128
/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1129
/// is also known as Latin 8.
1130
///
1131
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1132
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1133
///
1134
/// The Windows code page number for this encoding is 28604, but kernel32.dll
1135
/// does not support this encoding.
1136
///
1137
/// This will change from `static` to `const` if Rust changes
1138
/// to make the referent of `pub const FOO: &'static Encoding`
1139
/// unique cross-crate, so don't take the address of this
1140
/// `static`.
1141
pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT;
1142
1143
/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1144
///
1145
/// For use only for taking the address of this form when
1146
/// Rust prohibits the use of the non-`_INIT` form directly,
1147
/// such as in initializers of other `static`s. If in doubt,
1148
/// use the corresponding non-`_INIT` reference-typed `static`.
1149
///
1150
/// This part of the public API will go away if Rust changes
1151
/// to make the referent of `pub const FOO: &'static Encoding`
1152
/// unique cross-crate or if Rust starts allowing static arrays
1153
/// to be initialized with `pub static FOO: &'static Encoding`
1154
/// items.
1155
pub static ISO_8859_15_INIT: Encoding = Encoding {
1156
    name: "ISO-8859-15",
1157
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65),
1158
};
1159
1160
/// The ISO-8859-15 encoding.
1161
///
1162
/// This is the revised Western European part of the ISO/IEC 8859 encoding
1163
/// family. This encoding is also known as Latin 9.
1164
///
1165
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1166
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1167
///
1168
/// This encoding matches the Windows code page 28605.
1169
///
1170
/// This will change from `static` to `const` if Rust changes
1171
/// to make the referent of `pub const FOO: &'static Encoding`
1172
/// unique cross-crate, so don't take the address of this
1173
/// `static`.
1174
pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT;
1175
1176
/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1177
///
1178
/// For use only for taking the address of this form when
1179
/// Rust prohibits the use of the non-`_INIT` form directly,
1180
/// such as in initializers of other `static`s. If in doubt,
1181
/// use the corresponding non-`_INIT` reference-typed `static`.
1182
///
1183
/// This part of the public API will go away if Rust changes
1184
/// to make the referent of `pub const FOO: &'static Encoding`
1185
/// unique cross-crate or if Rust starts allowing static arrays
1186
/// to be initialized with `pub static FOO: &'static Encoding`
1187
/// items.
1188
pub static ISO_8859_16_INIT: Encoding = Encoding {
1189
    name: "ISO-8859-16",
1190
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4),
1191
};
1192
1193
/// The ISO-8859-16 encoding.
1194
///
1195
/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1196
/// family. This encoding is also known as Latin 10.
1197
///
1198
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1199
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1200
///
1201
/// The Windows code page number for this encoding is 28606, but kernel32.dll
1202
/// does not support this encoding.
1203
///
1204
/// This will change from `static` to `const` if Rust changes
1205
/// to make the referent of `pub const FOO: &'static Encoding`
1206
/// unique cross-crate, so don't take the address of this
1207
/// `static`.
1208
pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT;
1209
1210
/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1211
///
1212
/// For use only for taking the address of this form when
1213
/// Rust prohibits the use of the non-`_INIT` form directly,
1214
/// such as in initializers of other `static`s. If in doubt,
1215
/// use the corresponding non-`_INIT` reference-typed `static`.
1216
///
1217
/// This part of the public API will go away if Rust changes
1218
/// to make the referent of `pub const FOO: &'static Encoding`
1219
/// unique cross-crate or if Rust starts allowing static arrays
1220
/// to be initialized with `pub static FOO: &'static Encoding`
1221
/// items.
1222
pub static ISO_8859_2_INIT: Encoding = Encoding {
1223
    name: "ISO-8859-2",
1224
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1225
};
1226
1227
/// The ISO-8859-2 encoding.
1228
///
1229
/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1230
///
1231
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1232
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1233
///
1234
/// This encoding matches the Windows code page 28592.
1235
///
1236
/// This will change from `static` to `const` if Rust changes
1237
/// to make the referent of `pub const FOO: &'static Encoding`
1238
/// unique cross-crate, so don't take the address of this
1239
/// `static`.
1240
pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1241
1242
/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1243
///
1244
/// For use only for taking the address of this form when
1245
/// Rust prohibits the use of the non-`_INIT` form directly,
1246
/// such as in initializers of other `static`s. If in doubt,
1247
/// use the corresponding non-`_INIT` reference-typed `static`.
1248
///
1249
/// This part of the public API will go away if Rust changes
1250
/// to make the referent of `pub const FOO: &'static Encoding`
1251
/// unique cross-crate or if Rust starts allowing static arrays
1252
/// to be initialized with `pub static FOO: &'static Encoding`
1253
/// items.
1254
pub static ISO_8859_3_INIT: Encoding = Encoding {
1255
    name: "ISO-8859-3",
1256
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1257
};
1258
1259
/// The ISO-8859-3 encoding.
1260
///
1261
/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1262
///
1263
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1264
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1265
///
1266
/// This encoding matches the Windows code page 28593.
1267
///
1268
/// This will change from `static` to `const` if Rust changes
1269
/// to make the referent of `pub const FOO: &'static Encoding`
1270
/// unique cross-crate, so don't take the address of this
1271
/// `static`.
1272
pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1273
1274
/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1275
///
1276
/// For use only for taking the address of this form when
1277
/// Rust prohibits the use of the non-`_INIT` form directly,
1278
/// such as in initializers of other `static`s. If in doubt,
1279
/// use the corresponding non-`_INIT` reference-typed `static`.
1280
///
1281
/// This part of the public API will go away if Rust changes
1282
/// to make the referent of `pub const FOO: &'static Encoding`
1283
/// unique cross-crate or if Rust starts allowing static arrays
1284
/// to be initialized with `pub static FOO: &'static Encoding`
1285
/// items.
1286
pub static ISO_8859_4_INIT: Encoding = Encoding {
1287
    name: "ISO-8859-4",
1288
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1289
};
1290
1291
/// The ISO-8859-4 encoding.
1292
///
1293
/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1294
///
1295
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1296
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1297
///
1298
/// This encoding matches the Windows code page 28594.
1299
///
1300
/// This will change from `static` to `const` if Rust changes
1301
/// to make the referent of `pub const FOO: &'static Encoding`
1302
/// unique cross-crate, so don't take the address of this
1303
/// `static`.
1304
pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1305
1306
/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1307
///
1308
/// For use only for taking the address of this form when
1309
/// Rust prohibits the use of the non-`_INIT` form directly,
1310
/// such as in initializers of other `static`s. If in doubt,
1311
/// use the corresponding non-`_INIT` reference-typed `static`.
1312
///
1313
/// This part of the public API will go away if Rust changes
1314
/// to make the referent of `pub const FOO: &'static Encoding`
1315
/// unique cross-crate or if Rust starts allowing static arrays
1316
/// to be initialized with `pub static FOO: &'static Encoding`
1317
/// items.
1318
pub static ISO_8859_5_INIT: Encoding = Encoding {
1319
    name: "ISO-8859-5",
1320
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1321
};
1322
1323
/// The ISO-8859-5 encoding.
1324
///
1325
/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1326
///
1327
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1328
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1329
///
1330
/// This encoding matches the Windows code page 28595.
1331
///
1332
/// This will change from `static` to `const` if Rust changes
1333
/// to make the referent of `pub const FOO: &'static Encoding`
1334
/// unique cross-crate, so don't take the address of this
1335
/// `static`.
1336
pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1337
1338
/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1339
///
1340
/// For use only for taking the address of this form when
1341
/// Rust prohibits the use of the non-`_INIT` form directly,
1342
/// such as in initializers of other `static`s. If in doubt,
1343
/// use the corresponding non-`_INIT` reference-typed `static`.
1344
///
1345
/// This part of the public API will go away if Rust changes
1346
/// to make the referent of `pub const FOO: &'static Encoding`
1347
/// unique cross-crate or if Rust starts allowing static arrays
1348
/// to be initialized with `pub static FOO: &'static Encoding`
1349
/// items.
1350
pub static ISO_8859_6_INIT: Encoding = Encoding {
1351
    name: "ISO-8859-6",
1352
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1353
};
1354
1355
/// The ISO-8859-6 encoding.
1356
///
1357
/// This is the Arabic part of the ISO/IEC 8859 encoding family.
1358
///
1359
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1360
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1361
///
1362
/// This encoding matches the Windows code page 28596, except Windows decodes
1363
/// unassigned code points to the Private Use Area of Unicode.
1364
///
1365
/// This will change from `static` to `const` if Rust changes
1366
/// to make the referent of `pub const FOO: &'static Encoding`
1367
/// unique cross-crate, so don't take the address of this
1368
/// `static`.
1369
pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1370
1371
/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1372
///
1373
/// For use only for taking the address of this form when
1374
/// Rust prohibits the use of the non-`_INIT` form directly,
1375
/// such as in initializers of other `static`s. If in doubt,
1376
/// use the corresponding non-`_INIT` reference-typed `static`.
1377
///
1378
/// This part of the public API will go away if Rust changes
1379
/// to make the referent of `pub const FOO: &'static Encoding`
1380
/// unique cross-crate or if Rust starts allowing static arrays
1381
/// to be initialized with `pub static FOO: &'static Encoding`
1382
/// items.
1383
pub static ISO_8859_7_INIT: Encoding = Encoding {
1384
    name: "ISO-8859-7",
1385
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1386
};
1387
1388
/// The ISO-8859-7 encoding.
1389
///
1390
/// This is the Greek part of the ISO/IEC 8859 encoding family.
1391
///
1392
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1393
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1394
///
1395
/// This encoding roughly matches the Windows code page 28597. Windows decodes
1396
/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1397
/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1398
/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1399
/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1400
/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1401
///
1402
/// This will change from `static` to `const` if Rust changes
1403
/// to make the referent of `pub const FOO: &'static Encoding`
1404
/// unique cross-crate, so don't take the address of this
1405
/// `static`.
1406
pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1407
1408
/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1409
///
1410
/// For use only for taking the address of this form when
1411
/// Rust prohibits the use of the non-`_INIT` form directly,
1412
/// such as in initializers of other `static`s. If in doubt,
1413
/// use the corresponding non-`_INIT` reference-typed `static`.
1414
///
1415
/// This part of the public API will go away if Rust changes
1416
/// to make the referent of `pub const FOO: &'static Encoding`
1417
/// unique cross-crate or if Rust starts allowing static arrays
1418
/// to be initialized with `pub static FOO: &'static Encoding`
1419
/// items.
1420
pub static ISO_8859_8_INIT: Encoding = Encoding {
1421
    name: "ISO-8859-8",
1422
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1423
};
1424
1425
/// The ISO-8859-8 encoding.
1426
///
1427
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1428
///
1429
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1430
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1431
///
1432
/// This encoding roughly matches the Windows code page 28598. Windows decodes
1433
/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
1434
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1435
/// the private use area.
1436
///
1437
/// This will change from `static` to `const` if Rust changes
1438
/// to make the referent of `pub const FOO: &'static Encoding`
1439
/// unique cross-crate, so don't take the address of this
1440
/// `static`.
1441
pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1442
1443
/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1444
///
1445
/// For use only for taking the address of this form when
1446
/// Rust prohibits the use of the non-`_INIT` form directly,
1447
/// such as in initializers of other `static`s. If in doubt,
1448
/// use the corresponding non-`_INIT` reference-typed `static`.
1449
///
1450
/// This part of the public API will go away if Rust changes
1451
/// to make the referent of `pub const FOO: &'static Encoding`
1452
/// unique cross-crate or if Rust starts allowing static arrays
1453
/// to be initialized with `pub static FOO: &'static Encoding`
1454
/// items.
1455
pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1456
    name: "ISO-8859-8-I",
1457
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1458
};
1459
1460
/// The ISO-8859-8-I encoding.
1461
///
1462
/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1463
///
1464
/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1465
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1466
///
1467
/// This encoding roughly matches the Windows code page 38598. Windows decodes
1468
/// 0xAF to OVERLINE instead of MACRON and 0xFD and 0xFE to the Private Use
1469
/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1470
/// the private use area.
1471
///
1472
/// This will change from `static` to `const` if Rust changes
1473
/// to make the referent of `pub const FOO: &'static Encoding`
1474
/// unique cross-crate, so don't take the address of this
1475
/// `static`.
1476
pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1477
1478
/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1479
///
1480
/// For use only for taking the address of this form when
1481
/// Rust prohibits the use of the non-`_INIT` form directly,
1482
/// such as in initializers of other `static`s. If in doubt,
1483
/// use the corresponding non-`_INIT` reference-typed `static`.
1484
///
1485
/// This part of the public API will go away if Rust changes
1486
/// to make the referent of `pub const FOO: &'static Encoding`
1487
/// unique cross-crate or if Rust starts allowing static arrays
1488
/// to be initialized with `pub static FOO: &'static Encoding`
1489
/// items.
1490
pub static KOI8_R_INIT: Encoding = Encoding {
1491
    name: "KOI8-R",
1492
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1493
};
1494
1495
/// The KOI8-R encoding.
1496
///
1497
/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1498
///
1499
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1500
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1501
///
1502
/// This encoding matches the Windows code page 20866.
1503
///
1504
/// This will change from `static` to `const` if Rust changes
1505
/// to make the referent of `pub const FOO: &'static Encoding`
1506
/// unique cross-crate, so don't take the address of this
1507
/// `static`.
1508
pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1509
1510
/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1511
///
1512
/// For use only for taking the address of this form when
1513
/// Rust prohibits the use of the non-`_INIT` form directly,
1514
/// such as in initializers of other `static`s. If in doubt,
1515
/// use the corresponding non-`_INIT` reference-typed `static`.
1516
///
1517
/// This part of the public API will go away if Rust changes
1518
/// to make the referent of `pub const FOO: &'static Encoding`
1519
/// unique cross-crate or if Rust starts allowing static arrays
1520
/// to be initialized with `pub static FOO: &'static Encoding`
1521
/// items.
1522
pub static KOI8_U_INIT: Encoding = Encoding {
1523
    name: "KOI8-U",
1524
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1525
};
1526
1527
/// The KOI8-U encoding.
1528
///
1529
/// This is an encoding for Ukrainian adapted from KOI8-R.
1530
///
1531
/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1532
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1533
///
1534
/// This encoding matches the Windows code page 21866.
1535
///
1536
/// This will change from `static` to `const` if Rust changes
1537
/// to make the referent of `pub const FOO: &'static Encoding`
1538
/// unique cross-crate, so don't take the address of this
1539
/// `static`.
1540
pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1541
1542
/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1543
///
1544
/// For use only for taking the address of this form when
1545
/// Rust prohibits the use of the non-`_INIT` form directly,
1546
/// such as in initializers of other `static`s. If in doubt,
1547
/// use the corresponding non-`_INIT` reference-typed `static`.
1548
///
1549
/// This part of the public API will go away if Rust changes
1550
/// to make the referent of `pub const FOO: &'static Encoding`
1551
/// unique cross-crate or if Rust starts allowing static arrays
1552
/// to be initialized with `pub static FOO: &'static Encoding`
1553
/// items.
1554
pub static SHIFT_JIS_INIT: Encoding = Encoding {
1555
    name: "Shift_JIS",
1556
    variant: VariantEncoding::ShiftJis,
1557
};
1558
1559
/// The Shift_JIS encoding.
1560
///
1561
/// This is the Japanese encoding for Windows.
1562
///
1563
/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1564
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1565
///
1566
/// This encoding matches the Windows code page 932, except Windows decodes some byte
1567
/// sequences that are errors per the Encoding Standard to the question mark or the
1568
/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1569
///
1570
/// This will change from `static` to `const` if Rust changes
1571
/// to make the referent of `pub const FOO: &'static Encoding`
1572
/// unique cross-crate, so don't take the address of this
1573
/// `static`.
1574
pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1575
1576
/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1577
///
1578
/// For use only for taking the address of this form when
1579
/// Rust prohibits the use of the non-`_INIT` form directly,
1580
/// such as in initializers of other `static`s. If in doubt,
1581
/// use the corresponding non-`_INIT` reference-typed `static`.
1582
///
1583
/// This part of the public API will go away if Rust changes
1584
/// to make the referent of `pub const FOO: &'static Encoding`
1585
/// unique cross-crate or if Rust starts allowing static arrays
1586
/// to be initialized with `pub static FOO: &'static Encoding`
1587
/// items.
1588
pub static UTF_16BE_INIT: Encoding = Encoding {
1589
    name: "UTF-16BE",
1590
    variant: VariantEncoding::Utf16Be,
1591
};
1592
1593
/// The UTF-16BE encoding.
1594
///
1595
/// This decode-only encoding uses 16-bit code units due to Unicode originally
1596
/// having been designed as a 16-bit repertoire. In the absence of a byte order
1597
/// mark the big endian byte order is assumed.
1598
///
1599
/// There is no corresponding encoder in this crate or in the Encoding
1600
/// Standard. The output encoding of this encoding is UTF-8.
1601
///
1602
/// This encoding matches the Windows code page 1201.
1603
///
1604
/// This will change from `static` to `const` if Rust changes
1605
/// to make the referent of `pub const FOO: &'static Encoding`
1606
/// unique cross-crate, so don't take the address of this
1607
/// `static`.
1608
pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1609
1610
/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1611
///
1612
/// For use only for taking the address of this form when
1613
/// Rust prohibits the use of the non-`_INIT` form directly,
1614
/// such as in initializers of other `static`s. If in doubt,
1615
/// use the corresponding non-`_INIT` reference-typed `static`.
1616
///
1617
/// This part of the public API will go away if Rust changes
1618
/// to make the referent of `pub const FOO: &'static Encoding`
1619
/// unique cross-crate or if Rust starts allowing static arrays
1620
/// to be initialized with `pub static FOO: &'static Encoding`
1621
/// items.
1622
pub static UTF_16LE_INIT: Encoding = Encoding {
1623
    name: "UTF-16LE",
1624
    variant: VariantEncoding::Utf16Le,
1625
};
1626
1627
/// The UTF-16LE encoding.
1628
///
1629
/// This decode-only encoding uses 16-bit code units due to Unicode originally
1630
/// having been designed as a 16-bit repertoire. In the absence of a byte order
1631
/// mark the little endian byte order is assumed.
1632
///
1633
/// There is no corresponding encoder in this crate or in the Encoding
1634
/// Standard. The output encoding of this encoding is UTF-8.
1635
///
1636
/// This encoding matches the Windows code page 1200.
1637
///
1638
/// This will change from `static` to `const` if Rust changes
1639
/// to make the referent of `pub const FOO: &'static Encoding`
1640
/// unique cross-crate, so don't take the address of this
1641
/// `static`.
1642
pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1643
1644
/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1645
///
1646
/// For use only for taking the address of this form when
1647
/// Rust prohibits the use of the non-`_INIT` form directly,
1648
/// such as in initializers of other `static`s. If in doubt,
1649
/// use the corresponding non-`_INIT` reference-typed `static`.
1650
///
1651
/// This part of the public API will go away if Rust changes
1652
/// to make the referent of `pub const FOO: &'static Encoding`
1653
/// unique cross-crate or if Rust starts allowing static arrays
1654
/// to be initialized with `pub static FOO: &'static Encoding`
1655
/// items.
1656
pub static UTF_8_INIT: Encoding = Encoding {
1657
    name: "UTF-8",
1658
    variant: VariantEncoding::Utf8,
1659
};
1660
1661
/// The UTF-8 encoding.
1662
///
1663
/// This is the encoding that should be used for all new development, as it can
1664
/// represent all of Unicode.
1665
///
1666
/// This encoding matches the Windows code page 65001, except Windows differs
1667
/// in the number of errors generated for some erroneous byte sequences.
1668
///
1669
/// This will change from `static` to `const` if Rust changes
1670
/// to make the referent of `pub const FOO: &'static Encoding`
1671
/// unique cross-crate, so don't take the address of this
1672
/// `static`.
1673
pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1674
1675
/// The initializer for the [gb18030](static.GB18030.html) encoding.
1676
///
1677
/// For use only for taking the address of this form when
1678
/// Rust prohibits the use of the non-`_INIT` form directly,
1679
/// such as in initializers of other `static`s. If in doubt,
1680
/// use the corresponding non-`_INIT` reference-typed `static`.
1681
///
1682
/// This part of the public API will go away if Rust changes
1683
/// to make the referent of `pub const FOO: &'static Encoding`
1684
/// unique cross-crate or if Rust starts allowing static arrays
1685
/// to be initialized with `pub static FOO: &'static Encoding`
1686
/// items.
1687
pub static GB18030_INIT: Encoding = Encoding {
1688
    name: "gb18030",
1689
    variant: VariantEncoding::Gb18030,
1690
};
1691
1692
/// The gb18030 encoding.
1693
///
1694
/// This encoding matches GB18030-2022 except the two-byte sequence 0xA3 0xA0
1695
/// maps to U+3000 for compatibility with existing Web content and the four-byte
1696
/// sequences for the non-PUA characters that got two-byte sequences still decode
1697
/// to the same non-PUA characters as in GB18030-2005. As a result, this encoding
1698
/// can represent all of Unicode except for 19 private-use characters.
1699
///
1700
/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1701
/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1702
///
1703
/// This encoding matches the Windows code page 54936.
1704
///
1705
/// This will change from `static` to `const` if Rust changes
1706
/// to make the referent of `pub const FOO: &'static Encoding`
1707
/// unique cross-crate, so don't take the address of this
1708
/// `static`.
1709
pub static GB18030: &'static Encoding = &GB18030_INIT;
1710
1711
/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1712
///
1713
/// For use only for taking the address of this form when
1714
/// Rust prohibits the use of the non-`_INIT` form directly,
1715
/// such as in initializers of other `static`s. If in doubt,
1716
/// use the corresponding non-`_INIT` reference-typed `static`.
1717
///
1718
/// This part of the public API will go away if Rust changes
1719
/// to make the referent of `pub const FOO: &'static Encoding`
1720
/// unique cross-crate or if Rust starts allowing static arrays
1721
/// to be initialized with `pub static FOO: &'static Encoding`
1722
/// items.
1723
pub static MACINTOSH_INIT: Encoding = Encoding {
1724
    name: "macintosh",
1725
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1726
};
1727
1728
/// The macintosh encoding.
1729
///
1730
/// This is the MacRoman encoding from Mac OS Classic.
1731
///
1732
/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1733
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1734
///
1735
/// This encoding matches the Windows code page 10000, except Windows decodes
1736
/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1737
///
1738
/// This will change from `static` to `const` if Rust changes
1739
/// to make the referent of `pub const FOO: &'static Encoding`
1740
/// unique cross-crate, so don't take the address of this
1741
/// `static`.
1742
pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1743
1744
/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1745
///
1746
/// For use only for taking the address of this form when
1747
/// Rust prohibits the use of the non-`_INIT` form directly,
1748
/// such as in initializers of other `static`s. If in doubt,
1749
/// use the corresponding non-`_INIT` reference-typed `static`.
1750
///
1751
/// This part of the public API will go away if Rust changes
1752
/// to make the referent of `pub const FOO: &'static Encoding`
1753
/// unique cross-crate or if Rust starts allowing static arrays
1754
/// to be initialized with `pub static FOO: &'static Encoding`
1755
/// items.
1756
pub static REPLACEMENT_INIT: Encoding = Encoding {
1757
    name: "replacement",
1758
    variant: VariantEncoding::Replacement,
1759
};
1760
1761
/// The replacement encoding.
1762
///
1763
/// This decode-only encoding decodes all non-zero-length streams to a single
1764
/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1765
/// ASCII-compatible fallback encoding (typically windows-1252) for some
1766
/// encodings that are no longer supported by the Web Platform and that
1767
/// would be dangerous to treat as ASCII-compatible.
1768
///
1769
/// There is no corresponding encoder. The output encoding of this encoding
1770
/// is UTF-8.
1771
///
1772
/// This encoding does not have a Windows code page number.
1773
///
1774
/// This will change from `static` to `const` if Rust changes
1775
/// to make the referent of `pub const FOO: &'static Encoding`
1776
/// unique cross-crate, so don't take the address of this
1777
/// `static`.
1778
pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1779
1780
/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1781
///
1782
/// For use only for taking the address of this form when
1783
/// Rust prohibits the use of the non-`_INIT` form directly,
1784
/// such as in initializers of other `static`s. If in doubt,
1785
/// use the corresponding non-`_INIT` reference-typed `static`.
1786
///
1787
/// This part of the public API will go away if Rust changes
1788
/// to make the referent of `pub const FOO: &'static Encoding`
1789
/// unique cross-crate or if Rust starts allowing static arrays
1790
/// to be initialized with `pub static FOO: &'static Encoding`
1791
/// items.
1792
pub static WINDOWS_1250_INIT: Encoding = Encoding {
1793
    name: "windows-1250",
1794
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1795
};
1796
1797
/// The windows-1250 encoding.
1798
///
1799
/// This is the Central European encoding for Windows.
1800
///
1801
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1802
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1803
///
1804
/// This encoding matches the Windows code page 1250.
1805
///
1806
/// This will change from `static` to `const` if Rust changes
1807
/// to make the referent of `pub const FOO: &'static Encoding`
1808
/// unique cross-crate, so don't take the address of this
1809
/// `static`.
1810
pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1811
1812
/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1813
///
1814
/// For use only for taking the address of this form when
1815
/// Rust prohibits the use of the non-`_INIT` form directly,
1816
/// such as in initializers of other `static`s. If in doubt,
1817
/// use the corresponding non-`_INIT` reference-typed `static`.
1818
///
1819
/// This part of the public API will go away if Rust changes
1820
/// to make the referent of `pub const FOO: &'static Encoding`
1821
/// unique cross-crate or if Rust starts allowing static arrays
1822
/// to be initialized with `pub static FOO: &'static Encoding`
1823
/// items.
1824
pub static WINDOWS_1251_INIT: Encoding = Encoding {
1825
    name: "windows-1251",
1826
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1827
};
1828
1829
/// The windows-1251 encoding.
1830
///
1831
/// This is the Cyrillic encoding for Windows.
1832
///
1833
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1834
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1835
///
1836
/// This encoding matches the Windows code page 1251.
1837
///
1838
/// This will change from `static` to `const` if Rust changes
1839
/// to make the referent of `pub const FOO: &'static Encoding`
1840
/// unique cross-crate, so don't take the address of this
1841
/// `static`.
1842
pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1843
1844
/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1845
///
1846
/// For use only for taking the address of this form when
1847
/// Rust prohibits the use of the non-`_INIT` form directly,
1848
/// such as in initializers of other `static`s. If in doubt,
1849
/// use the corresponding non-`_INIT` reference-typed `static`.
1850
///
1851
/// This part of the public API will go away if Rust changes
1852
/// to make the referent of `pub const FOO: &'static Encoding`
1853
/// unique cross-crate or if Rust starts allowing static arrays
1854
/// to be initialized with `pub static FOO: &'static Encoding`
1855
/// items.
1856
pub static WINDOWS_1252_INIT: Encoding = Encoding {
1857
    name: "windows-1252",
1858
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1859
};
1860
1861
/// The windows-1252 encoding.
1862
///
1863
/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1864
/// which is known as Latin 1.
1865
///
1866
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1867
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1868
///
1869
/// This encoding matches the Windows code page 1252.
1870
///
1871
/// This will change from `static` to `const` if Rust changes
1872
/// to make the referent of `pub const FOO: &'static Encoding`
1873
/// unique cross-crate, so don't take the address of this
1874
/// `static`.
1875
pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1876
1877
/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1878
///
1879
/// For use only for taking the address of this form when
1880
/// Rust prohibits the use of the non-`_INIT` form directly,
1881
/// such as in initializers of other `static`s. If in doubt,
1882
/// use the corresponding non-`_INIT` reference-typed `static`.
1883
///
1884
/// This part of the public API will go away if Rust changes
1885
/// to make the referent of `pub const FOO: &'static Encoding`
1886
/// unique cross-crate or if Rust starts allowing static arrays
1887
/// to be initialized with `pub static FOO: &'static Encoding`
1888
/// items.
1889
pub static WINDOWS_1253_INIT: Encoding = Encoding {
1890
    name: "windows-1253",
1891
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1892
};
1893
1894
/// The windows-1253 encoding.
1895
///
1896
/// This is the Greek encoding for Windows. It is mostly an extension of
1897
/// ISO-8859-7, but U+0386 is mapped to a different byte.
1898
///
1899
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
1900
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
1901
///
1902
/// This encoding matches the Windows code page 1253, except Windows decodes
1903
/// unassigned code points to the Private Use Area of Unicode.
1904
///
1905
/// This will change from `static` to `const` if Rust changes
1906
/// to make the referent of `pub const FOO: &'static Encoding`
1907
/// unique cross-crate, so don't take the address of this
1908
/// `static`.
1909
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1910
1911
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
1912
///
1913
/// For use only for taking the address of this form when
1914
/// Rust prohibits the use of the non-`_INIT` form directly,
1915
/// such as in initializers of other `static`s. If in doubt,
1916
/// use the corresponding non-`_INIT` reference-typed `static`.
1917
///
1918
/// This part of the public API will go away if Rust changes
1919
/// to make the referent of `pub const FOO: &'static Encoding`
1920
/// unique cross-crate or if Rust starts allowing static arrays
1921
/// to be initialized with `pub static FOO: &'static Encoding`
1922
/// items.
1923
pub static WINDOWS_1254_INIT: Encoding = Encoding {
1924
    name: "windows-1254",
1925
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
1926
};
1927
1928
/// The windows-1254 encoding.
1929
///
1930
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
1931
/// which is known as Latin 5.
1932
///
1933
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
1934
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
1935
///
1936
/// This encoding matches the Windows code page 1254.
1937
///
1938
/// This will change from `static` to `const` if Rust changes
1939
/// to make the referent of `pub const FOO: &'static Encoding`
1940
/// unique cross-crate, so don't take the address of this
1941
/// `static`.
1942
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1943
1944
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
1945
///
1946
/// For use only for taking the address of this form when
1947
/// Rust prohibits the use of the non-`_INIT` form directly,
1948
/// such as in initializers of other `static`s. If in doubt,
1949
/// use the corresponding non-`_INIT` reference-typed `static`.
1950
///
1951
/// This part of the public API will go away if Rust changes
1952
/// to make the referent of `pub const FOO: &'static Encoding`
1953
/// unique cross-crate or if Rust starts allowing static arrays
1954
/// to be initialized with `pub static FOO: &'static Encoding`
1955
/// items.
1956
pub static WINDOWS_1255_INIT: Encoding = Encoding {
1957
    name: "windows-1255",
1958
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
1959
};
1960
1961
/// The windows-1255 encoding.
1962
///
1963
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
1964
/// except for a currency sign swap.
1965
///
1966
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
1967
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
1968
///
1969
/// This encoding matches the Windows code page 1255, except Windows decodes
1970
/// unassigned code points to the Private Use Area of Unicode.
1971
///
1972
/// This will change from `static` to `const` if Rust changes
1973
/// to make the referent of `pub const FOO: &'static Encoding`
1974
/// unique cross-crate, so don't take the address of this
1975
/// `static`.
1976
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1977
1978
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
1979
///
1980
/// For use only for taking the address of this form when
1981
/// Rust prohibits the use of the non-`_INIT` form directly,
1982
/// such as in initializers of other `static`s. If in doubt,
1983
/// use the corresponding non-`_INIT` reference-typed `static`.
1984
///
1985
/// This part of the public API will go away if Rust changes
1986
/// to make the referent of `pub const FOO: &'static Encoding`
1987
/// unique cross-crate or if Rust starts allowing static arrays
1988
/// to be initialized with `pub static FOO: &'static Encoding`
1989
/// items.
1990
pub static WINDOWS_1256_INIT: Encoding = Encoding {
1991
    name: "windows-1256",
1992
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
1993
};
1994
1995
/// The windows-1256 encoding.
1996
///
1997
/// This is the Arabic encoding for Windows.
1998
///
1999
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
2000
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
2001
///
2002
/// This encoding matches the Windows code page 1256.
2003
///
2004
/// This will change from `static` to `const` if Rust changes
2005
/// to make the referent of `pub const FOO: &'static Encoding`
2006
/// unique cross-crate, so don't take the address of this
2007
/// `static`.
2008
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
2009
2010
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
2011
///
2012
/// For use only for taking the address of this form when
2013
/// Rust prohibits the use of the non-`_INIT` form directly,
2014
/// such as in initializers of other `static`s. If in doubt,
2015
/// use the corresponding non-`_INIT` reference-typed `static`.
2016
///
2017
/// This part of the public API will go away if Rust changes
2018
/// to make the referent of `pub const FOO: &'static Encoding`
2019
/// unique cross-crate or if Rust starts allowing static arrays
2020
/// to be initialized with `pub static FOO: &'static Encoding`
2021
/// items.
2022
pub static WINDOWS_1257_INIT: Encoding = Encoding {
2023
    name: "windows-1257",
2024
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
2025
};
2026
2027
/// The windows-1257 encoding.
2028
///
2029
/// This is the Baltic encoding for Windows.
2030
///
2031
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
2032
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
2033
///
2034
/// This encoding matches the Windows code page 1257, except Windows decodes
2035
/// unassigned code points to the Private Use Area of Unicode.
2036
///
2037
/// This will change from `static` to `const` if Rust changes
2038
/// to make the referent of `pub const FOO: &'static Encoding`
2039
/// unique cross-crate, so don't take the address of this
2040
/// `static`.
2041
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2042
2043
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
2044
///
2045
/// For use only for taking the address of this form when
2046
/// Rust prohibits the use of the non-`_INIT` form directly,
2047
/// such as in initializers of other `static`s. If in doubt,
2048
/// use the corresponding non-`_INIT` reference-typed `static`.
2049
///
2050
/// This part of the public API will go away if Rust changes
2051
/// to make the referent of `pub const FOO: &'static Encoding`
2052
/// unique cross-crate or if Rust starts allowing static arrays
2053
/// to be initialized with `pub static FOO: &'static Encoding`
2054
/// items.
2055
pub static WINDOWS_1258_INIT: Encoding = Encoding {
2056
    name: "windows-1258",
2057
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
2058
};
2059
2060
/// The windows-1258 encoding.
2061
///
2062
/// This is the Vietnamese encoding for Windows.
2063
///
2064
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
2065
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
2066
///
2067
/// This encoding matches the Windows code page 1258 when used in the
2068
/// non-normalizing mode. Unlike with the other single-byte encodings, the
2069
/// result of decoding is not necessarily in Normalization Form C. On the
2070
/// other hand, input in the Normalization Form C is not encoded without
2071
/// replacement. In general, it's a bad idea to encode to encodings other
2072
/// than UTF-8, but this encoding is especially hazardous to encode to.
2073
///
2074
/// This will change from `static` to `const` if Rust changes
2075
/// to make the referent of `pub const FOO: &'static Encoding`
2076
/// unique cross-crate, so don't take the address of this
2077
/// `static`.
2078
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2079
2080
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
2081
///
2082
/// For use only for taking the address of this form when
2083
/// Rust prohibits the use of the non-`_INIT` form directly,
2084
/// such as in initializers of other `static`s. If in doubt,
2085
/// use the corresponding non-`_INIT` reference-typed `static`.
2086
///
2087
/// This part of the public API will go away if Rust changes
2088
/// to make the referent of `pub const FOO: &'static Encoding`
2089
/// unique cross-crate or if Rust starts allowing static arrays
2090
/// to be initialized with `pub static FOO: &'static Encoding`
2091
/// items.
2092
pub static WINDOWS_874_INIT: Encoding = Encoding {
2093
    name: "windows-874",
2094
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
2095
};
2096
2097
/// The windows-874 encoding.
2098
///
2099
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
2100
///
2101
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
2102
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
2103
///
2104
/// This encoding matches the Windows code page 874, except Windows decodes
2105
/// unassigned code points to the Private Use Area of Unicode.
2106
///
2107
/// This will change from `static` to `const` if Rust changes
2108
/// to make the referent of `pub const FOO: &'static Encoding`
2109
/// unique cross-crate, so don't take the address of this
2110
/// `static`.
2111
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2112
2113
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
2114
///
2115
/// For use only for taking the address of this form when
2116
/// Rust prohibits the use of the non-`_INIT` form directly,
2117
/// such as in initializers of other `static`s. If in doubt,
2118
/// use the corresponding non-`_INIT` reference-typed `static`.
2119
///
2120
/// This part of the public API will go away if Rust changes
2121
/// to make the referent of `pub const FOO: &'static Encoding`
2122
/// unique cross-crate or if Rust starts allowing static arrays
2123
/// to be initialized with `pub static FOO: &'static Encoding`
2124
/// items.
2125
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
2126
    name: "x-mac-cyrillic",
2127
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
2128
};
2129
2130
/// The x-mac-cyrillic encoding.
2131
///
2132
/// This is the MacUkrainian encoding from Mac OS Classic.
2133
///
2134
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
2135
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
2136
///
2137
/// This encoding matches the Windows code page 10017.
2138
///
2139
/// This will change from `static` to `const` if Rust changes
2140
/// to make the referent of `pub const FOO: &'static Encoding`
2141
/// unique cross-crate, so don't take the address of this
2142
/// `static`.
2143
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2144
2145
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
2146
///
2147
/// For use only for taking the address of this form when
2148
/// Rust prohibits the use of the non-`_INIT` form directly,
2149
/// such as in initializers of other `static`s. If in doubt,
2150
/// use the corresponding non-`_INIT` reference-typed `static`.
2151
///
2152
/// This part of the public API will go away if Rust changes
2153
/// to make the referent of `pub const FOO: &'static Encoding`
2154
/// unique cross-crate or if Rust starts allowing static arrays
2155
/// to be initialized with `pub static FOO: &'static Encoding`
2156
/// items.
2157
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
2158
    name: "x-user-defined",
2159
    variant: VariantEncoding::UserDefined,
2160
};
2161
2162
/// The x-user-defined encoding.
2163
///
2164
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
2165
/// them to the Private Use Area of Unicode. It was used for loading binary
2166
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
2167
/// the `"arraybuffer"` response type.
2168
///
2169
/// This encoding does not have a Windows code page number.
2170
///
2171
/// This will change from `static` to `const` if Rust changes
2172
/// to make the referent of `pub const FOO: &'static Encoding`
2173
/// unique cross-crate, so don't take the address of this
2174
/// `static`.
2175
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2176
2177
static LABELS_SORTED: [&'static str; 228] = [
2178
    "l1",
2179
    "l2",
2180
    "l3",
2181
    "l4",
2182
    "l5",
2183
    "l6",
2184
    "l9",
2185
    "866",
2186
    "mac",
2187
    "koi",
2188
    "gbk",
2189
    "big5",
2190
    "utf8",
2191
    "koi8",
2192
    "sjis",
2193
    "ucs-2",
2194
    "ms932",
2195
    "cp866",
2196
    "utf-8",
2197
    "cp819",
2198
    "ascii",
2199
    "x-gbk",
2200
    "greek",
2201
    "cp1250",
2202
    "cp1251",
2203
    "latin1",
2204
    "gb2312",
2205
    "cp1252",
2206
    "latin2",
2207
    "cp1253",
2208
    "latin3",
2209
    "cp1254",
2210
    "latin4",
2211
    "cp1255",
2212
    "csbig5",
2213
    "latin5",
2214
    "utf-16",
2215
    "cp1256",
2216
    "ibm866",
2217
    "latin6",
2218
    "cp1257",
2219
    "cp1258",
2220
    "greek8",
2221
    "ibm819",
2222
    "arabic",
2223
    "visual",
2224
    "korean",
2225
    "euc-jp",
2226
    "koi8-r",
2227
    "koi8_r",
2228
    "euc-kr",
2229
    "x-sjis",
2230
    "koi8-u",
2231
    "hebrew",
2232
    "tis-620",
2233
    "gb18030",
2234
    "ksc5601",
2235
    "gb_2312",
2236
    "dos-874",
2237
    "cn-big5",
2238
    "unicode",
2239
    "chinese",
2240
    "logical",
2241
    "cskoi8r",
2242
    "cseuckr",
2243
    "koi8-ru",
2244
    "x-cp1250",
2245
    "ksc_5601",
2246
    "x-cp1251",
2247
    "iso88591",
2248
    "csgb2312",
2249
    "x-cp1252",
2250
    "iso88592",
2251
    "x-cp1253",
2252
    "iso88593",
2253
    "ecma-114",
2254
    "x-cp1254",
2255
    "iso88594",
2256
    "x-cp1255",
2257
    "iso88595",
2258
    "x-x-big5",
2259
    "x-cp1256",
2260
    "csibm866",
2261
    "iso88596",
2262
    "x-cp1257",
2263
    "iso88597",
2264
    "asmo-708",
2265
    "ecma-118",
2266
    "elot_928",
2267
    "x-cp1258",
2268
    "iso88598",
2269
    "iso88599",
2270
    "cyrillic",
2271
    "utf-16be",
2272
    "utf-16le",
2273
    "us-ascii",
2274
    "ms_kanji",
2275
    "x-euc-jp",
2276
    "iso885910",
2277
    "iso8859-1",
2278
    "iso885911",
2279
    "iso8859-2",
2280
    "iso8859-3",
2281
    "iso885913",
2282
    "iso8859-4",
2283
    "iso885914",
2284
    "iso8859-5",
2285
    "iso885915",
2286
    "iso8859-6",
2287
    "iso8859-7",
2288
    "iso8859-8",
2289
    "iso-ir-58",
2290
    "iso8859-9",
2291
    "csunicode",
2292
    "macintosh",
2293
    "shift-jis",
2294
    "shift_jis",
2295
    "iso-ir-100",
2296
    "iso8859-10",
2297
    "iso-ir-110",
2298
    "gb_2312-80",
2299
    "iso-8859-1",
2300
    "iso_8859-1",
2301
    "iso-ir-101",
2302
    "iso8859-11",
2303
    "iso-8859-2",
2304
    "iso_8859-2",
2305
    "hz-gb-2312",
2306
    "iso-8859-3",
2307
    "iso_8859-3",
2308
    "iso8859-13",
2309
    "iso-8859-4",
2310
    "iso_8859-4",
2311
    "iso8859-14",
2312
    "iso-ir-144",
2313
    "iso-8859-5",
2314
    "iso_8859-5",
2315
    "iso8859-15",
2316
    "iso-8859-6",
2317
    "iso_8859-6",
2318
    "iso-ir-126",
2319
    "iso-8859-7",
2320
    "iso_8859-7",
2321
    "iso-ir-127",
2322
    "iso-ir-157",
2323
    "iso-8859-8",
2324
    "iso_8859-8",
2325
    "iso-ir-138",
2326
    "iso-ir-148",
2327
    "iso-8859-9",
2328
    "iso_8859-9",
2329
    "iso-ir-109",
2330
    "iso-ir-149",
2331
    "big5-hkscs",
2332
    "csshiftjis",
2333
    "iso-8859-10",
2334
    "iso-8859-11",
2335
    "csisolatin1",
2336
    "csisolatin2",
2337
    "iso-8859-13",
2338
    "csisolatin3",
2339
    "iso-8859-14",
2340
    "windows-874",
2341
    "csisolatin4",
2342
    "iso-8859-15",
2343
    "iso_8859-15",
2344
    "csisolatin5",
2345
    "iso-8859-16",
2346
    "csisolatin6",
2347
    "windows-949",
2348
    "csisolatin9",
2349
    "csiso88596e",
2350
    "csiso88598e",
2351
    "unicodefffe",
2352
    "unicodefeff",
2353
    "csmacintosh",
2354
    "csiso88596i",
2355
    "csiso88598i",
2356
    "windows-31j",
2357
    "x-mac-roman",
2358
    "iso-2022-cn",
2359
    "iso-2022-jp",
2360
    "csiso2022jp",
2361
    "iso-2022-kr",
2362
    "csiso2022kr",
2363
    "replacement",
2364
    "windows-1250",
2365
    "windows-1251",
2366
    "windows-1252",
2367
    "windows-1253",
2368
    "windows-1254",
2369
    "windows-1255",
2370
    "windows-1256",
2371
    "windows-1257",
2372
    "windows-1258",
2373
    "iso-8859-6-e",
2374
    "iso-8859-8-e",
2375
    "iso-8859-6-i",
2376
    "iso-8859-8-i",
2377
    "sun_eu_greek",
2378
    "csksc56011987",
2379
    "unicode20utf8",
2380
    "unicode11utf8",
2381
    "ks_c_5601-1987",
2382
    "ansi_x3.4-1968",
2383
    "ks_c_5601-1989",
2384
    "x-mac-cyrillic",
2385
    "x-user-defined",
2386
    "csiso58gb231280",
2387
    "iso-10646-ucs-2",
2388
    "iso_8859-1:1987",
2389
    "iso_8859-2:1987",
2390
    "iso_8859-6:1987",
2391
    "iso_8859-7:1987",
2392
    "iso_8859-3:1988",
2393
    "iso_8859-4:1988",
2394
    "iso_8859-5:1988",
2395
    "iso_8859-8:1988",
2396
    "x-unicode20utf8",
2397
    "iso_8859-9:1989",
2398
    "csisolatingreek",
2399
    "x-mac-ukrainian",
2400
    "iso-2022-cn-ext",
2401
    "csisolatinarabic",
2402
    "csisolatinhebrew",
2403
    "unicode-1-1-utf-8",
2404
    "csisolatincyrillic",
2405
    "cseucpkdfmtjapanese",
2406
];
2407
2408
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 228] = [
2409
    &WINDOWS_1252_INIT,
2410
    &ISO_8859_2_INIT,
2411
    &ISO_8859_3_INIT,
2412
    &ISO_8859_4_INIT,
2413
    &WINDOWS_1254_INIT,
2414
    &ISO_8859_10_INIT,
2415
    &ISO_8859_15_INIT,
2416
    &IBM866_INIT,
2417
    &MACINTOSH_INIT,
2418
    &KOI8_R_INIT,
2419
    &GBK_INIT,
2420
    &BIG5_INIT,
2421
    &UTF_8_INIT,
2422
    &KOI8_R_INIT,
2423
    &SHIFT_JIS_INIT,
2424
    &UTF_16LE_INIT,
2425
    &SHIFT_JIS_INIT,
2426
    &IBM866_INIT,
2427
    &UTF_8_INIT,
2428
    &WINDOWS_1252_INIT,
2429
    &WINDOWS_1252_INIT,
2430
    &GBK_INIT,
2431
    &ISO_8859_7_INIT,
2432
    &WINDOWS_1250_INIT,
2433
    &WINDOWS_1251_INIT,
2434
    &WINDOWS_1252_INIT,
2435
    &GBK_INIT,
2436
    &WINDOWS_1252_INIT,
2437
    &ISO_8859_2_INIT,
2438
    &WINDOWS_1253_INIT,
2439
    &ISO_8859_3_INIT,
2440
    &WINDOWS_1254_INIT,
2441
    &ISO_8859_4_INIT,
2442
    &WINDOWS_1255_INIT,
2443
    &BIG5_INIT,
2444
    &WINDOWS_1254_INIT,
2445
    &UTF_16LE_INIT,
2446
    &WINDOWS_1256_INIT,
2447
    &IBM866_INIT,
2448
    &ISO_8859_10_INIT,
2449
    &WINDOWS_1257_INIT,
2450
    &WINDOWS_1258_INIT,
2451
    &ISO_8859_7_INIT,
2452
    &WINDOWS_1252_INIT,
2453
    &ISO_8859_6_INIT,
2454
    &ISO_8859_8_INIT,
2455
    &EUC_KR_INIT,
2456
    &EUC_JP_INIT,
2457
    &KOI8_R_INIT,
2458
    &KOI8_R_INIT,
2459
    &EUC_KR_INIT,
2460
    &SHIFT_JIS_INIT,
2461
    &KOI8_U_INIT,
2462
    &ISO_8859_8_INIT,
2463
    &WINDOWS_874_INIT,
2464
    &GB18030_INIT,
2465
    &EUC_KR_INIT,
2466
    &GBK_INIT,
2467
    &WINDOWS_874_INIT,
2468
    &BIG5_INIT,
2469
    &UTF_16LE_INIT,
2470
    &GBK_INIT,
2471
    &ISO_8859_8_I_INIT,
2472
    &KOI8_R_INIT,
2473
    &EUC_KR_INIT,
2474
    &KOI8_U_INIT,
2475
    &WINDOWS_1250_INIT,
2476
    &EUC_KR_INIT,
2477
    &WINDOWS_1251_INIT,
2478
    &WINDOWS_1252_INIT,
2479
    &GBK_INIT,
2480
    &WINDOWS_1252_INIT,
2481
    &ISO_8859_2_INIT,
2482
    &WINDOWS_1253_INIT,
2483
    &ISO_8859_3_INIT,
2484
    &ISO_8859_6_INIT,
2485
    &WINDOWS_1254_INIT,
2486
    &ISO_8859_4_INIT,
2487
    &WINDOWS_1255_INIT,
2488
    &ISO_8859_5_INIT,
2489
    &BIG5_INIT,
2490
    &WINDOWS_1256_INIT,
2491
    &IBM866_INIT,
2492
    &ISO_8859_6_INIT,
2493
    &WINDOWS_1257_INIT,
2494
    &ISO_8859_7_INIT,
2495
    &ISO_8859_6_INIT,
2496
    &ISO_8859_7_INIT,
2497
    &ISO_8859_7_INIT,
2498
    &WINDOWS_1258_INIT,
2499
    &ISO_8859_8_INIT,
2500
    &WINDOWS_1254_INIT,
2501
    &ISO_8859_5_INIT,
2502
    &UTF_16BE_INIT,
2503
    &UTF_16LE_INIT,
2504
    &WINDOWS_1252_INIT,
2505
    &SHIFT_JIS_INIT,
2506
    &EUC_JP_INIT,
2507
    &ISO_8859_10_INIT,
2508
    &WINDOWS_1252_INIT,
2509
    &WINDOWS_874_INIT,
2510
    &ISO_8859_2_INIT,
2511
    &ISO_8859_3_INIT,
2512
    &ISO_8859_13_INIT,
2513
    &ISO_8859_4_INIT,
2514
    &ISO_8859_14_INIT,
2515
    &ISO_8859_5_INIT,
2516
    &ISO_8859_15_INIT,
2517
    &ISO_8859_6_INIT,
2518
    &ISO_8859_7_INIT,
2519
    &ISO_8859_8_INIT,
2520
    &GBK_INIT,
2521
    &WINDOWS_1254_INIT,
2522
    &UTF_16LE_INIT,
2523
    &MACINTOSH_INIT,
2524
    &SHIFT_JIS_INIT,
2525
    &SHIFT_JIS_INIT,
2526
    &WINDOWS_1252_INIT,
2527
    &ISO_8859_10_INIT,
2528
    &ISO_8859_4_INIT,
2529
    &GBK_INIT,
2530
    &WINDOWS_1252_INIT,
2531
    &WINDOWS_1252_INIT,
2532
    &ISO_8859_2_INIT,
2533
    &WINDOWS_874_INIT,
2534
    &ISO_8859_2_INIT,
2535
    &ISO_8859_2_INIT,
2536
    &REPLACEMENT_INIT,
2537
    &ISO_8859_3_INIT,
2538
    &ISO_8859_3_INIT,
2539
    &ISO_8859_13_INIT,
2540
    &ISO_8859_4_INIT,
2541
    &ISO_8859_4_INIT,
2542
    &ISO_8859_14_INIT,
2543
    &ISO_8859_5_INIT,
2544
    &ISO_8859_5_INIT,
2545
    &ISO_8859_5_INIT,
2546
    &ISO_8859_15_INIT,
2547
    &ISO_8859_6_INIT,
2548
    &ISO_8859_6_INIT,
2549
    &ISO_8859_7_INIT,
2550
    &ISO_8859_7_INIT,
2551
    &ISO_8859_7_INIT,
2552
    &ISO_8859_6_INIT,
2553
    &ISO_8859_10_INIT,
2554
    &ISO_8859_8_INIT,
2555
    &ISO_8859_8_INIT,
2556
    &ISO_8859_8_INIT,
2557
    &WINDOWS_1254_INIT,
2558
    &WINDOWS_1254_INIT,
2559
    &WINDOWS_1254_INIT,
2560
    &ISO_8859_3_INIT,
2561
    &EUC_KR_INIT,
2562
    &BIG5_INIT,
2563
    &SHIFT_JIS_INIT,
2564
    &ISO_8859_10_INIT,
2565
    &WINDOWS_874_INIT,
2566
    &WINDOWS_1252_INIT,
2567
    &ISO_8859_2_INIT,
2568
    &ISO_8859_13_INIT,
2569
    &ISO_8859_3_INIT,
2570
    &ISO_8859_14_INIT,
2571
    &WINDOWS_874_INIT,
2572
    &ISO_8859_4_INIT,
2573
    &ISO_8859_15_INIT,
2574
    &ISO_8859_15_INIT,
2575
    &WINDOWS_1254_INIT,
2576
    &ISO_8859_16_INIT,
2577
    &ISO_8859_10_INIT,
2578
    &EUC_KR_INIT,
2579
    &ISO_8859_15_INIT,
2580
    &ISO_8859_6_INIT,
2581
    &ISO_8859_8_INIT,
2582
    &UTF_16BE_INIT,
2583
    &UTF_16LE_INIT,
2584
    &MACINTOSH_INIT,
2585
    &ISO_8859_6_INIT,
2586
    &ISO_8859_8_I_INIT,
2587
    &SHIFT_JIS_INIT,
2588
    &MACINTOSH_INIT,
2589
    &REPLACEMENT_INIT,
2590
    &ISO_2022_JP_INIT,
2591
    &ISO_2022_JP_INIT,
2592
    &REPLACEMENT_INIT,
2593
    &REPLACEMENT_INIT,
2594
    &REPLACEMENT_INIT,
2595
    &WINDOWS_1250_INIT,
2596
    &WINDOWS_1251_INIT,
2597
    &WINDOWS_1252_INIT,
2598
    &WINDOWS_1253_INIT,
2599
    &WINDOWS_1254_INIT,
2600
    &WINDOWS_1255_INIT,
2601
    &WINDOWS_1256_INIT,
2602
    &WINDOWS_1257_INIT,
2603
    &WINDOWS_1258_INIT,
2604
    &ISO_8859_6_INIT,
2605
    &ISO_8859_8_INIT,
2606
    &ISO_8859_6_INIT,
2607
    &ISO_8859_8_I_INIT,
2608
    &ISO_8859_7_INIT,
2609
    &EUC_KR_INIT,
2610
    &UTF_8_INIT,
2611
    &UTF_8_INIT,
2612
    &EUC_KR_INIT,
2613
    &WINDOWS_1252_INIT,
2614
    &EUC_KR_INIT,
2615
    &X_MAC_CYRILLIC_INIT,
2616
    &X_USER_DEFINED_INIT,
2617
    &GBK_INIT,
2618
    &UTF_16LE_INIT,
2619
    &WINDOWS_1252_INIT,
2620
    &ISO_8859_2_INIT,
2621
    &ISO_8859_6_INIT,
2622
    &ISO_8859_7_INIT,
2623
    &ISO_8859_3_INIT,
2624
    &ISO_8859_4_INIT,
2625
    &ISO_8859_5_INIT,
2626
    &ISO_8859_8_INIT,
2627
    &UTF_8_INIT,
2628
    &WINDOWS_1254_INIT,
2629
    &ISO_8859_7_INIT,
2630
    &X_MAC_CYRILLIC_INIT,
2631
    &REPLACEMENT_INIT,
2632
    &ISO_8859_6_INIT,
2633
    &ISO_8859_8_INIT,
2634
    &UTF_8_INIT,
2635
    &ISO_8859_5_INIT,
2636
    &EUC_JP_INIT,
2637
];
2638
2639
// END GENERATED CODE
2640
2641
/// An encoding as defined in the [Encoding Standard][1].
2642
///
2643
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
2644
/// and, in most cases, vice versa. Each encoding has a name, an output
2645
/// encoding, and one or more labels.
2646
///
2647
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
2648
/// encoding in formats and protocols. The _name_ of the encoding is the
2649
/// preferred label in the case appropriate for returning from the
2650
/// [`characterSet`][2] property of the `Document` DOM interface.
2651
///
2652
/// The _output encoding_ is the encoding used for form submission and URL
2653
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
2654
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
2655
/// encodings.
2656
///
2657
/// [1]: https://encoding.spec.whatwg.org/
2658
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
2659
///
2660
/// # Streaming vs. Non-Streaming
2661
///
2662
/// When you have the entire input in a single buffer, you can use the
2663
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
2664
/// [`decode_without_bom_handling()`][5],
2665
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
2666
/// [`encode()`][7]. (These methods are available to Rust callers only and are
2667
/// not available in the C API.) Unlike the rest of the API available to Rust,
2668
/// these methods perform heap allocations. You should use the `Decoder` and
2669
/// `Encoder` objects when your input is split into multiple buffers or when
2670
/// you want to control the allocation of the output buffers.
2671
///
2672
/// [3]: #method.decode
2673
/// [4]: #method.decode_with_bom_removal
2674
/// [5]: #method.decode_without_bom_handling
2675
/// [6]: #method.decode_without_bom_handling_and_without_replacement
2676
/// [7]: #method.encode
2677
///
2678
/// # Instances
2679
///
2680
/// All instances of `Encoding` are statically allocated and have the `'static`
2681
/// lifetime. There is precisely one unique `Encoding` instance for each
2682
/// encoding defined in the Encoding Standard.
2683
///
2684
/// To obtain a reference to a particular encoding whose identity you know at
2685
/// compile time, use a `static` that refers to the encoding. There is a `static`
2686
/// for each encoding. The `static`s are named in all caps with hyphens
2687
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
2688
/// name). For example, if you know at compile time that you will want to
2689
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
2690
/// in C/C++).
2691
///
2692
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
2693
/// work around the problem that `static`s of the type `&'static Encoding`
2694
/// cannot be used to initialize items of an array whose type is
2695
/// `[&'static Encoding; N]`.
2696
///
2697
/// If you don't know what encoding you need at compile time and need to
2698
/// dynamically get an encoding by label, use
2699
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
2700
///
2701
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
2702
/// C/C++).
2703
pub struct Encoding {
2704
    name: &'static str,
2705
    variant: VariantEncoding,
2706
}
2707
2708
impl Encoding {
2709
    /// Implements the
2710
    /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2711
    /// algorithm.
2712
    ///
2713
    /// If, after ASCII-lowercasing and removing leading and trailing
2714
    /// whitespace, the argument matches a label defined in the Encoding
2715
    /// Standard, `Some(&'static Encoding)` representing the corresponding
2716
    /// encoding is returned. If there is no match, `None` is returned.
2717
    ///
2718
    /// This is the right method to use if the action upon the method returning
2719
    /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2720
    /// When the action upon the method returning `None` is not to proceed with
2721
    /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2722
    /// appropriate.
2723
    ///
2724
    /// The argument is of type `&[u8]` instead of `&str` to save callers
2725
    /// that are extracting the label from a non-UTF-8 protocol the trouble
2726
    /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2727
    /// on it.)
2728
    ///
2729
    /// Available via the C wrapper.
2730
    ///
2731
    /// # Example
2732
    /// ```
2733
    /// use encoding_rs::Encoding;
2734
    ///
2735
    /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"utf-8"));
2736
    /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"unicode11utf8"));
2737
    ///
2738
    /// assert_eq!(Some(encoding_rs::ISO_8859_2), Encoding::for_label(b"latin2"));
2739
    ///
2740
    /// assert_eq!(Some(encoding_rs::UTF_16BE), Encoding::for_label(b"utf-16be"));
2741
    ///
2742
    /// assert_eq!(None, Encoding::for_label(b"unrecognized label"));
2743
    /// ```
2744
0
    pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2745
0
        let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2746
0
        let mut trimmed_pos = 0usize;
2747
0
        let mut iter = label.into_iter();
2748
        // before
2749
        loop {
2750
0
            match iter.next() {
2751
                None => {
2752
0
                    return None;
2753
                }
2754
0
                Some(byte) => {
2755
                    // The characters used in labels are:
2756
                    // a-z (except q, but excluding it below seems excessive)
2757
                    // 0-9
2758
                    // . _ - :
2759
0
                    match *byte {
2760
                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2761
0
                            continue;
2762
                        }
2763
0
                        b'A'..=b'Z' => {
2764
0
                            trimmed[trimmed_pos] = *byte + 0x20u8;
2765
0
                            trimmed_pos = 1usize;
2766
0
                            break;
2767
                        }
2768
0
                        b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2769
0
                            trimmed[trimmed_pos] = *byte;
2770
0
                            trimmed_pos = 1usize;
2771
0
                            break;
2772
                        }
2773
                        _ => {
2774
0
                            return None;
2775
                        }
2776
                    }
2777
                }
2778
            }
2779
        }
2780
        // inside
2781
        loop {
2782
0
            match iter.next() {
2783
                None => {
2784
0
                    break;
2785
                }
2786
0
                Some(byte) => {
2787
0
                    match *byte {
2788
                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2789
0
                            break;
2790
                        }
2791
0
                        b'A'..=b'Z' => {
2792
0
                            if trimmed_pos == LONGEST_LABEL_LENGTH {
2793
                                // There's no encoding with a label this long
2794
0
                                return None;
2795
0
                            }
2796
0
                            trimmed[trimmed_pos] = *byte + 0x20u8;
2797
0
                            trimmed_pos += 1usize;
2798
0
                            continue;
2799
                        }
2800
0
                        b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2801
0
                            if trimmed_pos == LONGEST_LABEL_LENGTH {
2802
                                // There's no encoding with a label this long
2803
0
                                return None;
2804
0
                            }
2805
0
                            trimmed[trimmed_pos] = *byte;
2806
0
                            trimmed_pos += 1usize;
2807
0
                            continue;
2808
                        }
2809
                        _ => {
2810
0
                            return None;
2811
                        }
2812
                    }
2813
                }
2814
            }
2815
        }
2816
        // after
2817
        loop {
2818
0
            match iter.next() {
2819
                None => {
2820
0
                    break;
2821
                }
2822
0
                Some(byte) => {
2823
0
                    match *byte {
2824
                        0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2825
0
                            continue;
2826
                        }
2827
                        _ => {
2828
                            // There's no label with space in the middle
2829
0
                            return None;
2830
                        }
2831
                    }
2832
                }
2833
            }
2834
        }
2835
0
        let candidate = &trimmed[..trimmed_pos];
2836
0
        match LABELS_SORTED.binary_search_by(|probe| {
2837
0
            let bytes = probe.as_bytes();
2838
0
            let c = bytes.len().cmp(&candidate.len());
2839
0
            if c != Ordering::Equal {
2840
0
                return c;
2841
0
            }
2842
0
            let probe_iter = bytes.iter().rev();
2843
0
            let candidate_iter = candidate.iter().rev();
2844
0
            probe_iter.cmp(candidate_iter)
2845
0
        }) {
2846
0
            Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2847
0
            Err(_) => None,
2848
        }
2849
0
    }
2850
2851
    /// This method behaves the same as `for_label()`, except when `for_label()`
2852
    /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2853
    ///
2854
    /// This method is useful in scenarios where a fatal error is required
2855
    /// upon invalid label, because in those cases the caller typically wishes
2856
    /// to treat the labels that map to the replacement encoding as fatal
2857
    /// errors, too.
2858
    ///
2859
    /// It is not OK to use this method when the action upon the method returning
2860
    /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2861
    /// case, the `for_label()` method should be used instead in order to avoid
2862
    /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2863
    ///
2864
    /// Available via the C wrapper.
2865
    #[inline]
2866
0
    pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2867
0
        match Encoding::for_label(label) {
2868
0
            None => None,
2869
0
            Some(encoding) => {
2870
0
                if encoding == REPLACEMENT {
2871
0
                    None
2872
                } else {
2873
0
                    Some(encoding)
2874
                }
2875
            }
2876
        }
2877
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::for_label_no_replacement
Unexecuted instantiation: <encoding_rs::Encoding>::for_label_no_replacement
2878
2879
    /// Performs non-incremental BOM sniffing.
2880
    ///
2881
    /// The argument must either be a buffer representing the entire input
2882
    /// stream (non-streaming case) or a buffer representing at least the first
2883
    /// three bytes of the input stream (streaming case).
2884
    ///
2885
    /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2886
    /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2887
    /// or UTF-16BE BOM or `None` otherwise.
2888
    ///
2889
    /// Available via the C wrapper.
2890
    #[inline]
2891
0
    pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2892
0
        if buffer.starts_with(b"\xEF\xBB\xBF") {
2893
0
            Some((UTF_8, 3))
2894
0
        } else if buffer.starts_with(b"\xFF\xFE") {
2895
0
            Some((UTF_16LE, 2))
2896
0
        } else if buffer.starts_with(b"\xFE\xFF") {
2897
0
            Some((UTF_16BE, 2))
2898
        } else {
2899
0
            None
2900
        }
2901
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::for_bom
Unexecuted instantiation: <encoding_rs::Encoding>::for_bom
2902
2903
    /// Returns the name of this encoding.
2904
    ///
2905
    /// This name is appropriate to return as-is from the DOM
2906
    /// `document.characterSet` property.
2907
    ///
2908
    /// Available via the C wrapper.
2909
    #[inline]
2910
0
    pub fn name(&'static self) -> &'static str {
2911
0
        self.name
2912
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::name
Unexecuted instantiation: <encoding_rs::Encoding>::name
2913
2914
    /// Checks whether the _output encoding_ of this encoding can encode every
2915
    /// `char`. (Only true if the output encoding is UTF-8.)
2916
    ///
2917
    /// Available via the C wrapper.
2918
    #[inline]
2919
0
    pub fn can_encode_everything(&'static self) -> bool {
2920
0
        self.output_encoding() == UTF_8
2921
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::can_encode_everything
Unexecuted instantiation: <encoding_rs::Encoding>::can_encode_everything
2922
2923
    /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2924
    /// U+0000...U+007F and vice versa.
2925
    ///
2926
    /// Available via the C wrapper.
2927
    #[inline]
2928
0
    pub fn is_ascii_compatible(&'static self) -> bool {
2929
0
        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2930
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::is_ascii_compatible
Unexecuted instantiation: <encoding_rs::Encoding>::is_ascii_compatible
2931
2932
    /// Checks whether this encoding maps one byte to one Basic Multilingual
2933
    /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2934
    /// vice versa (for mappable characters).
2935
    ///
2936
    /// `true` iff this encoding is on the list of [Legacy single-byte
2937
    /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2938
    /// in the spec or x-user-defined.
2939
    ///
2940
    /// Available via the C wrapper.
2941
    #[inline]
2942
0
    pub fn is_single_byte(&'static self) -> bool {
2943
0
        self.variant.is_single_byte()
2944
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::is_single_byte
Unexecuted instantiation: <encoding_rs::Encoding>::is_single_byte
2945
2946
    /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2947
    /// U+0000...U+007F and vice versa.
2948
    #[cfg(feature = "alloc")]
2949
    #[inline]
2950
0
    fn is_potentially_borrowable(&'static self) -> bool {
2951
0
        !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2952
0
    }
2953
2954
    /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2955
    /// UTF-16BE, UTF-16LE, and replacement and the encoding itself otherwise.
2956
    ///
2957
    /// _Note:_ The _output encoding_ concept is needed for form submission and
2958
    /// error handling in the query strings of URLs in the Web Platform.
2959
    ///
2960
    /// Available via the C wrapper.
2961
    #[inline]
2962
0
    pub fn output_encoding(&'static self) -> &'static Encoding {
2963
0
        if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2964
0
            UTF_8
2965
        } else {
2966
0
            self
2967
        }
2968
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::output_encoding
Unexecuted instantiation: <encoding_rs::Encoding>::output_encoding
2969
2970
    /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2971
    /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2972
    /// entire input is available as a single buffer (i.e. the end of the
2973
    /// buffer marks the end of the stream).
2974
    ///
2975
    /// The BOM, if any, does not appear in the output.
2976
    ///
2977
    /// This method implements the (non-streaming version of) the
2978
    /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2979
    ///
2980
    /// The second item in the returned tuple is the encoding that was actually
2981
    /// used (which may differ from this encoding thanks to BOM sniffing).
2982
    ///
2983
    /// The third item in the returned tuple indicates whether there were
2984
    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2985
    ///
2986
    /// _Note:_ It is wrong to use this when the input buffer represents only
2987
    /// a segment of the input instead of the whole input. Use `new_decoder()`
2988
    /// when decoding segmented input.
2989
    ///
2990
    /// This method performs one or two heap allocations for the backing
2991
    /// buffer of the `String` when unable to borrow. (One allocation if no
2992
    /// errors and potentially another one in the presence of errors.) The
2993
    /// first allocation assumes jemalloc and may not be optimal with
2994
    /// allocators that do not use power-of-two buckets. A borrow is performed
2995
    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2996
    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2997
    /// ISO-2022-JP and the input is entirely in the ASCII state without state
2998
    /// transitions.
2999
    ///
3000
    /// # Panics
3001
    ///
3002
    /// If the size calculation for a heap-allocated backing buffer overflows
3003
    /// `usize`.
3004
    ///
3005
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3006
    /// by default).
3007
    #[cfg(feature = "alloc")]
3008
    #[inline]
3009
0
    pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
3010
0
        let (encoding, without_bom) = match Encoding::for_bom(bytes) {
3011
0
            Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
3012
0
            None => (self, bytes),
3013
        };
3014
0
        let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
3015
0
        (cow, encoding, had_errors)
3016
0
    }
3017
3018
    /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
3019
    /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
3020
    /// entire input is available as a single buffer (i.e. the end of the
3021
    /// buffer marks the end of the stream).
3022
    ///
3023
    /// Only an initial byte sequence that is a BOM for this encoding is removed.
3024
    ///
3025
    /// When invoked on `UTF_8`, this method implements the (non-streaming
3026
    /// version of) the
3027
    /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
3028
    /// concept.
3029
    ///
3030
    /// The second item in the returned pair indicates whether there were
3031
    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3032
    ///
3033
    /// _Note:_ It is wrong to use this when the input buffer represents only
3034
    /// a segment of the input instead of the whole input. Use
3035
    /// `new_decoder_with_bom_removal()` when decoding segmented input.
3036
    ///
3037
    /// This method performs one or two heap allocations for the backing
3038
    /// buffer of the `String` when unable to borrow. (One allocation if no
3039
    /// errors and potentially another one in the presence of errors.) The
3040
    /// first allocation assumes jemalloc and may not be optimal with
3041
    /// allocators that do not use power-of-two buckets. A borrow is performed
3042
    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3043
    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3044
    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3045
    /// transitions.
3046
    ///
3047
    /// # Panics
3048
    ///
3049
    /// If the size calculation for a heap-allocated backing buffer overflows
3050
    /// `usize`.
3051
    ///
3052
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3053
    /// by default).
3054
    #[cfg(feature = "alloc")]
3055
    #[inline]
3056
0
    pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3057
0
        let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
3058
0
            &bytes[3..]
3059
0
        } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
3060
0
            || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
3061
        {
3062
0
            &bytes[2..]
3063
        } else {
3064
0
            bytes
3065
        };
3066
0
        self.decode_without_bom_handling(without_bom)
3067
0
    }
3068
3069
    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3070
    /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
3071
    /// the entire input is available as a single buffer (i.e. the end of the
3072
    /// buffer marks the end of the stream).
3073
    ///
3074
    /// When invoked on `UTF_8`, this method implements the (non-streaming
3075
    /// version of) the
3076
    /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
3077
    /// spec concept.
3078
    ///
3079
    /// The second item in the returned pair indicates whether there were
3080
    /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3081
    ///
3082
    /// _Note:_ It is wrong to use this when the input buffer represents only
3083
    /// a segment of the input instead of the whole input. Use
3084
    /// `new_decoder_without_bom_handling()` when decoding segmented input.
3085
    ///
3086
    /// This method performs one or two heap allocations for the backing
3087
    /// buffer of the `String` when unable to borrow. (One allocation if no
3088
    /// errors and potentially another one in the presence of errors.) The
3089
    /// first allocation assumes jemalloc and may not be optimal with
3090
    /// allocators that do not use power-of-two buckets. A borrow is performed
3091
    /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3092
    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3093
    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3094
    /// transitions.
3095
    ///
3096
    /// # Panics
3097
    ///
3098
    /// If the size calculation for a heap-allocated backing buffer overflows
3099
    /// `usize`.
3100
    ///
3101
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3102
    /// by default).
3103
    #[cfg(feature = "alloc")]
3104
0
    pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3105
0
        let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3106
0
            let valid_up_to = if self == UTF_8 {
3107
0
                utf8_valid_up_to(bytes)
3108
0
            } else if self == ISO_2022_JP {
3109
0
                iso_2022_jp_ascii_valid_up_to(bytes)
3110
            } else {
3111
0
                ascii_valid_up_to(bytes)
3112
            };
3113
0
            if valid_up_to == bytes.len() {
3114
0
                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3115
0
                return (Cow::Borrowed(str), false);
3116
0
            }
3117
0
            let decoder = self.new_decoder_without_bom_handling();
3118
3119
0
            let rounded_without_replacement = checked_next_power_of_two(checked_add(
3120
0
                valid_up_to,
3121
0
                decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3122
            ));
3123
0
            let with_replacement = checked_add(
3124
0
                valid_up_to,
3125
0
                decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3126
            );
3127
0
            let mut string = String::with_capacity(
3128
0
                checked_min(rounded_without_replacement, with_replacement).unwrap(),
3129
            );
3130
0
            unsafe {
3131
0
                let vec = string.as_mut_vec();
3132
0
                vec.set_len(valid_up_to);
3133
0
                core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3134
0
            }
3135
0
            (decoder, string, valid_up_to)
3136
        } else {
3137
0
            let decoder = self.new_decoder_without_bom_handling();
3138
0
            let rounded_without_replacement = checked_next_power_of_two(
3139
0
                decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3140
            );
3141
0
            let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3142
0
            let string = String::with_capacity(
3143
0
                checked_min(rounded_without_replacement, with_replacement).unwrap(),
3144
            );
3145
0
            (decoder, string, 0)
3146
        };
3147
3148
0
        let mut total_had_errors = false;
3149
        loop {
3150
0
            let (result, read, had_errors) =
3151
0
                decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3152
0
            total_read += read;
3153
0
            total_had_errors |= had_errors;
3154
0
            match result {
3155
                CoderResult::InputEmpty => {
3156
0
                    debug_assert_eq!(total_read, bytes.len());
3157
0
                    return (Cow::Owned(string), total_had_errors);
3158
                }
3159
0
                CoderResult::OutputFull => {
3160
0
                    // Allocate for the worst case. That is, we should come
3161
0
                    // here at most once per invocation of this method.
3162
0
                    let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3163
0
                    string.reserve(needed.unwrap());
3164
0
                }
3165
            }
3166
        }
3167
0
    }
3168
3169
    /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3170
    /// _with malformed sequences treated as fatal_ when the entire input is
3171
    /// available as a single buffer (i.e. the end of the buffer marks the end
3172
    /// of the stream).
3173
    ///
3174
    /// When invoked on `UTF_8`, this method implements the (non-streaming
3175
    /// version of) the
3176
    /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3177
    /// spec concept.
3178
    ///
3179
    /// Returns `None` if a malformed sequence was encountered and the result
3180
    /// of the decode as `Some(String)` otherwise.
3181
    ///
3182
    /// _Note:_ It is wrong to use this when the input buffer represents only
3183
    /// a segment of the input instead of the whole input. Use
3184
    /// `new_decoder_without_bom_handling()` when decoding segmented input.
3185
    ///
3186
    /// This method performs a single heap allocation for the backing
3187
    /// buffer of the `String` when unable to borrow. A borrow is performed if
3188
    /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3189
    /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3190
    /// ISO-2022-JP and the input is entirely in the ASCII state without state
3191
    /// transitions.
3192
    ///
3193
    /// # Panics
3194
    ///
3195
    /// If the size calculation for a heap-allocated backing buffer overflows
3196
    /// `usize`.
3197
    ///
3198
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3199
    /// by default).
3200
    #[cfg(feature = "alloc")]
3201
0
    pub fn decode_without_bom_handling_and_without_replacement<'a>(
3202
0
        &'static self,
3203
0
        bytes: &'a [u8],
3204
0
    ) -> Option<Cow<'a, str>> {
3205
0
        if self == UTF_8 {
3206
0
            let valid_up_to = utf8_valid_up_to(bytes);
3207
0
            if valid_up_to == bytes.len() {
3208
0
                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3209
0
                return Some(Cow::Borrowed(str));
3210
0
            }
3211
0
            return None;
3212
0
        }
3213
0
        let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3214
0
            let valid_up_to = if self == ISO_2022_JP {
3215
0
                iso_2022_jp_ascii_valid_up_to(bytes)
3216
            } else {
3217
0
                ascii_valid_up_to(bytes)
3218
            };
3219
0
            if valid_up_to == bytes.len() {
3220
0
                let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3221
0
                return Some(Cow::Borrowed(str));
3222
0
            }
3223
0
            let decoder = self.new_decoder_without_bom_handling();
3224
0
            let mut string = String::with_capacity(
3225
0
                checked_add(
3226
0
                    valid_up_to,
3227
0
                    decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3228
                )
3229
0
                .unwrap(),
3230
            );
3231
0
            unsafe {
3232
0
                let vec = string.as_mut_vec();
3233
0
                vec.set_len(valid_up_to);
3234
0
                core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3235
0
            }
3236
0
            (decoder, string, &bytes[valid_up_to..])
3237
        } else {
3238
0
            let decoder = self.new_decoder_without_bom_handling();
3239
0
            let string = String::with_capacity(
3240
0
                decoder
3241
0
                    .max_utf8_buffer_length_without_replacement(bytes.len())
3242
0
                    .unwrap(),
3243
            );
3244
0
            (decoder, string, bytes)
3245
        };
3246
0
        let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3247
0
        match result {
3248
            DecoderResult::InputEmpty => {
3249
0
                debug_assert_eq!(read, input.len());
3250
0
                Some(Cow::Owned(string))
3251
            }
3252
0
            DecoderResult::Malformed(_, _) => None,
3253
0
            DecoderResult::OutputFull => unreachable!(),
3254
        }
3255
0
    }
3256
3257
    /// Encode complete input to `Cow<'a, [u8]>` using the
3258
    /// [_output encoding_](Encoding::output_encoding) of this encoding with
3259
    /// unmappable characters replaced with decimal numeric character references
3260
    /// when the entire input is available as a single buffer (i.e. the end of
3261
    /// the buffer marks the end of the stream).
3262
    ///
3263
    /// This method implements the (non-streaming version of) the
3264
    /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3265
    /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3266
    /// spec concept, it is slightly more efficient to use
3267
    /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3268
    /// method on `UTF_8`.
3269
    ///
3270
    /// The second item in the returned tuple is the encoding that was actually
3271
    /// used (*which may differ from this encoding thanks to some encodings
3272
    /// having UTF-8 as their output encoding*).
3273
    ///
3274
    /// The third item in the returned tuple indicates whether there were
3275
    /// unmappable characters (that were replaced with HTML numeric character
3276
    /// references).
3277
    ///
3278
    /// _Note:_ It is wrong to use this when the input buffer represents only
3279
    /// a segment of the input instead of the whole input. Use `new_encoder()`
3280
    /// when encoding segmented output.
3281
    ///
3282
    /// When encoding to UTF-8 or when encoding an ASCII-only input to an
3283
    /// ASCII-compatible encoding, this method returns a borrow of the input
3284
    /// without a heap allocation. Otherwise, this method performs a single
3285
    /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3286
    /// unmappable characters and potentially multiple heap allocations if
3287
    /// there are. These allocations are tuned for jemalloc and may not be
3288
    /// optimal when using a different allocator that doesn't use power-of-two
3289
    /// buckets.
3290
    ///
3291
    /// # Panics
3292
    ///
3293
    /// If the size calculation for a heap-allocated backing buffer overflows
3294
    /// `usize`.
3295
    ///
3296
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
3297
    /// by default).
3298
    #[cfg(feature = "alloc")]
3299
0
    pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3300
0
        let output_encoding = self.output_encoding();
3301
0
        if output_encoding == UTF_8 {
3302
0
            return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3303
0
        }
3304
0
        debug_assert!(output_encoding.is_potentially_borrowable());
3305
0
        let bytes = string.as_bytes();
3306
0
        let valid_up_to = if output_encoding == ISO_2022_JP {
3307
0
            iso_2022_jp_ascii_valid_up_to(bytes)
3308
        } else {
3309
0
            ascii_valid_up_to(bytes)
3310
        };
3311
0
        if valid_up_to == bytes.len() {
3312
0
            return (Cow::Borrowed(bytes), output_encoding, false);
3313
0
        }
3314
0
        let mut encoder = output_encoding.new_encoder();
3315
0
        let mut vec: Vec<u8> = Vec::with_capacity(
3316
0
            (checked_add(
3317
0
                valid_up_to,
3318
0
                encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3319
            ))
3320
0
            .unwrap()
3321
0
            .next_power_of_two(),
3322
        );
3323
0
        unsafe {
3324
0
            vec.set_len(valid_up_to);
3325
0
            core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3326
0
        }
3327
0
        let mut total_read = valid_up_to;
3328
0
        let mut total_had_errors = false;
3329
        loop {
3330
0
            let (result, read, had_errors) =
3331
0
                encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3332
0
            total_read += read;
3333
0
            total_had_errors |= had_errors;
3334
0
            match result {
3335
                CoderResult::InputEmpty => {
3336
0
                    debug_assert_eq!(total_read, string.len());
3337
0
                    return (Cow::Owned(vec), output_encoding, total_had_errors);
3338
                }
3339
0
                CoderResult::OutputFull => {
3340
0
                    // reserve_exact wants to know how much more on top of current
3341
0
                    // length--not current capacity.
3342
0
                    let needed = encoder
3343
0
                        .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3344
0
                    let rounded = (checked_add(vec.capacity(), needed))
3345
0
                        .unwrap()
3346
0
                        .next_power_of_two();
3347
0
                    let additional = rounded - vec.len();
3348
0
                    vec.reserve_exact(additional);
3349
0
                }
3350
            }
3351
        }
3352
0
    }
3353
3354
0
    // Private helper shared by the public `new_decoder*` constructors: asks
    // this encoding's `variant` for a fresh `VariantDecoder`.
    fn new_variant_decoder(&'static self) -> VariantDecoder {
3355
0
        self.variant.new_variant_decoder()
3356
0
    }
3357
3358
    /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3359
    ///
3360
    /// BOM sniffing may cause the returned decoder to morph into a decoder
3361
    /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. The BOM
3362
    /// does not appear in the output.
3363
    ///
3364
    /// Available via the C wrapper.
3365
    #[inline]
3366
0
    pub fn new_decoder(&'static self) -> Decoder {
3367
0
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3368
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::new_decoder
Unexecuted instantiation: <encoding_rs::Encoding>::new_decoder
3369
3370
    /// Instantiates a new decoder for this encoding with BOM removal.
3371
    ///
3372
    /// If the input starts with bytes that are the BOM for this encoding,
3373
    /// those bytes are removed. However, the decoder never morphs into a
3374
    /// decoder for another encoding: A BOM for another encoding is treated as
3375
    /// (potentially malformed) input to the decoding algorithm for this
3376
    /// encoding.
3377
    ///
3378
    /// Available via the C wrapper.
3379
    #[inline]
3380
0
    pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3381
0
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3382
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::new_decoder_with_bom_removal
Unexecuted instantiation: <encoding_rs::Encoding>::new_decoder_with_bom_removal
3383
3384
    /// Instantiates a new decoder for this encoding with BOM handling disabled.
3385
    ///
3386
    /// If the input starts with bytes that look like a BOM, those bytes are
3387
    /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3388
    /// for another encoding.)
3389
    ///
3390
    /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3391
    /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3392
    /// instead of this method to cause the BOM to be removed.
3393
    ///
3394
    /// Available via the C wrapper.
3395
    #[inline]
3396
0
    pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3397
0
        Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3398
0
    }
Unexecuted instantiation: <encoding_rs::Encoding>::new_decoder_without_bom_handling
Unexecuted instantiation: <encoding_rs::Encoding>::new_decoder_without_bom_handling
3399
3400
    /// Instantiates a new encoder for the [_output encoding_](Encoding::output_encoding)
3401
    /// of this encoding.
3402
    ///
3403
    /// _Note:_ The output encoding of UTF-16BE, UTF-16LE, and replacement is UTF-8. There
3404
    /// is no encoder for UTF-16BE, UTF-16LE, and replacement themselves.
3405
    ///
3406
    /// Available via the C wrapper.
3407
    #[inline]
3408
0
    pub fn new_encoder(&'static self) -> Encoder {
        // Encoders are always created for the *output* encoding, which may
        // differ from `self`.
        let target = self.output_encoding();
        target.variant.new_encoder(target)
    }
Unexecuted instantiation: <encoding_rs::Encoding>::new_encoder
Unexecuted instantiation: <encoding_rs::Encoding>::new_encoder
3412
3413
    /// Validates UTF-8.
3414
    ///
3415
    /// Returns the index of the first byte that makes the input malformed as
3416
    /// UTF-8 or the length of the slice if the slice is entirely valid.
3417
    ///
3418
    /// This is currently faster than the corresponding standard library
3419
    /// functionality. If this implementation gets upstreamed to the standard
3420
    /// library, this method may be removed in the future.
3421
    ///
3422
    /// Available via the C wrapper.
3423
0
    pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
3424
0
        // Not recursion: an unqualified call inside an `impl` resolves to the
        // module-level `utf8_valid_up_to` function, not to this associated
        // function (that would need `Self::`).
        utf8_valid_up_to(bytes)
3425
0
    }
3426
3427
    /// Validates ASCII.
3428
    ///
3429
    /// Returns the index of the first byte that makes the input malformed as
3430
    /// ASCII or the length of the slice if the slice is entirely valid.
3431
    ///
3432
    /// Available via the C wrapper.
3433
287M
    pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
3434
287M
        // Not recursion: an unqualified call inside an `impl` resolves to the
        // module-level `ascii_valid_up_to` function, not to this associated
        // function (that would need `Self::`).
        ascii_valid_up_to(bytes)
3435
287M
    }
3436
3437
    /// Validates ISO-2022-JP ASCII-state data.
3438
    ///
3439
    /// Returns the index of the first byte that makes the input not
3440
    /// representable in the ASCII state of ISO-2022-JP or the length of the
3441
    /// slice if the slice is entirely representable in the ASCII state of
3442
    /// ISO-2022-JP.
3443
    ///
3444
    /// Available via the C wrapper.
3445
0
    pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
3446
0
        // Not recursion: an unqualified call inside an `impl` resolves to the
        // module-level `iso_2022_jp_ascii_valid_up_to` function, not to this
        // associated function (that would need `Self::`).
        iso_2022_jp_ascii_valid_up_to(bytes)
3447
0
    }
3448
}
3449
3450
/// Two `Encoding` references are equal exactly when they point at the same
/// object; the contents are never compared.
impl PartialEq for Encoding {
    #[inline]
    fn eq(&self, other: &Encoding) -> bool {
        core::ptr::eq(self, other)
    }
}
3456
3457
// Marker impl: the `PartialEq` impl compares addresses, which is a total
// equivalence relation (every reference equals itself).
impl Eq for Encoding {}
3458
3459
/// Test-only ordering by address; defined in terms of `Ord::cmp` so the two
/// impls cannot drift apart.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Canonical form: the ordering is total, so it is always `Some`.
        Some(self.cmp(other))
    }
}
3465
3466
/// Test-only total order over `Encoding` objects, based on their addresses.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs_addr = self as *const Encoding as usize;
        let rhs_addr = other as *const Encoding as usize;
        lhs_addr.cmp(&rhs_addr)
    }
}
3472
3473
impl Hash for Encoding {
3474
    #[inline]
3475
0
    fn hash<H: Hasher>(&self, state: &mut H) {
3476
0
        (self as *const Encoding).hash(state);
3477
0
    }
3478
}
3479
3480
impl core::fmt::Debug for Encoding {
3481
    #[inline]
3482
0
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3483
0
        write!(f, "Encoding {{ {} }}", self.name)
3484
0
    }
3485
}
3486
3487
/// Serializes an `Encoding` as a string containing its name.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        serializer.serialize_str(self.name)
    }
}
3497
3498
/// Field-less serde visitor that turns a string label into a
/// `&'static Encoding` (see its `Visitor` impl).
#[cfg(feature = "serde")]
3499
struct EncodingVisitor;
3500
3501
#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for serde's error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Resolves `value` through `Encoding::for_label`; a label that does not
    /// resolve becomes a custom deserialization error.
    fn visit_str<E: serde::de::Error>(self, value: &str) -> Result<&'static Encoding, E> {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(alloc::format!("invalid encoding label: {}", value)))
    }
}
3523
3524
/// Deserializes a string label into a `&'static Encoding` by driving
/// `EncodingVisitor`.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<&'static Encoding, D::Error> {
        deserializer.deserialize_str(EncodingVisitor)
    }
}
3533
3534
/// Tracks the life cycle of a decoder from BOM sniffing to conversion to end.
///
/// `Decoder::new` picks the initial state from the `BomHandling` mode:
/// `Converting` for `Off`, `AtStart` for `Sniff`, and for `Remove` the
/// `At*Start` state matching UTF-8 / UTF-16BE / UTF-16LE (or `Converting` for
/// any other encoding). The buffer-length queries on `Decoder` panic once the
/// state is `Finished`.
3535
#[derive(PartialEq, Debug, Copy, Clone)]
3536
enum DecoderLifeCycle {
3537
    /// The decoder has seen no input yet.
3538
    AtStart,
3539
    /// The decoder has seen no input yet but expects UTF-8.
3540
    AtUtf8Start,
3541
    /// The decoder has seen no input yet but expects UTF-16BE.
3542
    AtUtf16BeStart,
3543
    /// The decoder has seen no input yet but expects UTF-16LE.
3544
    AtUtf16LeStart,
3545
    /// The decoder has seen EF.
3546
    SeenUtf8First,
3547
    /// The decoder has seen EF, BB.
3548
    SeenUtf8Second,
3549
    /// The decoder has seen FE.
3550
    SeenUtf16BeFirst,
3551
    /// The decoder has seen FF.
3552
    SeenUtf16LeFirst,
3553
    /// Saw EF, BB but not BF, there was a buffer boundary after BB and the
3554
    /// underlying decoder reported EF as an error, so we need to remember to
3555
    /// push BB before the next buffer.
3556
    ConvertingWithPendingBB,
3557
    /// No longer looking for a BOM and EOF not yet seen.
3558
    Converting,
3559
    /// EOF has been seen.
3560
    Finished,
3561
}
3562
3563
/// Communicates the BOM handling mode to `Decoder::new`, which translates it
/// into the decoder's initial `DecoderLifeCycle` state.
3564
#[derive(Debug, Copy, Clone)]
3565
enum BomHandling {
3566
    /// Don't handle the BOM (the decoder starts out `Converting`).
3567
    Off,
3568
    /// Sniff for UTF-8, UTF-16BE or UTF-16LE BOM (the decoder starts out
    /// `AtStart`).
3569
    Sniff,
3570
    /// Remove the BOM only if it's the BOM for this encoding (only UTF-8,
    /// UTF-16BE and UTF-16LE get a BOM-expecting start state).
3571
    Remove,
3572
}
3573
3574
/// Result of a (potentially partial) decode or encode operation with
3575
/// replacement.
///
/// Unlike `DecoderResult`, there is no `Malformed` variant: for example,
/// `Decoder::decode_to_utf8` handles malformed input itself by writing the
/// REPLACEMENT CHARACTER and only reports these two outcomes.
3576
#[must_use]
3577
#[derive(Debug, PartialEq, Eq)]
3578
pub enum CoderResult {
3579
    /// The input was exhausted.
3580
    ///
3581
    /// If this result was returned from a call where `last` was `true`, the
3582
    /// conversion process has completed. Otherwise, the caller should call a
3583
    /// decode or encode method again with more input.
3584
    InputEmpty,
3585
3586
    /// The converter cannot produce another unit of output, because the output
3587
    /// buffer does not have enough space left.
3588
    ///
3589
    /// The caller must provide more output space upon the next call and re-push
3590
    /// the remaining input to the converter.
3591
    OutputFull,
3592
}
3593
3594
/// Result of a (potentially partial) decode operation without replacement.
3595
#[must_use]
3596
#[derive(Debug, PartialEq, Eq)]
3597
pub enum DecoderResult {
3598
    /// The input was exhausted.
3599
    ///
3600
    /// If this result was returned from a call where `last` was `true`, the
3601
    /// decoding process has completed. Otherwise, the caller should call a
3602
    /// decode method again with more input.
3603
    InputEmpty,
3604
3605
    /// The decoder cannot produce another unit of output, because the output
3606
    /// buffer does not have enough space left.
3607
    ///
3608
    /// The caller must provide more output space upon the next call and re-push
3609
    /// the remaining input to the decoder.
3610
    OutputFull,
3611
3612
    /// The decoder encountered a malformed byte sequence.
3613
    ///
3614
    /// The caller must either treat this as a fatal error or must append one
3615
    /// REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push
3616
    /// the remaining input to the decoder.
3617
    ///
3618
    /// The first wrapped integer indicates the length of the malformed byte
3619
    /// sequence. The second wrapped integer indicates the number of bytes
3620
    /// that were consumed after the malformed sequence. If the second
3621
    /// integer is zero, the last byte that was consumed is the last byte of
3622
    /// the malformed sequence. Note that the malformed bytes may have been part
3623
    /// of an earlier input buffer.
3624
    ///
3625
    /// The first wrapped integer can have values 1, 2, 3 or 4. The second
3626
    /// wrapped integer can have values 0, 1, 2 or 3. The worst-case sum
3627
    /// of the two is 6, which happens with ISO-2022-JP.
3628
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
3629
}
3630
3631
/// A converter that decodes a byte stream into Unicode according to a
3632
/// character encoding in a streaming (incremental) manner.
3633
///
3634
/// The various `decode_*` methods take an input buffer (`src`) and an output
3635
/// buffer `dst` both of which are caller-allocated. There are variants for
3636
/// both UTF-8 and UTF-16 output buffers.
3637
///
3638
/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3639
/// into `dst` until one of the following three things happens:
3640
///
3641
/// 1. A malformed byte sequence is encountered (`*_without_replacement`
3642
///    variants only).
3643
///
3644
/// 2. The output buffer has been filled so near capacity that the decoder
3645
///    cannot be sure that processing an additional byte of input wouldn't
3646
///    cause so much output that the output buffer would overflow.
3647
///
3648
/// 3. All the input bytes have been processed.
3649
///
3650
/// The `decode_*` method then returns a tuple of a status indicating which one
3651
/// of the three reasons to return happened, how many input bytes were read,
3652
/// how many output code units (`u8` when decoding into UTF-8 and `u16`
3653
/// when decoding to UTF-16) were written (except when decoding into `String`,
3654
/// whose length change indicates this), and in the case of the
3655
/// variants performing replacement, a boolean indicating whether an error was
3656
/// replaced with the REPLACEMENT CHARACTER during the call.
3657
///
3658
/// The number of bytes "written" is what's logically written. Garbage may be
3659
/// written in the output buffer beyond the point logically written to.
3660
/// Therefore, if you wish to decode into an `&mut str`, you should use the
3661
/// methods that take an `&mut str` argument instead of the ones that take an
3662
/// `&mut [u8]` argument. The former take care of overwriting the trailing
3663
/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3664
/// latter don't.
3665
///
3666
/// In the case of the `*_without_replacement` variants, the status is a
3667
/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3668
/// `InputEmpty` corresponding to the three cases listed above).
3669
///
3670
/// In the case of methods whose name does not end with
3671
/// `*_without_replacement`, malformed sequences are automatically replaced
3672
/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3673
/// return early.
3674
///
3675
/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3676
/// space. When decoding to UTF-16, the output buffer must have at least two
3677
/// UTF-16 code units (`u16`) of space.
3678
///
3679
/// When decoding to UTF-8 without replacement, the methods are guaranteed
3680
/// not to return indicating that more output space is needed if the length
3681
/// of the output buffer is at least the length returned by
3682
/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3683
/// with replacement, the length of the output buffer that guarantees the
3684
/// methods not to return indicating that more output space is needed is given
3685
/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3686
/// or without replacement, the length of the output buffer that guarantees
3687
/// the methods not to return indicating that more output space is needed is
3688
/// given by [`max_utf16_buffer_length()`][4].
3689
///
3690
/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3691
/// and the output after each `decode_*` call is guaranteed to consist of
3692
/// complete characters. (I.e. the code unit sequence for the last character is
3693
/// guaranteed not to be split across output buffers.)
3694
///
3695
/// The boolean argument `last` indicates that the end of the stream is reached
3696
/// when all the bytes in `src` have been consumed.
3697
///
3698
/// A `Decoder` object can be used to incrementally decode a byte stream.
3699
///
3700
/// During the processing of a single stream, the caller must call `decode_*`
3701
/// zero or more times with `last` set to `false` and then call `decode_*` at
3702
/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3703
/// the processing of the stream has ended. Otherwise, the caller must call
3704
/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3705
/// a fatal error).
3706
///
3707
/// Once the stream has ended, the `Decoder` object must not be used anymore.
3708
/// That is, you need to create another one to process another stream.
3709
///
3710
/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3711
/// the caller does not wish to treat it as a fatal error, the input buffer
3712
/// `src` may not have been completely consumed. In that case, the caller must
3713
/// pass the unconsumed contents of `src` to `decode_*` again upon the next
3714
/// call.
3715
///
3716
/// [1]: enum.DecoderResult.html
3717
/// [2]: #method.max_utf8_buffer_length_without_replacement
3718
/// [3]: #method.max_utf8_buffer_length
3719
/// [4]: #method.max_utf16_buffer_length
3720
///
3721
/// # Infinite loops
3722
///
3723
/// When converting with a fixed-size output buffer whose size is too small to
3724
/// accommodate one character or (when applicable) one numeric character
3725
/// reference of output, an infinite loop ensues. When converting with a
3726
/// fixed-size output buffer, it generally makes sense to make the buffer
3727
/// fairly large (e.g. couple of kilobytes).
3728
pub struct Decoder {
3729
    // The encoding reported by `encoding()`.
    encoding: &'static Encoding,
3730
    // Encoding-specific decoder state; the buffer-length queries delegate to it.
    variant: VariantDecoder,
3731
    // Where the decoder is between BOM handling and end of stream.
    life_cycle: DecoderLifeCycle,
3732
}
3733
3734
impl Decoder {
3735
0
    fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3736
        Decoder {
3737
0
            encoding: enc,
3738
0
            variant: decoder,
3739
0
            life_cycle: match sniffing {
3740
0
                BomHandling::Off => DecoderLifeCycle::Converting,
3741
0
                BomHandling::Sniff => DecoderLifeCycle::AtStart,
3742
                BomHandling::Remove => {
3743
0
                    if enc == UTF_8 {
3744
0
                        DecoderLifeCycle::AtUtf8Start
3745
0
                    } else if enc == UTF_16BE {
3746
0
                        DecoderLifeCycle::AtUtf16BeStart
3747
0
                    } else if enc == UTF_16LE {
3748
0
                        DecoderLifeCycle::AtUtf16LeStart
3749
                    } else {
3750
0
                        DecoderLifeCycle::Converting
3751
                    }
3752
                }
3753
            },
3754
        }
3755
0
    }
3756
3757
    /// The `Encoding` this `Decoder` is for.
3758
    ///
3759
    /// BOM sniffing can change the return value of this method during the life
3760
    /// of the decoder.
3761
    ///
3762
    /// Available via the C wrapper.
3763
    #[inline]
3764
0
    pub fn encoding(&self) -> &'static Encoding {
3765
0
        // Plain read of the stored field; no state is modified.
        self.encoding
3766
0
    }
Unexecuted instantiation: <encoding_rs::Decoder>::encoding
Unexecuted instantiation: <encoding_rs::Decoder>::encoding
3767
3768
    /// Query the worst-case UTF-8 output size _with replacement_.
3769
    ///
3770
    /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3771
    /// that will not overflow given the current state of the decoder and
3772
    /// `byte_length` number of additional input bytes when decoding with
3773
    /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3774
    /// sequence or `None` if `usize` would overflow.
3775
    ///
3776
    /// Available via the C wrapper.
3777
0
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3778
        // Need to consider a) the decoder morphing due to the BOM and b) a partial
3779
        // BOM getting pushed to the underlying decoder.
3780
0
        match self.life_cycle {
3781
            DecoderLifeCycle::Converting
3782
            | DecoderLifeCycle::AtUtf8Start
3783
            | DecoderLifeCycle::AtUtf16LeStart
3784
            | DecoderLifeCycle::AtUtf16BeStart => {
3785
0
                return self.variant.max_utf8_buffer_length(byte_length);
3786
            }
3787
            DecoderLifeCycle::AtStart => {
3788
0
                if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3789
0
                    if let Some(utf16_bom) = checked_add(
3790
0
                        1,
3791
0
                        checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3792
0
                    ) {
3793
0
                        let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3794
0
                        let encoding = self.encoding();
3795
0
                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3796
                            // No need to consider the internal state of the underlying decoder,
3797
                            // because it is at start, because no data has reached it yet.
3798
0
                            return Some(utf_bom);
3799
0
                        } else if let Some(non_bom) =
3800
0
                            self.variant.max_utf8_buffer_length(byte_length)
3801
                        {
3802
0
                            return Some(core::cmp::max(utf_bom, non_bom));
3803
0
                        }
3804
0
                    }
3805
0
                }
3806
            }
3807
            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3808
                // Add two bytes even when only one byte has been seen,
3809
                // because the one byte can become a lead byte in multibyte
3810
                // decoders, but only after the decoder has been queried
3811
                // for max length, so the decoder's own logic for adding
3812
                // one for a pending lead cannot work.
3813
0
                if let Some(sum) = byte_length.checked_add(2) {
3814
0
                    if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3815
0
                        if self.encoding() == UTF_8 {
3816
                            // No need to consider the internal state of the underlying decoder,
3817
                            // because it is at start, because no data has reached it yet.
3818
0
                            return Some(utf8_bom);
3819
0
                        } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3820
0
                            return Some(core::cmp::max(utf8_bom, non_bom));
3821
0
                        }
3822
0
                    }
3823
0
                }
3824
            }
3825
            DecoderLifeCycle::ConvertingWithPendingBB => {
3826
0
                if let Some(sum) = byte_length.checked_add(2) {
3827
0
                    return self.variant.max_utf8_buffer_length(sum);
3828
0
                }
3829
            }
3830
            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3831
                // Add two bytes even when only one byte has been seen,
3832
                // because the one byte can become a lead byte in multibyte
3833
                // decoders, but only after the decoder has been queried
3834
                // for max length, so the decoder's own logic for adding
3835
                // one for a pending lead cannot work.
3836
0
                if let Some(sum) = byte_length.checked_add(2) {
3837
0
                    if let Some(utf16_bom) =
3838
0
                        checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3839
                    {
3840
0
                        let encoding = self.encoding();
3841
0
                        if encoding == UTF_16LE || encoding == UTF_16BE {
3842
                            // No need to consider the internal state of the underlying decoder,
3843
                            // because it is at start, because no data has reached it yet.
3844
0
                            return Some(utf16_bom);
3845
0
                        } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3846
0
                            return Some(core::cmp::max(utf16_bom, non_bom));
3847
0
                        }
3848
0
                    }
3849
0
                }
3850
            }
3851
0
            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3852
        }
3853
0
        None
3854
0
    }
3855
3856
    /// Query the worst-case UTF-8 output size _without replacement_.
3857
    ///
3858
    /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3859
    /// that will not overflow given the current state of the decoder and
3860
    /// `byte_length` number of additional input bytes when decoding without
3861
    /// replacement error handling or `None` if `usize` would overflow.
3862
    ///
3863
    /// Note that this value may be too small for the `_with_replacement` case.
3864
    /// Use `max_utf8_buffer_length()` for that case.
3865
    ///
3866
    /// Available via the C wrapper.
3867
0
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3868
        // Need to consider a) the decoder morphing due to the BOM and b) a partial
3869
        // BOM getting pushed to the underlying decoder.
3870
0
        match self.life_cycle {
3871
            DecoderLifeCycle::Converting
3872
            | DecoderLifeCycle::AtUtf8Start
3873
            | DecoderLifeCycle::AtUtf16LeStart
3874
            | DecoderLifeCycle::AtUtf16BeStart => {
3875
0
                return self
3876
0
                    .variant
3877
0
                    .max_utf8_buffer_length_without_replacement(byte_length);
3878
            }
3879
            DecoderLifeCycle::AtStart => {
3880
0
                if let Some(utf8_bom) = byte_length.checked_add(3) {
3881
0
                    if let Some(utf16_bom) = checked_add(
3882
0
                        1,
3883
0
                        checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3884
0
                    ) {
3885
0
                        let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3886
0
                        let encoding = self.encoding();
3887
0
                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3888
                            // No need to consider the internal state of the underlying decoder,
3889
                            // because it is at start, because no data has reached it yet.
3890
0
                            return Some(utf_bom);
3891
0
                        } else if let Some(non_bom) = self
3892
0
                            .variant
3893
0
                            .max_utf8_buffer_length_without_replacement(byte_length)
3894
                        {
3895
0
                            return Some(core::cmp::max(utf_bom, non_bom));
3896
0
                        }
3897
0
                    }
3898
0
                }
3899
            }
3900
            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3901
                // Add two bytes even when only one byte has been seen,
3902
                // because the one byte can become a lead byte in multibyte
3903
                // decoders, but only after the decoder has been queried
3904
                // for max length, so the decoder's own logic for adding
3905
                // one for a pending lead cannot work.
3906
0
                if let Some(sum) = byte_length.checked_add(2) {
3907
0
                    if let Some(utf8_bom) = sum.checked_add(3) {
3908
0
                        if self.encoding() == UTF_8 {
3909
                            // No need to consider the internal state of the underlying decoder,
3910
                            // because it is at start, because no data has reached it yet.
3911
0
                            return Some(utf8_bom);
3912
0
                        } else if let Some(non_bom) =
3913
0
                            self.variant.max_utf8_buffer_length_without_replacement(sum)
3914
                        {
3915
0
                            return Some(core::cmp::max(utf8_bom, non_bom));
3916
0
                        }
3917
0
                    }
3918
0
                }
3919
            }
3920
            DecoderLifeCycle::ConvertingWithPendingBB => {
3921
0
                if let Some(sum) = byte_length.checked_add(2) {
3922
0
                    return self.variant.max_utf8_buffer_length_without_replacement(sum);
3923
0
                }
3924
            }
3925
            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3926
                // Add two bytes even when only one byte has been seen,
3927
                // because the one byte can become a lead byte in multibyte
3928
                // decoders, but only after the decoder has been queried
3929
                // for max length, so the decoder's own logic for adding
3930
                // one for a pending lead cannot work.
3931
0
                if let Some(sum) = byte_length.checked_add(2) {
3932
0
                    if let Some(utf16_bom) =
3933
0
                        checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3934
                    {
3935
0
                        let encoding = self.encoding();
3936
0
                        if encoding == UTF_16LE || encoding == UTF_16BE {
3937
                            // No need to consider the internal state of the underlying decoder,
3938
                            // because it is at start, because no data has reached it yet.
3939
0
                            return Some(utf16_bom);
3940
0
                        } else if let Some(non_bom) =
3941
0
                            self.variant.max_utf8_buffer_length_without_replacement(sum)
3942
                        {
3943
0
                            return Some(core::cmp::max(utf16_bom, non_bom));
3944
0
                        }
3945
0
                    }
3946
0
                }
3947
            }
3948
0
            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3949
        }
3950
0
        None
3951
0
    }
3952
3953
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3954
    /// replaced with the REPLACEMENT CHARACTER.
3955
    ///
3956
    /// See the documentation of the struct for documentation for `decode_*`
3957
    /// methods collectively.
3958
    ///
3959
    /// Available via the C wrapper.
3960
0
    pub fn decode_to_utf8(
3961
0
        &mut self,
3962
0
        src: &[u8],
3963
0
        dst: &mut [u8],
3964
0
        last: bool,
3965
0
    ) -> (CoderResult, usize, usize, bool) {
3966
0
        let mut had_errors = false;
3967
0
        let mut total_read = 0usize;
3968
0
        let mut total_written = 0usize;
3969
        loop {
3970
0
            let (result, read, written) = self.decode_to_utf8_without_replacement(
3971
0
                &src[total_read..],
3972
0
                &mut dst[total_written..],
3973
0
                last,
3974
0
            );
3975
0
            total_read += read;
3976
0
            total_written += written;
3977
0
            match result {
3978
                DecoderResult::InputEmpty => {
3979
0
                    return (
3980
0
                        CoderResult::InputEmpty,
3981
0
                        total_read,
3982
0
                        total_written,
3983
0
                        had_errors,
3984
0
                    );
3985
                }
3986
                DecoderResult::OutputFull => {
3987
0
                    return (
3988
0
                        CoderResult::OutputFull,
3989
0
                        total_read,
3990
0
                        total_written,
3991
0
                        had_errors,
3992
0
                    );
3993
                }
3994
0
                DecoderResult::Malformed(_, _) => {
3995
0
                    had_errors = true;
3996
0
                    // There should always be space for the U+FFFD, because
3997
0
                    // otherwise we'd have gotten OutputFull already.
3998
0
                    // XXX: is the above comment actually true for UTF-8 itself?
3999
0
                    // TODO: Consider having fewer bound checks here.
4000
0
                    dst[total_written] = 0xEFu8;
4001
0
                    total_written += 1;
4002
0
                    dst[total_written] = 0xBFu8;
4003
0
                    total_written += 1;
4004
0
                    dst[total_written] = 0xBDu8;
4005
0
                    total_written += 1;
4006
0
                }
4007
            }
4008
        }
4009
0
    }
4010
4011
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
4012
    /// replaced with the REPLACEMENT CHARACTER with type system signaling
4013
    /// of UTF-8 validity.
4014
    ///
4015
    /// This methods calls `decode_to_utf8` and then zeroes
4016
    /// out up to three bytes that aren't logically part of the write in order
4017
    /// to retain the UTF-8 validity even for the unwritten part of the buffer.
4018
    ///
4019
    /// See the documentation of the struct for documentation for `decode_*`
4020
    /// methods collectively.
4021
    ///
4022
    /// Available to Rust only.
4023
0
    pub fn decode_to_str(
4024
0
        &mut self,
4025
0
        src: &[u8],
4026
0
        dst: &mut str,
4027
0
        last: bool,
4028
0
    ) -> (CoderResult, usize, usize, bool) {
4029
0
        let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4030
0
        let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
4031
0
        let len = bytes.len();
4032
0
        let mut trail = written;
4033
        // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4034
        // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4035
        // encodings to avoid overwriting here.
4036
0
        if self.encoding != UTF_8 {
4037
0
            let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4038
0
            while trail < max {
4039
0
                bytes[trail] = 0;
4040
0
                trail += 1;
4041
0
            }
4042
0
        }
4043
0
        while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4044
0
            bytes[trail] = 0;
4045
0
            trail += 1;
4046
0
        }
4047
0
        (result, read, written, replaced)
4048
0
    }
4049
4050
    /// Incrementally decode a byte stream into UTF-8 with malformed sequences
    /// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
    ///
    /// As with the other `decode_*` methods, the output buffer is
    /// caller-allocated: the capacity of the `String` is the output limit,
    /// so this method guarantees not to cause a reallocation of the backing
    /// buffer of `String`.
    ///
    /// The return value is a tuple of the status, the number of bytes read
    /// and a boolean indicating whether replacements were done. The number
    /// of bytes written is signaled via the length of the `String` changing.
    ///
    /// See the documentation of the struct for documentation for `decode_*`
    /// methods collectively.
    ///
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
    /// by default).
    #[cfg(feature = "alloc")]
    pub fn decode_to_string(
        &mut self,
        src: &[u8],
        dst: &mut String,
        last: bool,
    ) -> (CoderResult, usize, bool) {
        unsafe {
            let buffer = dst.as_mut_vec();
            let start = buffer.len();
            let limit = buffer.capacity();
            // Temporarily expose the spare capacity so that it can be passed
            // to the decoder as the output slice.
            buffer.set_len(limit);
            let (status, read, written, replaced) =
                self.decode_to_utf8(src, &mut buffer[start..], last);
            // Shrink back so that only the bytes reported as written are
            // part of the `String`.
            buffer.set_len(start + written);
            (status, read, replaced)
        }
    }
4086
4087
    // Invokes `public_decode_function!` with, in order: the doc comment for
    // the generated item, seven method identifiers (the first being
    // `decode_to_utf8_without_replacement`) and the output code unit type `u8`.
    // NOTE(review): the macro definition is outside this view; presumably the
    // first identifier is the public method and the rest are helpers for raw
    // decoding, end checking and BOM handling. Confirm against the macro.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-8
4088
                            /// _without replacement_.
4089
                            ///
4090
                            /// See the documentation of the struct for
4091
                            /// documentation for `decode_*` methods
4092
                            /// collectively.
4093
                            ///
4094
                            /// Available via the C wrapper.
4095
                            ,
4096
                            decode_to_utf8_without_replacement,
4097
                            decode_to_utf8_raw,
4098
                            decode_to_utf8_checking_end,
4099
                            decode_to_utf8_after_one_potential_bom_byte,
4100
                            decode_to_utf8_after_two_potential_bom_bytes,
4101
                            decode_to_utf8_checking_end_with_offset,
4102
                            u8);
4103
4104
    /// Incrementally decode a byte stream into UTF-8 with type system signaling
4105
    /// of UTF-8 validity.
4106
    ///
4107
    /// This methods calls `decode_to_utf8` and then zeroes out up to three
4108
    /// bytes that aren't logically part of the write in order to retain the
4109
    /// UTF-8 validity even for the unwritten part of the buffer.
4110
    ///
4111
    /// See the documentation of the struct for documentation for `decode_*`
4112
    /// methods collectively.
4113
    ///
4114
    /// Available to Rust only.
4115
0
    pub fn decode_to_str_without_replacement(
4116
0
        &mut self,
4117
0
        src: &[u8],
4118
0
        dst: &mut str,
4119
0
        last: bool,
4120
0
    ) -> (DecoderResult, usize, usize) {
4121
0
        let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4122
0
        let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4123
0
        let len = bytes.len();
4124
0
        let mut trail = written;
4125
        // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4126
        // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4127
        // encodings to avoid overwriting here.
4128
0
        if self.encoding != UTF_8 {
4129
0
            let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4130
0
            while trail < max {
4131
0
                bytes[trail] = 0;
4132
0
                trail += 1;
4133
0
            }
4134
0
        }
4135
0
        while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4136
0
            bytes[trail] = 0;
4137
0
            trail += 1;
4138
0
        }
4139
0
        (result, read, written)
4140
0
    }
4141
4142
    /// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
    ///
    /// As with the other `decode_*` methods, the output buffer is
    /// caller-allocated: the capacity of the `String` is the output limit,
    /// so this method guarantees not to cause a reallocation of the backing
    /// buffer of `String`.
    ///
    /// The return value is a pair of the `DecoderResult` and the number of
    /// bytes read. The number of bytes written is signaled via the length of
    /// the `String` changing.
    ///
    /// See the documentation of the struct for documentation for `decode_*`
    /// methods collectively.
    ///
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
    /// by default).
    #[cfg(feature = "alloc")]
    pub fn decode_to_string_without_replacement(
        &mut self,
        src: &[u8],
        dst: &mut String,
        last: bool,
    ) -> (DecoderResult, usize) {
        unsafe {
            let buffer = dst.as_mut_vec();
            let start = buffer.len();
            let limit = buffer.capacity();
            // Temporarily expose the spare capacity so that it can be passed
            // to the decoder as the output slice.
            buffer.set_len(limit);
            let (status, read, written) =
                self.decode_to_utf8_without_replacement(src, &mut buffer[start..], last);
            // Shrink back so that only the bytes reported as written are
            // part of the `String`.
            buffer.set_len(start + written);
            (status, read)
        }
    }
4176
4177
    /// Query the worst-case UTF-16 output size (with or without replacement).
4178
    ///
4179
    /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4180
    /// that will not overflow given the current state of the decoder and
4181
    /// `byte_length` number of additional input bytes or `None` if `usize`
4182
    /// would overflow.
4183
    ///
4184
    /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4185
    /// return value of this method applies also in the
4186
    /// `_without_replacement` case.
4187
    ///
4188
    /// Available via the C wrapper.
4189
0
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4190
        // Need to consider a) the decoder morphing due to the BOM and b) a partial
4191
        // BOM getting pushed to the underlying decoder.
4192
0
        match self.life_cycle {
4193
            DecoderLifeCycle::Converting
4194
            | DecoderLifeCycle::AtUtf8Start
4195
            | DecoderLifeCycle::AtUtf16LeStart
4196
            | DecoderLifeCycle::AtUtf16BeStart => {
4197
0
                return self.variant.max_utf16_buffer_length(byte_length);
4198
            }
4199
            DecoderLifeCycle::AtStart => {
4200
0
                if let Some(utf8_bom) = byte_length.checked_add(1) {
4201
0
                    if let Some(utf16_bom) =
4202
0
                        checked_add(1, checked_div(byte_length.checked_add(1), 2))
4203
                    {
4204
0
                        let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4205
0
                        let encoding = self.encoding();
4206
0
                        if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4207
                            // No need to consider the internal state of the underlying decoder,
4208
                            // because it is at start, because no data has reached it yet.
4209
0
                            return Some(utf_bom);
4210
0
                        } else if let Some(non_bom) =
4211
0
                            self.variant.max_utf16_buffer_length(byte_length)
4212
                        {
4213
0
                            return Some(core::cmp::max(utf_bom, non_bom));
4214
0
                        }
4215
0
                    }
4216
0
                }
4217
            }
4218
            DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4219
                // Add two bytes even when only one byte has been seen,
4220
                // because the one byte can become a lead byte in multibyte
4221
                // decoders, but only after the decoder has been queried
4222
                // for max length, so the decoder's own logic for adding
4223
                // one for a pending lead cannot work.
4224
0
                if let Some(sum) = byte_length.checked_add(2) {
4225
0
                    if let Some(utf8_bom) = sum.checked_add(1) {
4226
0
                        if self.encoding() == UTF_8 {
4227
                            // No need to consider the internal state of the underlying decoder,
4228
                            // because it is at start, because no data has reached it yet.
4229
0
                            return Some(utf8_bom);
4230
0
                        } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4231
0
                            return Some(core::cmp::max(utf8_bom, non_bom));
4232
0
                        }
4233
0
                    }
4234
0
                }
4235
            }
4236
            DecoderLifeCycle::ConvertingWithPendingBB => {
4237
0
                if let Some(sum) = byte_length.checked_add(2) {
4238
0
                    return self.variant.max_utf16_buffer_length(sum);
4239
0
                }
4240
            }
4241
            DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4242
                // Add two bytes even when only one byte has been seen,
4243
                // because the one byte can become a lead byte in multibyte
4244
                // decoders, but only after the decoder has been queried
4245
                // for max length, so the decoder's own logic for adding
4246
                // one for a pending lead cannot work.
4247
0
                if let Some(sum) = byte_length.checked_add(2) {
4248
0
                    if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4249
0
                        let encoding = self.encoding();
4250
0
                        if encoding == UTF_16LE || encoding == UTF_16BE {
4251
                            // No need to consider the internal state of the underlying decoder,
4252
                            // because it is at start, because no data has reached it yet.
4253
0
                            return Some(utf16_bom);
4254
0
                        } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4255
0
                            return Some(core::cmp::max(utf16_bom, non_bom));
4256
0
                        }
4257
0
                    }
4258
0
                }
4259
            }
4260
0
            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4261
        }
4262
0
        None
4263
0
    }
4264
4265
    /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4266
    /// replaced with the REPLACEMENT CHARACTER.
4267
    ///
4268
    /// See the documentation of the struct for documentation for `decode_*`
4269
    /// methods collectively.
4270
    ///
4271
    /// Available via the C wrapper.
4272
0
    pub fn decode_to_utf16(
4273
0
        &mut self,
4274
0
        src: &[u8],
4275
0
        dst: &mut [u16],
4276
0
        last: bool,
4277
0
    ) -> (CoderResult, usize, usize, bool) {
4278
0
        let mut had_errors = false;
4279
0
        let mut total_read = 0usize;
4280
0
        let mut total_written = 0usize;
4281
        loop {
4282
0
            let (result, read, written) = self.decode_to_utf16_without_replacement(
4283
0
                &src[total_read..],
4284
0
                &mut dst[total_written..],
4285
0
                last,
4286
0
            );
4287
0
            total_read += read;
4288
0
            total_written += written;
4289
0
            match result {
4290
                DecoderResult::InputEmpty => {
4291
0
                    return (
4292
0
                        CoderResult::InputEmpty,
4293
0
                        total_read,
4294
0
                        total_written,
4295
0
                        had_errors,
4296
0
                    );
4297
                }
4298
                DecoderResult::OutputFull => {
4299
0
                    return (
4300
0
                        CoderResult::OutputFull,
4301
0
                        total_read,
4302
0
                        total_written,
4303
0
                        had_errors,
4304
0
                    );
4305
                }
4306
0
                DecoderResult::Malformed(_, _) => {
4307
0
                    had_errors = true;
4308
0
                    // There should always be space for the U+FFFD, because
4309
0
                    // otherwise we'd have gotten OutputFull already.
4310
0
                    dst[total_written] = 0xFFFD;
4311
0
                    total_written += 1;
4312
0
                }
4313
            }
4314
        }
4315
0
    }
4316
4317
    // Invokes `public_decode_function!` with, in order: the doc comment for
    // the generated item, seven method identifiers (the first being
    // `decode_to_utf16_without_replacement`) and the output code unit type `u16`.
    // NOTE(review): the macro definition is outside this view; presumably the
    // first identifier is the public method and the rest are helpers for raw
    // decoding, end checking and BOM handling. Confirm against the macro.
    public_decode_function!(/// Incrementally decode a byte stream into UTF-16
4318
                            /// _without replacement_.
4319
                            ///
4320
                            /// See the documentation of the struct for
4321
                            /// documentation for `decode_*` methods
4322
                            /// collectively.
4323
                            ///
4324
                            /// Available via the C wrapper.
4325
                            ,
4326
                            decode_to_utf16_without_replacement,
4327
                            decode_to_utf16_raw,
4328
                            decode_to_utf16_checking_end,
4329
                            decode_to_utf16_after_one_potential_bom_byte,
4330
                            decode_to_utf16_after_two_potential_bom_bytes,
4331
                            decode_to_utf16_checking_end_with_offset,
4332
                            u16);
4333
4334
    /// Checks for compatibility with storing Unicode scalar values as unsigned
4335
    /// bytes taking into account the state of the decoder.
4336
    ///
4337
    /// Returns `None` if the decoder is not in a neutral state, including waiting
4338
    /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4339
    ///
4340
    /// Otherwise returns the index of the first byte whose unsigned value doesn't
4341
    /// directly correspond to the decoded Unicode scalar value, or the length
4342
    /// of the input if all bytes in the input decode directly to scalar values
4343
    /// corresponding to the unsigned byte values.
4344
    ///
4345
    /// Does not change the state of the decoder.
4346
    ///
4347
    /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4348
    /// storage optimizations.
4349
    ///
4350
    /// Available via the C wrapper.
4351
0
    pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4352
0
        match self.life_cycle {
4353
            DecoderLifeCycle::Converting => {
4354
0
                return self.variant.latin1_byte_compatible_up_to(bytes);
4355
            }
4356
0
            DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4357
0
            _ => None,
4358
        }
4359
0
    }
4360
}
4361
4362
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and re-push
    /// the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to the
    /// encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps a Basic Multilingual Plane code unit in `Unmappable`.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is not a Unicode scalar value (i.e. it is a
    /// surrogate), because the conversion to `char` is unwrapped.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = u32::from(bmp);
        let unmappable = core::char::from_u32(scalar).unwrap();
        EncoderResult::Unmappable(unmappable)
    }
}
4393
4394
/// A converter that encodes a Unicode stream into bytes according to a
4395
/// character encoding in a streaming (incremental) manner.
4396
///
4397
/// The various `encode_*` methods take an input buffer (`src`) and an output
4398
/// buffer `dst` both of which are caller-allocated. There are variants for
4399
/// both UTF-8 and UTF-16 input buffers.
4400
///
4401
/// An `encode_*` method encodes characters from `src` into bytes
4402
/// stored into `dst` until one of the following three things happens:
4403
///
4404
/// 1. An unmappable character is encountered (`*_without_replacement` variants
4405
///    only).
4406
///
4407
/// 2. The output buffer has been filled so near capacity that the encoder
4408
///    cannot be sure that processing an additional character of input wouldn't
4409
///    cause so much output that the output buffer would overflow.
4410
///
4411
/// 3. All the input characters have been processed.
4412
///
4413
/// The `encode_*` method then returns tuple of a status indicating which one
4414
/// of the three reasons to return happened, how many input code units (`u8`
4415
/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4416
/// how many output bytes were written (except when encoding into `Vec<u8>`,
4417
/// whose length change indicates this), and in the case of the variants that
4418
/// perform replacement, a boolean indicating whether an unmappable
4419
/// character was replaced with a numeric character reference during the call.
4420
///
4421
/// The number of bytes "written" is what's logically written. Garbage may be
4422
/// written in the output buffer beyond the point logically written to.
4423
///
4424
/// In the case of the methods whose name ends with
4425
/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4426
/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4427
/// the three cases listed above).
4428
///
4429
/// In the case of methods whose name does not end with
4430
/// `*_without_replacement`, unmappable characters are automatically replaced
4431
/// with the corresponding numeric character references and unmappable
4432
/// characters do not cause the methods to return early.
4433
///
4434
/// When encoding from UTF-8 without replacement, the methods are guaranteed
4435
/// not to return indicating that more output space is needed if the length
4436
/// of the output buffer is at least the length returned by
4437
/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4438
/// UTF-8 with replacement, the length of the output buffer that guarantees the
4439
/// methods not to return indicating that more output space is needed in the
4440
/// absence of unmappable characters is given by
4441
/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4442
/// UTF-16 without replacement, the methods are guaranteed not to return
4443
/// indicating that more output space is needed if the length of the output
4444
/// buffer is at least the length returned by
4445
/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
4446
/// from UTF-16 with replacement, the length of the output buffer that
4447
/// guarantees the methods not to return indicating that more output space is
4448
/// needed in the absence of unmappable characters is given by
4449
/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4450
/// When encoding with replacement, applications are not expected to size the
4451
/// buffer for the worst case ahead of time but to resize the buffer if there
4452
/// are unmappable characters. This is why max length queries are only available
4453
/// for the case where there are no unmappable characters.
4454
///
4455
/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4456
/// calling from Rust, the type system takes care of this.) When encoding from
4457
/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4458
/// CHARACTERS. Therefore, in order for astral characters not to turn into a
4459
/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4460
/// are not split across input buffer boundaries.
4461
///
4462
/// After an `encode_*` call returns, the output produced so far, taken as a
4463
/// whole from the start of the stream, is guaranteed to consist of a valid
4464
/// byte sequence in the target encoding. (I.e. the code unit sequence for a
4465
/// character is guaranteed not to be split across output buffers. However, due
4466
/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4467
/// from the start for it to be valid. For other encodings, the validity holds
4468
/// on a per-output buffer basis.)
4469
///
4470
/// The boolean argument `last` indicates that the end of the stream is reached
4471
/// when all the characters in `src` have been consumed. This argument is needed
4472
/// for ISO-2022-JP and is ignored for other encodings.
4473
///
4474
/// An `Encoder` object can be used to incrementally encode a Unicode stream.
4475
///
4476
/// During the processing of a single stream, the caller must call `encode_*`
4477
/// zero or more times with `last` set to `false` and then call `encode_*` at
4478
/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4479
/// the processing of the stream has ended. Otherwise, the caller must call
4480
/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4481
/// as a fatal error).
4482
///
4483
/// Once the stream has ended, the `Encoder` object must not be used anymore.
4484
/// That is, you need to create another one to process another stream.
4485
///
4486
/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4487
/// and the caller does not wish to treat it as a fatal error, the input buffer
4488
/// `src` may not have been completely consumed. In that case, the caller must
4489
/// pass the unconsumed contents of `src` to `encode_*` again upon the next
4490
/// call.
4491
///
4492
/// [1]: enum.EncoderResult.html
4493
/// [2]: #method.max_buffer_length_from_utf8_without_replacement
4494
/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4495
/// [4]: #method.max_buffer_length_from_utf16_without_replacement
4496
/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4497
///
4498
/// # Infinite loops
4499
///
4500
/// When converting with a fixed-size output buffer whose size is too small to
4501
/// accommodate one character of output, an infinite loop ensues. When
4502
/// converting with a fixed-size output buffer, it generally makes sense to
4503
/// make the buffer fairly large (e.g. couple of kilobytes).
4504
pub struct Encoder {
4505
    /// The encoding this encoder is for; handed back by `encoding()`.
    encoding: &'static Encoding,
4506
    /// The encoding-specific encoder that queries such as
    /// `has_pending_state()` are delegated to.
    variant: VariantEncoder,
4507
}
4508
4509
impl Encoder {
4510
0
    fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4511
0
        Encoder {
4512
0
            encoding: enc,
4513
0
            variant: encoder,
4514
0
        }
4515
0
    }
4516
4517
    /// The `Encoding` this `Encoder` is for.
4518
    #[inline]
4519
0
    pub fn encoding(&self) -> &'static Encoding {
4520
0
        self.encoding
4521
0
    }
Unexecuted instantiation: <encoding_rs::Encoder>::encoding
Unexecuted instantiation: <encoding_rs::Encoder>::encoding
4522
4523
    /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4524
    /// ASCII state and `false` otherwise.
4525
    #[inline]
4526
0
    pub fn has_pending_state(&self) -> bool {
4527
0
        self.variant.has_pending_state()
4528
0
    }
Unexecuted instantiation: <encoding_rs::Encoder>::has_pending_state
Unexecuted instantiation: <encoding_rs::Encoder>::has_pending_state
4529
4530
    /// Query the worst-case output size when encoding from UTF-8 with
4531
    /// replacement.
4532
    ///
4533
    /// Returns the size of the output buffer in bytes that will not overflow
4534
    /// given the current state of the encoder and `byte_length` number of
4535
    /// additional input code units if there are no unmappable characters in
4536
    /// the input or `None` if `usize` would overflow.
4537
    ///
4538
    /// Available via the C wrapper.
4539
0
    pub fn max_buffer_length_from_utf8_if_no_unmappables(
4540
0
        &self,
4541
0
        byte_length: usize,
4542
0
    ) -> Option<usize> {
4543
0
        checked_add(
4544
0
            if self.encoding().can_encode_everything() {
4545
0
                0
4546
            } else {
4547
0
                NCR_EXTRA
4548
            },
4549
0
            self.max_buffer_length_from_utf8_without_replacement(byte_length),
4550
        )
4551
0
    }
4552
4553
    /// Query the worst-case output size when encoding from UTF-8 without
4554
    /// replacement.
4555
    ///
4556
    /// Returns the size of the output buffer in bytes that will not overflow
4557
    /// given the current state of the encoder and `byte_length` number of
4558
    /// additional input code units or `None` if `usize` would overflow.
4559
    ///
4560
    /// Available via the C wrapper.
4561
0
    pub fn max_buffer_length_from_utf8_without_replacement(
4562
0
        &self,
4563
0
        byte_length: usize,
4564
0
    ) -> Option<usize> {
4565
0
        self.variant
4566
0
            .max_buffer_length_from_utf8_without_replacement(byte_length)
4567
0
    }
4568
4569
    /// Incrementally encode into byte stream from UTF-8 with unmappable
4570
    /// characters replaced with HTML (decimal) numeric character references.
4571
    ///
4572
    /// See the documentation of the struct for documentation for `encode_*`
4573
    /// methods collectively.
4574
    ///
4575
    /// Available via the C wrapper.
4576
0
    pub fn encode_from_utf8(
4577
0
        &mut self,
4578
0
        src: &str,
4579
0
        dst: &mut [u8],
4580
0
        last: bool,
4581
0
    ) -> (CoderResult, usize, usize, bool) {
4582
0
        let dst_len = dst.len();
4583
0
        let effective_dst_len = if self.encoding().can_encode_everything() {
4584
0
            dst_len
4585
        } else {
4586
0
            if dst_len < NCR_EXTRA {
4587
0
                if src.is_empty() && !(last && self.has_pending_state()) {
4588
0
                    return (CoderResult::InputEmpty, 0, 0, false);
4589
0
                }
4590
0
                return (CoderResult::OutputFull, 0, 0, false);
4591
0
            }
4592
0
            dst_len - NCR_EXTRA
4593
        };
4594
0
        let mut had_unmappables = false;
4595
0
        let mut total_read = 0usize;
4596
0
        let mut total_written = 0usize;
4597
        loop {
4598
0
            let (result, read, written) = self.encode_from_utf8_without_replacement(
4599
0
                &src[total_read..],
4600
0
                &mut dst[total_written..effective_dst_len],
4601
0
                last,
4602
0
            );
4603
0
            total_read += read;
4604
0
            total_written += written;
4605
0
            match result {
4606
                EncoderResult::InputEmpty => {
4607
0
                    return (
4608
0
                        CoderResult::InputEmpty,
4609
0
                        total_read,
4610
0
                        total_written,
4611
0
                        had_unmappables,
4612
0
                    );
4613
                }
4614
                EncoderResult::OutputFull => {
4615
0
                    return (
4616
0
                        CoderResult::OutputFull,
4617
0
                        total_read,
4618
0
                        total_written,
4619
0
                        had_unmappables,
4620
0
                    );
4621
                }
4622
0
                EncoderResult::Unmappable(unmappable) => {
4623
0
                    had_unmappables = true;
4624
0
                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4625
0
                    debug_assert_ne!(self.encoding(), UTF_16BE);
4626
0
                    debug_assert_ne!(self.encoding(), UTF_16LE);
4627
                    // Additionally, Iso2022JpEncoder is responsible for
4628
                    // transitioning to ASCII when returning with Unmappable.
4629
0
                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
4630
0
                    if total_written >= effective_dst_len {
4631
0
                        if total_read == src.len() && !(last && self.has_pending_state()) {
4632
0
                            return (
4633
0
                                CoderResult::InputEmpty,
4634
0
                                total_read,
4635
0
                                total_written,
4636
0
                                had_unmappables,
4637
0
                            );
4638
0
                        }
4639
0
                        return (
4640
0
                            CoderResult::OutputFull,
4641
0
                            total_read,
4642
0
                            total_written,
4643
0
                            had_unmappables,
4644
0
                        );
4645
0
                    }
4646
                }
4647
            }
4648
        }
4649
0
    }
4650
4651
    /// Incrementally encode into byte stream from UTF-8 with unmappable
    /// characters replaced with HTML (decimal) numeric character references.
    ///
    /// Output is appended to `dst` using only its existing spare capacity;
    /// the vector is never grown here. Returns the coder result, the number
    /// of bytes read from `src`, and whether any replacement took place.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
    /// by default).
    #[cfg(feature = "alloc")]
    pub fn encode_from_utf8_to_vec(
        &mut self,
        src: &str,
        dst: &mut Vec<u8>,
        last: bool,
    ) -> (CoderResult, usize, bool) {
        let initial_len = dst.len();
        let full_capacity = dst.capacity();
        // SAFETY: the new length equals the capacity, so it is in bounds.
        // NOTE(review): the bytes in `initial_len..full_capacity` are not
        // initialized at this point; this relies on the encoder only writing
        // to (never reading from) the slice it is handed -- confirm.
        unsafe {
            dst.set_len(full_capacity);
        }
        let (result, read, written, replaced) =
            self.encode_from_utf8(src, &mut dst[initial_len..], last);
        // SAFETY: shrinks the length back to the old contents plus the bytes
        // the encoder reported as written. NOTE(review): assumes `written`
        // never exceeds the length of the slice passed above -- confirm.
        unsafe {
            dst.set_len(initial_len + written);
        }
        (result, read, replaced)
    }
4676
4677
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4678
    ///
4679
    /// See the documentation of the struct for documentation for `encode_*`
4680
    /// methods collectively.
4681
    ///
4682
    /// Available via the C wrapper.
4683
0
    pub fn encode_from_utf8_without_replacement(
4684
0
        &mut self,
4685
0
        src: &str,
4686
0
        dst: &mut [u8],
4687
0
        last: bool,
4688
0
    ) -> (EncoderResult, usize, usize) {
4689
0
        self.variant.encode_from_utf8_raw(src, dst, last)
4690
0
    }
4691
4692
    /// Incrementally encode into byte stream from UTF-8 _without replacement_.
    ///
    /// Output is appended to `dst` using only its existing spare capacity;
    /// the vector is never grown here. Returns the encoder result and the
    /// number of bytes read from `src`.
    ///
    /// See the documentation of the struct for documentation for `encode_*`
    /// methods collectively.
    ///
    /// Available to Rust only and only with the `alloc` feature enabled (enabled
    /// by default).
    #[cfg(feature = "alloc")]
    pub fn encode_from_utf8_to_vec_without_replacement(
        &mut self,
        src: &str,
        dst: &mut Vec<u8>,
        last: bool,
    ) -> (EncoderResult, usize) {
        let initial_len = dst.len();
        let full_capacity = dst.capacity();
        // SAFETY: the new length equals the capacity, so it is in bounds.
        // NOTE(review): the bytes in `initial_len..full_capacity` are not
        // initialized at this point; this relies on the encoder only writing
        // to (never reading from) the slice it is handed -- confirm.
        unsafe {
            dst.set_len(full_capacity);
        }
        let (result, read, written) =
            self.encode_from_utf8_without_replacement(src, &mut dst[initial_len..], last);
        // SAFETY: shrinks the length back to the old contents plus the bytes
        // the encoder reported as written. NOTE(review): assumes `written`
        // never exceeds the length of the slice passed above -- confirm.
        unsafe {
            dst.set_len(initial_len + written);
        }
        (result, read)
    }
4716
4717
    /// Query the worst-case output size when encoding from UTF-16 with
4718
    /// replacement.
4719
    ///
4720
    /// Returns the size of the output buffer in bytes that will not overflow
4721
    /// given the current state of the encoder and `u16_length` number of
4722
    /// additional input code units if there are no unmappable characters in
4723
    /// the input or `None` if `usize` would overflow.
4724
    ///
4725
    /// Available via the C wrapper.
4726
0
    pub fn max_buffer_length_from_utf16_if_no_unmappables(
4727
0
        &self,
4728
0
        u16_length: usize,
4729
0
    ) -> Option<usize> {
4730
0
        checked_add(
4731
0
            if self.encoding().can_encode_everything() {
4732
0
                0
4733
            } else {
4734
0
                NCR_EXTRA
4735
            },
4736
0
            self.max_buffer_length_from_utf16_without_replacement(u16_length),
4737
        )
4738
0
    }
4739
4740
    /// Query the worst-case output size when encoding from UTF-16 without
4741
    /// replacement.
4742
    ///
4743
    /// Returns the size of the output buffer in bytes that will not overflow
4744
    /// given the current state of the encoder and `u16_length` number of
4745
    /// additional input code units or `None` if `usize` would overflow.
4746
    ///
4747
    /// Available via the C wrapper.
4748
0
    pub fn max_buffer_length_from_utf16_without_replacement(
4749
0
        &self,
4750
0
        u16_length: usize,
4751
0
    ) -> Option<usize> {
4752
0
        self.variant
4753
0
            .max_buffer_length_from_utf16_without_replacement(u16_length)
4754
0
    }
4755
4756
    /// Incrementally encode into byte stream from UTF-16 with unmappable
4757
    /// characters replaced with HTML (decimal) numeric character references.
4758
    ///
4759
    /// See the documentation of the struct for documentation for `encode_*`
4760
    /// methods collectively.
4761
    ///
4762
    /// Available via the C wrapper.
4763
0
    pub fn encode_from_utf16(
4764
0
        &mut self,
4765
0
        src: &[u16],
4766
0
        dst: &mut [u8],
4767
0
        last: bool,
4768
0
    ) -> (CoderResult, usize, usize, bool) {
4769
0
        let dst_len = dst.len();
4770
0
        let effective_dst_len = if self.encoding().can_encode_everything() {
4771
0
            dst_len
4772
        } else {
4773
0
            if dst_len < NCR_EXTRA {
4774
0
                if src.is_empty() && !(last && self.has_pending_state()) {
4775
0
                    return (CoderResult::InputEmpty, 0, 0, false);
4776
0
                }
4777
0
                return (CoderResult::OutputFull, 0, 0, false);
4778
0
            }
4779
0
            dst_len - NCR_EXTRA
4780
        };
4781
0
        let mut had_unmappables = false;
4782
0
        let mut total_read = 0usize;
4783
0
        let mut total_written = 0usize;
4784
        loop {
4785
0
            let (result, read, written) = self.encode_from_utf16_without_replacement(
4786
0
                &src[total_read..],
4787
0
                &mut dst[total_written..effective_dst_len],
4788
0
                last,
4789
0
            );
4790
0
            total_read += read;
4791
0
            total_written += written;
4792
0
            match result {
4793
                EncoderResult::InputEmpty => {
4794
0
                    return (
4795
0
                        CoderResult::InputEmpty,
4796
0
                        total_read,
4797
0
                        total_written,
4798
0
                        had_unmappables,
4799
0
                    );
4800
                }
4801
                EncoderResult::OutputFull => {
4802
0
                    return (
4803
0
                        CoderResult::OutputFull,
4804
0
                        total_read,
4805
0
                        total_written,
4806
0
                        had_unmappables,
4807
0
                    );
4808
                }
4809
0
                EncoderResult::Unmappable(unmappable) => {
4810
0
                    had_unmappables = true;
4811
0
                    debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4812
                    // There are no UTF-16 encoders and even if there were,
4813
                    // they'd never have unmappables.
4814
0
                    debug_assert_ne!(self.encoding(), UTF_16BE);
4815
0
                    debug_assert_ne!(self.encoding(), UTF_16LE);
4816
                    // Additionally, Iso2022JpEncoder is responsible for
4817
                    // transitioning to ASCII when returning with Unmappable
4818
                    // from the jis0208 state. That is, when we encode
4819
                    // ISO-2022-JP and come here, the encoder is in either the
4820
                    // ASCII or the Roman state. We are allowed to generate any
4821
                    // printable ASCII excluding \ and ~.
4822
0
                    total_written += write_ncr(unmappable, &mut dst[total_written..]);
4823
0
                    if total_written >= effective_dst_len {
4824
0
                        if total_read == src.len() && !(last && self.has_pending_state()) {
4825
0
                            return (
4826
0
                                CoderResult::InputEmpty,
4827
0
                                total_read,
4828
0
                                total_written,
4829
0
                                had_unmappables,
4830
0
                            );
4831
0
                        }
4832
0
                        return (
4833
0
                            CoderResult::OutputFull,
4834
0
                            total_read,
4835
0
                            total_written,
4836
0
                            had_unmappables,
4837
0
                        );
4838
0
                    }
4839
                }
4840
            }
4841
        }
4842
0
    }
4843
4844
    /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4845
    ///
4846
    /// See the documentation of the struct for documentation for `encode_*`
4847
    /// methods collectively.
4848
    ///
4849
    /// Available via the C wrapper.
4850
0
    pub fn encode_from_utf16_without_replacement(
4851
0
        &mut self,
4852
0
        src: &[u16],
4853
0
        dst: &mut [u8],
4854
0
        last: bool,
4855
0
    ) -> (EncoderResult, usize, usize) {
4856
0
        self.variant.encode_from_utf16_raw(src, dst, last)
4857
0
    }
4858
}
4859
4860
/// Format an unmappable as NCR without heap allocation.
///
/// Writes `&#` followed by the decimal value of `unmappable` and `;` to the
/// start of `dst` and returns the number of bytes written (between 4 and 10).
///
/// Scalar values with one or two decimal digits are formatted correctly as
/// well, so the output never contains a byte that this function did not
/// write.
///
/// # Panics
///
/// Panics if `dst` is shorter than the reference being written.
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let mut number = unmappable as u32;
    // len is the number of decimal digits needed to represent unmappable plus
    // 3 (the length of "&#" and ";").
    let len = if number >= 1_000_000u32 {
        10usize
    } else if number >= 100_000u32 {
        9usize
    } else if number >= 10_000u32 {
        8usize
    } else if number >= 1_000u32 {
        7usize
    } else if number >= 100u32 {
        6usize
    } else if number >= 10u32 {
        5usize
    } else {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        4usize
    };
    debug_assert!(len <= dst.len());
    // Write the terminator first so that a too-short `dst` fails on the
    // highest index before anything else has been modified.
    dst[len - 1] = b';';
    // Fill the digit positions right to left, least significant digit first.
    for slot in dst[2..len - 1].iter_mut().rev() {
        *slot = b'0' + (number % 10) as u8;
        number /= 10;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4898
4899
/// Tests `start <= i < end` with a single unsigned comparison.
///
/// Subtracting `start` with wraparound turns values below `start` into large
/// numbers, so one `<` against the width of the range is enough. `end` must
/// not be smaller than `start`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4903
4904
/// Tests `start <= i < end` with a single unsigned comparison.
///
/// Subtracting `start` with wraparound turns values below `start` into large
/// numbers, so one `<` against the width of the range is enough. `end` must
/// not be smaller than `start`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset < width
}
4908
4909
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Subtracting `start` with wraparound turns values below `start` into large
/// numbers, so one `<=` against the width of the range is enough. `end` must
/// not be smaller than `start`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4913
4914
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Subtracting `start` with wraparound turns values below `start` into large
/// numbers, so one `<=` against the width of the range is enough. `end` must
/// not be smaller than `start`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4918
4919
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Subtracting `start` with wraparound turns values below `start` into large
/// numbers, so one `<=` against the width of the range is enough. `end` must
/// not be smaller than `start`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4923
4924
/// Tests `start <= i <= end` with a single unsigned comparison.
///
/// Subtracting `start` with wraparound turns values below `start` into large
/// numbers, so one `<=` against the width of the range is enough. `end` must
/// not be smaller than `start`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let width = end - start;
    offset <= width
}
4928
4929
/// Adds `num` to the value inside `opt`.
///
/// Yields `None` when `opt` is `None` or when the sum does not fit in
/// `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4937
4938
#[inline(always)]
4939
0
fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
4940
0
    if let Some(n) = one {
4941
0
        checked_add(n, other)
4942
    } else {
4943
0
        None
4944
    }
4945
0
}
4946
4947
/// Multiplies the value inside `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when the product does not fit in
/// `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4955
4956
/// Divides the value inside `opt` by `num`.
///
/// Yields `None` when `opt` is `None` or when `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4964
4965
#[cfg(feature = "alloc")]
4966
/// Rounds the value inside `opt` up to the next power of two.
///
/// Yields `None` when `opt` is `None` or when the next power of two does not
/// fit in `usize`. Overflow is reported as `None`, like the other `checked_*`
/// helpers, rather than panicking (debug builds) or wrapping to zero
/// (release builds).
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(usize::checked_next_power_of_two)
}
4970
4971
#[cfg(feature = "alloc")]
4972
/// Returns the smaller of two optional values, treating `None` as "absent".
///
/// When only one operand is `Some`, that operand is returned; when both are
/// `None`, the result is `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(a.min(b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4984
4985
// ############## TESTS ###############
4986
4987
// Helper type that is only compiled for test builds with the `serde` feature
// enabled. It derives `Serialize`/`Deserialize` and carries a
// `&'static Encoding` field next to plain data fields.
// NOTE(review): presumably used by serde round-trip tests located outside
// this part of the file -- confirm.
#[cfg(all(test, feature = "serde"))]
4988
#[derive(Serialize, Deserialize, Debug, PartialEq)]
4989
struct Demo {
4990
    num: u32,
4991
    name: String,
4992
    enc: &'static Encoding,
4993
}
4994
4995
#[cfg(test)]
4996
mod test_labels_names;
4997
4998
#[cfg(all(test, feature = "alloc"))]
4999
mod tests {
5000
    use super::*;
5001
    use alloc::borrow::Cow;
5002
5003
    fn sniff_to_utf16(
5004
        initial_encoding: &'static Encoding,
5005
        expected_encoding: &'static Encoding,
5006
        bytes: &[u8],
5007
        expect: &[u16],
5008
        breaks: &[usize],
5009
    ) {
5010
        let mut decoder = initial_encoding.new_decoder();
5011
5012
        let mut dest: Vec<u16> =
5013
            Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
5014
        let capacity = dest.capacity();
5015
        dest.resize(capacity, 0u16);
5016
5017
        let mut total_written = 0usize;
5018
        let mut start = 0usize;
5019
        for br in breaks {
5020
            let (result, read, written, _) =
5021
                decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
5022
            total_written += written;
5023
            assert_eq!(read, *br - start);
5024
            match result {
5025
                CoderResult::InputEmpty => {}
5026
                CoderResult::OutputFull => {
5027
                    unreachable!();
5028
                }
5029
            }
5030
            start = *br;
5031
        }
5032
        let (result, read, written, _) =
5033
            decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
5034
        total_written += written;
5035
        match result {
5036
            CoderResult::InputEmpty => {}
5037
            CoderResult::OutputFull => {
5038
                unreachable!();
5039
            }
5040
        }
5041
        assert_eq!(read, bytes.len() - start);
5042
        assert_eq!(total_written, expect.len());
5043
        assert_eq!(&dest[..total_written], expect);
5044
        assert_eq!(decoder.encoding(), expected_encoding);
5045
    }
5046
5047
    // Any copyright to the test code below this comment is dedicated to the
5048
    // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
5049
5050
    #[test]
5051
    fn test_bom_sniffing() {
5052
        // ASCII
5053
        sniff_to_utf16(
5054
            WINDOWS_1252,
5055
            WINDOWS_1252,
5056
            b"\x61\x62",
5057
            &[0x0061u16, 0x0062u16],
5058
            &[],
5059
        );
5060
        // UTF-8
5061
        sniff_to_utf16(
5062
            WINDOWS_1252,
5063
            UTF_8,
5064
            b"\xEF\xBB\xBF\x61\x62",
5065
            &[0x0061u16, 0x0062u16],
5066
            &[],
5067
        );
5068
        sniff_to_utf16(
5069
            WINDOWS_1252,
5070
            UTF_8,
5071
            b"\xEF\xBB\xBF\x61\x62",
5072
            &[0x0061u16, 0x0062u16],
5073
            &[1],
5074
        );
5075
        sniff_to_utf16(
5076
            WINDOWS_1252,
5077
            UTF_8,
5078
            b"\xEF\xBB\xBF\x61\x62",
5079
            &[0x0061u16, 0x0062u16],
5080
            &[2],
5081
        );
5082
        sniff_to_utf16(
5083
            WINDOWS_1252,
5084
            UTF_8,
5085
            b"\xEF\xBB\xBF\x61\x62",
5086
            &[0x0061u16, 0x0062u16],
5087
            &[3],
5088
        );
5089
        sniff_to_utf16(
5090
            WINDOWS_1252,
5091
            UTF_8,
5092
            b"\xEF\xBB\xBF\x61\x62",
5093
            &[0x0061u16, 0x0062u16],
5094
            &[4],
5095
        );
5096
        sniff_to_utf16(
5097
            WINDOWS_1252,
5098
            UTF_8,
5099
            b"\xEF\xBB\xBF\x61\x62",
5100
            &[0x0061u16, 0x0062u16],
5101
            &[2, 3],
5102
        );
5103
        sniff_to_utf16(
5104
            WINDOWS_1252,
5105
            UTF_8,
5106
            b"\xEF\xBB\xBF\x61\x62",
5107
            &[0x0061u16, 0x0062u16],
5108
            &[1, 2],
5109
        );
5110
        sniff_to_utf16(
5111
            WINDOWS_1252,
5112
            UTF_8,
5113
            b"\xEF\xBB\xBF\x61\x62",
5114
            &[0x0061u16, 0x0062u16],
5115
            &[1, 3],
5116
        );
5117
        sniff_to_utf16(
5118
            WINDOWS_1252,
5119
            UTF_8,
5120
            b"\xEF\xBB\xBF\x61\x62",
5121
            &[0x0061u16, 0x0062u16],
5122
            &[1, 2, 3, 4],
5123
        );
5124
        sniff_to_utf16(WINDOWS_1252, UTF_8, b"\xEF\xBB\xBF", &[], &[]);
5125
        // Not UTF-8
5126
        sniff_to_utf16(
5127
            WINDOWS_1252,
5128
            WINDOWS_1252,
5129
            b"\xEF\xBB\x61\x62",
5130
            &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5131
            &[],
5132
        );
5133
        sniff_to_utf16(
5134
            WINDOWS_1252,
5135
            WINDOWS_1252,
5136
            b"\xEF\xBB\x61\x62",
5137
            &[0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16],
5138
            &[1],
5139
        );
5140
        sniff_to_utf16(
5141
            WINDOWS_1252,
5142
            WINDOWS_1252,
5143
            b"\xEF\x61\x62",
5144
            &[0x00EFu16, 0x0061u16, 0x0062u16],
5145
            &[],
5146
        );
5147
        sniff_to_utf16(
5148
            WINDOWS_1252,
5149
            WINDOWS_1252,
5150
            b"\xEF\x61\x62",
5151
            &[0x00EFu16, 0x0061u16, 0x0062u16],
5152
            &[1],
5153
        );
5154
        sniff_to_utf16(
5155
            WINDOWS_1252,
5156
            WINDOWS_1252,
5157
            b"\xEF\xBB",
5158
            &[0x00EFu16, 0x00BBu16],
5159
            &[],
5160
        );
5161
        sniff_to_utf16(
5162
            WINDOWS_1252,
5163
            WINDOWS_1252,
5164
            b"\xEF\xBB",
5165
            &[0x00EFu16, 0x00BBu16],
5166
            &[1],
5167
        );
5168
        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);
5169
        // Not UTF-16
5170
        sniff_to_utf16(
5171
            WINDOWS_1252,
5172
            WINDOWS_1252,
5173
            b"\xFE\x61\x62",
5174
            &[0x00FEu16, 0x0061u16, 0x0062u16],
5175
            &[],
5176
        );
5177
        sniff_to_utf16(
5178
            WINDOWS_1252,
5179
            WINDOWS_1252,
5180
            b"\xFE\x61\x62",
5181
            &[0x00FEu16, 0x0061u16, 0x0062u16],
5182
            &[1],
5183
        );
5184
        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
5185
        sniff_to_utf16(
5186
            WINDOWS_1252,
5187
            WINDOWS_1252,
5188
            b"\xFF\x61\x62",
5189
            &[0x00FFu16, 0x0061u16, 0x0062u16],
5190
            &[],
5191
        );
5192
        sniff_to_utf16(
5193
            WINDOWS_1252,
5194
            WINDOWS_1252,
5195
            b"\xFF\x61\x62",
5196
            &[0x00FFu16, 0x0061u16, 0x0062u16],
5197
            &[1],
5198
        );
5199
        sniff_to_utf16(WINDOWS_1252, WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);
5200
        // UTF-16
5201
        sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[]);
5202
        sniff_to_utf16(WINDOWS_1252, UTF_16BE, b"\xFE\xFF", &[], &[1]);
5203
        sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[]);
5204
        sniff_to_utf16(WINDOWS_1252, UTF_16LE, b"\xFF\xFE", &[], &[1]);
5205
    }
5206
5207
    #[test]
5208
    fn test_output_encoding() {
5209
        assert_eq!(REPLACEMENT.output_encoding(), UTF_8);
5210
        assert_eq!(UTF_16BE.output_encoding(), UTF_8);
5211
        assert_eq!(UTF_16LE.output_encoding(), UTF_8);
5212
        assert_eq!(UTF_8.output_encoding(), UTF_8);
5213
        assert_eq!(WINDOWS_1252.output_encoding(), WINDOWS_1252);
5214
        assert_eq!(REPLACEMENT.new_encoder().encoding(), UTF_8);
5215
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
5216
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
5217
        assert_eq!(UTF_8.new_encoder().encoding(), UTF_8);
5218
        assert_eq!(WINDOWS_1252.new_encoder().encoding(), WINDOWS_1252);
5219
    }
5220
5221
    #[test]
5222
    fn test_label_resolution() {
5223
        assert_eq!(Encoding::for_label(b"utf-8"), Some(UTF_8));
5224
        assert_eq!(Encoding::for_label(b"UTF-8"), Some(UTF_8));
5225
        assert_eq!(
5226
            Encoding::for_label(b" \t \n \x0C \n utf-8 \r \n \t \x0C "),
5227
            Some(UTF_8)
5228
        );
5229
        assert_eq!(Encoding::for_label(b"utf-8 _"), None);
5230
        assert_eq!(Encoding::for_label(b"bogus"), None);
5231
        assert_eq!(Encoding::for_label(b"bogusbogusbogusbogus"), None);
5232
    }
5233
5234
    #[test]
5235
    fn test_decode_valid_windows_1257_to_cow() {
5236
        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xE4");
5237
        match cow {
5238
            Cow::Borrowed(_) => unreachable!(),
5239
            Cow::Owned(s) => {
5240
                assert_eq!(s, "abc\u{20AC}\u{00E4}");
5241
            }
5242
        }
5243
        assert_eq!(encoding, WINDOWS_1257);
5244
        assert!(!had_errors);
5245
    }
5246
5247
    #[test]
5248
    fn test_decode_invalid_windows_1257_to_cow() {
5249
        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
5250
        match cow {
5251
            Cow::Borrowed(_) => unreachable!(),
5252
            Cow::Owned(s) => {
5253
                assert_eq!(s, "abc\u{20AC}\u{FFFD}\u{00E4}");
5254
            }
5255
        }
5256
        assert_eq!(encoding, WINDOWS_1257);
5257
        assert!(had_errors);
5258
    }
5259
5260
    #[test]
5261
    fn test_decode_ascii_only_windows_1257_to_cow() {
5262
        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"abc");
5263
        match cow {
5264
            Cow::Borrowed(s) => {
5265
                assert_eq!(s, "abc");
5266
            }
5267
            Cow::Owned(_) => unreachable!(),
5268
        }
5269
        assert_eq!(encoding, WINDOWS_1257);
5270
        assert!(!had_errors);
5271
    }
5272
5273
    #[test]
5274
    fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
5275
        let (cow, encoding, had_errors) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5276
        match cow {
5277
            Cow::Borrowed(s) => {
5278
                assert_eq!(s, "\u{20AC}\u{00E4}");
5279
            }
5280
            Cow::Owned(_) => unreachable!(),
5281
        }
5282
        assert_eq!(encoding, UTF_8);
5283
        assert!(!had_errors);
5284
    }
5285
5286
    #[test]
5287
    fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
5288
        let (cow, encoding, had_errors) =
5289
            WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5290
        match cow {
5291
            Cow::Borrowed(_) => unreachable!(),
5292
            Cow::Owned(s) => {
5293
                assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5294
            }
5295
        }
5296
        assert_eq!(encoding, UTF_8);
5297
        assert!(had_errors);
5298
    }
5299
5300
    #[test]
5301
    fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
5302
        let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5303
        match cow {
5304
            Cow::Borrowed(s) => {
5305
                assert_eq!(s, "\u{20AC}\u{00E4}");
5306
            }
5307
            Cow::Owned(_) => unreachable!(),
5308
        }
5309
        assert_eq!(encoding, UTF_8);
5310
        assert!(!had_errors);
5311
    }
5312
5313
    #[test]
5314
    fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
5315
        let (cow, encoding, had_errors) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
5316
        match cow {
5317
            Cow::Borrowed(_) => unreachable!(),
5318
            Cow::Owned(s) => {
5319
                assert_eq!(s, "\u{20AC}\u{FFFD}\u{00E4}");
5320
            }
5321
        }
5322
        assert_eq!(encoding, UTF_8);
5323
        assert!(had_errors);
5324
    }
5325
5326
    #[test]
5327
    fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
5328
        let (cow, had_errors) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
5329
        match cow {
5330
            Cow::Borrowed(s) => {
5331
                assert_eq!(s, "\u{20AC}\u{00E4}");
5332
            }
5333
            Cow::Owned(_) => unreachable!(),
5334
        }
5335
        assert!(!had_errors);
5336
    }
5337
5338
    // When the decoder is windows-1257, the three UTF-8 BOM bytes are not
    // removed: the expected output has one character per input byte, is
    // owned, and carries no error.
    #[test]
    fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
        let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
        let (decoded, replaced) = WINDOWS_1257.decode_with_bom_removal(input);
        if let Cow::Owned(text) = decoded {
            let expected = "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}";
            assert_eq!(text, expected);
        } else {
            unreachable!();
        }
        assert!(!replaced);
    }
5353
5354
    // Valid windows-1257 input with non-ASCII bytes (0x80, 0xE4) must decode
    // into an owned string without errors.
    #[test]
    fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
        let input: &[u8] = b"abc\x80\xE4";
        let (decoded, replaced) = WINDOWS_1257.decode_with_bom_removal(input);
        if let Cow::Owned(text) = decoded {
            assert_eq!(text, "abc\u{20AC}\u{00E4}");
        } else {
            unreachable!();
        }
        assert!(!replaced);
    }
5365
5366
    // Byte 0xA1 is expected to come out as U+FFFD when decoding as
    // windows-1257, giving an owned string and a set error flag.
    #[test]
    fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
        let input: &[u8] = b"abc\x80\xA1\xE4";
        let (decoded, replaced) = WINDOWS_1257.decode_with_bom_removal(input);
        if let Cow::Owned(text) = decoded {
            assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
        } else {
            unreachable!();
        }
        assert!(replaced);
    }
5377
5378
    // Pure-ASCII input decoded as windows-1257 is expected to be returned
    // borrowed (no allocation) and without errors.
    #[test]
    fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
        let input: &[u8] = b"abc";
        let (decoded, replaced) = WINDOWS_1257.decode_with_bom_removal(input);
        if let Cow::Borrowed(text) = decoded {
            assert_eq!(text, "abc");
        } else {
            unreachable!();
        }
        assert!(!replaced);
    }
5389
5390
    // Without BOM handling the leading EF BB BF is ordinary content: the
    // borrowed result is expected to start with U+FEFF.
    #[test]
    fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
        let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
        let (decoded, replaced) = UTF_8.decode_without_bom_handling(input);
        if let Cow::Borrowed(text) = decoded {
            assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}");
        } else {
            unreachable!();
        }
        assert!(!replaced);
    }
5402
5403
    // Without BOM handling and with a stray 0x80 byte: the owned result is
    // expected to keep U+FEFF, contain U+FFFD for the bad byte, and report
    // an error.
    #[test]
    fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
        let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
        let (decoded, replaced) = UTF_8.decode_without_bom_handling(input);
        if let Cow::Owned(text) = decoded {
            assert_eq!(text, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
        } else {
            unreachable!();
        }
        assert!(replaced);
    }
5415
5416
    // Valid non-ASCII windows-1257 input, decoded without BOM handling, must
    // produce an owned string and no error.
    #[test]
    fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
        let input: &[u8] = b"abc\x80\xE4";
        let (decoded, replaced) = WINDOWS_1257.decode_without_bom_handling(input);
        if let Cow::Owned(text) = decoded {
            assert_eq!(text, "abc\u{20AC}\u{00E4}");
        } else {
            unreachable!();
        }
        assert!(!replaced);
    }
5427
5428
    // Byte 0xA1 is expected to be replaced by U+FFFD when decoding as
    // windows-1257 without BOM handling; the result is owned and flagged.
    #[test]
    fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
        let input: &[u8] = b"abc\x80\xA1\xE4";
        let (decoded, replaced) = WINDOWS_1257.decode_without_bom_handling(input);
        if let Cow::Owned(text) = decoded {
            assert_eq!(text, "abc\u{20AC}\u{FFFD}\u{00E4}");
        } else {
            unreachable!();
        }
        assert!(replaced);
    }
5439
5440
    // Pure-ASCII input decoded as windows-1257 without BOM handling is
    // expected to be returned borrowed and error-free.
    #[test]
    fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
        let input: &[u8] = b"abc";
        let (decoded, replaced) = WINDOWS_1257.decode_without_bom_handling(input);
        if let Cow::Borrowed(text) = decoded {
            assert_eq!(text, "abc");
        } else {
            unreachable!();
        }
        assert!(!replaced);
    }
5451
5452
    // The non-replacing, BOM-ignoring decode of well-formed UTF-8 must
    // succeed with a borrowed string that still begins with U+FEFF.
    #[test]
    fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
        let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
        let outcome = UTF_8.decode_without_bom_handling_and_without_replacement(input);
        match outcome {
            Some(Cow::Borrowed(text)) => {
                assert_eq!(text, "\u{FEFF}\u{20AC}\u{00E4}");
            }
            Some(Cow::Owned(_)) => unreachable!(),
            None => unreachable!(),
        }
    }
5466
5467
    // Malformed UTF-8 (stray 0x80) must make the non-replacing decode
    // return `None`.
    #[test]
    fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
        let input: &[u8] = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
        let outcome = UTF_8.decode_without_bom_handling_and_without_replacement(input);
        assert!(outcome.is_none());
    }
5475
5476
    // Valid non-ASCII windows-1257 input must decode successfully into an
    // owned string via the non-replacing API.
    #[test]
    fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
        let input: &[u8] = b"abc\x80\xE4";
        let outcome = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(input);
        match outcome {
            Some(Cow::Owned(text)) => {
                assert_eq!(text, "abc\u{20AC}\u{00E4}");
            }
            Some(Cow::Borrowed(_)) => unreachable!(),
            None => unreachable!(),
        }
    }
5488
5489
    // Input containing byte 0xA1 must make the non-replacing windows-1257
    // decode return `None`.
    #[test]
    fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
        let input: &[u8] = b"abc\x80\xA1\xE4";
        let outcome = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(input);
        assert!(outcome.is_none());
    }
5495
5496
    // Pure-ASCII input must decode successfully and be returned borrowed by
    // the non-replacing windows-1257 decode.
    #[test]
    fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
        let input: &[u8] = b"abc";
        let outcome = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(input);
        match outcome {
            Some(Cow::Borrowed(text)) => {
                assert_eq!(text, "abc");
            }
            Some(Cow::Owned(_)) => unreachable!(),
            None => unreachable!(),
        }
    }
5508
5509
    // Encoding pure ASCII to windows-1257 is expected to hand back the
    // input bytes borrowed, report windows-1257 as the output encoding and
    // signal no unmappable characters.
    #[test]
    fn test_encode_ascii_only_windows_1257_to_cow() {
        let (encoded, used_encoding, had_unmappables) = WINDOWS_1257.encode("abc");
        if let Cow::Borrowed(bytes) = encoded {
            assert_eq!(bytes, b"abc");
        } else {
            unreachable!();
        }
        assert_eq!(used_encoding, WINDOWS_1257);
        assert!(!had_unmappables);
    }
5521
5522
    // Encoding text with non-ASCII characters that windows-1257 can
    // represent must yield an owned byte buffer and no unmappable flag.
    #[test]
    fn test_encode_valid_windows_1257_to_cow() {
        let (encoded, used_encoding, had_unmappables) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
        if let Cow::Owned(bytes) = encoded {
            assert_eq!(bytes, b"abc\x80\xE4");
        } else {
            unreachable!();
        }
        assert_eq!(used_encoding, WINDOWS_1257);
        assert!(!had_unmappables);
    }
5534
5535
    // Feeds a BOM-sniffing UTF-16LE decoder one 0xFF byte per call (the
    // second call being the last) into a buffer sized by
    // `max_utf16_buffer_length(1)`; both calls must report `InputEmpty`.
    #[test]
    fn test_utf16_space_with_one_bom_byte() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut dst = [0u16; 12];
        let steps = [(&b"\xFF"[..], false), (&b"\xFF"[..], true)];
        for &(chunk, last) in steps.iter() {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
            assert_eq!(result, CoderResult::InputEmpty);
        }
    }
5550
5551
    // Same scenario as the UTF-16LE variant but with a BOM-sniffing UTF-8
    // decoder: two single-byte 0xFF calls, each with a buffer sized by
    // `max_utf16_buffer_length(1)`, must both report `InputEmpty`.
    #[test]
    fn test_utf8_space_with_one_bom_byte() {
        let mut decoder = UTF_8.new_decoder();
        let mut dst = [0u16; 12];
        let steps = [(&b"\xFF"[..], false), (&b"\xFF"[..], true)];
        for &(chunk, last) in steps.iter() {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
            assert_eq!(result, CoderResult::InputEmpty);
        }
    }
5566
5567
    // Feeds a BOM-sniffing UTF-16LE decoder the bytes EF, BB, FF one call at
    // a time (only the last call is final). Each call gets a buffer sized
    // by `max_utf16_buffer_length(1)` and must report `InputEmpty`.
    #[test]
    fn test_utf16_space_with_two_bom_bytes() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut dst = [0u16; 12];
        let steps = [
            (&b"\xEF"[..], false),
            (&b"\xBB"[..], false),
            (&b"\xFF"[..], true),
        ];
        for &(chunk, last) in steps.iter() {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
            assert_eq!(result, CoderResult::InputEmpty);
        }
    }
5587
5588
    // Feeds a BOM-sniffing UTF-8 decoder the bytes EF, BB, FF one call at a
    // time (only the last call is final). Each call gets a buffer sized by
    // `max_utf16_buffer_length(1)` and must report `InputEmpty`.
    #[test]
    fn test_utf8_space_with_two_bom_bytes() {
        let mut decoder = UTF_8.new_decoder();
        let mut dst = [0u16; 12];
        let steps = [
            (&b"\xEF"[..], false),
            (&b"\xBB"[..], false),
            (&b"\xFF"[..], true),
        ];
        for &(chunk, last) in steps.iter() {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
            assert_eq!(result, CoderResult::InputEmpty);
        }
    }
5608
5609
    // A single, final call with the two bytes FF FF on a BOM-sniffing
    // UTF-16LE decoder must fit in a buffer sized by
    // `max_utf16_buffer_length(2)`.
    #[test]
    fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
        let mut decoder = UTF_16LE.new_decoder();
        let mut dst = [0u16; 12];
        let input: &[u8] = b"\xFF\xFF";
        let needed = decoder.max_utf16_buffer_length(2).unwrap();
        let outcome = decoder.decode_to_utf16(input, &mut dst[..needed], true);
        assert_eq!(outcome.0, CoderResult::InputEmpty);
    }
5619
5620
    // Encoding an empty string with a fresh ISO-2022-JP encoder into an
    // 8-byte buffer must report `InputEmpty` both for a non-final and for a
    // final call.
    #[test]
    fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
        let mut dst = [0u8; 8];
        let mut encoder = ISO_2022_JP.new_encoder();
        for &last in [false, true].iter() {
            let outcome = encoder.encode_from_utf8("", &mut dst[..], last);
            assert_eq!(outcome.0, CoderResult::InputEmpty);
        }
    }
5633
5634
    // After U+00A5 has been encoded, an empty non-final call with only 8
    // bytes of output space still reports `InputEmpty`, whereas the empty
    // final call with the same space reports `OutputFull`.
    // NOTE(review): presumably the final call needs room to return the
    // encoder to its initial state - confirm against the encoder docs.
    #[test]
    fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
        let mut dst = [0u8; 16];
        let mut encoder = ISO_2022_JP.new_encoder();

        let first = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
        assert_eq!(first.0, CoderResult::InputEmpty);

        let second = encoder.encode_from_utf8("", &mut dst[..8], false);
        assert_eq!(second.0, CoderResult::InputEmpty);

        let third = encoder.encode_from_utf8("", &mut dst[..8], true);
        assert_eq!(third.0, CoderResult::OutputFull);
    }
5651
5652
    // Checks the reported `CoderResult` for four independent encodes, each
    // on a fresh ISO-2022-JP encoder, with the output limited to the whole
    // 18-byte buffer or to its first 13 bytes.
    #[test]
    fn test_buffer_end_iso_2022_jp_from_utf8() {
        let mut dst = [0u8; 18];
        // Encodes `input` with a brand-new encoder into the first `limit`
        // bytes of `dst` and returns only the status.
        let mut encode_fresh = |input: &str, limit: usize, last: bool| {
            let mut encoder = ISO_2022_JP.new_encoder();
            encoder.encode_from_utf8(input, &mut dst[..limit], last).0
        };

        assert_eq!(
            encode_fresh("\u{A5}\u{1F4A9}", 18, false),
            CoderResult::InputEmpty
        );
        assert_eq!(
            encode_fresh("\u{A5}\u{1F4A9}", 18, true),
            CoderResult::OutputFull
        );
        assert_eq!(encode_fresh("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
        assert_eq!(encode_fresh("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
    }
5677
5678
    // Encoding empty UTF-16 input with a fresh ISO-2022-JP encoder into an
    // 8-byte buffer must report `InputEmpty` both for a non-final and for a
    // final call.
    #[test]
    fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
        let mut dst = [0u8; 8];
        let mut encoder = ISO_2022_JP.new_encoder();
        let nothing = [0u16; 0];
        for &last in [false, true].iter() {
            let outcome = encoder.encode_from_utf16(&nothing, &mut dst[..], last);
            assert_eq!(outcome.0, CoderResult::InputEmpty);
        }
    }
5691
5692
    // UTF-16 counterpart of the "roman" buffer test: after U+00A5 has been
    // encoded, an empty non-final call with 8 bytes of space reports
    // `InputEmpty`, while the empty final call with the same space reports
    // `OutputFull`.
    #[test]
    fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
        let mut dst = [0u8; 16];
        let mut encoder = ISO_2022_JP.new_encoder();
        let yen = [0xA5u16];
        let nothing = [0u16; 0];

        let first = encoder.encode_from_utf16(&yen, &mut dst[..], false);
        assert_eq!(first.0, CoderResult::InputEmpty);

        let second = encoder.encode_from_utf16(&nothing, &mut dst[..8], false);
        assert_eq!(second.0, CoderResult::InputEmpty);

        let third = encoder.encode_from_utf16(&nothing, &mut dst[..8], true);
        assert_eq!(third.0, CoderResult::OutputFull);
    }
5709
5710
    // UTF-16 counterpart of the buffer-end test: four independent encodes,
    // each on a fresh ISO-2022-JP encoder, with the output limited to the
    // whole 18-byte buffer or to its first 13 bytes.
    #[test]
    fn test_buffer_end_iso_2022_jp_from_utf16() {
        let mut dst = [0u8; 18];
        // Encodes `input` with a brand-new encoder into the first `limit`
        // bytes of `dst` and returns only the status.
        let mut encode_fresh = |input: &[u16], limit: usize, last: bool| {
            let mut encoder = ISO_2022_JP.new_encoder();
            encoder.encode_from_utf16(input, &mut dst[..limit], last).0
        };
        let yen_and_astral = [0xA5u16, 0xD83Du16, 0xDCA9u16];
        let astral_only = [0xD83Du16, 0xDCA9u16];

        assert_eq!(
            encode_fresh(&yen_and_astral[..], 18, false),
            CoderResult::InputEmpty
        );
        assert_eq!(
            encode_fresh(&yen_and_astral[..], 18, true),
            CoderResult::OutputFull
        );
        assert_eq!(
            encode_fresh(&astral_only[..], 13, false),
            CoderResult::InputEmpty
        );
        assert_eq!(
            encode_fresh(&astral_only[..], 13, true),
            CoderResult::InputEmpty
        );
    }
5738
5739
    // Feeds the bytes D8 00 to a BOM-ignoring UTF-16BE decoder twice. The
    // first, non-final call must consume both bytes while writing nothing
    // and flagging no error. The result of the second, final call is
    // discarded.
    // NOTE(review): presumably the second call only has to return without
    // panicking when the output buffer is 4 bytes long - confirm.
    #[test]
    fn test_buffer_end_utf16be() {
        let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
        let mut dest = [0u8; 4];
        let unit = [0xD8u8, 0x00u8];

        let pending = decoder.decode_to_utf8(&unit, &mut dest, false);
        assert_eq!(pending, (CoderResult::InputEmpty, 2, 0, false));

        let _ = decoder.decode_to_utf8(&unit, &mut dest, true);
    }
5751
5752
    // Encodings must be usable as set members: insertion, lookup and
    // removal are exercised. Despite the test's name, the container used
    // here is a `BTreeSet`.
    #[test]
    fn test_hash() {
        let mut seen = ::alloc::collections::btree_set::BTreeSet::new();
        seen.insert(UTF_8);
        seen.insert(ISO_2022_JP);

        let has_utf8 = seen.contains(UTF_8);
        assert!(has_utf8);
        let has_iso_2022_jp = seen.contains(ISO_2022_JP);
        assert!(has_iso_2022_jp);
        let has_windows_1252 = seen.contains(WINDOWS_1252);
        assert!(!has_windows_1252);

        seen.remove(ISO_2022_JP);
        let still_has_iso_2022_jp = seen.contains(ISO_2022_JP);
        assert!(!still_has_iso_2022_jp);
    }
5763
5764
    // A final encode of U+3041 followed by U+FFFF into a 17-byte buffer is
    // expected to report `OutputFull`.
    #[test]
    fn test_iso_2022_jp_ncr_extra_from_utf16() {
        let mut dst = [0u8; 17];
        let mut encoder = ISO_2022_JP.new_encoder();
        let input = [0x3041u16, 0xFFFFu16];
        let outcome = encoder.encode_from_utf16(&input, &mut dst[..], true);
        assert_eq!(outcome.0, CoderResult::OutputFull);
    }
5774
5775
    // UTF-8 counterpart: a final encode of U+3041 followed by U+FFFF into a
    // 17-byte buffer is expected to report `OutputFull`.
    #[test]
    fn test_iso_2022_jp_ncr_extra_from_utf8() {
        let mut dst = [0u8; 17];
        let mut encoder = ISO_2022_JP.new_encoder();
        let outcome = encoder.encode_from_utf8("\u{3041}\u{FFFF}", &mut dst[..], true);
        assert_eq!(outcome.0, CoderResult::OutputFull);
    }
5785
5786
    // Decodes a UTF-8 BOM followed by 'A' with a BOM-sniffing decoder
    // created from the replacement encoding, using an output slice sized by
    // `max_utf8_buffer_length_without_replacement`. The whole input must be
    // consumed and exactly one byte, 0x41, written.
    // NOTE(review): presumably the BOM switches the decoder to UTF-8, which
    // is why 'A' passes through - confirm.
    #[test]
    fn test_max_length_with_bom_to_utf8() {
        let mut output = [0u8; 20];
        let mut decoder = REPLACEMENT.new_decoder();
        let input = b"\xEF\xBB\xBFA";

        let capacity = decoder
            .max_utf8_buffer_length_without_replacement(input.len())
            .unwrap();
        let (status, consumed, produced) =
            decoder.decode_to_utf8_without_replacement(input, &mut output[..capacity], true);

        assert_eq!(status, DecoderResult::InputEmpty);
        assert_eq!(consumed, input.len());
        assert_eq!(produced, 1);
        assert_eq!(output[0], 0x41);
    }
5803
5804
    // Round-trips a `Demo` value holding an encoding through serde_json and
    // through bincode; both round trips must reproduce the original value.
    // Only built when the `serde` feature is enabled.
    #[cfg(feature = "serde")]
    #[test]
    fn test_serde() {
        let original = Demo {
            num: 42,
            name: "foo".into(),
            enc: UTF_8,
        };

        // JSON round trip.
        let json = serde_json::to_string(&original).unwrap();
        let from_json: Demo = serde_json::from_str(&json).unwrap();
        assert_eq!(from_json, original);

        // bincode round trip.
        let packed = bincode::serialize(&original).unwrap();
        let from_bincode: Demo = bincode::deserialize(&packed[..]).unwrap();
        assert_eq!(from_bincode, original);
    }
5822
5823
    // `is_single_byte()` must be false for every encoding in the first
    // table and true for every encoding in the second one.
    #[test]
    fn test_is_single_byte() {
        let not_single_byte = [
            BIG5,
            EUC_JP,
            EUC_KR,
            GB18030,
            GBK,
            REPLACEMENT,
            SHIFT_JIS,
            UTF_8,
            UTF_16BE,
            UTF_16LE,
            ISO_2022_JP,
        ];
        for encoding in not_single_byte.iter() {
            assert!(!encoding.is_single_byte());
        }

        let single_byte = [
            IBM866,
            ISO_8859_2,
            ISO_8859_3,
            ISO_8859_4,
            ISO_8859_5,
            ISO_8859_6,
            ISO_8859_7,
            ISO_8859_8,
            ISO_8859_10,
            ISO_8859_13,
            ISO_8859_14,
            ISO_8859_15,
            ISO_8859_16,
            ISO_8859_8_I,
            KOI8_R,
            KOI8_U,
            MACINTOSH,
            WINDOWS_874,
            WINDOWS_1250,
            WINDOWS_1251,
            WINDOWS_1252,
            WINDOWS_1253,
            WINDOWS_1254,
            WINDOWS_1255,
            WINDOWS_1256,
            WINDOWS_1257,
            WINDOWS_1258,
            X_MAC_CYRILLIC,
            X_USER_DEFINED,
        ];
        for encoding in single_byte.iter() {
            assert!(encoding.is_single_byte());
        }
    }
5867
5868
    // Checks `Decoder::latin1_byte_compatible_up_to` on one fixed probe
    // buffer for every encoding, then checks how the answer changes while a
    // BOM-sniffing UTF-8 decoder is fed input piece by piece.
    #[test]
    fn test_latin1_byte_compatible_up_to() {
        let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

        // Expected answer for a fresh BOM-ignoring decoder of each encoding,
        // listed in the order the checks are performed.
        let expectations = [
            (BIG5, Some(1)),
            (EUC_JP, Some(1)),
            (EUC_KR, Some(1)),
            (GB18030, Some(1)),
            (GBK, Some(1)),
            (REPLACEMENT, None),
            (SHIFT_JIS, Some(1)),
            (UTF_8, Some(1)),
            (UTF_16BE, None),
            (UTF_16LE, None),
            (ISO_2022_JP, Some(1)),
            (IBM866, Some(1)),
            (ISO_8859_2, Some(2)),
            (ISO_8859_3, Some(2)),
            (ISO_8859_4, Some(2)),
            (ISO_8859_5, Some(2)),
            (ISO_8859_6, Some(2)),
            (ISO_8859_7, Some(2)),
            (ISO_8859_8, Some(3)),
            (ISO_8859_10, Some(2)),
            (ISO_8859_13, Some(4)),
            (ISO_8859_14, Some(4)),
            (ISO_8859_15, Some(6)),
            (ISO_8859_16, Some(4)),
            (ISO_8859_8_I, Some(3)),
            (KOI8_R, Some(1)),
            (KOI8_U, Some(1)),
            (MACINTOSH, Some(1)),
            (WINDOWS_874, Some(2)),
            (WINDOWS_1250, Some(4)),
            (WINDOWS_1251, Some(1)),
            (WINDOWS_1252, Some(5)),
            (WINDOWS_1253, Some(3)),
            (WINDOWS_1254, Some(4)),
            (WINDOWS_1255, Some(3)),
            (WINDOWS_1256, Some(1)),
            (WINDOWS_1257, Some(4)),
            (WINDOWS_1258, Some(4)),
            (X_MAC_CYRILLIC, Some(1)),
            (X_USER_DEFINED, Some(1)),
        ];
        for &(encoding, expected) in expectations.iter() {
            let decoder = encoding.new_decoder_without_bom_handling();
            assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), expected);
        }

        // A BOM-sniffing decoder that has not seen any input gives no answer.
        let untouched = UTF_8.new_decoder();
        assert_eq!(untouched.latin1_byte_compatible_up_to(buffer), None);

        // Feed a BOM-sniffing UTF-8 decoder step by step and check the answer
        // after each step: a partial BOM, the rest of the BOM, and then the
        // first byte of an incomplete sequence.
        let mut decoder = UTF_8.new_decoder();
        let mut output = [0u16; 4];
        let steps = [
            (&b"\xEF"[..], None),
            (&b"\xBB\xBF"[..], Some(1)),
            (&b"\xEF"[..], None),
        ];
        for &(chunk, expected) in steps.iter() {
            let _ = decoder.decode_to_utf16(chunk, &mut output, false);
            assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), expected);
        }
    }
6156
}