Coverage Report

Created: 2025-02-21 07:11

/rust/registry/src/index.crates.io-6f17d22bba15001f/encoding_rs-0.8.35/src/big5.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
use super::*;
11
use crate::data::*;
12
use crate::handles::*;
13
use crate::variant::*;
14
// Rust 1.14.0 requires the following despite the asterisk above.
15
use super::in_inclusive_range32;
16
17
/// Decoder state for Big5.
///
/// The only state kept between calls is an optional pending lead byte.
pub struct Big5Decoder {
    /// `Some(_)` when a lead has been consumed but its trail byte has not been
    /// seen yet; `None` in the neutral state.
    ///
    /// NOTE(review): the field is written by the macro-generated decode
    /// functions, which are not visible here -- confirm whether the stored
    /// value is the raw lead byte or the lead minus its 0x81 offset.
    lead: Option<u8>,
}
20
21
impl Big5Decoder {
22
0
    pub fn new() -> VariantDecoder {
23
0
        VariantDecoder::Big5(Big5Decoder { lead: None })
24
0
    }
25
26
0
    pub fn in_neutral_state(&self) -> bool {
27
0
        self.lead.is_none()
28
0
    }
29
30
0
    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
31
0
        byte_length.checked_add(match self.lead {
32
0
            None => 0,
33
0
            Some(_) => 1,
34
        })
35
0
    }
36
37
0
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
38
0
        // If there is a lead but the next byte isn't a valid trail, an
39
0
        // error is generated for the lead (+1). Then another iteration checks
40
0
        // space, which needs +1 to account for the possibility of astral
41
0
        // output or combining pair.
42
0
        checked_add(1, self.plus_one_if_lead(byte_length))
43
0
    }
44
45
0
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
46
0
        // No need to account for REPLACEMENT CHARACTERS.
47
0
        // Cases:
48
0
        // ASCII: 1 to 1
49
0
        // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
50
0
        // lead set and first byte is trail: 1 to 4 worst case
51
0
        //
52
0
        // When checking for space for the last byte:
53
0
        // no lead: the last byte must be ASCII (or fatal error): 1 to 1
54
0
        // lead set: space for 4 bytes was already checked when reading the
55
0
        // lead, hence the last lead and the last trail together are worst
56
0
        // case 2 to 4.
57
0
        //
58
0
        // If lead set and the input is a single trail byte, the worst-case
59
0
        // output is 4, so we need to add one before multiplying if lead is
60
0
        // set.
61
0
        //
62
0
        // Finally, add two so that if input is non-zero, the output is at
63
0
        // least 4.
64
0
        checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
65
0
    }
66
67
0
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
68
0
        // If there is a lead but the next byte isn't a valid trail, an
69
0
        // error is generated for the lead (+(1*3)). Then another iteration
70
0
        // checks space, which needs +3 to account for the possibility of astral
71
0
        // output or combining pair. In between start and end, the worst case
72
0
        // is that every byte is bad: *3.
73
0
        checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
74
0
    }
75
76
    ascii_compatible_two_byte_decoder_functions!(
77
        {
78
            // If lead is between 0x81 and 0xFE, inclusive,
79
            // subtract offset 0x81.
80
            let non_ascii_minus_offset =
81
                non_ascii.wrapping_sub(0x81);
82
            if non_ascii_minus_offset > (0xFE - 0x81) {
83
                return (DecoderResult::Malformed(1, 0),
84
                        source.consumed(),
85
                        handle.written());
86
            }
87
            non_ascii_minus_offset
88
        },
89
        {
90
            // If trail is between 0x40 and 0x7E, inclusive,
91
            // subtract offset 0x40. Else if trail is
92
            // between 0xA1 and 0xFE, inclusive, subtract
93
            // offset 0x62.
94
            // TODO: Find out which range is more probable.
95
            let mut trail_minus_offset =
96
                byte.wrapping_sub(0x40);
97
            if trail_minus_offset > (0x7E - 0x40) {
98
                let trail_minus_range_start =
99
                    byte.wrapping_sub(0xA1);
100
                if trail_minus_range_start >
101
                   (0xFE - 0xA1) {
102
                    if byte < 0x80 {
103
                        return (DecoderResult::Malformed(1, 0),
104
                                unread_handle_trail.unread(),
105
                                handle.written());
106
                    }
107
                    return (DecoderResult::Malformed(2, 0),
108
                            unread_handle_trail.consumed(),
109
                            handle.written());
110
                }
111
                trail_minus_offset = byte - 0x62;
112
            }
113
            let pointer = lead_minus_offset as usize *
114
                          157usize +
115
                          trail_minus_offset as usize;
116
            let rebased_pointer = pointer.wrapping_sub(942);
117
            let low_bits = big5_low_bits(rebased_pointer);
118
            if low_bits == 0 {
119
                match pointer {
120
                    1133 => {
121
                        handle.write_big5_combination(0x00CAu16,
122
                                                      0x0304u16)
123
                    }
124
                    1135 => {
125
                        handle.write_big5_combination(0x00CAu16,
126
                                                      0x030Cu16)
127
                    }
128
                    1164 => {
129
                        handle.write_big5_combination(0x00EAu16,
130
                                                      0x0304u16)
131
                    }
132
                    1166 => {
133
                        handle.write_big5_combination(0x00EAu16,
134
                                                      0x030Cu16)
135
                    }
136
                    _ => {
137
                        if byte < 0x80 {
138
                            return (DecoderResult::Malformed(1, 0),
139
                                    unread_handle_trail.unread(),
140
                                    handle.written());
141
                        }
142
                        return (DecoderResult::Malformed(2, 0),
143
                                unread_handle_trail.consumed(),
144
                                handle.written());
145
                    }
146
                }
147
            } else if big5_is_astral(rebased_pointer) {
148
                handle.write_astral(u32::from(low_bits) |
149
                                    0x20000u32)
150
            } else {
151
                handle.write_bmp_excl_ascii(low_bits)
152
            }
153
        },
154
        self,
155
        non_ascii,
156
        byte,
157
        lead_minus_offset,
158
        unread_handle_trail,
159
        source,
160
        handle,
161
        'outermost,
162
        copy_ascii_from_check_space_astral,
163
        check_space_astral,
164
        false);
165
}
166
167
/// Encoder for Big5.
///
/// A unit struct: it has no fields, so it carries no state of its own.
pub struct Big5Encoder;
168
169
impl Big5Encoder {
170
0
    pub fn new(encoding: &'static Encoding) -> Encoder {
171
0
        Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
172
0
    }
173
174
0
    pub fn max_buffer_length_from_utf16_without_replacement(
175
0
        &self,
176
0
        u16_length: usize,
177
0
    ) -> Option<usize> {
178
0
        // Astral: 2 to 2
179
0
        // ASCII: 1 to 1
180
0
        // Other: 1 to 2
181
0
        u16_length.checked_mul(2)
182
0
    }
183
184
0
    pub fn max_buffer_length_from_utf8_without_replacement(
185
0
        &self,
186
0
        byte_length: usize,
187
0
    ) -> Option<usize> {
188
0
        // Astral: 4 to 2
189
0
        // Upper BMP: 3 to 2
190
0
        // Lower BMP: 2 to 2
191
0
        // ASCII: 1 to 1
192
0
        byte_length.checked_add(1)
193
0
    }
194
195
    ascii_compatible_encoder_functions!(
196
        {
197
            // For simplicity, unified ideographs
198
            // in the pointer range 11206...11212 are handled
199
            // as Level 1 Hanzi.
200
            if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
201
                handle.write_two(lead, trail)
202
            } else {
203
                let pointer = if let Some(pointer) = big5_box_encode(bmp) {
204
                    pointer
205
                } else if let Some(pointer) = big5_other_encode(bmp) {
206
                    pointer
207
                } else {
208
                    return (
209
                        EncoderResult::unmappable_from_bmp(bmp),
210
                        source.consumed(),
211
                        handle.written(),
212
                    );
213
                };
214
                let lead = pointer / 157 + 0x81;
215
                let remainder = pointer % 157;
216
                let trail = if remainder < 0x3F {
217
                    remainder + 0x40
218
                } else {
219
                    remainder + 0x62
220
                };
221
                handle.write_two(lead as u8, trail as u8)
222
            }
223
        },
224
        {
225
            if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
226
                if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
227
                    // big5_astral_encode returns rebased pointer,
228
                    // so adding 0x87 instead of 0x81.
229
                    let lead = rebased_pointer / 157 + 0x87;
230
                    let remainder = rebased_pointer % 157;
231
                    let trail = if remainder < 0x3F {
232
                        remainder + 0x40
233
                    } else {
234
                        remainder + 0x62
235
                    };
236
                    handle.write_two(lead as u8, trail as u8)
237
                } else {
238
                    return (
239
                        EncoderResult::Unmappable(astral),
240
                        source.consumed(),
241
                        handle.written(),
242
                    );
243
                }
244
            } else {
245
                return (
246
                    EncoderResult::Unmappable(astral),
247
                    source.consumed(),
248
                    handle.written(),
249
                );
250
            }
251
        },
252
        bmp,
253
        astral,
254
        self,
255
        source,
256
        handle,
257
        copy_ascii_to_check_space_two,
258
        check_space_two,
259
        false
260
    );
261
}
262
263
// Any copyright to the test code below this comment is dedicated to the
264
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
265
266
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    // Checks that decoding `bytes` as Big5 yields `expect`, via the shared
    // `decode` helper from the crate's `testing` module.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    // Checks that encoding `string` as Big5 yields `expect`, via the shared
    // `encode` helper from the crate's `testing` module.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", "");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], "\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], "\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], "\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], "\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(&[0x61u8, 0x87u8, 0x40u8, 0x62u8], "\u{0061}\u{43F0}\u{0062}");
        decode_big5(&[0x61u8, 0xFEu8, 0xFEu8, 0x62u8], "\u{0061}\u{79D4}\u{0062}");
        decode_big5(&[0x61u8, 0xFEu8, 0xFDu8, 0x62u8], "\u{0061}\u{2910D}\u{0062}");
        decode_big5(&[0x61u8, 0x88u8, 0x62u8, 0x62u8], "\u{0061}\u{00CA}\u{0304}\u{0062}");
        decode_big5(&[0x61u8, 0x88u8, 0x64u8, 0x62u8], "\u{0061}\u{00CA}\u{030C}\u{0062}");
        decode_big5(&[0x61u8, 0x88u8, 0x66u8, 0x62u8], "\u{0061}\u{00CA}\u{0062}");
        decode_big5(&[0x61u8, 0x88u8, 0xA3u8, 0x62u8], "\u{0061}\u{00EA}\u{0304}\u{0062}");
        decode_big5(&[0x61u8, 0x88u8, 0xA5u8, 0x62u8], "\u{0061}\u{00EA}\u{030C}\u{0062}");
        decode_big5(&[0x61u8, 0x88u8, 0xA7u8, 0x62u8], "\u{0061}\u{00EA}\u{0062}");
        decode_big5(&[0x61u8, 0x99u8, 0xD4u8, 0x62u8], "\u{0061}\u{8991}\u{0062}");
        decode_big5(&[0x61u8, 0x99u8, 0xD5u8, 0x62u8], "\u{0061}\u{27967}\u{0062}");
        decode_big5(&[0x61u8, 0x99u8, 0xD6u8, 0x62u8], "\u{0061}\u{8A29}\u{0062}");

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    // Decodes the bundled input file and compares it with the reference
    // output; the input is expected to contain malformed sequences.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    // Encodes the bundled input file and compares it with the reference
    // output; every character is expected to be mappable.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    // Two consecutive low surrogates must each be emitted as the numeric
    // character reference for U+FFFD.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}