Coverage Report

Created: 2025-10-13 06:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/encoding_rs-0.8.35/src/gb18030.rs
Line
Count
Source
1
// Copyright Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
use super::*;
11
use crate::data::*;
12
use crate::gb18030_2022::*;
13
use crate::handles::*;
14
use crate::variant::*;
15
// Rust 1.14.0 requires the following despite the asterisk above.
16
use super::in_inclusive_range16;
17
use super::in_range16;
18
19
/// Zero to three bytes parked in the decoder's `pending` slot.
///
/// Each variant carries exactly as many `u8` payloads as its name states.
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for `Gb18030Pending::None`.
    fn is_none(&self) -> bool {
        // Every variant is listed on purpose: adding a variant later must
        // force a decision here instead of silently falling into a `_` arm.
        match *self {
            Gb18030Pending::None => true,
            Gb18030Pending::One(_)
            | Gb18030Pending::Two(_, _)
            | Gb18030Pending::Three(_, _, _) => false,
        }
    }

    /// Returns how many bytes the variant holds (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::None => 0,
            Gb18030Pending::One(_) => 1,
            Gb18030Pending::Two(_, _) => 2,
            Gb18030Pending::Three(_, _, _) => 3,
        }
    }
}
43
44
pub struct Gb18030Decoder {
45
    first: Option<u8>,
46
    second: Option<u8>,
47
    third: Option<u8>,
48
    pending: Gb18030Pending,
49
    pending_ascii: Option<u8>,
50
}
51
52
impl Gb18030Decoder {
53
0
    pub fn new() -> VariantDecoder {
54
0
        VariantDecoder::Gb18030(Gb18030Decoder {
55
0
            first: None,
56
0
            second: None,
57
0
            third: None,
58
0
            pending: Gb18030Pending::None,
59
0
            pending_ascii: None,
60
0
        })
61
0
    }
62
63
0
    pub fn in_neutral_state(&self) -> bool {
64
0
        self.first.is_none()
65
0
            && self.second.is_none()
66
0
            && self.third.is_none()
67
0
            && self.pending.is_none()
68
0
            && self.pending_ascii.is_none()
69
0
    }
70
71
0
    fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
72
0
        byte_length.checked_add(
73
0
            self.pending.count()
74
0
                + match self.first {
75
0
                    None => 0,
76
0
                    Some(_) => 1,
77
                }
78
0
                + match self.second {
79
0
                    None => 0,
80
0
                    Some(_) => 1,
81
                }
82
0
                + match self.third {
83
0
                    None => 0,
84
0
                    Some(_) => 1,
85
                }
86
0
                + match self.pending_ascii {
87
0
                    None => 0,
88
0
                    Some(_) => 1,
89
                },
90
        )
91
0
    }
92
93
0
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
94
        // ASCII: 1 to 1 (worst case)
95
        // gbk: 2 to 1
96
        // ranges: 4 to 1 or 4 to 2
97
0
        checked_add(1, self.extra_from_state(byte_length))
98
0
    }
99
100
0
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
101
        // ASCII: 1 to 1
102
        // gbk: 2 to 2 or 2 to 3
103
        // ranges: 4 to 2, 4 to 3 or 4 to 4
104
        // 0x80: 1 to 3 (worst case)
105
0
        self.max_utf8_buffer_length(byte_length)
106
0
    }
107
108
0
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
109
0
        checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
110
0
    }
111
112
    gb18030_decoder_functions!(
113
        {
114
            // If first is between 0x81 and 0xFE, inclusive,
115
            // subtract offset 0x81.
116
            let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
117
            if non_ascii_minus_offset > (0xFE - 0x81) {
118
                if non_ascii == 0x80 {
119
                    handle.write_upper_bmp(0x20ACu16);
120
                    continue 'outermost;
121
                }
122
                return (DecoderResult::Malformed(1, 0),
123
                        source.consumed(),
124
                        handle.written());
125
            }
126
            non_ascii_minus_offset
127
        },
128
        {
129
            // Two-byte (or error)
130
            if first_minus_offset >= 0x20 {
131
                // Not the gbk ideograph range above GB2312
132
                let trail_minus_offset = second.wrapping_sub(0xA1);
133
                if trail_minus_offset <= (0xFE - 0xA1) {
134
                    // GB2312
135
                    let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
136
                    if hanzi_lead < (0x77 - 0x2F) {
137
                        // Level 1 Hanzi, Level 2 Hanzi
138
                        // or one of the 5 PUA code
139
                        // points in between.
140
                        let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
141
                        let upper_bmp = GB2312_HANZI[hanzi_pointer];
142
                        handle.write_upper_bmp(upper_bmp)
143
                    } else if first_minus_offset == 0x20 {
144
                        // Symbols (starting with ideographic space)
145
                        let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
146
                        handle.write_bmp_excl_ascii(bmp)
147
                    } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
148
                        handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
149
                    } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
150
                        handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
151
                    } else if first_minus_offset > 0x76 {
152
                        // Bottom PUA
153
                        let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
154
                        handle.write_upper_bmp(pua)
155
                    } else {
156
                        let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
157
                        handle.write_bmp_excl_ascii(bmp)
158
                    }
159
                } else {
160
                    // gbk range on the left
161
                    let mut trail_minus_offset = second.wrapping_sub(0x40);
162
                    if trail_minus_offset > (0x7E - 0x40) {
163
                        let trail_minus_range_start = second.wrapping_sub(0x80);
164
                        if trail_minus_range_start > (0xA0 - 0x80) {
165
                            if second < 0x80 {
166
                                return (DecoderResult::Malformed(1, 0),
167
                                        unread_handle_second.unread(),
168
                                        handle.written());
169
                            }
170
                            return (DecoderResult::Malformed(2, 0),
171
                                    unread_handle_second.consumed(),
172
                                    handle.written());
173
                        }
174
                        trail_minus_offset = second - 0x41;
175
                    }
176
                    // Zero-base lead
177
                    let left_lead = first_minus_offset - 0x20;
178
                    let left_pointer = left_lead as usize * (190 - 94) +
179
                                       trail_minus_offset as usize;
180
                    let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
181
                    if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
182
                        let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
183
                        handle.write_upper_bmp(upper_bmp)
184
                    } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
185
                        let bmp = gbk_other_decode(left_pointer as u16);
186
                        handle.write_bmp_excl_ascii(bmp)
187
                    } else {
188
                        let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
189
                        let upper_bmp = GBK_BOTTOM[bottom_pointer];
190
                        handle.write_upper_bmp(upper_bmp)
191
                    }
192
                }
193
            } else {
194
                // gbk ideograph range above GB2312
195
                let mut trail_minus_offset = second.wrapping_sub(0x40);
196
                if trail_minus_offset > (0x7E - 0x40) {
197
                    let trail_minus_range_start = second.wrapping_sub(0x80);
198
                    if trail_minus_range_start > (0xFE - 0x80) {
199
                        if second < 0x80 {
200
                            return (DecoderResult::Malformed(1, 0),
201
                                    unread_handle_second.unread(),
202
                                    handle.written());
203
                        }
204
                        return (DecoderResult::Malformed(2, 0),
205
                                unread_handle_second.consumed(),
206
                                handle.written());
207
                    }
208
                    trail_minus_offset = second - 0x41;
209
                }
210
                let pointer = first_minus_offset as usize * 190usize +
211
                              trail_minus_offset as usize;
212
                let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
213
                handle.write_upper_bmp(upper_bmp)
214
            }
215
        },
216
        {
217
            // If third is between 0x81 and 0xFE, inclusive,
218
            // subtract offset 0x81.
219
            let third_minus_offset = third.wrapping_sub(0x81);
220
            if third_minus_offset > (0xFE - 0x81) {
221
                // We have an error. Let's inline what's going
222
                // to happen when `second` is
223
                // reprocessed. (`third` gets unread.)
224
                // `second` is guaranteed ASCII, so let's
225
                // put it in `pending_ascii`. Recompute
226
                // `second` from `second_minus_offset`.
227
                self.pending_ascii = Some(second_minus_offset + 0x30);
228
                // Now unread `third` and designate the previous
229
                // `first` as being in error.
230
                return (DecoderResult::Malformed(1, 1),
231
                        unread_handle_third.unread(),
232
                        handle.written());
233
            }
234
            third_minus_offset
235
        },
236
        {
237
            // If fourth is between 0x30 and 0x39, inclusive,
238
            // subtract offset 0x30.
239
            //
240
            // If we have an error, we'll inline what's going
241
            // to happen when `second` and `third` are
242
            // reprocessed. (`fourth` gets unread.)
243
            // `second` is guaranteed ASCII, so let's
244
            // put it in `pending_ascii`. Recompute
245
            // `second` from `second_minus_offset` to
246
            // make this block reusable when `second`
247
            // is not in scope.
248
            //
249
            // `third` is guaranteed to be in the range
250
            // that makes it become the new `self.first`.
251
            //
252
            // `fourth` gets unread and the previous
253
            // `first` gets designates as being in error.
254
            let fourth_minus_offset = fourth.wrapping_sub(0x30);
255
            if fourth_minus_offset > (0x39 - 0x30) {
256
                self.pending_ascii = Some(second_minus_offset + 0x30);
257
                self.pending = Gb18030Pending::One(third_minus_offset);
258
                return (DecoderResult::Malformed(1, 2),
259
                        unread_handle_fourth.unread(),
260
                        handle.written());
261
            }
262
            let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
263
                          (second_minus_offset as usize * (10 * 126)) +
264
                          (third_minus_offset as usize * 10) +
265
                          fourth_minus_offset as usize;
266
            if pointer <= 39419 {
267
                // BMP
268
                if pointer == 7457 {
269
                    handle.write_upper_bmp(0xE7C7)
270
                } else {
271
                    handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
272
                }
273
            } else if pointer >= 189_000 && pointer <= 1_237_575 {
274
                // Astral
275
                handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
276
            } else {
277
                return (DecoderResult::Malformed(4, 0),
278
                        unread_handle_fourth.consumed(),
279
                        handle.written());
280
            }
281
        },
282
        self,
283
        non_ascii,
284
        first_minus_offset,
285
        second,
286
        second_minus_offset,
287
        unread_handle_second,
288
        third,
289
        third_minus_offset,
290
        unread_handle_third,
291
        fourth,
292
        fourth_minus_offset,
293
        unread_handle_fourth,
294
        source,
295
        handle,
296
        'outermost);
297
}
298
299
// XXX Experiment with inline directives
300
0
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
301
    // Try ideographic punctuation first as it's the most likely case.
302
    // Throwing in the check for full-width currencies and tilde is probably
303
    // more size-efficient here than elsewhere.
304
0
    if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
305
0
        if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
306
0
            return Some((0xA1, pos + 0xA1));
307
0
        }
308
0
    }
309
    // Ext A
310
0
    if in_range16(bmp, 0x3400, 0x4E00) {
311
0
        return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
312
            (
313
                0xFE,
314
0
                pos + if pos < (0x3F - 16) {
315
0
                    0x40 + 16
316
                } else {
317
0
                    0x41 + 16
318
                },
319
            )
320
0
        });
321
0
    }
322
    // Compatibility ideographs
323
0
    if in_range16(bmp, 0xF900, 0xFB00) {
324
0
        return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
325
0
            if pos < 5 {
326
                // end of second to last row
327
0
                (0xFD, pos + (190 - 94 - 5 + 0x41))
328
            } else {
329
                // last row
330
0
                (0xFE, pos + (0x40 - 5))
331
            }
332
0
        });
333
0
    }
334
    // Handle everything below U+02CA, which is in GBK_OTHER.
335
0
    if bmp < 0x02CA {
336
0
        if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
337
            // Pinyin except U+1E3F
338
0
            if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
339
0
                return Some((0xA8, pos + 0xA1));
340
0
            }
341
0
        } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
342
0
            || in_inclusive_range16(bmp, 0x02C7, 0x02C9)
343
        {
344
            // Diacritics and Latin 1 symbols
345
0
            if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
346
0
                return Some((0xA1, pos + 0xA1 + 3));
347
0
            }
348
0
        }
349
0
        return None;
350
0
    }
351
352
0
    if in_inclusive_range16(bmp, 0xE78D, 0xE864) {
353
        // The array is sorted but short, so let's do linear search.
354
0
        if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) {
355
0
            let pair = &GB18030_2022_OVERRIDE_BYTES[pos];
356
0
            return Some((pair[0].into(), pair[1].into()));
357
0
        }
358
0
    } else if bmp >= 0xFE17 {
359
        // Various brackets, all in full-width regions
360
0
        if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
361
0
            return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
362
0
        }
363
0
    } else if bmp == 0x1E3F {
364
        // The one Pinyin placed elsewhere on the BMP
365
0
        return Some((0xA8, 0x7B - 0x60 + 0xA1));
366
0
    } else if in_range16(bmp, 0xA000, 0xD800) {
367
        // Since Korean has usage in China, let's spend a branch to fast-track
368
        // Hangul.
369
0
        return None;
370
0
    }
371
    // GB2312 other (except bottom PUA and PUA between Hanzi levels).
372
0
    if let Some(other_pointer) = gb2312_other_encode(bmp) {
373
0
        let other_lead = other_pointer as usize / 94;
374
0
        let other_trail = other_pointer as usize % 94;
375
0
        return Some((0xA2 + other_lead, 0xA1 + other_trail));
376
0
    }
377
    // At this point, we've handled all mappable characters above U+02D9 but
378
    // below U+2010. Let's check for that range in order to let lower BMP
379
    // characters used for minority languages in China avoid the subsequent
380
    // search that deals mainly with various symbols.
381
0
    if in_range16(bmp, 0x02DA, 0x2010) {
382
0
        return None;
383
0
    }
384
    // GBK other (except radicals and PUA in GBK_BOTTOM).
385
0
    if let Some(other_pointer) = gbk_other_encode(bmp) {
386
0
        let other_lead = other_pointer as usize / (190 - 94);
387
0
        let other_trail = other_pointer as usize % (190 - 94);
388
0
        let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
389
0
        return Some((other_lead + (0x81 + 0x20), other_trail + offset));
390
0
    }
391
    // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM
392
0
    if in_inclusive_range16(bmp, 0x2E81, 0x2ECA)
393
0
        || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB)
394
0
        || in_inclusive_range16(bmp, 0xE816, 0xE855)
395
    {
396
0
        if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
397
0
            let trail = pos + 16;
398
0
            let offset = if trail < 0x3F { 0x40 } else { 0x41 };
399
0
            return Some((0xFE, trail + offset));
400
0
        }
401
0
    }
402
    // GB2312 bottom PUA
403
0
    let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
404
0
    if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
405
0
        let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
406
0
        let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
407
0
        return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
408
0
    }
409
    // PUA between Hanzi Levels
410
0
    let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
411
0
    if bmp_minus_pua_between_hanzi < 5 {
412
0
        return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
413
0
    }
414
0
    None
415
0
}
416
417
#[cfg(not(feature = "fast-gb-hanzi-encode"))]
418
#[inline(always)]
419
0
fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
420
0
    if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
421
0
        (lead, trail)
422
0
    } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
423
0
        let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
424
0
        let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
425
0
        (hanzi_lead as u8, hanzi_trail as u8)
426
    } else {
427
0
        let (lead, gbk_trail) = if bmp < 0x72DC {
428
            // Above GB2312
429
0
            let pointer = gbk_top_ideograph_encode(bmp) as usize;
430
0
            let lead = (pointer / 190) + 0x81;
431
0
            let gbk_trail = pointer % 190;
432
0
            (lead, gbk_trail)
433
        } else {
434
            // To the left of GB2312
435
0
            let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
436
0
            let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
437
0
            let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
438
0
            (lead, gbk_trail)
439
        };
440
0
        let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
441
0
        (lead as u8, (gbk_trail + offset) as u8)
442
    }
443
0
}
444
445
/// Encodes a CJK Unified Ideograph as a `(lead, trail)` byte pair; this is
/// the variant built with `fast-gb-hanzi-encode`.
///
/// Only the second argument (the code unit minus 0x4E00, as computed by the
/// caller) is used; it is passed straight to `gbk_hanzi_encode`.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    gbk_hanzi_encode(bmp_minus_unified_start)
}
450
451
pub struct Gb18030Encoder {
452
    extended: bool,
453
}
454
455
impl Gb18030Encoder {
456
0
    pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
457
0
        Encoder::new(
458
0
            encoding,
459
0
            VariantEncoder::Gb18030(Gb18030Encoder {
460
0
                extended: extended_range,
461
0
            }),
462
        )
463
0
    }
464
465
0
    pub fn max_buffer_length_from_utf16_without_replacement(
466
0
        &self,
467
0
        u16_length: usize,
468
0
    ) -> Option<usize> {
469
0
        if self.extended {
470
0
            u16_length.checked_mul(4)
471
        } else {
472
            // Need to add, because space check is done with the four-byte
473
            // assumption.
474
0
            checked_add(2, u16_length.checked_mul(2))
475
        }
476
0
    }
477
478
0
    pub fn max_buffer_length_from_utf8_without_replacement(
479
0
        &self,
480
0
        byte_length: usize,
481
0
    ) -> Option<usize> {
482
0
        if self.extended {
483
            // 1 to 1
484
            // 2 to 2
485
            // 3 to 2
486
            // 2 to 4 (worst)
487
            // 3 to 4
488
            // 4 to 4
489
0
            checked_add(2, byte_length.checked_mul(2))
490
        } else {
491
            // 1 to 1
492
            // 2 to 2
493
            // 3 to 2
494
            // Need to add, because space check is done with the four-byte
495
            // assumption.
496
0
            byte_length.checked_add(3)
497
        }
498
0
    }
499
500
    ascii_compatible_encoder_functions!(
501
        {
502
            let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
503
            if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
504
                // CJK Unified Ideographs
505
                // Can't fail now, since all are
506
                // mapped.
507
                let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
508
                handle.write_two(lead, trail)
509
            } else if bmp == 0xE5E5 {
510
                // It's not optimal to check for the unmappable
511
                // and for euro at this stage, but getting
512
                // the out of the way makes the rest of the
513
                // code less messy.
514
                return (
515
                    EncoderResult::unmappable_from_bmp(bmp),
516
                    source.consumed(),
517
                    handle.written(),
518
                );
519
            } else if bmp == 0x20AC && !self.extended {
520
                handle.write_one(0x80u8)
521
            } else {
522
                match gbk_encode_non_unified(bmp) {
523
                    Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
524
                    None => {
525
                        if !self.extended {
526
                            return (
527
                                EncoderResult::unmappable_from_bmp(bmp),
528
                                source.consumed(),
529
                                handle.written(),
530
                            );
531
                        }
532
                        let range_pointer = gb18030_range_encode(bmp);
533
                        let first = range_pointer / (10 * 126 * 10);
534
                        let rem_first = range_pointer % (10 * 126 * 10);
535
                        let second = rem_first / (10 * 126);
536
                        let rem_second = rem_first % (10 * 126);
537
                        let third = rem_second / 10;
538
                        let fourth = rem_second % 10;
539
                        handle.write_four(
540
                            (first + 0x81) as u8,
541
                            (second + 0x30) as u8,
542
                            (third + 0x81) as u8,
543
                            (fourth + 0x30) as u8,
544
                        )
545
                    }
546
                }
547
            }
548
        },
549
        {
550
            if !self.extended {
551
                return (
552
                    EncoderResult::Unmappable(astral),
553
                    source.consumed(),
554
                    handle.written(),
555
                );
556
            }
557
            let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
558
            let first = range_pointer / (10 * 126 * 10);
559
            let rem_first = range_pointer % (10 * 126 * 10);
560
            let second = rem_first / (10 * 126);
561
            let rem_second = rem_first % (10 * 126);
562
            let third = rem_second / 10;
563
            let fourth = rem_second % 10;
564
            handle.write_four(
565
                (first + 0x81) as u8,
566
                (second + 0x30) as u8,
567
                (third + 0x81) as u8,
568
                (fourth + 0x30) as u8,
569
            )
570
        },
571
        bmp,
572
        astral,
573
        self,
574
        source,
575
        handle,
576
        copy_ascii_to_check_space_four,
577
        check_space_four,
578
        false
579
    );
580
}
581
582
// Any copyright to the test code below this comment is dedicated to the
583
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
584
585
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as gb18030 and checks the result against `expect`.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Encodes `string` as gb18030 and checks the result against `expect`.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Encodes `string` as GBK and checks the result against `expect`.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", "");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // Changes between GB18030-2005 and GB18030-2022
        decode_gb18030(b"\xFE\x7E", "\u{9FB9}");
        decode_gb18030(b"\xA6\xDD", "\u{FE14}");

        // These mappings remain in place the GB18030-2005 way despite GB18030-2022
        decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}");
        decode_gb18030(b"\x84\x31\x83\x30", "\u{FE14}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{4E90}", b"\x81\x80");
            encode_gb18030("\u{4FA2}", b"\x81\xFE");
            encode_gb18030("\u{FA0C}", b"\xFE\x40");
            encode_gb18030("\u{E843}", b"\xFE\x7E");
            encode_gb18030("\u{4723}", b"\xFE\x80");
            encode_gb18030("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original GB18030
        encode_gb18030("\u{E5E5}", b"&#58853;");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
            encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
            encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        encode_gb18030("\u{9FB9}", b"\xFE\x7E");
        encode_gb18030("\u{FE14}", b"\xA6\xDD");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gb18030("\u{E817}", b"\xFE\x52");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{4E90}", b"\x81\x80");
            encode_gbk("\u{4FA2}", b"\x81\xFE");
            encode_gbk("\u{FA0C}", b"\xFE\x40");
            encode_gbk("\u{E843}", b"\xFE\x7E");
            encode_gbk("\u{4723}", b"\xFE\x80");
            encode_gbk("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original gb18030
        encode_gbk("\u{E5E5}", b"&#58853;");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gbk("\u{0080}", b"&#128;");
        encode_gbk("\u{E7C7}", b"&#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{2603}", b"&#9731;");
            encode_gbk("\u{1F4A9}", b"&#128169;");
            encode_gbk("\u{10FFFF}", b"&#1114111;");
        }

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        // These go through the GBK encoder: the two-byte lookup is shared
        // with gb18030, so the expected bytes are the same, but calling
        // `encode_gb18030` here would leave the GBK path untested.
        encode_gbk("\u{9FB9}", b"\xFE\x7E");
        encode_gbk("\u{FE14}", b"\xA6\xDD");
        encode_gbk("\u{E843}", b"\xFE\x7E");
        encode_gbk("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gbk("\u{E817}", b"\xFE\x52");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        // The buffer is sized from the encoder's own worst-case estimate, so
        // a two-byte result for a single code unit must fit.
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}