/rust/registry/src/index.crates.io-1949cf8c6b5b557f/encoding_rs-0.8.35/src/gb18030.rs
Line | Count | Source |
1 | | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | use super::*; |
11 | | use crate::data::*; |
12 | | use crate::gb18030_2022::*; |
13 | | use crate::handles::*; |
14 | | use crate::variant::*; |
15 | | // Rust 1.14.0 requires the following despite the asterisk above. |
16 | | use super::in_inclusive_range16; |
17 | | use super::in_range16; |
18 | | |
/// Zero to three bytes stored in `Gb18030Decoder::pending`.
///
/// The only write visible in this file stores `One(third_minus_offset)` when
/// the fourth byte of a four-byte sequence turns out to be invalid; the
/// comment at that site says the third byte is meant to become the new lead
/// byte. Nothing in this file constructs `Two` or `Three`, and the code that
/// reads the value back is not visible here (presumably it lives in the
/// `gb18030_decoder_functions!` macro -- confirm there).
19 | | enum Gb18030Pending { |
20 | | None, |
21 | | One(u8), |
22 | | Two(u8, u8), |
23 | | Three(u8, u8, u8), |
24 | | } |
25 | | |
26 | | impl Gb18030Pending { |
/// Returns `true` only for the `None` variant, i.e. when no byte is held.
27 | 0 | fn is_none(&self) -> bool { |
28 | 0 | match *self { |
29 | 0 | Gb18030Pending::None => true, |
30 | 0 | _ => false, |
31 | | } |
32 | 0 | } |
33 | | |
/// Returns how many bytes the current variant carries (0 through 3).
///
/// `Gb18030Decoder::extra_from_state` adds this to the input length when
/// sizing output buffers.
34 | 0 | fn count(&self) -> usize { |
35 | 0 | match *self { |
36 | 0 | Gb18030Pending::None => 0, |
37 | 0 | Gb18030Pending::One(_) => 1, |
38 | 0 | Gb18030Pending::Two(_, _) => 2, |
39 | 0 | Gb18030Pending::Three(_, _, _) => 3, |
40 | | } |
41 | 0 | } |
42 | | } |
43 | | |
/// Decoder state for gb18030.
///
/// All five fields are empty in a freshly created decoder and
/// `in_neutral_state` reports `true` only when all of them are empty again.
///
/// * `first`, `second`, `third` -- presumably the already-consumed bytes of a
///   multi-byte sequence that is split across input buffers; they are only
///   read, never written, in the code visible in this file (the writes are
///   expected to be in `gb18030_decoder_functions!` -- confirm there).
/// * `pending` -- bytes set aside on an error path (see `Gb18030Pending`).
/// * `pending_ascii` -- an ASCII byte set aside on an error path: when the
///   third or fourth byte of a four-byte sequence is invalid, the second byte
///   (recomputed as `second_minus_offset + 0x30`) is stored here.
44 | | pub struct Gb18030Decoder { |
45 | | first: Option<u8>, |
46 | | second: Option<u8>, |
47 | | third: Option<u8>, |
48 | | pending: Gb18030Pending, |
49 | | pending_ascii: Option<u8>, |
50 | | } |
51 | | |
52 | | impl Gb18030Decoder { |
/// Creates a decoder whose five state fields are all empty and wraps it in
/// `VariantDecoder::Gb18030`.
53 | 0 | pub fn new() -> VariantDecoder { |
54 | 0 | VariantDecoder::Gb18030(Gb18030Decoder { |
55 | 0 | first: None, |
56 | 0 | second: None, |
57 | 0 | third: None, |
58 | 0 | pending: Gb18030Pending::None, |
59 | 0 | pending_ascii: None, |
60 | 0 | }) |
61 | 0 | } |
62 | | |
/// Returns `true` when none of `first`, `second`, `third`, `pending` and
/// `pending_ascii` holds a byte.
63 | 0 | pub fn in_neutral_state(&self) -> bool { |
64 | 0 | self.first.is_none() |
65 | 0 | && self.second.is_none() |
66 | 0 | && self.third.is_none() |
67 | 0 | && self.pending.is_none() |
68 | 0 | && self.pending_ascii.is_none() |
69 | 0 | } |
70 | | |
/// Returns `byte_length` plus the number of bytes currently held in the
/// decoder state, or `None` if that sum overflows `usize`.
///
/// The state contribution is `pending.count()` plus one for each of
/// `first`, `second`, `third` and `pending_ascii` that is `Some`.
71 | 0 | fn extra_from_state(&self, byte_length: usize) -> Option<usize> { |
72 | 0 | byte_length.checked_add( |
73 | 0 | self.pending.count() |
74 | 0 | + match self.first { |
75 | 0 | None => 0, |
76 | 0 | Some(_) => 1, |
77 | | } |
78 | 0 | + match self.second { |
79 | 0 | None => 0, |
80 | 0 | Some(_) => 1, |
81 | | } |
82 | 0 | + match self.third { |
83 | 0 | None => 0, |
84 | 0 | Some(_) => 1, |
85 | | } |
86 | 0 | + match self.pending_ascii { |
87 | 0 | None => 0, |
88 | 0 | Some(_) => 1, |
89 | | }, |
90 | | ) |
91 | 0 | } |
92 | | |
/// Output-size bound in UTF-16 code units for decoding `byte_length` more
/// input bytes: the state-adjusted input length plus one.
///
/// Uses the crate-level `checked_add` helper (defined outside this file);
/// presumably it yields `None` on overflow -- confirm against its definition.
93 | 0 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
94 | | // ASCII: 1 to 1 (worst case) |
95 | | // gbk: 2 to 1 |
96 | | // ranges: 4 to 1 or 4 to 2 |
97 | 0 | checked_add(1, self.extra_from_state(byte_length)) |
98 | 0 | } |
99 | | |
/// Output-size bound in UTF-8 bytes when malformed input is reported
/// rather than replaced. Delegates to `max_utf8_buffer_length`, so both
/// bounds are the same.
100 | 0 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
101 | | // ASCII: 1 to 1 |
102 | | // gbk: 2 to 2 or 2 to 3 |
103 | | // ranges: 4 to 2, 4 to 3 or 4 to 4 |
104 | | // 0x80: 1 to 3 (worst case) |
105 | 0 | self.max_utf8_buffer_length(byte_length) |
106 | 0 | } |
107 | | |
/// Output-size bound in UTF-8 bytes: three times the state-adjusted input
/// length, plus one, computed with the crate-level `checked_mul` /
/// `checked_add` helpers (defined outside this file).
108 | 0 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
109 | 0 | checked_add(1, checked_mul(3, self.extra_from_state(byte_length))) |
110 | 0 | } |
111 | | |
// The macro below (defined elsewhere in the crate) receives four code
// blocks -- handling of the first non-ASCII byte, of a two-byte sequence,
// of the third byte and of the fourth byte -- followed by the identifiers
// those blocks use.
112 | | gb18030_decoder_functions!( |
// Block 1: classify the first non-ASCII byte. 0x80 is written as U+20AC,
// any other byte outside 0x81..=0xFE is reported as malformed, and a
// valid lead evaluates to `byte - 0x81`.
113 | | { |
114 | | // If first is between 0x81 and 0xFE, inclusive, |
115 | | // subtract offset 0x81. |
116 | | let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81); |
117 | | if non_ascii_minus_offset > (0xFE - 0x81) { |
118 | | if non_ascii == 0x80 { |
119 | | handle.write_upper_bmp(0x20ACu16); |
120 | | continue 'outermost; |
121 | | } |
122 | | return (DecoderResult::Malformed(1, 0), |
123 | | source.consumed(), |
124 | | handle.written()); |
125 | | } |
126 | | non_ascii_minus_offset |
127 | | }, |
// Block 2: decode a two-byte sequence from `first_minus_offset` and
// `second`, or report it as malformed.
128 | | { |
129 | | // Two-byte (or error) |
// `first_minus_offset >= 0x20` means the lead byte is 0xA1 or above.
130 | | if first_minus_offset >= 0x20 { |
131 | | // Not the gbk ideograph range above GB2312 |
132 | | let trail_minus_offset = second.wrapping_sub(0xA1); |
133 | | if trail_minus_offset <= (0xFE - 0xA1) { |
134 | | // GB2312 |
// `hanzi_lead` is in range for lead bytes 0xB0..=0xF7.
135 | | let hanzi_lead = first_minus_offset.wrapping_sub(0x2F); |
136 | | if hanzi_lead < (0x77 - 0x2F) { |
137 | | // Level 1 Hanzi, Level 2 Hanzi |
138 | | // or one of the 5 PUA code |
139 | | // points in between. |
140 | | let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize; |
141 | | let upper_bmp = GB2312_HANZI[hanzi_pointer]; |
142 | | handle.write_upper_bmp(upper_bmp) |
143 | | } else if first_minus_offset == 0x20 { |
144 | | // Symbols (starting with ideographic space) |
145 | | let bmp = GB2312_SYMBOLS[trail_minus_offset as usize]; |
146 | | handle.write_bmp_excl_ascii(bmp) |
// Lead 0xA6: only trail indices from 63 upward that fall inside the
// table are handled here; everything else falls through to the final
// `else` branch.
147 | | } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) { |
148 | | handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize]) |
// Lead 0xA8: the Pinyin table covers the start of the row.
149 | | } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() { |
150 | | handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize]) |
// Lead bytes 0xF8..=0xFE.
151 | | } else if first_minus_offset > 0x76 { |
152 | | // Bottom PUA |
153 | | let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16; |
154 | | handle.write_upper_bmp(pua) |
155 | | } else { |
156 | | let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16); |
157 | | handle.write_bmp_excl_ascii(bmp) |
158 | | } |
159 | | } else { |
160 | | // gbk range on the left |
// Valid trails here are 0x40..=0x7E and 0x80..=0xA0. A trail below 0x80
// that is not in the first range is un-read and only the lead is
// reported as malformed; any other invalid trail is consumed and both
// bytes are reported.
161 | | let mut trail_minus_offset = second.wrapping_sub(0x40); |
162 | | if trail_minus_offset > (0x7E - 0x40) { |
163 | | let trail_minus_range_start = second.wrapping_sub(0x80); |
164 | | if trail_minus_range_start > (0xA0 - 0x80) { |
165 | | if second < 0x80 { |
166 | | return (DecoderResult::Malformed(1, 0), |
167 | | unread_handle_second.unread(), |
168 | | handle.written()); |
169 | | } |
170 | | return (DecoderResult::Malformed(2, 0), |
171 | | unread_handle_second.consumed(), |
172 | | handle.written()); |
173 | | } |
// Subtracting 0x41 instead of 0x40 closes the gap left by 0x7F, so
// trail 0x80 gets the index right after trail 0x7E.
174 | | trail_minus_offset = second - 0x41; |
175 | | } |
176 | | // Zero-base lead |
177 | | let left_lead = first_minus_offset - 0x20; |
// Each lead contributes 190 - 94 = 96 trail positions in this region.
178 | | let left_pointer = left_lead as usize * (190 - 94) + |
179 | | trail_minus_offset as usize; |
// `wrapping_sub` makes pointers below the ideograph region wrap to a
// huge value, so the single `<` test below rejects both ends.
180 | | let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94)); |
181 | | if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) { |
182 | | let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16); |
183 | | handle.write_upper_bmp(upper_bmp) |
184 | | } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) { |
185 | | let bmp = gbk_other_decode(left_pointer as u16); |
186 | | handle.write_bmp_excl_ascii(bmp) |
187 | | } else { |
188 | | let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5); |
189 | | let upper_bmp = GBK_BOTTOM[bottom_pointer]; |
190 | | handle.write_upper_bmp(upper_bmp) |
191 | | } |
192 | | } |
193 | | } else { |
194 | | // gbk ideograph range above GB2312 |
// Same trail handling as above, except that the second trail range
// extends to 0xFE (190 trail positions per lead).
195 | | let mut trail_minus_offset = second.wrapping_sub(0x40); |
196 | | if trail_minus_offset > (0x7E - 0x40) { |
197 | | let trail_minus_range_start = second.wrapping_sub(0x80); |
198 | | if trail_minus_range_start > (0xFE - 0x80) { |
199 | | if second < 0x80 { |
200 | | return (DecoderResult::Malformed(1, 0), |
201 | | unread_handle_second.unread(), |
202 | | handle.written()); |
203 | | } |
204 | | return (DecoderResult::Malformed(2, 0), |
205 | | unread_handle_second.consumed(), |
206 | | handle.written()); |
207 | | } |
208 | | trail_minus_offset = second - 0x41; |
209 | | } |
210 | | let pointer = first_minus_offset as usize * 190usize + |
211 | | trail_minus_offset as usize; |
212 | | let upper_bmp = gbk_top_ideograph_decode(pointer as u16); |
213 | | handle.write_upper_bmp(upper_bmp) |
214 | | } |
215 | | }, |
// Block 3: validate the third byte of a four-byte sequence; a valid
// byte evaluates to `third - 0x81`.
216 | | { |
217 | | // If third is between 0x81 and 0xFE, inclusive, |
218 | | // subtract offset 0x81. |
219 | | let third_minus_offset = third.wrapping_sub(0x81); |
220 | | if third_minus_offset > (0xFE - 0x81) { |
221 | | // We have an error. Let's inline what's going |
222 | | // to happen when `second` is |
223 | | // reprocessed. (`third` gets unread.) |
224 | | // `second` is guaranteed ASCII, so let's |
225 | | // put it in `pending_ascii`. Recompute |
226 | | // `second` from `second_minus_offset`. |
227 | | self.pending_ascii = Some(second_minus_offset + 0x30); |
228 | | // Now unread `third` and designate the previous |
229 | | // `first` as being in error. |
230 | | return (DecoderResult::Malformed(1, 1), |
231 | | unread_handle_third.unread(), |
232 | | handle.written()); |
233 | | } |
234 | | third_minus_offset |
235 | | }, |
// Block 4: validate the fourth byte, combine the four zero-based bytes
// into one pointer and write the code point it maps to.
236 | | { |
237 | | // If fourth is between 0x30 and 0x39, inclusive, |
238 | | // subtract offset 0x30. |
239 | | // |
240 | | // If we have an error, we'll inline what's going |
241 | | // to happen when `second` and `third` are |
242 | | // reprocessed. (`fourth` gets unread.) |
243 | | // `second` is guaranteed ASCII, so let's |
244 | | // put it in `pending_ascii`. Recompute |
245 | | // `second` from `second_minus_offset` to |
246 | | // make this block reusable when `second` |
247 | | // is not in scope. |
248 | | // |
249 | | // `third` is guaranteed to be in the range |
250 | | // that makes it become the new `self.first`. |
251 | | // |
252 | | // `fourth` gets unread and the previous |
253 | | // `first` gets designated as being in error. |
254 | | let fourth_minus_offset = fourth.wrapping_sub(0x30); |
255 | | if fourth_minus_offset > (0x39 - 0x30) { |
256 | | self.pending_ascii = Some(second_minus_offset + 0x30); |
257 | | self.pending = Gb18030Pending::One(third_minus_offset); |
258 | | return (DecoderResult::Malformed(1, 2), |
259 | | unread_handle_fourth.unread(), |
260 | | handle.written()); |
261 | | } |
// Mixed-radix combination: the bytes have 126, 10, 126 and 10 possible
// values respectively.
262 | | let pointer = (first_minus_offset as usize * (10 * 126 * 10)) + |
263 | | (second_minus_offset as usize * (10 * 126)) + |
264 | | (third_minus_offset as usize * 10) + |
265 | | fourth_minus_offset as usize; |
266 | | if pointer <= 39419 { |
267 | | // BMP |
// Pointer 7457 is special-cased to U+E7C7 instead of going through
// `gb18030_range_decode`.
268 | | if pointer == 7457 { |
269 | | handle.write_upper_bmp(0xE7C7) |
270 | | } else { |
271 | | handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16)) |
272 | | } |
// Pointers 189_000..=1_237_575 map linearly onto U+10000..=U+10FFFF.
273 | | } else if pointer >= 189_000 && pointer <= 1_237_575 { |
274 | | // Astral |
275 | | handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32) |
276 | | } else { |
277 | | return (DecoderResult::Malformed(4, 0), |
278 | | unread_handle_fourth.consumed(), |
279 | | handle.written()); |
280 | | } |
281 | | }, |
// Identifiers the macro binds for use inside the four blocks above.
282 | | self, |
283 | | non_ascii, |
284 | | first_minus_offset, |
285 | | second, |
286 | | second_minus_offset, |
287 | | unread_handle_second, |
288 | | third, |
289 | | third_minus_offset, |
290 | | unread_handle_third, |
291 | | fourth, |
292 | | fourth_minus_offset, |
293 | | unread_handle_fourth, |
294 | | source, |
295 | | handle, |
296 | | 'outermost); |
297 | | } |
298 | | |
299 | | // XXX Experiment with inline directives |
/// Looks up the two-byte GBK sequence for a BMP code unit.
///
/// The encoder in this file calls it only for values that are outside
/// U+4E00..U+9FA6 (those go through `encode_hanzi`), are not U+E5E5 and,
/// when not in extended mode, are not U+20AC.
///
/// Returns `Some((lead, trail))` with both bytes widened to `usize` (the
/// caller narrows them with `as u8`), or `None` when no two-byte mapping is
/// found. The order of the checks matters: several early `return None`
/// statements deliberately cut the search short for ranges that have already
/// been fully handled.
300 | 0 | fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { |
301 | | // Try ideographic punctuation first as it's the most likely case. |
302 | | // Throwing in the check for full-width currencies and tilde is probably |
303 | | // more size-efficient here than elsewhere. |
304 | 0 | if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) { |
305 | 0 | if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) { |
306 | 0 | return Some((0xA1, pos + 0xA1)); |
307 | 0 | } |
308 | 0 | } |
309 | | // Ext A |
310 | 0 | if in_range16(bmp, 0x3400, 0x4E00) { |
// `pos` is relative to index 21 of GBK_BOTTOM; adding 16 gives the
// zero-based trail index, and the 0x40 / 0x41 choice skips trail byte
// 0x7F (same rule as in the decoder).
311 | 0 | return position(&GBK_BOTTOM[21..100], bmp).map(|pos| { |
312 | | ( |
313 | | 0xFE, |
314 | 0 | pos + if pos < (0x3F - 16) { |
315 | 0 | 0x40 + 16 |
316 | | } else { |
317 | 0 | 0x41 + 16 |
318 | | }, |
319 | | ) |
320 | 0 | }); |
321 | 0 | } |
322 | | // Compatibility ideographs |
323 | 0 | if in_range16(bmp, 0xF900, 0xFB00) { |
// The first 21 entries of GBK_BOTTOM straddle two lead bytes: five
// entries end the 0xFD row and the remaining ones start the 0xFE row.
324 | 0 | return position(&GBK_BOTTOM[0..21], bmp).map(|pos| { |
325 | 0 | if pos < 5 { |
326 | | // end of second to last row |
327 | 0 | (0xFD, pos + (190 - 94 - 5 + 0x41)) |
328 | | } else { |
329 | | // last row |
330 | 0 | (0xFE, pos + (0x40 - 5)) |
331 | | } |
332 | 0 | }); |
333 | 0 | } |
334 | | // Handle everything below U+02CA, which is in GBK_OTHER. |
335 | 0 | if bmp < 0x02CA { |
336 | 0 | if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 { |
337 | | // Pinyin except U+1E3F |
338 | 0 | if let Some(pos) = position(&GB2312_PINYIN[..], bmp) { |
339 | 0 | return Some((0xA8, pos + 0xA1)); |
340 | 0 | } |
341 | 0 | } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7) |
342 | 0 | || in_inclusive_range16(bmp, 0x02C7, 0x02C9) |
343 | | { |
344 | | // Diacritics and Latin 1 symbols |
// Only a sub-slice starting at index 3 is searched, hence the `+ 3`
// when turning the position back into a trail byte.
345 | 0 | if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) { |
346 | 0 | return Some((0xA1, pos + 0xA1 + 3)); |
347 | 0 | } |
348 | 0 | } |
349 | 0 | return None; |
350 | 0 | } |
351 | | |
352 | 0 | if in_inclusive_range16(bmp, 0xE78D, 0xE864) { |
353 | | // The array is sorted but short, so let's do linear search. |
// GB18030_2022_OVERRIDE_PUA and GB18030_2022_OVERRIDE_BYTES are indexed
// in parallel: the same position yields the lead/trail pair.
354 | 0 | if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) { |
355 | 0 | let pair = &GB18030_2022_OVERRIDE_BYTES[pos]; |
356 | 0 | return Some((pair[0].into(), pair[1].into())); |
357 | 0 | } |
358 | 0 | } else if bmp >= 0xFE17 { |
359 | | // Various brackets, all in full-width regions |
// 0x9F - 0x60 is 63, the same trail offset the decoder subtracts for
// this table.
360 | 0 | if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { |
361 | 0 | return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); |
362 | 0 | } |
363 | 0 | } else if bmp == 0x1E3F { |
364 | | // The one Pinyin placed elsewhere on the BMP |
365 | 0 | return Some((0xA8, 0x7B - 0x60 + 0xA1)); |
366 | 0 | } else if in_range16(bmp, 0xA000, 0xD800) { |
367 | | // Since Korean has usage in China, let's spend a branch to fast-track |
368 | | // Hangul. |
369 | 0 | return None; |
370 | 0 | } |
371 | | // GB2312 other (except bottom PUA and PUA between Hanzi levels). |
372 | 0 | if let Some(other_pointer) = gb2312_other_encode(bmp) { |
// Inverse of the decoder's `mul_94(first_minus_offset - 0x21) + trail`:
// 94 trail positions per lead, rows starting at lead 0xA2.
373 | 0 | let other_lead = other_pointer as usize / 94; |
374 | 0 | let other_trail = other_pointer as usize % 94; |
375 | 0 | return Some((0xA2 + other_lead, 0xA1 + other_trail)); |
376 | 0 | } |
377 | | // At this point, we've handled all mappable characters above U+02D9 but |
378 | | // below U+2010. Let's check for that range in order to let lower BMP |
379 | | // characters used for minority languages in China avoid the subsequent |
380 | | // search that deals mainly with various symbols. |
381 | 0 | if in_range16(bmp, 0x02DA, 0x2010) { |
382 | 0 | return None; |
383 | 0 | } |
384 | | // GBK other (except radicals and PUA in GBK_BOTTOM). |
385 | 0 | if let Some(other_pointer) = gbk_other_encode(bmp) { |
// 96 trail positions per lead in the left-hand GBK region; leads start
// at 0xA1 and trail byte 0x7F is skipped.
386 | 0 | let other_lead = other_pointer as usize / (190 - 94); |
387 | 0 | let other_trail = other_pointer as usize % (190 - 94); |
388 | 0 | let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; |
389 | 0 | return Some((other_lead + (0x81 + 0x20), other_trail + offset)); |
390 | 0 | } |
391 | | // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM |
392 | 0 | if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) |
393 | 0 | || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) |
394 | 0 | || in_inclusive_range16(bmp, 0xE816, 0xE855) |
395 | | { |
396 | 0 | if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { |
397 | 0 | let trail = pos + 16; |
398 | 0 | let offset = if trail < 0x3F { 0x40 } else { 0x41 }; |
399 | 0 | return Some((0xFE, trail + offset)); |
400 | 0 | } |
401 | 0 | } |
402 | | // GB2312 bottom PUA |
// Inverse of the decoder's "Bottom PUA" branch: U+E234..=U+E4C5 laid out
// in rows of 94 starting at lead 0xF8.
403 | 0 | let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234); |
404 | 0 | if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) { |
405 | 0 | let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94; |
406 | 0 | let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94; |
407 | 0 | return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail)); |
408 | 0 | } |
409 | | // PUA between Hanzi Levels |
// U+E810..=U+E814 map to lead 0xD7 with trails 0xFA..=0xFE.
410 | 0 | let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810); |
411 | 0 | if bmp_minus_pua_between_hanzi < 5 { |
412 | 0 | return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize)); |
413 | 0 | } |
414 | 0 | None |
415 | 0 | } |
416 | | |
/// Encodes a CJK Unified Ideograph as a GBK lead/trail byte pair (variant
/// compiled when the `fast-gb-hanzi-encode` feature is off).
///
/// The encoder calls this only for `bmp` in U+4E00..U+9FA6 and relies on
/// every such value being mapped, which is why there is no failure path.
/// The second parameter (the value minus 0x4E00) is ignored here; it exists
/// so that both variants of the function share one signature.
417 | | #[cfg(not(feature = "fast-gb-hanzi-encode"))] |
418 | | #[inline(always)] |
419 | 0 | fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) { |
// Lookups are tried in order: GB2312 level 1, GB2312 level 2, then the
// GBK-only ideograph regions.
420 | 0 | if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) { |
421 | 0 | (lead, trail) |
422 | 0 | } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) { |
// Level 2 rows hold 94 trails each and start at lead 0xD8.
423 | 0 | let hanzi_lead = (hanzi_pointer / 94) + (0xD8); |
424 | 0 | let hanzi_trail = (hanzi_pointer % 94) + 0xA1; |
425 | 0 | (hanzi_lead as u8, hanzi_trail as u8) |
426 | | } else { |
// Code points below U+72DC are looked up in the "top" table (190 trails
// per lead, leads from 0x81); the rest in the "left" table (96 trails
// per lead, leads from 0xAA).
427 | 0 | let (lead, gbk_trail) = if bmp < 0x72DC { |
428 | | // Above GB2312 |
429 | 0 | let pointer = gbk_top_ideograph_encode(bmp) as usize; |
430 | 0 | let lead = (pointer / 190) + 0x81; |
431 | 0 | let gbk_trail = pointer % 190; |
432 | 0 | (lead, gbk_trail) |
433 | | } else { |
434 | | // To the left of GB2312 |
435 | 0 | let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize; |
436 | 0 | let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29); |
437 | 0 | let gbk_trail = gbk_left_ideograph_pointer % (190 - 94); |
438 | 0 | (lead, gbk_trail) |
439 | | }; |
// Trail indices from 0x3F upward shift by one extra to skip byte 0x7F.
440 | 0 | let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 }; |
441 | 0 | (lead as u8, (gbk_trail + offset) as u8) |
442 | | } |
443 | 0 | } |
444 | | |
/// Encodes a CJK Unified Ideograph as a GBK lead/trail byte pair (variant
/// compiled when the `fast-gb-hanzi-encode` feature is on).
///
/// Ignores the raw code unit and passes `bmp_minus_unified_start` (the code
/// unit minus 0x4E00) straight to `gbk_hanzi_encode`, which is defined
/// outside this file.
445 | | #[cfg(feature = "fast-gb-hanzi-encode")] |
446 | | #[inline(always)] |
447 | | fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) { |
448 | | gbk_hanzi_encode(bmp_minus_unified_start) |
449 | | } |
450 | | |
/// Encoder shared by the gb18030 and GBK encodings.
///
/// `extended` selects the behaviour: when `true`, characters without a
/// two-byte mapping are written as four-byte sequences; when `false`, such
/// characters (and all astral characters) are reported as unmappable and
/// U+20AC is written as the single byte 0x80.
451 | | pub struct Gb18030Encoder { |
452 | | extended: bool, |
453 | | } |
454 | | |
455 | | impl Gb18030Encoder { |
/// Builds an `Encoder` for `encoding` backed by a `Gb18030Encoder` whose
/// `extended` flag is set to `extended_range`.
456 | 0 | pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder { |
457 | 0 | Encoder::new( |
458 | 0 | encoding, |
459 | 0 | VariantEncoder::Gb18030(Gb18030Encoder { |
460 | 0 | extended: extended_range, |
461 | 0 | }), |
462 | | ) |
463 | 0 | } |
464 | | |
/// Output-size bound in bytes for encoding `u16_length` UTF-16 code units
/// without replacement: four bytes per unit in extended mode, otherwise
/// two bytes per unit plus two. `None` signals arithmetic overflow
/// (`checked_add` is a crate helper defined outside this file).
465 | 0 | pub fn max_buffer_length_from_utf16_without_replacement( |
466 | 0 | &self, |
467 | 0 | u16_length: usize, |
468 | 0 | ) -> Option<usize> { |
469 | 0 | if self.extended { |
470 | 0 | u16_length.checked_mul(4) |
471 | | } else { |
472 | | // Need to add, because space check is done with the four-byte |
473 | | // assumption. |
474 | 0 | checked_add(2, u16_length.checked_mul(2)) |
475 | | } |
476 | 0 | } |
477 | | |
/// Output-size bound in bytes for encoding `byte_length` bytes of UTF-8
/// without replacement: twice the input plus two in extended mode,
/// otherwise the input length plus three.
478 | 0 | pub fn max_buffer_length_from_utf8_without_replacement( |
479 | 0 | &self, |
480 | 0 | byte_length: usize, |
481 | 0 | ) -> Option<usize> { |
482 | 0 | if self.extended { |
483 | | // 1 to 1 |
484 | | // 2 to 2 |
485 | | // 3 to 2 |
486 | | // 2 to 4 (worst) |
487 | | // 3 to 4 |
488 | | // 4 to 4 |
489 | 0 | checked_add(2, byte_length.checked_mul(2)) |
490 | | } else { |
491 | | // 1 to 1 |
492 | | // 2 to 2 |
493 | | // 3 to 2 |
494 | | // Need to add, because space check is done with the four-byte |
495 | | // assumption. |
496 | 0 | byte_length.checked_add(3) |
497 | | } |
498 | 0 | } |
499 | | |
// The macro below (defined elsewhere in the crate) receives two code
// blocks -- one for a non-ASCII BMP code unit, one for an astral code
// point -- followed by the identifiers they use and three further
// arguments whose meaning is defined by the macro itself.
500 | | ascii_compatible_encoder_functions!( |
// BMP block.
501 | | { |
// `wrapping_sub` lets one unsigned comparison test U+4E00..U+9FA6.
502 | | let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00); |
503 | | if bmp_minus_unified_start < (0x9FA6 - 0x4E00) { |
504 | | // CJK Unified Ideographs |
505 | | // Can't fail now, since all are |
506 | | // mapped. |
507 | | let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start); |
508 | | handle.write_two(lead, trail) |
509 | | } else if bmp == 0xE5E5 { |
510 | | // It's not optimal to check for the unmappable |
511 | | // and for euro at this stage, but getting |
512 | | // them out of the way makes the rest of the |
513 | | // code less messy. |
514 | | return ( |
515 | | EncoderResult::unmappable_from_bmp(bmp), |
516 | | source.consumed(), |
517 | | handle.written(), |
518 | | ); |
519 | | } else if bmp == 0x20AC && !self.extended { |
520 | | handle.write_one(0x80u8) |
521 | | } else { |
522 | | match gbk_encode_non_unified(bmp) { |
523 | | Some((lead, trail)) => handle.write_two(lead as u8, trail as u8), |
524 | | None => { |
525 | | if !self.extended { |
526 | | return ( |
527 | | EncoderResult::unmappable_from_bmp(bmp), |
528 | | source.consumed(), |
529 | | handle.written(), |
530 | | ); |
531 | | } |
// Split the range pointer into four digits with radices 126, 10, 126
// and 10 -- the inverse of the pointer computation in the decoder.
532 | | let range_pointer = gb18030_range_encode(bmp); |
533 | | let first = range_pointer / (10 * 126 * 10); |
534 | | let rem_first = range_pointer % (10 * 126 * 10); |
535 | | let second = rem_first / (10 * 126); |
536 | | let rem_second = rem_first % (10 * 126); |
537 | | let third = rem_second / 10; |
538 | | let fourth = rem_second % 10; |
539 | | handle.write_four( |
540 | | (first + 0x81) as u8, |
541 | | (second + 0x30) as u8, |
542 | | (third + 0x81) as u8, |
543 | | (fourth + 0x30) as u8, |
544 | | ) |
545 | | } |
546 | | } |
547 | | } |
548 | | }, |
// Astral block: unmappable unless extended; otherwise the code point is
// shifted so that U+10000 lands on pointer 189_000 and split into four
// bytes exactly as above.
549 | | { |
550 | | if !self.extended { |
551 | | return ( |
552 | | EncoderResult::Unmappable(astral), |
553 | | source.consumed(), |
554 | | handle.written(), |
555 | | ); |
556 | | } |
557 | | let range_pointer = astral as usize + (189_000usize - 0x1_0000usize); |
558 | | let first = range_pointer / (10 * 126 * 10); |
559 | | let rem_first = range_pointer % (10 * 126 * 10); |
560 | | let second = rem_first / (10 * 126); |
561 | | let rem_second = rem_first % (10 * 126); |
562 | | let third = rem_second / 10; |
563 | | let fourth = rem_second % 10; |
564 | | handle.write_four( |
565 | | (first + 0x81) as u8, |
566 | | (second + 0x30) as u8, |
567 | | (third + 0x81) as u8, |
568 | | (fourth + 0x30) as u8, |
569 | | ) |
570 | | }, |
571 | | bmp, |
572 | | astral, |
573 | | self, |
574 | | source, |
575 | | handle, |
576 | | copy_ascii_to_check_space_four, |
577 | | check_space_four, |
578 | | false |
579 | | ); |
580 | | } |
581 | | |
582 | | // Any copyright to the test code below this comment is dedicated to the |
583 | | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
584 | | |
585 | | #[cfg(all(test, feature = "alloc"))] |
586 | | mod tests { |
587 | | use super::super::testing::*; |
588 | | use super::super::*; |
589 | | |
590 | | fn decode_gb18030(bytes: &[u8], expect: &str) { |
591 | | decode(GB18030, bytes, expect); |
592 | | } |
593 | | |
594 | | fn encode_gb18030(string: &str, expect: &[u8]) { |
595 | | encode(GB18030, string, expect); |
596 | | } |
597 | | |
598 | | fn encode_gbk(string: &str, expect: &[u8]) { |
599 | | encode(GBK, string, expect); |
600 | | } |
601 | | |
602 | | #[test] |
603 | | fn test_gb18030_decode() { |
604 | | // Empty |
605 | | decode_gb18030(b"", &""); |
606 | | |
607 | | // ASCII |
608 | | decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}"); |
609 | | |
610 | | // euro |
611 | | decode_gb18030(b"\x80", "\u{20AC}"); |
612 | | decode_gb18030(b"\xA2\xE3", "\u{20AC}"); |
613 | | |
614 | | // two bytes |
615 | | decode_gb18030(b"\x81\x40", "\u{4E02}"); |
616 | | decode_gb18030(b"\x81\x7E", "\u{4E8A}"); |
617 | | decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}"); |
618 | | decode_gb18030(b"\x81\x80", "\u{4E90}"); |
619 | | decode_gb18030(b"\x81\xFE", "\u{4FA2}"); |
620 | | decode_gb18030(b"\xFE\x40", "\u{FA0C}"); |
621 | | decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}"); |
622 | | decode_gb18030(b"\xFE\x80", "\u{4723}"); |
623 | | decode_gb18030(b"\xFE\xFE", "\u{E4C5}"); |
624 | | |
625 | | // Changes between GB18030-2005 and GB18030-2022 |
626 | | decode_gb18030(b"\xFE\x7E", "\u{9FB9}"); |
627 | | decode_gb18030(b"\xA6\xDD", "\u{FE14}"); |
628 | | |
629 | | // These mappings remain in place the GB18030-2005 way despite GB18030-2022 |
630 | | decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}"); |
631 | | decode_gb18030(b"\x84\x31\x83\x30", "\u{FE14}"); |
632 | | |
633 | | // The difference from the original GB18030 |
634 | | decode_gb18030(b"\xA3\xA0", "\u{3000}"); |
635 | | decode_gb18030(b"\xA1\xA1", "\u{3000}"); |
636 | | |
637 | | // 0xFF |
638 | | decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}"); |
639 | | decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} ! |
640 | | decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} ! |
641 | | decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}"); |
642 | | decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}"); |
643 | | decode_gb18030( |
644 | | b"\xFF\x32\x9A\x33\x00", |
645 | | "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}", |
646 | | ); |
647 | | |
648 | | // Four bytes |
649 | | decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}"); |
650 | | decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}"); |
651 | | decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}"); |
652 | | decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}"); |
653 | | decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}"); |
654 | | decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}"); |
655 | | decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}"); |
656 | | decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} ! |
657 | | decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}"); |
658 | | } |
659 | | |
660 | | #[test] |
661 | | fn test_gb18030_encode() { |
662 | | // Empty |
663 | | encode_gb18030("", b""); |
664 | | |
665 | | // ASCII |
666 | | encode_gb18030("\u{0061}\u{0062}", b"\x61\x62"); |
667 | | |
668 | | // euro |
669 | | encode_gb18030("\u{20AC}", b"\xA2\xE3"); |
670 | | |
671 | | // two bytes |
672 | | encode_gb18030("\u{4E02}", b"\x81\x40"); |
673 | | encode_gb18030("\u{4E8A}", b"\x81\x7E"); |
674 | | if !cfg!(miri) { |
675 | | // Miri is too slow |
676 | | encode_gb18030("\u{4E90}", b"\x81\x80"); |
677 | | encode_gb18030("\u{4FA2}", b"\x81\xFE"); |
678 | | encode_gb18030("\u{FA0C}", b"\xFE\x40"); |
679 | | encode_gb18030("\u{E843}", b"\xFE\x7E"); |
680 | | encode_gb18030("\u{4723}", b"\xFE\x80"); |
681 | | encode_gb18030("\u{E4C5}", b"\xFE\xFE"); |
682 | | } |
683 | | |
684 | | // The difference from the original GB18030 |
685 | | encode_gb18030("\u{E5E5}", b""); |
686 | | encode_gb18030("\u{3000}", b"\xA1\xA1"); |
687 | | |
688 | | // Four bytes |
689 | | encode_gb18030("\u{0080}", b"\x81\x30\x81\x30"); |
690 | | encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37"); |
691 | | if !cfg!(miri) { |
692 | | // Miri is too slow |
693 | | encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30"); |
694 | | encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33"); |
695 | | encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35"); |
696 | | } |
697 | | |
698 | | // Edge cases |
699 | | encode_gb18030("\u{00F7}", b"\xA1\xC2"); |
700 | | |
701 | | // GB18030-2022 |
702 | | encode_gb18030("\u{9FB9}", b"\xFE\x7E"); |
703 | | encode_gb18030("\u{FE14}", b"\xA6\xDD"); |
704 | | encode_gb18030("\u{E843}", b"\xFE\x7E"); |
705 | | encode_gb18030("\u{E791}", b"\xA6\xDD"); |
706 | | |
707 | | // Non-change in GB18030-2022 |
708 | | encode_gb18030("\u{E817}", b"\xFE\x52"); |
709 | | } |
710 | | |
711 | | #[test] |
712 | | fn test_gbk_encode() { |
713 | | // Empty |
714 | | encode_gbk("", b""); |
715 | | |
716 | | // ASCII |
717 | | encode_gbk("\u{0061}\u{0062}", b"\x61\x62"); |
718 | | |
719 | | // euro |
720 | | encode_gbk("\u{20AC}", b"\x80"); |
721 | | |
722 | | // two bytes |
723 | | encode_gbk("\u{4E02}", b"\x81\x40"); |
724 | | encode_gbk("\u{4E8A}", b"\x81\x7E"); |
725 | | if !cfg!(miri) { |
726 | | // Miri is too slow |
727 | | encode_gbk("\u{4E90}", b"\x81\x80"); |
728 | | encode_gbk("\u{4FA2}", b"\x81\xFE"); |
729 | | encode_gbk("\u{FA0C}", b"\xFE\x40"); |
730 | | encode_gbk("\u{E843}", b"\xFE\x7E"); |
731 | | encode_gbk("\u{4723}", b"\xFE\x80"); |
732 | | encode_gbk("\u{E4C5}", b"\xFE\xFE"); |
733 | | } |
734 | | |
735 | | // The difference from the original gb18030 |
736 | | encode_gbk("\u{E5E5}", b""); |
737 | | encode_gbk("\u{3000}", b"\xA1\xA1"); |
738 | | |
739 | | // Four bytes |
740 | | encode_gbk("\u{0080}", b"€"); |
741 | | encode_gbk("\u{E7C7}", b""); |
742 | | if !cfg!(miri) { |
743 | | // Miri is too slow |
744 | | encode_gbk("\u{2603}", b"☃"); |
745 | | encode_gbk("\u{1F4A9}", b"💩"); |
746 | | encode_gbk("\u{10FFFF}", b""); |
747 | | } |
748 | | |
749 | | // Edge cases |
750 | | encode_gbk("\u{00F7}", b"\xA1\xC2"); |
751 | | |
752 | | // GB18030-2022 |
753 | | encode_gb18030("\u{9FB9}", b"\xFE\x7E"); |
754 | | encode_gb18030("\u{FE14}", b"\xA6\xDD"); |
755 | | encode_gb18030("\u{E843}", b"\xFE\x7E"); |
756 | | encode_gb18030("\u{E791}", b"\xA6\xDD"); |
757 | | |
758 | | // Non-change in GB18030-2022 |
759 | | encode_gb18030("\u{E817}", b"\xFE\x52"); |
760 | | } |
761 | | |
762 | | #[test] |
763 | | #[cfg_attr(miri, ignore)] // Miri is too slow |
764 | | fn test_gb18030_decode_all() { |
765 | | let input = include_bytes!("test_data/gb18030_in.txt"); |
766 | | let expectation = include_str!("test_data/gb18030_in_ref.txt"); |
767 | | let (cow, had_errors) = GB18030.decode_without_bom_handling(input); |
768 | | assert!(!had_errors, "Should not have had errors."); |
769 | | assert_eq!(&cow[..], expectation); |
770 | | } |
771 | | |
772 | | #[test] |
773 | | #[cfg_attr(miri, ignore)] // Miri is too slow |
774 | | fn test_gb18030_encode_all() { |
775 | | let input = include_str!("test_data/gb18030_out.txt"); |
776 | | let expectation = include_bytes!("test_data/gb18030_out_ref.txt"); |
777 | | let (cow, encoding, had_errors) = GB18030.encode(input); |
778 | | assert!(!had_errors, "Should not have had errors."); |
779 | | assert_eq!(encoding, GB18030); |
780 | | assert_eq!(&cow[..], &expectation[..]); |
781 | | } |
782 | | |
783 | | #[test] |
784 | | fn test_gb18030_encode_from_utf16_max_length() { |
785 | | let mut output = [0u8; 20]; |
786 | | let mut encoder = GB18030.new_encoder(); |
787 | | { |
788 | | let needed = encoder |
789 | | .max_buffer_length_from_utf16_without_replacement(1) |
790 | | .unwrap(); |
791 | | let (result, read, written) = encoder.encode_from_utf16_without_replacement( |
792 | | &[0x3000], |
793 | | &mut output[..needed], |
794 | | true, |
795 | | ); |
796 | | assert_eq!(result, EncoderResult::InputEmpty); |
797 | | assert_eq!(read, 1); |
798 | | assert_eq!(written, 2); |
799 | | assert_eq!(output[0], 0xA1); |
800 | | assert_eq!(output[1], 0xA1); |
801 | | } |
802 | | } |
803 | | } |