/rust/registry/src/index.crates.io-6f17d22bba15001f/encoding_rs-0.8.35/src/big5.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | use super::*; |
11 | | use crate::data::*; |
12 | | use crate::handles::*; |
13 | | use crate::variant::*; |
14 | | // Rust 1.14.0 requires the following despite the asterisk above. |
15 | | use super::in_inclusive_range32; |
16 | | |
17 | | pub struct Big5Decoder { |
18 | | lead: Option<u8>, |
19 | | } |
20 | | |
21 | | impl Big5Decoder { |
22 | 0 | pub fn new() -> VariantDecoder { |
23 | 0 | VariantDecoder::Big5(Big5Decoder { lead: None }) |
24 | 0 | } |
25 | | |
26 | 0 | pub fn in_neutral_state(&self) -> bool { |
27 | 0 | self.lead.is_none() |
28 | 0 | } |
29 | | |
30 | 0 | fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { |
31 | 0 | byte_length.checked_add(match self.lead { |
32 | 0 | None => 0, |
33 | 0 | Some(_) => 1, |
34 | | }) |
35 | 0 | } |
36 | | |
37 | 0 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
38 | 0 | // If there is a lead but the next byte isn't a valid trail, an |
39 | 0 | // error is generated for the lead (+1). Then another iteration checks |
40 | 0 | // space, which needs +1 to account for the possibility of astral |
41 | 0 | // output or combining pair. |
42 | 0 | checked_add(1, self.plus_one_if_lead(byte_length)) |
43 | 0 | } |
44 | | |
45 | 0 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
46 | 0 | // No need to account for REPLACEMENT CHARACTERS. |
47 | 0 | // Cases: |
48 | 0 | // ASCII: 1 to 1 |
49 | 0 | // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4 |
50 | 0 | // lead set and first byte is trail: 1 to 4 worst case |
51 | 0 | // |
52 | 0 | // When checking for space for the last byte: |
53 | 0 | // no lead: the last byte must be ASCII (or fatal error): 1 to 1 |
54 | 0 | // lead set: space for 4 bytes was already checked when reading the |
55 | 0 | // lead, hence the last lead and the last trail together are worst |
56 | 0 | // case 2 to 4. |
57 | 0 | // |
58 | 0 | // If lead set and the input is a single trail byte, the worst-case |
59 | 0 | // output is 4, so we need to add one before multiplying if lead is |
60 | 0 | // set. |
61 | 0 | // |
62 | 0 | // Finally, add two so that if input is non-zero, the output is at |
63 | 0 | // least 4. |
64 | 0 | checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length))) |
65 | 0 | } |
66 | | |
67 | 0 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
68 | 0 | // If there is a lead but the next byte isn't a valid trail, an |
69 | 0 | // error is generated for the lead (+(1*3)). Then another iteration |
70 | 0 | // checks space, which needs +3 to account for the possibility of astral |
71 | 0 | // output or combining pair. In between start and end, the worst case |
72 | 0 | // is that every byte is bad: *3. |
73 | 0 | checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length))) |
74 | 0 | } |
75 | | |
76 | | ascii_compatible_two_byte_decoder_functions!( |
77 | | { |
78 | | // If lead is between 0x81 and 0xFE, inclusive, |
79 | | // subtract offset 0x81. |
80 | | let non_ascii_minus_offset = |
81 | | non_ascii.wrapping_sub(0x81); |
82 | | if non_ascii_minus_offset > (0xFE - 0x81) { |
83 | | return (DecoderResult::Malformed(1, 0), |
84 | | source.consumed(), |
85 | | handle.written()); |
86 | | } |
87 | | non_ascii_minus_offset |
88 | | }, |
89 | | { |
90 | | // If trail is between 0x40 and 0x7E, inclusive, |
91 | | // subtract offset 0x40. Else if trail is |
92 | | // between 0xA1 and 0xFE, inclusive, subtract |
93 | | // offset 0x62. |
94 | | // TODO: Find out which range is more probable. |
95 | | let mut trail_minus_offset = |
96 | | byte.wrapping_sub(0x40); |
97 | | if trail_minus_offset > (0x7E - 0x40) { |
98 | | let trail_minus_range_start = |
99 | | byte.wrapping_sub(0xA1); |
100 | | if trail_minus_range_start > |
101 | | (0xFE - 0xA1) { |
102 | | if byte < 0x80 { |
103 | | return (DecoderResult::Malformed(1, 0), |
104 | | unread_handle_trail.unread(), |
105 | | handle.written()); |
106 | | } |
107 | | return (DecoderResult::Malformed(2, 0), |
108 | | unread_handle_trail.consumed(), |
109 | | handle.written()); |
110 | | } |
111 | | trail_minus_offset = byte - 0x62; |
112 | | } |
113 | | let pointer = lead_minus_offset as usize * |
114 | | 157usize + |
115 | | trail_minus_offset as usize; |
116 | | let rebased_pointer = pointer.wrapping_sub(942); |
117 | | let low_bits = big5_low_bits(rebased_pointer); |
118 | | if low_bits == 0 { |
119 | | match pointer { |
120 | | 1133 => { |
121 | | handle.write_big5_combination(0x00CAu16, |
122 | | 0x0304u16) |
123 | | } |
124 | | 1135 => { |
125 | | handle.write_big5_combination(0x00CAu16, |
126 | | 0x030Cu16) |
127 | | } |
128 | | 1164 => { |
129 | | handle.write_big5_combination(0x00EAu16, |
130 | | 0x0304u16) |
131 | | } |
132 | | 1166 => { |
133 | | handle.write_big5_combination(0x00EAu16, |
134 | | 0x030Cu16) |
135 | | } |
136 | | _ => { |
137 | | if byte < 0x80 { |
138 | | return (DecoderResult::Malformed(1, 0), |
139 | | unread_handle_trail.unread(), |
140 | | handle.written()); |
141 | | } |
142 | | return (DecoderResult::Malformed(2, 0), |
143 | | unread_handle_trail.consumed(), |
144 | | handle.written()); |
145 | | } |
146 | | } |
147 | | } else if big5_is_astral(rebased_pointer) { |
148 | | handle.write_astral(u32::from(low_bits) | |
149 | | 0x20000u32) |
150 | | } else { |
151 | | handle.write_bmp_excl_ascii(low_bits) |
152 | | } |
153 | | }, |
154 | | self, |
155 | | non_ascii, |
156 | | byte, |
157 | | lead_minus_offset, |
158 | | unread_handle_trail, |
159 | | source, |
160 | | handle, |
161 | | 'outermost, |
162 | | copy_ascii_from_check_space_astral, |
163 | | check_space_astral, |
164 | | false); |
165 | | } |
166 | | |
167 | | pub struct Big5Encoder; |
168 | | |
169 | | impl Big5Encoder { |
170 | 0 | pub fn new(encoding: &'static Encoding) -> Encoder { |
171 | 0 | Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder)) |
172 | 0 | } |
173 | | |
174 | 0 | pub fn max_buffer_length_from_utf16_without_replacement( |
175 | 0 | &self, |
176 | 0 | u16_length: usize, |
177 | 0 | ) -> Option<usize> { |
178 | 0 | // Astral: 2 to 2 |
179 | 0 | // ASCII: 1 to 1 |
180 | 0 | // Other: 1 to 2 |
181 | 0 | u16_length.checked_mul(2) |
182 | 0 | } |
183 | | |
184 | 0 | pub fn max_buffer_length_from_utf8_without_replacement( |
185 | 0 | &self, |
186 | 0 | byte_length: usize, |
187 | 0 | ) -> Option<usize> { |
188 | 0 | // Astral: 4 to 2 |
189 | 0 | // Upper BMP: 3 to 2 |
190 | 0 | // Lower BMP: 2 to 2 |
191 | 0 | // ASCII: 1 to 1 |
192 | 0 | byte_length.checked_add(1) |
193 | 0 | } |
194 | | |
195 | | ascii_compatible_encoder_functions!( |
196 | | { |
197 | | // For simplicity, unified ideographs |
198 | | // in the pointer range 11206...11212 are handled |
199 | | // as Level 1 Hanzi. |
200 | | if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) { |
201 | | handle.write_two(lead, trail) |
202 | | } else { |
203 | | let pointer = if let Some(pointer) = big5_box_encode(bmp) { |
204 | | pointer |
205 | | } else if let Some(pointer) = big5_other_encode(bmp) { |
206 | | pointer |
207 | | } else { |
208 | | return ( |
209 | | EncoderResult::unmappable_from_bmp(bmp), |
210 | | source.consumed(), |
211 | | handle.written(), |
212 | | ); |
213 | | }; |
214 | | let lead = pointer / 157 + 0x81; |
215 | | let remainder = pointer % 157; |
216 | | let trail = if remainder < 0x3F { |
217 | | remainder + 0x40 |
218 | | } else { |
219 | | remainder + 0x62 |
220 | | }; |
221 | | handle.write_two(lead as u8, trail as u8) |
222 | | } |
223 | | }, |
224 | | { |
225 | | if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) { |
226 | | if let Some(rebased_pointer) = big5_astral_encode(astral as u16) { |
227 | | // big5_astral_encode returns rebased pointer, |
228 | | // so adding 0x87 instead of 0x81. |
229 | | let lead = rebased_pointer / 157 + 0x87; |
230 | | let remainder = rebased_pointer % 157; |
231 | | let trail = if remainder < 0x3F { |
232 | | remainder + 0x40 |
233 | | } else { |
234 | | remainder + 0x62 |
235 | | }; |
236 | | handle.write_two(lead as u8, trail as u8) |
237 | | } else { |
238 | | return ( |
239 | | EncoderResult::Unmappable(astral), |
240 | | source.consumed(), |
241 | | handle.written(), |
242 | | ); |
243 | | } |
244 | | } else { |
245 | | return ( |
246 | | EncoderResult::Unmappable(astral), |
247 | | source.consumed(), |
248 | | handle.written(), |
249 | | ); |
250 | | } |
251 | | }, |
252 | | bmp, |
253 | | astral, |
254 | | self, |
255 | | source, |
256 | | handle, |
257 | | copy_ascii_to_check_space_two, |
258 | | check_space_two, |
259 | | false |
260 | | ); |
261 | | } |
262 | | |
263 | | // Any copyright to the test code below this comment is dedicated to the |
264 | | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
265 | | |
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` to Big5 via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    /// Spot checks of the decoder: empty input, ASCII, boundary pointers,
    /// the four base-plus-combining-mark sequences, and malformed input.
    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", "");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], "\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], "\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], "\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], "\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            "\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            "\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            "\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            "\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            "\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            "\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            "\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            "\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}");
    }

    /// Spot checks of the encoder: empty input, ASCII, boundary mappings,
    /// unmappable characters, and duplicate-mapping tie-breaks.
    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            //
            // The expected bytes for unmappable characters are ASCII decimal
            // numeric character references. They must be spelled out: a
            // byte-string literal cannot hold the raw non-ASCII character.
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    /// Decodes the bundled Big5 input file and compares the result with
    /// the reference text; the input is expected to contain errors.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    /// Encodes the bundled text file and compares the result with the
    /// reference bytes; no errors are expected and the output encoding
    /// must stay Big5.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// Feeds two unpaired low surrogates to the encoder. The expected
    /// output is two numeric character references for U+FFFD (65533),
    /// one per surrogate.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}