/rust/registry/src/index.crates.io-6f17d22bba15001f/encoding_rs-0.8.35/src/mem.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | | // file at the top-level directory of this distribution. |
3 | | // |
4 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | | // option. This file may not be copied, modified, or distributed |
8 | | // except according to those terms. |
9 | | |
10 | | //! Functions for converting between different in-RAM representations of text |
11 | | //! and for quickly checking if the Unicode Bidirectional Algorithm can be |
12 | | //! avoided. |
13 | | //! |
14 | | //! By using slices for output, the functions here seek to enable by-register |
15 | | //! (ALU register or SIMD register as available) operations in order to |
16 | | //! outperform iterator-based conversions available in the Rust standard |
17 | | //! library. |
18 | | //! |
19 | | //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to |
20 | | //! U+00FF, inclusive, and does not refer to the windows-1252 range. This |
21 | | //! in-memory encoding is sometimes used as a storage optimization of text |
22 | | //! when UTF-16 indexing and length semantics are exposed. |
23 | | //! |
24 | | //! The FFI bindings for this module are in the |
25 | | //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem). |
26 | | |
27 | | #[cfg(feature = "alloc")] |
28 | | use alloc::borrow::Cow; |
29 | | #[cfg(feature = "alloc")] |
30 | | use alloc::string::String; |
31 | | #[cfg(feature = "alloc")] |
32 | | use alloc::vec::Vec; |
33 | | |
34 | | use super::in_inclusive_range16; |
35 | | use super::in_inclusive_range32; |
36 | | use super::in_inclusive_range8; |
37 | | use super::in_range16; |
38 | | use super::in_range32; |
39 | | use super::DecoderResult; |
40 | | use crate::ascii::*; |
41 | | use crate::utf_8::*; |
42 | | |
43 | | macro_rules! non_fuzz_debug_assert { |
44 | | ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); }) |
45 | | } |
46 | | |
47 | | cfg_if! { |
48 | | if #[cfg(feature = "simd-accel")] { |
49 | | use ::core::intrinsics::likely; |
50 | | use ::core::intrinsics::unlikely; |
51 | | } else { |
52 | | #[inline(always)] |
53 | 0 | fn likely(b: bool) -> bool { |
54 | 0 | b |
55 | 0 | } |
56 | | #[inline(always)] |
57 | 0 | fn unlikely(b: bool) -> bool { |
58 | 0 | b |
59 | 0 | } |
60 | | } |
61 | | } |
62 | | |
63 | | /// Classification of text as Latin1 (all code points are below U+0100), |
64 | | /// left-to-right with some non-Latin1 characters or as containing at least |
65 | | /// some right-to-left characters. |
66 | | #[must_use] |
67 | | #[derive(Debug, PartialEq, Eq)] |
68 | | #[repr(C)] |
69 | | pub enum Latin1Bidi { |
70 | | /// Every character is below U+0100. |
71 | | Latin1 = 0, |
72 | | /// There is at least one character that's U+0100 or higher, but there |
73 | | /// are no right-to-left characters. |
74 | | LeftToRight = 1, |
75 | | /// There is at least one right-to-left character. |
76 | | Bidi = 2, |
77 | | } |
78 | | |
79 | | // `as` truncates, so works on 32-bit, too. |
80 | | #[allow(dead_code)] |
81 | | const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize; |
82 | | |
83 | | #[allow(unused_macros)] |
84 | | macro_rules! by_unit_check_alu { |
85 | | ($name:ident, $unit:ty, $bound:expr, $mask:ident) => { |
86 | | #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))] |
87 | | #[inline(always)] |
88 | 15.6k | fn $name(buffer: &[$unit]) -> bool { |
89 | 15.6k | let mut offset = 0usize; |
90 | 15.6k | let mut accu = 0usize; |
91 | 15.6k | let unit_size = ::core::mem::size_of::<$unit>(); |
92 | 15.6k | let len = buffer.len(); |
93 | 15.6k | if len >= ALU_ALIGNMENT / unit_size { |
94 | | // The most common reason to return `false` is for the first code |
95 | | // unit to fail the test, so check that first. |
96 | 15.6k | if buffer[0] >= $bound { |
97 | 245 | return false; |
98 | 15.4k | } |
99 | 15.4k | let src = buffer.as_ptr(); |
100 | 15.4k | let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) |
101 | 15.4k | & ALU_ALIGNMENT_MASK) |
102 | 15.4k | / unit_size; |
103 | 15.4k | if until_alignment + ALU_ALIGNMENT / unit_size <= len { |
104 | 15.4k | if until_alignment != 0 { |
105 | 545 | accu |= buffer[offset] as usize; |
106 | 545 | offset += 1; |
107 | 545 | until_alignment -= 1; |
108 | 1.14k | while until_alignment != 0 { |
109 | 603 | accu |= buffer[offset] as usize; |
110 | 603 | offset += 1; |
111 | 603 | until_alignment -= 1; |
112 | 603 | } |
113 | 545 | if accu >= $bound { |
114 | 14 | return false; |
115 | 531 | } |
116 | 14.9k | } |
117 | 15.4k | let len_minus_stride = len - ALU_ALIGNMENT / unit_size; |
118 | 15.4k | if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len { |
119 | | // Safety: the above check lets us perform 4 consecutive reads of |
120 | | // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size |
121 | | // is the size of the unit that `src` points to, so this is equal to performing four usize reads. |
122 | | // |
123 | | // This invariant is upheld on all loop iterations |
124 | 15.3k | let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size)); |
125 | | loop { |
126 | 7.74M | let unroll_accu = unsafe { *(src.add(offset) as *const usize) } |
127 | 7.74M | | unsafe { |
128 | 7.74M | *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize) |
129 | 7.74M | } |
130 | 7.74M | | unsafe { |
131 | 7.74M | *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size))) |
132 | 7.74M | as *const usize) |
133 | 7.74M | } |
134 | 7.74M | | unsafe { |
135 | 7.74M | *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size))) |
136 | 7.74M | as *const usize) |
137 | 7.74M | }; |
138 | 7.74M | if unroll_accu & $mask != 0 { |
139 | 4.89k | return false; |
140 | 7.73M | } |
141 | 7.73M | offset += 4 * (ALU_ALIGNMENT / unit_size); |
142 | 7.73M | // Safety: this check lets us continue to perform the 4 reads earlier |
143 | 7.73M | if offset > len_minus_unroll { |
144 | 10.4k | break; |
145 | 7.72M | } |
146 | | } |
147 | 81 | } |
148 | 22.7k | while offset <= len_minus_stride { |
149 | 12.2k | // Safety: the above check lets us perform one usize read. |
150 | 12.2k | accu |= unsafe { *(src.add(offset) as *const usize) }; |
151 | 12.2k | offset += ALU_ALIGNMENT / unit_size; |
152 | 12.2k | } |
153 | 0 | } |
154 | 0 | } |
155 | 14.6k | for &unit in &buffer[offset..] { |
156 | 14.6k | accu |= unit as usize; |
157 | 14.6k | } |
158 | 10.5k | accu & $mask == 0 |
159 | 15.6k | } Unexecuted instantiation: encoding_rs::mem::is_ascii_impl Unexecuted instantiation: encoding_rs::mem::is_basic_latin_impl encoding_rs::mem::is_utf16_latin1_impl Line | Count | Source | 88 | 15.6k | fn $name(buffer: &[$unit]) -> bool { | 89 | 15.6k | let mut offset = 0usize; | 90 | 15.6k | let mut accu = 0usize; | 91 | 15.6k | let unit_size = ::core::mem::size_of::<$unit>(); | 92 | 15.6k | let len = buffer.len(); | 93 | 15.6k | if len >= ALU_ALIGNMENT / unit_size { | 94 | | // The most common reason to return `false` is for the first code | 95 | | // unit to fail the test, so check that first. | 96 | 15.6k | if buffer[0] >= $bound { | 97 | 245 | return false; | 98 | 15.4k | } | 99 | 15.4k | let src = buffer.as_ptr(); | 100 | 15.4k | let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) | 101 | 15.4k | & ALU_ALIGNMENT_MASK) | 102 | 15.4k | / unit_size; | 103 | 15.4k | if until_alignment + ALU_ALIGNMENT / unit_size <= len { | 104 | 15.4k | if until_alignment != 0 { | 105 | 545 | accu |= buffer[offset] as usize; | 106 | 545 | offset += 1; | 107 | 545 | until_alignment -= 1; | 108 | 1.14k | while until_alignment != 0 { | 109 | 603 | accu |= buffer[offset] as usize; | 110 | 603 | offset += 1; | 111 | 603 | until_alignment -= 1; | 112 | 603 | } | 113 | 545 | if accu >= $bound { | 114 | 14 | return false; | 115 | 531 | } | 116 | 14.9k | } | 117 | 15.4k | let len_minus_stride = len - ALU_ALIGNMENT / unit_size; | 118 | 15.4k | if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len { | 119 | | // Safety: the above check lets us perform 4 consecutive reads of | 120 | | // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size | 121 | | // is the size of the `src` pointer, so this is equal to performing four usize reads. 
| 122 | | // | 123 | | // This invariant is upheld on all loop iterations | 124 | 15.3k | let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size)); | 125 | | loop { | 126 | 7.74M | let unroll_accu = unsafe { *(src.add(offset) as *const usize) } | 127 | 7.74M | | unsafe { | 128 | 7.74M | *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize) | 129 | 7.74M | } | 130 | 7.74M | | unsafe { | 131 | 7.74M | *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size))) | 132 | 7.74M | as *const usize) | 133 | 7.74M | } | 134 | 7.74M | | unsafe { | 135 | 7.74M | *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size))) | 136 | 7.74M | as *const usize) | 137 | 7.74M | }; | 138 | 7.74M | if unroll_accu & $mask != 0 { | 139 | 4.89k | return false; | 140 | 7.73M | } | 141 | 7.73M | offset += 4 * (ALU_ALIGNMENT / unit_size); | 142 | 7.73M | // Safety: this check lets us continue to perform the 4 reads earlier | 143 | 7.73M | if offset > len_minus_unroll { | 144 | 10.4k | break; | 145 | 7.72M | } | 146 | | } | 147 | 81 | } | 148 | 22.7k | while offset <= len_minus_stride { | 149 | 12.2k | // Safety: the above check lets us perform one usize read. | 150 | 12.2k | accu |= unsafe { *(src.add(offset) as *const usize) }; | 151 | 12.2k | offset += ALU_ALIGNMENT / unit_size; | 152 | 12.2k | } | 153 | 0 | } | 154 | 0 | } | 155 | 14.6k | for &unit in &buffer[offset..] { | 156 | 14.6k | accu |= unit as usize; | 157 | 14.6k | } | 158 | 10.5k | accu & $mask == 0 | 159 | 15.6k | } |
|
160 | | }; |
161 | | } |
162 | | |
163 | | #[allow(unused_macros)] |
164 | | macro_rules! by_unit_check_simd { |
165 | | ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => { |
166 | | #[inline(always)] |
167 | | fn $name(buffer: &[$unit]) -> bool { |
168 | | let mut offset = 0usize; |
169 | | let mut accu = 0usize; |
170 | | let unit_size = ::core::mem::size_of::<$unit>(); |
171 | | let len = buffer.len(); |
172 | | if len >= SIMD_STRIDE_SIZE / unit_size { |
173 | | // The most common reason to return `false` is for the first code |
174 | | // unit to fail the test, so check that first. |
175 | | if buffer[0] >= $bound { |
176 | | return false; |
177 | | } |
178 | | let src = buffer.as_ptr(); |
179 | | let mut until_alignment = ((SIMD_ALIGNMENT |
180 | | - ((src as usize) & SIMD_ALIGNMENT_MASK)) |
181 | | & SIMD_ALIGNMENT_MASK) |
182 | | / unit_size; |
183 | | if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len { |
184 | | if until_alignment != 0 { |
185 | | accu |= buffer[offset] as usize; |
186 | | offset += 1; |
187 | | until_alignment -= 1; |
188 | | while until_alignment != 0 { |
189 | | accu |= buffer[offset] as usize; |
190 | | offset += 1; |
191 | | until_alignment -= 1; |
192 | | } |
193 | | if accu >= $bound { |
194 | | return false; |
195 | | } |
196 | | } |
197 | | let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; |
198 | | if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len { |
199 | | // Safety: the above check lets us perform 4 consecutive reads of |
200 | | // length SIMD_STRIDE_SIZE / unit_size. SIMD_STRIDE_SIZE is the size of $simd_ty, and unit_size |
201 | | // is the size of the unit that `src` points to, so this is equal to performing four $simd_ty reads. |
202 | | // |
203 | | // This invariant is upheld on all loop iterations |
204 | | let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size)); |
205 | | loop { |
206 | | let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) } |
207 | | | unsafe { |
208 | | *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size)) |
209 | | as *const $simd_ty) |
210 | | } |
211 | | | unsafe { |
212 | | *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size))) |
213 | | as *const $simd_ty) |
214 | | } |
215 | | | unsafe { |
216 | | *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size))) |
217 | | as *const $simd_ty) |
218 | | }; |
219 | | if !$func(unroll_accu) { |
220 | | return false; |
221 | | } |
222 | | offset += 4 * (SIMD_STRIDE_SIZE / unit_size); |
223 | | // Safety: this check lets us continue to perform the 4 reads earlier |
224 | | if offset > len_minus_unroll { |
225 | | break; |
226 | | } |
227 | | } |
228 | | } |
229 | | let mut simd_accu = $splat; |
230 | | while offset <= len_minus_stride { |
231 | | // Safety: the above check lets us perform one $simd_ty read. |
232 | | simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) }; |
233 | | offset += SIMD_STRIDE_SIZE / unit_size; |
234 | | } |
235 | | if !$func(simd_accu) { |
236 | | return false; |
237 | | } |
238 | | } |
239 | | } |
240 | | for &unit in &buffer[offset..] { |
241 | | accu |= unit as usize; |
242 | | } |
243 | | accu < $bound |
244 | | } |
245 | | }; |
246 | | } |
247 | | |
248 | | cfg_if! { |
249 | | if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
250 | | use crate::simd_funcs::*; |
251 | | use core::simd::u8x16; |
252 | | use core::simd::u16x8; |
253 | | |
254 | | const SIMD_ALIGNMENT: usize = 16; |
255 | | |
256 | | const SIMD_ALIGNMENT_MASK: usize = 15; |
257 | | |
258 | | by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii); |
259 | | by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin); |
260 | | by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1); |
261 | | |
262 | | #[inline(always)] |
263 | | fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { |
264 | | // This function is a mess, because it simultaneously tries to do |
265 | | // only aligned SIMD (perhaps misguidedly) and needs to deal with |
266 | | // the last code unit in a SIMD stride being part of a valid |
267 | | // surrogate pair. |
268 | | let unit_size = ::core::mem::size_of::<u16>(); |
269 | | let src = buffer.as_ptr(); |
270 | | let len = buffer.len(); |
271 | | let mut offset = 0usize; |
272 | | 'outer: loop { |
273 | | let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) & |
274 | | SIMD_ALIGNMENT_MASK) / unit_size; |
275 | | if until_alignment == 0 { |
276 | | if offset + SIMD_STRIDE_SIZE / unit_size > len { |
277 | | break; |
278 | | } |
279 | | } else { |
280 | | let offset_plus_until_alignment = offset + until_alignment; |
281 | | let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1; |
282 | | if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len { |
283 | | break; |
284 | | } |
285 | | let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]); |
286 | | if up_to < until_alignment { |
287 | | return offset + up_to; |
288 | | } |
289 | | if last_valid_low { |
290 | | offset = offset_plus_until_alignment_plus_one; |
291 | | continue; |
292 | | } |
293 | | offset = offset_plus_until_alignment; |
294 | | } |
295 | | let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; |
296 | | loop { |
297 | | let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size; |
298 | | if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) { |
299 | | if offset_plus_stride == len { |
300 | | break 'outer; |
301 | | } |
302 | | let offset_plus_stride_plus_one = offset_plus_stride + 1; |
303 | | let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]); |
304 | | if up_to < SIMD_STRIDE_SIZE / unit_size { |
305 | | return offset + up_to; |
306 | | } |
307 | | if last_valid_low { |
308 | | offset = offset_plus_stride_plus_one; |
309 | | continue 'outer; |
310 | | } |
311 | | } |
312 | | offset = offset_plus_stride; |
313 | | if offset > len_minus_stride { |
314 | | break 'outer; |
315 | | } |
316 | | } |
317 | | } |
318 | | let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]); |
319 | | offset + up_to |
320 | | } |
321 | | } else { |
322 | | by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK); |
323 | | by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK); |
324 | | by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK); |
325 | | |
326 | | #[inline(always)] |
327 | 0 | fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize { |
328 | 0 | let (up_to, _) = utf16_valid_up_to_alu(buffer); |
329 | 0 | up_to |
330 | 0 | } |
331 | | } |
332 | | } |
333 | | |
334 | | /// The second return value is true iff the last code unit of the slice was |
335 | | /// reached and turned out to be a low surrogate that is part of a valid pair. |
336 | | #[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))] |
337 | | #[inline(always)] |
338 | 0 | fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) { |
339 | 0 | let len = buffer.len(); |
340 | 0 | if len == 0 { |
341 | 0 | return (0, false); |
342 | 0 | } |
343 | 0 | let mut offset = 0usize; |
344 | | loop { |
345 | 0 | let unit = buffer[offset]; |
346 | 0 | let next = offset + 1; |
347 | 0 | let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); |
348 | 0 | if unit_minus_surrogate_start > (0xDFFF - 0xD800) { |
349 | | // Not a surrogate |
350 | 0 | offset = next; |
351 | 0 | if offset == len { |
352 | 0 | return (offset, false); |
353 | 0 | } |
354 | 0 | continue; |
355 | 0 | } |
356 | 0 | if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { |
357 | | // high surrogate |
358 | 0 | if next < len { |
359 | 0 | let second = buffer[next]; |
360 | 0 | let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); |
361 | 0 | if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { |
362 | | // The next code unit is a low surrogate. Advance position. |
363 | 0 | offset = next + 1; |
364 | 0 | if offset == len { |
365 | 0 | return (offset, true); |
366 | 0 | } |
367 | 0 | continue; |
368 | 0 | } |
369 | | // The next code unit is not a low surrogate. Don't advance |
370 | | // position and treat the high surrogate as unpaired. |
371 | | // fall through |
372 | 0 | } |
373 | | // Unpaired, fall through |
374 | 0 | } |
375 | | // Unpaired surrogate |
376 | 0 | return (offset, false); |
377 | | } |
378 | 0 | } |
379 | | |
380 | | cfg_if! { |
381 | | if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
382 | | #[inline(always)] |
383 | | fn is_str_latin1_impl(buffer: &str) -> Option<usize> { |
384 | | let mut offset = 0usize; |
385 | | let bytes = buffer.as_bytes(); |
386 | | let len = bytes.len(); |
387 | | if len >= SIMD_STRIDE_SIZE { |
388 | | let src = bytes.as_ptr(); |
389 | | let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
390 | | SIMD_ALIGNMENT_MASK; |
391 | | if until_alignment + SIMD_STRIDE_SIZE <= len { |
392 | | while until_alignment != 0 { |
393 | | if bytes[offset] > 0xC3 { |
394 | | return Some(offset); |
395 | | } |
396 | | offset += 1; |
397 | | until_alignment -= 1; |
398 | | } |
399 | | let len_minus_stride = len - SIMD_STRIDE_SIZE; |
400 | | loop { |
401 | | if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) { |
402 | | // TODO: Ensure this compiles away when inlined into `is_str_latin1()`. |
403 | | while bytes[offset] & 0xC0 == 0x80 { |
404 | | offset += 1; |
405 | | } |
406 | | return Some(offset); |
407 | | } |
408 | | offset += SIMD_STRIDE_SIZE; |
409 | | if offset > len_minus_stride { |
410 | | break; |
411 | | } |
412 | | } |
413 | | } |
414 | | } |
415 | | for i in offset..len { |
416 | | if bytes[i] > 0xC3 { |
417 | | return Some(i); |
418 | | } |
419 | | } |
420 | | None |
421 | | } |
422 | | } else { |
423 | | #[inline(always)] |
424 | 0 | fn is_str_latin1_impl(buffer: &str) -> Option<usize> { |
425 | 0 | let mut bytes = buffer.as_bytes(); |
426 | 0 | let mut total = 0; |
427 | | loop { |
428 | 0 | if let Some((byte, offset)) = validate_ascii(bytes) { |
429 | 0 | total += offset; |
430 | 0 | if byte > 0xC3 { |
431 | 0 | return Some(total); |
432 | 0 | } |
433 | 0 | bytes = &bytes[offset + 2..]; |
434 | 0 | total += 2; |
435 | | } else { |
436 | 0 | return None; |
437 | | } |
438 | | } |
439 | 0 | } |
440 | | } |
441 | | } |
442 | | |
443 | | #[inline(always)] |
444 | 0 | fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> { |
445 | 0 | let mut bytes = buffer; |
446 | 0 | let mut total = 0; |
447 | | loop { |
448 | 0 | if let Some((byte, offset)) = validate_ascii(bytes) { |
449 | 0 | total += offset; |
450 | 0 | if in_inclusive_range8(byte, 0xC2, 0xC3) { |
451 | 0 | let next = offset + 1; |
452 | 0 | if next == bytes.len() { |
453 | 0 | return Some(total); |
454 | 0 | } |
455 | 0 | if bytes[next] & 0xC0 != 0x80 { |
456 | 0 | return Some(total); |
457 | 0 | } |
458 | 0 | bytes = &bytes[offset + 2..]; |
459 | 0 | total += 2; |
460 | | } else { |
461 | 0 | return Some(total); |
462 | | } |
463 | | } else { |
464 | 0 | return None; |
465 | | } |
466 | | } |
467 | 0 | } |
468 | | |
469 | | cfg_if! { |
470 | | if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
471 | | #[inline(always)] |
472 | | fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { |
473 | | let mut offset = 0usize; |
474 | | let len = buffer.len(); |
475 | | if len >= SIMD_STRIDE_SIZE / 2 { |
476 | | let src = buffer.as_ptr(); |
477 | | let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
478 | | SIMD_ALIGNMENT_MASK) / 2; |
479 | | if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { |
480 | | while until_alignment != 0 { |
481 | | if is_utf16_code_unit_bidi(buffer[offset]) { |
482 | | return true; |
483 | | } |
484 | | offset += 1; |
485 | | until_alignment -= 1; |
486 | | } |
487 | | let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); |
488 | | loop { |
489 | | if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) { |
490 | | return true; |
491 | | } |
492 | | offset += SIMD_STRIDE_SIZE / 2; |
493 | | if offset > len_minus_stride { |
494 | | break; |
495 | | } |
496 | | } |
497 | | } |
498 | | } |
499 | | for &u in &buffer[offset..] { |
500 | | if is_utf16_code_unit_bidi(u) { |
501 | | return true; |
502 | | } |
503 | | } |
504 | | false |
505 | | } |
506 | | } else { |
507 | | #[inline(always)] |
508 | 0 | fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { |
509 | 0 | for &u in buffer { |
510 | 0 | if is_utf16_code_unit_bidi(u) { |
511 | 0 | return true; |
512 | 0 | } |
513 | | } |
514 | 0 | false |
515 | 0 | } |
516 | | } |
517 | | } |
518 | | |
519 | | cfg_if! { |
520 | | if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { |
521 | | #[inline(always)] |
522 | | fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { |
523 | | let mut offset = 0usize; |
524 | | let len = buffer.len(); |
525 | | if len >= SIMD_STRIDE_SIZE / 2 { |
526 | | let src = buffer.as_ptr(); |
527 | | let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
528 | | SIMD_ALIGNMENT_MASK) / 2; |
529 | | if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { |
530 | | while until_alignment != 0 { |
531 | | if buffer[offset] > 0xFF { |
532 | | // This transition isn't optimal, since the alignment is recomputed, |
533 | | // but not tweaking further today. |
534 | | if is_utf16_bidi_impl(&buffer[offset..]) { |
535 | | return Latin1Bidi::Bidi; |
536 | | } |
537 | | return Latin1Bidi::LeftToRight; |
538 | | } |
539 | | offset += 1; |
540 | | until_alignment -= 1; |
541 | | } |
542 | | let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); |
543 | | loop { |
544 | | let mut s = unsafe { *(src.add(offset) as *const u16x8) }; |
545 | | if !simd_is_latin1(s) { |
546 | | loop { |
547 | | if is_u16x8_bidi(s) { |
548 | | return Latin1Bidi::Bidi; |
549 | | } |
550 | | offset += SIMD_STRIDE_SIZE / 2; |
551 | | if offset > len_minus_stride { |
552 | | for &u in &buffer[offset..] { |
553 | | if is_utf16_code_unit_bidi(u) { |
554 | | return Latin1Bidi::Bidi; |
555 | | } |
556 | | } |
557 | | return Latin1Bidi::LeftToRight; |
558 | | } |
559 | | s = unsafe { *(src.add(offset) as *const u16x8) }; |
560 | | } |
561 | | } |
562 | | offset += SIMD_STRIDE_SIZE / 2; |
563 | | if offset > len_minus_stride { |
564 | | break; |
565 | | } |
566 | | } |
567 | | } |
568 | | } |
569 | | let mut iter = (&buffer[offset..]).iter(); |
570 | | loop { |
571 | | if let Some(&u) = iter.next() { |
572 | | if u > 0xFF { |
573 | | let mut inner_u = u; |
574 | | loop { |
575 | | if is_utf16_code_unit_bidi(inner_u) { |
576 | | return Latin1Bidi::Bidi; |
577 | | } |
578 | | if let Some(&code_unit) = iter.next() { |
579 | | inner_u = code_unit; |
580 | | } else { |
581 | | return Latin1Bidi::LeftToRight; |
582 | | } |
583 | | } |
584 | | } |
585 | | } else { |
586 | | return Latin1Bidi::Latin1; |
587 | | } |
588 | | } |
589 | | } |
590 | | } else { |
591 | | #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))] |
592 | | #[inline(always)] |
593 | 0 | fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi { |
594 | 0 | let mut offset = 0usize; |
595 | 0 | let len = buffer.len(); |
596 | 0 | if len >= ALU_ALIGNMENT / 2 { |
597 | 0 | let src = buffer.as_ptr(); |
598 | 0 | let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & |
599 | 0 | ALU_ALIGNMENT_MASK) / 2; |
600 | 0 | if until_alignment + ALU_ALIGNMENT / 2 <= len { |
601 | 0 | while until_alignment != 0 { |
602 | 0 | if buffer[offset] > 0xFF { |
603 | 0 | if is_utf16_bidi_impl(&buffer[offset..]) { |
604 | 0 | return Latin1Bidi::Bidi; |
605 | 0 | } |
606 | 0 | return Latin1Bidi::LeftToRight; |
607 | 0 | } |
608 | 0 | offset += 1; |
609 | 0 | until_alignment -= 1; |
610 | | } |
611 | 0 | let len_minus_stride = len - ALU_ALIGNMENT / 2; |
612 | | loop { |
613 | 0 | if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 { |
614 | 0 | if is_utf16_bidi_impl(&buffer[offset..]) { |
615 | 0 | return Latin1Bidi::Bidi; |
616 | 0 | } |
617 | 0 | return Latin1Bidi::LeftToRight; |
618 | 0 | } |
619 | 0 | offset += ALU_ALIGNMENT / 2; |
620 | 0 | if offset > len_minus_stride { |
621 | 0 | break; |
622 | 0 | } |
623 | | } |
624 | 0 | } |
625 | 0 | } |
626 | 0 | let mut iter = (&buffer[offset..]).iter(); |
627 | | loop { |
628 | 0 | if let Some(&u) = iter.next() { |
629 | 0 | if u > 0xFF { |
630 | 0 | let mut inner_u = u; |
631 | | loop { |
632 | 0 | if is_utf16_code_unit_bidi(inner_u) { |
633 | 0 | return Latin1Bidi::Bidi; |
634 | 0 | } |
635 | 0 | if let Some(&code_unit) = iter.next() { |
636 | 0 | inner_u = code_unit; |
637 | 0 | } else { |
638 | 0 | return Latin1Bidi::LeftToRight; |
639 | | } |
640 | | } |
641 | 0 | } |
642 | | } else { |
643 | 0 | return Latin1Bidi::Latin1; |
644 | | } |
645 | | } |
646 | 0 | } |
647 | | } |
648 | | } |
649 | | |
650 | | /// Checks whether the buffer is all-ASCII. |
651 | | /// |
652 | | /// May read the entire buffer even if it isn't all-ASCII. (I.e. the function |
653 | | /// is not guaranteed to fail fast.) |
654 | 0 | pub fn is_ascii(buffer: &[u8]) -> bool { |
655 | 0 | is_ascii_impl(buffer) |
656 | 0 | } |
657 | | |
658 | | /// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing |
659 | | /// only ASCII characters). |
660 | | /// |
661 | | /// May read the entire buffer even if it isn't all-ASCII. (I.e. the function |
662 | | /// is not guaranteed to fail fast.) |
663 | 0 | pub fn is_basic_latin(buffer: &[u16]) -> bool { |
664 | 0 | is_basic_latin_impl(buffer) |
665 | 0 | } |
666 | | |
667 | | /// Checks whether the buffer is valid UTF-8 representing only code points |
668 | | /// less than or equal to U+00FF. |
669 | | /// |
670 | | /// Fails fast. (I.e. returns before having read the whole buffer if UTF-8 |
671 | | /// invalidity or code points above U+00FF are discovered.) |
672 | 0 | pub fn is_utf8_latin1(buffer: &[u8]) -> bool { |
673 | 0 | is_utf8_latin1_impl(buffer).is_none() |
674 | 0 | } |
675 | | |
676 | | /// Checks whether the buffer represents only code points less than or equal |
677 | | /// to U+00FF. |
678 | | /// |
679 | | /// Fails fast. (I.e. returns before having read the whole buffer if code |
680 | | /// points above U+00FF are discovered.) |
681 | 0 | pub fn is_str_latin1(buffer: &str) -> bool { |
682 | 0 | is_str_latin1_impl(buffer).is_none() |
683 | 0 | } |
684 | | |
685 | | /// Checks whether the buffer represents only code points less than or equal |
686 | | /// to U+00FF. |
687 | | /// |
688 | | /// May read the entire buffer even if it isn't all-Latin1. (I.e. the function |
689 | | /// is not guaranteed to fail fast.) |
690 | 15.6k | pub fn is_utf16_latin1(buffer: &[u16]) -> bool { |
691 | 15.6k | is_utf16_latin1_impl(buffer) |
692 | 15.6k | } |
693 | | |
694 | | /// Checks whether a potentially-invalid UTF-8 buffer contains code points |
695 | | /// that trigger right-to-left processing. |
696 | | /// |
697 | | /// The check is done on a Unicode block basis without regard to assigned |
698 | | /// vs. unassigned code points in the block. Hebrew presentation forms in |
699 | | /// the Alphabetic Presentation Forms block are treated as if they formed |
700 | | /// a block on their own (i.e. it is treated as right-to-left). Additionally, |
701 | | /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
702 | | /// for. Control characters that are technically bidi controls but do not |
703 | | /// cause right-to-left behavior without the presence of right-to-left |
704 | | /// characters or right-to-left controls are not checked for. As a special |
705 | | /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
706 | | /// |
707 | | /// Returns `true` if the input is invalid UTF-8 or the input contains an |
708 | | /// RTL character. Returns `false` if the input is valid UTF-8 and contains |
709 | | /// no RTL characters. |
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // Single pass that validates UTF-8 and looks for right-to-left code
    // points at the same time. Every early `return true` below means
    // "malformed UTF-8 or RTL content"; `false` is returned only once the
    // whole buffer has been consumed without hitting either.
    //
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // UTF-8 encodings of the code points that bound the ranges tested
    // below (code point: bytes):
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut src = buffer;
    'outer: loop {
        // NOTE(review): `validate_ascii` is defined elsewhere. From its use
        // here it presumably yields the first non-ASCII byte together with
        // its offset, or `None` when everything left is ASCII -- confirm.
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= src.len() {
                // Invariant of this loop: `read + 4 <= src.len()`. It holds on
                // entry and is re-established at the bottom before every
                // `continue 'inner`, which is what keeps the unchecked reads of
                // `read + 1` through `read + 3` in bounds.
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            // These leads encode U+0080..=U+057F, which is below
                            // the first RTL block, so only validity is checked.
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            // D6 90 is U+0590, the start of the RTL range.
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        // (those leads, and the invalid leads 0x80..=0xC1, end
                        // up in the catch-all arm at the bottom)
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Combined validity test for both trail bytes.
                            // `third >> 6` is 2 exactly when `third` is in
                            // 0x80..=0xBF. NOTE(review): `UTF8_DATA` is defined
                            // elsewhere; the masked lookup presumably yields
                            // extra bits when `second` is not allowed after
                            // this lead -- confirm against the table. The
                            // unchecked lookup relies on the table being long
                            // enough for `lead + 0x80`.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same combined validity test as in the arm above.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // The four RIGHT-TO-LEFT controls: U+200F, U+202B,
                            // U+202E (E2 80 xx) and U+2067 (E2 81 A7).
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same combined validity test as above.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // EF AC 9D (U+FB1D) through EF B7 BF (U+FDFF).
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // EF B9 B0 (U+FE70) through EF BB BE (U+FEFE);
                                // EF BB BF (U+FEFF) is deliberately let through.
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same combined validity test as above.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            // E0 A4 80 is U+0900; anything lower is at most
                            // U+08FF and therefore inside the RTL range.
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same combined validity test as above; this lead
                            // has no RTL code points, so nothing else is checked.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            // Four-byte variant of the combined test: 0x002
                            // comes from `third` being in 0x80..=0xBF and 0x200
                            // from `fourth & 0xC0` being 0x80; the masked table
                            // lookup must not contribute any other bit.
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            // Same four-byte combined test as in the arm above.
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            // F0 90 A0 80 (U+10800) through F0 90 BF BF
                            // (U+10FFF) and F0 9E A0 80 (U+1E800) through
                            // F0 9E BF BF (U+1EFFF) are the astral RTL ranges.
                            if unlikely(second == 0x90 || second == 0x9E) {
                                // Reads the same byte as `third` above, this
                                // time with a bounds-checked index.
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    // Re-establish the loop invariant, or hand the final one
                    // to three bytes over to the tail handling below.
                    if read + 4 > src.len() {
                        if read == src.len() {
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            // Every multi-byte arm below first checks that the whole sequence
            // fits (`new_read > src.len()` means it was truncated, which is
            // reported as `true`) before doing unchecked reads.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                //
                // The three-byte arms below fall out of the `match` to the
                // `return false` after it: fewer than four bytes were left, so
                // a complete three-byte sequence necessarily ends the buffer.
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    // (a 4-byte lead cannot be complete with fewer than four
                    // bytes left)
                    return true;
                }
            }
            return false;
        } else {
            // Nothing but ASCII was left, so there is neither an error nor RTL.
            return false;
        }
    }
}
1127 | | |
/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` as soon as such a code point is found and `false` once
/// the whole input has been scanned without finding one.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // UTF-8 encodings of the code points that bound the ranges tested
    // below (code point: bytes):
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    //
    // The input is a `&str`, so it is already valid UTF-8: the lead byte
    // alone determines the sequence length and no trail byte is validated
    // here. Trail bytes are read with ordinary (bounds-checked) indexing.
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consist entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        //
        // NOTE(review): `validate_ascii` is defined elsewhere. From its use
        // here it presumably yields the first non-ASCII byte together with
        // its offset, or `None` when everything left is ASCII -- confirm.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unlikely(byte >= 0xD6) {
                            if byte == 0xD6 {
                                // D6 90 is U+0590, the start of the RTL range.
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                // Leads 0xD7 and above encode only code
                                // points inside the RTL range.
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Within 0xE0..=0xEF the condition below leaves exactly
                    // the leads 0xE0, 0xE2 and 0xEF, the only three-byte
                    // leads that can start an RTL code point.
                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            // E0 A4 80 is U+0900; anything lower is at most
                            // U+08FF and therefore inside the RTL range.
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            // The four RIGHT-TO-LEFT controls: U+200F,
                            // U+202B, U+202E and U+2067.
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // EF AC 9D (U+FB1D) through EF B7 BF (U+FDFF).
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // EF B9 B0 (U+FE70) through EF BB BE (U+FEFE);
                                // EF BB BF (U+FEFF) is deliberately let through.
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    // F0 90 A0 80 (U+10800) through F0 90 BF BF (U+10FFF) and
                    // F0 9E A0 80 (U+1E800) through F0 9E BF BF (U+1EFFF).
                    let second = bytes[read + 1];
                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            // Nothing but ASCII was left.
            return false;
        }
    }
}
1283 | | |
1284 | | /// Checks whether a UTF-16 buffer contains code points that trigger |
1285 | | /// right-to-left processing. |
1286 | | /// |
1287 | | /// The check is done on a Unicode block basis without regard to assigned |
1288 | | /// vs. unassigned code points in the block. Hebrew presentation forms in |
1289 | | /// the Alphabetic Presentation Forms block are treated as if they formed |
1290 | | /// a block on their own (i.e. it treated as right-to-left). Additionally, |
1291 | | /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
1292 | | /// for. Control characters that are technically bidi controls but do not |
1293 | | /// cause right-to-left behavior without the presence of right-to-left |
1294 | | /// characters or right-to-left controls are not checked for. As a special |
1295 | | /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
1296 | | /// |
1297 | | /// Returns `true` if the input contains an RTL character or an unpaired |
1298 | | /// high surrogate that could be the high half of an RTL character. |
1299 | | /// Returns `false` if the input contains neither RTL characters nor |
1300 | | /// unpaired high surrogates that could be higher halves of RTL characters. |
1301 | 0 | pub fn is_utf16_bidi(buffer: &[u16]) -> bool { |
1302 | 0 | is_utf16_bidi_impl(buffer) |
1303 | 0 | } |
1304 | | |
1305 | | /// Checks whether a scalar value triggers right-to-left processing. |
1306 | | /// |
1307 | | /// The check is done on a Unicode block basis without regard to assigned |
1308 | | /// vs. unassigned code points in the block. Hebrew presentation forms in |
1309 | | /// the Alphabetic Presentation Forms block are treated as if they formed |
1310 | | /// a block on their own (i.e. it treated as right-to-left). Additionally, |
1311 | | /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
1312 | | /// for. Control characters that are technically bidi controls but do not |
1313 | | /// cause right-to-left behavior without the presence of right-to-left |
1314 | | /// characters or right-to-left controls are not checked for. As a special |
1315 | | /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
1316 | | #[inline(always)] |
1317 | 0 | pub fn is_char_bidi(c: char) -> bool { |
1318 | 0 | // Controls: |
1319 | 0 | // Every control with RIGHT-TO-LEFT in its name in |
1320 | 0 | // https://www.unicode.org/charts/PDF/U2000.pdf |
1321 | 0 | // U+200F RLM |
1322 | 0 | // U+202B RLE |
1323 | 0 | // U+202E RLO |
1324 | 0 | // U+2067 RLI |
1325 | 0 | // |
1326 | 0 | // BMP RTL: |
1327 | 0 | // https://www.unicode.org/roadmaps/bmp/ |
1328 | 0 | // U+0590...U+08FF |
1329 | 0 | // U+FB1D...U+FDFF Hebrew presentation forms and |
1330 | 0 | // Arabic Presentation Forms A |
1331 | 0 | // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM) |
1332 | 0 | // |
1333 | 0 | // Supplementary RTL: |
1334 | 0 | // https://www.unicode.org/roadmaps/smp/ |
1335 | 0 | // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803) |
1336 | 0 | // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B) |
1337 | 0 | let code_point = u32::from(c); |
1338 | 0 | if code_point < 0x0590 { |
1339 | | // Below Hebrew |
1340 | 0 | return false; |
1341 | 0 | } |
1342 | 0 | if in_range32(code_point, 0x0900, 0xFB1D) { |
1343 | | // Above Arabic Extended-A and below Hebrew presentation forms |
1344 | 0 | if in_inclusive_range32(code_point, 0x200F, 0x2067) { |
1345 | | // In the range that contains the RTL controls |
1346 | 0 | return code_point == 0x200F |
1347 | 0 | || code_point == 0x202B |
1348 | 0 | || code_point == 0x202E |
1349 | 0 | || code_point == 0x2067; |
1350 | 0 | } |
1351 | 0 | return false; |
1352 | 0 | } |
1353 | 0 | if code_point > 0x1EFFF { |
1354 | | // Above second astral RTL. (Emoji is here.) |
1355 | 0 | return false; |
1356 | 0 | } |
1357 | 0 | if in_range32(code_point, 0x11000, 0x1E800) { |
1358 | | // Between astral RTL blocks |
1359 | 0 | return false; |
1360 | 0 | } |
1361 | 0 | if in_range32(code_point, 0xFEFF, 0x10800) { |
1362 | | // Above Arabic Presentations Forms B (excl. BOM) and below first |
1363 | | // astral RTL |
1364 | 0 | return false; |
1365 | 0 | } |
1366 | 0 | if in_range32(code_point, 0xFE00, 0xFE70) { |
1367 | | // Between Arabic Presentations Forms |
1368 | 0 | return false; |
1369 | 0 | } |
1370 | 0 | true |
1371 | 0 | } |
1372 | | |
1373 | | /// Checks whether a UTF-16 code unit triggers right-to-left processing. |
1374 | | /// |
1375 | | /// The check is done on a Unicode block basis without regard to assigned |
1376 | | /// vs. unassigned code points in the block. Hebrew presentation forms in |
1377 | | /// the Alphabetic Presentation Forms block are treated as if they formed |
1378 | | /// a block on their own (i.e. it treated as right-to-left). Additionally, |
1379 | | /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
1380 | | /// for. Control characters that are technically bidi controls but do not |
1381 | | /// cause right-to-left behavior without the presence of right-to-left |
1382 | | /// characters or right-to-left controls are not checked for. As a special |
1383 | | /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
1384 | | /// |
1385 | | /// Since supplementary-plane right-to-left blocks are identifiable from the |
1386 | | /// high surrogate without examining the low surrogate, this function returns |
1387 | | /// `true` for such high surrogates making the function suitable for handling |
1388 | | /// supplementary-plane text without decoding surrogate pairs to scalar |
1389 | | /// values. Obviously, such high surrogates are then reported as right-to-left |
1390 | | /// even if actually unpaired. |
1391 | | #[inline(always)] |
1392 | 0 | pub fn is_utf16_code_unit_bidi(u: u16) -> bool { |
1393 | 0 | if u < 0x0590 { |
1394 | | // Below Hebrew |
1395 | 0 | return false; |
1396 | 0 | } |
1397 | 0 | if in_range16(u, 0x0900, 0xD802) { |
1398 | | // Above Arabic Extended-A and below first RTL surrogate |
1399 | 0 | if in_inclusive_range16(u, 0x200F, 0x2067) { |
1400 | | // In the range that contains the RTL controls |
1401 | 0 | return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067; |
1402 | 0 | } |
1403 | 0 | return false; |
1404 | 0 | } |
1405 | 0 | if in_range16(u, 0xD83C, 0xFB1D) { |
1406 | | // Between astral RTL high surrogates and Hebrew presentation forms |
1407 | | // (Emoji is here) |
1408 | 0 | return false; |
1409 | 0 | } |
1410 | 0 | if in_range16(u, 0xD804, 0xD83A) { |
1411 | | // Between RTL high surragates |
1412 | 0 | return false; |
1413 | 0 | } |
1414 | 0 | if u > 0xFEFE { |
1415 | | // Above Arabic Presentation Forms (excl. BOM) |
1416 | 0 | return false; |
1417 | 0 | } |
1418 | 0 | if in_range16(u, 0xFE00, 0xFE70) { |
1419 | | // Between Arabic Presentations Forms |
1420 | 0 | return false; |
1421 | 0 | } |
1422 | 0 | true |
1423 | 0 | } |
1424 | | |
1425 | | /// Checks whether a potentially invalid UTF-8 buffer contains code points |
1426 | | /// that trigger right-to-left processing or is all-Latin1. |
1427 | | /// |
1428 | | /// Possibly more efficient than performing the checks separately. |
1429 | | /// |
1430 | | /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. |
1431 | | /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return |
1432 | | /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
1433 | 0 | pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi { |
1434 | 0 | if let Some(offset) = is_utf8_latin1_impl(buffer) { |
1435 | 0 | if is_utf8_bidi(&buffer[offset..]) { |
1436 | 0 | Latin1Bidi::Bidi |
1437 | | } else { |
1438 | 0 | Latin1Bidi::LeftToRight |
1439 | | } |
1440 | | } else { |
1441 | 0 | Latin1Bidi::Latin1 |
1442 | | } |
1443 | 0 | } |
1444 | | |
1445 | | /// Checks whether a valid UTF-8 buffer contains code points |
1446 | | /// that trigger right-to-left processing or is all-Latin1. |
1447 | | /// |
1448 | | /// Possibly more efficient than performing the checks separately. |
1449 | | /// |
1450 | | /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. |
1451 | | /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return |
1452 | | /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
1453 | 0 | pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi { |
1454 | | // The transition from the latin1 check to the bidi check isn't |
1455 | | // optimal but not tweaking it to perfection today. |
1456 | 0 | if let Some(offset) = is_str_latin1_impl(buffer) { |
1457 | 0 | if is_str_bidi(&buffer[offset..]) { |
1458 | 0 | Latin1Bidi::Bidi |
1459 | | } else { |
1460 | 0 | Latin1Bidi::LeftToRight |
1461 | | } |
1462 | | } else { |
1463 | 0 | Latin1Bidi::Latin1 |
1464 | | } |
1465 | 0 | } |
1466 | | |
1467 | | /// Checks whether a potentially invalid UTF-16 buffer contains code points |
1468 | | /// that trigger right-to-left processing or is all-Latin1. |
1469 | | /// |
1470 | | /// Possibly more efficient than performing the checks separately. |
1471 | | /// |
1472 | | /// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`. |
1473 | | /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return |
1474 | | /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
1475 | 0 | pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi { |
1476 | 0 | check_utf16_for_latin1_and_bidi_impl(buffer) |
1477 | 0 | } |
1478 | | |
1479 | | /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced |
1480 | | /// with the REPLACEMENT CHARACTER. |
1481 | | /// |
1482 | | /// The length of the destination buffer must be at least the length of the |
1483 | | /// source buffer _plus one_. |
1484 | | /// |
1485 | | /// Returns the number of `u16`s written. |
1486 | | /// |
1487 | | /// # Panics |
1488 | | /// |
1489 | | /// Panics if the destination buffer is shorter than stated above. |
1490 | 0 | pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize { |
1491 | 0 | // TODO: Can the requirement for dst to be at least one unit longer |
1492 | 0 | // be eliminated? |
1493 | 0 | assert!(dst.len() > src.len()); |
1494 | 0 | let mut decoder = Utf8Decoder::new_inner(); |
1495 | 0 | let mut total_read = 0usize; |
1496 | 0 | let mut total_written = 0usize; |
1497 | | loop { |
1498 | 0 | let (result, read, written) = |
1499 | 0 | decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true); |
1500 | 0 | total_read += read; |
1501 | 0 | total_written += written; |
1502 | 0 | match result { |
1503 | | DecoderResult::InputEmpty => { |
1504 | 0 | return total_written; |
1505 | | } |
1506 | | DecoderResult::OutputFull => { |
1507 | 0 | unreachable!("The assert at the top of the function should have caught this."); |
1508 | | } |
1509 | 0 | DecoderResult::Malformed(_, _) => { |
1510 | 0 | // There should always be space for the U+FFFD, because |
1511 | 0 | // otherwise we'd have gotten OutputFull already. |
1512 | 0 | dst[total_written] = 0xFFFD; |
1513 | 0 | total_written += 1; |
1514 | 0 | } |
1515 | | } |
1516 | | } |
1517 | 0 | } |
1518 | | |
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // Invariant relied on by every unchecked write below: `written <= read`.
    // A 1-, 2- or 3-byte sequence produces one code unit and a 4-byte
    // sequence produces two, so output never outpaces input; together with
    // the assertion above this keeps `written` (and `written + 1` in the
    // four-byte case) inside `dst`.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Fast path for a run of ASCII.
        // NOTE(review): `ascii_to_basic_latin` is defined elsewhere. From its
        // use here it presumably widens leading ASCII into `dst` and reports
        // the first non-ASCII byte together with the number of units handled
        // before it, or `None` when all `length` units were ASCII -- confirm.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // `dst_remaining` holds at least `length` units because
            // `dst.len() >= src.len()` and `written <= read`.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            // The input is a `&str` and thus valid UTF-8, so the lead byte
            // alone determines the sequence length and the trail bytes read
            // without bounds checks below are known to exist.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    // Five payload bits from the lead, six from the trail.
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid trashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                // Four payload bits from the lead, six from each trail.
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // Surrogate pair. `0xD7C0 + (point >> 10)` is the same as
                // `0xD800 + ((point - 0x10000) >> 10)` because
                // `0x10000 >> 10` is 0x40; the low ten bits are unaffected by
                // subtracting 0x10000.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1615 | | |
1616 | | /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. |
1617 | | /// |
1618 | | /// The length of the destination buffer must be at least the length of the |
1619 | | /// source buffer. |
1620 | | /// |
1621 | | /// Returns the number of `u16`s written or `None` if the input was invalid. |
1622 | | /// |
1623 | | /// When the input was invalid, some output may have been written. |
1624 | | /// |
1625 | | /// # Panics |
1626 | | /// |
1627 | | /// Panics if the destination buffer is shorter than stated above. |
1628 | 0 | pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> { |
1629 | 0 | assert!( |
1630 | 0 | dst.len() >= src.len(), |
1631 | 0 | "Destination must not be shorter than the source." |
1632 | 0 | ); |
1633 | 0 | let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst); |
1634 | 0 | if read == src.len() { |
1635 | 0 | return Some(written); |
1636 | 0 | } |
1637 | 0 | None |
1638 | 0 | } |
1639 | | |
1640 | | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1641 | | /// with the REPLACEMENT CHARACTER with potentially insufficient output |
1642 | | /// space. |
1643 | | /// |
1644 | | /// Returns the number of code units read and the number of bytes written. |
1645 | | /// |
1646 | | /// Guarantees that the bytes in the destination beyond the number of |
1647 | | /// bytes claimed as written by the second item of the return tuple |
1648 | | /// are left unmodified. |
1649 | | /// |
1650 | | /// Not all code units are read if there isn't enough output space. |
1651 | | /// |
1652 | | /// Note that this method isn't designed for general streamability but for |
1653 | | /// not allocating memory for the worst case up front. Specifically, |
1654 | | /// if the input starts with or ends with an unpaired surrogate, those are |
1655 | | /// replaced with the REPLACEMENT CHARACTER. |
1656 | | /// |
1657 | | /// Matches the semantics of `TextEncoder.encodeInto()` from the |
1658 | | /// Encoding Standard. |
1659 | | /// |
1660 | | /// # Safety |
1661 | | /// |
1662 | | /// If you want to convert into a `&mut str`, use |
1663 | | /// `convert_utf16_to_str_partial()` instead of using this function |
1664 | | /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. |
1665 | | #[inline(always)] |
1666 | 0 | pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) { |
1667 | 0 | // The two functions called below are marked `inline(never)` to make |
1668 | 0 | // transitions from the hot part (first function) into the cold part |
1669 | 0 | // (second function) go through a return and another call to discourage |
1670 | 0 | // the CPU from speculating from the hot code into the cold code. |
1671 | 0 | // Letting the transitions be mere intra-function jumps, even to |
1672 | 0 | // basic blocks out-of-lined to the end of the function would wipe |
1673 | 0 | // away a quarter of Arabic encode performance on Haswell! |
1674 | 0 | let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst); |
1675 | 0 | if likely(read == src.len()) { |
1676 | 0 | return (read, written); |
1677 | 0 | } |
1678 | 0 | let (tail_read, tail_written) = |
1679 | 0 | convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]); |
1680 | 0 | (read + tail_read, written + tail_written) |
1681 | 0 | } |
1682 | | |
1683 | | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1684 | | /// with the REPLACEMENT CHARACTER. |
1685 | | /// |
1686 | | /// The length of the destination buffer must be at least the length of the |
1687 | | /// source buffer times three. |
1688 | | /// |
1689 | | /// Returns the number of bytes written. |
1690 | | /// |
1691 | | /// # Panics |
1692 | | /// |
1693 | | /// Panics if the destination buffer is shorter than stated above. |
1694 | | /// |
1695 | | /// # Safety |
1696 | | /// |
1697 | | /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()` |
1698 | | /// instead of using this function together with the `unsafe` method |
1699 | | /// `as_bytes_mut()` on `&mut str`. |
1700 | | #[inline(always)] |
1701 | 0 | pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize { |
1702 | 0 | assert!(dst.len() >= src.len() * 3); |
1703 | 0 | let (read, written) = convert_utf16_to_utf8_partial(src, dst); |
1704 | 0 | debug_assert_eq!(read, src.len()); |
1705 | 0 | written |
1706 | 0 | } |
1707 | | |
1708 | | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1709 | | /// with the REPLACEMENT CHARACTER such that the validity of the output is |
1710 | | /// signaled using the Rust type system with potentially insufficient output |
1711 | | /// space. |
1712 | | /// |
1713 | | /// Returns the number of code units read and the number of bytes written. |
1714 | | /// |
1715 | | /// Not all code units are read if there isn't enough output space. |
1716 | | /// |
1717 | | /// Note that this method isn't designed for general streamability but for |
1718 | | /// not allocating memory for the worst case up front. Specifically, |
1719 | | /// if the input starts with or ends with an unpaired surrogate, those are |
1720 | | /// replaced with the REPLACEMENT CHARACTER. |
1721 | 0 | pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) { |
1722 | 0 | let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; |
1723 | 0 | let (read, written) = convert_utf16_to_utf8_partial(src, bytes); |
1724 | 0 | let len = bytes.len(); |
1725 | 0 | let mut trail = written; |
1726 | 0 | while trail < len && ((bytes[trail] & 0xC0) == 0x80) { |
1727 | 0 | bytes[trail] = 0; |
1728 | 0 | trail += 1; |
1729 | 0 | } |
1730 | 0 | (read, written) |
1731 | 0 | } |
1732 | | |
1733 | | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1734 | | /// with the REPLACEMENT CHARACTER such that the validity of the output is |
1735 | | /// signaled using the Rust type system. |
1736 | | /// |
1737 | | /// The length of the destination buffer must be at least the length of the |
1738 | | /// source buffer times three. |
1739 | | /// |
1740 | | /// Returns the number of bytes written. |
1741 | | /// |
1742 | | /// # Panics |
1743 | | /// |
1744 | | /// Panics if the destination buffer is shorter than stated above. |
1745 | | #[inline(always)] |
1746 | 0 | pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize { |
1747 | 0 | assert!(dst.len() >= src.len() * 3); |
1748 | 0 | let (read, written) = convert_utf16_to_str_partial(src, dst); |
1749 | 0 | debug_assert_eq!(read, src.len()); |
1750 | 0 | written |
1751 | 0 | } |
1752 | | |
1753 | | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1754 | | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16. |
1755 | | /// |
1756 | | /// The length of the destination buffer must be at least the length of the |
1757 | | /// source buffer. |
1758 | | /// |
1759 | | /// The number of `u16`s written equals the length of the source buffer. |
1760 | | /// |
1761 | | /// # Panics |
1762 | | /// |
1763 | | /// Panics if the destination buffer is shorter than stated above. |
1764 | 864k | pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) { |
1765 | 864k | assert!( |
1766 | 864k | dst.len() >= src.len(), |
1767 | 864k | "Destination must not be shorter than the source." |
1768 | 864k | ); |
1769 | | // TODO: On aarch64, the safe version autovectorizes to the same unpacking |
1770 | | // instructions as this code, and yet the autovectorized version is |
1771 | | // faster. |
1772 | 864k | unsafe { |
1773 | 864k | unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); |
1774 | 864k | } |
1775 | 864k | } |
1776 | | |
1777 | | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1778 | | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient |
1779 | | /// output space. |
1780 | | /// |
1781 | | /// Returns the number of bytes read and the number of bytes written. |
1782 | | /// |
1783 | | /// If the output isn't large enough, not all input is consumed. |
1784 | | /// |
1785 | | /// # Safety |
1786 | | /// |
1787 | | /// If you want to convert into a `&mut str`, use |
1788 | | /// `convert_latin1_to_str_partial()` instead of using this function |
1789 | | /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. |
1790 | 254k | pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) { |
1791 | 254k | let src_len = src.len(); |
1792 | 254k | let src_ptr = src.as_ptr(); |
1793 | 254k | let dst_ptr = dst.as_mut_ptr(); |
1794 | 254k | let dst_len = dst.len(); |
1795 | 254k | let mut total_read = 0usize; |
1796 | 254k | let mut total_written = 0usize; |
1797 | | loop { |
1798 | | // src can't advance more than dst |
1799 | 254k | let src_left = src_len - total_read; |
1800 | 254k | let dst_left = dst_len - total_written; |
1801 | 254k | let min_left = ::core::cmp::min(src_left, dst_left); |
1802 | 254k | if let Some((non_ascii, consumed)) = unsafe { |
1803 | 254k | ascii_to_ascii( |
1804 | 254k | src_ptr.add(total_read), |
1805 | 254k | dst_ptr.add(total_written), |
1806 | 254k | min_left, |
1807 | 254k | ) |
1808 | 254k | } { |
1809 | 0 | total_read += consumed; |
1810 | 0 | total_written += consumed; |
1811 | 0 | if total_written.checked_add(2).unwrap() > dst_len { |
1812 | 0 | return (total_read, total_written); |
1813 | 0 | } |
1814 | 0 |
|
1815 | 0 | total_read += 1; // consume `non_ascii` |
1816 | 0 |
|
1817 | 0 | dst[total_written] = (non_ascii >> 6) | 0xC0; |
1818 | 0 | total_written += 1; |
1819 | 0 | dst[total_written] = (non_ascii & 0x3F) | 0x80; |
1820 | 0 | total_written += 1; |
1821 | 0 | continue; |
1822 | 254k | } |
1823 | 254k | return (total_read + min_left, total_written + min_left); |
1824 | | } |
1825 | 254k | } |
1826 | | |
1827 | | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1828 | | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. |
1829 | | /// |
1830 | | /// The length of the destination buffer must be at least the length of the |
1831 | | /// source buffer times two. |
1832 | | /// |
1833 | | /// Returns the number of bytes written. |
1834 | | /// |
1835 | | /// # Panics |
1836 | | /// |
1837 | | /// Panics if the destination buffer is shorter than stated above. |
1838 | | /// |
1839 | | /// # Safety |
1840 | | /// |
1841 | | /// Note that this function may write garbage beyond the number of bytes |
1842 | | /// indicated by the return value, so using a `&mut str` interpreted as |
1843 | | /// `&mut [u8]` as the destination is not safe. If you want to convert into |
1844 | | /// a `&mut str`, use `convert_latin1_to_str()` instead of this function. |
1845 | | #[inline] |
1846 | 0 | pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { |
1847 | 0 | assert!( |
1848 | 0 | dst.len() >= src.len() * 2, |
1849 | 0 | "Destination must not be shorter than the source times two." |
1850 | 0 | ); |
1851 | 0 | let (read, written) = convert_latin1_to_utf8_partial(src, dst); |
1852 | 0 | debug_assert_eq!(read, src.len()); |
1853 | 0 | written |
1854 | 0 | } Unexecuted instantiation: encoding_rs::mem::convert_latin1_to_utf8 Unexecuted instantiation: encoding_rs::mem::convert_latin1_to_utf8 |
1855 | | |
1856 | | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1857 | | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the |
1858 | | /// output is signaled using the Rust type system with potentially insufficient |
1859 | | /// output space. |
1860 | | /// |
1861 | | /// Returns the number of bytes read and the number of bytes written. |
1862 | | /// |
1863 | | /// If the output isn't large enough, not all input is consumed. |
1864 | | #[inline] |
1865 | 0 | pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) { |
1866 | 0 | let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; |
1867 | 0 | let (read, written) = convert_latin1_to_utf8_partial(src, bytes); |
1868 | 0 | let len = bytes.len(); |
1869 | 0 | let mut trail = written; |
1870 | 0 | let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE); |
1871 | 0 | while trail < max { |
1872 | 0 | bytes[trail] = 0; |
1873 | 0 | trail += 1; |
1874 | 0 | } |
1875 | 0 | while trail < len && ((bytes[trail] & 0xC0) == 0x80) { |
1876 | 0 | bytes[trail] = 0; |
1877 | 0 | trail += 1; |
1878 | 0 | } |
1879 | 0 | (read, written) |
1880 | 0 | } |
1881 | | |
1882 | | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1883 | | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the |
1884 | | /// output is signaled using the Rust type system. |
1885 | | /// |
1886 | | /// The length of the destination buffer must be at least the length of the |
1887 | | /// source buffer times two. |
1888 | | /// |
1889 | | /// Returns the number of bytes written. |
1890 | | /// |
1891 | | /// # Panics |
1892 | | /// |
1893 | | /// Panics if the destination buffer is shorter than stated above. |
1894 | | #[inline] |
1895 | 0 | pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { |
1896 | 0 | assert!( |
1897 | 0 | dst.len() >= src.len() * 2, |
1898 | 0 | "Destination must not be shorter than the source times two." |
1899 | 0 | ); |
1900 | 0 | let (read, written) = convert_latin1_to_str_partial(src, dst); |
1901 | 0 | debug_assert_eq!(read, src.len()); |
1902 | 0 | written |
1903 | 0 | } |
1904 | | |
1905 | | /// If the input is valid UTF-8 representing only Unicode code points from |
1906 | | /// U+0000 to U+00FF, inclusive, converts the input into output that |
1907 | | /// represents the value of each code point as the unsigned byte value of |
1908 | | /// each output byte. |
1909 | | /// |
1910 | | /// If the input does not fulfill the condition stated above, this function |
1911 | | /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise |
1912 | | /// does something that is memory-safe without any promises about any |
1913 | | /// properties of the output. In particular, callers shouldn't assume the |
1914 | | /// output to be the same across crate versions or CPU architectures and |
1915 | | /// should not assume that non-ASCII input can't map to ASCII output. |
1916 | | /// |
1917 | | /// The length of the destination buffer must be at least the length of the |
1918 | | /// source buffer. |
1919 | | /// |
1920 | | /// Returns the number of bytes written. |
1921 | | /// |
1922 | | /// # Panics |
1923 | | /// |
1924 | | /// Panics if the destination buffer is shorter than stated above. |
1925 | | /// |
1926 | | /// Also panics if debug assertions are enabled (and not fuzzing) and the |
1927 | | /// input is not in the range U+0000 to U+00FF, inclusive. |
1928 | 0 | pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { |
1929 | 0 | assert!( |
1930 | 0 | dst.len() >= src.len(), |
1931 | 0 | "Destination must not be shorter than the source." |
1932 | 0 | ); |
1933 | 0 | non_fuzz_debug_assert!(is_utf8_latin1(src)); |
1934 | 0 | let src_len = src.len(); |
1935 | 0 | let src_ptr = src.as_ptr(); |
1936 | 0 | let dst_ptr = dst.as_mut_ptr(); |
1937 | 0 | let mut total_read = 0usize; |
1938 | 0 | let mut total_written = 0usize; |
1939 | | loop { |
1940 | | // dst can't advance more than src |
1941 | 0 | let src_left = src_len - total_read; |
1942 | 0 | if let Some((non_ascii, consumed)) = unsafe { |
1943 | 0 | ascii_to_ascii( |
1944 | 0 | src_ptr.add(total_read), |
1945 | 0 | dst_ptr.add(total_written), |
1946 | 0 | src_left, |
1947 | 0 | ) |
1948 | 0 | } { |
1949 | 0 | total_read += consumed + 1; |
1950 | 0 | total_written += consumed; |
1951 | 0 |
|
1952 | 0 | if total_read == src_len { |
1953 | 0 | return total_written; |
1954 | 0 | } |
1955 | 0 |
|
1956 | 0 | let trail = src[total_read]; |
1957 | 0 | total_read += 1; |
1958 | 0 |
|
1959 | 0 | dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F); |
1960 | 0 | total_written += 1; |
1961 | 0 | continue; |
1962 | 0 | } |
1963 | 0 | return total_written + src_left; |
1964 | | } |
1965 | 0 | } |
1966 | | |
1967 | | /// If the input is valid UTF-16 representing only Unicode code points from |
1968 | | /// U+0000 to U+00FF, inclusive, converts the input into output that |
1969 | | /// represents the value of each code point as the unsigned byte value of |
1970 | | /// each output byte. |
1971 | | /// |
1972 | | /// If the input does not fulfill the condition stated above, does something |
1973 | | /// that is memory-safe without any promises about any properties of the |
1974 | | /// output and will probably assert in debug builds in future versions. |
1975 | | /// In particular, callers shouldn't assume the output to be the same across |
1976 | | /// crate versions or CPU architectures and should not assume that non-ASCII |
1977 | | /// input can't map to ASCII output. |
1978 | | /// |
1979 | | /// The length of the destination buffer must be at least the length of the |
1980 | | /// source buffer. |
1981 | | /// |
1982 | | /// The number of bytes written equals the length of the source buffer. |
1983 | | /// |
1984 | | /// # Panics |
1985 | | /// |
1986 | | /// Panics if the destination buffer is shorter than stated above. |
1987 | | /// |
1988 | | /// (Future versions will probably also panic if debug assertions are enabled |
1989 | | /// (and not fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) |
1990 | 9.99k | pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) { |
1991 | 9.99k | assert!( |
1992 | 9.99k | dst.len() >= src.len(), |
1993 | 9.99k | "Destination must not be shorter than the source." |
1994 | 9.99k | ); |
1995 | | // non_fuzz_debug_assert!(is_utf16_latin1(src)); |
1996 | 9.99k | unsafe { |
1997 | 9.99k | pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len()); |
1998 | 9.99k | } |
1999 | 9.99k | } |
2000 | | |
2001 | | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
2002 | | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. |
2003 | | /// |
2004 | | /// Borrows if input is ASCII-only. Performs a single heap allocation |
2005 | | /// otherwise. |
2006 | | /// |
2007 | | /// Only available if the `alloc` feature is enabled (enabled by default). |
2008 | | #[cfg(feature = "alloc")] |
2009 | 0 | pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> { |
2010 | 0 | let up_to = ascii_valid_up_to(bytes); |
2011 | 0 | // >= makes later things optimize better than == |
2012 | 0 | if up_to >= bytes.len() { |
2013 | 0 | debug_assert_eq!(up_to, bytes.len()); |
2014 | 0 | let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) }; |
2015 | 0 | return Cow::Borrowed(s); |
2016 | 0 | } |
2017 | 0 | let (head, tail) = bytes.split_at(up_to); |
2018 | 0 | let capacity = head.len() + tail.len() * 2; |
2019 | 0 | let mut vec = Vec::with_capacity(capacity); |
2020 | 0 | unsafe { |
2021 | 0 | vec.set_len(capacity); |
2022 | 0 | } |
2023 | 0 | (&mut vec[..up_to]).copy_from_slice(head); |
2024 | 0 | let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]); |
2025 | 0 | vec.truncate(up_to + written); |
2026 | 0 | Cow::Owned(unsafe { String::from_utf8_unchecked(vec) }) |
2027 | 0 | } |
2028 | | |
2029 | | /// If the input is valid UTF-8 representing only Unicode code points from |
2030 | | /// U+0000 to U+00FF, inclusive, converts the input into output that |
2031 | | /// represents the value of each code point as the unsigned byte value of |
2032 | | /// each output byte. |
2033 | | /// |
2034 | | /// If the input does not fulfill the condition stated above, this function |
2035 | | /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise |
2036 | | /// does something that is memory-safe without any promises about any |
2037 | | /// properties of the output. In particular, callers shouldn't assume the |
2038 | | /// output to be the same across crate versions or CPU architectures and |
2039 | | /// should not assume that non-ASCII input can't map to ASCII output. |
2040 | | /// |
2041 | | /// Borrows if input is ASCII-only. Performs a single heap allocation |
2042 | | /// otherwise. |
2043 | | /// |
2044 | | /// Only available if the `alloc` feature is enabled (enabled by default). |
2045 | | #[cfg(feature = "alloc")] |
2046 | 0 | pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> { |
2047 | 0 | let bytes = string.as_bytes(); |
2048 | 0 | let up_to = ascii_valid_up_to(bytes); |
2049 | 0 | // >= makes later things optimize better than == |
2050 | 0 | if up_to >= bytes.len() { |
2051 | 0 | debug_assert_eq!(up_to, bytes.len()); |
2052 | 0 | return Cow::Borrowed(bytes); |
2053 | 0 | } |
2054 | 0 | let (head, tail) = bytes.split_at(up_to); |
2055 | 0 | let capacity = bytes.len(); |
2056 | 0 | let mut vec = Vec::with_capacity(capacity); |
2057 | 0 | unsafe { |
2058 | 0 | vec.set_len(capacity); |
2059 | 0 | } |
2060 | 0 | (&mut vec[..up_to]).copy_from_slice(head); |
2061 | 0 | let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]); |
2062 | 0 | vec.truncate(up_to + written); |
2063 | 0 | Cow::Owned(vec) |
2064 | 0 | } |
2065 | | |
2066 | | /// Returns the index of the first unpaired surrogate or, if the input is |
2067 | | /// valid UTF-16 in its entirety, the length of the input. |
2068 | 0 | pub fn utf16_valid_up_to(buffer: &[u16]) -> usize { |
2069 | 0 | utf16_valid_up_to_impl(buffer) |
2070 | 0 | } |
2071 | | |
2072 | | /// Returns the index of the first byte that starts an invalid byte |
2073 | | /// sequence or a non-Latin1 byte sequence, or the length of the |
2074 | | /// string if there are neither. |
2075 | 0 | pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { |
2076 | 0 | is_utf8_latin1_impl(buffer).unwrap_or(buffer.len()) |
2077 | 0 | } |
2078 | | |
2079 | | /// Returns the index of the first byte that starts a non-Latin1 byte |
2080 | | /// sequence, or the length of the string if there are none. |
2081 | 0 | pub fn str_latin1_up_to(buffer: &str) -> usize { |
2082 | 0 | is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) |
2083 | 0 | } |
2084 | | |
2085 | | /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. |
2086 | | #[inline] |
2087 | 0 | pub fn ensure_utf16_validity(buffer: &mut [u16]) { |
2088 | 0 | let mut offset = 0; |
2089 | | loop { |
2090 | 0 | offset += utf16_valid_up_to(&buffer[offset..]); |
2091 | 0 | if offset == buffer.len() { |
2092 | 0 | return; |
2093 | 0 | } |
2094 | 0 | buffer[offset] = 0xFFFD; |
2095 | 0 | offset += 1; |
2096 | | } |
2097 | 0 | } Unexecuted instantiation: encoding_rs::mem::ensure_utf16_validity Unexecuted instantiation: encoding_rs::mem::ensure_utf16_validity |
2098 | | |
2099 | | /// Copies ASCII from source to destination up to the first non-ASCII byte |
2100 | | /// (or the end of the input if it is ASCII in its entirety). |
2101 | | /// |
2102 | | /// The length of the destination buffer must be at least the length of the |
2103 | | /// source buffer. |
2104 | | /// |
2105 | | /// Returns the number of bytes written. |
2106 | | /// |
2107 | | /// # Panics |
2108 | | /// |
2109 | | /// Panics if the destination buffer is shorter than stated above. |
2110 | 0 | pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize { |
2111 | 0 | assert!( |
2112 | 0 | dst.len() >= src.len(), |
2113 | 0 | "Destination must not be shorter than the source." |
2114 | 0 | ); |
2115 | 0 | if let Some((_, consumed)) = |
2116 | 0 | unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } |
2117 | | { |
2118 | 0 | consumed |
2119 | | } else { |
2120 | 0 | src.len() |
2121 | | } |
2122 | 0 | } |
2123 | | |
2124 | | /// Copies ASCII from source to destination zero-extending it to UTF-16 up to |
2125 | | /// the first non-ASCII byte (or the end of the input if it is ASCII in its |
2126 | | /// entirety). |
2127 | | /// |
2128 | | /// The length of the destination buffer must be at least the length of the |
2129 | | /// source buffer. |
2130 | | /// |
2131 | | /// Returns the number of `u16`s written. |
2132 | | /// |
2133 | | /// # Panics |
2134 | | /// |
2135 | | /// Panics if the destination buffer is shorter than stated above. |
2136 | 0 | pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize { |
2137 | 0 | assert!( |
2138 | 0 | dst.len() >= src.len(), |
2139 | 0 | "Destination must not be shorter than the source." |
2140 | 0 | ); |
2141 | 0 | if let Some((_, consumed)) = |
2142 | 0 | unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) } |
2143 | | { |
2144 | 0 | consumed |
2145 | | } else { |
2146 | 0 | src.len() |
2147 | | } |
2148 | 0 | } |
2149 | | |
2150 | | /// Copies Basic Latin from source to destination narrowing it to ASCII up to |
2151 | | /// the first non-Basic Latin code unit (or the end of the input if it is |
2152 | | /// Basic Latin in its entirety). |
2153 | | /// |
2154 | | /// The length of the destination buffer must be at least the length of the |
2155 | | /// source buffer. |
2156 | | /// |
2157 | | /// Returns the number of bytes written. |
2158 | | /// |
2159 | | /// # Panics |
2160 | | /// |
2161 | | /// Panics if the destination buffer is shorter than stated above. |
2162 | 0 | pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize { |
2163 | 0 | assert!( |
2164 | 0 | dst.len() >= src.len(), |
2165 | 0 | "Destination must not be shorter than the source." |
2166 | 0 | ); |
2167 | 0 | if let Some((_, consumed)) = |
2168 | 0 | unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) } |
2169 | | { |
2170 | 0 | consumed |
2171 | | } else { |
2172 | 0 | src.len() |
2173 | | } |
2174 | 0 | } |
2175 | | |
2176 | | // Any copyright to the test code below this comment is dedicated to the |
2177 | | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
2178 | | |
2179 | | #[cfg(all(test, feature = "alloc"))] |
2180 | | mod tests { |
2181 | | use super::*; |
2182 | | |
2183 | | #[test] |
2184 | | fn test_is_ascii_success() { |
2185 | | let mut src: Vec<u8> = Vec::with_capacity(128); |
2186 | | src.resize(128, 0); |
2187 | | for i in 0..src.len() { |
2188 | | src[i] = i as u8; |
2189 | | } |
2190 | | for i in 0..src.len() { |
2191 | | assert!(is_ascii(&src[i..])); |
2192 | | } |
2193 | | } |
2194 | | |
2195 | | #[test] |
2196 | | fn test_is_ascii_fail() { |
2197 | | let mut src: Vec<u8> = Vec::with_capacity(128); |
2198 | | src.resize(128, 0); |
2199 | | for i in 0..src.len() { |
2200 | | src[i] = i as u8; |
2201 | | } |
2202 | | for i in 0..src.len() { |
2203 | | let tail = &mut src[i..]; |
2204 | | for j in 0..tail.len() { |
2205 | | tail[j] = 0xA0; |
2206 | | assert!(!is_ascii(tail)); |
2207 | | } |
2208 | | } |
2209 | | } |
2210 | | |
2211 | | #[test] |
2212 | | fn test_is_basic_latin_success() { |
2213 | | let mut src: Vec<u16> = Vec::with_capacity(128); |
2214 | | src.resize(128, 0); |
2215 | | for i in 0..src.len() { |
2216 | | src[i] = i as u16; |
2217 | | } |
2218 | | for i in 0..src.len() { |
2219 | | assert!(is_basic_latin(&src[i..])); |
2220 | | } |
2221 | | } |
2222 | | |
2223 | | #[test] |
2224 | | fn test_is_basic_latin_fail() { |
2225 | | let mut src: Vec<u16> = Vec::with_capacity(128); |
2226 | | src.resize(128, 0); |
2227 | | for i in 0..src.len() { |
2228 | | src[i] = i as u16; |
2229 | | } |
2230 | | for i in 0..src.len() { |
2231 | | let tail = &mut src[i..]; |
2232 | | for j in 0..tail.len() { |
2233 | | tail[j] = 0xA0; |
2234 | | assert!(!is_basic_latin(tail)); |
2235 | | } |
2236 | | } |
2237 | | } |
2238 | | |
2239 | | #[test] |
2240 | | fn test_is_utf16_latin1_success() { |
2241 | | let mut src: Vec<u16> = Vec::with_capacity(256); |
2242 | | src.resize(256, 0); |
2243 | | for i in 0..src.len() { |
2244 | | src[i] = i as u16; |
2245 | | } |
2246 | | for i in 0..src.len() { |
2247 | | assert!(is_utf16_latin1(&src[i..])); |
2248 | | assert_eq!( |
2249 | | check_utf16_for_latin1_and_bidi(&src[i..]), |
2250 | | Latin1Bidi::Latin1 |
2251 | | ); |
2252 | | } |
2253 | | } |
2254 | | |
2255 | | #[test] |
2256 | | fn test_is_utf16_latin1_fail() { |
2257 | | let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow |
2258 | | let mut src: Vec<u16> = Vec::with_capacity(len); |
2259 | | src.resize(len, 0); |
2260 | | for i in 0..src.len() { |
2261 | | src[i] = i as u16; |
2262 | | } |
2263 | | for i in 0..src.len() { |
2264 | | let tail = &mut src[i..]; |
2265 | | for j in 0..tail.len() { |
2266 | | tail[j] = 0x100 + j as u16; |
2267 | | assert!(!is_utf16_latin1(tail)); |
2268 | | assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1); |
2269 | | } |
2270 | | } |
2271 | | } |
2272 | | |
2273 | | #[test] |
2274 | | fn test_is_str_latin1_success() { |
2275 | | let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow |
2276 | | let mut src: Vec<u16> = Vec::with_capacity(len); |
2277 | | src.resize(len, 0); |
2278 | | for i in 0..src.len() { |
2279 | | src[i] = i as u16; |
2280 | | } |
2281 | | for i in 0..src.len() { |
2282 | | let s = String::from_utf16(&src[i..]).unwrap(); |
2283 | | assert!(is_str_latin1(&s[..])); |
2284 | | assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); |
2285 | | } |
2286 | | } |
2287 | | |
2288 | | #[test] |
2289 | | fn test_is_str_latin1_fail() { |
2290 | | let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow |
2291 | | let mut src: Vec<u16> = Vec::with_capacity(len); |
2292 | | src.resize(len, 0); |
2293 | | for i in 0..src.len() { |
2294 | | src[i] = i as u16; |
2295 | | } |
2296 | | for i in 0..src.len() { |
2297 | | let tail = &mut src[i..]; |
2298 | | for j in 0..tail.len() { |
2299 | | tail[j] = 0x100 + j as u16; |
2300 | | let s = String::from_utf16(tail).unwrap(); |
2301 | | assert!(!is_str_latin1(&s[..])); |
2302 | | assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1); |
2303 | | } |
2304 | | } |
2305 | | } |
2306 | | |
/// Every suffix of the code unit run `0..len`, encoded as UTF-8 bytes, must
/// be reported as Latin1 by both `is_utf8_latin1` and
/// `check_utf8_for_latin1_and_bidi`.
#[test]
fn test_is_utf8_latin1_success() {
    let unit_count: u16 = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let text = String::from_utf16(&units[start..]).unwrap();
        let bytes = text.as_bytes();
        assert!(is_utf8_latin1(bytes));
        assert_eq!(check_utf8_for_latin1_and_bidi(bytes), Latin1Bidi::Latin1);
    }
}
2324 | | |
/// Progressively overwrites each suffix of the buffer with code units at or
/// above U+0100 and checks, after every single write, that the UTF-8 form
/// is no longer reported as Latin1.
#[test]
fn test_is_utf8_latin1_fail() {
    let unit_count: u16 = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..unit_count).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        for pos in 0..suffix.len() {
            // The writes are cumulative: earlier positions stay non-Latin1.
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            let bytes = text.as_bytes();
            assert!(!is_utf8_latin1(bytes));
            assert_ne!(check_utf8_for_latin1_and_bidi(bytes), Latin1Bidi::Latin1);
        }
    }
}
2346 | | |
/// Byte sequences that are not valid UTF-8 must never be reported as Latin1.
#[test]
fn test_is_utf8_latin1_invalid() {
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for bytes in malformed.iter() {
        assert!(!is_utf8_latin1(bytes), "accepted {:?}", bytes);
    }
}
2356 | | |
/// Converts a mixed ASCII / BMP / astral string from UTF-8 bytes and compares
/// the result with the standard library's `encode_utf16`.
#[test]
fn test_convert_utf8_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = text.encode_utf16().collect();
    // The destination is one unit longer than the UTF-8 byte length.
    let mut out = vec![0u16; text.len() + 1];
    let written = convert_utf8_to_utf16(text.as_bytes(), &mut out[..]);
    out.truncate(written);
    assert_eq!(out, expected);
}
2367 | | |
/// Converts a mixed ASCII / BMP / astral `&str` and compares the result with
/// the standard library's `encode_utf16`.
#[test]
fn test_convert_str_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let expected: Vec<u16> = text.encode_utf16().collect();
    // The destination has exactly as many units as the input has bytes.
    let mut out = vec![0u16; text.len()];
    let written = convert_str_to_utf16(text, &mut out[..]);
    out.truncate(written);
    assert_eq!(out, expected);
}
2378 | | |
/// Converts into a deliberately short 24-byte window first, then finishes
/// the remaining input with `convert_utf16_to_utf8`, and checks that the
/// two pieces together reproduce the original UTF-8 text.
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let units: Vec<u16> = expected.encode_utf16().collect();
    let mut out = vec![0u8; units.len() * 3 + 1];
    let (consumed, head_len) = convert_utf16_to_utf8_partial(&units[..], &mut out[..24]);
    let tail_len = convert_utf16_to_utf8(&units[consumed..], &mut out[head_len..]);
    out.truncate(head_len + tail_len);
    assert_eq!(out, expected.as_bytes());
}
2390 | | |
/// Round-trips a mixed ASCII / BMP / astral string through UTF-16 and checks
/// that `convert_utf16_to_utf8` reproduces the original UTF-8 bytes.
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let units: Vec<u16> = expected.encode_utf16().collect();
    let mut out = vec![0u8; units.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&units[..], &mut out[..]);
    out.truncate(written);
    assert_eq!(out, expected.as_bytes());
}
2401 | | |
/// Each of the 256 byte values must widen to the UTF-16 code unit with the
/// same numeric value.
#[test]
fn test_convert_latin1_to_utf16() {
    let expected: Vec<u16> = (0u16..256).collect();
    let latin1: Vec<u8> = expected.iter().map(|&unit| unit as u8).collect();
    let mut out = vec![0u16; latin1.len()];
    convert_latin1_to_utf16(&latin1[..], &mut out[..]);
    assert_eq!(out, expected);
}
2417 | | |
/// With a two-byte destination, `a` fits but the following byte 0xFF does
/// not, so the partial conversion must stop after one byte read and one
/// byte written.
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // Zero-filled two-byte buffer. (The repeat form `[0u8; 2]` states the
    // intent; the list form `[0u8, 2]` only happened to have the same length.)
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
    // The single byte that was written is the ASCII input byte itself.
    assert_eq!(dst[0], b'a');
}
2425 | | |
/// Converting all 256 byte values must yield the same bytes as the UTF-8
/// form of the code points U+0000..=U+00FF built via `String::from_utf16`.
#[test]
fn test_convert_latin1_to_utf8() {
    let code_units: Vec<u16> = (0u16..256).collect();
    let latin1: Vec<u8> = code_units.iter().map(|&unit| unit as u8).collect();
    let expected = String::from_utf16(&code_units[..]).unwrap();
    let mut out = vec![0u8; latin1.len() * 2];
    let written = convert_latin1_to_utf8(&latin1[..], &mut out[..]);
    out.truncate(written);
    assert_eq!(&out[..], expected.as_bytes());
}
2443 | | |
/// The UTF-8 form of U+0000..=U+00FF must narrow back to the byte values
/// 0..=255 in order.
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    let code_units: Vec<u16> = (0u16..256).collect();
    let expected: Vec<u8> = code_units.iter().map(|&unit| unit as u8).collect();
    let utf8 = String::from_utf16(&code_units[..]).unwrap();
    // The destination is as long as the UTF-8 input.
    let mut out = vec![0u8; utf8.len()];
    let written = convert_utf8_to_latin1_lossy(utf8.as_bytes(), &mut out[..]);
    out.truncate(written);
    assert_eq!(out, expected);
}
2461 | | |
/// Feeding U+0100 (outside Latin1) to `convert_utf8_to_latin1_lossy` is
/// expected to panic. The test is compiled only with debug assertions
/// enabled and outside fuzzing builds.
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    let non_latin1 = "\u{100}";
    let mut sink = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(non_latin1.as_bytes(), &mut sink[..]);
}
2469 | | |
/// Each UTF-16 code unit in 0..=255 must narrow to the byte with the same
/// numeric value.
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    let units: Vec<u16> = (0u16..256).collect();
    let expected: Vec<u8> = units.iter().map(|&unit| unit as u8).collect();
    let mut out = vec![0u8; units.len()];
    convert_utf16_to_latin1_lossy(&units[..], &mut out[..]);
    assert_eq!(out, expected);
}
2485 | | |
/// Feeds a single non-Latin1 code unit (U+0100) to
/// `convert_utf16_to_latin1_lossy`.
///
/// NOTE(review): `#[should_panic]` is commented out below, so despite its
/// name this test passes only when the call returns without panicking.
/// Confirm the intended behavior before re-enabling the attribute.
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    let non_latin1 = [0x0100u16];
    let mut sink = [0u8; 16];
    convert_utf16_to_latin1_lossy(&non_latin1[..], &mut sink[..]);
}
2492 | | |
/// Checks the index reported by `utf16_valid_up_to` for a fully valid
/// buffer and for buffers containing an unpaired surrogate in the middle
/// or at the very end.
#[test]
fn test_utf16_valid_up_to() {
    // Builds `zeros` NUL code units followed by `tail`; every buffer below
    // is 16 units long in total.
    let zero_padded = |zeros: usize, tail: &[u16]| -> Vec<u16> {
        let mut units = vec![0u16; zeros];
        units.extend_from_slice(tail);
        units
    };

    let valid = zero_padded(12, &[0x2603, 0xD83D, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&valid[..]), 16);

    let lone_high = zero_padded(13, &[0x2603, 0xD83D, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);

    let lone_low = zero_padded(13, &[0x2603, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);

    let lone_high_at_end = zero_padded(13, &[0x2603, 0x00B6, 0xD83D]);
    assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
}
2516 | | |
/// Unpaired surrogates must be replaced with U+FFFD while a proper
/// surrogate pair and all other units are left untouched.
#[test]
fn test_ensure_utf16_validity() {
    // 31 units, all NUL except for the surrogates placed below.
    let mut src = vec![0u16; 31];
    src[1] = 0xD83D; // lone high surrogate
    src[5] = 0xD83D; // high half of a valid pair
    src[6] = 0xDCA9; // low half of a valid pair
    src[13] = 0xDCA9; // lone low surrogate

    let mut reference = src.clone();
    reference[1] = 0xFFFD;
    reference[13] = 0xFFFD;

    ensure_utf16_validity(&mut src[..]);
    assert_eq!(src, reference);
}
2532 | | |
/// Spot-checks `is_char_bidi` against characters expected to be classified
/// as non-bidi and as bidi.
#[test]
fn test_is_char_bidi() {
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c), "{:?} reported as bidi", c);
    }

    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c), "{:?} not reported as bidi", c);
    }
}
2558 | | |
/// Spot-checks `is_utf16_code_unit_bidi` against code units expected to be
/// classified as non-bidi and as bidi (including selected high surrogates).
#[test]
fn test_is_utf16_code_unit_bidi() {
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit), "{:#06X} reported as bidi", unit);
    }

    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B,
        0x202E, 0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit), "{:#06X} not reported as bidi", unit);
    }
}
2585 | | |
/// Checks `is_str_bidi` on strings that embed one character of interest
/// between two runs of sixteen ASCII letters.
#[test]
fn test_is_str_bidi() {
    // Expands to a single string literal: ASCII run, `$mid`, ASCII run.
    macro_rules! wrapped {
        ($mid:tt) => {
            concat!("abcdefghijklmnop", $mid, "abcdefghijklmnop")
        };
    }

    let not_bidi = [
        wrapped!("a"),
        wrapped!("\u{03B1}"),
        wrapped!("\u{3041}"),
        wrapped!("\u{1F4A9}"),
        wrapped!("\u{FE00}"),
        wrapped!("\u{202C}"),
        wrapped!("\u{FEFF}"),
    ];
    for text in not_bidi.iter() {
        assert!(!is_str_bidi(text), "{:?} reported as bidi", text);
    }

    let bidi = [
        wrapped!("\u{0590}"),
        wrapped!("\u{08FF}"),
        wrapped!("\u{061C}"),
        wrapped!("\u{FB50}"),
        wrapped!("\u{FDFF}"),
        wrapped!("\u{FE70}"),
        wrapped!("\u{FEFE}"),
        wrapped!("\u{200F}"),
        wrapped!("\u{202B}"),
        wrapped!("\u{202E}"),
        wrapped!("\u{2067}"),
        wrapped!("\u{10800}"),
        wrapped!("\u{10FFF}"),
        wrapped!("\u{1E800}"),
        wrapped!("\u{1EFFF}"),
    ];
    for text in bidi.iter() {
        assert!(is_str_bidi(text), "{:?} not reported as bidi", text);
    }
}
2611 | | |
/// Checks `is_utf8_bidi` on the UTF-8 bytes of strings that embed one
/// character of interest between two runs of sixteen ASCII letters.
#[test]
fn test_is_utf8_bidi() {
    // Expands to a single string literal: ASCII run, `$mid`, ASCII run.
    macro_rules! wrapped {
        ($mid:tt) => {
            concat!("abcdefghijklmnop", $mid, "abcdefghijklmnop")
        };
    }

    let not_bidi = [
        wrapped!("a"),
        wrapped!("\u{03B1}"),
        wrapped!("\u{3041}"),
        wrapped!("\u{1F4A9}"),
        wrapped!("\u{FE00}"),
        wrapped!("\u{202C}"),
        wrapped!("\u{FEFF}"),
    ];
    for text in not_bidi.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()), "{:?} reported as bidi", text);
    }

    let bidi = [
        wrapped!("\u{0590}"),
        wrapped!("\u{08FF}"),
        wrapped!("\u{061C}"),
        wrapped!("\u{FB50}"),
        wrapped!("\u{FDFF}"),
        wrapped!("\u{FE70}"),
        wrapped!("\u{FEFE}"),
        wrapped!("\u{200F}"),
        wrapped!("\u{202B}"),
        wrapped!("\u{202E}"),
        wrapped!("\u{2067}"),
        wrapped!("\u{10800}"),
        wrapped!("\u{10FFF}"),
        wrapped!("\u{1E800}"),
        wrapped!("\u{1EFFF}"),
    ];
    for text in bidi.iter() {
        assert!(is_utf8_bidi(text.as_bytes()), "{:?} not reported as bidi", text);
    }
}
2681 | | |
/// Checks `is_utf16_bidi` on buffers that embed the code unit(s) of interest
/// between two runs of eight ASCII code units.
#[test]
fn test_is_utf16_bidi() {
    // Expands to a borrowed array literal: eight ASCII units, the given
    // unit(s), then the same eight ASCII units again.
    macro_rules! padded {
        ($($mid:expr),+) => {
            &[
                0x62u16, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, $($mid),+,
                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
            ]
        };
    }

    let not_bidi: [&[u16]; 7] = [
        padded!(0x0062),
        padded!(0x03B1),
        padded!(0x3041),
        padded!(0xD801),
        padded!(0xFE00),
        padded!(0x202C),
        padded!(0xFEFF),
    ];
    for units in not_bidi.iter() {
        assert!(!is_utf16_bidi(units), "{:?} reported as bidi", units);
    }

    let bidi: [&[u16]; 16] = [
        padded!(0x0590),
        padded!(0x08FF),
        padded!(0x061C),
        padded!(0xFB1D),
        padded!(0xFB50),
        padded!(0xFDFF),
        padded!(0xFE70),
        padded!(0xFEFE),
        padded!(0x200F),
        padded!(0x202B),
        padded!(0x202E),
        padded!(0x2067),
        padded!(0xD802),
        padded!(0xD803),
        padded!(0xD83A),
        padded!(0xD83B),
    ];
    for units in bidi.iter() {
        assert!(is_utf16_bidi(units), "{:?} not reported as bidi", units);
    }

    // A bidi unit immediately followed by a non-bidi, non-ASCII unit.
    assert!(is_utf16_bidi(padded!(0x0590, 0x3041)));
}
2782 | | |
/// Checks that `check_str_for_latin1_and_bidi` yields `Latin1Bidi::Bidi`
/// exactly for the strings that embed a character expected to be bidi.
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // Expands to a single string literal: ASCII run, `$mid`, ASCII run.
    macro_rules! wrapped {
        ($mid:tt) => {
            concat!("abcdefghijklmnop", $mid, "abcdefghijklmnop")
        };
    }

    let not_bidi = [
        wrapped!("a"),
        wrapped!("\u{03B1}"),
        wrapped!("\u{3041}"),
        wrapped!("\u{1F4A9}"),
        wrapped!("\u{FE00}"),
        wrapped!("\u{202C}"),
        wrapped!("\u{FEFF}"),
    ];
    for text in not_bidi.iter() {
        assert_ne!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Bidi);
    }

    let bidi = [
        wrapped!("\u{0590}"),
        wrapped!("\u{08FF}"),
        wrapped!("\u{061C}"),
        wrapped!("\u{FB50}"),
        wrapped!("\u{FDFF}"),
        wrapped!("\u{FE70}"),
        wrapped!("\u{FEFE}"),
        wrapped!("\u{200F}"),
        wrapped!("\u{202B}"),
        wrapped!("\u{202E}"),
        wrapped!("\u{2067}"),
        wrapped!("\u{10800}"),
        wrapped!("\u{10FFF}"),
        wrapped!("\u{1E800}"),
        wrapped!("\u{1EFFF}"),
    ];
    for text in bidi.iter() {
        assert_eq!(check_str_for_latin1_and_bidi(text), Latin1Bidi::Bidi);
    }
}
2874 | | |
/// Checks that `check_utf8_for_latin1_and_bidi` yields `Latin1Bidi::Bidi`
/// exactly for the byte strings that embed a character expected to be bidi.
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Expands to a single string literal: ASCII run, `$mid`, ASCII run.
    macro_rules! wrapped {
        ($mid:tt) => {
            concat!("abcdefghijklmnop", $mid, "abcdefghijklmnop")
        };
    }

    let not_bidi = [
        wrapped!("a"),
        wrapped!("\u{03B1}"),
        wrapped!("\u{3041}"),
        wrapped!("\u{1F4A9}"),
        wrapped!("\u{FE00}"),
        wrapped!("\u{202C}"),
        wrapped!("\u{FEFF}"),
    ];
    for text in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi
        );
    }

    let bidi = [
        wrapped!("\u{0590}"),
        wrapped!("\u{08FF}"),
        wrapped!("\u{061C}"),
        wrapped!("\u{FB50}"),
        wrapped!("\u{FDFF}"),
        wrapped!("\u{FE70}"),
        wrapped!("\u{FEFE}"),
        wrapped!("\u{200F}"),
        wrapped!("\u{202B}"),
        wrapped!("\u{202E}"),
        wrapped!("\u{2067}"),
        wrapped!("\u{10800}"),
        wrapped!("\u{10FFF}"),
        wrapped!("\u{1E800}"),
        wrapped!("\u{1EFFF}"),
    ];
    for text in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi
        );
    }
}
2966 | | |
/// Checks that `check_utf16_for_latin1_and_bidi` yields `Latin1Bidi::Bidi`
/// exactly for the buffers that embed a code unit expected to be bidi.
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Expands to a borrowed array literal: eight ASCII units, the given
    // unit(s), then the same eight ASCII units again.
    macro_rules! padded {
        ($($mid:expr),+) => {
            &[
                0x62u16, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, $($mid),+,
                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
            ]
        };
    }

    let not_bidi: [&[u16]; 7] = [
        padded!(0x0062),
        padded!(0x03B1),
        padded!(0x3041),
        padded!(0xD801),
        padded!(0xFE00),
        padded!(0x202C),
        padded!(0xFEFF),
    ];
    for units in not_bidi.iter() {
        assert_ne!(check_utf16_for_latin1_and_bidi(units), Latin1Bidi::Bidi);
    }

    let bidi: [&[u16]; 16] = [
        padded!(0x0590),
        padded!(0x08FF),
        padded!(0x061C),
        padded!(0xFB1D),
        padded!(0xFB50),
        padded!(0xFDFF),
        padded!(0xFE70),
        padded!(0xFEFE),
        padded!(0x200F),
        padded!(0x202B),
        padded!(0x202E),
        padded!(0x2067),
        padded!(0xD802),
        padded!(0xD803),
        padded!(0xD83A),
        padded!(0xD83B),
    ];
    for units in bidi.iter() {
        assert_eq!(check_utf16_for_latin1_and_bidi(units), Latin1Bidi::Bidi);
    }

    // A bidi unit immediately followed by a non-bidi, non-ASCII unit.
    assert_eq!(
        check_utf16_for_latin1_and_bidi(padded!(0x0590, 0x3041)),
        Latin1Bidi::Bidi
    );
}
3139 | | |
/// Deliberately simple oracle used by the exhaustive tests in this module.
///
/// Returns `true` when `c` lies in one of the listed right-to-left ranges or
/// is one of the four individually listed code points (U+200F, U+202B,
/// U+202E, U+2067); returns `false` for every other scalar value.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Inclusive `(first, last)` bounds. Individual code points are expressed
    // as one-element ranges so a single scan covers everything.
    const RTL_RANGES: [(u32, u32); 9] = [
        (0x0590, 0x08FF),
        (0xFB1D, 0xFDFF),
        (0xFE70, 0xFEFE),
        (0x10800, 0x10FFF),
        (0x1E800, 0x1EFFF),
        (0x200F, 0x200F),
        (0x202B, 0x202B),
        (0x202E, 0x202E),
        (0x2067, 0x2067),
    ];
    let scalar = u32::from(c);
    RTL_RANGES
        .iter()
        .any(|&(first, last)| first <= scalar && scalar <= last)
}
3155 | | |
/// Deliberately simple oracle for a single UTF-16 code unit, used by the
/// exhaustive tests in this module.
///
/// Returns `true` for code units inside the listed BMP right-to-left blocks,
/// for the four individually listed BMP code points, and for the high
/// surrogates 0xD802, 0xD803, 0xD83A and 0xD83B (the lead units of
/// U+10800..=U+10FFF and U+1E800..=U+1EFFF, the astral ranges that
/// `reference_is_char_bidi` accepts). Everything else yields `false`.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // Contiguous BMP blocks, as inclusive `(first, last)` bounds.
    const RTL_BLOCKS: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
    // Individually matched units: high surrogates first, then BMP code points.
    const RTL_UNITS: [u16; 8] = [
        0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
    ];
    RTL_UNITS.contains(&u)
        || RTL_BLOCKS
            .iter()
            .any(|&(first, last)| first <= u && u <= last)
}
3173 | | |
/// Exhaustively checks `is_char_bidi` against `reference_is_char_bidi` for
/// every Unicode scalar value.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // 0xD800..0xE000 is skipped: surrogates are not valid `char`s, so
    // `from_u32` would return `None` there. Chaining the two halves keeps a
    // single loop body instead of two copies.
    for i in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(i).unwrap();
        // Name the offending code point; otherwise a failure among ~1.1M
        // iterations only shows `true != false`.
        assert_eq!(
            is_char_bidi(c),
            reference_is_char_bidi(c),
            "is_char_bidi disagrees with the reference for U+{:04X}",
            i
        );
    }
}
3186 | | |
/// Exhaustively checks `is_utf16_code_unit_bidi` against
/// `reference_is_utf16_code_unit_bidi` for all 65536 code unit values.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Iterate directly over `u16` with an inclusive range so no truncating
    // `as` cast from a wider counter is needed.
    for u in 0..=0xFFFFu16 {
        // Name the offending code unit so a failure is actionable.
        assert_eq!(
            is_utf16_code_unit_bidi(u),
            reference_is_utf16_code_unit_bidi(u),
            "is_utf16_code_unit_bidi disagrees with the reference for code unit 0x{:04X}",
            u
        );
    }
}
3198 | | |
/// Exhaustively checks `is_str_bidi` on the UTF-8 encoding of every Unicode
/// scalar value against `reference_is_char_bidi`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Four bytes is the maximum UTF-8 length of a single scalar value.
    let mut buf = [0u8; 4];
    // 0xD800..0xE000 is skipped: surrogates are not valid `char`s. Chaining
    // the two halves keeps a single loop body instead of two copies.
    for i in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(i).unwrap();
        // Name the offending code point so a failure is actionable.
        assert_eq!(
            is_str_bidi(c.encode_utf8(&mut buf[..])),
            reference_is_char_bidi(c),
            "is_str_bidi disagrees with the reference for U+{:04X}",
            i
        );
    }
}
3218 | | |
/// Exhaustively checks `is_utf8_bidi` on the UTF-8 encoding of every Unicode
/// scalar value against `reference_is_char_bidi`.
///
/// Each scalar value is checked twice: once as exactly its encoded bytes and
/// once as a fixed 8-byte buffer holding those bytes followed by NUL padding
/// (presumably to also exercise the code path taken for longer inputs).
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut buf = [0u8; 8];
    // 0xD800..0xE000 is skipped: surrogates are not valid `char`s. Chaining
    // the two halves keeps a single loop body instead of two copies.
    for i in (0..0xD800u32).chain(0xE000..0x110000u32) {
        let c: char = ::core::char::from_u32(i).unwrap();
        let expect = reference_is_char_bidi(c);

        // The mutable borrow taken by `encode_utf8` ends with this statement,
        // so the buffer can be re-sliced freely below.
        let len = c.encode_utf8(&mut buf[..]).len();
        assert_eq!(
            is_utf8_bidi(&buf[..len]),
            expect,
            "is_utf8_bidi disagrees with the reference for U+{:04X} (exact-length input)",
            i
        );

        // Clear whatever a previous, longer encoding left behind so the
        // padded input is this scalar value followed only by NULs.
        for b in buf[len..].iter_mut() {
            *b = 0;
        }
        assert_eq!(
            is_utf8_bidi(&buf[..]),
            expect,
            "is_utf8_bidi disagrees with the reference for U+{:04X} (NUL-padded input)",
            i
        );
    }
}
3260 | | |
/// Exhaustively checks `is_utf16_bidi` against
/// `reference_is_utf16_code_unit_bidi` by placing each possible code unit at
/// index 15 of an otherwise zeroed 32-unit buffer.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    let mut buf = [0u16; 32];
    // Iterate directly over `u16` with an inclusive range so no truncating
    // `as` cast from a wider counter is needed.
    for u in 0..=0xFFFFu16 {
        buf[15] = u;
        // Name the offending code unit so a failure is actionable.
        assert_eq!(
            is_utf16_bidi(&buf[..]),
            reference_is_utf16_code_unit_bidi(u),
            "is_utf16_bidi disagrees with the reference for code unit 0x{:04X}",
            u
        );
    }
}
3274 | | |
/// Spot-checks `is_utf8_bidi` around the start of the right-to-left range and
/// with a truncated trailing sequence.
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // Well-formed inputs without right-to-left content. `\xD5\xBF` and
    // `\xD6\x80` encode U+057F and U+0580, both just below U+0590.
    let expect_false: [&[u8]; 3] = [
        &b"\xD5\xBF\x61"[..],
        &b"\xD6\x80\x61"[..],
        &b"abc"[..],
    ];
    for &bytes in expect_false.iter() {
        assert!(!is_utf8_bidi(bytes));
    }

    // The same prefixes ending in a lone `\xC2` lead byte, i.e. a truncated
    // two-byte sequence; these are expected to be reported as bidi.
    let expect_true: [&[u8]; 3] = [
        &b"\xD5\xBF\xC2"[..],
        &b"\xD6\x80\xC2"[..],
        &b"ab\xC2"[..],
    ];
    for &bytes in expect_true.iter() {
        assert!(is_utf8_bidi(bytes));
    }
}
3284 | | |
/// Checks that `decode_latin1` borrows for ASCII-only input and produces the
/// expected text for a non-ASCII byte.
#[test]
fn test_decode_latin1() {
    // ASCII-only input is expected to come back as a borrow.
    let ascii = decode_latin1(b"ab");
    if let Cow::Borrowed(s) = ascii {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Byte 0xE4 must decode to U+00E4.
    let decoded = decode_latin1(b"a\xE4");
    assert_eq!(decoded, "a\u{E4}");
}
3297 | | |
/// Checks that `encode_latin1_lossy` borrows for ASCII-only input and
/// produces the expected byte for U+00E4.
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input is expected to come back as a borrow.
    let ascii = encode_latin1_lossy("ab");
    if let Cow::Borrowed(s) = ascii {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 must encode to the single byte 0xE4.
    let encoded = encode_latin1_lossy("a\u{E4}");
    assert_eq!(encoded, &(b"a\xE4")[..]);
}
3310 | | |
/// Runs `convert_utf8_to_utf16_without_replacement` over a sequence of inputs
/// that share one output buffer, checking the return value and the first
/// three code units of the buffer after each call.
///
/// The cases are order-dependent: the buffer is never cleared, so units beyond
/// what a call wrote still hold values from earlier calls (e.g. the `c` seen
/// at index 1 after the third call was written by the second call).
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    let mut buf = [0u16; 5];

    // (input, destination length, expected return value, expected `buf[..3]`)
    let cases: [(&[u8], usize, Option<usize>, [u16; 3]); 7] = [
        (&b"ab"[..], 2, Some(2), [u16::from(b'a'), u16::from(b'b'), 0]),
        (&b"\xC3\xA4c"[..], 3, Some(2), [0xE4, u16::from(b'c'), 0]),
        (&b"\xE2\x98\x83"[..], 3, Some(1), [0x2603, u16::from(b'c'), 0]),
        (&b"\xE2\x98\x83d"[..], 4, Some(2), [0x2603, u16::from(b'd'), 0]),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, Some(2), [0x2603, 0xE4, 0]),
        (&b"\xF0\x9F\x93\x8E"[..], 4, Some(2), [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            Some(3),
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];
    for &(src, dst_len, written, head) in cases.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            written
        );
        assert_eq!(buf[0], head[0]);
        assert_eq!(buf[1], head[1]);
        assert_eq!(buf[2], head[2]);
    }

    // A four-byte sequence cut off after three bytes is rejected; the buffer
    // contents are not inspected in that case.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3368 | | } |