Coverage Report

Created: 2025-06-16 06:50

/rust/registry/src/index.crates.io-6f17d22bba15001f/encoding_rs-0.8.35/src/mem.rs
Line
Count
Source (jump to first uncovered line)
1
// Copyright Mozilla Foundation. See the COPYRIGHT
2
// file at the top-level directory of this distribution.
3
//
4
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7
// option. This file may not be copied, modified, or distributed
8
// except according to those terms.
9
10
//! Functions for converting between different in-RAM representations of text
11
//! and for quickly checking if the Unicode Bidirectional Algorithm can be
12
//! avoided.
13
//!
14
//! By using slices for output, the functions here seek to enable by-register
15
//! (ALU register or SIMD register as available) operations in order to
16
//! outperform iterator-based conversions available in the Rust standard
17
//! library.
18
//!
19
//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20
//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21
//! in-memory encoding is sometimes used as a storage optimization of text
22
//! when UTF-16 indexing and length semantics are exposed.
23
//!
24
//! The FFI binding for this module are in the
25
//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27
#[cfg(feature = "alloc")]
28
use alloc::borrow::Cow;
29
#[cfg(feature = "alloc")]
30
use alloc::string::String;
31
#[cfg(feature = "alloc")]
32
use alloc::vec::Vec;
33
34
use super::in_inclusive_range16;
35
use super::in_inclusive_range32;
36
use super::in_inclusive_range8;
37
use super::in_range16;
38
use super::in_range32;
39
use super::DecoderResult;
40
use crate::ascii::*;
41
use crate::utf_8::*;
42
43
macro_rules! non_fuzz_debug_assert {
44
    ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); })
45
}
46
47
cfg_if! {
48
    if #[cfg(feature = "simd-accel")] {
49
        use ::core::intrinsics::likely;
50
        use ::core::intrinsics::unlikely;
51
    } else {
52
        #[inline(always)]
53
0
        fn likely(b: bool) -> bool {
54
0
            b
55
0
        }
56
        #[inline(always)]
57
0
        fn unlikely(b: bool) -> bool {
58
0
            b
59
0
        }
60
    }
61
}
62
63
/// Classification of text as Latin1 (all code points are below U+0100),
64
/// left-to-right with some non-Latin1 characters or as containing at least
65
/// some right-to-left characters.
66
#[must_use]
67
#[derive(Debug, PartialEq, Eq)]
68
#[repr(C)]
69
pub enum Latin1Bidi {
70
    /// Every character is below U+0100.
71
    Latin1 = 0,
72
    /// There is at least one character that's U+0100 or higher, but there
73
    /// are no right-to-left characters.
74
    LeftToRight = 1,
75
    /// There is at least one right-to-left character.
76
    Bidi = 2,
77
}
78
79
// `as` truncates, so works on 32-bit, too.
80
#[allow(dead_code)]
81
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
82
83
#[allow(unused_macros)]
84
macro_rules! by_unit_check_alu {
85
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
86
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
87
        #[inline(always)]
88
15.6k
        fn $name(buffer: &[$unit]) -> bool {
89
15.6k
            let mut offset = 0usize;
90
15.6k
            let mut accu = 0usize;
91
15.6k
            let unit_size = ::core::mem::size_of::<$unit>();
92
15.6k
            let len = buffer.len();
93
15.6k
            if len >= ALU_ALIGNMENT / unit_size {
94
                // The most common reason to return `false` is for the first code
95
                // unit to fail the test, so check that first.
96
15.6k
                if buffer[0] >= $bound {
97
245
                    return false;
98
15.4k
                }
99
15.4k
                let src = buffer.as_ptr();
100
15.4k
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
101
15.4k
                    & ALU_ALIGNMENT_MASK)
102
15.4k
                    / unit_size;
103
15.4k
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
104
15.4k
                    if until_alignment != 0 {
105
545
                        accu |= buffer[offset] as usize;
106
545
                        offset += 1;
107
545
                        until_alignment -= 1;
108
1.14k
                        while until_alignment != 0 {
109
603
                            accu |= buffer[offset] as usize;
110
603
                            offset += 1;
111
603
                            until_alignment -= 1;
112
603
                        }
113
545
                        if accu >= $bound {
114
14
                            return false;
115
531
                        }
116
14.9k
                    }
117
15.4k
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
118
15.4k
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
119
                        // Safety: the above check lets us perform 4 consecutive reads of
120
                        // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size
121
                        // is the size of the `src` pointer, so this is equal to performing four usize reads.
122
                        //
123
                        // This invariant is upheld on all loop iterations
124
15.3k
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
125
                        loop {
126
7.74M
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
127
7.74M
                                | unsafe {
128
7.74M
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
129
7.74M
                                }
130
7.74M
                                | unsafe {
131
7.74M
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
132
7.74M
                                        as *const usize)
133
7.74M
                                }
134
7.74M
                                | unsafe {
135
7.74M
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
136
7.74M
                                        as *const usize)
137
7.74M
                                };
138
7.74M
                            if unroll_accu & $mask != 0 {
139
4.89k
                                return false;
140
7.73M
                            }
141
7.73M
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
142
7.73M
                            // Safety: this check lets us continue to perform the 4 reads earlier
143
7.73M
                            if offset > len_minus_unroll {
144
10.4k
                                break;
145
7.72M
                            }
146
                        }
147
81
                    }
148
22.7k
                    while offset <= len_minus_stride {
149
12.2k
                        // Safety: the above check lets us perform one usize read.
150
12.2k
                        accu |= unsafe { *(src.add(offset) as *const usize) };
151
12.2k
                        offset += ALU_ALIGNMENT / unit_size;
152
12.2k
                    }
153
0
                }
154
0
            }
155
14.6k
            for &unit in &buffer[offset..] {
156
14.6k
                accu |= unit as usize;
157
14.6k
            }
158
10.5k
            accu & $mask == 0
159
15.6k
        }
Unexecuted instantiation: encoding_rs::mem::is_ascii_impl
Unexecuted instantiation: encoding_rs::mem::is_basic_latin_impl
encoding_rs::mem::is_utf16_latin1_impl
Line
Count
Source
88
15.6k
        fn $name(buffer: &[$unit]) -> bool {
89
15.6k
            let mut offset = 0usize;
90
15.6k
            let mut accu = 0usize;
91
15.6k
            let unit_size = ::core::mem::size_of::<$unit>();
92
15.6k
            let len = buffer.len();
93
15.6k
            if len >= ALU_ALIGNMENT / unit_size {
94
                // The most common reason to return `false` is for the first code
95
                // unit to fail the test, so check that first.
96
15.6k
                if buffer[0] >= $bound {
97
245
                    return false;
98
15.4k
                }
99
15.4k
                let src = buffer.as_ptr();
100
15.4k
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
101
15.4k
                    & ALU_ALIGNMENT_MASK)
102
15.4k
                    / unit_size;
103
15.4k
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
104
15.4k
                    if until_alignment != 0 {
105
545
                        accu |= buffer[offset] as usize;
106
545
                        offset += 1;
107
545
                        until_alignment -= 1;
108
1.14k
                        while until_alignment != 0 {
109
603
                            accu |= buffer[offset] as usize;
110
603
                            offset += 1;
111
603
                            until_alignment -= 1;
112
603
                        }
113
545
                        if accu >= $bound {
114
14
                            return false;
115
531
                        }
116
14.9k
                    }
117
15.4k
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
118
15.4k
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
119
                        // Safety: the above check lets us perform 4 consecutive reads of
120
                        // length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size
121
                        // is the size of the `src` pointer, so this is equal to performing four usize reads.
122
                        //
123
                        // This invariant is upheld on all loop iterations
124
15.3k
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
125
                        loop {
126
7.74M
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
127
7.74M
                                | unsafe {
128
7.74M
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
129
7.74M
                                }
130
7.74M
                                | unsafe {
131
7.74M
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
132
7.74M
                                        as *const usize)
133
7.74M
                                }
134
7.74M
                                | unsafe {
135
7.74M
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
136
7.74M
                                        as *const usize)
137
7.74M
                                };
138
7.74M
                            if unroll_accu & $mask != 0 {
139
4.89k
                                return false;
140
7.73M
                            }
141
7.73M
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
142
7.73M
                            // Safety: this check lets us continue to perform the 4 reads earlier
143
7.73M
                            if offset > len_minus_unroll {
144
10.4k
                                break;
145
7.72M
                            }
146
                        }
147
81
                    }
148
22.7k
                    while offset <= len_minus_stride {
149
12.2k
                        // Safety: the above check lets us perform one usize read.
150
12.2k
                        accu |= unsafe { *(src.add(offset) as *const usize) };
151
12.2k
                        offset += ALU_ALIGNMENT / unit_size;
152
12.2k
                    }
153
0
                }
154
0
            }
155
14.6k
            for &unit in &buffer[offset..] {
156
14.6k
                accu |= unit as usize;
157
14.6k
            }
158
10.5k
            accu & $mask == 0
159
15.6k
        }
160
    };
161
}
162
163
#[allow(unused_macros)]
164
macro_rules! by_unit_check_simd {
165
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
166
        #[inline(always)]
167
        fn $name(buffer: &[$unit]) -> bool {
168
            let mut offset = 0usize;
169
            let mut accu = 0usize;
170
            let unit_size = ::core::mem::size_of::<$unit>();
171
            let len = buffer.len();
172
            if len >= SIMD_STRIDE_SIZE / unit_size {
173
                // The most common reason to return `false` is for the first code
174
                // unit to fail the test, so check that first.
175
                if buffer[0] >= $bound {
176
                    return false;
177
                }
178
                let src = buffer.as_ptr();
179
                let mut until_alignment = ((SIMD_ALIGNMENT
180
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
181
                    & SIMD_ALIGNMENT_MASK)
182
                    / unit_size;
183
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
184
                    if until_alignment != 0 {
185
                        accu |= buffer[offset] as usize;
186
                        offset += 1;
187
                        until_alignment -= 1;
188
                        while until_alignment != 0 {
189
                            accu |= buffer[offset] as usize;
190
                            offset += 1;
191
                            until_alignment -= 1;
192
                        }
193
                        if accu >= $bound {
194
                            return false;
195
                        }
196
                    }
197
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
198
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
199
                        // Safety: the above check lets us perform 4 consecutive reads of
200
                        // length SIMD_STRIDE_SIZE / unit_size. SIMD_STRIDE_SIZE is the size of $simd_ty, and unit_size
201
                        // is the size of the `src` pointer, so this is equal to performing four $simd_ty reads.
202
                        //
203
                        // This invariant is upheld on all loop iterations
204
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
205
                        loop {
206
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
207
                                | unsafe {
208
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
209
                                        as *const $simd_ty)
210
                                }
211
                                | unsafe {
212
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
213
                                        as *const $simd_ty)
214
                                }
215
                                | unsafe {
216
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
217
                                        as *const $simd_ty)
218
                                };
219
                            if !$func(unroll_accu) {
220
                                return false;
221
                            }
222
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
223
                            // Safety: this check lets us continue to perform the 4 reads earlier
224
                            if offset > len_minus_unroll {
225
                                break;
226
                            }
227
                        }
228
                    }
229
                    let mut simd_accu = $splat;
230
                    while offset <= len_minus_stride {
231
                        // Safety: the above check lets us perform one $simd_ty read.
232
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
233
                        offset += SIMD_STRIDE_SIZE / unit_size;
234
                    }
235
                    if !$func(simd_accu) {
236
                        return false;
237
                    }
238
                }
239
            }
240
            for &unit in &buffer[offset..] {
241
                accu |= unit as usize;
242
            }
243
            accu < $bound
244
        }
245
    };
246
}
247
248
cfg_if! {
249
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
250
        use crate::simd_funcs::*;
251
        use core::simd::u8x16;
252
        use core::simd::u16x8;
253
254
        const SIMD_ALIGNMENT: usize = 16;
255
256
        const SIMD_ALIGNMENT_MASK: usize = 15;
257
258
        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
259
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
260
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);
261
262
        #[inline(always)]
263
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
264
            // This function is a mess, because it simultaneously tries to do
265
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
266
            // the last code unit in a SIMD stride being part of a valid
267
            // surrogate pair.
268
            let unit_size = ::core::mem::size_of::<u16>();
269
            let src = buffer.as_ptr();
270
            let len = buffer.len();
271
            let mut offset = 0usize;
272
            'outer: loop {
273
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
274
                                        SIMD_ALIGNMENT_MASK) / unit_size;
275
                if until_alignment == 0 {
276
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
277
                        break;
278
                    }
279
                } else {
280
                    let offset_plus_until_alignment = offset + until_alignment;
281
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
282
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
283
                        break;
284
                    }
285
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
286
                    if up_to < until_alignment {
287
                        return offset + up_to;
288
                    }
289
                    if last_valid_low {
290
                        offset = offset_plus_until_alignment_plus_one;
291
                        continue;
292
                    }
293
                    offset = offset_plus_until_alignment;
294
                }
295
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
296
                loop {
297
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
298
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
299
                        if offset_plus_stride == len {
300
                            break 'outer;
301
                        }
302
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
303
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
304
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
305
                            return offset + up_to;
306
                        }
307
                        if last_valid_low {
308
                            offset = offset_plus_stride_plus_one;
309
                            continue 'outer;
310
                        }
311
                    }
312
                    offset = offset_plus_stride;
313
                    if offset > len_minus_stride {
314
                        break 'outer;
315
                    }
316
                }
317
            }
318
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
319
            offset + up_to
320
        }
321
    } else {
322
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
323
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
324
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);
325
326
        #[inline(always)]
327
0
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
328
0
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
329
0
            up_to
330
0
        }
331
    }
332
}
333
334
/// The second return value is true iff the last code unit of the slice was
335
/// reached and turned out to be a low surrogate that is part of a valid pair.
336
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
337
#[inline(always)]
338
0
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
339
0
    let len = buffer.len();
340
0
    if len == 0 {
341
0
        return (0, false);
342
0
    }
343
0
    let mut offset = 0usize;
344
    loop {
345
0
        let unit = buffer[offset];
346
0
        let next = offset + 1;
347
0
        let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
348
0
        if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
349
            // Not a surrogate
350
0
            offset = next;
351
0
            if offset == len {
352
0
                return (offset, false);
353
0
            }
354
0
            continue;
355
0
        }
356
0
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
357
            // high surrogate
358
0
            if next < len {
359
0
                let second = buffer[next];
360
0
                let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
361
0
                if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
362
                    // The next code unit is a low surrogate. Advance position.
363
0
                    offset = next + 1;
364
0
                    if offset == len {
365
0
                        return (offset, true);
366
0
                    }
367
0
                    continue;
368
0
                }
369
                // The next code unit is not a low surrogate. Don't advance
370
                // position and treat the high surrogate as unpaired.
371
                // fall through
372
0
            }
373
            // Unpaired, fall through
374
0
        }
375
        // Unpaired surrogate
376
0
        return (offset, false);
377
    }
378
0
}
379
380
cfg_if! {
381
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
382
        #[inline(always)]
383
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
384
            let mut offset = 0usize;
385
            let bytes = buffer.as_bytes();
386
            let len = bytes.len();
387
            if len >= SIMD_STRIDE_SIZE {
388
                let src = bytes.as_ptr();
389
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
390
                                           SIMD_ALIGNMENT_MASK;
391
                if until_alignment + SIMD_STRIDE_SIZE <= len {
392
                    while until_alignment != 0 {
393
                        if bytes[offset] > 0xC3 {
394
                            return Some(offset);
395
                        }
396
                        offset += 1;
397
                        until_alignment -= 1;
398
                    }
399
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
400
                    loop {
401
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
402
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
403
                            while bytes[offset] & 0xC0 == 0x80 {
404
                                offset += 1;
405
                            }
406
                            return Some(offset);
407
                        }
408
                        offset += SIMD_STRIDE_SIZE;
409
                        if offset > len_minus_stride {
410
                            break;
411
                        }
412
                    }
413
                }
414
            }
415
            for i in offset..len {
416
                if bytes[i] > 0xC3 {
417
                    return Some(i);
418
                }
419
            }
420
            None
421
        }
422
    } else {
423
        #[inline(always)]
424
0
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
425
0
            let mut bytes = buffer.as_bytes();
426
0
            let mut total = 0;
427
            loop {
428
0
                if let Some((byte, offset)) = validate_ascii(bytes) {
429
0
                    total += offset;
430
0
                    if byte > 0xC3 {
431
0
                        return Some(total);
432
0
                    }
433
0
                    bytes = &bytes[offset + 2..];
434
0
                    total += 2;
435
                } else {
436
0
                    return None;
437
                }
438
            }
439
0
        }
440
    }
441
}
442
443
#[inline(always)]
444
0
fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
445
0
    let mut bytes = buffer;
446
0
    let mut total = 0;
447
    loop {
448
0
        if let Some((byte, offset)) = validate_ascii(bytes) {
449
0
            total += offset;
450
0
            if in_inclusive_range8(byte, 0xC2, 0xC3) {
451
0
                let next = offset + 1;
452
0
                if next == bytes.len() {
453
0
                    return Some(total);
454
0
                }
455
0
                if bytes[next] & 0xC0 != 0x80 {
456
0
                    return Some(total);
457
0
                }
458
0
                bytes = &bytes[offset + 2..];
459
0
                total += 2;
460
            } else {
461
0
                return Some(total);
462
            }
463
        } else {
464
0
            return None;
465
        }
466
    }
467
0
}
468
469
cfg_if! {
470
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
471
        #[inline(always)]
472
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
473
            let mut offset = 0usize;
474
            let len = buffer.len();
475
            if len >= SIMD_STRIDE_SIZE / 2 {
476
                let src = buffer.as_ptr();
477
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
478
                                           SIMD_ALIGNMENT_MASK) / 2;
479
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
480
                    while until_alignment != 0 {
481
                        if is_utf16_code_unit_bidi(buffer[offset]) {
482
                            return true;
483
                        }
484
                        offset += 1;
485
                        until_alignment -= 1;
486
                    }
487
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
488
                    loop {
489
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
490
                            return true;
491
                        }
492
                        offset += SIMD_STRIDE_SIZE / 2;
493
                        if offset > len_minus_stride {
494
                            break;
495
                        }
496
                    }
497
                }
498
            }
499
            for &u in &buffer[offset..] {
500
                if is_utf16_code_unit_bidi(u) {
501
                    return true;
502
                }
503
            }
504
            false
505
        }
506
    } else {
507
        #[inline(always)]
508
0
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
509
0
            for &u in buffer {
510
0
                if is_utf16_code_unit_bidi(u) {
511
0
                    return true;
512
0
                }
513
            }
514
0
            false
515
0
        }
516
    }
517
}
518
519
cfg_if! {
520
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
521
        #[inline(always)]
522
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
523
            let mut offset = 0usize;
524
            let len = buffer.len();
525
            if len >= SIMD_STRIDE_SIZE / 2 {
526
                let src = buffer.as_ptr();
527
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
528
                                           SIMD_ALIGNMENT_MASK) / 2;
529
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
530
                    while until_alignment != 0 {
531
                        if buffer[offset] > 0xFF {
532
                            // This transition isn't optimal, since the aligment is recomputing
533
                            // but not tweaking further today.
534
                            if is_utf16_bidi_impl(&buffer[offset..]) {
535
                                return Latin1Bidi::Bidi;
536
                            }
537
                            return Latin1Bidi::LeftToRight;
538
                        }
539
                        offset += 1;
540
                        until_alignment -= 1;
541
                    }
542
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
543
                    loop {
544
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
545
                        if !simd_is_latin1(s) {
546
                            loop {
547
                                if is_u16x8_bidi(s) {
548
                                    return Latin1Bidi::Bidi;
549
                                }
550
                                offset += SIMD_STRIDE_SIZE / 2;
551
                                if offset > len_minus_stride {
552
                                    for &u in &buffer[offset..] {
553
                                        if is_utf16_code_unit_bidi(u) {
554
                                            return Latin1Bidi::Bidi;
555
                                        }
556
                                    }
557
                                    return Latin1Bidi::LeftToRight;
558
                                }
559
                                s = unsafe { *(src.add(offset) as *const u16x8) };
560
                            }
561
                        }
562
                        offset += SIMD_STRIDE_SIZE / 2;
563
                        if offset > len_minus_stride {
564
                            break;
565
                        }
566
                    }
567
                }
568
            }
569
            let mut iter = (&buffer[offset..]).iter();
570
            loop {
571
                if let Some(&u) = iter.next() {
572
                    if u > 0xFF {
573
                        let mut inner_u = u;
574
                        loop {
575
                            if is_utf16_code_unit_bidi(inner_u) {
576
                                return Latin1Bidi::Bidi;
577
                            }
578
                            if let Some(&code_unit) = iter.next() {
579
                                inner_u = code_unit;
580
                            } else {
581
                                return Latin1Bidi::LeftToRight;
582
                            }
583
                        }
584
                    }
585
                } else {
586
                    return Latin1Bidi::Latin1;
587
                }
588
            }
589
        }
590
    } else {
591
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
592
        #[inline(always)]
593
0
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
594
0
            let mut offset = 0usize;
595
0
            let len = buffer.len();
596
0
            if len >= ALU_ALIGNMENT / 2 {
597
0
                let src = buffer.as_ptr();
598
0
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
599
0
                                           ALU_ALIGNMENT_MASK) / 2;
600
0
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
601
0
                    while until_alignment != 0 {
602
0
                        if buffer[offset] > 0xFF {
603
0
                            if is_utf16_bidi_impl(&buffer[offset..]) {
604
0
                                return Latin1Bidi::Bidi;
605
0
                            }
606
0
                            return Latin1Bidi::LeftToRight;
607
0
                        }
608
0
                        offset += 1;
609
0
                        until_alignment -= 1;
610
                    }
611
0
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
612
                    loop {
613
0
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
614
0
                            if is_utf16_bidi_impl(&buffer[offset..]) {
615
0
                                return Latin1Bidi::Bidi;
616
0
                            }
617
0
                            return Latin1Bidi::LeftToRight;
618
0
                        }
619
0
                        offset += ALU_ALIGNMENT / 2;
620
0
                        if offset > len_minus_stride {
621
0
                            break;
622
0
                        }
623
                    }
624
0
                }
625
0
            }
626
0
            let mut iter = (&buffer[offset..]).iter();
627
            loop {
628
0
                if let Some(&u) = iter.next() {
629
0
                    if u > 0xFF {
630
0
                        let mut inner_u = u;
631
                        loop {
632
0
                            if is_utf16_code_unit_bidi(inner_u) {
633
0
                                return Latin1Bidi::Bidi;
634
0
                            }
635
0
                            if let Some(&code_unit) = iter.next() {
636
0
                                inner_u = code_unit;
637
0
                            } else {
638
0
                                return Latin1Bidi::LeftToRight;
639
                            }
640
                        }
641
0
                    }
642
                } else {
643
0
                    return Latin1Bidi::Latin1;
644
                }
645
            }
646
0
        }
647
    }
648
}
649
650
/// Checks whether the buffer is all-ASCII.
651
///
652
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
653
/// is not guaranteed to fail fast.)
///
/// Returns `true` if the check passes and `false` otherwise.
654
0
pub fn is_ascii(buffer: &[u8]) -> bool {
655
0
    // Thin public entry point: the whole check is delegated to
    // `is_ascii_impl`, whose result is returned unchanged.
    is_ascii_impl(buffer)
656
0
}
657
658
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
659
/// only ASCII characters).
660
///
661
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
662
/// is not guaranteed to fail fast.)
///
/// Returns `true` if the check passes and `false` otherwise.
663
0
pub fn is_basic_latin(buffer: &[u16]) -> bool {
664
0
    // Thin public entry point: the whole check is delegated to
    // `is_basic_latin_impl`, whose result is returned unchanged.
    is_basic_latin_impl(buffer)
665
0
}
666
667
/// Checks whether the buffer is valid UTF-8 representing only code points
668
/// less than or equal to U+00FF.
669
///
670
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
671
/// invalidity or code points above U+00FF are discovered.)
///
/// Returns `true` if the check passes and `false` otherwise.
672
0
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
673
0
    // The buffer passes exactly when `is_utf8_latin1_impl` reports nothing
    // (`is_none()`); any value it does return is discarded here.
    is_utf8_latin1_impl(buffer).is_none()
674
0
}
675
676
/// Checks whether the buffer represents only code points less than or equal
677
/// to U+00FF.
678
///
679
/// Fails fast. (I.e. returns before having read the whole buffer if code
680
/// points above U+00FF are discovered.)
///
/// Returns `true` if the check passes and `false` otherwise.
681
0
pub fn is_str_latin1(buffer: &str) -> bool {
682
0
    // The buffer passes exactly when `is_str_latin1_impl` reports nothing
    // (`is_none()`); any value it does return is discarded here.
    is_str_latin1_impl(buffer).is_none()
683
0
}
684
685
/// Checks whether the buffer represents only code points less than or equal
686
/// to U+00FF.
687
///
688
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
689
/// is not guaranteed to fail fast.)
///
/// Returns `true` if the check passes and `false` otherwise.
690
15.6k
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
691
15.6k
    // Thin public entry point: the whole check is delegated to
    // `is_utf16_latin1_impl`, whose result is returned unchanged.
    is_utf16_latin1_impl(buffer)
692
15.6k
}
693
694
/// Checks whether a potentially-invalid UTF-8 buffer contains code points
695
/// that trigger right-to-left processing.
696
///
697
/// The check is done on a Unicode block basis without regard to assigned
698
/// vs. unassigned code points in the block. Hebrew presentation forms in
699
/// the Alphabetic Presentation Forms block are treated as if they formed
700
/// a block on their own (i.e. it treated as right-to-left). Additionally,
701
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
702
/// for. Control characters that are technically bidi controls but do not
703
/// cause right-to-left behavior without the presence of right-to-left
704
/// characters or right-to-left controls are not checked for. As a special
705
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
706
///
707
/// Returns `true` if the input is invalid UTF-8 or the input contains an
708
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
709
/// no RTL characters.
710
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
711
#[inline]
712
0
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
713
0
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
714
0
    // than UTF-8 validation followed by `is_str_bidi()` for German,
715
0
    // Russian and Japanese. However, this is considerably slower for Thai.
716
0
    // Chances are that the compiler makes some branch predictions that are
717
0
    // unfortunate for Thai. Not spending the time to manually optimize
718
0
    // further at this time, since it's unclear if this variant even has
719
0
    // use cases. However, this is worth revisiting once Rust gets the
720
0
    // ability to annotate relative priorities of match arms.
721
0
722
0
    // U+058F: D6 8F
723
0
    // U+0590: D6 90
724
0
    // U+08FF: E0 A3 BF
725
0
    // U+0900: E0 A4 80
726
0
    //
727
0
    // U+200F: E2 80 8F
728
0
    // U+202B: E2 80 AB
729
0
    // U+202E: E2 80 AE
730
0
    // U+2067: E2 81 A7
731
0
    //
732
0
    // U+FB1C: EF AC 9C
733
0
    // U+FB1D: EF AC 9D
734
0
    // U+FDFF: EF B7 BF
735
0
    // U+FE00: EF B8 80
736
0
    //
737
0
    // U+FE6F: EF B9 AF
738
0
    // U+FE70: EF B9 B0
739
0
    // U+FEFE: EF BB BE
740
0
    // U+FEFF: EF BB BF
741
0
    //
742
0
    // U+107FF: F0 90 9F BF
743
0
    // U+10800: F0 90 A0 80
744
0
    // U+10FFF: F0 90 BF BF
745
0
    // U+11000: F0 91 80 80
746
0
    //
747
0
    // U+1E7FF: F0 9E 9F BF
748
0
    // U+1E800: F0 9E A0 80
749
0
    // U+1EFFF: F0 9E BF BF
750
0
    // U+1F000: F0 9F 80 80
751
0
    let mut src = buffer;
752
    'outer: loop {
753
0
        if let Some((mut byte, mut read)) = validate_ascii(src) {
754
            // Check for the longest sequence to avoid checking twice for the
755
            // multi-byte sequences.
756
0
            if read + 4 <= src.len() {
757
                'inner: loop {
758
                    // At this point, `byte` is not included in `read`.
759
0
                    match byte {
760
0
                        0..=0x7F => {
761
                            // ASCII: go back to SIMD.
762
0
                            read += 1;
763
0
                            src = &src[read..];
764
0
                            continue 'outer;
765
                        }
766
0
                        0xC2..=0xD5 => {
767
                            // Two-byte
768
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
769
0
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
770
0
                                return true;
771
0
                            }
772
0
                            read += 2;
773
                        }
774
                        0xD6 => {
775
                            // Two-byte
776
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
777
0
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
778
0
                                return true;
779
0
                            }
780
0
                            // XXX consider folding the above and below checks
781
0
                            if second > 0x8F {
782
0
                                return true;
783
0
                            }
784
0
                            read += 2;
785
                        }
786
                        // two-byte starting with 0xD7 and above is bidi
787
0
                        0xE1 | 0xE3..=0xEC | 0xEE => {
788
                            // Three-byte normal
789
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
790
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
791
0
                            if ((UTF8_DATA.table[usize::from(second)]
792
0
                                & unsafe {
793
0
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
794
0
                                })
795
0
                                | (third >> 6))
796
0
                                != 2
797
                            {
798
0
                                return true;
799
0
                            }
800
0
                            read += 3;
801
                        }
802
                        0xE2 => {
803
                            // Three-byte normal, potentially bidi
804
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
805
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
806
0
                            if ((UTF8_DATA.table[usize::from(second)]
807
0
                                & unsafe {
808
0
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
809
0
                                })
810
0
                                | (third >> 6))
811
0
                                != 2
812
                            {
813
0
                                return true;
814
0
                            }
815
0
                            if second == 0x80 {
816
0
                                if third == 0x8F || third == 0xAB || third == 0xAE {
817
0
                                    return true;
818
0
                                }
819
0
                            } else if second == 0x81 {
820
0
                                if third == 0xA7 {
821
0
                                    return true;
822
0
                                }
823
0
                            }
824
0
                            read += 3;
825
                        }
826
                        0xEF => {
827
                            // Three-byte normal, potentially bidi
828
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
829
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
830
0
                            if ((UTF8_DATA.table[usize::from(second)]
831
0
                                & unsafe {
832
0
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
833
0
                                })
834
0
                                | (third >> 6))
835
0
                                != 2
836
                            {
837
0
                                return true;
838
0
                            }
839
0
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
840
0
                                if second == 0xAC {
841
0
                                    if third > 0x9C {
842
0
                                        return true;
843
0
                                    }
844
                                } else {
845
0
                                    return true;
846
                                }
847
0
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
848
0
                                if second == 0xB9 {
849
0
                                    if third > 0xAF {
850
0
                                        return true;
851
0
                                    }
852
0
                                } else if second == 0xBB {
853
0
                                    if third != 0xBF {
854
0
                                        return true;
855
0
                                    }
856
                                } else {
857
0
                                    return true;
858
                                }
859
0
                            }
860
0
                            read += 3;
861
                        }
862
                        0xE0 => {
863
                            // Three-byte special lower bound, potentially bidi
864
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
865
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
866
0
                            if ((UTF8_DATA.table[usize::from(second)]
867
0
                                & unsafe {
868
0
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
869
0
                                })
870
0
                                | (third >> 6))
871
0
                                != 2
872
                            {
873
0
                                return true;
874
0
                            }
875
0
                            // XXX can this be folded into the above validity check
876
0
                            if second < 0xA4 {
877
0
                                return true;
878
0
                            }
879
0
                            read += 3;
880
                        }
881
                        0xED => {
882
                            // Three-byte special upper bound
883
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
884
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
885
0
                            if ((UTF8_DATA.table[usize::from(second)]
886
0
                                & unsafe {
887
0
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
888
0
                                })
889
0
                                | (third >> 6))
890
0
                                != 2
891
                            {
892
0
                                return true;
893
0
                            }
894
0
                            read += 3;
895
                        }
896
0
                        0xF1..=0xF4 => {
897
                            // Four-byte normal
898
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
899
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
900
0
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
901
0
                            if (u16::from(
902
0
                                UTF8_DATA.table[usize::from(second)]
903
0
                                    & unsafe {
904
0
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
905
0
                                    },
906
0
                            ) | u16::from(third >> 6)
907
0
                                | (u16::from(fourth & 0xC0) << 2))
908
0
                                != 0x202
909
                            {
910
0
                                return true;
911
0
                            }
912
0
                            read += 4;
913
                        }
914
                        0xF0 => {
915
                            // Four-byte special lower bound, potentially bidi
916
0
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
917
0
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
918
0
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
919
0
                            if (u16::from(
920
0
                                UTF8_DATA.table[usize::from(second)]
921
0
                                    & unsafe {
922
0
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
923
0
                                    },
924
0
                            ) | u16::from(third >> 6)
925
0
                                | (u16::from(fourth & 0xC0) << 2))
926
0
                                != 0x202
927
                            {
928
0
                                return true;
929
0
                            }
930
0
                            if unlikely(second == 0x90 || second == 0x9E) {
931
0
                                let third = src[read + 2];
932
0
                                if third >= 0xA0 {
933
0
                                    return true;
934
0
                                }
935
0
                            }
936
0
                            read += 4;
937
                        }
938
                        _ => {
939
                            // Invalid lead or bidi-only lead
940
0
                            return true;
941
                        }
942
                    }
943
0
                    if read + 4 > src.len() {
944
0
                        if read == src.len() {
945
0
                            return false;
946
0
                        }
947
0
                        byte = src[read];
948
0
                        break 'inner;
949
0
                    }
950
0
                    byte = src[read];
951
0
                    continue 'inner;
952
                }
953
0
            }
954
            // We can't have a complete 4-byte sequence, but we could still have
955
            // a complete shorter sequence.
956
957
            // At this point, `byte` is not included in `read`.
958
0
            match byte {
959
0
                0..=0x7F => {
960
                    // ASCII: go back to SIMD.
961
0
                    read += 1;
962
0
                    src = &src[read..];
963
0
                    continue 'outer;
964
                }
965
0
                0xC2..=0xD5 => {
966
                    // Two-byte
967
0
                    let new_read = read + 2;
968
0
                    if new_read > src.len() {
969
0
                        return true;
970
0
                    }
971
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
972
0
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
973
0
                        return true;
974
0
                    }
975
0
                    read = new_read;
976
0
                    // We need to deal with the case where we came here with 3 bytes
977
0
                    // left, so we need to take a look at the last one.
978
0
                    src = &src[read..];
979
0
                    continue 'outer;
980
                }
981
                0xD6 => {
982
                    // Two-byte, potentially bidi
983
0
                    let new_read = read + 2;
984
0
                    if new_read > src.len() {
985
0
                        return true;
986
0
                    }
987
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
988
0
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
989
0
                        return true;
990
0
                    }
991
0
                    // XXX consider folding the above and below checks
992
0
                    if second > 0x8F {
993
0
                        return true;
994
0
                    }
995
0
                    read = new_read;
996
0
                    // We need to deal with the case where we came here with 3 bytes
997
0
                    // left, so we need to take a look at the last one.
998
0
                    src = &src[read..];
999
0
                    continue 'outer;
1000
                }
1001
                // two-byte starting with 0xD7 and above is bidi
1002
0
                0xE1 | 0xE3..=0xEC | 0xEE => {
1003
                    // Three-byte normal
1004
0
                    let new_read = read + 3;
1005
0
                    if new_read > src.len() {
1006
0
                        return true;
1007
0
                    }
1008
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1009
0
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1010
0
                    if ((UTF8_DATA.table[usize::from(second)]
1011
0
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1012
0
                        | (third >> 6))
1013
0
                        != 2
1014
                    {
1015
0
                        return true;
1016
0
                    }
1017
                }
1018
                0xE2 => {
1019
                    // Three-byte normal, potentially bidi
1020
0
                    let new_read = read + 3;
1021
0
                    if new_read > src.len() {
1022
0
                        return true;
1023
0
                    }
1024
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1025
0
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1026
0
                    if ((UTF8_DATA.table[usize::from(second)]
1027
0
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1028
0
                        | (third >> 6))
1029
0
                        != 2
1030
                    {
1031
0
                        return true;
1032
0
                    }
1033
0
                    if second == 0x80 {
1034
0
                        if third == 0x8F || third == 0xAB || third == 0xAE {
1035
0
                            return true;
1036
0
                        }
1037
0
                    } else if second == 0x81 {
1038
0
                        if third == 0xA7 {
1039
0
                            return true;
1040
0
                        }
1041
0
                    }
1042
                }
1043
                0xEF => {
1044
                    // Three-byte normal, potentially bidi
1045
0
                    let new_read = read + 3;
1046
0
                    if new_read > src.len() {
1047
0
                        return true;
1048
0
                    }
1049
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1050
0
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1051
0
                    if ((UTF8_DATA.table[usize::from(second)]
1052
0
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1053
0
                        | (third >> 6))
1054
0
                        != 2
1055
                    {
1056
0
                        return true;
1057
0
                    }
1058
0
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
1059
0
                        if second == 0xAC {
1060
0
                            if third > 0x9C {
1061
0
                                return true;
1062
0
                            }
1063
                        } else {
1064
0
                            return true;
1065
                        }
1066
0
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1067
0
                        if second == 0xB9 {
1068
0
                            if third > 0xAF {
1069
0
                                return true;
1070
0
                            }
1071
0
                        } else if second == 0xBB {
1072
0
                            if third != 0xBF {
1073
0
                                return true;
1074
0
                            }
1075
                        } else {
1076
0
                            return true;
1077
                        }
1078
0
                    }
1079
                }
1080
                0xE0 => {
1081
                    // Three-byte special lower bound, potentially bidi
1082
0
                    let new_read = read + 3;
1083
0
                    if new_read > src.len() {
1084
0
                        return true;
1085
0
                    }
1086
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1087
0
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1088
0
                    if ((UTF8_DATA.table[usize::from(second)]
1089
0
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1090
0
                        | (third >> 6))
1091
0
                        != 2
1092
                    {
1093
0
                        return true;
1094
0
                    }
1095
0
                    // XXX can this be folded into the above validity check
1096
0
                    if second < 0xA4 {
1097
0
                        return true;
1098
0
                    }
1099
                }
1100
                0xED => {
1101
                    // Three-byte special upper bound
1102
0
                    let new_read = read + 3;
1103
0
                    if new_read > src.len() {
1104
0
                        return true;
1105
0
                    }
1106
0
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
1107
0
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
1108
0
                    if ((UTF8_DATA.table[usize::from(second)]
1109
0
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1110
0
                        | (third >> 6))
1111
0
                        != 2
1112
                    {
1113
0
                        return true;
1114
0
                    }
1115
                }
1116
                _ => {
1117
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1118
0
                    return true;
1119
                }
1120
            }
1121
0
            return false;
1122
        } else {
1123
0
            return false;
1124
        }
1125
    }
1126
0
}
Unexecuted instantiation: encoding_rs::mem::is_utf8_bidi
Unexecuted instantiation: encoding_rs::mem::is_utf8_bidi
1127
1128
/// Checks whether a valid UTF-8 buffer contains code points that trigger
1129
/// right-to-left processing.
1130
///
1131
/// The check is done on a Unicode block basis without regard to assigned
1132
/// vs. unassigned code points in the block. Hebrew presentation forms in
1133
/// the Alphabetic Presentation Forms block are treated as if they formed
1134
/// a block on their own (i.e. it treated as right-to-left). Additionally,
1135
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1136
/// for. Control characters that are technically bidi controls but do not
1137
/// cause right-to-left behavior without the presence of right-to-left
1138
/// characters or right-to-left controls are not checked for. As a special
1139
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1140
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1141
#[inline]
1142
0
pub fn is_str_bidi(buffer: &str) -> bool {
1143
0
    // U+058F: D6 8F
1144
0
    // U+0590: D6 90
1145
0
    // U+08FF: E0 A3 BF
1146
0
    // U+0900: E0 A4 80
1147
0
    //
1148
0
    // U+200F: E2 80 8F
1149
0
    // U+202B: E2 80 AB
1150
0
    // U+202E: E2 80 AE
1151
0
    // U+2067: E2 81 A7
1152
0
    //
1153
0
    // U+FB1C: EF AC 9C
1154
0
    // U+FB1D: EF AC 9D
1155
0
    // U+FDFF: EF B7 BF
1156
0
    // U+FE00: EF B8 80
1157
0
    //
1158
0
    // U+FE6F: EF B9 AF
1159
0
    // U+FE70: EF B9 B0
1160
0
    // U+FEFE: EF BB BE
1161
0
    // U+FEFF: EF BB BF
1162
0
    //
1163
0
    // U+107FF: F0 90 9F BF
1164
0
    // U+10800: F0 90 A0 80
1165
0
    // U+10FFF: F0 90 BF BF
1166
0
    // U+11000: F0 91 80 80
1167
0
    //
1168
0
    // U+1E7FF: F0 9E 9F BF
1169
0
    // U+1E800: F0 9E A0 80
1170
0
    // U+1EFFF: F0 9E BF BF
1171
0
    // U+1F000: F0 9F 80 80
1172
0
    let mut bytes = buffer.as_bytes();
1173
    'outer: loop {
1174
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
1175
        // to check for non-ASCII lead bytes, too, to quickly conclude
1176
        // that the vector consist entirely of CJK and below-Hebrew
1177
        // code points.
1178
        // Unfortunately, scripts above Arabic but below CJK share
1179
        // lead bytes with RTL.
1180
0
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1181
            'inner: loop {
1182
                // At this point, `byte` is not included in `read`.
1183
0
                if byte < 0xE0 {
1184
0
                    if byte >= 0x80 {
1185
                        // Two-byte
1186
                        // Adding `unlikely` here improved throughput on
1187
                        // Russian plain text by 33%!
1188
0
                        if unlikely(byte >= 0xD6) {
1189
0
                            if byte == 0xD6 {
1190
0
                                let second = bytes[read + 1];
1191
0
                                if second > 0x8F {
1192
0
                                    return true;
1193
0
                                }
1194
                            } else {
1195
0
                                return true;
1196
                            }
1197
0
                        }
1198
0
                        read += 2;
1199
                    } else {
1200
                        // ASCII: write and go back to SIMD.
1201
0
                        read += 1;
1202
0
                        // Intuitively, we should go back to the outer loop only
1203
0
                        // if byte is 0x30 or above, so as to avoid trashing on
1204
0
                        // ASCII space, comma and period in non-Latin context.
1205
0
                        // However, the extra branch seems to cost more than it's
1206
0
                        // worth.
1207
0
                        bytes = &bytes[read..];
1208
0
                        continue 'outer;
1209
                    }
1210
0
                } else if byte < 0xF0 {
1211
                    // Three-byte
1212
0
                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
1213
0
                        let second = bytes[read + 1];
1214
0
                        if byte == 0xE0 {
1215
0
                            if second < 0xA4 {
1216
0
                                return true;
1217
0
                            }
1218
0
                        } else if byte == 0xE2 {
1219
0
                            let third = bytes[read + 2];
1220
0
                            if second == 0x80 {
1221
0
                                if third == 0x8F || third == 0xAB || third == 0xAE {
1222
0
                                    return true;
1223
0
                                }
1224
0
                            } else if second == 0x81 {
1225
0
                                if third == 0xA7 {
1226
0
                                    return true;
1227
0
                                }
1228
0
                            }
1229
                        } else {
1230
0
                            debug_assert_eq!(byte, 0xEF);
1231
0
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
1232
0
                                if second == 0xAC {
1233
0
                                    let third = bytes[read + 2];
1234
0
                                    if third > 0x9C {
1235
0
                                        return true;
1236
0
                                    }
1237
                                } else {
1238
0
                                    return true;
1239
                                }
1240
0
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1241
0
                                if second == 0xB9 {
1242
0
                                    let third = bytes[read + 2];
1243
0
                                    if third > 0xAF {
1244
0
                                        return true;
1245
0
                                    }
1246
0
                                } else if second == 0xBB {
1247
0
                                    let third = bytes[read + 2];
1248
0
                                    if third != 0xBF {
1249
0
                                        return true;
1250
0
                                    }
1251
                                } else {
1252
0
                                    return true;
1253
                                }
1254
0
                            }
1255
                        }
1256
0
                    }
1257
0
                    read += 3;
1258
                } else {
1259
                    // Four-byte
1260
0
                    let second = bytes[read + 1];
1261
0
                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
1262
0
                        let third = bytes[read + 2];
1263
0
                        if third >= 0xA0 {
1264
0
                            return true;
1265
0
                        }
1266
0
                    }
1267
0
                    read += 4;
1268
                }
1269
                // The comparison is always < or == and never >, but including
1270
                // > here to let the compiler assume that < is true if this
1271
                // comparison is false.
1272
0
                if read >= bytes.len() {
1273
0
                    return false;
1274
0
                }
1275
0
                byte = bytes[read];
1276
0
                continue 'inner;
1277
            }
1278
        } else {
1279
0
            return false;
1280
        }
1281
    }
1282
0
}
Unexecuted instantiation: encoding_rs::mem::is_str_bidi
Unexecuted instantiation: encoding_rs::mem::is_str_bidi
1283
1284
/// Checks whether a UTF-16 buffer contains code points that trigger
1285
/// right-to-left processing.
1286
///
1287
/// The check is done on a Unicode block basis without regard to assigned
1288
/// vs. unassigned code points in the block. Hebrew presentation forms in
1289
/// the Alphabetic Presentation Forms block are treated as if they formed
1290
/// a block on their own (i.e. they are treated as right-to-left). Additionally,
1291
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1292
/// for. Control characters that are technically bidi controls but do not
1293
/// cause right-to-left behavior without the presence of right-to-left
1294
/// characters or right-to-left controls are not checked for. As a special
1295
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1296
///
1297
/// Returns `true` if the input contains an RTL character or an unpaired
1298
/// high surrogate that could be the high half of an RTL character.
1299
/// Returns `false` if the input contains neither RTL characters nor
1300
/// unpaired high surrogates that could be high halves of RTL characters.
1301
0
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
1302
0
    // Thin public entry point: the whole check is delegated to
    // `is_utf16_bidi_impl`, whose result is returned unchanged.
    is_utf16_bidi_impl(buffer)
1303
0
}
1304
1305
/// Checks whether a scalar value triggers right-to-left processing.
1306
///
1307
/// The check is done on a Unicode block basis without regard to assigned
1308
/// vs. unassigned code points in the block. Hebrew presentation forms in
1309
/// the Alphabetic Presentation Forms block are treated as if they formed
1310
/// a block on their own (i.e. it treated as right-to-left). Additionally,
1311
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1312
/// for. Control characters that are technically bidi controls but do not
1313
/// cause right-to-left behavior without the presence of right-to-left
1314
/// characters or right-to-left controls are not checked for. As a special
1315
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1316
#[inline(always)]
1317
0
pub fn is_char_bidi(c: char) -> bool {
1318
0
    // Controls:
1319
0
    // Every control with RIGHT-TO-LEFT in its name in
1320
0
    // https://www.unicode.org/charts/PDF/U2000.pdf
1321
0
    // U+200F RLM
1322
0
    // U+202B RLE
1323
0
    // U+202E RLO
1324
0
    // U+2067 RLI
1325
0
    //
1326
0
    // BMP RTL:
1327
0
    // https://www.unicode.org/roadmaps/bmp/
1328
0
    // U+0590...U+08FF
1329
0
    // U+FB1D...U+FDFF Hebrew presentation forms and
1330
0
    //                 Arabic Presentation Forms A
1331
0
    // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1332
0
    //
1333
0
    // Supplementary RTL:
1334
0
    // https://www.unicode.org/roadmaps/smp/
1335
0
    // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1336
0
    // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1337
0
    let code_point = u32::from(c);
1338
0
    if code_point < 0x0590 {
1339
        // Below Hebrew
1340
0
        return false;
1341
0
    }
1342
0
    if in_range32(code_point, 0x0900, 0xFB1D) {
1343
        // Above Arabic Extended-A and below Hebrew presentation forms
1344
0
        if in_inclusive_range32(code_point, 0x200F, 0x2067) {
1345
            // In the range that contains the RTL controls
1346
0
            return code_point == 0x200F
1347
0
                || code_point == 0x202B
1348
0
                || code_point == 0x202E
1349
0
                || code_point == 0x2067;
1350
0
        }
1351
0
        return false;
1352
0
    }
1353
0
    if code_point > 0x1EFFF {
1354
        // Above second astral RTL. (Emoji is here.)
1355
0
        return false;
1356
0
    }
1357
0
    if in_range32(code_point, 0x11000, 0x1E800) {
1358
        // Between astral RTL blocks
1359
0
        return false;
1360
0
    }
1361
0
    if in_range32(code_point, 0xFEFF, 0x10800) {
1362
        // Above Arabic Presentations Forms B (excl. BOM) and below first
1363
        // astral RTL
1364
0
        return false;
1365
0
    }
1366
0
    if in_range32(code_point, 0xFE00, 0xFE70) {
1367
        // Between Arabic Presentations Forms
1368
0
        return false;
1369
0
    }
1370
0
    true
1371
0
}
1372
1373
/// Checks whether a UTF-16 code unit triggers right-to-left processing.
1374
///
1375
/// The check is done on a Unicode block basis without regard to assigned
1376
/// vs. unassigned code points in the block. Hebrew presentation forms in
1377
/// the Alphabetic Presentation Forms block are treated as if they formed
1378
/// a block on their own (i.e. it treated as right-to-left). Additionally,
1379
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1380
/// for. Control characters that are technically bidi controls but do not
1381
/// cause right-to-left behavior without the presence of right-to-left
1382
/// characters or right-to-left controls are not checked for. As a special
1383
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1384
///
1385
/// Since supplementary-plane right-to-left blocks are identifiable from the
1386
/// high surrogate without examining the low surrogate, this function returns
1387
/// `true` for such high surrogates making the function suitable for handling
1388
/// supplementary-plane text without decoding surrogate pairs to scalar
1389
/// values. Obviously, such high surrogates are then reported as right-to-left
1390
/// even if actually unpaired.
1391
#[inline(always)]
1392
0
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1393
0
    if u < 0x0590 {
1394
        // Below Hebrew
1395
0
        return false;
1396
0
    }
1397
0
    if in_range16(u, 0x0900, 0xD802) {
1398
        // Above Arabic Extended-A and below first RTL surrogate
1399
0
        if in_inclusive_range16(u, 0x200F, 0x2067) {
1400
            // In the range that contains the RTL controls
1401
0
            return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1402
0
        }
1403
0
        return false;
1404
0
    }
1405
0
    if in_range16(u, 0xD83C, 0xFB1D) {
1406
        // Between astral RTL high surrogates and Hebrew presentation forms
1407
        // (Emoji is here)
1408
0
        return false;
1409
0
    }
1410
0
    if in_range16(u, 0xD804, 0xD83A) {
1411
        // Between RTL high surragates
1412
0
        return false;
1413
0
    }
1414
0
    if u > 0xFEFE {
1415
        // Above Arabic Presentation Forms (excl. BOM)
1416
0
        return false;
1417
0
    }
1418
0
    if in_range16(u, 0xFE00, 0xFE70) {
1419
        // Between Arabic Presentations Forms
1420
0
        return false;
1421
0
    }
1422
0
    true
1423
0
}
1424
1425
/// Checks whether a potentially invalid UTF-8 buffer contains code points
1426
/// that trigger right-to-left processing or is all-Latin1.
1427
///
1428
/// Possibly more efficient than performing the checks separately.
1429
///
1430
/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1431
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1432
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1433
0
pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1434
0
    if let Some(offset) = is_utf8_latin1_impl(buffer) {
1435
0
        if is_utf8_bidi(&buffer[offset..]) {
1436
0
            Latin1Bidi::Bidi
1437
        } else {
1438
0
            Latin1Bidi::LeftToRight
1439
        }
1440
    } else {
1441
0
        Latin1Bidi::Latin1
1442
    }
1443
0
}
1444
1445
/// Checks whether a valid UTF-8 buffer contains code points
1446
/// that trigger right-to-left processing or is all-Latin1.
1447
///
1448
/// Possibly more efficient than performing the checks separately.
1449
///
1450
/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1451
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1452
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1453
0
pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1454
    // The transition from the latin1 check to the bidi check isn't
1455
    // optimal but not tweaking it to perfection today.
1456
0
    if let Some(offset) = is_str_latin1_impl(buffer) {
1457
0
        if is_str_bidi(&buffer[offset..]) {
1458
0
            Latin1Bidi::Bidi
1459
        } else {
1460
0
            Latin1Bidi::LeftToRight
1461
        }
1462
    } else {
1463
0
        Latin1Bidi::Latin1
1464
    }
1465
0
}
1466
1467
/// Checks whether a potentially invalid UTF-16 buffer contains code points
1468
/// that trigger right-to-left processing or is all-Latin1.
1469
///
1470
/// Possibly more efficient than performing the checks separately.
1471
///
1472
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1473
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1474
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1475
0
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1476
0
    check_utf16_for_latin1_and_bidi_impl(buffer)
1477
0
}
1478
1479
/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1480
/// with the REPLACEMENT CHARACTER.
1481
///
1482
/// The length of the destination buffer must be at least the length of the
1483
/// source buffer _plus one_.
1484
///
1485
/// Returns the number of `u16`s written.
1486
///
1487
/// # Panics
1488
///
1489
/// Panics if the destination buffer is shorter than stated above.
1490
0
pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1491
0
    // TODO: Can the requirement for dst to be at least one unit longer
1492
0
    // be eliminated?
1493
0
    assert!(dst.len() > src.len());
1494
0
    let mut decoder = Utf8Decoder::new_inner();
1495
0
    let mut total_read = 0usize;
1496
0
    let mut total_written = 0usize;
1497
    loop {
1498
0
        let (result, read, written) =
1499
0
            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1500
0
        total_read += read;
1501
0
        total_written += written;
1502
0
        match result {
1503
            DecoderResult::InputEmpty => {
1504
0
                return total_written;
1505
            }
1506
            DecoderResult::OutputFull => {
1507
0
                unreachable!("The assert at the top of the function should have caught this.");
1508
            }
1509
0
            DecoderResult::Malformed(_, _) => {
1510
0
                // There should always be space for the U+FFFD, because
1511
0
                // otherwise we'd have gotten OutputFull already.
1512
0
                dst[total_written] = 0xFFFD;
1513
0
                total_written += 1;
1514
0
            }
1515
        }
1516
    }
1517
0
}
1518
1519
/// Converts valid UTF-8 to valid UTF-16.
1520
///
1521
/// The length of the destination buffer must be at least the length of the
1522
/// source buffer.
1523
///
1524
/// Returns the number of `u16`s written.
1525
///
1526
/// # Panics
1527
///
1528
/// Panics if the destination buffer is shorter than stated above.
1529
0
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
1530
0
    assert!(
1531
0
        dst.len() >= src.len(),
1532
0
        "Destination must not be shorter than the source."
1533
0
    );
1534
0
    let bytes = src.as_bytes();
1535
0
    let mut read = 0;
1536
0
    let mut written = 0;
1537
    'outer: loop {
1538
0
        let mut byte = {
1539
0
            let src_remaining = &bytes[read..];
1540
0
            let dst_remaining = &mut dst[written..];
1541
0
            let length = src_remaining.len();
1542
0
            match unsafe {
1543
0
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
1544
0
            } {
1545
                None => {
1546
0
                    written += length;
1547
0
                    return written;
1548
                }
1549
0
                Some((non_ascii, consumed)) => {
1550
0
                    read += consumed;
1551
0
                    written += consumed;
1552
0
                    non_ascii
1553
                }
1554
            }
1555
        };
1556
        'inner: loop {
1557
            // At this point, `byte` is not included in `read`.
1558
0
            if byte < 0xE0 {
1559
0
                if byte >= 0x80 {
1560
0
                    // Two-byte
1561
0
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1562
0
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
1563
0
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
1564
0
                    read += 2;
1565
0
                    written += 1;
1566
0
                } else {
1567
                    // ASCII: write and go back to SIMD.
1568
0
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
1569
0
                    read += 1;
1570
0
                    written += 1;
1571
0
                    // Intuitively, we should go back to the outer loop only
1572
0
                    // if byte is 0x30 or above, so as to avoid trashing on
1573
0
                    // ASCII space, comma and period in non-Latin context.
1574
0
                    // However, the extra branch seems to cost more than it's
1575
0
                    // worth.
1576
0
                    continue 'outer;
1577
                }
1578
0
            } else if byte < 0xF0 {
1579
0
                // Three-byte
1580
0
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1581
0
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1582
0
                let point = ((u16::from(byte) & 0xF) << 12)
1583
0
                    | ((u16::from(second) & 0x3F) << 6)
1584
0
                    | (u16::from(third) & 0x3F);
1585
0
                unsafe { *(dst.get_unchecked_mut(written)) = point };
1586
0
                read += 3;
1587
0
                written += 1;
1588
0
            } else {
1589
0
                // Four-byte
1590
0
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1591
0
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1592
0
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
1593
0
                let point = ((u32::from(byte) & 0x7) << 18)
1594
0
                    | ((u32::from(second) & 0x3F) << 12)
1595
0
                    | ((u32::from(third) & 0x3F) << 6)
1596
0
                    | (u32::from(fourth) & 0x3F);
1597
0
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
1598
0
                unsafe {
1599
0
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
1600
0
                };
1601
0
                read += 4;
1602
0
                written += 2;
1603
0
            }
1604
            // The comparison is always < or == and never >, but including
1605
            // > here to let the compiler assume that < is true if this
1606
            // comparison is false.
1607
0
            if read >= src.len() {
1608
0
                return written;
1609
0
            }
1610
0
            byte = bytes[read];
1611
0
            continue 'inner;
1612
        }
1613
    }
1614
0
}
1615
1616
/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1617
///
1618
/// The length of the destination buffer must be at least the length of the
1619
/// source buffer.
1620
///
1621
/// Returns the number of `u16`s written or `None` if the input was invalid.
1622
///
1623
/// When the input was invalid, some output may have been written.
1624
///
1625
/// # Panics
1626
///
1627
/// Panics if the destination buffer is shorter than stated above.
1628
0
pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1629
0
    assert!(
1630
0
        dst.len() >= src.len(),
1631
0
        "Destination must not be shorter than the source."
1632
0
    );
1633
0
    let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1634
0
    if read == src.len() {
1635
0
        return Some(written);
1636
0
    }
1637
0
    None
1638
0
}
1639
1640
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1641
/// with the REPLACEMENT CHARACTER with potentially insufficient output
1642
/// space.
1643
///
1644
/// Returns the number of code units read and the number of bytes written.
1645
///
1646
/// Guarantees that the bytes in the destination beyond the number of
1647
/// bytes claimed as written by the second item of the return tuple
1648
/// are left unmodified.
1649
///
1650
/// Not all code units are read if there isn't enough output space.
1651
///
1652
/// Note  that this method isn't designed for general streamability but for
1653
/// not allocating memory for the worst case up front. Specifically,
1654
/// if the input starts with or ends with an unpaired surrogate, those are
1655
/// replaced with the REPLACEMENT CHARACTER.
1656
///
1657
/// Matches the semantics of `TextEncoder.encodeInto()` from the
1658
/// Encoding Standard.
1659
///
1660
/// # Safety
1661
///
1662
/// If you want to convert into a `&mut str`, use
1663
/// `convert_utf16_to_str_partial()` instead of using this function
1664
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1665
#[inline(always)]
1666
0
pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1667
0
    // The two functions called below are marked `inline(never)` to make
1668
0
    // transitions from the hot part (first function) into the cold part
1669
0
    // (second function) go through a return and another call to discouge
1670
0
    // the CPU from speculating from the hot code into the cold code.
1671
0
    // Letting the transitions be mere intra-function jumps, even to
1672
0
    // basic blocks out-of-lined to the end of the function would wipe
1673
0
    // away a quarter of Arabic encode performance on Haswell!
1674
0
    let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
1675
0
    if likely(read == src.len()) {
1676
0
        return (read, written);
1677
0
    }
1678
0
    let (tail_read, tail_written) =
1679
0
        convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1680
0
    (read + tail_read, written + tail_written)
1681
0
}
1682
1683
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1684
/// with the REPLACEMENT CHARACTER.
1685
///
1686
/// The length of the destination buffer must be at least the length of the
1687
/// source buffer times three.
1688
///
1689
/// Returns the number of bytes written.
1690
///
1691
/// # Panics
1692
///
1693
/// Panics if the destination buffer is shorter than stated above.
1694
///
1695
/// # Safety
1696
///
1697
/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1698
/// instead of using this function together with the `unsafe` method
1699
/// `as_bytes_mut()` on `&mut str`.
1700
#[inline(always)]
1701
0
pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1702
0
    assert!(dst.len() >= src.len() * 3);
1703
0
    let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1704
0
    debug_assert_eq!(read, src.len());
1705
0
    written
1706
0
}
1707
1708
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1709
/// with the REPLACEMENT CHARACTER such that the validity of the output is
1710
/// signaled using the Rust type system with potentially insufficient output
1711
/// space.
1712
///
1713
/// Returns the number of code units read and the number of bytes written.
1714
///
1715
/// Not all code units are read if there isn't enough output space.
1716
///
1717
/// Note  that this method isn't designed for general streamability but for
1718
/// not allocating memory for the worst case up front. Specifically,
1719
/// if the input starts with or ends with an unpaired surrogate, those are
1720
/// replaced with the REPLACEMENT CHARACTER.
1721
0
pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1722
0
    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1723
0
    let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1724
0
    let len = bytes.len();
1725
0
    let mut trail = written;
1726
0
    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1727
0
        bytes[trail] = 0;
1728
0
        trail += 1;
1729
0
    }
1730
0
    (read, written)
1731
0
}
1732
1733
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1734
/// with the REPLACEMENT CHARACTER such that the validity of the output is
1735
/// signaled using the Rust type system.
1736
///
1737
/// The length of the destination buffer must be at least the length of the
1738
/// source buffer times three.
1739
///
1740
/// Returns the number of bytes written.
1741
///
1742
/// # Panics
1743
///
1744
/// Panics if the destination buffer is shorter than stated above.
1745
#[inline(always)]
1746
0
pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1747
0
    assert!(dst.len() >= src.len() * 3);
1748
0
    let (read, written) = convert_utf16_to_str_partial(src, dst);
1749
0
    debug_assert_eq!(read, src.len());
1750
0
    written
1751
0
}
1752
1753
/// Converts bytes whose unsigned value is interpreted as Unicode code point
1754
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1755
///
1756
/// The length of the destination buffer must be at least the length of the
1757
/// source buffer.
1758
///
1759
/// The number of `u16`s written equals the length of the source buffer.
1760
///
1761
/// # Panics
1762
///
1763
/// Panics if the destination buffer is shorter than stated above.
1764
864k
pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1765
864k
    assert!(
1766
864k
        dst.len() >= src.len(),
1767
864k
        "Destination must not be shorter than the source."
1768
864k
    );
1769
    // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1770
    // instructions and this code, but, yet, the autovectorized version is
1771
    // faster.
1772
864k
    unsafe {
1773
864k
        unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1774
864k
    }
1775
864k
}
1776
1777
/// Converts bytes whose unsigned value is interpreted as Unicode code point
1778
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1779
/// output space.
1780
///
1781
/// Returns the number of bytes read and the number of bytes written.
1782
///
1783
/// If the output isn't large enough, not all input is consumed.
1784
///
1785
/// # Safety
1786
///
1787
/// If you want to convert into a `&mut str`, use
1788
/// `convert_utf16_to_str_partial()` instead of using this function
1789
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1790
254k
pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1791
254k
    let src_len = src.len();
1792
254k
    let src_ptr = src.as_ptr();
1793
254k
    let dst_ptr = dst.as_mut_ptr();
1794
254k
    let dst_len = dst.len();
1795
254k
    let mut total_read = 0usize;
1796
254k
    let mut total_written = 0usize;
1797
    loop {
1798
        // src can't advance more than dst
1799
254k
        let src_left = src_len - total_read;
1800
254k
        let dst_left = dst_len - total_written;
1801
254k
        let min_left = ::core::cmp::min(src_left, dst_left);
1802
254k
        if let Some((non_ascii, consumed)) = unsafe {
1803
254k
            ascii_to_ascii(
1804
254k
                src_ptr.add(total_read),
1805
254k
                dst_ptr.add(total_written),
1806
254k
                min_left,
1807
254k
            )
1808
254k
        } {
1809
0
            total_read += consumed;
1810
0
            total_written += consumed;
1811
0
            if total_written.checked_add(2).unwrap() > dst_len {
1812
0
                return (total_read, total_written);
1813
0
            }
1814
0
1815
0
            total_read += 1; // consume `non_ascii`
1816
0
1817
0
            dst[total_written] = (non_ascii >> 6) | 0xC0;
1818
0
            total_written += 1;
1819
0
            dst[total_written] = (non_ascii & 0x3F) | 0x80;
1820
0
            total_written += 1;
1821
0
            continue;
1822
254k
        }
1823
254k
        return (total_read + min_left, total_written + min_left);
1824
    }
1825
254k
}
1826
1827
/// Converts bytes whose unsigned value is interpreted as Unicode code point
1828
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1829
///
1830
/// The length of the destination buffer must be at least the length of the
1831
/// source buffer times two.
1832
///
1833
/// Returns the number of bytes written.
1834
///
1835
/// # Panics
1836
///
1837
/// Panics if the destination buffer is shorter than stated above.
1838
///
1839
/// # Safety
1840
///
1841
/// Note that this function may write garbage beyond the number of bytes
1842
/// indicated by the return value, so using a `&mut str` interpreted as
1843
/// `&mut [u8]` as the destination is not safe. If you want to convert into
1844
/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1845
#[inline]
1846
0
pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1847
0
    assert!(
1848
0
        dst.len() >= src.len() * 2,
1849
0
        "Destination must not be shorter than the source times two."
1850
0
    );
1851
0
    let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1852
0
    debug_assert_eq!(read, src.len());
1853
0
    written
1854
0
}
Unexecuted instantiation: encoding_rs::mem::convert_latin1_to_utf8
Unexecuted instantiation: encoding_rs::mem::convert_latin1_to_utf8
1855
1856
/// Converts bytes whose unsigned value is interpreted as Unicode code point
1857
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1858
/// output is signaled using the Rust type system with potentially insufficient
1859
/// output space.
1860
///
1861
/// Returns the number of bytes read and the number of bytes written.
1862
///
1863
/// If the output isn't large enough, not all input is consumed.
1864
#[inline]
1865
0
pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1866
0
    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1867
0
    let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1868
0
    let len = bytes.len();
1869
0
    let mut trail = written;
1870
0
    let max = ::core::cmp::min(len, trail + MAX_STRIDE_SIZE);
1871
0
    while trail < max {
1872
0
        bytes[trail] = 0;
1873
0
        trail += 1;
1874
0
    }
1875
0
    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1876
0
        bytes[trail] = 0;
1877
0
        trail += 1;
1878
0
    }
1879
0
    (read, written)
1880
0
}
1881
1882
/// Converts bytes whose unsigned value is interpreted as Unicode code point
1883
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1884
/// output is signaled using the Rust type system.
1885
///
1886
/// The length of the destination buffer must be at least the length of the
1887
/// source buffer times two.
1888
///
1889
/// Returns the number of bytes written.
1890
///
1891
/// # Panics
1892
///
1893
/// Panics if the destination buffer is shorter than stated above.
1894
#[inline]
1895
0
pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1896
0
    assert!(
1897
0
        dst.len() >= src.len() * 2,
1898
0
        "Destination must not be shorter than the source times two."
1899
0
    );
1900
0
    let (read, written) = convert_latin1_to_str_partial(src, dst);
1901
0
    debug_assert_eq!(read, src.len());
1902
0
    written
1903
0
}
1904
1905
/// If the input is valid UTF-8 representing only Unicode code points from
1906
/// U+0000 to U+00FF, inclusive, converts the input into output that
1907
/// represents the value of each code point as the unsigned byte value of
1908
/// each output byte.
1909
///
1910
/// If the input does not fulfill the condition stated above, this function
1911
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1912
/// does something that is memory-safe without any promises about any
1913
/// properties of the output. In particular, callers shouldn't assume the
1914
/// output to be the same across crate versions or CPU architectures and
1915
/// should not assume that non-ASCII input can't map to ASCII output.
1916
///
1917
/// The length of the destination buffer must be at least the length of the
1918
/// source buffer.
1919
///
1920
/// Returns the number of bytes written.
1921
///
1922
/// # Panics
1923
///
1924
/// Panics if the destination buffer is shorter than stated above.
1925
///
1926
/// If debug assertions are enabled (and not fuzzing) and the input is
1927
/// not in the range U+0000 to U+00FF, inclusive.
1928
0
pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1929
0
    assert!(
1930
0
        dst.len() >= src.len(),
1931
0
        "Destination must not be shorter than the source."
1932
0
    );
1933
0
    non_fuzz_debug_assert!(is_utf8_latin1(src));
1934
0
    let src_len = src.len();
1935
0
    let src_ptr = src.as_ptr();
1936
0
    let dst_ptr = dst.as_mut_ptr();
1937
0
    let mut total_read = 0usize;
1938
0
    let mut total_written = 0usize;
1939
    loop {
1940
        // dst can't advance more than src
1941
0
        let src_left = src_len - total_read;
1942
0
        if let Some((non_ascii, consumed)) = unsafe {
1943
0
            ascii_to_ascii(
1944
0
                src_ptr.add(total_read),
1945
0
                dst_ptr.add(total_written),
1946
0
                src_left,
1947
0
            )
1948
0
        } {
1949
0
            total_read += consumed + 1;
1950
0
            total_written += consumed;
1951
0
1952
0
            if total_read == src_len {
1953
0
                return total_written;
1954
0
            }
1955
0
1956
0
            let trail = src[total_read];
1957
0
            total_read += 1;
1958
0
1959
0
            dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1960
0
            total_written += 1;
1961
0
            continue;
1962
0
        }
1963
0
        return total_written + src_left;
1964
    }
1965
0
}
1966
1967
/// If the input is valid UTF-16 representing only Unicode code points from
1968
/// U+0000 to U+00FF, inclusive, converts the input into output that
1969
/// represents the value of each code point as the unsigned byte value of
1970
/// each output byte.
1971
///
1972
/// If the input does not fulfill the condition stated above, does something
1973
/// that is memory-safe without any promises about any properties of the
1974
/// output and will probably assert in debug builds in future versions.
1975
/// In particular, callers shouldn't assume the output to be the same across
1976
/// crate versions or CPU architectures and should not assume that non-ASCII
1977
/// input can't map to ASCII output.
1978
///
1979
/// The length of the destination buffer must be at least the length of the
1980
/// source buffer.
1981
///
1982
/// The number of bytes written equals the length of the source buffer.
1983
///
1984
/// # Panics
1985
///
1986
/// Panics if the destination buffer is shorter than stated above.
1987
///
1988
/// (Probably in future versions if debug assertions are enabled (and not
1989
/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
1990
9.99k
pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1991
9.99k
    assert!(
1992
9.99k
        dst.len() >= src.len(),
1993
9.99k
        "Destination must not be shorter than the source."
1994
9.99k
    );
1995
    // non_fuzz_debug_assert!(is_utf16_latin1(src));
1996
9.99k
    unsafe {
1997
9.99k
        pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1998
9.99k
    }
1999
9.99k
}
2000
2001
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: the whole input was just found to be ASCII, and ASCII is
        // valid UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    // `head` is the ASCII prefix and is copied verbatim; every byte of
    // `tail` may expand to two bytes of UTF-8.
    let (head, tail) = bytes.split_at(up_to);
    let capacity = head.len() + tail.len() * 2;
    // Zero-fill instead of `set_len()` on a fresh allocation: `set_len()`
    // requires the exposed elements to be initialized already, and the
    // buffer is handed out as `&mut [u8]` below. This is still one
    // allocation.
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    vec.resize(capacity, 0u8);
    vec[..up_to].copy_from_slice(head);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    // Drop the unused part of the worst-case allocation, including anything
    // the conversion may have written past `written`.
    vec.truncate(up_to + written);
    // SAFETY: the first `up_to` bytes are ASCII. Relies on
    // `convert_latin1_to_utf8` producing valid UTF-8 in the `written` bytes
    // that follow.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2028
2029
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The output is sized to the UTF-8 input length, which is what the
    // converter below is given as destination space.
    let capacity = bytes.len();
    let mut vec = Vec::with_capacity(capacity);
    // Copy the ASCII prefix verbatim, then zero-fill the remainder so that
    // the slice handed to the converter never exposes uninitialized memory
    // (`Vec::set_len` requires the covered elements to be initialized).
    vec.extend_from_slice(head);
    vec.resize(capacity, 0u8);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2065
2066
/// Returns the index of the first unpaired surrogate or, if the input is
2067
/// valid UTF-16 in its entirety, the length of the input.
2068
0
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2069
0
    utf16_valid_up_to_impl(buffer)
2070
0
}
2071
2072
/// Returns the index of first byte that starts an invalid byte
2073
/// sequence or a non-Latin1 byte sequence, or the length of the
2074
/// string if there are neither.
2075
0
pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2076
0
    is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2077
0
}
2078
2079
/// Returns the index of first byte that starts a non-Latin1 byte
2080
/// sequence, or the length of the string if there are none.
2081
0
pub fn str_latin1_up_to(buffer: &str) -> usize {
2082
0
    is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len())
2083
0
}
2084
2085
/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2086
#[inline]
2087
0
pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2088
0
    let mut offset = 0;
2089
    loop {
2090
0
        offset += utf16_valid_up_to(&buffer[offset..]);
2091
0
        if offset == buffer.len() {
2092
0
            return;
2093
0
        }
2094
0
        buffer[offset] = 0xFFFD;
2095
0
        offset += 1;
2096
    }
2097
0
}
Unexecuted instantiation: encoding_rs::mem::ensure_utf16_validity
Unexecuted instantiation: encoding_rs::mem::ensure_utf16_validity
2098
2099
/// Copies ASCII from source to destination up to the first non-ASCII byte
2100
/// (or the end of the input if it is ASCII in its entirety).
2101
///
2102
/// The length of the destination buffer must be at least the length of the
2103
/// source buffer.
2104
///
2105
/// Returns the number of bytes written.
2106
///
2107
/// # Panics
2108
///
2109
/// Panics if the destination buffer is shorter than stated above.
2110
0
pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2111
0
    assert!(
2112
0
        dst.len() >= src.len(),
2113
0
        "Destination must not be shorter than the source."
2114
0
    );
2115
0
    if let Some((_, consumed)) =
2116
0
        unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2117
    {
2118
0
        consumed
2119
    } else {
2120
0
        src.len()
2121
    }
2122
0
}
2123
2124
/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2125
/// the first non-ASCII byte (or the end of the input if it is ASCII in its
2126
/// entirety).
2127
///
2128
/// The length of the destination buffer must be at least the length of the
2129
/// source buffer.
2130
///
2131
/// Returns the number of `u16`s written.
2132
///
2133
/// # Panics
2134
///
2135
/// Panics if the destination buffer is shorter than stated above.
2136
0
pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2137
0
    assert!(
2138
0
        dst.len() >= src.len(),
2139
0
        "Destination must not be shorter than the source."
2140
0
    );
2141
0
    if let Some((_, consumed)) =
2142
0
        unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2143
    {
2144
0
        consumed
2145
    } else {
2146
0
        src.len()
2147
    }
2148
0
}
2149
2150
/// Copies Basic Latin from source to destination narrowing it to ASCII up to
2151
/// the first non-Basic Latin code unit (or the end of the input if it is
2152
/// Basic Latin in its entirety).
2153
///
2154
/// The length of the destination buffer must be at least the length of the
2155
/// source buffer.
2156
///
2157
/// Returns the number of bytes written.
2158
///
2159
/// # Panics
2160
///
2161
/// Panics if the destination buffer is shorter than stated above.
2162
0
pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2163
0
    assert!(
2164
0
        dst.len() >= src.len(),
2165
0
        "Destination must not be shorter than the source."
2166
0
    );
2167
0
    if let Some((_, consumed)) =
2168
0
        unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2169
    {
2170
0
        consumed
2171
    } else {
2172
0
        src.len()
2173
    }
2174
0
}
2175
2176
// Any copyright to the test code below this comment is dedicated to the
2177
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2178
2179
#[cfg(all(test, feature = "alloc"))]
2180
mod tests {
2181
    use super::*;
2182
2183
    #[test]
2184
    fn test_is_ascii_success() {
2185
        let mut src: Vec<u8> = Vec::with_capacity(128);
2186
        src.resize(128, 0);
2187
        for i in 0..src.len() {
2188
            src[i] = i as u8;
2189
        }
2190
        for i in 0..src.len() {
2191
            assert!(is_ascii(&src[i..]));
2192
        }
2193
    }
2194
2195
    #[test]
2196
    fn test_is_ascii_fail() {
2197
        let mut src: Vec<u8> = Vec::with_capacity(128);
2198
        src.resize(128, 0);
2199
        for i in 0..src.len() {
2200
            src[i] = i as u8;
2201
        }
2202
        for i in 0..src.len() {
2203
            let tail = &mut src[i..];
2204
            for j in 0..tail.len() {
2205
                tail[j] = 0xA0;
2206
                assert!(!is_ascii(tail));
2207
            }
2208
        }
2209
    }
2210
2211
    #[test]
2212
    fn test_is_basic_latin_success() {
2213
        let mut src: Vec<u16> = Vec::with_capacity(128);
2214
        src.resize(128, 0);
2215
        for i in 0..src.len() {
2216
            src[i] = i as u16;
2217
        }
2218
        for i in 0..src.len() {
2219
            assert!(is_basic_latin(&src[i..]));
2220
        }
2221
    }
2222
2223
    #[test]
2224
    fn test_is_basic_latin_fail() {
2225
        let mut src: Vec<u16> = Vec::with_capacity(128);
2226
        src.resize(128, 0);
2227
        for i in 0..src.len() {
2228
            src[i] = i as u16;
2229
        }
2230
        for i in 0..src.len() {
2231
            let tail = &mut src[i..];
2232
            for j in 0..tail.len() {
2233
                tail[j] = 0xA0;
2234
                assert!(!is_basic_latin(tail));
2235
            }
2236
        }
2237
    }
2238
2239
    #[test]
2240
    fn test_is_utf16_latin1_success() {
2241
        let mut src: Vec<u16> = Vec::with_capacity(256);
2242
        src.resize(256, 0);
2243
        for i in 0..src.len() {
2244
            src[i] = i as u16;
2245
        }
2246
        for i in 0..src.len() {
2247
            assert!(is_utf16_latin1(&src[i..]));
2248
            assert_eq!(
2249
                check_utf16_for_latin1_and_bidi(&src[i..]),
2250
                Latin1Bidi::Latin1
2251
            );
2252
        }
2253
    }
2254
2255
    #[test]
2256
    fn test_is_utf16_latin1_fail() {
2257
        let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2258
        let mut src: Vec<u16> = Vec::with_capacity(len);
2259
        src.resize(len, 0);
2260
        for i in 0..src.len() {
2261
            src[i] = i as u16;
2262
        }
2263
        for i in 0..src.len() {
2264
            let tail = &mut src[i..];
2265
            for j in 0..tail.len() {
2266
                tail[j] = 0x100 + j as u16;
2267
                assert!(!is_utf16_latin1(tail));
2268
                assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
2269
            }
2270
        }
2271
    }
2272
2273
    #[test]
2274
    fn test_is_str_latin1_success() {
2275
        let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2276
        let mut src: Vec<u16> = Vec::with_capacity(len);
2277
        src.resize(len, 0);
2278
        for i in 0..src.len() {
2279
            src[i] = i as u16;
2280
        }
2281
        for i in 0..src.len() {
2282
            let s = String::from_utf16(&src[i..]).unwrap();
2283
            assert!(is_str_latin1(&s[..]));
2284
            assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2285
        }
2286
    }
2287
2288
    #[test]
2289
    fn test_is_str_latin1_fail() {
2290
        let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2291
        let mut src: Vec<u16> = Vec::with_capacity(len);
2292
        src.resize(len, 0);
2293
        for i in 0..src.len() {
2294
            src[i] = i as u16;
2295
        }
2296
        for i in 0..src.len() {
2297
            let tail = &mut src[i..];
2298
            for j in 0..tail.len() {
2299
                tail[j] = 0x100 + j as u16;
2300
                let s = String::from_utf16(tail).unwrap();
2301
                assert!(!is_str_latin1(&s[..]));
2302
                assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2303
            }
2304
        }
2305
    }
2306
2307
    #[test]
2308
    fn test_is_utf8_latin1_success() {
2309
        let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2310
        let mut src: Vec<u16> = Vec::with_capacity(len);
2311
        src.resize(len, 0);
2312
        for i in 0..src.len() {
2313
            src[i] = i as u16;
2314
        }
2315
        for i in 0..src.len() {
2316
            let s = String::from_utf16(&src[i..]).unwrap();
2317
            assert!(is_utf8_latin1(s.as_bytes()));
2318
            assert_eq!(
2319
                check_utf8_for_latin1_and_bidi(s.as_bytes()),
2320
                Latin1Bidi::Latin1
2321
            );
2322
        }
2323
    }
2324
2325
    #[test]
2326
    fn test_is_utf8_latin1_fail() {
2327
        let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2328
        let mut src: Vec<u16> = Vec::with_capacity(len);
2329
        src.resize(len, 0);
2330
        for i in 0..src.len() {
2331
            src[i] = i as u16;
2332
        }
2333
        for i in 0..src.len() {
2334
            let tail = &mut src[i..];
2335
            for j in 0..tail.len() {
2336
                tail[j] = 0x100 + j as u16;
2337
                let s = String::from_utf16(tail).unwrap();
2338
                assert!(!is_utf8_latin1(s.as_bytes()));
2339
                assert_ne!(
2340
                    check_utf8_for_latin1_and_bidi(s.as_bytes()),
2341
                    Latin1Bidi::Latin1
2342
                );
2343
            }
2344
        }
2345
    }
2346
2347
    #[test]
2348
    fn test_is_utf8_latin1_invalid() {
2349
        assert!(!is_utf8_latin1(b"\xC3"));
2350
        assert!(!is_utf8_latin1(b"a\xC3"));
2351
        assert!(!is_utf8_latin1(b"\xFF"));
2352
        assert!(!is_utf8_latin1(b"a\xFF"));
2353
        assert!(!is_utf8_latin1(b"\xC3\xFF"));
2354
        assert!(!is_utf8_latin1(b"a\xC3\xFF"));
2355
    }
2356
2357
    #[test]
2358
    fn test_convert_utf8_to_utf16() {
2359
        let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2360
        let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1);
2361
        dst.resize(src.len() + 1, 0);
2362
        let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
2363
        dst.truncate(len);
2364
        let reference: Vec<u16> = src.encode_utf16().collect();
2365
        assert_eq!(dst, reference);
2366
    }
2367
2368
    #[test]
2369
    fn test_convert_str_to_utf16() {
2370
        let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2371
        let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2372
        dst.resize(src.len(), 0);
2373
        let len = convert_str_to_utf16(src, &mut dst[..]);
2374
        dst.truncate(len);
2375
        let reference: Vec<u16> = src.encode_utf16().collect();
2376
        assert_eq!(dst, reference);
2377
    }
2378
2379
    #[test]
2380
    fn test_convert_utf16_to_utf8_partial() {
2381
        let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2382
        let src: Vec<u16> = reference.encode_utf16().collect();
2383
        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2384
        dst.resize(src.len() * 3 + 1, 0);
2385
        let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
2386
        let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
2387
        dst.truncate(len);
2388
        assert_eq!(dst, reference.as_bytes());
2389
    }
2390
2391
    #[test]
2392
    fn test_convert_utf16_to_utf8() {
2393
        let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2394
        let src: Vec<u16> = reference.encode_utf16().collect();
2395
        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2396
        dst.resize(src.len() * 3 + 1, 0);
2397
        let len = convert_utf16_to_utf8(&src[..], &mut dst[..]);
2398
        dst.truncate(len);
2399
        assert_eq!(dst, reference.as_bytes());
2400
    }
2401
2402
    #[test]
2403
    fn test_convert_latin1_to_utf16() {
2404
        let mut src: Vec<u8> = Vec::with_capacity(256);
2405
        src.resize(256, 0);
2406
        let mut reference: Vec<u16> = Vec::with_capacity(256);
2407
        reference.resize(256, 0);
2408
        for i in 0..256 {
2409
            src[i] = i as u8;
2410
            reference[i] = i as u16;
2411
        }
2412
        let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2413
        dst.resize(src.len(), 0);
2414
        convert_latin1_to_utf16(&src[..], &mut dst[..]);
2415
        assert_eq!(dst, reference);
2416
    }
2417
2418
    #[test]
    fn test_convert_latin1_to_utf8_partial() {
        // Two-byte destination: `a` occupies one byte and U+00FF needs two
        // UTF-8 bytes, so only the first input byte can be converted.
        // (`[0u8; 2]` is the repeat expression; the previous `[0u8, 2]` was a
        // two-element list literal that merely happened to have length 2.)
        let mut dst = [0u8; 2];
        let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert_eq!(dst[0], b'a');
    }
2425
2426
    #[test]
2427
    fn test_convert_latin1_to_utf8() {
2428
        let mut src: Vec<u8> = Vec::with_capacity(256);
2429
        src.resize(256, 0);
2430
        let mut reference: Vec<u16> = Vec::with_capacity(256);
2431
        reference.resize(256, 0);
2432
        for i in 0..256 {
2433
            src[i] = i as u8;
2434
            reference[i] = i as u16;
2435
        }
2436
        let s = String::from_utf16(&reference[..]).unwrap();
2437
        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2);
2438
        dst.resize(src.len() * 2, 0);
2439
        let len = convert_latin1_to_utf8(&src[..], &mut dst[..]);
2440
        dst.truncate(len);
2441
        assert_eq!(&dst[..], s.as_bytes());
2442
    }
2443
2444
    #[test]
2445
    fn test_convert_utf8_to_latin1_lossy() {
2446
        let mut reference: Vec<u8> = Vec::with_capacity(256);
2447
        reference.resize(256, 0);
2448
        let mut src16: Vec<u16> = Vec::with_capacity(256);
2449
        src16.resize(256, 0);
2450
        for i in 0..256 {
2451
            src16[i] = i as u16;
2452
            reference[i] = i as u8;
2453
        }
2454
        let src = String::from_utf16(&src16[..]).unwrap();
2455
        let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2456
        dst.resize(src.len(), 0);
2457
        let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
2458
        dst.truncate(len);
2459
        assert_eq!(dst, reference);
2460
    }
2461
2462
    #[cfg(all(debug_assertions, not(fuzzing)))]
2463
    #[test]
2464
    #[should_panic]
2465
    fn test_convert_utf8_to_latin1_lossy_panics() {
2466
        let mut dst = [0u8; 16];
2467
        let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
2468
    }
2469
2470
    #[test]
2471
    fn test_convert_utf16_to_latin1_lossy() {
2472
        let mut src: Vec<u16> = Vec::with_capacity(256);
2473
        src.resize(256, 0);
2474
        let mut reference: Vec<u8> = Vec::with_capacity(256);
2475
        reference.resize(256, 0);
2476
        for i in 0..256 {
2477
            src[i] = i as u16;
2478
            reference[i] = i as u8;
2479
        }
2480
        let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2481
        dst.resize(src.len(), 0);
2482
        convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
2483
        assert_eq!(dst, reference);
2484
    }
2485
2486
    #[test]
2487
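    // #[should_panic] stays disabled while the Latin1 debug assertion in
    // convert_utf16_to_latin1_lossy is commented out; for now this only
    // checks that non-Latin1 input is handled without crashing.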
2488
    fn test_convert_utf16_to_latin1_lossy_panics() {
2489
        let mut dst = [0u8; 16];
2490
        let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
2491
    }
2492
2493
    #[test]
2494
    fn test_utf16_valid_up_to() {
2495
        let valid = vec![
2496
            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
2497
            0xD83Du16, 0xDCA9u16, 0x00B6u16,
2498
        ];
2499
        assert_eq!(utf16_valid_up_to(&valid[..]), 16);
2500
        let lone_high = vec![
2501
            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2502
            0x2603u16, 0xD83Du16, 0x00B6u16,
2503
        ];
2504
        assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);
2505
        let lone_low = vec![
2506
            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2507
            0x2603u16, 0xDCA9u16, 0x00B6u16,
2508
        ];
2509
        assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);
2510
        let lone_high_at_end = vec![
2511
            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2512
            0x2603u16, 0x00B6u16, 0xD83Du16,
2513
        ];
2514
        assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
2515
    }
2516
2517
    #[test]
2518
    fn test_ensure_utf16_validity() {
2519
        let mut src = vec![
2520
            0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2521
            0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2522
            0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2523
        ];
2524
        let reference = vec![
2525
            0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2526
            0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2527
            0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2528
        ];
2529
        ensure_utf16_validity(&mut src[..]);
2530
        assert_eq!(src, reference);
2531
    }
2532
2533
    #[test]
2534
    fn test_is_char_bidi() {
2535
        assert!(!is_char_bidi('a'));
2536
        assert!(!is_char_bidi('\u{03B1}'));
2537
        assert!(!is_char_bidi('\u{3041}'));
2538
        assert!(!is_char_bidi('\u{1F4A9}'));
2539
        assert!(!is_char_bidi('\u{FE00}'));
2540
        assert!(!is_char_bidi('\u{202C}'));
2541
        assert!(!is_char_bidi('\u{FEFF}'));
2542
        assert!(is_char_bidi('\u{0590}'));
2543
        assert!(is_char_bidi('\u{08FF}'));
2544
        assert!(is_char_bidi('\u{061C}'));
2545
        assert!(is_char_bidi('\u{FB50}'));
2546
        assert!(is_char_bidi('\u{FDFF}'));
2547
        assert!(is_char_bidi('\u{FE70}'));
2548
        assert!(is_char_bidi('\u{FEFE}'));
2549
        assert!(is_char_bidi('\u{200F}'));
2550
        assert!(is_char_bidi('\u{202B}'));
2551
        assert!(is_char_bidi('\u{202E}'));
2552
        assert!(is_char_bidi('\u{2067}'));
2553
        assert!(is_char_bidi('\u{10800}'));
2554
        assert!(is_char_bidi('\u{10FFF}'));
2555
        assert!(is_char_bidi('\u{1E800}'));
2556
        assert!(is_char_bidi('\u{1EFFF}'));
2557
    }
2558
2559
    #[test]
2560
    fn test_is_utf16_code_unit_bidi() {
2561
        assert!(!is_utf16_code_unit_bidi(0x0062));
2562
        assert!(!is_utf16_code_unit_bidi(0x03B1));
2563
        assert!(!is_utf16_code_unit_bidi(0x3041));
2564
        assert!(!is_utf16_code_unit_bidi(0xD801));
2565
        assert!(!is_utf16_code_unit_bidi(0xFE00));
2566
        assert!(!is_utf16_code_unit_bidi(0x202C));
2567
        assert!(!is_utf16_code_unit_bidi(0xFEFF));
2568
        assert!(is_utf16_code_unit_bidi(0x0590));
2569
        assert!(is_utf16_code_unit_bidi(0x08FF));
2570
        assert!(is_utf16_code_unit_bidi(0x061C));
2571
        assert!(is_utf16_code_unit_bidi(0xFB1D));
2572
        assert!(is_utf16_code_unit_bidi(0xFB50));
2573
        assert!(is_utf16_code_unit_bidi(0xFDFF));
2574
        assert!(is_utf16_code_unit_bidi(0xFE70));
2575
        assert!(is_utf16_code_unit_bidi(0xFEFE));
2576
        assert!(is_utf16_code_unit_bidi(0x200F));
2577
        assert!(is_utf16_code_unit_bidi(0x202B));
2578
        assert!(is_utf16_code_unit_bidi(0x202E));
2579
        assert!(is_utf16_code_unit_bidi(0x2067));
2580
        assert!(is_utf16_code_unit_bidi(0xD802));
2581
        assert!(is_utf16_code_unit_bidi(0xD803));
2582
        assert!(is_utf16_code_unit_bidi(0xD83A));
2583
        assert!(is_utf16_code_unit_bidi(0xD83B));
2584
    }
2585
2586
    #[test]
2587
    fn test_is_str_bidi() {
2588
        assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop"));
2589
        assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"));
2590
        assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"));
2591
        assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
2592
        assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
2593
        assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
2594
        assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
2595
        assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
2596
        assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
2597
        assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
2598
        assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
2599
        assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
2600
        assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
2601
        assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
2602
        assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
2603
        assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
2604
        assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
2605
        assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"));
2606
        assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"));
2607
        assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"));
2608
        assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"));
2609
        assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"));
2610
    }
2611
2612
    #[test]
2613
    fn test_is_utf8_bidi() {
2614
        assert!(!is_utf8_bidi(
2615
            "abcdefghijklmnopaabcdefghijklmnop".as_bytes()
2616
        ));
2617
        assert!(!is_utf8_bidi(
2618
            "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()
2619
        ));
2620
        assert!(!is_utf8_bidi(
2621
            "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()
2622
        ));
2623
        assert!(!is_utf8_bidi(
2624
            "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()
2625
        ));
2626
        assert!(!is_utf8_bidi(
2627
            "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()
2628
        ));
2629
        assert!(!is_utf8_bidi(
2630
            "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
2631
        ));
2632
        assert!(!is_utf8_bidi(
2633
            "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
2634
        ));
2635
        assert!(is_utf8_bidi(
2636
            "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
2637
        ));
2638
        assert!(is_utf8_bidi(
2639
            "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()
2640
        ));
2641
        assert!(is_utf8_bidi(
2642
            "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()
2643
        ));
2644
        assert!(is_utf8_bidi(
2645
            "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()
2646
        ));
2647
        assert!(is_utf8_bidi(
2648
            "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()
2649
        ));
2650
        assert!(is_utf8_bidi(
2651
            "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
2652
        ));
2653
        assert!(is_utf8_bidi(
2654
            "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
2655
        ));
2656
        assert!(is_utf8_bidi(
2657
            "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
2658
        ));
2659
        assert!(is_utf8_bidi(
2660
            "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()
2661
        ));
2662
        assert!(is_utf8_bidi(
2663
            "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()
2664
        ));
2665
        assert!(is_utf8_bidi(
2666
            "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()
2667
        ));
2668
        assert!(is_utf8_bidi(
2669
            "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()
2670
        ));
2671
        assert!(is_utf8_bidi(
2672
            "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()
2673
        ));
2674
        assert!(is_utf8_bidi(
2675
            "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()
2676
        ));
2677
        assert!(is_utf8_bidi(
2678
            "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()
2679
        ));
2680
    }
2681
2682
    #[test]
2683
    fn test_is_utf16_bidi() {
2684
        assert!(!is_utf16_bidi(&[
2685
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66,
2686
            0x67, 0x68, 0x69,
2687
        ]));
2688
        assert!(!is_utf16_bidi(&[
2689
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66,
2690
            0x67, 0x68, 0x69,
2691
        ]));
2692
        assert!(!is_utf16_bidi(&[
2693
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66,
2694
            0x67, 0x68, 0x69,
2695
        ]));
2696
        assert!(!is_utf16_bidi(&[
2697
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66,
2698
            0x67, 0x68, 0x69,
2699
        ]));
2700
        assert!(!is_utf16_bidi(&[
2701
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66,
2702
            0x67, 0x68, 0x69,
2703
        ]));
2704
        assert!(!is_utf16_bidi(&[
2705
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
2706
            0x67, 0x68, 0x69,
2707
        ]));
2708
        assert!(!is_utf16_bidi(&[
2709
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2710
            0x67, 0x68, 0x69,
2711
        ]));
2712
        assert!(is_utf16_bidi(&[
2713
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
2714
            0x67, 0x68, 0x69,
2715
        ]));
2716
        assert!(is_utf16_bidi(&[
2717
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66,
2718
            0x67, 0x68, 0x69,
2719
        ]));
2720
        assert!(is_utf16_bidi(&[
2721
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
2722
            0x67, 0x68, 0x69,
2723
        ]));
2724
        assert!(is_utf16_bidi(&[
2725
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
2726
            0x67, 0x68, 0x69,
2727
        ]));
2728
        assert!(is_utf16_bidi(&[
2729
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
2730
            0x67, 0x68, 0x69,
2731
        ]));
2732
        assert!(is_utf16_bidi(&[
2733
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2734
            0x67, 0x68, 0x69,
2735
        ]));
2736
        assert!(is_utf16_bidi(&[
2737
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66,
2738
            0x67, 0x68, 0x69,
2739
        ]));
2740
        assert!(is_utf16_bidi(&[
2741
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
2742
            0x67, 0x68, 0x69,
2743
        ]));
2744
        assert!(is_utf16_bidi(&[
2745
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66,
2746
            0x67, 0x68, 0x69,
2747
        ]));
2748
        assert!(is_utf16_bidi(&[
2749
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66,
2750
            0x67, 0x68, 0x69,
2751
        ]));
2752
        assert!(is_utf16_bidi(&[
2753
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66,
2754
            0x67, 0x68, 0x69,
2755
        ]));
2756
        assert!(is_utf16_bidi(&[
2757
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66,
2758
            0x67, 0x68, 0x69,
2759
        ]));
2760
        assert!(is_utf16_bidi(&[
2761
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66,
2762
            0x67, 0x68, 0x69,
2763
        ]));
2764
        assert!(is_utf16_bidi(&[
2765
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66,
2766
            0x67, 0x68, 0x69,
2767
        ]));
2768
        assert!(is_utf16_bidi(&[
2769
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66,
2770
            0x67, 0x68, 0x69,
2771
        ]));
2772
        assert!(is_utf16_bidi(&[
2773
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66,
2774
            0x67, 0x68, 0x69,
2775
        ]));
2776
2777
        assert!(is_utf16_bidi(&[
2778
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
2779
            0x66, 0x67, 0x68, 0x69,
2780
        ]));
2781
    }
2782
2783
    #[test]
    fn test_check_str_for_latin1_and_bidi() {
        // Every input is one probe character placed between two 16-byte ASCII
        // runs ("abcdefghijklmnop").

        // Probes for which the result must be anything other than `Bidi`.
        let not_bidi: &[&str] = &[
            "abcdefghijklmnopaabcdefghijklmnop",
            "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
            "abcdefghijklmnop\u{3041}abcdefghijklmnop",
            "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
            "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
            "abcdefghijklmnop\u{202C}abcdefghijklmnop",
            "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
        ];
        for &s in not_bidi {
            assert_ne!(
                check_str_for_latin1_and_bidi(s),
                Latin1Bidi::Bidi,
                "input: {:?}",
                s
            );
        }

        // Probes for which the result must be exactly `Bidi`.
        let bidi: &[&str] = &[
            "abcdefghijklmnop\u{0590}abcdefghijklmnop",
            "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
            "abcdefghijklmnop\u{061C}abcdefghijklmnop",
            "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
            "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
            "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
            "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
            "abcdefghijklmnop\u{200F}abcdefghijklmnop",
            "abcdefghijklmnop\u{202B}abcdefghijklmnop",
            "abcdefghijklmnop\u{202E}abcdefghijklmnop",
            "abcdefghijklmnop\u{2067}abcdefghijklmnop",
            "abcdefghijklmnop\u{10800}abcdefghijklmnop",
            "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
            "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
            "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
        ];
        for &s in bidi {
            assert_eq!(
                check_str_for_latin1_and_bidi(s),
                Latin1Bidi::Bidi,
                "input: {:?}",
                s
            );
        }
    }
2874
2875
    #[test]
    fn test_check_utf8_for_latin1_and_bidi() {
        // Same probe set as the `&str` variant of this test, but each input is
        // handed to the byte-slice entry point via `as_bytes()`.

        // Probes for which the result must be anything other than `Bidi`.
        let not_bidi: &[&str] = &[
            "abcdefghijklmnopaabcdefghijklmnop",
            "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
            "abcdefghijklmnop\u{3041}abcdefghijklmnop",
            "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
            "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
            "abcdefghijklmnop\u{202C}abcdefghijklmnop",
            "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
        ];
        for &s in not_bidi {
            assert_ne!(
                check_utf8_for_latin1_and_bidi(s.as_bytes()),
                Latin1Bidi::Bidi,
                "input: {:?}",
                s
            );
        }

        // Probes for which the result must be exactly `Bidi`.
        let bidi: &[&str] = &[
            "abcdefghijklmnop\u{0590}abcdefghijklmnop",
            "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
            "abcdefghijklmnop\u{061C}abcdefghijklmnop",
            "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
            "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
            "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
            "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
            "abcdefghijklmnop\u{200F}abcdefghijklmnop",
            "abcdefghijklmnop\u{202B}abcdefghijklmnop",
            "abcdefghijklmnop\u{202E}abcdefghijklmnop",
            "abcdefghijklmnop\u{2067}abcdefghijklmnop",
            "abcdefghijklmnop\u{10800}abcdefghijklmnop",
            "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
            "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
            "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
        ];
        for &s in bidi {
            assert_eq!(
                check_utf8_for_latin1_and_bidi(s.as_bytes()),
                Latin1Bidi::Bidi,
                "input: {:?}",
                s
            );
        }
    }
2966
2967
    #[test]
    fn test_check_utf16_for_latin1_and_bidi() {
        // Eight ASCII code units, a probe slot at index 8, then the same eight
        // ASCII code units again. Each case overwrites only the probe slot.
        const PROBE_INDEX: usize = 8;
        let template: [u16; 17] = [
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, // leading ASCII
            0x0000, // probe slot
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, // trailing ASCII
        ];

        // Code units for which the result must be anything other than `Bidi`.
        let not_bidi: &[u16] = &[0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
        for &unit in not_bidi {
            let mut input = template;
            input[PROBE_INDEX] = unit;
            assert_ne!(
                check_utf16_for_latin1_and_bidi(&input),
                Latin1Bidi::Bidi,
                "code unit: {:#06X}",
                unit
            );
        }

        // Code units for which the result must be exactly `Bidi`.
        let bidi: &[u16] = &[
            0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B,
            0x202E, 0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
        ];
        for &unit in bidi {
            let mut input = template;
            input[PROBE_INDEX] = unit;
            assert_eq!(
                check_utf16_for_latin1_and_bidi(&input),
                Latin1Bidi::Bidi,
                "code unit: {:#06X}",
                unit
            );
        }

        // A bidi code unit immediately followed by a non-Latin1, non-bidi one
        // must still be reported as `Bidi`.
        let bidi_then_kana: [u16; 18] = [
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
            0x65, 0x66, 0x67, 0x68, 0x69,
        ];
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&bidi_then_kana),
            Latin1Bidi::Bidi
        );
    }
3139
3140
    /// Table-driven predicate that the exhaustive tests in this module use as
    /// the expected answer for whether a `char` counts as bidi.
    ///
    /// Returns `true` when `c` lies in one of the listed inclusive ranges or
    /// equals one of the listed individual code points; `false` otherwise.
    #[inline(always)]
    pub fn reference_is_char_bidi(c: char) -> bool {
        // Inclusive (first, last) ranges.
        const RANGES: [(char, char); 5] = [
            ('\u{0590}', '\u{08FF}'),
            ('\u{FB1D}', '\u{FDFF}'),
            ('\u{FE70}', '\u{FEFE}'),
            ('\u{10800}', '\u{10FFF}'),
            ('\u{1E800}', '\u{1EFFF}'),
        ];
        // Individual code points outside those ranges.
        const SINGLES: [char; 4] = ['\u{200F}', '\u{202B}', '\u{202E}', '\u{2067}'];

        let in_some_range = RANGES.iter().any(|&(first, last)| first <= c && c <= last);
        in_some_range || SINGLES.contains(&c)
    }
3155
3156
    /// Table-driven predicate that the exhaustive tests in this module use as
    /// the expected answer for whether a single UTF-16 code unit counts as
    /// bidi.
    ///
    /// Returns `true` when `u` lies in one of the listed inclusive ranges or
    /// equals one of the listed individual code units (which include the four
    /// values `0xD802`, `0xD803`, `0xD83A` and `0xD83B`); `false` otherwise.
    #[inline(always)]
    pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
        // Inclusive (first, last) ranges.
        const RANGES: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
        // Individual code units outside those ranges.
        const SINGLES: [u16; 8] = [
            0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
        ];

        let in_some_range = RANGES.iter().any(|&(first, last)| first <= u && u <= last);
        in_some_range || SINGLES.contains(&u)
    }
3173
3174
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_is_char_bidi_thoroughly() {
        // Visit every value convertible to `char`: everything below 0xD800,
        // then 0xE000 up to (but excluding) 0x110000. The skipped gap cannot
        // be turned into a `char`, so the `unwrap` below never fails.
        let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
        for c in scalar_values.map(|v| ::core::char::from_u32(v).unwrap()) {
            assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
        }
    }
3186
3187
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_is_utf16_code_unit_bidi_thoroughly() {
        // Compare against the reference predicate for every possible 16-bit
        // code unit, 0x0000 through 0xFFFF inclusive.
        for unit in 0..=0xFFFFu16 {
            let expected = reference_is_utf16_code_unit_bidi(unit);
            assert_eq!(is_utf16_code_unit_bidi(unit), expected);
        }
    }
3198
3199
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_is_str_bidi_thoroughly() {
        // Scratch space for the UTF-8 encoding of one `char`.
        let mut buf = [0; 4];
        // Every value convertible to `char`: below 0xD800, then 0xE000 up to
        // (but excluding) 0x110000.
        let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
        for c in scalar_values.map(|v| ::core::char::from_u32(v).unwrap()) {
            // A string consisting of just `c` must agree with the per-char
            // reference predicate.
            let single = c.encode_utf8(&mut buf[..]);
            assert_eq!(is_str_bidi(single), reference_is_char_bidi(c));
        }
    }
3218
3219
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_is_utf8_bidi_thoroughly() {
        // Eight bytes: room for one encoded `char` plus zero padding.
        let mut buf = [0; 8];
        // Every value convertible to `char`: below 0xD800, then 0xE000 up to
        // (but excluding) 0x110000.
        let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
        for c in scalar_values.map(|v| ::core::char::from_u32(v).unwrap()) {
            let expect = reference_is_char_bidi(c);

            // First check: exactly the encoded bytes of `c`.
            let encoded_len = {
                let encoded = c.encode_utf8(&mut buf[..]);
                assert_eq!(is_utf8_bidi(encoded.as_bytes()), expect);
                encoded.len()
            };

            // Clear whatever a previous, longer encoding left after `c`.
            for byte in buf[encoded_len..].iter_mut() {
                *byte = 0;
            }

            // Second check: the whole 8-byte buffer, i.e. `c` followed by
            // zero bytes, must give the same answer.
            assert_eq!(is_utf8_bidi(&buf[..]), expect);
        }
    }
3260
3261
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_is_utf16_bidi_thoroughly() {
        // A 32-unit buffer of zeros; only slot 15 is varied.
        const PROBE_INDEX: usize = 15;
        let mut buf = [0u16; 32];
        // Try every possible 16-bit code unit in the probe slot.
        for unit in 0..=0xFFFFu16 {
            buf[PROBE_INDEX] = unit;
            let expected = reference_is_utf16_code_unit_bidi(unit);
            assert_eq!(is_utf16_bidi(&buf[..]), expected);
        }
    }
3274
3275
    #[test]
    fn test_is_utf8_bidi_edge_cases() {
        // Inputs that end in an ASCII byte: expected to be reported as
        // non-bidi.
        let expect_false: [&[u8]; 3] = [b"\xD5\xBF\x61", b"\xD6\x80\x61", b"abc"];
        for &bytes in expect_false.iter() {
            assert!(!is_utf8_bidi(bytes), "input: {:?}", bytes);
        }

        // The same prefixes, but ending in a lone 0xC2 lead byte: expected to
        // be reported as bidi.
        let expect_true: [&[u8]; 3] = [b"\xD5\xBF\xC2", b"\xD6\x80\xC2", b"ab\xC2"];
        for &bytes in expect_true.iter() {
            assert!(is_utf8_bidi(bytes), "input: {:?}", bytes);
        }
    }
3284
3285
    #[test]
    fn test_decode_latin1() {
        // ASCII-only input must come back as the borrowed variant.
        if let Cow::Borrowed(s) = decode_latin1(b"ab") {
            assert_eq!(s, "ab");
        } else {
            unreachable!("Should have borrowed");
        }

        // Byte 0xE4 must decode to U+00E4.
        let decoded = decode_latin1(b"a\xE4");
        assert_eq!(decoded, "a\u{E4}");
    }
3297
3298
    #[test]
    fn test_encode_latin1_lossy() {
        // ASCII-only input must come back as the borrowed variant.
        if let Cow::Borrowed(s) = encode_latin1_lossy("ab") {
            assert_eq!(s, b"ab");
        } else {
            unreachable!("Should have borrowed");
        }

        // U+00E4 must encode to the single byte 0xE4.
        let encoded = encode_latin1_lossy("a\u{E4}");
        assert_eq!(encoded, &(b"a\xE4")[..]);
    }
3310
3311
    #[test]
    fn test_convert_utf8_to_utf16_without_replacement() {
        let mut buf = [0u16; 5];

        // Each case: (UTF-8 input, number of `buf` slots offered as output,
        // expected count in the returned `Some`, expected `buf[0..3]` afterwards).
        //
        // `buf` is deliberately NOT cleared between cases and the cases run in
        // order, so an expected value may be a leftover from an earlier case
        // (e.g. the `c` in the third case).
        let cases: [(&[u8], usize, usize, [u16; 3]); 7] = [
            (&b"ab"[..], 2, 2, [u16::from(b'a'), u16::from(b'b'), 0]),
            (&b"\xC3\xA4c"[..], 3, 2, [0xE4, u16::from(b'c'), 0]),
            (&b"\xE2\x98\x83"[..], 3, 1, [0x2603, u16::from(b'c'), 0]),
            (&b"\xE2\x98\x83d"[..], 4, 2, [0x2603, u16::from(b'd'), 0]),
            (&b"\xE2\x98\x83\xC3\xA4"[..], 5, 2, [0x2603, 0xE4, 0]),
            (&b"\xF0\x9F\x93\x8E"[..], 4, 2, [0xD83D, 0xDCCE, 0]),
            (&b"\xF0\x9F\x93\x8Ee"[..], 5, 3, [0xD83D, 0xDCCE, u16::from(b'e')]),
        ];
        for &(src, dst_len, written, head) in cases.iter() {
            assert_eq!(
                convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
                Some(written)
            );
            assert_eq!(buf[0], head[0]);
            assert_eq!(buf[1], head[1]);
            assert_eq!(buf[2], head[2]);
        }

        // A four-byte sequence cut off after three bytes must yield `None`.
        // The buffer contents are not inspected in this case.
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
            None
        );
    }
3368
}