/src/simdutf/src/generic/validate_utf16.h
Line | Count | Source |
1 | | namespace simdutf { |
2 | | namespace SIMDUTF_IMPLEMENTATION { |
3 | | namespace { |
4 | | namespace utf16 { |
5 | | /* |
6 | | UTF-16 validation |
7 | | -------------------------------------------------- |
8 | | |
9 | | In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. |
10 | | |
11 | | In a vectorized algorithm we want to examine the most significant |
12 | | nibble in order to select a fast path. If none of the highest nibbles |
13 | | are 0xD (13), then we are sure that the UTF-16 chunk in the vector |
14 | | register is valid. |
15 | | |
16 | | Let us analyze what we need to check if the nibble is 0xD. The |
17 | | value of the preceding nibble determines what we have: |
18 | | |
19 | | 0xd000 .. 0xd7ff - a valid word |
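20 | | 0xd800 .. 0xdbff - low surrogate (first unit of a pair; the Unicode Standard calls this a "high" surrogate) |
21 | | 0xdc00 .. 0xdfff - high surrogate (second unit of a pair; the Unicode Standard calls this a "low" surrogate) |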
22 | | |
23 | | Other constraints we have to consider: |
24 | | - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) |
25 | | - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) |
26 | | - there must not be a lone low surrogate or a lone high surrogate |
27 | | |
28 | | We are going to build three bitmasks based on the 3rd nibble: |
29 | | - V = valid word, |
30 | | - L = low surrogate (0xd800 .. 0xdbff) |
31 | | - H = high surrogate (0xdc00 .. 0xdfff) |
32 | | |
33 | | 0 1 2 3 4 5 6 7 <--- word index |
34 | | [ V | L | H | L | H | V | V | L ] |
35 | | 1 0 0 0 0 1 1 0 - V = valid masks |
36 | | 0 1 0 1 0 0 0 1 - L = low surrogate |
37 | | 0 0 1 0 1 0 0 0 - H high surrogate |
38 | | |
39 | | |
40 | | 1 0 0 0 0 1 1 0 V = valid masks |
41 | | 0 1 0 1 0 0 0 0 a = L & (H >> 1) |
42 | | 0 0 1 0 1 0 0 0 b = a << 1 |
43 | | 1 1 1 1 1 1 1 0 c = V | a | b |
44 | | ^ |
45 | | the last bit can be zero, we just consume 7 |
46 | | code units and recheck this word in the next iteration |
47 | | */ |
48 | | template <endianness big_endian> |
49 | 30.2k | const result validate_utf16_with_errors(const char16_t *input, size_t size) { |
50 | 30.2k | if (simdutf_unlikely(size == 0)) { |
51 | 20.5k | return result(error_code::SUCCESS, 0); |
52 | 20.5k | } |
53 | | |
54 | 9.72k | const char16_t *start = input; |
55 | 9.72k | const char16_t *end = input + size; |
56 | | |
57 | 9.72k | const auto v_d8 = simd8<uint8_t>::splat(0xd8); |
58 | 9.72k | const auto v_f8 = simd8<uint8_t>::splat(0xf8); |
59 | 9.72k | const auto v_fc = simd8<uint8_t>::splat(0xfc); |
60 | 9.72k | const auto v_dc = simd8<uint8_t>::splat(0xdc); |
61 | | |
62 | 68.0k | while (input + simd16<uint16_t>::SIZE * 2 < end) { |
63 | | // 0. Load data: since the validation takes into account only higher |
64 | | // byte of each word, we compress the two vectors into one which |
65 | | // consists only the higher bytes. |
66 | 58.7k | auto in0 = simd16<uint16_t>(input); |
67 | 58.7k | auto in1 = |
68 | 58.7k | simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); |
69 | | |
70 | | // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 |
71 | | // and yields a single vector having only higher bytes of characters. |
72 | 58.7k | const auto in = utf16_gather_high_bytes<big_endian>(in0, in1); |
73 | | |
74 | | // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). |
75 | 58.7k | const auto surrogates_wordmask = (in & v_f8) == v_d8; |
76 | 58.7k | const uint16_t surrogates_bitmask = |
77 | 58.7k | static_cast<uint16_t>(surrogates_wordmask.to_bitmask()); |
78 | 58.7k | if (surrogates_bitmask == 0x0000) { |
79 | 53.9k | input += 16; |
80 | 53.9k | } else { |
81 | | // 2. We have some surrogates that have to be distinguished: |
82 | | // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) |
83 | | // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) |
84 | | // |
85 | | // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) |
86 | | |
87 | | // V - non-surrogate code units |
88 | | // V = not surrogates_wordmask |
89 | 4.89k | const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask); |
90 | | |
91 | | // H - word-mask for high surrogates: the six highest bits are 0b1101'11 |
92 | 4.89k | const auto vH = (in & v_fc) == v_dc; |
93 | 4.89k | const uint16_t H = static_cast<uint16_t>(vH.to_bitmask()); |
94 | | |
95 | | // L - word mask for low surrogates |
96 | | // L = not H and surrogates_wordmask |
97 | 4.89k | const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask); |
98 | | |
99 | 4.89k | const uint16_t a = static_cast<uint16_t>( |
100 | 4.89k | L & (H >> 1)); // A low surrogate must be followed by high one. |
101 | | // (A low surrogate placed in the 7th register's word |
102 | | // is an exception we handle.) |
103 | 4.89k | const uint16_t b = static_cast<uint16_t>( |
104 | 4.89k | a << 1); // Just mark that the opinput - startite fact is hold, |
105 | | // thanks to that we have only two masks for valid case. |
106 | 4.89k | const uint16_t c = static_cast<uint16_t>( |
107 | 4.89k | V | a | b); // Combine all the masks into the final one. |
108 | | |
109 | 4.89k | if (c == 0xffff) { |
110 | | // The whole input register contains valid UTF-16, i.e., |
111 | | // either single code units or proper surrogate pairs. |
112 | 3.05k | input += 16; |
113 | 3.05k | } else if (c == 0x7fff) { |
114 | | // The 15 lower code units of the input register contains valid UTF-16. |
115 | | // The 15th word may be either a low or high surrogate. It the next |
116 | | // iteration we 1) check if the low surrogate is followed by a high |
117 | | // one, 2) reject sole high surrogate. |
118 | 1.41k | input += 15; |
119 | 1.41k | } else { |
120 | 428 | return result(error_code::SURROGATE, input - start); |
121 | 428 | } |
122 | 4.89k | } |
123 | 58.7k | } |
124 | | |
125 | 9.30k | return result(error_code::SUCCESS, input - start); |
126 | 9.72k | } simdutf.cpp:simdutf::result const simdutf::haswell::(anonymous namespace)::utf16::validate_utf16_with_errors<(simdutf::endianness)0>(char16_t const*, unsigned long) Line | Count | Source | 49 | 7.76k | const result validate_utf16_with_errors(const char16_t *input, size_t size) { | 50 | 7.76k | if (simdutf_unlikely(size == 0)) { | 51 | 5.11k | return result(error_code::SUCCESS, 0); | 52 | 5.11k | } | 53 | | | 54 | 2.65k | const char16_t *start = input; | 55 | 2.65k | const char16_t *end = input + size; | 56 | | | 57 | 2.65k | const auto v_d8 = simd8<uint8_t>::splat(0xd8); | 58 | 2.65k | const auto v_f8 = simd8<uint8_t>::splat(0xf8); | 59 | 2.65k | const auto v_fc = simd8<uint8_t>::splat(0xfc); | 60 | 2.65k | const auto v_dc = simd8<uint8_t>::splat(0xdc); | 61 | | | 62 | 14.5k | while (input + simd16<uint16_t>::SIZE * 2 < end) { | 63 | | // 0. Load data: since the validation takes into account only higher | 64 | | // byte of each word, we compress the two vectors into one which | 65 | | // consists only the higher bytes. | 66 | 12.0k | auto in0 = simd16<uint16_t>(input); | 67 | 12.0k | auto in1 = | 68 | 12.0k | simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); | 69 | | | 70 | | // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 | 71 | | // and yields a single vector having only higher bytes of characters. | 72 | 12.0k | const auto in = utf16_gather_high_bytes<big_endian>(in0, in1); | 73 | | | 74 | | // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). | 75 | 12.0k | const auto surrogates_wordmask = (in & v_f8) == v_d8; | 76 | 12.0k | const uint16_t surrogates_bitmask = | 77 | 12.0k | static_cast<uint16_t>(surrogates_wordmask.to_bitmask()); | 78 | 12.0k | if (surrogates_bitmask == 0x0000) { | 79 | 11.0k | input += 16; | 80 | 11.0k | } else { | 81 | | // 2. 
We have some surrogates that have to be distinguished: | 82 | | // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) | 83 | | // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) | 84 | | // | 85 | | // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) | 86 | | | 87 | | // V - non-surrogate code units | 88 | | // V = not surrogates_wordmask | 89 | 912 | const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask); | 90 | | | 91 | | // H - word-mask for high surrogates: the six highest bits are 0b1101'11 | 92 | 912 | const auto vH = (in & v_fc) == v_dc; | 93 | 912 | const uint16_t H = static_cast<uint16_t>(vH.to_bitmask()); | 94 | | | 95 | | // L - word mask for low surrogates | 96 | | // L = not H and surrogates_wordmask | 97 | 912 | const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask); | 98 | | | 99 | 912 | const uint16_t a = static_cast<uint16_t>( | 100 | 912 | L & (H >> 1)); // A low surrogate must be followed by high one. | 101 | | // (A low surrogate placed in the 7th register's word | 102 | | // is an exception we handle.) | 103 | 912 | const uint16_t b = static_cast<uint16_t>( | 104 | 912 | a << 1); // Just mark that the opinput - startite fact is hold, | 105 | | // thanks to that we have only two masks for valid case. | 106 | 912 | const uint16_t c = static_cast<uint16_t>( | 107 | 912 | V | a | b); // Combine all the masks into the final one. | 108 | | | 109 | 912 | if (c == 0xffff) { | 110 | | // The whole input register contains valid UTF-16, i.e., | 111 | | // either single code units or proper surrogate pairs. | 112 | 560 | input += 16; | 113 | 560 | } else if (c == 0x7fff) { | 114 | | // The 15 lower code units of the input register contains valid UTF-16. | 115 | | // The 15th word may be either a low or high surrogate. It the next | 116 | | // iteration we 1) check if the low surrogate is followed by a high | 117 | | // one, 2) reject sole high surrogate. 
| 118 | 288 | input += 15; | 119 | 288 | } else { | 120 | 64 | return result(error_code::SURROGATE, input - start); | 121 | 64 | } | 122 | 912 | } | 123 | 12.0k | } | 124 | | | 125 | 2.59k | return result(error_code::SUCCESS, input - start); | 126 | 2.65k | } |
simdutf.cpp:simdutf::result const simdutf::haswell::(anonymous namespace)::utf16::validate_utf16_with_errors<(simdutf::endianness)1>(char16_t const*, unsigned long) Line | Count | Source | 49 | 7.75k | const result validate_utf16_with_errors(const char16_t *input, size_t size) { | 50 | 7.75k | if (simdutf_unlikely(size == 0)) { | 51 | 5.12k | return result(error_code::SUCCESS, 0); | 52 | 5.12k | } | 53 | | | 54 | 2.63k | const char16_t *start = input; | 55 | 2.63k | const char16_t *end = input + size; | 56 | | | 57 | 2.63k | const auto v_d8 = simd8<uint8_t>::splat(0xd8); | 58 | 2.63k | const auto v_f8 = simd8<uint8_t>::splat(0xf8); | 59 | 2.63k | const auto v_fc = simd8<uint8_t>::splat(0xfc); | 60 | 2.63k | const auto v_dc = simd8<uint8_t>::splat(0xdc); | 61 | | | 62 | 14.5k | while (input + simd16<uint16_t>::SIZE * 2 < end) { | 63 | | // 0. Load data: since the validation takes into account only higher | 64 | | // byte of each word, we compress the two vectors into one which | 65 | | // consists only the higher bytes. | 66 | 12.0k | auto in0 = simd16<uint16_t>(input); | 67 | 12.0k | auto in1 = | 68 | 12.0k | simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); | 69 | | | 70 | | // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 | 71 | | // and yields a single vector having only higher bytes of characters. | 72 | 12.0k | const auto in = utf16_gather_high_bytes<big_endian>(in0, in1); | 73 | | | 74 | | // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). | 75 | 12.0k | const auto surrogates_wordmask = (in & v_f8) == v_d8; | 76 | 12.0k | const uint16_t surrogates_bitmask = | 77 | 12.0k | static_cast<uint16_t>(surrogates_wordmask.to_bitmask()); | 78 | 12.0k | if (surrogates_bitmask == 0x0000) { | 79 | 10.9k | input += 16; | 80 | 10.9k | } else { | 81 | | // 2. 
We have some surrogates that have to be distinguished: | 82 | | // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) | 83 | | // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) | 84 | | // | 85 | | // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) | 86 | | | 87 | | // V - non-surrogate code units | 88 | | // V = not surrogates_wordmask | 89 | 1.09k | const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask); | 90 | | | 91 | | // H - word-mask for high surrogates: the six highest bits are 0b1101'11 | 92 | 1.09k | const auto vH = (in & v_fc) == v_dc; | 93 | 1.09k | const uint16_t H = static_cast<uint16_t>(vH.to_bitmask()); | 94 | | | 95 | | // L - word mask for low surrogates | 96 | | // L = not H and surrogates_wordmask | 97 | 1.09k | const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask); | 98 | | | 99 | 1.09k | const uint16_t a = static_cast<uint16_t>( | 100 | 1.09k | L & (H >> 1)); // A low surrogate must be followed by high one. | 101 | | // (A low surrogate placed in the 7th register's word | 102 | | // is an exception we handle.) | 103 | 1.09k | const uint16_t b = static_cast<uint16_t>( | 104 | 1.09k | a << 1); // Just mark that the opinput - startite fact is hold, | 105 | | // thanks to that we have only two masks for valid case. | 106 | 1.09k | const uint16_t c = static_cast<uint16_t>( | 107 | 1.09k | V | a | b); // Combine all the masks into the final one. | 108 | | | 109 | 1.09k | if (c == 0xffff) { | 110 | | // The whole input register contains valid UTF-16, i.e., | 111 | | // either single code units or proper surrogate pairs. | 112 | 688 | input += 16; | 113 | 688 | } else if (c == 0x7fff) { | 114 | | // The 15 lower code units of the input register contains valid UTF-16. | 115 | | // The 15th word may be either a low or high surrogate. It the next | 116 | | // iteration we 1) check if the low surrogate is followed by a high | 117 | | // one, 2) reject sole high surrogate. 
| 118 | 308 | input += 15; | 119 | 308 | } else { | 120 | 100 | return result(error_code::SURROGATE, input - start); | 121 | 100 | } | 122 | 1.09k | } | 123 | 12.0k | } | 124 | | | 125 | 2.53k | return result(error_code::SUCCESS, input - start); | 126 | 2.63k | } |
simdutf.cpp:simdutf::result const simdutf::westmere::(anonymous namespace)::utf16::validate_utf16_with_errors<(simdutf::endianness)0>(char16_t const*, unsigned long) Line | Count | Source | 49 | 7.36k | const result validate_utf16_with_errors(const char16_t *input, size_t size) { | 50 | 7.36k | if (simdutf_unlikely(size == 0)) { | 51 | 5.12k | return result(error_code::SUCCESS, 0); | 52 | 5.12k | } | 53 | | | 54 | 2.23k | const char16_t *start = input; | 55 | 2.23k | const char16_t *end = input + size; | 56 | | | 57 | 2.23k | const auto v_d8 = simd8<uint8_t>::splat(0xd8); | 58 | 2.23k | const auto v_f8 = simd8<uint8_t>::splat(0xf8); | 59 | 2.23k | const auto v_fc = simd8<uint8_t>::splat(0xfc); | 60 | 2.23k | const auto v_dc = simd8<uint8_t>::splat(0xdc); | 61 | | | 62 | 19.3k | while (input + simd16<uint16_t>::SIZE * 2 < end) { | 63 | | // 0. Load data: since the validation takes into account only higher | 64 | | // byte of each word, we compress the two vectors into one which | 65 | | // consists only the higher bytes. | 66 | 17.2k | auto in0 = simd16<uint16_t>(input); | 67 | 17.2k | auto in1 = | 68 | 17.2k | simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); | 69 | | | 70 | | // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 | 71 | | // and yields a single vector having only higher bytes of characters. | 72 | 17.2k | const auto in = utf16_gather_high_bytes<big_endian>(in0, in1); | 73 | | | 74 | | // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). | 75 | 17.2k | const auto surrogates_wordmask = (in & v_f8) == v_d8; | 76 | 17.2k | const uint16_t surrogates_bitmask = | 77 | 17.2k | static_cast<uint16_t>(surrogates_wordmask.to_bitmask()); | 78 | 17.2k | if (surrogates_bitmask == 0x0000) { | 79 | 15.7k | input += 16; | 80 | 15.7k | } else { | 81 | | // 2. 
We have some surrogates that have to be distinguished: | 82 | | // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) | 83 | | // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) | 84 | | // | 85 | | // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) | 86 | | | 87 | | // V - non-surrogate code units | 88 | | // V = not surrogates_wordmask | 89 | 1.43k | const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask); | 90 | | | 91 | | // H - word-mask for high surrogates: the six highest bits are 0b1101'11 | 92 | 1.43k | const auto vH = (in & v_fc) == v_dc; | 93 | 1.43k | const uint16_t H = static_cast<uint16_t>(vH.to_bitmask()); | 94 | | | 95 | | // L - word mask for low surrogates | 96 | | // L = not H and surrogates_wordmask | 97 | 1.43k | const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask); | 98 | | | 99 | 1.43k | const uint16_t a = static_cast<uint16_t>( | 100 | 1.43k | L & (H >> 1)); // A low surrogate must be followed by high one. | 101 | | // (A low surrogate placed in the 7th register's word | 102 | | // is an exception we handle.) | 103 | 1.43k | const uint16_t b = static_cast<uint16_t>( | 104 | 1.43k | a << 1); // Just mark that the opinput - startite fact is hold, | 105 | | // thanks to that we have only two masks for valid case. | 106 | 1.43k | const uint16_t c = static_cast<uint16_t>( | 107 | 1.43k | V | a | b); // Combine all the masks into the final one. | 108 | | | 109 | 1.43k | if (c == 0xffff) { | 110 | | // The whole input register contains valid UTF-16, i.e., | 111 | | // either single code units or proper surrogate pairs. | 112 | 900 | input += 16; | 113 | 900 | } else if (c == 0x7fff) { | 114 | | // The 15 lower code units of the input register contains valid UTF-16. | 115 | | // The 15th word may be either a low or high surrogate. It the next | 116 | | // iteration we 1) check if the low surrogate is followed by a high | 117 | | // one, 2) reject sole high surrogate. 
| 118 | 405 | input += 15; | 119 | 405 | } else { | 120 | 128 | return result(error_code::SURROGATE, input - start); | 121 | 128 | } | 122 | 1.43k | } | 123 | 17.2k | } | 124 | | | 125 | 2.11k | return result(error_code::SUCCESS, input - start); | 126 | 2.23k | } |
simdutf.cpp:simdutf::result const simdutf::westmere::(anonymous namespace)::utf16::validate_utf16_with_errors<(simdutf::endianness)1>(char16_t const*, unsigned long) Line | Count | Source | 49 | 7.34k | const result validate_utf16_with_errors(const char16_t *input, size_t size) { | 50 | 7.34k | if (simdutf_unlikely(size == 0)) { | 51 | 5.14k | return result(error_code::SUCCESS, 0); | 52 | 5.14k | } | 53 | | | 54 | 2.20k | const char16_t *start = input; | 55 | 2.20k | const char16_t *end = input + size; | 56 | | | 57 | 2.20k | const auto v_d8 = simd8<uint8_t>::splat(0xd8); | 58 | 2.20k | const auto v_f8 = simd8<uint8_t>::splat(0xf8); | 59 | 2.20k | const auto v_fc = simd8<uint8_t>::splat(0xfc); | 60 | 2.20k | const auto v_dc = simd8<uint8_t>::splat(0xdc); | 61 | | | 62 | 19.5k | while (input + simd16<uint16_t>::SIZE * 2 < end) { | 63 | | // 0. Load data: since the validation takes into account only higher | 64 | | // byte of each word, we compress the two vectors into one which | 65 | | // consists only the higher bytes. | 66 | 17.5k | auto in0 = simd16<uint16_t>(input); | 67 | 17.5k | auto in1 = | 68 | 17.5k | simd16<uint16_t>(input + simd16<uint16_t>::SIZE / sizeof(char16_t)); | 69 | | | 70 | | // Function `utf16_gather_high_bytes` consumes two vectors of UTF-16 | 71 | | // and yields a single vector having only higher bytes of characters. | 72 | 17.5k | const auto in = utf16_gather_high_bytes<big_endian>(in0, in1); | 73 | | | 74 | | // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). | 75 | 17.5k | const auto surrogates_wordmask = (in & v_f8) == v_d8; | 76 | 17.5k | const uint16_t surrogates_bitmask = | 77 | 17.5k | static_cast<uint16_t>(surrogates_wordmask.to_bitmask()); | 78 | 17.5k | if (surrogates_bitmask == 0x0000) { | 79 | 16.0k | input += 16; | 80 | 16.0k | } else { | 81 | | // 2. 
We have some surrogates that have to be distinguished: | 82 | | // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) | 83 | | // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) | 84 | | // | 85 | | // Fact: high surrogate has 11th bit set (3rd bit in the higher byte) | 86 | | | 87 | | // V - non-surrogate code units | 88 | | // V = not surrogates_wordmask | 89 | 1.45k | const uint16_t V = static_cast<uint16_t>(~surrogates_bitmask); | 90 | | | 91 | | // H - word-mask for high surrogates: the six highest bits are 0b1101'11 | 92 | 1.45k | const auto vH = (in & v_fc) == v_dc; | 93 | 1.45k | const uint16_t H = static_cast<uint16_t>(vH.to_bitmask()); | 94 | | | 95 | | // L - word mask for low surrogates | 96 | | // L = not H and surrogates_wordmask | 97 | 1.45k | const uint16_t L = static_cast<uint16_t>(~H & surrogates_bitmask); | 98 | | | 99 | 1.45k | const uint16_t a = static_cast<uint16_t>( | 100 | 1.45k | L & (H >> 1)); // A low surrogate must be followed by high one. | 101 | | // (A low surrogate placed in the 7th register's word | 102 | | // is an exception we handle.) | 103 | 1.45k | const uint16_t b = static_cast<uint16_t>( | 104 | 1.45k | a << 1); // Just mark that the opinput - startite fact is hold, | 105 | | // thanks to that we have only two masks for valid case. | 106 | 1.45k | const uint16_t c = static_cast<uint16_t>( | 107 | 1.45k | V | a | b); // Combine all the masks into the final one. | 108 | | | 109 | 1.45k | if (c == 0xffff) { | 110 | | // The whole input register contains valid UTF-16, i.e., | 111 | | // either single code units or proper surrogate pairs. | 112 | 908 | input += 16; | 113 | 908 | } else if (c == 0x7fff) { | 114 | | // The 15 lower code units of the input register contains valid UTF-16. | 115 | | // The 15th word may be either a low or high surrogate. It the next | 116 | | // iteration we 1) check if the low surrogate is followed by a high | 117 | | // one, 2) reject sole high surrogate. 
| 118 | 413 | input += 15; | 119 | 413 | } else { | 120 | 136 | return result(error_code::SURROGATE, input - start); | 121 | 136 | } | 122 | 1.45k | } | 123 | 17.5k | } | 124 | | | 125 | 2.06k | return result(error_code::SUCCESS, input - start); | 126 | 2.20k | } |
|
127 | | |
128 | | template <endianness big_endian> |
129 | | const result validate_utf16_as_ascii_with_errors(const char16_t *input, |
130 | 0 | size_t size) { |
131 | 0 | if (simdutf_unlikely(size == 0)) { |
132 | 0 | return result(error_code::SUCCESS, 0); |
133 | 0 | } |
134 | 0 | size_t pos = 0; |
135 | 0 | for (; pos < size / 32 * 32; pos += 32) { |
136 | 0 | simd16x32<uint16_t> input_vec( |
137 | 0 | reinterpret_cast<const uint16_t *>(input + pos)); |
138 | 0 | if (!match_system(big_endian)) { |
139 | 0 | input_vec.swap_bytes(); |
140 | 0 | } |
141 | 0 | uint64_t matches = input_vec.lteq(uint16_t(0x7f)); |
142 | 0 | if (~matches) { |
143 | | // Found a match, return the first one |
144 | 0 | int index = trailing_zeroes(~matches) / 2; |
145 | 0 | return result(error_code::TOO_LARGE, pos + index); |
146 | 0 | } |
147 | 0 | } |
148 | | |
149 | | // Scalar tail |
150 | 0 | while (pos < size) { |
151 | 0 | char16_t v = big_endian ? scalar::u16_swap_bytes(input[pos]) : input[pos]; |
152 | 0 | if (v > 0x7F) { |
153 | 0 | return result(error_code::TOO_LARGE, pos); |
154 | 0 | } |
155 | 0 | pos++; |
156 | 0 | } |
157 | 0 | return result(error_code::SUCCESS, size); |
158 | 0 | } Unexecuted instantiation: simdutf.cpp:simdutf::result const simdutf::haswell::(anonymous namespace)::utf16::validate_utf16_as_ascii_with_errors<(simdutf::endianness)0>(char16_t const*, unsigned long) Unexecuted instantiation: simdutf.cpp:simdutf::result const simdutf::haswell::(anonymous namespace)::utf16::validate_utf16_as_ascii_with_errors<(simdutf::endianness)1>(char16_t const*, unsigned long) Unexecuted instantiation: simdutf.cpp:simdutf::result const simdutf::westmere::(anonymous namespace)::utf16::validate_utf16_as_ascii_with_errors<(simdutf::endianness)0>(char16_t const*, unsigned long) Unexecuted instantiation: simdutf.cpp:simdutf::result const simdutf::westmere::(anonymous namespace)::utf16::validate_utf16_as_ascii_with_errors<(simdutf::endianness)1>(char16_t const*, unsigned long) |
159 | | |
160 | | } // namespace utf16 |
161 | | } // unnamed namespace |
162 | | } // namespace SIMDUTF_IMPLEMENTATION |
163 | | } // namespace simdutf |