Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include <memory>
6 : #include <string>
7 : #include <vector>
8 :
9 : #include "src/unicode-decoder.h"
10 : #include "src/unicode-inl.h"
11 : #include "src/vector.h"
12 : #include "testing/gtest/include/gtest/gtest.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : namespace {
18 :
19 132 : void DecodeNormally(const std::vector<byte>& bytes,
20 : std::vector<unibrow::uchar>* output) {
21 132 : size_t cursor = 0;
22 1522 : while (cursor < bytes.size()) {
23 1390 : output->push_back(
24 695 : unibrow::Utf8::ValueOf(bytes.data() + cursor, bytes.size(), &cursor));
25 : }
26 132 : }
27 :
28 : template <size_t kBufferSize>
29 140 : void DecodeUtf16(unibrow::Utf8Decoder<kBufferSize>* decoder,
30 : const std::vector<byte>& bytes,
31 : std::vector<unibrow::uchar>* output) {
32 140 : auto vector = Vector<const char>::cast(VectorOf(bytes));
33 : decoder->Reset(vector);
34 :
35 140 : std::vector<uint16_t> utf16(decoder->Utf16Length());
36 140 : decoder->WriteUtf16(&(*utf16.begin()), decoder->Utf16Length(), vector);
37 :
38 : // Decode back into code points
39 1550 : for (size_t i = 0; i < utf16.size(); i++) {
40 705 : uint16_t b = utf16[i];
41 705 : if (unibrow::Utf16::IsLeadSurrogate(b)) {
42 80 : output->push_back(unibrow::Utf16::CombineSurrogatePair(b, utf16[++i]));
43 : } else {
44 1370 : output->push_back(b);
45 : }
46 : }
47 140 : }
48 :
49 132 : void DecodeIncrementally(const std::vector<byte>& bytes,
50 : std::vector<unibrow::uchar>* output) {
51 132 : unibrow::Utf8::Utf8IncrementalBuffer buffer = 0;
52 132 : unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
53 132 : const byte* cursor = &bytes[0];
54 : const byte* end = &bytes[bytes.size()];
55 2188 : while (cursor < end) {
56 : unibrow::uchar result =
57 1028 : unibrow::Utf8::ValueOfIncremental(&cursor, &state, &buffer);
58 1028 : if (result != unibrow::Utf8::kIncomplete) {
59 689 : output->push_back(result);
60 : }
61 : }
62 132 : unibrow::uchar result = unibrow::Utf8::ValueOfIncrementalFinish(&state);
63 132 : if (result != unibrow::Utf8::kBufferEmpty) {
64 6 : output->push_back(result);
65 : }
66 132 : }
67 :
68 : } // namespace
69 :
70 15443 : TEST(UnicodeTest, Utf16BufferReuse) {
71 : unibrow::Utf8Decoder<4> utf16_decoder;
72 :
73 : // Not enough continuation bytes before string ends.
74 : typedef struct {
75 : std::vector<byte> bytes;
76 : std::vector<unibrow::uchar> unicode_expected;
77 35 : } TestCase;
78 :
79 : TestCase data[] = {
80 : {{0x00}, {0x0}},
81 : {{0xC2, 0x80}, {0x80}},
82 : {{0xE0, 0xA0, 0x80}, {0x800}},
83 : {{0xF0, 0x90, 0x80, 0x80}, {0x10000}},
84 : {{0xE0, 0xA0, 0x80}, {0x800}},
85 : {{0xC2, 0x80}, {0x80}},
86 : {{0x00}, {0x0}},
87 8 : };
88 15 : for (auto test : data) {
89 : // For figuring out which test fails:
90 7 : fprintf(stderr, "test: ");
91 23 : for (auto b : test.bytes) {
92 16 : fprintf(stderr, "%x ", b);
93 : }
94 7 : fprintf(stderr, "\n");
95 :
96 : std::vector<unibrow::uchar> output_utf16;
97 7 : DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
98 :
99 7 : CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
100 21 : for (size_t i = 0; i < output_utf16.size(); ++i) {
101 14 : CHECK_EQ(output_utf16[i], test.unicode_expected[i]);
102 : }
103 : }
104 1 : }
105 :
106 15443 : TEST(UnicodeTest, SurrogateOverrunsBuffer) {
107 : unibrow::Utf8Decoder<2> utf16_decoder;
108 :
109 : std::vector<unibrow::uchar> output_utf16;
110 : // Not enough continuation bytes before string ends.
111 1 : DecodeUtf16(&utf16_decoder, {0x00, 0xF0, 0x90, 0x80, 0x80, 0x00},
112 1 : &output_utf16);
113 1 : CHECK_EQ(output_utf16[0], 0x00);
114 1 : CHECK_EQ(output_utf16[1], 0x10000);
115 1 : CHECK_EQ(output_utf16[0], 0x00);
116 1 : }
117 :
118 15443 : TEST(UnicodeTest, IncrementalUTF8DecodingVsNonIncrementalUtf8Decoding) {
119 : // Unfortunately, V8 has two UTF-8 decoders. This test checks that they
120 : // produce the same result. This test was inspired by
121 : // https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt .
122 : typedef struct {
123 : std::vector<byte> bytes;
124 : std::vector<unibrow::uchar> unicode_expected;
125 660 : } TestCase;
126 :
127 : TestCase data[] = {
128 : // Correct UTF-8 text.
129 : {{0xCE, 0xBA, 0xE1, 0xBD, 0xB9, 0xCF, 0x83, 0xCE, 0xBC, 0xCE, 0xB5},
130 : {0x3BA, 0x1F79, 0x3C3, 0x3BC, 0x3B5}},
131 :
132 : // First possible sequence of a certain length:
133 : // 1 byte
134 : {{0x00}, {0x0}},
135 : // 2 bytes
136 : {{0xC2, 0x80}, {0x80}},
137 : // 3 bytes
138 : {{0xE0, 0xA0, 0x80}, {0x800}},
139 : // 4 bytes
140 : {{0xF0, 0x90, 0x80, 0x80}, {0x10000}},
141 : // 5 bytes (not supported)
142 : {{0xF8, 0x88, 0x80, 0x80, 0x80},
143 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
144 : // 6 bytes (not supported)
145 : {{0xFC, 0x84, 0x80, 0x80, 0x80, 0x80},
146 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
147 :
148 : // Last possible sequence of certain length:
149 : // 1 byte
150 : {{0x7F}, {0x7F}},
151 : // 2 bytes
152 : {{0xDF, 0xBF}, {0x7FF}},
153 : // 3 bytes
154 : {{0xEF, 0xBF, 0xBF}, {0xFFFF}},
155 : // 4 bytes (this sequence is not a valid code point)
156 : {{0xF7, 0xBF, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
157 : // 5 bytes (not supported)
158 : {{0xFB, 0xBF, 0xBF, 0xBF, 0xBF},
159 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
160 : // 6 bytes (not supported)
161 : {{0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF},
162 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
163 : // Other boundary conditions:
164 : {{0xED, 0x9F, 0xBF}, {0xD7FF}},
165 : {{0xEE, 0x80, 0x80}, {0xE000}},
166 : // U+fffd (invalid code point)
167 : {{0xEF, 0xBF, 0xBD}, {0xFFFD}},
168 : // U+10ffff (last valid code point)
169 : {{0xF4, 0x8F, 0xBF, 0xBF}, {0x10FFFF}},
170 : // First invalid (too large) code point
171 : {{0xF4, 0x90, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
172 :
173 : // Malformed sequences:
174 : // Unexpected continuation bytes:
175 : // First continuation byte
176 : {{0x80}, {0xFFFD}},
177 : // Last continuation byte
178 : {{0xBF}, {0xFFFD}},
179 : // 2 continuation bytes
180 : {{0x80, 0xBF}, {0xFFFD, 0xFFFD}},
181 : // 3 continuation bytes
182 : {{0x80, 0xBF, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
183 : // 4 continuation bytes
184 : {{0x80, 0xBF, 0x80, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
185 : // 5 continuation bytes
186 : {{0x80, 0xBF, 0x80, 0xBF, 0x80},
187 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
188 : // 6 continuation bytes
189 : {{0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF},
190 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
191 : // 7 continuation bytes
192 : {{0x80, 0xBF, 0x80, 0xBF, 0x80, 0xBF, 0xBF},
193 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
194 : // Sequence of all 64 possible continuation bytes
195 : {{0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A,
196 : 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95,
197 : 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F, 0xA0,
198 : 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB,
199 : 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
200 : 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF},
201 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
202 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
203 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
204 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
205 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
206 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
207 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
208 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
209 : // Using each possible continuation byte in a two-byte sequence:
210 : {{0xD0, 0x80, 0xD0, 0x81, 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, 0xD0, 0x85,
211 : 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, 0xD0, 0x89, 0xD0, 0x8A, 0xD0, 0x8B,
212 : 0xD0, 0x8C, 0xD0, 0x8D, 0xD0, 0x8E, 0xD0, 0x8F, 0xD0, 0x90, 0xD0, 0x91,
213 : 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97,
214 : 0xD0, 0x98, 0xD0, 0x99, 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D,
215 : 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xD0, 0xA1, 0xD0, 0xA2, 0xD0, 0xA3,
216 : 0xD0, 0xA4, 0xD0, 0xA5, 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, 0xD0, 0xA9,
217 : 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, 0xD0, 0xAD, 0xD0, 0xAE, 0xD0, 0xAF,
218 : 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5,
219 : 0xD0, 0xB6, 0xD0, 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, 0xBB,
220 : 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, 0xBF},
221 : {0x400, 0x401, 0x402, 0x403, 0x404, 0x405, 0x406, 0x407, 0x408, 0x409,
222 : 0x40A, 0x40B, 0x40C, 0x40D, 0x40E, 0x40F, 0x410, 0x411, 0x412, 0x413,
223 : 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D,
224 : 0x41E, 0x41F, 0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427,
225 : 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F, 0x430, 0x431,
226 : 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B,
227 : 0x43C, 0x43D, 0x43E, 0x43F}},
228 :
229 : // Lonely first bytes:
230 : // All 32 first bytes of 32-byte sequences, each followed by a space
231 : // (generates 32 invalid char + space sequences.
232 : {{0xC0, 0x20, 0xC1, 0x20, 0xC2, 0x20, 0xC3, 0x20, 0xC4, 0x20, 0xC5,
233 : 0x20, 0xC6, 0x20, 0xC7, 0x20, 0xC8, 0x20, 0xC9, 0x20, 0xCA, 0x20,
234 : 0xCB, 0x20, 0xCC, 0x20, 0xCD, 0x20, 0xCE, 0x20, 0xCF, 0x20, 0xD0,
235 : 0x20, 0xD1, 0x20, 0xD2, 0x20, 0xD3, 0x20, 0xD4, 0x20, 0xD5, 0x20,
236 : 0xD6, 0x20, 0xD7, 0x20, 0xD8, 0x20, 0xD9, 0x20, 0xDA, 0x20, 0xDB,
237 : 0x20, 0xDC, 0x20, 0xDD, 0x20, 0xDE, 0x20, 0xDF, 0x20},
238 : {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
239 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
240 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
241 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
242 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
243 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
244 : 0xFFFD, 0x20, 0xFFFD, 0x20}},
245 : // All 16 first bytes of 3-byte sequences, each followed by a space
246 : // (generates 16 invalid char + space sequences):
247 : {{0xE0, 0x20, 0xE1, 0x20, 0xE2, 0x20, 0xE3, 0x20, 0xE4, 0x20, 0xE5,
248 : 0x20, 0xE6, 0x20, 0xE7, 0x20, 0xE8, 0x20, 0xE9, 0x20, 0xEA, 0x20,
249 : 0xEB, 0x20, 0xEC, 0x20, 0xED, 0x20, 0xEE, 0x20, 0xEF, 0x20},
250 : {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
251 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
252 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
253 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20}},
254 : // All 8 first bytes of 4-byte sequences, each followed by a space
255 : // (generates 8 invalid char + space sequences):
256 : {{0xF0, 0x20, 0xF1, 0x20, 0xF2, 0x20, 0xF3, 0x20, 0xF4, 0x20, 0xF5, 0x20,
257 : 0xF6, 0x20, 0xF7, 0x20},
258 : {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20,
259 : 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20}},
260 : // All 4 first bytes of 5-byte sequences (not supported), each followed by
261 : // a space (generates 4 invalid char + space sequences):
262 : {{0xF8, 0x20, 0xF9, 0x20, 0xFA, 0x20, 0xFB, 0x20},
263 : {0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20, 0xFFFD, 0x20}},
264 : // All 2 first bytes of 6-byte sequences (not supported), each followed by
265 : // a space (generates 2 invalid char + space sequences):
266 : {{0xFC, 0x20, 0xFD, 0x20}, {0xFFFD, 0x20, 0xFFFD, 0x20}},
267 :
268 : // Sequences with last continuation byte missing. Normally the whole
269 : // incomplete sequence generates a single invalid character (exceptions
270 : // explained below).
271 :
272 : // 2-byte sequences with last byte missing
273 : {{0xC0}, {0xFFFD}},
274 : {{0xDF}, {0xFFFD}},
275 : // 3-byte sequences with last byte missing.
276 : {{0xE8, 0x80}, {0xFFFD}},
277 : {{0xE0, 0xBF}, {0xFFFD}},
278 : {{0xEF, 0xBF}, {0xFFFD}},
279 : // Start of an overlong sequence. The first "maximal subpart" is the first
280 : // byte; it creates an invalid character. Each following byte generates an
281 : // invalid character too.
282 : {{0xE0, 0x80}, {0xFFFD, 0xFFFD}},
283 : // 4-byte sequences with last byte missing
284 : {{0xF1, 0x80, 0x80}, {0xFFFD}},
285 : {{0xF4, 0x8F, 0xBF}, {0xFFFD}},
286 : // Start of an overlong sequence. The first "maximal subpart" is the first
287 : // byte; it creates an invalid character. Each following byte generates an
288 : // invalid character too.
289 : {{0xF0, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
290 : // 5-byte sequences (not supported) with last byte missing
291 : {{0xF8, 0x80, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
292 : {{0xFB, 0xBF, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
293 : // 6-byte sequences (not supported) with last byte missing
294 : {{0xFC, 0x80, 0x80, 0x80, 0x80},
295 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
296 : {{0xFD, 0xBF, 0xBF, 0xBF, 0xBF},
297 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
298 :
299 : // Concatenation of incomplete sequences: above incomplete sequences
300 : // concatenated.
301 : {{0xC0, 0xDF, 0xE8, 0x80, 0xE0, 0xBF, 0xEF, 0xBF, 0xE0, 0x80,
302 : 0xF1, 0x80, 0x80, 0xF4, 0x8F, 0xBF, 0xF0, 0x80, 0x80, 0xF8,
303 : 0x80, 0x80, 0x80, 0xFB, 0xBF, 0xBF, 0xBF, 0xFC, 0x80, 0x80,
304 : 0x80, 0x80, 0xFD, 0xBF, 0xBF, 0xBF, 0xBF},
305 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
306 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
307 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
308 : 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
309 :
310 : // Incomplete sequence tests repeated with a space after the incomplete
311 : // sequence.
312 :
313 : // 2-byte sequences with last byte missing
314 : {{0xC0, 0x20}, {0xFFFD, 0x20}},
315 : {{0xDF, 0x20}, {0xFFFD, 0x20}},
316 : // 3-byte sequences with last byte missing
317 : {{0xE8, 0x80, 0x20}, {0xFFFD, 0x20}},
318 : {{0xE0, 0xBF, 0x20}, {0xFFFD, 0x20}},
319 : {{0xEF, 0xBF, 0x20}, {0xFFFD, 0x20}},
320 : // Start of overlong 3-byte sequence with last byte missing
321 : {{0xE0, 0x80, 0x20}, {0xFFFD, 0xFFFD, 0x20}},
322 : // 4-byte sequences with last byte missing
323 : {{0xF1, 0x80, 0x80, 0x20}, {0xFFFD, 0x20}},
324 : {{0xF4, 0x8F, 0xBF, 0x20}, {0xFFFD, 0x20}},
325 : // Start of overlong 4-byte sequence with last byte missing
326 : {{0xF0, 0x80, 0x80, 0x20}, {0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
327 : // 5-byte sequences (not supported) with last byte missing
328 : {{0xF8, 0x80, 0x80, 0x80, 0x20}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
329 : {{0xFB, 0xBF, 0xBF, 0xBF, 0x20}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
330 : // 6-byte sequences (not supported) with last byte missing
331 : {{0xFC, 0x80, 0x80, 0x80, 0x80, 0x20},
332 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
333 : {{0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0x20},
334 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x20}},
335 :
336 : // Impossible bytes
337 : {{0xFE}, {0xFFFD}},
338 : {{0xFF}, {0xFFFD}},
339 : {{0xFE, 0xFE, 0xFF, 0xFF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
340 : // Lead-byte-like bytes which aren't valid lead bytes.
341 : {{0xC0}, {0xFFFD}},
342 : {{0xC0, 0xAA}, {0xFFFD, 0xFFFD}},
343 : {{0xC1}, {0xFFFD}},
344 : {{0xC1, 0xAA}, {0xFFFD, 0xFFFD}},
345 : {{0xF5}, {0xFFFD}},
346 : {{0xF5, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
347 : {{0xF6}, {0xFFFD}},
348 : {{0xF6, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
349 : {{0xF7}, {0xFFFD}},
350 : {{0xF7, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
351 : {{0xF8}, {0xFFFD}},
352 : {{0xF8, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
353 : {{0xF9}, {0xFFFD}},
354 : {{0xF9, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
355 : {{0xFA}, {0xFFFD}},
356 : {{0xFA, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
357 : {{0xFB}, {0xFFFD}},
358 : {{0xFB, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
359 : {{0xFC}, {0xFFFD}},
360 : {{0xFC, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
361 : {{0xFD}, {0xFFFD}},
362 : {{0xFD, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
363 : {{0xFE}, {0xFFFD}},
364 : {{0xFE, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
365 : {{0xFF}, {0xFFFD}},
366 : {{0xFF, 0xAA, 0xAA, 0xAA}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
367 :
368 : // Overlong sequences:
369 :
370 : // Overlong encodings for "/"
371 : {{0xC0, 0xAF}, {0xFFFD, 0xFFFD}},
372 : {{0xE0, 0x80, 0xAF}, {0xFFFD, 0xFFFD, 0xFFFD}},
373 : {{0xF0, 0x80, 0x80, 0xAF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
374 : // 5-byte sequence (not supported anyway)
375 : {{0xF8, 0x80, 0x80, 0x80, 0xAF},
376 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
377 : // 6-byte sequence (not supported anyway)
378 : {{0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF},
379 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
380 :
381 : // Maximum overlong sequences
382 : {{0xC1, 0xBF}, {0xFFFD, 0xFFFD}},
383 : {{0xE0, 0x9F, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
384 : {{0xF0, 0x8F, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
385 : // 5-byte sequence (not supported anyway)
386 : {{0xF8, 0x87, 0xBF, 0xBF, 0xBF},
387 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
388 : // 6-byte sequence (not supported anyway)
389 : {{0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF},
390 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
391 :
392 : // Overlong encodings for 0
393 : {{0xC0, 0x80}, {0xFFFD, 0xFFFD}},
394 : {{0xE0, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
395 : {{0xF0, 0x80, 0x80, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
396 : // 5-byte sequence (not supported anyway)
397 : {{0xF8, 0x80, 0x80, 0x80, 0x80},
398 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
399 : // 6-byte sequence (not supported anyway)
400 : {{0xFC, 0x80, 0x80, 0x80, 0x80, 0x80},
401 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
402 :
403 : // Illegal code positions:
404 :
405 : // Single UTF-16 surrogates
406 : {{0xED, 0xA0, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
407 : {{0xED, 0xA0, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
408 : {{0xED, 0xAD, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
409 : {{0xED, 0xAE, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
410 : {{0xED, 0xAF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
411 : {{0xED, 0xB0, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
412 : {{0xED, 0xBE, 0x80}, {0xFFFD, 0xFFFD, 0xFFFD}},
413 : {{0xED, 0xBF, 0xBF}, {0xFFFD, 0xFFFD, 0xFFFD}},
414 :
415 : // Paired surrogates
416 : {{0xED, 0xA0, 0x80, 0xED, 0xB0, 0x80},
417 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
418 : {{0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF},
419 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
420 : {{0xED, 0xAD, 0xBF, 0xED, 0xB0, 0x80},
421 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
422 : {{0xED, 0xAD, 0xBF, 0xED, 0xBF, 0xBF},
423 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
424 : {{0xED, 0xAE, 0x80, 0xED, 0xB0, 0x80},
425 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
426 : {{0xED, 0xAE, 0x80, 0xED, 0xBF, 0xBF},
427 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
428 : {{0xED, 0xAF, 0xBF, 0xED, 0xB0, 0x80},
429 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
430 : {{0xED, 0xAF, 0xBF, 0xED, 0xBF, 0xBF},
431 : {0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD}},
432 :
433 : // Surrogates with the last byte missing.
434 : {{0xED, 0xA0}, {0xFFFD, 0xFFFD}},
435 : {{0xED, 0xA0}, {0xFFFD, 0xFFFD}},
436 : {{0xED, 0xAD}, {0xFFFD, 0xFFFD}},
437 : {{0xED, 0xAE}, {0xFFFD, 0xFFFD}},
438 : {{0xED, 0xAF}, {0xFFFD, 0xFFFD}},
439 : {{0xED, 0xB0}, {0xFFFD, 0xFFFD}},
440 : {{0xED, 0xBE}, {0xFFFD, 0xFFFD}},
441 : {{0xED, 0xBF}, {0xFFFD, 0xFFFD}},
442 :
443 : // Other non-characters
444 : {{0xEF, 0xBF, 0xBE}, {0xFFFE}},
445 : {{0xEF, 0xBF, 0xBF}, {0xFFFF}},
446 : {{0xEF, 0xB7, 0x90, 0xEF, 0xB7, 0x91, 0xEF, 0xB7, 0x92, 0xEF, 0xB7, 0x93,
447 : 0xEF, 0xB7, 0x94, 0xEF, 0xB7, 0x95, 0xEF, 0xB7, 0x96, 0xEF, 0xB7, 0x97,
448 : 0xEF, 0xB7, 0x98, 0xEF, 0xB7, 0x99, 0xEF, 0xB7, 0x9A, 0xEF, 0xB7, 0x9B,
449 : 0xEF, 0xB7, 0x9C, 0xEF, 0xB7, 0x9D, 0xEF, 0xB7, 0x9E, 0xEF, 0xB7, 0x9F,
450 : 0xEF, 0xB7, 0xA0, 0xEF, 0xB7, 0xA1, 0xEF, 0xB7, 0xA2, 0xEF, 0xB7, 0xA3,
451 : 0xEF, 0xB7, 0xA4, 0xEF, 0xB7, 0xA5, 0xEF, 0xB7, 0xA6, 0xEF, 0xB7, 0xA7,
452 : 0xEF, 0xB7, 0xA8, 0xEF, 0xB7, 0xA9, 0xEF, 0xB7, 0xAA, 0xEF, 0xB7, 0xAB,
453 : 0xEF, 0xB7, 0xAC, 0xEF, 0xB7, 0xAD, 0xEF, 0xB7, 0xAE, 0xEF, 0xB7, 0xAF},
454 : {0xFDD0, 0xFDD1, 0xFDD2, 0xFDD3, 0xFDD4, 0xFDD5, 0xFDD6, 0xFDD7,
455 : 0xFDD8, 0xFDD9, 0xFDDA, 0xFDDB, 0xFDDC, 0xFDDD, 0xFDDE, 0xFDDF,
456 : 0xFDE0, 0xFDE1, 0xFDE2, 0xFDE3, 0xFDE4, 0xFDE5, 0xFDE6, 0xFDE7,
457 : 0xFDE8, 0xFDE9, 0xFDEA, 0xFDEB, 0xFDEC, 0xFDED, 0xFDEE, 0xFDEF}},
458 : {{0xF0, 0x9F, 0xBF, 0xBE, 0xF0, 0x9F, 0xBF, 0xBF, 0xF0, 0xAF, 0xBF,
459 : 0xBE, 0xF0, 0xAF, 0xBF, 0xBF, 0xF0, 0xBF, 0xBF, 0xBE, 0xF0, 0xBF,
460 : 0xBF, 0xBF, 0xF1, 0x8F, 0xBF, 0xBE, 0xF1, 0x8F, 0xBF, 0xBF, 0xF1,
461 : 0x9F, 0xBF, 0xBE, 0xF1, 0x9F, 0xBF, 0xBF, 0xF1, 0xAF, 0xBF, 0xBE,
462 : 0xF1, 0xAF, 0xBF, 0xBF, 0xF1, 0xBF, 0xBF, 0xBE, 0xF1, 0xBF, 0xBF,
463 : 0xBF, 0xF2, 0x8F, 0xBF, 0xBE, 0xF2, 0x8F, 0xBF, 0xBF},
464 : {0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF,
465 : 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
466 : 0x8FFFF}},
467 133 : };
468 :
469 : unibrow::Utf8Decoder<50> utf16_decoder;
470 :
471 265 : for (auto test : data) {
472 : // For figuring out which test fails:
473 132 : fprintf(stderr, "test: ");
474 1052 : for (auto b : test.bytes) {
475 920 : fprintf(stderr, "%x ", b);
476 : }
477 132 : fprintf(stderr, "\n");
478 :
479 : std::vector<unibrow::uchar> output_normal;
480 132 : DecodeNormally(test.bytes, &output_normal);
481 :
482 132 : CHECK_EQ(output_normal.size(), test.unicode_expected.size());
483 1522 : for (size_t i = 0; i < output_normal.size(); ++i) {
484 1390 : CHECK_EQ(output_normal[i], test.unicode_expected[i]);
485 : }
486 :
487 : std::vector<unibrow::uchar> output_incremental;
488 132 : DecodeIncrementally(test.bytes, &output_incremental);
489 :
490 132 : CHECK_EQ(output_incremental.size(), test.unicode_expected.size());
491 1522 : for (size_t i = 0; i < output_incremental.size(); ++i) {
492 1390 : CHECK_EQ(output_incremental[i], test.unicode_expected[i]);
493 : }
494 :
495 : std::vector<unibrow::uchar> output_utf16;
496 132 : DecodeUtf16(&utf16_decoder, test.bytes, &output_utf16);
497 :
498 132 : CHECK_EQ(output_utf16.size(), test.unicode_expected.size());
499 1522 : for (size_t i = 0; i < output_utf16.size(); ++i) {
500 1390 : CHECK_EQ(output_utf16[i], test.unicode_expected[i]);
501 : }
502 : }
503 1 : }
504 :
505 : } // namespace internal
506 9264 : } // namespace v8
|