Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/factory.h" // for i::Factory::NewExternalStringFrom*Byte
6 : #include "src/feedback-vector-inl.h" // for include "src/factory.h"
7 : #include "src/objects-inl.h"
8 : #include "src/parsing/scanner-character-streams.h"
9 : #include "src/parsing/scanner.h"
10 : #include "test/cctest/cctest.h"
11 :
12 : namespace {
13 :
14 : // Implement ExternalSourceStream based on const char**.
15 : // This will take each string as one chunk. The last chunk must be empty.
16 : class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
17 : public:
18 558 : explicit ChunkSource(const char** chunks) : current_(0) {
19 642 : do {
20 : chunks_.push_back(
21 1284 : {reinterpret_cast<const uint8_t*>(*chunks), strlen(*chunks)});
22 642 : chunks++;
23 642 : } while (chunks_.back().len > 0);
24 186 : }
25 234 : explicit ChunkSource(const char* chunks) : current_(0) {
26 186 : do {
27 : chunks_.push_back(
28 372 : {reinterpret_cast<const uint8_t*>(chunks), strlen(chunks)});
29 186 : chunks += strlen(chunks) + 1;
30 186 : } while (chunks_.back().len > 0);
31 78 : }
32 210 : ChunkSource(const uint8_t* data, size_t len, bool extra_chunky)
33 420 : : current_(0) {
34 : // If extra_chunky, we'll use increasingly large chunk sizes.
35 : // If not, we'll have a single chunk of full length.
36 210 : size_t chunk_size = extra_chunky ? 1 : len;
37 2352 : for (size_t i = 0; i < len; i += chunk_size, chunk_size++) {
38 6426 : chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
39 : }
40 420 : chunks_.push_back({nullptr, 0});
41 210 : }
42 474 : ~ChunkSource() {}
43 0 : bool SetBookmark() override { return false; }
44 0 : void ResetToBookmark() override {}
45 3168 : size_t GetMoreData(const uint8_t** src) override {
46 : DCHECK_LT(current_, chunks_.size());
47 3168 : Chunk& next = chunks_[current_++];
48 3168 : uint8_t* chunk = new uint8_t[next.len];
49 3168 : i::MemMove(chunk, next.ptr, next.len);
50 3168 : *src = chunk;
51 3168 : return next.len;
52 : }
53 :
54 : private:
55 : struct Chunk {
56 : const uint8_t* ptr;
57 : size_t len;
58 : };
59 : std::vector<Chunk> chunks_;
60 : size_t current_;
61 : };
62 :
63 : class TestExternalResource : public v8::String::ExternalStringResource {
64 : public:
65 : explicit TestExternalResource(uint16_t* data, int length)
66 36 : : data_(data), length_(static_cast<size_t>(length)) {}
67 :
68 36 : ~TestExternalResource() {}
69 :
70 78 : const uint16_t* data() const { return data_; }
71 36 : size_t length() const { return length_; }
72 :
73 : private:
74 : uint16_t* data_;
75 : size_t length_;
76 : };
77 :
78 36 : class TestExternalOneByteResource
79 : : public v8::String::ExternalOneByteStringResource {
80 : public:
81 : TestExternalOneByteResource(const char* data, size_t length)
82 36 : : data_(data), length_(length) {}
83 :
84 60 : const char* data() const { return data_; }
85 36 : size_t length() const { return length_; }
86 :
87 : private:
88 : const char* data_;
89 : size_t length_;
90 : };
91 :
// Test string that contains UTF-8 sequences of every length (1 to 4 bytes).
const char unicode_utf8[] =
    "abc"               // three ASCII characters
    "\xc3\xa4"          // 2-byte sequence: U+00E4 (a with umlaut)
    "\xe2\xa8\xa0"      // 3-byte sequence: U+2A20 (math symbol ">>")
    "\xf0\x9f\x92\xa9"  // 4-byte sequence: U+1F4A9, a surrogate pair in UTF-16
    "def";              // three more ASCII characters
// The same text as UTF-16 code units, terminated by 0.
const uint16_t unicode_ucs2[] = {'a',    'b',    'c', 0x00E4, 0x2A20, 0xD83D,
                                 0xDCA9, 'd',    'e', 'f',    0};
102 :
103 : } // anonymous namespace
104 :
105 23724 : TEST(Utf8StreamAsciiOnly) {
106 6 : const char* chunks[] = {"abc", "def", "ghi", ""};
107 6 : ChunkSource chunk_source(chunks);
108 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
109 : v8::internal::ScannerStream::For(
110 6 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
111 :
112 : // Read the data without dying.
113 : v8::internal::uc32 c;
114 60 : do {
115 60 : c = stream->Advance();
116 : } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
117 6 : }
118 :
119 23724 : TEST(Utf8StreamBOM) {
120 : // Construct test string w/ UTF-8 BOM (byte order mark)
121 6 : char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
122 : strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));
123 :
124 6 : const char* chunks[] = {data, "\0"};
125 6 : ChunkSource chunk_source(chunks);
126 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
127 : v8::internal::ScannerStream::For(
128 6 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
129 :
130 : // Read the data without tripping over the BOM.
131 66 : for (size_t i = 0; unicode_ucs2[i]; i++) {
132 120 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
133 : }
134 6 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());
135 :
136 : // Make sure seek works.
137 6 : stream->Seek(0);
138 12 : CHECK_EQ(unicode_ucs2[0], stream->Advance());
139 :
140 6 : stream->Seek(5);
141 12 : CHECK_EQ(unicode_ucs2[5], stream->Advance());
142 :
143 : // Try again, but make sure we have to seek 'backwards'.
144 30 : while (v8::internal::Utf16CharacterStream::kEndOfInput != stream->Advance()) {
145 : // Do nothing. We merely advance the stream to the end of its input.
146 : }
147 6 : stream->Seek(5);
148 12 : CHECK_EQ(unicode_ucs2[5], stream->Advance());
149 6 : }
150 :
151 23724 : TEST(Utf8SplitBOM) {
152 : // Construct chunks with a BOM split into two chunks.
153 6 : char partial_bom[] = "\xef\xbb";
154 6 : char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
155 : strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));
156 :
157 : {
158 6 : const char* chunks[] = {partial_bom, data, "\0"};
159 6 : ChunkSource chunk_source(chunks);
160 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
161 : v8::internal::ScannerStream::For(
162 6 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
163 :
164 : // Read the data without tripping over the BOM.
165 66 : for (size_t i = 0; unicode_ucs2[i]; i++) {
166 120 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
167 : }
168 : }
169 :
170 : // And now with single-byte BOM chunks.
171 6 : char bom_byte_1[] = "\xef";
172 6 : char bom_byte_2[] = "\xbb";
173 : {
174 6 : const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
175 6 : ChunkSource chunk_source(chunks);
176 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
177 : v8::internal::ScannerStream::For(
178 6 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
179 :
180 : // Read the data without tripping over the BOM.
181 66 : for (size_t i = 0; unicode_ucs2[i]; i++) {
182 120 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
183 : }
184 : }
185 6 : }
186 :
187 23724 : TEST(Utf8ChunkBoundaries) {
188 : // Test utf-8 parsing at chunk boundaries.
189 :
190 : // Split the test string at each byte and pass it to the stream. This way,
191 : // we'll have a split at each possible boundary.
192 : size_t len = strlen(unicode_utf8);
193 : char buffer[arraysize(unicode_utf8) + 3];
194 96 : for (size_t i = 1; i < len; i++) {
195 : // Copy source string into buffer, splitting it at i.
196 : // Then add three chunks, 0..i-1, i..strlen-1, empty.
197 : strncpy(buffer, unicode_utf8, i);
198 84 : strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
199 84 : buffer[i] = '\0';
200 84 : buffer[len + 1] = '\0';
201 84 : buffer[len + 2] = '\0';
202 84 : const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
203 :
204 84 : ChunkSource chunk_source(chunks);
205 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
206 : v8::internal::ScannerStream::For(
207 84 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
208 :
209 924 : for (size_t i = 0; unicode_ucs2[i]; i++) {
210 1680 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
211 : }
212 84 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
213 : stream->Advance());
214 : }
215 6 : }
216 :
217 23724 : TEST(Utf8SingleByteChunks) {
218 : // Have each byte as a single-byte chunk.
219 : size_t len = strlen(unicode_utf8);
220 : char buffer[arraysize(unicode_utf8) + 4];
221 90 : for (size_t i = 1; i < len - 1; i++) {
222 : // Copy source string into buffer, make a single-byte chunk at i.
223 : strncpy(buffer, unicode_utf8, i);
224 78 : strncpy(buffer + i + 3, unicode_utf8 + i + 1, len - i - 1);
225 78 : buffer[i] = '\0';
226 78 : buffer[i + 1] = unicode_utf8[i];
227 78 : buffer[i + 2] = '\0';
228 78 : buffer[len + 2] = '\0';
229 78 : buffer[len + 3] = '\0';
230 : const char* chunks[] = {buffer, buffer + i + 1, buffer + i + 3,
231 78 : buffer + len + 3};
232 :
233 78 : ChunkSource chunk_source(chunks);
234 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
235 : v8::internal::ScannerStream::For(
236 78 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
237 :
238 858 : for (size_t j = 0; unicode_ucs2[j]; j++) {
239 1560 : CHECK_EQ(unicode_ucs2[j], stream->Advance());
240 : }
241 78 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
242 : stream->Advance());
243 : }
244 6 : }
245 :
246 : #define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
247 :
248 324 : void TestCharacterStream(const char* reference, i::Utf16CharacterStream* stream,
249 : unsigned length, unsigned start, unsigned end) {
250 : // Read streams one char at a time
251 : unsigned i;
252 312432 : for (i = start; i < end; i++) {
253 312108 : CHECK_EQU(i, stream->pos());
254 312108 : CHECK_EQU(reference[i], stream->Advance());
255 : }
256 324 : CHECK_EQU(end, stream->pos());
257 324 : CHECK_EQU(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
258 324 : CHECK_EQU(end + 1, stream->pos());
259 324 : stream->Back();
260 :
261 : // Pushback, re-read, pushback again.
262 324 : while (i > end / 4) {
263 244536 : int32_t c0 = reference[i - 1];
264 244536 : CHECK_EQU(i, stream->pos());
265 244536 : stream->Back();
266 : i--;
267 244536 : CHECK_EQU(i, stream->pos());
268 244536 : int32_t c1 = stream->Advance();
269 : i++;
270 244536 : CHECK_EQU(i, stream->pos());
271 244536 : CHECK_EQ(c0, c1);
272 244536 : stream->Back();
273 : i--;
274 244536 : CHECK_EQU(i, stream->pos());
275 : }
276 :
277 : // Seek + read streams one char at a time.
278 324 : unsigned halfway = end / 2;
279 324 : stream->Seek(stream->pos() + halfway - i);
280 163380 : for (i = halfway; i < end; i++) {
281 163056 : CHECK_EQU(i, stream->pos());
282 163056 : CHECK_EQU(reference[i], stream->Advance());
283 : }
284 324 : CHECK_EQU(i, stream->pos());
285 324 : CHECK_LT(stream->Advance(), 0);
286 :
287 : // Seek back, then seek beyond end of stream.
288 324 : stream->Seek(start);
289 324 : if (start < length) {
290 264 : CHECK_EQU(stream->Advance(), reference[start]);
291 : } else {
292 60 : CHECK_LT(stream->Advance(), 0);
293 : }
294 324 : stream->Seek(length + 5);
295 324 : CHECK_LT(stream->Advance(), 0);
296 324 : }
297 :
298 : #undef CHECK_EQU
299 :
300 36 : void TestCharacterStreams(const char* one_byte_source, unsigned length,
301 : unsigned start = 0, unsigned end = 0) {
302 36 : if (end == 0) end = length;
303 :
304 : i::Isolate* isolate = CcTest::i_isolate();
305 : i::Factory* factory = isolate->factory();
306 :
307 : // 2-byte external string
308 36 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
309 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
310 36 : static_cast<int>(length));
311 : {
312 49290 : for (unsigned i = 0; i < length; i++) {
313 98508 : uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
314 : }
315 : TestExternalResource resource(uc16_buffer.get(), length);
316 : i::Handle<i::String> uc16_string(
317 72 : factory->NewExternalStringFromTwoByte(&resource).ToHandleChecked());
318 : std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
319 36 : i::ScannerStream::For(uc16_string, start, end));
320 36 : TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end);
321 : }
322 :
323 : // 1-byte external string
324 : i::Vector<const uint8_t> one_byte_vector =
325 : i::OneByteVector(one_byte_source, static_cast<int>(length));
326 : i::Handle<i::String> one_byte_string =
327 72 : factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
328 : {
329 : TestExternalOneByteResource one_byte_resource(one_byte_source, length);
330 : i::Handle<i::String> ext_one_byte_string(
331 : factory->NewExternalStringFromOneByte(&one_byte_resource)
332 72 : .ToHandleChecked());
333 : std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
334 36 : i::ScannerStream::For(ext_one_byte_string, start, end));
335 : TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start,
336 36 : end);
337 : }
338 :
339 : // 1-byte generic i::String
340 : {
341 : std::unique_ptr<i::Utf16CharacterStream> string_stream(
342 36 : i::ScannerStream::For(one_byte_string, start, end));
343 : TestCharacterStream(one_byte_source, string_stream.get(), length, start,
344 36 : end);
345 : }
346 :
347 : // 2-byte generic i::String
348 : {
349 : i::Handle<i::String> two_byte_string =
350 72 : factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
351 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
352 36 : i::ScannerStream::For(two_byte_string, start, end));
353 : TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length,
354 36 : start, end);
355 : }
356 :
357 : // Streaming has no notion of start/end, so let's skip streaming tests for
358 : // these cases.
359 72 : if (start != 0 || end != length) return;
360 :
361 : // 1-byte streaming stream, single + many chunks.
362 : {
363 : const uint8_t* data = one_byte_vector.begin();
364 : const uint8_t* data_end = one_byte_vector.end();
365 :
366 30 : ChunkSource single_chunk(data, data_end - data, false);
367 : std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
368 : i::ScannerStream::For(&single_chunk,
369 : v8::ScriptCompiler::StreamedSource::ONE_BYTE,
370 30 : nullptr));
371 : TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
372 30 : length, start, end);
373 :
374 30 : ChunkSource many_chunks(data, data_end - data, true);
375 : one_byte_streaming_stream.reset(i::ScannerStream::For(
376 30 : &many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE, nullptr));
377 : TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
378 30 : length, start, end);
379 : }
380 :
381 : // UTF-8 streaming stream, single + many chunks.
382 : {
383 : const uint8_t* data = one_byte_vector.begin();
384 : const uint8_t* data_end = one_byte_vector.end();
385 30 : ChunkSource chunks(data, data_end - data, false);
386 : std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
387 : i::ScannerStream::For(&chunks, v8::ScriptCompiler::StreamedSource::UTF8,
388 30 : nullptr));
389 : TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
390 30 : start, end);
391 :
392 30 : ChunkSource many_chunks(data, data_end - data, true);
393 : utf8_streaming_stream.reset(i::ScannerStream::For(
394 30 : &many_chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
395 : TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
396 30 : start, end);
397 : }
398 :
399 : // 2-byte streaming stream, single + many chunks.
400 : {
401 : const uint8_t* data =
402 : reinterpret_cast<const uint8_t*>(two_byte_vector.begin());
403 : const uint8_t* data_end =
404 : reinterpret_cast<const uint8_t*>(two_byte_vector.end());
405 30 : ChunkSource chunks(data, data_end - data, false);
406 : std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
407 : i::ScannerStream::For(
408 30 : &chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
409 : TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
410 30 : length, start, end);
411 :
412 30 : ChunkSource many_chunks(data, data_end - data, true);
413 : two_byte_streaming_stream.reset(i::ScannerStream::For(
414 30 : &many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE, nullptr));
415 : TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
416 30 : length, start, end);
417 : }
418 : }
419 :
420 23724 : TEST(CharacterStreams) {
421 6 : v8::Isolate* isolate = CcTest::isolate();
422 6 : v8::HandleScope handles(isolate);
423 6 : v8::Local<v8::Context> context = v8::Context::New(isolate);
424 : v8::Context::Scope context_scope(context);
425 :
426 6 : TestCharacterStreams("abcdefghi", 9);
427 6 : TestCharacterStreams("abc\0\n\r\x7f", 7);
428 6 : TestCharacterStreams("\0", 1);
429 6 : TestCharacterStreams("", 0);
430 :
431 : // 4k large buffer.
432 : char buffer[4096 + 1];
433 24588 : for (unsigned i = 0; i < arraysize(buffer); i++) {
434 24582 : buffer[i] = static_cast<char>(i & 0x7F);
435 : }
436 6 : buffer[arraysize(buffer) - 1] = '\0';
437 6 : TestCharacterStreams(buffer, arraysize(buffer) - 1);
438 12 : TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
439 6 : }
440 :
441 : // Regression test for crbug.com/651333. Read invalid utf-8.
442 23724 : TEST(Regress651333) {
443 : const uint8_t bytes[] =
444 : "A\xf1"
445 6 : "ad"; // Anad, with n == n-with-tilde.
446 6 : const uint16_t unicode[] = {65, 65533, 97, 100};
447 :
448 : // Run the test for all sub-strings 0..N of bytes, to make sure we hit the
449 : // error condition in and at chunk boundaries.
450 36 : for (size_t len = 0; len < arraysize(bytes); len++) {
451 : // Read len bytes from bytes, and compare against the expected unicode
452 : // characters. Expect kBadChar ( == Unicode replacement char == code point
453 : // 65533) instead of the incorrectly coded Latin1 char.
454 30 : ChunkSource chunks(bytes, len, false);
455 : std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
456 30 : &chunks, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
457 90 : for (size_t i = 0; i < len; i++) {
458 120 : CHECK_EQ(unicode[i], stream->Advance());
459 : }
460 30 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
461 : }
462 6 : }
463 :
464 18 : void TestChunkStreamAgainstReference(
465 : const char* cases[],
466 1404 : const std::vector<std::vector<uint16_t>>& unicode_expected) {
467 192 : for (size_t c = 0; c < unicode_expected.size(); ++c) {
468 78 : ChunkSource chunk_source(cases[c]);
469 : std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
470 78 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8, nullptr));
471 1098 : for (size_t i = 0; i < unicode_expected[c].size(); i++) {
472 1152 : CHECK_EQ(unicode_expected[c][i], stream->Advance());
473 : }
474 78 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
475 78 : stream->Seek(0);
476 1098 : for (size_t i = 0; i < unicode_expected[c].size(); i++) {
477 1152 : CHECK_EQ(unicode_expected[c][i], stream->Advance());
478 : }
479 78 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
480 : }
481 18 : }
482 :
483 23724 : TEST(Regress6377) {
484 : const char* cases[] = {
485 : "\xf0\x90\0" // first chunk - start of 4-byte seq
486 : "\x80\x80" // second chunk - end of 4-byte seq
487 : "a\0", // and an 'a'
488 :
489 : "\xe0\xbf\0" // first chunk - start of 3-byte seq
490 : "\xbf" // second chunk - one-byte end of 3-byte seq
491 : "a\0", // and an 'a'
492 :
493 : "\xc3\0" // first chunk - start of 2-byte seq
494 : "\xbf" // second chunk - end of 2-byte seq
495 : "a\0", // and an 'a'
496 :
497 : "\xf0\x90\x80\0" // first chunk - start of 4-byte seq
498 : "\x80" // second chunk - one-byte end of 4-byte seq
499 : "a\xc3\0" // and an 'a' + start of 2-byte seq
500 : "\xbf\0", // third chunk - end of 2-byte seq
501 6 : };
502 : const std::vector<std::vector<uint16_t>> unicode_expected = {
503 : {0xd800, 0xdc00, 97}, {0xfff, 97}, {0xff, 97}, {0xd800, 0xdc00, 97, 0xff},
504 6 : };
505 12 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
506 6 : TestChunkStreamAgainstReference(cases, unicode_expected);
507 6 : }
508 :
509 23724 : TEST(Regress6836) {
510 : const char* cases[] = {
511 : // 0xc2 is a lead byte, but there's no continuation. The bug occurs when
512 : // this happens near the chunk end.
513 : "X\xc2Y\0",
514 : // Last chunk ends with a 2-byte char lead.
515 : "X\xc2\0",
516 : // Last chunk ends with a 3-byte char lead and only one continuation
517 : // character.
518 : "X\xe0\xbf\0",
519 6 : };
520 : const std::vector<std::vector<uint16_t>> unicode_expected = {
521 : {0x58, 0xfffd, 0x59}, {0x58, 0xfffd}, {0x58, 0xfffd},
522 6 : };
523 12 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
524 6 : TestChunkStreamAgainstReference(cases, unicode_expected);
525 6 : }
526 :
527 23724 : TEST(TestOverlongAndInvalidSequences) {
528 : const char* cases[] = {
529 : // Overlong 2-byte sequence.
530 : "X\xc0\xbfY\0",
531 : // Another overlong 2-byte sequence.
532 : "X\xc1\xbfY\0",
533 : // Overlong 3-byte sequence.
534 : "X\xe0\x9f\xbfY\0",
535 : // Overlong 4-byte sequence.
536 : "X\xf0\x89\xbf\xbfY\0",
537 : // Invalid 3-byte sequence (reserved for surrogates).
538 : "X\xed\xa0\x80Y\0",
539 : // Invalid 4-bytes sequence (value out of range).
540 : "X\xf4\x90\x80\x80Y\0",
541 6 : };
542 : const std::vector<std::vector<uint16_t>> unicode_expected = {
543 : {0x58, 0xfffd, 0xfffd, 0x59},
544 : {0x58, 0xfffd, 0xfffd, 0x59},
545 : {0x58, 0xfffd, 0xfffd, 0xfffd, 0x59},
546 : {0x58, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x59},
547 : {0x58, 0xfffd, 0xfffd, 0xfffd, 0x59},
548 : {0x58, 0xfffd, 0xfffd, 0xfffd, 0xfffd, 0x59},
549 6 : };
550 12 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
551 6 : TestChunkStreamAgainstReference(cases, unicode_expected);
552 71160 : }
|