Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/heap/factory-inl.h"
6 : #include "src/objects-inl.h"
7 : #include "src/parsing/scanner-character-streams.h"
8 : #include "src/parsing/scanner.h"
9 : #include "test/cctest/cctest.h"
10 :
11 : namespace {
12 :
13 : // Implement ExternalSourceStream based on const char**.
14 : // This will take each string as one chunk. The last chunk must be empty.
15 : class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
16 : public:
17 735 : explicit ChunkSource(const char** chunks) : current_(0) {
18 785 : do {
19 : chunks_.push_back(
20 1570 : {reinterpret_cast<const uint8_t*>(*chunks), strlen(*chunks)});
21 785 : chunks++;
22 785 : } while (chunks_.back().len > 0);
23 245 : }
24 210 : explicit ChunkSource(const char* chunks) : current_(0) {
25 175 : do {
26 : chunks_.push_back(
27 350 : {reinterpret_cast<const uint8_t*>(chunks), strlen(chunks)});
28 175 : chunks += strlen(chunks) + 1;
29 175 : } while (chunks_.back().len > 0);
30 70 : }
31 175 : ChunkSource(const uint8_t* data, size_t char_size, size_t len,
32 : bool extra_chunky)
33 350 : : current_(0) {
34 : // If extra_chunky, we'll use increasingly large chunk sizes. If not, we'll
35 : // have a single chunk of full length. Make sure that chunks are always
36 : // aligned to char-size though.
37 175 : size_t chunk_size = extra_chunky ? char_size : len;
38 1755 : for (size_t i = 0; i < len; i += chunk_size, chunk_size += char_size) {
39 4740 : chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
40 : }
41 350 : chunks_.push_back({nullptr, 0});
42 175 : }
43 490 : ~ChunkSource() override = default;
44 0 : bool SetBookmark() override { return false; }
45 0 : void ResetToBookmark() override {}
46 2610 : size_t GetMoreData(const uint8_t** src) override {
47 : DCHECK_LT(current_, chunks_.size());
48 2610 : Chunk& next = chunks_[current_++];
49 2610 : uint8_t* chunk = new uint8_t[next.len];
50 2610 : i::MemMove(chunk, next.ptr, next.len);
51 2610 : *src = chunk;
52 2610 : return next.len;
53 : }
54 :
55 : private:
56 : struct Chunk {
57 : const uint8_t* ptr;
58 : size_t len;
59 : };
60 : std::vector<Chunk> chunks_;
61 : size_t current_;
62 : };
63 :
64 : // Checks that Lock() / Unlock() pairs are balanced. Not thread-safe.
65 : class LockChecker {
66 : public:
67 70 : LockChecker() : lock_depth_(0) {}
68 70 : ~LockChecker() { CHECK_EQ(0, lock_depth_); }
69 :
70 110 : void Lock() const { lock_depth_++; }
71 :
72 110 : void Unlock() const {
73 110 : CHECK_GT(lock_depth_, 0);
74 110 : lock_depth_--;
75 110 : }
76 :
77 85 : bool IsLocked() const { return lock_depth_ > 0; }
78 :
79 10 : int LockDepth() const { return lock_depth_; }
80 :
81 : protected:
82 : mutable int lock_depth_;
83 : };
84 :
85 35 : class TestExternalResource : public v8::String::ExternalStringResource,
86 : public LockChecker {
87 : public:
88 : explicit TestExternalResource(uint16_t* data, int length)
89 35 : : LockChecker(), data_(data), length_(static_cast<size_t>(length)) {}
90 :
91 50 : const uint16_t* data() const override {
92 50 : CHECK(IsLocked());
93 50 : return data_;
94 : }
95 :
96 65 : size_t length() const override { return length_; }
97 :
98 30 : bool IsCacheable() const override { return false; }
99 150 : void Lock() const override { LockChecker::Lock(); }
100 75 : void Unlock() const override { LockChecker::Unlock(); }
101 :
102 : private:
103 : uint16_t* data_;
104 : size_t length_;
105 : };
106 :
107 35 : class TestExternalOneByteResource
108 : : public v8::String::ExternalOneByteStringResource,
109 : public LockChecker {
110 : public:
111 : TestExternalOneByteResource(const char* data, size_t length)
112 35 : : data_(data), length_(length) {}
113 :
114 30 : const char* data() const override {
115 30 : CHECK(IsLocked());
116 30 : return data_;
117 : }
118 65 : size_t length() const override { return length_; }
119 :
120 30 : bool IsCacheable() const override { return false; }
121 70 : void Lock() const override { LockChecker::Lock(); }
122 35 : void Unlock() const override { LockChecker::Unlock(); }
123 :
124 : private:
125 : const char* data_;
126 : size_t length_;
127 : };
128 :
// Reference text that contains a UTF-8 sequence of every length (1-4 bytes).
// The pieces are separate literals so that no hex escape swallows the
// characters that follow it.
const char unicode_utf8[] =
    "abc"               // three 1-byte (ASCII) characters
    "\xc3\xa4"          // 2 bytes: U+00E4, a with umlaut
    "\xe2\xa8\xa0"      // 3 bytes: U+2A20, a math symbol
    "\xf0\x9f\x92\xa9"  // 4 bytes: U+1F4A9, a surrogate pair in UTF-16
    "def";              // three more ASCII characters

// The same text as zero-terminated UTF-16 code units.
const uint16_t unicode_ucs2[] = {
    0x61,   0x62,   0x63,    // "abc"
    0xE4,                    // U+00E4
    0x2A20,                  // U+2A20
    0xD83D, 0xDCA9,          // U+1F4A9 as lead + trail surrogate
    0x64,   0x65,   0x66,    // "def"
    0};
139 :
140 35 : i::Handle<i::String> NewExternalTwoByteStringFromResource(
141 : i::Isolate* isolate, TestExternalResource* resource) {
142 : i::Factory* factory = isolate->factory();
143 : // String creation accesses the resource.
144 35 : resource->Lock();
145 : i::Handle<i::String> uc16_string(
146 70 : factory->NewExternalStringFromTwoByte(resource).ToHandleChecked());
147 35 : resource->Unlock();
148 35 : return uc16_string;
149 : }
150 :
151 : } // anonymous namespace
152 :
153 28342 : TEST(Utf8StreamAsciiOnly) {
154 5 : const char* chunks[] = {"abc", "def", "ghi", ""};
155 5 : ChunkSource chunk_source(chunks);
156 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
157 : v8::internal::ScannerStream::For(
158 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
159 :
160 : // Read the data without dying.
161 : v8::internal::uc32 c;
162 50 : do {
163 : c = stream->Advance();
164 : } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
165 5 : }
166 :
167 28342 : TEST(Utf8StreamBOM) {
168 : // Construct test string w/ UTF-8 BOM (byte order mark)
169 5 : char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
170 : strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));
171 :
172 5 : const char* chunks[] = {data, "\0"};
173 5 : ChunkSource chunk_source(chunks);
174 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
175 : v8::internal::ScannerStream::For(
176 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
177 :
178 : // Read the data without tripping over the BOM.
179 55 : for (size_t i = 0; unicode_ucs2[i]; i++) {
180 50 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
181 : }
182 5 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());
183 :
184 : // Make sure seek works.
185 5 : stream->Seek(0);
186 5 : CHECK_EQ(unicode_ucs2[0], stream->Advance());
187 :
188 5 : stream->Seek(5);
189 5 : CHECK_EQ(unicode_ucs2[5], stream->Advance());
190 :
191 : // Try again, but make sure we have to seek 'backwards'.
192 25 : while (v8::internal::Utf16CharacterStream::kEndOfInput != stream->Advance()) {
193 : // Do nothing. We merely advance the stream to the end of its input.
194 : }
195 5 : stream->Seek(5);
196 5 : CHECK_EQ(unicode_ucs2[5], stream->Advance());
197 5 : }
198 :
199 28342 : TEST(Utf8SplitBOM) {
200 : // Construct chunks with a BOM split into two chunks.
201 5 : char partial_bom[] = "\xef\xbb";
202 5 : char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
203 : strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));
204 :
205 : {
206 5 : const char* chunks[] = {partial_bom, data, "\0"};
207 5 : ChunkSource chunk_source(chunks);
208 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
209 : v8::internal::ScannerStream::For(
210 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
211 :
212 : // Read the data without tripping over the BOM.
213 55 : for (size_t i = 0; unicode_ucs2[i]; i++) {
214 50 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
215 : }
216 : }
217 :
218 : // And now with single-byte BOM chunks.
219 5 : char bom_byte_1[] = "\xef";
220 5 : char bom_byte_2[] = "\xbb";
221 : {
222 5 : const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
223 5 : ChunkSource chunk_source(chunks);
224 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
225 : v8::internal::ScannerStream::For(
226 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
227 :
228 : // Read the data without tripping over the BOM.
229 55 : for (size_t i = 0; unicode_ucs2[i]; i++) {
230 50 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
231 : }
232 : }
233 5 : }
234 :
235 28342 : TEST(Utf8SplitMultiBOM) {
236 : // Construct chunks with a split BOM followed by another split BOM.
237 : const char* chunks = "\xef\xbb\0\xbf\xef\xbb\0\xbf\0\0";
238 5 : ChunkSource chunk_source(chunks);
239 : std::unique_ptr<i::Utf16CharacterStream> stream(
240 : v8::internal::ScannerStream::For(
241 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
242 :
243 : // Read the data, ensuring we get exactly one of the two BOMs back.
244 5 : CHECK_EQ(0xFEFF, stream->Advance());
245 5 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
246 5 : }
247 :
248 28342 : TEST(Utf8AdvanceUntil) {
249 : // Test utf-8 advancing until a certain char.
250 :
251 : const char line_term = '\n';
252 : const size_t kLen = arraysize(unicode_utf8);
253 : char data[kLen + 1];
254 : strncpy(data, unicode_utf8, kLen);
255 5 : data[kLen - 1] = line_term;
256 5 : data[kLen] = '\0';
257 :
258 : {
259 5 : const char* chunks[] = {data, "\0"};
260 5 : ChunkSource chunk_source(chunks);
261 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
262 : v8::internal::ScannerStream::For(
263 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
264 :
265 : int32_t res = stream->AdvanceUntil(
266 110 : [](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
267 5 : CHECK_EQ(line_term, res);
268 : }
269 5 : }
270 :
271 28342 : TEST(AdvanceMatchAdvanceUntil) {
272 : // Test if single advance and advanceUntil behave the same
273 :
274 5 : char data[] = {'a', 'b', '\n', 'c', '\0'};
275 :
276 : {
277 5 : const char* chunks[] = {data, "\0"};
278 5 : ChunkSource chunk_source_a(chunks);
279 :
280 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance(
281 : v8::internal::ScannerStream::For(
282 5 : &chunk_source_a, v8::ScriptCompiler::StreamedSource::UTF8));
283 :
284 5 : ChunkSource chunk_source_au(chunks);
285 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance_until(
286 : v8::internal::ScannerStream::For(
287 5 : &chunk_source_au, v8::ScriptCompiler::StreamedSource::UTF8));
288 :
289 : int32_t au_c0_ = stream_advance_until->AdvanceUntil(
290 30 : [](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
291 :
292 : int32_t a_c0_ = '0';
293 45 : while (!unibrow::IsLineTerminator(a_c0_)) {
294 : a_c0_ = stream_advance->Advance();
295 : }
296 :
297 : // Check both advances methods have the same output
298 5 : CHECK_EQ(a_c0_, au_c0_);
299 :
300 : // Check if both set the cursor to the correct position by advancing both
301 : // streams by one character.
302 : a_c0_ = stream_advance->Advance();
303 : au_c0_ = stream_advance_until->Advance();
304 5 : CHECK_EQ(a_c0_, au_c0_);
305 : }
306 5 : }
307 :
308 28342 : TEST(Utf8AdvanceUntilOverChunkBoundaries) {
309 : // Test utf-8 advancing until a certain char, crossing chunk boundaries.
310 :
311 : // Split the test string at each byte and pass it to the stream. This way,
312 : // we'll have a split at each possible boundary.
313 : size_t len = strlen(unicode_utf8);
314 : char buffer[arraysize(unicode_utf8) + 4];
315 80 : for (size_t i = 1; i < len; i++) {
316 : // Copy source string into buffer, splitting it at i.
317 : // Then add three chunks, 0..i-1, i..strlen-1, empty.
318 : strncpy(buffer, unicode_utf8, i);
319 70 : strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
320 70 : buffer[i] = '\0';
321 70 : buffer[len + 1] = '\n';
322 70 : buffer[len + 2] = '\0';
323 70 : buffer[len + 3] = '\0';
324 70 : const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
325 :
326 70 : ChunkSource chunk_source(chunks);
327 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
328 : v8::internal::ScannerStream::For(
329 70 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
330 :
331 : int32_t res = stream->AdvanceUntil(
332 1540 : [](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
333 140 : CHECK_EQ(buffer[len + 1], res);
334 : }
335 5 : }
336 :
337 28342 : TEST(Utf8ChunkBoundaries) {
338 : // Test utf-8 parsing at chunk boundaries.
339 :
340 : // Split the test string at each byte and pass it to the stream. This way,
341 : // we'll have a split at each possible boundary.
342 : size_t len = strlen(unicode_utf8);
343 : char buffer[arraysize(unicode_utf8) + 3];
344 80 : for (size_t i = 1; i < len; i++) {
345 : // Copy source string into buffer, splitting it at i.
346 : // Then add three chunks, 0..i-1, i..strlen-1, empty.
347 : strncpy(buffer, unicode_utf8, i);
348 70 : strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
349 70 : buffer[i] = '\0';
350 70 : buffer[len + 1] = '\0';
351 70 : buffer[len + 2] = '\0';
352 70 : const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
353 :
354 70 : ChunkSource chunk_source(chunks);
355 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
356 : v8::internal::ScannerStream::For(
357 70 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
358 :
359 770 : for (size_t i = 0; unicode_ucs2[i]; i++) {
360 700 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
361 : }
362 70 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
363 : stream->Advance());
364 : }
365 5 : }
366 :
367 28342 : TEST(Utf8SingleByteChunks) {
368 : // Have each byte as a single-byte chunk.
369 : size_t len = strlen(unicode_utf8);
370 : char buffer[arraysize(unicode_utf8) + 4];
371 75 : for (size_t i = 1; i < len - 1; i++) {
372 : // Copy source string into buffer, make a single-byte chunk at i.
373 : strncpy(buffer, unicode_utf8, i);
374 65 : strncpy(buffer + i + 3, unicode_utf8 + i + 1, len - i - 1);
375 65 : buffer[i] = '\0';
376 65 : buffer[i + 1] = unicode_utf8[i];
377 65 : buffer[i + 2] = '\0';
378 65 : buffer[len + 2] = '\0';
379 65 : buffer[len + 3] = '\0';
380 : const char* chunks[] = {buffer, buffer + i + 1, buffer + i + 3,
381 65 : buffer + len + 3};
382 :
383 65 : ChunkSource chunk_source(chunks);
384 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
385 : v8::internal::ScannerStream::For(
386 65 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
387 :
388 715 : for (size_t j = 0; unicode_ucs2[j]; j++) {
389 650 : CHECK_EQ(unicode_ucs2[j], stream->Advance());
390 : }
391 65 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
392 : stream->Advance());
393 : }
394 5 : }
395 :
396 : #define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
397 :
398 290 : void TestCharacterStream(const char* reference, i::Utf16CharacterStream* stream,
399 : unsigned length, unsigned start, unsigned end) {
400 : // Read streams one char at a time
401 : unsigned i;
402 260520 : for (i = start; i < end; i++) {
403 260230 : CHECK_EQU(i, stream->pos());
404 260230 : CHECK_EQU(reference[i], stream->Advance());
405 : }
406 290 : CHECK_EQU(end, stream->pos());
407 290 : CHECK_EQU(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
408 290 : CHECK_EQU(end + 1, stream->pos());
409 290 : stream->Back();
410 :
411 : // Pushback, re-read, pushback again.
412 290 : while (i > end / 4) {
413 203920 : int32_t c0 = reference[i - 1];
414 203920 : CHECK_EQU(i, stream->pos());
415 203920 : stream->Back();
416 : i--;
417 203920 : CHECK_EQU(i, stream->pos());
418 : int32_t c1 = stream->Advance();
419 : i++;
420 203920 : CHECK_EQU(i, stream->pos());
421 203920 : CHECK_EQ(c0, c1);
422 203920 : stream->Back();
423 : i--;
424 203920 : CHECK_EQU(i, stream->pos());
425 : }
426 :
427 : // Seek + read streams one char at a time.
428 290 : unsigned halfway = end / 2;
429 290 : stream->Seek(stream->pos() + halfway - i);
430 136270 : for (i = halfway; i < end; i++) {
431 135980 : CHECK_EQU(i, stream->pos());
432 135980 : CHECK_EQU(reference[i], stream->Advance());
433 : }
434 290 : CHECK_EQU(i, stream->pos());
435 290 : CHECK_LT(stream->Advance(), 0);
436 :
437 : // Seek back, then seek beyond end of stream.
438 290 : stream->Seek(start);
439 290 : if (start < length) {
440 480 : CHECK_EQU(stream->Advance(), reference[start]);
441 : } else {
442 50 : CHECK_LT(stream->Advance(), 0);
443 : }
444 290 : stream->Seek(length + 5);
445 290 : CHECK_LT(stream->Advance(), 0);
446 290 : }
447 :
448 10 : void TestCloneCharacterStream(const char* reference,
449 : i::Utf16CharacterStream* stream,
450 : unsigned length) {
451 10 : std::unique_ptr<i::Utf16CharacterStream> clone = stream->Clone();
452 :
453 : unsigned i;
454 10 : unsigned halfway = length / 2;
455 : // Advance original half way.
456 50 : for (i = 0; i < halfway; i++) {
457 40 : CHECK_EQU(i, stream->pos());
458 40 : CHECK_EQU(reference[i], stream->Advance());
459 : }
460 :
461 : // Test advancing original stream didn't affect the clone.
462 10 : TestCharacterStream(reference, clone.get(), length, 0, length);
463 :
464 : // Test advancing clone didn't affect original stream.
465 10 : TestCharacterStream(reference, stream, length, i, length);
466 10 : }
467 :
468 : #undef CHECK_EQU
469 :
470 30 : void TestCharacterStreams(const char* one_byte_source, unsigned length,
471 : unsigned start = 0, unsigned end = 0) {
472 30 : if (end == 0) end = length;
473 :
474 : i::Isolate* isolate = CcTest::i_isolate();
475 : i::Factory* factory = isolate->factory();
476 :
477 : // 2-byte external string
478 30 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
479 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
480 30 : static_cast<int>(length));
481 : {
482 41075 : for (unsigned i = 0; i < length; i++) {
483 82090 : uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
484 : }
485 : TestExternalResource resource(uc16_buffer.get(), length);
486 : i::Handle<i::String> uc16_string(
487 30 : NewExternalTwoByteStringFromResource(isolate, &resource));
488 : std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
489 30 : i::ScannerStream::For(isolate, uc16_string, start, end));
490 30 : TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end);
491 :
492 : // This avoids the GC from trying to free a stack allocated resource.
493 60 : if (uc16_string->IsExternalString())
494 : i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
495 50 : ->SetResource(isolate, nullptr);
496 : }
497 :
498 : // 1-byte external string
499 : i::Vector<const uint8_t> one_byte_vector =
500 : i::OneByteVector(one_byte_source, static_cast<int>(length));
501 : i::Handle<i::String> one_byte_string =
502 60 : factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
503 : {
504 : TestExternalOneByteResource one_byte_resource(one_byte_source, length);
505 : i::Handle<i::String> ext_one_byte_string(
506 : factory->NewExternalStringFromOneByte(&one_byte_resource)
507 60 : .ToHandleChecked());
508 : std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
509 30 : i::ScannerStream::For(isolate, ext_one_byte_string, start, end));
510 : TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start,
511 30 : end);
512 : // This avoids the GC from trying to free a stack allocated resource.
513 60 : if (ext_one_byte_string->IsExternalString())
514 : i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
515 50 : ->SetResource(isolate, nullptr);
516 : }
517 :
518 : // 1-byte generic i::String
519 : {
520 : std::unique_ptr<i::Utf16CharacterStream> string_stream(
521 30 : i::ScannerStream::For(isolate, one_byte_string, start, end));
522 : TestCharacterStream(one_byte_source, string_stream.get(), length, start,
523 30 : end);
524 : }
525 :
526 : // 2-byte generic i::String
527 : {
528 : i::Handle<i::String> two_byte_string =
529 60 : factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
530 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
531 30 : i::ScannerStream::For(isolate, two_byte_string, start, end));
532 : TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length,
533 30 : start, end);
534 : }
535 :
536 : // Streaming has no notion of start/end, so let's skip streaming tests for
537 : // these cases.
538 60 : if (start != 0 || end != length) return;
539 :
540 : // 1-byte streaming stream, single + many chunks.
541 : {
542 : const uint8_t* data = one_byte_vector.begin();
543 : const uint8_t* data_end = one_byte_vector.end();
544 :
545 25 : ChunkSource single_chunk(data, 1, data_end - data, false);
546 : std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
547 : i::ScannerStream::For(&single_chunk,
548 25 : v8::ScriptCompiler::StreamedSource::ONE_BYTE));
549 : TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
550 25 : length, start, end);
551 :
552 25 : ChunkSource many_chunks(data, 1, data_end - data, true);
553 : one_byte_streaming_stream.reset(i::ScannerStream::For(
554 25 : &many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE));
555 : TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
556 25 : length, start, end);
557 : }
558 :
559 : // UTF-8 streaming stream, single + many chunks.
560 : {
561 : const uint8_t* data = one_byte_vector.begin();
562 : const uint8_t* data_end = one_byte_vector.end();
563 25 : ChunkSource chunks(data, 1, data_end - data, false);
564 : std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
565 : i::ScannerStream::For(&chunks,
566 25 : v8::ScriptCompiler::StreamedSource::UTF8));
567 : TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
568 25 : start, end);
569 :
570 25 : ChunkSource many_chunks(data, 1, data_end - data, true);
571 : utf8_streaming_stream.reset(i::ScannerStream::For(
572 25 : &many_chunks, v8::ScriptCompiler::StreamedSource::UTF8));
573 : TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
574 25 : start, end);
575 : }
576 :
577 : // 2-byte streaming stream, single + many chunks.
578 : {
579 : const uint8_t* data =
580 : reinterpret_cast<const uint8_t*>(two_byte_vector.begin());
581 : const uint8_t* data_end =
582 : reinterpret_cast<const uint8_t*>(two_byte_vector.end());
583 25 : ChunkSource chunks(data, 2, data_end - data, false);
584 : std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
585 : i::ScannerStream::For(&chunks,
586 25 : v8::ScriptCompiler::StreamedSource::TWO_BYTE));
587 : TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
588 25 : length, start, end);
589 :
590 25 : ChunkSource many_chunks(data, 2, data_end - data, true);
591 : two_byte_streaming_stream.reset(i::ScannerStream::For(
592 25 : &many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE));
593 : TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
594 25 : length, start, end);
595 : }
596 : }
597 :
598 28342 : TEST(CharacterStreams) {
599 5 : v8::Isolate* isolate = CcTest::isolate();
600 5 : v8::HandleScope handles(isolate);
601 5 : v8::Local<v8::Context> context = v8::Context::New(isolate);
602 : v8::Context::Scope context_scope(context);
603 :
604 5 : TestCharacterStreams("abcdefghi", 9);
605 5 : TestCharacterStreams("abc\0\n\r\x7f", 7);
606 5 : TestCharacterStreams("\0", 1);
607 5 : TestCharacterStreams("", 0);
608 :
609 : // 4k large buffer.
610 : char buffer[4096 + 1];
611 20490 : for (unsigned i = 0; i < arraysize(buffer); i++) {
612 20485 : buffer[i] = static_cast<char>(i & 0x7F);
613 : }
614 5 : buffer[arraysize(buffer) - 1] = '\0';
615 5 : TestCharacterStreams(buffer, arraysize(buffer) - 1);
616 10 : TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
617 5 : }
618 :
619 : // Regression test for crbug.com/651333. Read invalid utf-8.
620 28342 : TEST(Regress651333) {
621 : const uint8_t bytes[] =
622 : "A\xf1"
623 5 : "ad"; // Anad, with n == n-with-tilde.
624 5 : const uint16_t unicode[] = {65, 65533, 97, 100};
625 :
626 : // Run the test for all sub-strings 0..N of bytes, to make sure we hit the
627 : // error condition in and at chunk boundaries.
628 30 : for (size_t len = 0; len < arraysize(bytes); len++) {
629 : // Read len bytes from bytes, and compare against the expected unicode
630 : // characters. Expect kBadChar ( == Unicode replacement char == code point
631 : // 65533) instead of the incorrectly coded Latin1 char.
632 25 : ChunkSource chunks(bytes, 1, len, false);
633 : std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
634 25 : &chunks, v8::ScriptCompiler::StreamedSource::UTF8));
635 75 : for (size_t i = 0; i < len; i++) {
636 100 : CHECK_EQ(unicode[i], stream->Advance());
637 : }
638 25 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
639 : }
640 5 : }
641 :
642 15 : void TestChunkStreamAgainstReference(
643 : const char* cases[],
644 1170 : const std::vector<std::vector<uint16_t>>& unicode_expected) {
645 160 : for (size_t c = 0; c < unicode_expected.size(); ++c) {
646 65 : ChunkSource chunk_source(cases[c]);
647 : std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
648 65 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
649 915 : for (size_t i = 0; i < unicode_expected[c].size(); i++) {
650 720 : CHECK_EQ(unicode_expected[c][i], stream->Advance());
651 : }
652 65 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
653 65 : stream->Seek(0);
654 915 : for (size_t i = 0; i < unicode_expected[c].size(); i++) {
655 720 : CHECK_EQ(unicode_expected[c][i], stream->Advance());
656 : }
657 65 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
658 : }
659 15 : }
660 :
661 28342 : TEST(Regress6377) {
662 : const char* cases[] = {
663 : "\xf0\x90\0" // first chunk - start of 4-byte seq
664 : "\x80\x80" // second chunk - end of 4-byte seq
665 : "a\0", // and an 'a'
666 :
667 : "\xe0\xbf\0" // first chunk - start of 3-byte seq
668 : "\xbf" // second chunk - one-byte end of 3-byte seq
669 : "a\0", // and an 'a'
670 :
671 : "\xc3\0" // first chunk - start of 2-byte seq
672 : "\xbf" // second chunk - end of 2-byte seq
673 : "a\0", // and an 'a'
674 :
675 : "\xf0\x90\x80\0" // first chunk - start of 4-byte seq
676 : "\x80" // second chunk - one-byte end of 4-byte seq
677 : "a\xc3\0" // and an 'a' + start of 2-byte seq
678 : "\xbf\0", // third chunk - end of 2-byte seq
679 5 : };
680 : const std::vector<std::vector<uint16_t>> unicode_expected = {
681 : {0xD800, 0xDC00, 97}, {0xFFF, 97}, {0xFF, 97}, {0xD800, 0xDC00, 97, 0xFF},
682 5 : };
683 10 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
684 5 : TestChunkStreamAgainstReference(cases, unicode_expected);
685 5 : }
686 :
687 28342 : TEST(Regress6836) {
688 : const char* cases[] = {
689 : // 0xC2 is a lead byte, but there's no continuation. The bug occurs when
690 : // this happens near the chunk end.
691 : "X\xc2Y\0",
692 : // Last chunk ends with a 2-byte char lead.
693 : "X\xc2\0",
694 : // Last chunk ends with a 3-byte char lead and only one continuation
695 : // character.
696 : "X\xe0\xbf\0",
697 5 : };
698 : const std::vector<std::vector<uint16_t>> unicode_expected = {
699 : {0x58, 0xFFFD, 0x59}, {0x58, 0xFFFD}, {0x58, 0xFFFD},
700 5 : };
701 10 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
702 5 : TestChunkStreamAgainstReference(cases, unicode_expected);
703 5 : }
704 :
705 28342 : TEST(TestOverlongAndInvalidSequences) {
706 : const char* cases[] = {
707 : // Overlong 2-byte sequence.
708 : "X\xc0\xbfY\0",
709 : // Another overlong 2-byte sequence.
710 : "X\xc1\xbfY\0",
711 : // Overlong 3-byte sequence.
712 : "X\xe0\x9f\xbfY\0",
713 : // Overlong 4-byte sequence.
714 : "X\xf0\x89\xbf\xbfY\0",
715 : // Invalid 3-byte sequence (reserved for surrogates).
716 : "X\xed\xa0\x80Y\0",
717 : // Invalid 4-bytes sequence (value out of range).
718 : "X\xf4\x90\x80\x80Y\0",
719 5 : };
720 : const std::vector<std::vector<uint16_t>> unicode_expected = {
721 : {0x58, 0xFFFD, 0xFFFD, 0x59},
722 : {0x58, 0xFFFD, 0xFFFD, 0x59},
723 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
724 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
725 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
726 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
727 5 : };
728 10 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
729 5 : TestChunkStreamAgainstReference(cases, unicode_expected);
730 5 : }
731 :
732 28342 : TEST(RelocatingCharacterStream) {
733 : ManualGCScope manual_gc_scope;
734 5 : CcTest::InitializeVM();
735 : i::Isolate* i_isolate = CcTest::i_isolate();
736 10 : v8::HandleScope scope(CcTest::isolate());
737 :
738 : const char* string = "abcd";
739 : int length = static_cast<int>(strlen(string));
740 5 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
741 25 : for (int i = 0; i < length; i++) {
742 40 : uc16_buffer[i] = string[i];
743 : }
744 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(), length);
745 : i::Handle<i::String> two_byte_string =
746 : i_isolate->factory()
747 : ->NewStringFromTwoByte(two_byte_vector, i::NOT_TENURED)
748 10 : .ToHandleChecked();
749 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
750 5 : i::ScannerStream::For(i_isolate, two_byte_string, 0, length));
751 5 : CHECK_EQ('a', two_byte_string_stream->Advance());
752 5 : CHECK_EQ('b', two_byte_string_stream->Advance());
753 5 : CHECK_EQ(size_t{2}, two_byte_string_stream->pos());
754 5 : i::String raw = *two_byte_string;
755 : i_isolate->heap()->CollectGarbage(i::NEW_SPACE,
756 5 : i::GarbageCollectionReason::kUnknown);
757 : // GC moved the string.
758 10 : CHECK_NE(raw, *two_byte_string);
759 5 : CHECK_EQ('c', two_byte_string_stream->Advance());
760 5 : CHECK_EQ('d', two_byte_string_stream->Advance());
761 5 : }
762 :
763 28342 : TEST(CloneCharacterStreams) {
764 5 : v8::HandleScope handles(CcTest::isolate());
765 5 : v8::Local<v8::Context> context = v8::Context::New(CcTest::isolate());
766 : v8::Context::Scope context_scope(context);
767 :
768 : i::Isolate* isolate = CcTest::i_isolate();
769 : i::Factory* factory = isolate->factory();
770 :
771 : const char* one_byte_source = "abcdefghi";
772 : unsigned length = static_cast<unsigned>(strlen(one_byte_source));
773 :
774 : // Check that cloning a character stream does not update
775 :
776 : // 2-byte external string
777 5 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
778 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
779 : static_cast<int>(length));
780 : {
781 50 : for (unsigned i = 0; i < length; i++) {
782 90 : uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
783 : }
784 : TestExternalResource resource(uc16_buffer.get(), length);
785 : i::Handle<i::String> uc16_string(
786 5 : NewExternalTwoByteStringFromResource(isolate, &resource));
787 : std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
788 5 : i::ScannerStream::For(isolate, uc16_string, 0, length));
789 :
790 5 : CHECK(resource.IsLocked());
791 5 : CHECK_EQ(1, resource.LockDepth());
792 5 : std::unique_ptr<i::Utf16CharacterStream> cloned = uc16_stream->Clone();
793 5 : CHECK_EQ(2, resource.LockDepth());
794 : uc16_stream = std::move(cloned);
795 5 : CHECK_EQ(1, resource.LockDepth());
796 :
797 5 : TestCloneCharacterStream(one_byte_source, uc16_stream.get(), length);
798 :
799 : // This avoids the GC from trying to free a stack allocated resource.
800 10 : if (uc16_string->IsExternalString())
801 : i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
802 10 : ->SetResource(isolate, nullptr);
803 : }
804 :
805 : // 1-byte external string
806 : i::Vector<const uint8_t> one_byte_vector =
807 5 : i::OneByteVector(one_byte_source, static_cast<int>(length));
808 : i::Handle<i::String> one_byte_string =
809 10 : factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
810 : {
811 : TestExternalOneByteResource one_byte_resource(one_byte_source, length);
812 : i::Handle<i::String> ext_one_byte_string(
813 : factory->NewExternalStringFromOneByte(&one_byte_resource)
814 10 : .ToHandleChecked());
815 : std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
816 5 : i::ScannerStream::For(isolate, ext_one_byte_string, 0, length));
817 5 : TestCloneCharacterStream(one_byte_source, one_byte_stream.get(), length);
818 : // This avoids the GC from trying to free a stack allocated resource.
819 10 : if (ext_one_byte_string->IsExternalString())
820 : i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
821 10 : ->SetResource(isolate, nullptr);
822 : }
823 :
824 : // Relocatinable streams aren't clonable.
825 : {
826 : std::unique_ptr<i::Utf16CharacterStream> string_stream(
827 5 : i::ScannerStream::For(isolate, one_byte_string, 0, length));
828 5 : CHECK(!string_stream->can_be_cloned());
829 :
830 : i::Handle<i::String> two_byte_string =
831 10 : factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
832 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
833 5 : i::ScannerStream::For(isolate, two_byte_string, 0, length));
834 5 : CHECK(!two_byte_string_stream->can_be_cloned());
835 : }
836 :
837 : // Chunk sources currently not cloneable.
838 : {
839 5 : const char* chunks[] = {"1234", "\0"};
840 5 : ChunkSource chunk_source(chunks);
841 : std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
842 : i::ScannerStream::For(&chunk_source,
843 5 : v8::ScriptCompiler::StreamedSource::ONE_BYTE));
844 5 : CHECK(!one_byte_streaming_stream->can_be_cloned());
845 :
846 : std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
847 : i::ScannerStream::For(&chunk_source,
848 5 : v8::ScriptCompiler::StreamedSource::UTF8));
849 5 : CHECK(!utf8_streaming_stream->can_be_cloned());
850 :
851 : std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
852 : i::ScannerStream::For(&chunk_source,
853 5 : v8::ScriptCompiler::StreamedSource::TWO_BYTE));
854 5 : CHECK(!two_byte_streaming_stream->can_be_cloned());
855 5 : }
856 85016 : }
|