Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/heap/factory-inl.h"
6 : #include "src/objects-inl.h"
7 : #include "src/parsing/scanner-character-streams.h"
8 : #include "src/parsing/scanner.h"
9 : #include "test/cctest/cctest.h"
10 :
11 : namespace {
12 :
13 : // Implement ExternalSourceStream based on const char**.
14 : // This will take each string as one chunk. The last chunk must be empty.
15 : class ChunkSource : public v8::ScriptCompiler::ExternalSourceStream {
16 : public:
17 750 : explicit ChunkSource(const char** chunks) : current_(0) {
18 795 : do {
19 2385 : chunks_.push_back(
20 795 : {reinterpret_cast<const uint8_t*>(*chunks), strlen(*chunks)});
21 795 : chunks++;
22 795 : } while (chunks_.back().len > 0);
23 250 : }
24 210 : explicit ChunkSource(const char* chunks) : current_(0) {
25 175 : do {
26 525 : chunks_.push_back(
27 175 : {reinterpret_cast<const uint8_t*>(chunks), strlen(chunks)});
28 175 : chunks += strlen(chunks) + 1;
29 175 : } while (chunks_.back().len > 0);
30 70 : }
31 175 : ChunkSource(const uint8_t* data, size_t char_size, size_t len,
32 : bool extra_chunky)
33 350 : : current_(0) {
34 : // If extra_chunky, we'll use increasingly large chunk sizes. If not, we'll
35 : // have a single chunk of full length. Make sure that chunks are always
36 : // aligned to char-size though.
37 175 : size_t chunk_size = extra_chunky ? char_size : len;
38 3335 : for (size_t i = 0; i < len; i += chunk_size, chunk_size += char_size) {
39 4740 : chunks_.push_back({data + i, i::Min(chunk_size, len - i)});
40 : }
41 350 : chunks_.push_back({nullptr, 0});
42 175 : }
43 990 : ~ChunkSource() override = default;
44 0 : bool SetBookmark() override { return false; }
45 0 : void ResetToBookmark() override {}
46 2620 : size_t GetMoreData(const uint8_t** src) override {
47 : DCHECK_LT(current_, chunks_.size());
48 2620 : Chunk& next = chunks_[current_++];
49 2620 : uint8_t* chunk = new uint8_t[next.len];
50 2620 : if (next.len > 0) {
51 2225 : i::MemMove(chunk, next.ptr, next.len);
52 : }
53 2620 : *src = chunk;
54 2620 : return next.len;
55 : }
56 :
57 : private:
58 : struct Chunk {
59 : const uint8_t* ptr;
60 : size_t len;
61 : };
62 : std::vector<Chunk> chunks_;
63 : size_t current_;
64 : };
65 :
66 : // Checks that Lock() / Unlock() pairs are balanced. Not thread-safe.
67 : class LockChecker {
68 : public:
69 70 : LockChecker() : lock_depth_(0) {}
70 70 : ~LockChecker() { CHECK_EQ(0, lock_depth_); }
71 :
72 110 : void Lock() const { lock_depth_++; }
73 :
74 : void Unlock() const {
75 110 : CHECK_GT(lock_depth_, 0);
76 110 : lock_depth_--;
77 : }
78 :
79 65 : bool IsLocked() const { return lock_depth_ > 0; }
80 :
81 10 : int LockDepth() const { return lock_depth_; }
82 :
83 : protected:
84 : mutable int lock_depth_;
85 : };
86 :
87 70 : class TestExternalResource : public v8::String::ExternalStringResource,
88 : public LockChecker {
89 : public:
90 : explicit TestExternalResource(uint16_t* data, int length)
91 35 : : LockChecker(), data_(data), length_(static_cast<size_t>(length)) {}
92 :
93 30 : const uint16_t* data() const override {
94 30 : CHECK(IsLocked());
95 30 : return data_;
96 : }
97 :
98 65 : size_t length() const override { return length_; }
99 :
100 30 : bool IsCacheable() const override { return false; }
101 80 : void Lock() const override { LockChecker::Lock(); }
102 80 : void Unlock() const override { LockChecker::Unlock(); }
103 :
104 : private:
105 : uint16_t* data_;
106 : size_t length_;
107 : };
108 :
109 70 : class TestExternalOneByteResource
110 : : public v8::String::ExternalOneByteStringResource,
111 : public LockChecker {
112 : public:
113 : TestExternalOneByteResource(const char* data, size_t length)
114 35 : : data_(data), length_(length) {}
115 :
116 30 : const char* data() const override {
117 30 : CHECK(IsLocked());
118 30 : return data_;
119 : }
120 65 : size_t length() const override { return length_; }
121 :
122 30 : bool IsCacheable() const override { return false; }
123 70 : void Lock() const override { LockChecker::Lock(); }
124 70 : void Unlock() const override { LockChecker::Unlock(); }
125 :
126 : private:
127 : const char* data_;
128 : size_t length_;
129 : };
130 :
// Reference text containing UTF-8 sequences of every length (1 to 4 bytes).
// The literals are kept separate so that no hex escape can absorb a following
// hex-digit character such as 'd'.
const char unicode_utf8[] =
    "abc"               // three 1-byte (ASCII) characters
    "\xc3\xa4"          // 2 bytes: U+00E4 (228), a with umlaut
    "\xe2\xa8\xa0"      // 3 bytes: U+2A20 (10784), a math symbol
    "\xf0\x9f\x92\xa9"  // 4 bytes: U+1F4A9 (128169), which needs the UTF-16
                        // surrogate pair 0xD83D 0xDCA9 (55357 56489)
    "def";              // three more 1-byte (ASCII) characters

// The same text as UTF-16 code units, terminated by 0.
const uint16_t unicode_ucs2[] = {
    0x0061, 0x0062, 0x0063,  // "abc"
    0x00E4,                  // U+00E4
    0x2A20,                  // U+2A20
    0xD83D, 0xDCA9,          // surrogate pair for U+1F4A9
    0x0064, 0x0065, 0x0066,  // "def"
    0x0000,                  // terminator
};
141 :
142 35 : i::Handle<i::String> NewExternalTwoByteStringFromResource(
143 : i::Isolate* isolate, TestExternalResource* resource) {
144 : i::Factory* factory = isolate->factory();
145 : // String creation accesses the resource.
146 : resource->Lock();
147 : i::Handle<i::String> uc16_string(
148 70 : factory->NewExternalStringFromTwoByte(resource).ToHandleChecked());
149 : resource->Unlock();
150 35 : return uc16_string;
151 : }
152 :
153 : } // anonymous namespace
154 :
155 26644 : TEST(Utf8StreamAsciiOnly) {
156 5 : const char* chunks[] = {"abc", "def", "ghi", ""};
157 5 : ChunkSource chunk_source(chunks);
158 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
159 : v8::internal::ScannerStream::For(
160 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
161 :
162 : // Read the data without dying.
163 : v8::internal::uc32 c;
164 : do {
165 : c = stream->Advance();
166 50 : } while (c != v8::internal::Utf16CharacterStream::kEndOfInput);
167 5 : }
168 :
169 26644 : TEST(Utf8StreamMaxNonSurrogateCharCode) {
170 5 : const char* chunks[] = {"\uffff\uffff", ""};
171 5 : ChunkSource chunk_source(chunks);
172 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
173 : v8::internal::ScannerStream::For(
174 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
175 :
176 : // Read the correct character.
177 : uint16_t max = unibrow::Utf16::kMaxNonSurrogateCharCode;
178 5 : CHECK_EQ(max, static_cast<uint32_t>(stream->Advance()));
179 5 : CHECK_EQ(max, static_cast<uint32_t>(stream->Advance()));
180 5 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
181 5 : }
182 :
183 26644 : TEST(Utf8StreamBOM) {
184 : // Construct test string w/ UTF-8 BOM (byte order mark)
185 5 : char data[3 + arraysize(unicode_utf8)] = {"\xef\xbb\xbf"};
186 : strncpy(data + 3, unicode_utf8, arraysize(unicode_utf8));
187 :
188 5 : const char* chunks[] = {data, "\0"};
189 5 : ChunkSource chunk_source(chunks);
190 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
191 : v8::internal::ScannerStream::For(
192 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
193 :
194 : // Read the data without tripping over the BOM.
195 105 : for (size_t i = 0; unicode_ucs2[i]; i++) {
196 50 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
197 : }
198 5 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput, stream->Advance());
199 :
200 : // Make sure seek works.
201 5 : stream->Seek(0);
202 5 : CHECK_EQ(unicode_ucs2[0], stream->Advance());
203 :
204 5 : stream->Seek(5);
205 5 : CHECK_EQ(unicode_ucs2[5], stream->Advance());
206 :
207 : // Try again, but make sure we have to seek 'backwards'.
208 25 : while (v8::internal::Utf16CharacterStream::kEndOfInput != stream->Advance()) {
209 : // Do nothing. We merely advance the stream to the end of its input.
210 : }
211 5 : stream->Seek(5);
212 5 : CHECK_EQ(unicode_ucs2[5], stream->Advance());
213 5 : }
214 :
215 26644 : TEST(Utf8SplitBOM) {
216 : // Construct chunks with a BOM split into two chunks.
217 5 : char partial_bom[] = "\xef\xbb";
218 5 : char data[1 + arraysize(unicode_utf8)] = {"\xbf"};
219 : strncpy(data + 1, unicode_utf8, arraysize(unicode_utf8));
220 :
221 : {
222 5 : const char* chunks[] = {partial_bom, data, "\0"};
223 5 : ChunkSource chunk_source(chunks);
224 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
225 : v8::internal::ScannerStream::For(
226 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
227 :
228 : // Read the data without tripping over the BOM.
229 105 : for (size_t i = 0; unicode_ucs2[i]; i++) {
230 50 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
231 : }
232 : }
233 :
234 : // And now with single-byte BOM chunks.
235 5 : char bom_byte_1[] = "\xef";
236 5 : char bom_byte_2[] = "\xbb";
237 : {
238 5 : const char* chunks[] = {bom_byte_1, bom_byte_2, data, "\0"};
239 5 : ChunkSource chunk_source(chunks);
240 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
241 : v8::internal::ScannerStream::For(
242 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
243 :
244 : // Read the data without tripping over the BOM.
245 105 : for (size_t i = 0; unicode_ucs2[i]; i++) {
246 50 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
247 : }
248 : }
249 5 : }
250 :
251 26644 : TEST(Utf8SplitMultiBOM) {
252 : // Construct chunks with a split BOM followed by another split BOM.
253 : const char* chunks = "\xef\xbb\0\xbf\xef\xbb\0\xbf\0\0";
254 5 : ChunkSource chunk_source(chunks);
255 : std::unique_ptr<i::Utf16CharacterStream> stream(
256 : v8::internal::ScannerStream::For(
257 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
258 :
259 : // Read the data, ensuring we get exactly one of the two BOMs back.
260 5 : CHECK_EQ(0xFEFF, stream->Advance());
261 5 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
262 5 : }
263 :
264 26644 : TEST(Utf8AdvanceUntil) {
265 : // Test utf-8 advancing until a certain char.
266 :
267 : const char line_term = '\n';
268 : const size_t kLen = arraysize(unicode_utf8);
269 : char data[kLen + 1];
270 : strncpy(data, unicode_utf8, kLen);
271 5 : data[kLen - 1] = line_term;
272 5 : data[kLen] = '\0';
273 :
274 : {
275 5 : const char* chunks[] = {data, "\0"};
276 5 : ChunkSource chunk_source(chunks);
277 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
278 : v8::internal::ScannerStream::For(
279 5 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
280 :
281 : int32_t res = stream->AdvanceUntil(
282 55 : [](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
283 5 : CHECK_EQ(line_term, res);
284 : }
285 5 : }
286 :
287 26644 : TEST(AdvanceMatchAdvanceUntil) {
288 : // Test if single advance and advanceUntil behave the same
289 :
290 5 : char data[] = {'a', 'b', '\n', 'c', '\0'};
291 :
292 : {
293 5 : const char* chunks[] = {data, "\0"};
294 5 : ChunkSource chunk_source_a(chunks);
295 :
296 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance(
297 : v8::internal::ScannerStream::For(
298 5 : &chunk_source_a, v8::ScriptCompiler::StreamedSource::UTF8));
299 :
300 5 : ChunkSource chunk_source_au(chunks);
301 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream_advance_until(
302 : v8::internal::ScannerStream::For(
303 5 : &chunk_source_au, v8::ScriptCompiler::StreamedSource::UTF8));
304 :
305 : int32_t au_c0_ = stream_advance_until->AdvanceUntil(
306 15 : [](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
307 :
308 : int32_t a_c0_ = '0';
309 55 : while (!unibrow::IsLineTerminator(a_c0_)) {
310 : a_c0_ = stream_advance->Advance();
311 : }
312 :
313 : // Check both advances methods have the same output
314 5 : CHECK_EQ(a_c0_, au_c0_);
315 :
316 : // Check if both set the cursor to the correct position by advancing both
317 : // streams by one character.
318 : a_c0_ = stream_advance->Advance();
319 : au_c0_ = stream_advance_until->Advance();
320 5 : CHECK_EQ(a_c0_, au_c0_);
321 : }
322 5 : }
323 :
324 26644 : TEST(Utf8AdvanceUntilOverChunkBoundaries) {
325 : // Test utf-8 advancing until a certain char, crossing chunk boundaries.
326 :
327 : // Split the test string at each byte and pass it to the stream. This way,
328 : // we'll have a split at each possible boundary.
329 : size_t len = strlen(unicode_utf8);
330 : char buffer[arraysize(unicode_utf8) + 4];
331 145 : for (size_t i = 1; i < len; i++) {
332 : // Copy source string into buffer, splitting it at i.
333 : // Then add three chunks, 0..i-1, i..strlen-1, empty.
334 : strncpy(buffer, unicode_utf8, i);
335 70 : strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
336 70 : buffer[i] = '\0';
337 70 : buffer[len + 1] = '\n';
338 70 : buffer[len + 2] = '\0';
339 70 : buffer[len + 3] = '\0';
340 70 : const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
341 :
342 70 : ChunkSource chunk_source(chunks);
343 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
344 : v8::internal::ScannerStream::For(
345 70 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
346 :
347 : int32_t res = stream->AdvanceUntil(
348 770 : [](int32_t c0_) { return unibrow::IsLineTerminator(c0_); });
349 140 : CHECK_EQ(buffer[len + 1], res);
350 : }
351 5 : }
352 :
353 26644 : TEST(Utf8ChunkBoundaries) {
354 : // Test utf-8 parsing at chunk boundaries.
355 :
356 : // Split the test string at each byte and pass it to the stream. This way,
357 : // we'll have a split at each possible boundary.
358 : size_t len = strlen(unicode_utf8);
359 : char buffer[arraysize(unicode_utf8) + 3];
360 145 : for (size_t i = 1; i < len; i++) {
361 : // Copy source string into buffer, splitting it at i.
362 : // Then add three chunks, 0..i-1, i..strlen-1, empty.
363 : strncpy(buffer, unicode_utf8, i);
364 70 : strncpy(buffer + i + 1, unicode_utf8 + i, len - i);
365 70 : buffer[i] = '\0';
366 70 : buffer[len + 1] = '\0';
367 70 : buffer[len + 2] = '\0';
368 70 : const char* chunks[] = {buffer, buffer + i + 1, buffer + len + 2};
369 :
370 70 : ChunkSource chunk_source(chunks);
371 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
372 : v8::internal::ScannerStream::For(
373 70 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
374 :
375 1470 : for (size_t i = 0; unicode_ucs2[i]; i++) {
376 700 : CHECK_EQ(unicode_ucs2[i], stream->Advance());
377 : }
378 70 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
379 : stream->Advance());
380 : }
381 5 : }
382 :
383 26644 : TEST(Utf8SingleByteChunks) {
384 : // Have each byte as a single-byte chunk.
385 : size_t len = strlen(unicode_utf8);
386 : char buffer[arraysize(unicode_utf8) + 4];
387 135 : for (size_t i = 1; i < len - 1; i++) {
388 : // Copy source string into buffer, make a single-byte chunk at i.
389 : strncpy(buffer, unicode_utf8, i);
390 65 : strncpy(buffer + i + 3, unicode_utf8 + i + 1, len - i - 1);
391 65 : buffer[i] = '\0';
392 65 : buffer[i + 1] = unicode_utf8[i];
393 65 : buffer[i + 2] = '\0';
394 65 : buffer[len + 2] = '\0';
395 65 : buffer[len + 3] = '\0';
396 : const char* chunks[] = {buffer, buffer + i + 1, buffer + i + 3,
397 65 : buffer + len + 3};
398 :
399 65 : ChunkSource chunk_source(chunks);
400 : std::unique_ptr<v8::internal::Utf16CharacterStream> stream(
401 : v8::internal::ScannerStream::For(
402 65 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
403 :
404 1365 : for (size_t j = 0; unicode_ucs2[j]; j++) {
405 650 : CHECK_EQ(unicode_ucs2[j], stream->Advance());
406 : }
407 65 : CHECK_EQ(v8::internal::Utf16CharacterStream::kEndOfInput,
408 : stream->Advance());
409 : }
410 5 : }
411 :
412 : #define CHECK_EQU(v1, v2) CHECK_EQ(static_cast<int>(v1), static_cast<int>(v2))
413 :
414 290 : void TestCharacterStream(const char* reference, i::Utf16CharacterStream* stream,
415 : unsigned length, unsigned start, unsigned end) {
416 : // Read streams one char at a time
417 : unsigned i;
418 520750 : for (i = start; i < end; i++) {
419 260230 : CHECK_EQU(i, stream->pos());
420 260230 : CHECK_EQU(reference[i], stream->Advance());
421 : }
422 290 : CHECK_EQU(end, stream->pos());
423 290 : CHECK_EQU(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
424 290 : CHECK_EQU(end + 1, stream->pos());
425 290 : stream->Back();
426 :
427 : // Pushback, re-read, pushback again.
428 204210 : while (i > end / 4) {
429 203920 : int32_t c0 = reference[i - 1];
430 203920 : CHECK_EQU(i, stream->pos());
431 203920 : stream->Back();
432 : i--;
433 203920 : CHECK_EQU(i, stream->pos());
434 : int32_t c1 = stream->Advance();
435 : i++;
436 203920 : CHECK_EQU(i, stream->pos());
437 203920 : CHECK_EQ(c0, c1);
438 203920 : stream->Back();
439 : i--;
440 203920 : CHECK_EQU(i, stream->pos());
441 : }
442 :
443 : // Seek + read streams one char at a time.
444 290 : unsigned halfway = end / 2;
445 290 : stream->Seek(stream->pos() + halfway - i);
446 272250 : for (i = halfway; i < end; i++) {
447 135980 : CHECK_EQU(i, stream->pos());
448 135980 : CHECK_EQU(reference[i], stream->Advance());
449 : }
450 290 : CHECK_EQU(i, stream->pos());
451 290 : CHECK_LT(stream->Advance(), 0);
452 :
453 : // Seek back, then seek beyond end of stream.
454 290 : stream->Seek(start);
455 290 : if (start < length) {
456 480 : CHECK_EQU(stream->Advance(), reference[start]);
457 : } else {
458 50 : CHECK_LT(stream->Advance(), 0);
459 : }
460 290 : stream->Seek(length + 5);
461 290 : CHECK_LT(stream->Advance(), 0);
462 290 : }
463 :
464 10 : void TestCloneCharacterStream(const char* reference,
465 : i::Utf16CharacterStream* stream,
466 : unsigned length) {
467 10 : std::unique_ptr<i::Utf16CharacterStream> clone = stream->Clone();
468 :
469 : unsigned i;
470 10 : unsigned halfway = length / 2;
471 : // Advance original half way.
472 90 : for (i = 0; i < halfway; i++) {
473 40 : CHECK_EQU(i, stream->pos());
474 40 : CHECK_EQU(reference[i], stream->Advance());
475 : }
476 :
477 : // Test advancing original stream didn't affect the clone.
478 10 : TestCharacterStream(reference, clone.get(), length, 0, length);
479 :
480 : // Test advancing clone didn't affect original stream.
481 10 : TestCharacterStream(reference, stream, length, i, length);
482 10 : }
483 :
484 : #undef CHECK_EQU
485 :
486 30 : void TestCharacterStreams(const char* one_byte_source, unsigned length,
487 : unsigned start = 0, unsigned end = 0) {
488 30 : if (end == 0) end = length;
489 :
490 : i::Isolate* isolate = CcTest::i_isolate();
491 : i::Factory* factory = isolate->factory();
492 :
493 : // 2-byte external string
494 30 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
495 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
496 30 : static_cast<int>(length));
497 : {
498 82120 : for (unsigned i = 0; i < length; i++) {
499 82090 : uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
500 : }
501 : TestExternalResource resource(uc16_buffer.get(), length);
502 : i::Handle<i::String> uc16_string(
503 30 : NewExternalTwoByteStringFromResource(isolate, &resource));
504 : std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
505 30 : i::ScannerStream::For(isolate, uc16_string, start, end));
506 30 : TestCharacterStream(one_byte_source, uc16_stream.get(), length, start, end);
507 :
508 : // This avoids the GC from trying to free a stack allocated resource.
509 30 : if (uc16_string->IsExternalString())
510 : i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
511 25 : ->SetResource(isolate, nullptr);
512 : }
513 :
514 : // 1-byte external string
515 : i::Vector<const uint8_t> one_byte_vector =
516 : i::OneByteVector(one_byte_source, static_cast<int>(length));
517 : i::Handle<i::String> one_byte_string =
518 60 : factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
519 : {
520 : TestExternalOneByteResource one_byte_resource(one_byte_source, length);
521 : i::Handle<i::String> ext_one_byte_string(
522 60 : factory->NewExternalStringFromOneByte(&one_byte_resource)
523 : .ToHandleChecked());
524 : std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
525 30 : i::ScannerStream::For(isolate, ext_one_byte_string, start, end));
526 : TestCharacterStream(one_byte_source, one_byte_stream.get(), length, start,
527 30 : end);
528 : // This avoids the GC from trying to free a stack allocated resource.
529 30 : if (ext_one_byte_string->IsExternalString())
530 : i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
531 25 : ->SetResource(isolate, nullptr);
532 : }
533 :
534 : // 1-byte generic i::String
535 : {
536 : std::unique_ptr<i::Utf16CharacterStream> string_stream(
537 30 : i::ScannerStream::For(isolate, one_byte_string, start, end));
538 : TestCharacterStream(one_byte_source, string_stream.get(), length, start,
539 30 : end);
540 : }
541 :
542 : // 2-byte generic i::String
543 : {
544 : i::Handle<i::String> two_byte_string =
545 60 : factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
546 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
547 30 : i::ScannerStream::For(isolate, two_byte_string, start, end));
548 : TestCharacterStream(one_byte_source, two_byte_string_stream.get(), length,
549 30 : start, end);
550 : }
551 :
552 : // Streaming has no notion of start/end, so let's skip streaming tests for
553 : // these cases.
554 30 : if (start != 0 || end != length) return;
555 :
556 : // 1-byte streaming stream, single + many chunks.
557 : {
558 : const uint8_t* data = one_byte_vector.begin();
559 : const uint8_t* data_end = one_byte_vector.end();
560 :
561 25 : ChunkSource single_chunk(data, 1, data_end - data, false);
562 : std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
563 : i::ScannerStream::For(&single_chunk,
564 25 : v8::ScriptCompiler::StreamedSource::ONE_BYTE));
565 : TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
566 25 : length, start, end);
567 :
568 25 : ChunkSource many_chunks(data, 1, data_end - data, true);
569 25 : one_byte_streaming_stream.reset(i::ScannerStream::For(
570 : &many_chunks, v8::ScriptCompiler::StreamedSource::ONE_BYTE));
571 : TestCharacterStream(one_byte_source, one_byte_streaming_stream.get(),
572 25 : length, start, end);
573 : }
574 :
575 : // UTF-8 streaming stream, single + many chunks.
576 : {
577 : const uint8_t* data = one_byte_vector.begin();
578 : const uint8_t* data_end = one_byte_vector.end();
579 25 : ChunkSource chunks(data, 1, data_end - data, false);
580 : std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
581 : i::ScannerStream::For(&chunks,
582 25 : v8::ScriptCompiler::StreamedSource::UTF8));
583 : TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
584 25 : start, end);
585 :
586 25 : ChunkSource many_chunks(data, 1, data_end - data, true);
587 25 : utf8_streaming_stream.reset(i::ScannerStream::For(
588 : &many_chunks, v8::ScriptCompiler::StreamedSource::UTF8));
589 : TestCharacterStream(one_byte_source, utf8_streaming_stream.get(), length,
590 25 : start, end);
591 : }
592 :
593 : // 2-byte streaming stream, single + many chunks.
594 : {
595 : const uint8_t* data =
596 : reinterpret_cast<const uint8_t*>(two_byte_vector.begin());
597 : const uint8_t* data_end =
598 : reinterpret_cast<const uint8_t*>(two_byte_vector.end());
599 25 : ChunkSource chunks(data, 2, data_end - data, false);
600 : std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
601 : i::ScannerStream::For(&chunks,
602 25 : v8::ScriptCompiler::StreamedSource::TWO_BYTE));
603 : TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
604 25 : length, start, end);
605 :
606 25 : ChunkSource many_chunks(data, 2, data_end - data, true);
607 25 : two_byte_streaming_stream.reset(i::ScannerStream::For(
608 : &many_chunks, v8::ScriptCompiler::StreamedSource::TWO_BYTE));
609 : TestCharacterStream(one_byte_source, two_byte_streaming_stream.get(),
610 25 : length, start, end);
611 : }
612 : }
613 :
614 26644 : TEST(CharacterStreams) {
615 5 : v8::Isolate* isolate = CcTest::isolate();
616 10 : v8::HandleScope handles(isolate);
617 5 : v8::Local<v8::Context> context = v8::Context::New(isolate);
618 : v8::Context::Scope context_scope(context);
619 :
620 5 : TestCharacterStreams("abcdefghi", 9);
621 5 : TestCharacterStreams("abc\0\n\r\x7f", 7);
622 5 : TestCharacterStreams("\0", 1);
623 5 : TestCharacterStreams("", 0);
624 :
625 : // 4k large buffer.
626 : char buffer[4096 + 1];
627 40975 : for (unsigned i = 0; i < arraysize(buffer); i++) {
628 20485 : buffer[i] = static_cast<char>(i & 0x7F);
629 : }
630 5 : buffer[arraysize(buffer) - 1] = '\0';
631 5 : TestCharacterStreams(buffer, arraysize(buffer) - 1);
632 5 : TestCharacterStreams(buffer, arraysize(buffer) - 1, 576, 3298);
633 5 : }
634 :
635 : // Regression test for crbug.com/651333. Read invalid utf-8.
636 26644 : TEST(Regress651333) {
637 : const uint8_t bytes[] =
638 : "A\xf1"
639 5 : "ad"; // Anad, with n == n-with-tilde.
640 5 : const uint16_t unicode[] = {65, 65533, 97, 100};
641 :
642 : // Run the test for all sub-strings 0..N of bytes, to make sure we hit the
643 : // error condition in and at chunk boundaries.
644 55 : for (size_t len = 0; len < arraysize(bytes); len++) {
645 : // Read len bytes from bytes, and compare against the expected unicode
646 : // characters. Expect kBadChar ( == Unicode replacement char == code point
647 : // 65533) instead of the incorrectly coded Latin1 char.
648 25 : ChunkSource chunks(bytes, 1, len, false);
649 : std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
650 25 : &chunks, v8::ScriptCompiler::StreamedSource::UTF8));
651 125 : for (size_t i = 0; i < len; i++) {
652 100 : CHECK_EQ(unicode[i], stream->Advance());
653 : }
654 25 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
655 : }
656 5 : }
657 :
658 15 : void TestChunkStreamAgainstReference(
659 : const char* cases[],
660 : const std::vector<std::vector<uint16_t>>& unicode_expected) {
661 145 : for (size_t c = 0; c < unicode_expected.size(); ++c) {
662 65 : ChunkSource chunk_source(cases[c]);
663 : std::unique_ptr<i::Utf16CharacterStream> stream(i::ScannerStream::For(
664 65 : &chunk_source, v8::ScriptCompiler::StreamedSource::UTF8));
665 545 : for (size_t i = 0; i < unicode_expected[c].size(); i++) {
666 480 : CHECK_EQ(unicode_expected[c][i], stream->Advance());
667 : }
668 65 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
669 65 : stream->Seek(0);
670 545 : for (size_t i = 0; i < unicode_expected[c].size(); i++) {
671 480 : CHECK_EQ(unicode_expected[c][i], stream->Advance());
672 : }
673 65 : CHECK_EQ(i::Utf16CharacterStream::kEndOfInput, stream->Advance());
674 : }
675 15 : }
676 :
677 26644 : TEST(Regress6377) {
678 : const char* cases[] = {
679 : "\xf0\x90\0" // first chunk - start of 4-byte seq
680 : "\x80\x80" // second chunk - end of 4-byte seq
681 : "a\0", // and an 'a'
682 :
683 : "\xe0\xbf\0" // first chunk - start of 3-byte seq
684 : "\xbf" // second chunk - one-byte end of 3-byte seq
685 : "a\0", // and an 'a'
686 :
687 : "\xc3\0" // first chunk - start of 2-byte seq
688 : "\xbf" // second chunk - end of 2-byte seq
689 : "a\0", // and an 'a'
690 :
691 : "\xf0\x90\x80\0" // first chunk - start of 4-byte seq
692 : "\x80" // second chunk - one-byte end of 4-byte seq
693 : "a\xc3\0" // and an 'a' + start of 2-byte seq
694 : "\xbf\0", // third chunk - end of 2-byte seq
695 5 : };
696 : const std::vector<std::vector<uint16_t>> unicode_expected = {
697 : {0xD800, 0xDC00, 97}, {0xFFF, 97}, {0xFF, 97}, {0xD800, 0xDC00, 97, 0xFF},
698 10 : };
699 5 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
700 5 : TestChunkStreamAgainstReference(cases, unicode_expected);
701 5 : }
702 :
703 26644 : TEST(Regress6836) {
704 : const char* cases[] = {
705 : // 0xC2 is a lead byte, but there's no continuation. The bug occurs when
706 : // this happens near the chunk end.
707 : "X\xc2Y\0",
708 : // Last chunk ends with a 2-byte char lead.
709 : "X\xc2\0",
710 : // Last chunk ends with a 3-byte char lead and only one continuation
711 : // character.
712 : "X\xe0\xbf\0",
713 5 : };
714 : const std::vector<std::vector<uint16_t>> unicode_expected = {
715 : {0x58, 0xFFFD, 0x59}, {0x58, 0xFFFD}, {0x58, 0xFFFD},
716 10 : };
717 5 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
718 5 : TestChunkStreamAgainstReference(cases, unicode_expected);
719 5 : }
720 :
721 26644 : TEST(TestOverlongAndInvalidSequences) {
722 : const char* cases[] = {
723 : // Overlong 2-byte sequence.
724 : "X\xc0\xbfY\0",
725 : // Another overlong 2-byte sequence.
726 : "X\xc1\xbfY\0",
727 : // Overlong 3-byte sequence.
728 : "X\xe0\x9f\xbfY\0",
729 : // Overlong 4-byte sequence.
730 : "X\xf0\x89\xbf\xbfY\0",
731 : // Invalid 3-byte sequence (reserved for surrogates).
732 : "X\xed\xa0\x80Y\0",
733 : // Invalid 4-bytes sequence (value out of range).
734 : "X\xf4\x90\x80\x80Y\0",
735 5 : };
736 : const std::vector<std::vector<uint16_t>> unicode_expected = {
737 : {0x58, 0xFFFD, 0xFFFD, 0x59},
738 : {0x58, 0xFFFD, 0xFFFD, 0x59},
739 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
740 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
741 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
742 : {0x58, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0x59},
743 10 : };
744 5 : CHECK_EQ(unicode_expected.size(), arraysize(cases));
745 5 : TestChunkStreamAgainstReference(cases, unicode_expected);
746 5 : }
747 :
748 26644 : TEST(RelocatingCharacterStream) {
749 : ManualGCScope manual_gc_scope;
750 5 : CcTest::InitializeVM();
751 : i::Isolate* i_isolate = CcTest::i_isolate();
752 10 : v8::HandleScope scope(CcTest::isolate());
753 :
754 : const char* string = "abcd";
755 : int length = static_cast<int>(strlen(string));
756 5 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
757 45 : for (int i = 0; i < length; i++) {
758 40 : uc16_buffer[i] = string[i];
759 : }
760 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(), length);
761 : i::Handle<i::String> two_byte_string =
762 : i_isolate->factory()
763 10 : ->NewStringFromTwoByte(two_byte_vector, i::AllocationType::kYoung)
764 : .ToHandleChecked();
765 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
766 5 : i::ScannerStream::For(i_isolate, two_byte_string, 0, length));
767 5 : CHECK_EQ('a', two_byte_string_stream->Advance());
768 5 : CHECK_EQ('b', two_byte_string_stream->Advance());
769 5 : CHECK_EQ(size_t{2}, two_byte_string_stream->pos());
770 : i::String raw = *two_byte_string;
771 : i_isolate->heap()->CollectGarbage(i::NEW_SPACE,
772 5 : i::GarbageCollectionReason::kUnknown);
773 : // GC moved the string.
774 5 : CHECK_NE(raw, *two_byte_string);
775 5 : CHECK_EQ('c', two_byte_string_stream->Advance());
776 5 : CHECK_EQ('d', two_byte_string_stream->Advance());
777 5 : }
778 :
779 26644 : TEST(CloneCharacterStreams) {
780 10 : v8::HandleScope handles(CcTest::isolate());
781 5 : v8::Local<v8::Context> context = v8::Context::New(CcTest::isolate());
782 : v8::Context::Scope context_scope(context);
783 :
784 : i::Isolate* isolate = CcTest::i_isolate();
785 : i::Factory* factory = isolate->factory();
786 :
787 : const char* one_byte_source = "abcdefghi";
788 : unsigned length = static_cast<unsigned>(strlen(one_byte_source));
789 :
790 : // Check that cloning a character stream does not update
791 :
792 : // 2-byte external string
793 5 : std::unique_ptr<i::uc16[]> uc16_buffer(new i::uc16[length]);
794 : i::Vector<const i::uc16> two_byte_vector(uc16_buffer.get(),
795 : static_cast<int>(length));
796 : {
797 95 : for (unsigned i = 0; i < length; i++) {
798 90 : uc16_buffer[i] = static_cast<i::uc16>(one_byte_source[i]);
799 : }
800 : TestExternalResource resource(uc16_buffer.get(), length);
801 : i::Handle<i::String> uc16_string(
802 5 : NewExternalTwoByteStringFromResource(isolate, &resource));
803 : std::unique_ptr<i::Utf16CharacterStream> uc16_stream(
804 5 : i::ScannerStream::For(isolate, uc16_string, 0, length));
805 :
806 5 : CHECK(resource.IsLocked());
807 5 : CHECK_EQ(1, resource.LockDepth());
808 5 : std::unique_ptr<i::Utf16CharacterStream> cloned = uc16_stream->Clone();
809 5 : CHECK_EQ(2, resource.LockDepth());
810 : uc16_stream = std::move(cloned);
811 5 : CHECK_EQ(1, resource.LockDepth());
812 :
813 5 : TestCloneCharacterStream(one_byte_source, uc16_stream.get(), length);
814 :
815 : // This avoids the GC from trying to free a stack allocated resource.
816 5 : if (uc16_string->IsExternalString())
817 : i::Handle<i::ExternalTwoByteString>::cast(uc16_string)
818 5 : ->SetResource(isolate, nullptr);
819 : }
820 :
821 : // 1-byte external string
822 : i::Vector<const uint8_t> one_byte_vector =
823 5 : i::OneByteVector(one_byte_source, static_cast<int>(length));
824 : i::Handle<i::String> one_byte_string =
825 10 : factory->NewStringFromOneByte(one_byte_vector).ToHandleChecked();
826 : {
827 : TestExternalOneByteResource one_byte_resource(one_byte_source, length);
828 : i::Handle<i::String> ext_one_byte_string(
829 10 : factory->NewExternalStringFromOneByte(&one_byte_resource)
830 : .ToHandleChecked());
831 : std::unique_ptr<i::Utf16CharacterStream> one_byte_stream(
832 5 : i::ScannerStream::For(isolate, ext_one_byte_string, 0, length));
833 5 : TestCloneCharacterStream(one_byte_source, one_byte_stream.get(), length);
834 : // This avoids the GC from trying to free a stack allocated resource.
835 5 : if (ext_one_byte_string->IsExternalString())
836 : i::Handle<i::ExternalOneByteString>::cast(ext_one_byte_string)
837 5 : ->SetResource(isolate, nullptr);
838 : }
839 :
840 : // Relocatinable streams aren't clonable.
841 : {
842 : std::unique_ptr<i::Utf16CharacterStream> string_stream(
843 5 : i::ScannerStream::For(isolate, one_byte_string, 0, length));
844 5 : CHECK(!string_stream->can_be_cloned());
845 :
846 : i::Handle<i::String> two_byte_string =
847 10 : factory->NewStringFromTwoByte(two_byte_vector).ToHandleChecked();
848 : std::unique_ptr<i::Utf16CharacterStream> two_byte_string_stream(
849 5 : i::ScannerStream::For(isolate, two_byte_string, 0, length));
850 5 : CHECK(!two_byte_string_stream->can_be_cloned());
851 : }
852 :
853 : // Chunk sources currently not cloneable.
854 : {
855 5 : const char* chunks[] = {"1234", "\0"};
856 5 : ChunkSource chunk_source(chunks);
857 : std::unique_ptr<i::Utf16CharacterStream> one_byte_streaming_stream(
858 : i::ScannerStream::For(&chunk_source,
859 5 : v8::ScriptCompiler::StreamedSource::ONE_BYTE));
860 5 : CHECK(!one_byte_streaming_stream->can_be_cloned());
861 :
862 : std::unique_ptr<i::Utf16CharacterStream> utf8_streaming_stream(
863 : i::ScannerStream::For(&chunk_source,
864 5 : v8::ScriptCompiler::StreamedSource::UTF8));
865 5 : CHECK(!utf8_streaming_stream->can_be_cloned());
866 :
867 : std::unique_ptr<i::Utf16CharacterStream> two_byte_streaming_stream(
868 : i::ScannerStream::For(&chunk_source,
869 5 : v8::ScriptCompiler::StreamedSource::TWO_BYTE));
870 5 : CHECK(!two_byte_streaming_stream->can_be_cloned());
871 : }
872 79922 : }
|