Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/parsing/scanner-character-streams.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "include/v8.h"
11 : #include "src/counters.h"
12 : #include "src/globals.h"
13 : #include "src/handles.h"
14 : #include "src/objects-inl.h"
15 : #include "src/parsing/scanner.h"
16 : #include "src/unicode-inl.h"
17 :
18 : namespace v8 {
19 : namespace internal {
20 :
21 : class ScopedExternalStringLock {
22 : public:
23 3768 : explicit ScopedExternalStringLock(ExternalString string) {
24 : DCHECK(!string.is_null());
25 3770 : if (string->IsExternalOneByteString()) {
26 3708 : resource_ = ExternalOneByteString::cast(string)->resource();
27 : } else {
28 : DCHECK(string->IsExternalTwoByteString());
29 62 : resource_ = ExternalTwoByteString::cast(string)->resource();
30 : }
31 : DCHECK(resource_);
32 3770 : resource_->Lock();
33 3770 : }
34 :
35 : // Copying a lock increases the locking depth.
36 : ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT
37 95 : : resource_(other.resource_) {
38 95 : resource_->Lock();
39 : }
40 :
41 3860 : ~ScopedExternalStringLock() { resource_->Unlock(); }
42 :
43 : private:
44 : // Not nullptr.
45 : const v8::String::ExternalStringResourceBase* resource_;
46 : };
47 :
48 : namespace {
// Code point U+FEFF (byte order mark). In this file it is compared against
// decoded values returned by unibrow::Utf8::ValueOfIncremental, not against
// raw UTF-8 bytes.
49 : const unibrow::uchar kUtf8Bom = 0xFEFF;
50 : } // namespace
51 :
52 : template <typename Char>
53 : struct CharTraits;
54 :
55 : template <>
56 : struct CharTraits<uint8_t> {
57 : using String = SeqOneByteString;
58 : using ExternalString = ExternalOneByteString;
59 : };
60 :
61 : template <>
62 : struct CharTraits<uint16_t> {
63 : using String = SeqTwoByteString;
64 : using ExternalString = ExternalTwoByteString;
65 : };
66 :
// A [start, end) pair of raw pointers into contiguous Char data. The struct
// only stores the two pointers; it neither allocates nor frees anything.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of Chars between start and end. Const so it can be called on a
  // const Range as well.
  size_t length() const { return static_cast<size_t>(end - start); }

  // True when the address of start is not a multiple of sizeof(Char). For
  // one-byte Chars this is always false.
  bool unaligned_start() const {
    return reinterpret_cast<uintptr_t>(start) % sizeof(Char) != 0;
  }
};
77 :
78 : // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
79 : template <typename Char>
80 : class OnHeapStream {
81 : public:
82 : using String = typename CharTraits<Char>::String;
83 :
84 : OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
85 2975881 : : string_(string), start_offset_(start_offset), length_(end) {}
86 :
87 0 : OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) {
88 0 : UNREACHABLE();
89 : }
90 :
91 : // The no_gc argument is only here because of the templated way this class
92 : // is used along with other implementations that require V8 heap access.
93 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
94 : DisallowHeapAllocation* no_gc) {
95 8849376 : return {&string_->GetChars(*no_gc)[start_offset_ + Min(length_, pos)],
96 8849376 : &string_->GetChars(*no_gc)[start_offset_ + length_]};
97 : }
98 :
99 : static const bool kCanBeCloned = false;
100 : static const bool kCanAccessHeap = true;
101 :
102 : private:
103 : Handle<String> string_;
104 : const size_t start_offset_;
105 : const size_t length_;
106 : };
107 :
108 : // A Char stream backed by an off-heap ExternalOneByteString or
109 : // ExternalTwoByteString.
110 : template <typename Char>
111 3861 : class ExternalStringStream {
112 : using ExternalString = typename CharTraits<Char>::ExternalString;
113 :
114 : public:
115 3768 : ExternalStringStream(ExternalString string, size_t start_offset,
116 : size_t length)
117 : : lock_(string),
118 3769 : data_(string->GetChars() + start_offset),
119 7537 : length_(length) {}
120 :
121 : ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT
122 : : lock_(other.lock_),
123 : data_(other.data_),
124 95 : length_(other.length_) {}
125 :
126 : // The no_gc argument is only here because of the templated way this class
127 : // is used along with other implementations that require V8 heap access.
128 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
129 : DisallowHeapAllocation* no_gc = nullptr) {
130 64973 : return {&data_[Min(length_, pos)], &data_[length_]};
131 : }
132 :
133 : static const bool kCanBeCloned = true;
134 : static const bool kCanAccessHeap = false;
135 :
136 : private:
137 : ScopedExternalStringLock lock_;
138 : const Char* const data_;
139 : const size_t length_;
140 : };
141 :
142 : // A Char stream backed by a C array. Testing only.
143 : template <typename Char>
144 : class TestingStream {
145 : public:
146 : TestingStream(const Char* data, size_t length)
147 1746 : : data_(data), length_(length) {}
148 : // The no_gc argument is only here because of the templated way this class
149 : // is used along with other implementations that require V8 heap access.
150 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
151 : DisallowHeapAllocation* no_gc = nullptr) {
152 7150 : return {&data_[Min(length_, pos)], &data_[length_]};
153 : }
154 :
155 : static const bool kCanBeCloned = true;
156 : static const bool kCanAccessHeap = false;
157 :
158 : private:
159 : const Char* const data_;
160 : const size_t length_;
161 : };
162 :
163 : // A Char stream backed by multiple source-stream provided off-heap chunks.
164 : template <typename Char>
165 : class ChunkedStream {
166 : public:
167 : explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)
168 150 : : source_(source) {}
169 :
170 0 : ChunkedStream(const ChunkedStream&) V8_NOEXCEPT {
171 : // TODO(rmcilroy): Implement cloning for chunked streams.
172 0 : UNREACHABLE();
173 : }
174 :
175 : // The no_gc argument is only here because of the templated way this class
176 : // is used along with other implementations that require V8 heap access.
177 63820 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
178 : DisallowHeapAllocation* no_gc = nullptr) {
179 63820 : Chunk chunk = FindChunk(pos, stats);
180 63820 : size_t buffer_end = chunk.length;
181 63820 : size_t buffer_pos = Min(buffer_end, pos - chunk.position);
182 63820 : return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
183 : }
184 :
185 150 : ~ChunkedStream() {
186 1435 : for (Chunk& chunk : chunks_) delete[] chunk.data;
187 300 : }
188 :
189 : static const bool kCanBeCloned = false;
190 : static const bool kCanAccessHeap = false;
191 :
192 : private:
193 : struct Chunk {
194 : Chunk(const Char* const data, size_t position, size_t length)
195 1285 : : data(data), position(position), length(length) {}
196 : const Char* const data;
197 : // The logical position of data.
198 : const size_t position;
199 : const size_t length;
200 64965 : size_t end_position() const { return position + length; }
201 : };
202 :
203 63820 : Chunk FindChunk(size_t position, RuntimeCallStats* stats) {
204 63960 : while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats);
205 :
206 : // Walk forwards while the position is in front of the current chunk.
207 66110 : while (position >= chunks_.back().end_position() &&
208 : chunks_.back().length > 0) {
209 1145 : FetchChunk(chunks_.back().end_position(), stats);
210 : }
211 :
212 : // Walk backwards.
213 763800 : for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
214 : ++reverse_it) {
215 763800 : if (reverse_it->position <= position) return *reverse_it;
216 : }
217 :
218 0 : UNREACHABLE();
219 : }
220 :
221 1285 : virtual void ProcessChunk(const uint8_t* data, size_t position,
222 : size_t length) {
223 : // Incoming data has to be aligned to Char size.
224 : DCHECK_EQ(0, length % sizeof(Char));
225 1285 : chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
226 : length / sizeof(Char));
227 1285 : }
228 :
229 1285 : void FetchChunk(size_t position, RuntimeCallStats* stats) {
230 1285 : const uint8_t* data = nullptr;
231 : size_t length;
232 : {
233 : RuntimeCallTimerScope scope(stats,
234 1285 : RuntimeCallCounterId::kGetMoreDataCallback);
235 1285 : length = source_->GetMoreData(&data);
236 : }
237 1285 : ProcessChunk(data, position, length);
238 1285 : }
239 :
240 : ScriptCompiler::ExternalSourceStream* source_;
241 :
242 : protected:
243 : std::vector<struct Chunk> chunks_;
244 : };
245 :
246 : // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
247 : // Chars are buffered if either the underlying stream isn't utf-16 or the
248 : // underlying utf-16 stream might move (is on-heap).
249 : template <template <typename T> class ByteStream>
250 5945929 : class BufferedCharacterStream : public Utf16CharacterStream {
251 : public:
252 : template <class... TArgs>
253 2972892 : BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
254 2971050 : buffer_pos_ = pos;
255 : }
256 :
257 70 : bool can_be_cloned() const final {
258 70 : return ByteStream<uint16_t>::kCanBeCloned;
259 : }
260 :
261 85 : std::unique_ptr<Utf16CharacterStream> Clone() const override {
262 0 : CHECK(can_be_cloned());
263 : return std::unique_ptr<Utf16CharacterStream>(
264 170 : new BufferedCharacterStream<ByteStream>(*this));
265 : }
266 :
267 : protected:
268 8904508 : bool ReadBlock() final {
269 : size_t position = pos();
270 8904508 : buffer_pos_ = position;
271 8904508 : buffer_start_ = &buffer_[0];
272 8904508 : buffer_cursor_ = buffer_start_;
273 :
274 : DisallowHeapAllocation no_gc;
275 : Range<uint8_t> range =
276 32010 : byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
277 8904508 : if (range.length() == 0) {
278 2396714 : buffer_end_ = buffer_start_;
279 2396714 : return false;
280 : }
281 :
282 : size_t length = Min(kBufferSize, range.length());
283 : i::CopyCharsUnsigned(buffer_, range.start, length);
284 6507794 : buffer_end_ = &buffer_[length];
285 6507794 : return true;
286 : }
287 :
288 3422 : bool can_access_heap() const final {
289 3422 : return ByteStream<uint8_t>::kCanAccessHeap;
290 : }
291 :
292 : private:
293 : BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
294 85 : : byte_stream_(other.byte_stream_) {}
295 :
296 : static const size_t kBufferSize = 512;
297 : uc16 buffer_[kBufferSize];
298 : ByteStream<uint8_t> byte_stream_;
299 : };
300 :
301 : // Provides a unbuffered utf-16 view on the bytes from the underlying
302 : // ByteStream.
303 : template <template <typename T> class ByteStream>
304 17330 : class UnbufferedCharacterStream : public Utf16CharacterStream {
305 : public:
306 : template <class... TArgs>
307 8655 : UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
308 8600 : buffer_pos_ = pos;
309 : }
310 :
311 310 : bool can_access_heap() const final {
312 310 : return ByteStream<uint16_t>::kCanAccessHeap;
313 : }
314 :
315 5 : bool can_be_cloned() const final {
316 5 : return ByteStream<uint16_t>::kCanBeCloned;
317 : }
318 :
319 10 : std::unique_ptr<Utf16CharacterStream> Clone() const override {
320 : return std::unique_ptr<Utf16CharacterStream>(
321 20 : new UnbufferedCharacterStream<ByteStream>(*this));
322 : }
323 :
324 : protected:
325 77236 : bool ReadBlock() final {
326 : size_t position = pos();
327 77236 : buffer_pos_ = position;
328 : DisallowHeapAllocation no_gc;
329 : Range<uint16_t> range =
330 31810 : byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
331 77236 : buffer_start_ = range.start;
332 77236 : buffer_end_ = range.end;
333 77236 : buffer_cursor_ = buffer_start_;
334 77236 : if (range.length() == 0) return false;
335 :
336 : DCHECK(!range.unaligned_start());
337 : DCHECK_LE(buffer_start_, buffer_end_);
338 68344 : return true;
339 : }
340 :
341 : UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
342 10 : : byte_stream_(other.byte_stream_) {}
343 :
344 : ByteStream<uint16_t> byte_stream_;
345 : };
346 :
347 : // Provides a unbuffered utf-16 view on the bytes from the underlying
348 : // ByteStream.
349 : class RelocatingCharacterStream
350 : : public UnbufferedCharacterStream<OnHeapStream> {
351 : public:
352 : template <class... TArgs>
353 : RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
354 : : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
355 8538 : isolate_(isolate) {
356 8538 : isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,
357 : v8::kGCTypeAll, this);
358 : }
359 :
360 : private:
361 17076 : ~RelocatingCharacterStream() final {
362 8538 : isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
363 8538 : this);
364 8538 : }
365 :
366 0 : static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,
367 : v8::GCType type,
368 : v8::GCCallbackFlags flags,
369 : void* stream) {
370 : reinterpret_cast<RelocatingCharacterStream*>(stream)
371 : ->UpdateBufferPointers();
372 0 : }
373 :
374 : void UpdateBufferPointers() {
375 : DisallowHeapAllocation no_gc;
376 : Range<uint16_t> range =
377 : byte_stream_.GetDataAt(0, runtime_call_stats(), &no_gc);
378 0 : if (range.start != buffer_start_) {
379 0 : buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
380 0 : buffer_start_ = range.start;
381 0 : buffer_end_ = range.end;
382 : }
383 : }
384 :
385 : Isolate* isolate_;
386 : };
387 :
388 : // ----------------------------------------------------------------------------
389 : // BufferedUtf16CharacterStreams
390 : //
391 : // A buffered character stream based on a random access character
392 : // source (ReadBlock can be called with pos() pointing to any position,
393 : // even positions before the current).
394 : //
395 : // TODO(verwaest): Remove together with Utf8 external streaming streams.
396 27514 : class BufferedUtf16CharacterStream : public Utf16CharacterStream {
397 : public:
398 : BufferedUtf16CharacterStream();
399 :
400 : protected:
401 : static const size_t kBufferSize = 512;
402 :
403 : bool ReadBlock() final;
404 :
405 : // FillBuffer should read up to kBufferSize characters at position and store
406 : // them into buffer_[0..]. It returns the number of characters stored.
407 : virtual size_t FillBuffer(size_t position) = 0;
408 :
409 : // Fixed sized buffer that this class reads from.
410 : // The base class' buffer_start_ should always point to buffer_.
411 : uc16 buffer_[kBufferSize];
412 : };
413 :
414 0 : BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
415 27514 : : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
416 :
417 428066 : bool BufferedUtf16CharacterStream::ReadBlock() {
418 : DCHECK_EQ(buffer_start_, buffer_);
419 :
420 : size_t position = pos();
421 428066 : buffer_pos_ = position;
422 428066 : buffer_cursor_ = buffer_;
423 428066 : buffer_end_ = buffer_ + FillBuffer(position);
424 : DCHECK_EQ(pos(), position);
425 : DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
426 428066 : return buffer_cursor_ < buffer_end_;
427 : }
428 :
429 : // ----------------------------------------------------------------------------
430 : // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
431 : //
432 : // This implementation is fairly complex, since data arrives in chunks which
433 : // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
434 : // character position is tricky because the byte position cannot be derived
435 : // from the character position.
436 : //
437 : // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
438 : // instead so we don't need to buffer.
439 :
440 : class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
441 : public:
442 : Utf8ExternalStreamingStream(
443 : ScriptCompiler::ExternalSourceStream* source_stream)
444 : : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
445 27514 : source_stream_(source_stream) {}
446 41271 : ~Utf8ExternalStreamingStream() final {
447 41998 : for (const Chunk& chunk : chunks_) delete[] chunk.data;
448 27514 : }
449 :
450 83 : bool can_access_heap() const final { return false; }
451 :
452 16 : bool can_be_cloned() const final { return false; }
453 :
454 0 : std::unique_ptr<Utf16CharacterStream> Clone() const override {
455 0 : UNREACHABLE();
456 : }
457 :
458 : protected:
459 : size_t FillBuffer(size_t position) final;
460 :
461 : private:
462 : // A position within the data stream. It stores:
463 : // - The 'physical' position (# of bytes in the stream),
464 : // - the 'logical' position (# of ucs-2 characters, also within the stream),
465 : // - a possibly incomplete utf-8 char at the current 'physical' position.
466 : struct StreamPosition {
467 : size_t bytes;
468 : size_t chars;
469 : uint32_t incomplete_char;
470 : unibrow::Utf8::State state;
471 : };
472 :
473 : // Position contains a StreamPosition and the index of the chunk the position
474 : // points into. (The chunk_no could be derived from pos, but that'd be
475 : // an expensive search through all chunks.)
476 : struct Position {
477 : size_t chunk_no;
478 : StreamPosition pos;
479 : };
480 :
481 : // A chunk in the list of chunks, containing:
482 : // - The chunk data (data pointer and length), and
483 : // - the position at the first byte of the chunk.
484 : struct Chunk {
485 : const uint8_t* data;
486 : size_t length;
487 : StreamPosition start;
488 : };
489 :
490 : // Within the current chunk, skip forward from current_ towards position.
491 : bool SkipToPosition(size_t position);
492 : // Within the current chunk, fill the buffer_ (while it has capacity).
493 : void FillBufferFromCurrentChunk();
494 : // Fetch a new chunk (assuming current_ is at the end of the current data).
495 : bool FetchChunk();
496 : // Search through the chunks and set current_ to point to the given position.
497 : // (This call is potentially expensive.)
498 : void SearchPosition(size_t position);
499 :
500 : std::vector<Chunk> chunks_;
501 : Position current_;
502 : ScriptCompiler::ExternalSourceStream* source_stream_;
503 : };
504 :
// Decodes forward inside the chunk at current_.chunk_no until
// current_.pos.chars reaches |position| or the chunk's bytes are used up,
// then stores the resulting byte/char position and decoder state back into
// current_. Returns true iff current_.pos.chars == position afterwards.
// Code points above kMaxNonSurrogateCharCode count as two chars.
505 40 : bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
506 : DCHECK_LE(current_.pos.chars, position); // We can only skip forward.
507 :
508 : // Already there? Then return immediately.
509 40 : if (current_.pos.chars == position) return true;
510 :
511 5 : const Chunk& chunk = chunks_[current_.chunk_no];
512 : DCHECK(current_.pos.bytes >= chunk.start.bytes);
513 :
// The decoder state is seeded from the chunk start, not from current_.pos.
// NOTE(review): the callers in this file set current_ to the chunk's start
// (or have just fetched the chunk) before calling, so the two agree there.
514 5 : unibrow::Utf8::State state = chunk.start.state;
515 5 : uint32_t incomplete_char = chunk.start.incomplete_char;
516 5 : size_t it = current_.pos.bytes - chunk.start.bytes;
517 5 : const uint8_t* cursor = &chunk.data[it];
518 5 : const uint8_t* end = &chunk.data[chunk.length];
519 :
520 : size_t chars = current_.pos.chars;
521 :
// At the very beginning of the stream (fewer than 3 bytes consumed, no chars
// yet): decode the first complete character and count it unless it is a BOM.
522 5 : if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
523 15 : while (cursor < end) {
524 : unibrow::uchar t =
525 15 : unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
526 15 : if (t == unibrow::Utf8::kIncomplete) continue;
527 5 : if (t != kUtf8Bom) {
528 0 : chars++;
529 0 : if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
530 : }
531 : break;
532 : }
533 : }
534 :
// Main skip loop: one decoded character per iteration, two chars for values
// above kMaxNonSurrogateCharCode.
535 45 : while (cursor < end && chars < position) {
536 : unibrow::uchar t =
537 40 : unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
538 40 : if (t != unibrow::Utf8::kIncomplete) {
539 25 : chars++;
540 25 : if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
541 : }
542 : }
543 :
544 5 : current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
545 5 : current_.pos.chars = chars;
546 5 : current_.pos.incomplete_char = incomplete_char;
547 5 : current_.pos.state = state;
// Move on to the next chunk index once every byte of this chunk is consumed.
548 5 : current_.chunk_no += (cursor == end);
549 :
550 5 : return current_.pos.chars == position;
551 : }
552 :
// Decodes bytes of the chunk at current_.chunk_no, starting at current_.pos,
// into buffer_ (appending at buffer_end_) until the chunk is exhausted or
// fewer than two free slots remain in buffer_. Updates current_ and
// buffer_end_ to reflect what was consumed and produced.
553 427886 : void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
554 : DCHECK_LT(current_.chunk_no, chunks_.size());
555 : DCHECK_EQ(buffer_start_, buffer_cursor_);
556 : DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
557 :
558 427886 : const Chunk& chunk = chunks_[current_.chunk_no];
559 :
560 : // The buffer_ is writable, but buffer_*_ members are const. So we get a
561 : // non-const pointer into buffer that points to the same char as buffer_end_.
562 427886 : uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
563 : DCHECK_EQ(output_cursor, buffer_end_);
564 :
565 427886 : unibrow::Utf8::State state = current_.pos.state;
566 427886 : uint32_t incomplete_char = current_.pos.incomplete_char;
567 :
568 : // If the current chunk is the last (empty) chunk we'll have to process
569 : // any left-over, partial characters.
570 427886 : if (chunk.length == 0) {
571 13579 : unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
572 13579 : if (t != unibrow::Utf8::kBufferEmpty) {
573 : DCHECK_EQ(t, unibrow::Utf8::kBadChar);
574 25 : *output_cursor = static_cast<uc16>(t);
575 25 : buffer_end_++;
576 25 : current_.pos.chars++;
577 25 : current_.pos.incomplete_char = 0;
578 25 : current_.pos.state = state;
579 : }
580 13579 : return;
581 : }
582 :
583 414307 : size_t it = current_.pos.bytes - chunk.start.bytes;
584 414307 : const uint8_t* cursor = chunk.data + it;
585 414307 : const uint8_t* end = chunk.data + chunk.length;
586 :
587 : // Deal with possible BOM.
// Only at the very beginning of the stream: decode the first complete
// character and emit it unless it equals kUtf8Bom.
588 414307 : if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
589 13987 : while (cursor < end) {
590 : unibrow::uchar t =
591 13947 : unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
592 13947 : if (V8_LIKELY(t < kUtf8Bom)) {
593 13797 : *(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
594 150 : } else if (t == unibrow::Utf8::kIncomplete) {
595 : continue;
596 40 : } else if (t == kUtf8Bom) {
597 : // BOM detected at beginning of the stream. Don't copy it.
598 15 : } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
599 5 : *(output_cursor++) = static_cast<uc16>(t);
600 : } else {
601 10 : *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
602 20 : *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
603 : }
604 : break;
605 : }
606 : }
607 :
// The "+ 1" keeps room for two output slots per iteration, since one decoded
// character may be written as a lead/trail surrogate pair.
608 200123159 : while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
609 : unibrow::uchar t =
610 199708852 : unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
611 199708852 : if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
612 199431744 : *(output_cursor++) = static_cast<uc16>(t); // The most frequent case.
613 277108 : } else if (t == unibrow::Utf8::kIncomplete) {
614 : continue;
615 : } else {
616 347 : *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
617 694 : *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
618 : }
619 : }
620 :
// chars advances by the number of uc16 values written in this call.
621 414307 : current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
622 414307 : current_.pos.chars += (output_cursor - buffer_end_);
623 414307 : current_.pos.incomplete_char = incomplete_char;
624 414307 : current_.pos.state = state;
625 414307 : current_.chunk_no += (cursor == end);
626 :
627 414307 : buffer_end_ = output_cursor;
628 : }
629 :
630 28241 : bool Utf8ExternalStreamingStream::FetchChunk() {
631 : RuntimeCallTimerScope scope(runtime_call_stats(),
632 28241 : RuntimeCallCounterId::kGetMoreDataCallback);
633 : DCHECK_EQ(current_.chunk_no, chunks_.size());
634 : DCHECK(chunks_.empty() || chunks_.back().length != 0);
635 :
636 28241 : const uint8_t* chunk = nullptr;
637 28241 : size_t length = source_stream_->GetMoreData(&chunk);
638 56482 : chunks_.push_back({chunk, length, current_.pos});
639 56482 : return length > 0;
640 : }
641 :
// Sets current_ so that it points at character |position|, fetching further
// chunks if the position lies beyond the data received so far. If the data
// ends before |position|, current_ is left pointing at the terminating
// (zero-length) chunk.
642 428066 : void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
643 : // If current_ already points to the right position, we're done.
644 : //
645 : // This is expected to be the common case, since we typically call
646 : // FillBuffer right after the current buffer.
647 428066 : if (current_.pos.chars == position) return;
648 :
649 : // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
650 31165 : if (chunks_.empty()) {
651 : DCHECK_EQ(current_.chunk_no, 0u);
652 : DCHECK_EQ(current_.pos.bytes, 0u);
653 : DCHECK_EQ(current_.pos.chars, 0u);
654 0 : FetchChunk();
655 : }
656 :
657 : // Search for the last chunk whose start position is less or equal to
658 : // position.
659 31165 : size_t chunk_no = chunks_.size() - 1;
660 1091390 : while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
661 348230 : chunk_no--;
662 : }
663 :
664 : // Did we find the terminating (zero-length) chunk? Then we're seeking
665 : // behind the end of the data, and position does not exist.
666 : // Set current_ to point to the terminating chunk.
667 31165 : if (chunks_[chunk_no].length == 0) {
668 100 : current_ = {chunk_no, chunks_[chunk_no].start};
669 100 : return;
670 : }
671 :
672 : // Did we find the non-last chunk? Then our position must be within chunk_no.
673 31065 : if (chunk_no + 1 < chunks_.size()) {
674 : // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
675 : // (Many web sites declare utf-8 encoding, but use only (or almost only) the
676 : // ASCII subset for their JavaScript sources. We can exploit this, by
677 : // checking whether the # bytes in a chunk are equal to the # chars, and if
678 : // so avoid the expensive SkipToPosition.)
// The byte and char spans of the chunk are derived from the start position
// of the following chunk.
679 : bool ascii_only_chunk =
680 62110 : chunks_[chunk_no].start.incomplete_char == 0 &&
681 31045 : (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
682 31045 : (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
683 31065 : if (ascii_only_chunk) {
// One byte per char, so the same offset applies to bytes and chars.
684 31025 : size_t skip = position - chunks_[chunk_no].start.chars;
685 : current_ = {chunk_no,
686 31025 : {chunks_[chunk_no].start.bytes + skip,
687 : chunks_[chunk_no].start.chars + skip, 0,
688 31025 : unibrow::Utf8::State::kAccept}};
689 : } else {
690 40 : current_ = {chunk_no, chunks_[chunk_no].start};
691 40 : SkipToPosition(position);
692 : }
693 :
694 : // Since position was within the chunk, SkipToPosition should have found
695 : // something.
696 : DCHECK_EQ(position, current_.pos.chars);
697 : return;
698 : }
699 :
700 : // What's left: We're in the last, non-terminating chunk. Our position
701 : // may be in the chunk, but it may also be in 'future' chunks, which we'll
702 : // have to obtain.
703 : DCHECK_EQ(chunk_no, chunks_.size() - 1);
704 0 : current_ = {chunk_no, chunks_[chunk_no].start};
705 : bool have_more_data = true;
706 0 : bool found = SkipToPosition(position);
// Each failed skip leaves current_.chunk_no one past the last chunk, so fetch
// the next chunk and continue skipping inside it.
707 0 : while (have_more_data && !found) {
708 : DCHECK_EQ(current_.chunk_no, chunks_.size());
709 0 : have_more_data = FetchChunk();
710 0 : found = have_more_data && SkipToPosition(position);
711 : }
712 :
713 : // We'll return with a position != the desired position only if we're out
714 : // of data. In that case, we'll point to the terminating chunk.
715 : DCHECK_EQ(found, current_.pos.chars == position);
716 : DCHECK_EQ(have_more_data, chunks_.back().length != 0);
717 : DCHECK_IMPLIES(!found, !have_more_data);
718 : DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
719 : }
720 :
// Implements BufferedUtf16CharacterStream::FillBuffer: resets the buffer,
// seeks to |position|, and decodes characters into buffer_ until at least one
// character is available or the source has no more data. Returns the number
// of characters now in the buffer.
721 428066 : size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
722 428066 : buffer_cursor_ = buffer_;
723 428066 : buffer_end_ = buffer_;
724 :
725 428066 : SearchPosition(position);
// Out of data: current_ sits on an already-fetched zero-length chunk and no
// partially decoded character is pending.
726 827976 : bool out_of_data = current_.chunk_no != chunks_.size() &&
727 828251 : chunks_[current_.chunk_no].length == 0 &&
728 275 : current_.pos.incomplete_char == 0;
729 :
730 428066 : if (out_of_data) return 0;
731 :
732 : // Fill the buffer, until we have at least one char (or are out of data).
733 : // (The embedder might give us 1-byte blocks within a utf-8 char, so we
734 : // can't guarantee progress with one chunk. Thus we iterate.)
735 1283573 : while (!out_of_data && buffer_cursor_ == buffer_end_) {
736 : // At end of current data, but there might be more? Then fetch it.
737 855772 : if (current_.chunk_no == chunks_.size()) {
738 28241 : out_of_data = !FetchChunk();
739 : }
// Runs even when the fetched chunk was empty, so that a pending partial
// character is still flushed into the buffer.
740 427886 : FillBufferFromCurrentChunk();
741 : }
742 :
743 : DCHECK_EQ(current_.pos.chars - position,
744 : static_cast<size_t>(buffer_end_ - buffer_cursor_));
745 427801 : return buffer_end_ - buffer_cursor_;
746 : }
747 :
748 : // ----------------------------------------------------------------------------
749 : // ScannerStream: Create stream instances.
750 :
751 2259303 : Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
752 : Handle<String> data) {
753 2259303 : return ScannerStream::For(isolate, data, 0, data->length());
754 : }
755 :
756 2979643 : Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
757 : int start_pos, int end_pos) {
758 : DCHECK_GE(start_pos, 0);
759 : DCHECK_LE(start_pos, end_pos);
760 : DCHECK_LE(end_pos, data->length());
761 : size_t start_offset = 0;
762 2979643 : if (data->IsSlicedString()) {
763 : SlicedString string = SlicedString::cast(*data);
764 542 : start_offset = string->offset();
765 : String parent = string->parent();
766 542 : if (parent->IsThinString()) parent = ThinString::cast(parent)->actual();
767 : data = handle(parent, isolate);
768 : } else {
769 2979101 : data = String::Flatten(isolate, data);
770 : }
771 2979645 : if (data->IsExternalOneByteString()) {
772 : return new BufferedCharacterStream<ExternalStringStream>(
773 : static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data),
774 3707 : start_offset, static_cast<size_t>(end_pos));
775 2975938 : } else if (data->IsExternalTwoByteString()) {
776 : return new UnbufferedCharacterStream<ExternalStringStream>(
777 : static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data),
778 62 : start_offset, static_cast<size_t>(end_pos));
779 2975876 : } else if (data->IsSeqOneByteString()) {
780 : return new BufferedCharacterStream<OnHeapStream>(
781 : static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
782 2967338 : start_offset, static_cast<size_t>(end_pos));
783 8538 : } else if (data->IsSeqTwoByteString()) {
784 : return new RelocatingCharacterStream(
785 : isolate, static_cast<size_t>(start_pos),
786 : Handle<SeqTwoByteString>::cast(data), start_offset,
787 8538 : static_cast<size_t>(end_pos));
788 : } else {
789 0 : UNREACHABLE();
790 : }
791 : }
792 :
793 451 : std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
794 : const char* data) {
795 451 : return ScannerStream::ForTesting(data, strlen(data));
796 : }
797 :
798 1746 : std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
799 : const char* data, size_t length) {
800 : return std::unique_ptr<Utf16CharacterStream>(
801 : new BufferedCharacterStream<TestingStream>(
802 : static_cast<size_t>(0), reinterpret_cast<const uint8_t*>(data),
803 3492 : static_cast<size_t>(length)));
804 : }
805 :
806 13907 : Utf16CharacterStream* ScannerStream::For(
807 : ScriptCompiler::ExternalSourceStream* source_stream,
808 : v8::ScriptCompiler::StreamedSource::Encoding encoding) {
809 13907 : switch (encoding) {
810 : case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
811 : return new UnbufferedCharacterStream<ChunkedStream>(
812 110 : static_cast<size_t>(0), source_stream);
813 : case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
814 : return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
815 190 : source_stream);
816 : case v8::ScriptCompiler::StreamedSource::UTF8:
817 27514 : return new Utf8ExternalStreamingStream(source_stream);
818 : }
819 0 : UNREACHABLE();
820 : }
821 :
822 : } // namespace internal
823 122036 : } // namespace v8
|