Line data Source code
1 : // Copyright 2011 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/parsing/scanner-character-streams.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "include/v8.h"
11 : #include "src/counters.h"
12 : #include "src/globals.h"
13 : #include "src/handles.h"
14 : #include "src/objects-inl.h"
15 : #include "src/parsing/scanner.h"
16 : #include "src/unicode-inl.h"
17 :
18 : namespace v8 {
19 : namespace internal {
20 :
21 : class ScopedExternalStringLock {
22 : public:
23 3715 : explicit ScopedExternalStringLock(ExternalString string) {
24 : DCHECK(!string.is_null());
25 3719 : if (string->IsExternalOneByteString()) {
26 3657 : resource_ = ExternalOneByteString::cast(string)->resource();
27 : } else {
28 : DCHECK(string->IsExternalTwoByteString());
29 62 : resource_ = ExternalTwoByteString::cast(string)->resource();
30 : }
31 : DCHECK(resource_);
32 3719 : resource_->Lock();
33 3714 : }
34 :
35 : // Copying a lock increases the locking depth.
36 : ScopedExternalStringLock(const ScopedExternalStringLock& other)
37 95 : : resource_(other.resource_) {
38 95 : resource_->Lock();
39 : }
40 :
41 3801 : ~ScopedExternalStringLock() { resource_->Unlock(); }
42 :
43 : private:
44 : // Not nullptr.
45 : const v8::String::ExternalStringResourceBase* resource_;
46 : };
47 :
48 : namespace {
49 : const unibrow::uchar kUtf8Bom = 0xFEFF;
50 : } // namespace
51 :
52 : template <typename Char>
53 : struct CharTraits;
54 :
55 : template <>
56 : struct CharTraits<uint8_t> {
57 : typedef SeqOneByteString String;
58 : typedef ExternalOneByteString ExternalString;
59 : };
60 :
61 : template <>
62 : struct CharTraits<uint16_t> {
63 : typedef SeqTwoByteString String;
64 : typedef ExternalTwoByteString ExternalString;
65 : };
66 :
// A half-open span [start, end) of characters of type Char. The span does not
// own the memory it points into.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of characters between |start| and |end|. Callable on const ranges.
  size_t length() const { return static_cast<size_t>(end - start); }

  // Returns true if |start| is not aligned to a multiple of sizeof(Char).
  // (For one-byte characters this is always false.)
  bool unaligned_start() const {
    return reinterpret_cast<intptr_t>(start) % sizeof(Char) != 0;
  }
};
78 : // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
79 : template <typename Char>
80 : class OnHeapStream {
81 : public:
82 : typedef typename CharTraits<Char>::String String;
83 :
84 : OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
85 2933799 : : string_(string), start_offset_(start_offset), length_(end) {}
86 :
87 0 : OnHeapStream(const OnHeapStream& other) : start_offset_(0), length_(0) {
88 0 : UNREACHABLE();
89 : }
90 :
91 : // The no_gc argument is only here because of the templated way this class
92 : // is used along with other implementations that require V8 heap access.
93 8654305 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
94 : DisallowHeapAllocation* no_gc) {
95 8654308 : return {&string_->GetChars(*no_gc)[start_offset_ + Min(length_, pos)],
96 25962931 : &string_->GetChars(*no_gc)[start_offset_ + length_]};
97 : }
98 :
99 : static const bool kCanBeCloned = false;
100 : static const bool kCanAccessHeap = true;
101 :
102 : private:
103 : Handle<String> string_;
104 : const size_t start_offset_;
105 : const size_t length_;
106 : };
107 :
108 : // A Char stream backed by an off-heap ExternalOneByteString or
109 : // ExternalTwoByteString.
110 : template <typename Char>
111 : class ExternalStringStream {
112 : typedef typename CharTraits<Char>::ExternalString ExternalString;
113 :
114 : public:
115 3718 : ExternalStringStream(ExternalString string, size_t start_offset,
116 : size_t length)
117 : : lock_(string),
118 3716 : data_(string->GetChars() + start_offset),
119 7434 : length_(length) {}
120 :
121 : ExternalStringStream(const ExternalStringStream& other)
122 95 : : lock_(other.lock_), data_(other.data_), length_(other.length_) {}
123 :
124 : // The no_gc argument is only here because of the templated way this class
125 : // is used along with other implementations that require V8 heap access.
126 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
127 : DisallowHeapAllocation* no_gc = nullptr) {
128 64839 : return {&data_[Min(length_, pos)], &data_[length_]};
129 : }
130 :
131 : static const bool kCanBeCloned = true;
132 : static const bool kCanAccessHeap = false;
133 :
134 : private:
135 : ScopedExternalStringLock lock_;
136 : const Char* const data_;
137 : const size_t length_;
138 : };
139 :
140 : // A Char stream backed by a C array. Testing only.
141 : template <typename Char>
142 : class TestingStream {
143 : public:
144 : TestingStream(const Char* data, size_t length)
145 1686 : : data_(data), length_(length) {}
146 : // The no_gc argument is only here because of the templated way this class
147 : // is used along with other implementations that require V8 heap access.
148 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
149 : DisallowHeapAllocation* no_gc = nullptr) {
150 6910 : return {&data_[Min(length_, pos)], &data_[length_]};
151 : }
152 :
153 : static const bool kCanBeCloned = true;
154 : static const bool kCanAccessHeap = false;
155 :
156 : private:
157 : const Char* const data_;
158 : const size_t length_;
159 : };
160 :
161 : // A Char stream backed by multiple source-stream provided off-heap chunks.
162 : template <typename Char>
163 : class ChunkedStream {
164 : public:
165 : explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)
166 150 : : source_(source) {}
167 :
168 0 : ChunkedStream(const ChunkedStream& other) {
169 : // TODO(rmcilroy): Implement cloning for chunked streams.
170 0 : UNREACHABLE();
171 : }
172 :
173 : // The no_gc argument is only here because of the templated way this class
174 : // is used along with other implementations that require V8 heap access.
175 63820 : Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
176 : DisallowHeapAllocation* no_gc = nullptr) {
177 63820 : Chunk chunk = FindChunk(pos, stats);
178 63820 : size_t buffer_end = chunk.length;
179 63820 : size_t buffer_pos = Min(buffer_end, pos - chunk.position);
180 63820 : return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
181 : }
182 :
183 150 : ~ChunkedStream() {
184 1585 : for (Chunk& chunk : chunks_) delete[] chunk.data;
185 300 : }
186 :
187 : static const bool kCanBeCloned = false;
188 : static const bool kCanAccessHeap = false;
189 :
190 : private:
191 : struct Chunk {
192 : Chunk(const Char* const data, size_t position, size_t length)
193 1285 : : data(data), position(position), length(length) {}
194 : const Char* const data;
195 : // The logical position of data.
196 : const size_t position;
197 : const size_t length;
198 64965 : size_t end_position() const { return position + length; }
199 : };
200 :
201 63820 : Chunk FindChunk(size_t position, RuntimeCallStats* stats) {
202 127780 : while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats);
203 :
204 : // Walk forwards while the position is in front of the current chunk.
205 129930 : while (position >= chunks_.back().end_position() &&
206 : chunks_.back().length > 0) {
207 1145 : FetchChunk(chunks_.back().end_position(), stats);
208 : }
209 :
210 : // Walk backwards.
211 763800 : for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
212 : ++reverse_it) {
213 763800 : if (reverse_it->position <= position) return *reverse_it;
214 : }
215 :
216 0 : UNREACHABLE();
217 : }
218 :
219 1285 : virtual void ProcessChunk(const uint8_t* data, size_t position,
220 : size_t length) {
221 : // Incoming data has to be aligned to Char size.
222 : DCHECK_EQ(0, length % sizeof(Char));
223 1285 : chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
224 : length / sizeof(Char));
225 1285 : }
226 :
227 1285 : void FetchChunk(size_t position, RuntimeCallStats* stats) {
228 1285 : const uint8_t* data = nullptr;
229 : size_t length;
230 : {
231 : RuntimeCallTimerScope scope(stats,
232 1285 : RuntimeCallCounterId::kGetMoreDataCallback);
233 1285 : length = source_->GetMoreData(&data);
234 : }
235 1285 : ProcessChunk(data, position, length);
236 1285 : }
237 :
238 : ScriptCompiler::ExternalSourceStream* source_;
239 :
240 : protected:
241 : std::vector<struct Chunk> chunks_;
242 : };
243 :
244 : // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
245 : // Chars are buffered if either the underlying stream isn't utf-16 or the
246 : // underlying utf-16 stream might move (is on-heap).
247 : template <template <typename T> class ByteStream>
248 5862207 : class BufferedCharacterStream : public Utf16CharacterStream {
249 : public:
250 : template <class... TArgs>
251 2931048 : BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
252 2929263 : buffer_pos_ = pos;
253 : }
254 :
255 70 : bool can_be_cloned() const final {
256 70 : return ByteStream<uint16_t>::kCanBeCloned;
257 : }
258 :
259 85 : std::unique_ptr<Utf16CharacterStream> Clone() const override {
260 0 : CHECK(can_be_cloned());
261 : return std::unique_ptr<Utf16CharacterStream>(
262 170 : new BufferedCharacterStream<ByteStream>(*this));
263 : }
264 :
265 : protected:
266 8709885 : bool ReadBlock() final {
267 32010 : size_t position = pos();
268 8709885 : buffer_pos_ = position;
269 8709885 : buffer_start_ = &buffer_[0];
270 8709885 : buffer_cursor_ = buffer_start_;
271 :
272 : DisallowHeapAllocation no_gc;
273 : Range<uint8_t> range =
274 8706430 : byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
275 8709928 : if (range.length() == 0) {
276 2413096 : buffer_end_ = buffer_start_;
277 2413096 : return false;
278 : }
279 :
280 : size_t length = Min(kBufferSize, range.length());
281 : i::CopyCharsUnsigned(buffer_, range.start, length);
282 6296832 : buffer_end_ = &buffer_[length];
283 6296832 : return true;
284 : }
285 :
286 4129 : bool can_access_heap() const final {
287 4129 : return ByteStream<uint8_t>::kCanAccessHeap;
288 : }
289 :
290 : private:
291 : BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
292 85 : : byte_stream_(other.byte_stream_) {}
293 :
294 : static const size_t kBufferSize = 512;
295 : uc16 buffer_[kBufferSize];
296 : ByteStream<uint8_t> byte_stream_;
297 : };
298 :
299 : // Provides a unbuffered utf-16 view on the bytes from the underlying
300 : // ByteStream.
301 : template <template <typename T> class ByteStream>
302 8444 : class UnbufferedCharacterStream : public Utf16CharacterStream {
303 : public:
304 : template <class... TArgs>
305 8307 : UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
306 8252 : buffer_pos_ = pos;
307 : }
308 :
309 386 : bool can_access_heap() const final {
310 386 : return ByteStream<uint16_t>::kCanAccessHeap;
311 : }
312 :
313 5 : bool can_be_cloned() const final {
314 5 : return ByteStream<uint16_t>::kCanBeCloned;
315 : }
316 :
317 10 : std::unique_ptr<Utf16CharacterStream> Clone() const override {
318 : return std::unique_ptr<Utf16CharacterStream>(
319 20 : new UnbufferedCharacterStream<ByteStream>(*this));
320 : }
321 :
322 : protected:
323 76536 : bool ReadBlock() final {
324 31810 : size_t position = pos();
325 76536 : buffer_pos_ = position;
326 : DisallowHeapAllocation no_gc;
327 : Range<uint16_t> range =
328 76536 : byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
329 76536 : buffer_start_ = range.start;
330 76536 : buffer_end_ = range.end;
331 76536 : buffer_cursor_ = buffer_start_;
332 76536 : if (range.length() == 0) return false;
333 :
334 : DCHECK(!range.unaligned_start());
335 : DCHECK_LE(buffer_start_, buffer_end_);
336 68088 : return true;
337 : }
338 :
339 0 : UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
340 10 : : byte_stream_(other.byte_stream_) {}
341 :
342 : ByteStream<uint16_t> byte_stream_;
343 : };
344 :
345 : // Provides a unbuffered utf-16 view on the bytes from the underlying
346 : // ByteStream.
347 : class RelocatingCharacterStream
348 : : public UnbufferedCharacterStream<OnHeapStream> {
349 : public:
350 : template <class... TArgs>
351 : RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
352 : : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
353 8190 : isolate_(isolate) {
354 8190 : isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,
355 8190 : v8::kGCTypeAll, this);
356 : }
357 :
358 : private:
359 8190 : ~RelocatingCharacterStream() final {
360 : isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
361 8190 : this);
362 8190 : }
363 :
364 0 : static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,
365 : v8::GCType type,
366 : v8::GCCallbackFlags flags,
367 : void* stream) {
368 : reinterpret_cast<RelocatingCharacterStream*>(stream)
369 0 : ->UpdateBufferPointers();
370 0 : }
371 :
372 0 : void UpdateBufferPointers() {
373 : DisallowHeapAllocation no_gc;
374 : Range<uint16_t> range =
375 0 : byte_stream_.GetDataAt(0, runtime_call_stats(), &no_gc);
376 0 : if (range.start != buffer_start_) {
377 0 : buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
378 0 : buffer_start_ = range.start;
379 0 : buffer_end_ = range.end;
380 : }
381 0 : }
382 :
383 : Isolate* isolate_;
384 : };
385 :
386 : // ----------------------------------------------------------------------------
387 : // BufferedUtf16CharacterStreams
388 : //
389 : // A buffered character stream based on a random access character
390 : // source (ReadBlock can be called with pos() pointing to any position,
391 : // even positions before the current).
392 : //
393 : // TODO(verwaest): Remove together with Utf8 external streaming streams.
394 13401 : class BufferedUtf16CharacterStream : public Utf16CharacterStream {
395 : public:
396 : BufferedUtf16CharacterStream();
397 :
398 : protected:
399 : static const size_t kBufferSize = 512;
400 :
401 : bool ReadBlock() final;
402 :
403 : // FillBuffer should read up to kBufferSize characters at position and store
404 : // them into buffer_[0..]. It returns the number of characters stored.
405 : virtual size_t FillBuffer(size_t position) = 0;
406 :
407 : // Fixed sized buffer that this class reads from.
408 : // The base class' buffer_start_ should always point to buffer_.
409 : uc16 buffer_[kBufferSize];
410 : };
411 :
412 0 : BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
413 26802 : : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
414 :
415 391006 : bool BufferedUtf16CharacterStream::ReadBlock() {
416 : DCHECK_EQ(buffer_start_, buffer_);
417 :
418 : size_t position = pos();
419 391006 : buffer_pos_ = position;
420 391006 : buffer_cursor_ = buffer_;
421 391006 : buffer_end_ = buffer_ + FillBuffer(position);
422 : DCHECK_EQ(pos(), position);
423 : DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
424 391006 : return buffer_cursor_ < buffer_end_;
425 : }
426 :
427 : // ----------------------------------------------------------------------------
428 : // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
429 : //
430 : // This implementation is fairly complex, since data arrives in chunks which
431 : // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
432 : // character position is tricky because the byte position cannot be dericed
433 : // from the character position.
434 : //
435 : // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
436 : // instead so we don't need to buffer.
437 :
438 : class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
439 : public:
440 : Utf8ExternalStreamingStream(
441 : ScriptCompiler::ExternalSourceStream* source_stream)
442 : : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
443 26802 : source_stream_(source_stream) {}
444 26802 : ~Utf8ExternalStreamingStream() final {
445 82028 : for (size_t i = 0; i < chunks_.size(); i++) delete[] chunks_[i].data;
446 26802 : }
447 :
448 83 : bool can_access_heap() const final { return false; }
449 :
450 16 : bool can_be_cloned() const final { return false; }
451 :
452 0 : std::unique_ptr<Utf16CharacterStream> Clone() const override {
453 0 : UNREACHABLE();
454 : }
455 :
456 : protected:
457 : size_t FillBuffer(size_t position) final;
458 :
459 : private:
460 : // A position within the data stream. It stores:
461 : // - The 'physical' position (# of bytes in the stream),
462 : // - the 'logical' position (# of ucs-2 characters, also within the stream),
463 : // - a possibly incomplete utf-8 char at the current 'physical' position.
464 : struct StreamPosition {
465 : size_t bytes;
466 : size_t chars;
467 : uint32_t incomplete_char;
468 : unibrow::Utf8::State state;
469 : };
470 :
471 : // Position contains a StreamPosition and the index of the chunk the position
472 : // points into. (The chunk_no could be derived from pos, but that'd be
473 : // an expensive search through all chunks.)
474 : struct Position {
475 : size_t chunk_no;
476 : StreamPosition pos;
477 : };
478 :
479 : // A chunk in the list of chunks, containing:
480 : // - The chunk data (data pointer and length), and
481 : // - the position at the first byte of the chunk.
482 : struct Chunk {
483 : const uint8_t* data;
484 : size_t length;
485 : StreamPosition start;
486 : };
487 :
488 : // Within the current chunk, skip forward from current_ towards position.
489 : bool SkipToPosition(size_t position);
490 : // Within the current chunk, fill the buffer_ (while it has capacity).
491 : void FillBufferFromCurrentChunk();
492 : // Fetch a new chunk (assuming current_ is at the end of the current data).
493 : bool FetchChunk();
494 : // Search through the chunks and set current_ to point to the given position.
495 : // (This call is potentially expensive.)
496 : void SearchPosition(size_t position);
497 :
498 : std::vector<Chunk> chunks_;
499 : Position current_;
500 : ScriptCompiler::ExternalSourceStream* source_stream_;
501 : };
502 :
503 40 : bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
504 : DCHECK_LE(current_.pos.chars, position); // We can only skip forward.
505 :
506 : // Already there? Then return immediately.
507 40 : if (current_.pos.chars == position) return true;
508 :
509 5 : const Chunk& chunk = chunks_[current_.chunk_no];
510 : DCHECK(current_.pos.bytes >= chunk.start.bytes);
511 :
512 5 : unibrow::Utf8::State state = chunk.start.state;
513 5 : uint32_t incomplete_char = chunk.start.incomplete_char;
514 5 : size_t it = current_.pos.bytes - chunk.start.bytes;
515 5 : size_t chars = chunk.start.chars;
516 65 : while (it < chunk.length && chars < position) {
517 : unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
518 55 : chunk.data[it], &it, &state, &incomplete_char);
519 55 : if (t == kUtf8Bom && current_.pos.chars == 0) {
520 : // BOM detected at beginning of the stream. Don't copy it.
521 50 : } else if (t != unibrow::Utf8::kIncomplete) {
522 25 : chars++;
523 25 : if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
524 : }
525 : }
526 :
527 5 : current_.pos.bytes += it;
528 5 : current_.pos.chars = chars;
529 5 : current_.pos.incomplete_char = incomplete_char;
530 5 : current_.pos.state = state;
531 5 : current_.chunk_no += (it == chunk.length);
532 :
533 5 : return current_.pos.chars == position;
534 : }
535 :
536 390834 : void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
537 : DCHECK_LT(current_.chunk_no, chunks_.size());
538 : DCHECK_EQ(buffer_start_, buffer_cursor_);
539 : DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
540 :
541 390834 : const Chunk& chunk = chunks_[current_.chunk_no];
542 :
543 : // The buffer_ is writable, but buffer_*_ members are const. So we get a
544 : // non-const pointer into buffer that points to the same char as buffer_end_.
545 390834 : uint16_t* cursor = buffer_ + (buffer_end_ - buffer_start_);
546 : DCHECK_EQ(cursor, buffer_end_);
547 :
548 390834 : unibrow::Utf8::State state = current_.pos.state;
549 390834 : uint32_t incomplete_char = current_.pos.incomplete_char;
550 :
551 : // If the current chunk is the last (empty) chunk we'll have to process
552 : // any left-over, partial characters.
553 390834 : if (chunk.length == 0) {
554 13306 : unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
555 13306 : if (t != unibrow::Utf8::kBufferEmpty) {
556 : DCHECK_EQ(t, unibrow::Utf8::kBadChar);
557 25 : *cursor = static_cast<uc16>(t);
558 25 : buffer_end_++;
559 25 : current_.pos.chars++;
560 25 : current_.pos.incomplete_char = 0;
561 25 : current_.pos.state = state;
562 : }
563 13306 : return;
564 : }
565 :
566 377528 : size_t it = current_.pos.bytes - chunk.start.bytes;
567 182713414 : while (it < chunk.length && cursor + 1 < buffer_start_ + kBufferSize) {
568 : unibrow::uchar t = unibrow::Utf8::ValueOfIncremental(
569 181958366 : chunk.data[it], &it, &state, &incomplete_char);
570 181958358 : if (V8_LIKELY(t < kUtf8Bom)) {
571 181675946 : *(cursor++) = static_cast<uc16>(t); // The by most frequent case.
572 282412 : } else if (t == unibrow::Utf8::kIncomplete) {
573 : continue;
574 1710 : } else if (t == kUtf8Bom && current_.pos.bytes + it == 3) {
575 : // BOM detected at beginning of the stream. Don't copy it.
576 1685 : } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
577 1328 : *(cursor++) = static_cast<uc16>(t);
578 : } else {
579 357 : *(cursor++) = unibrow::Utf16::LeadSurrogate(t);
580 714 : *(cursor++) = unibrow::Utf16::TrailSurrogate(t);
581 : }
582 : }
583 :
584 377520 : current_.pos.bytes = chunk.start.bytes + it;
585 377520 : current_.pos.chars += (cursor - buffer_end_);
586 377520 : current_.pos.incomplete_char = incomplete_char;
587 377520 : current_.pos.state = state;
588 377520 : current_.chunk_no += (it == chunk.length);
589 :
590 377520 : buffer_end_ = cursor;
591 : }
592 :
593 27613 : bool Utf8ExternalStreamingStream::FetchChunk() {
594 : RuntimeCallTimerScope scope(runtime_call_stats(),
595 27613 : RuntimeCallCounterId::kGetMoreDataCallback);
596 : DCHECK_EQ(current_.chunk_no, chunks_.size());
597 : DCHECK(chunks_.empty() || chunks_.back().length != 0);
598 :
599 27613 : const uint8_t* chunk = nullptr;
600 27613 : size_t length = source_stream_->GetMoreData(&chunk);
601 55226 : chunks_.push_back({chunk, length, current_.pos});
602 55226 : return length > 0;
603 : }
604 :
605 391006 : void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
606 : // If current_ already points to the right position, we're done.
607 : //
608 : // This is expected to be the common case, since we typically call
609 : // FillBuffer right after the current buffer.
610 391006 : if (current_.pos.chars == position) return;
611 :
612 : // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
613 62330 : if (chunks_.empty()) {
614 : DCHECK_EQ(current_.chunk_no, 0u);
615 : DCHECK_EQ(current_.pos.bytes, 0u);
616 : DCHECK_EQ(current_.pos.chars, 0u);
617 0 : FetchChunk();
618 : }
619 :
620 : // Search for the last chunk whose start position is less or equal to
621 : // position.
622 31165 : size_t chunk_no = chunks_.size() - 1;
623 774325 : while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
624 348230 : chunk_no--;
625 : }
626 :
627 : // Did we find the terminating (zero-length) chunk? Then we're seeking
628 : // behind the end of the data, and position does not exist.
629 : // Set current_ to point to the terminating chunk.
630 31165 : if (chunks_[chunk_no].length == 0) {
631 100 : current_ = {chunk_no, chunks_[chunk_no].start};
632 100 : return;
633 : }
634 :
635 : // Did we find the non-last chunk? Then our position must be within chunk_no.
636 31065 : if (chunk_no + 1 < chunks_.size()) {
637 : // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
638 : // (Many web sites declare utf-8 encoding, but use only (or almost only) the
639 : // ASCII subset for their JavaScript sources. We can exploit this, by
640 : // checking whether the # bytes in a chunk are equal to the # chars, and if
641 : // so avoid the expensive SkipToPosition.)
642 : bool ascii_only_chunk =
643 62110 : chunks_[chunk_no].start.incomplete_char == 0 &&
644 31045 : (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
645 31045 : (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
646 31065 : if (ascii_only_chunk) {
647 31025 : size_t skip = position - chunks_[chunk_no].start.chars;
648 : current_ = {chunk_no,
649 31025 : {chunks_[chunk_no].start.bytes + skip,
650 31025 : chunks_[chunk_no].start.chars + skip, 0,
651 31025 : unibrow::Utf8::State::kAccept}};
652 : } else {
653 40 : current_ = {chunk_no, chunks_[chunk_no].start};
654 40 : SkipToPosition(position);
655 : }
656 :
657 : // Since position was within the chunk, SkipToPosition should have found
658 : // something.
659 : DCHECK_EQ(position, current_.pos.chars);
660 : return;
661 : }
662 :
663 : // What's left: We're in the last, non-terminating chunk. Our position
664 : // may be in the chunk, but it may also be in 'future' chunks, which we'll
665 : // have to obtain.
666 : DCHECK_EQ(chunk_no, chunks_.size() - 1);
667 0 : current_ = {chunk_no, chunks_[chunk_no].start};
668 : bool have_more_data = true;
669 0 : bool found = SkipToPosition(position);
670 0 : while (have_more_data && !found) {
671 : DCHECK_EQ(current_.chunk_no, chunks_.size());
672 0 : have_more_data = FetchChunk();
673 0 : found = have_more_data && SkipToPosition(position);
674 : }
675 :
676 : // We'll return with a postion != the desired position only if we're out
677 : // of data. In that case, we'll point to the terminating chunk.
678 : DCHECK_EQ(found, current_.pos.chars == position);
679 : DCHECK_EQ(have_more_data, chunks_.back().length != 0);
680 : DCHECK_IMPLIES(!found, !have_more_data);
681 : DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
682 : }
683 :
684 391006 : size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
685 391006 : buffer_cursor_ = buffer_;
686 391006 : buffer_end_ = buffer_;
687 :
688 391006 : SearchPosition(position);
689 1145310 : bool out_of_data = current_.chunk_no != chunks_.size() &&
690 754759 : chunks_[current_.chunk_no].length == 0 &&
691 275 : current_.pos.incomplete_char == 0;
692 :
693 391006 : if (out_of_data) return 0;
694 :
695 : // Fill the buffer, until we have at least one char (or are out of data).
696 : // (The embedder might give us 1-byte blocks within a utf-8 char, so we
697 : // can't guarantee progress with one chunk. Thus we iterate.)
698 781567 : while (!out_of_data && buffer_cursor_ == buffer_end_) {
699 : // At end of current data, but there might be more? Then fetch it.
700 781652 : if (current_.chunk_no == chunks_.size()) {
701 27613 : out_of_data = !FetchChunk();
702 : }
703 390826 : FillBufferFromCurrentChunk();
704 : }
705 :
706 : DCHECK_EQ(current_.pos.chars - position,
707 : static_cast<size_t>(buffer_end_ - buffer_cursor_));
708 390741 : return buffer_end_ - buffer_cursor_;
709 : }
710 :
711 : // ----------------------------------------------------------------------------
712 : // ScannerStream: Create stream instances.
713 :
714 2219929 : Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
715 : Handle<String> data) {
716 2219930 : return ScannerStream::For(isolate, data, 0, data->length());
717 : }
718 :
719 2937500 : Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
720 : int start_pos, int end_pos) {
721 : DCHECK_GE(start_pos, 0);
722 : DCHECK_LE(start_pos, end_pos);
723 : DCHECK_LE(end_pos, data->length());
724 : size_t start_offset = 0;
725 5875009 : if (data->IsSlicedString()) {
726 645 : SlicedString string = SlicedString::cast(*data);
727 645 : start_offset = string->offset();
728 645 : String parent = string->parent();
729 650 : if (parent->IsThinString()) parent = ThinString::cast(parent)->actual();
730 : data = handle(parent, isolate);
731 : } else {
732 2936861 : data = String::Flatten(isolate, data);
733 : }
734 5875021 : if (data->IsExternalOneByteString()) {
735 : return new BufferedCharacterStream<ExternalStringStream>(
736 : static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data),
737 3655 : start_offset, static_cast<size_t>(end_pos));
738 5867712 : } else if (data->IsExternalTwoByteString()) {
739 : return new UnbufferedCharacterStream<ExternalStringStream>(
740 : static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data),
741 62 : start_offset, static_cast<size_t>(end_pos));
742 5867592 : } else if (data->IsSeqOneByteString()) {
743 : return new BufferedCharacterStream<OnHeapStream>(
744 : static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
745 5851216 : start_offset, static_cast<size_t>(end_pos));
746 16380 : } else if (data->IsSeqTwoByteString()) {
747 : return new RelocatingCharacterStream(
748 : isolate, static_cast<size_t>(start_pos),
749 : Handle<SeqTwoByteString>::cast(data), start_offset,
750 16380 : static_cast<size_t>(end_pos));
751 : } else {
752 0 : UNREACHABLE();
753 : }
754 : }
755 :
756 451 : std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
757 : const char* data) {
758 451 : return ScannerStream::ForTesting(data, strlen(data));
759 : }
760 :
761 1686 : std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
762 : const char* data, size_t length) {
763 : return std::unique_ptr<Utf16CharacterStream>(
764 : new BufferedCharacterStream<TestingStream>(
765 : static_cast<size_t>(0), reinterpret_cast<const uint8_t*>(data),
766 3372 : static_cast<size_t>(length)));
767 : }
768 :
769 13551 : Utf16CharacterStream* ScannerStream::For(
770 : ScriptCompiler::ExternalSourceStream* source_stream,
771 : v8::ScriptCompiler::StreamedSource::Encoding encoding) {
772 13551 : switch (encoding) {
773 : case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
774 : return new UnbufferedCharacterStream<ChunkedStream>(
775 110 : static_cast<size_t>(0), source_stream);
776 : case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
777 : return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
778 190 : source_stream);
779 : case v8::ScriptCompiler::StreamedSource::UTF8:
780 26802 : return new Utf8ExternalStreamingStream(source_stream);
781 : }
782 0 : UNREACHABLE();
783 : }
784 :
785 : } // namespace internal
786 183867 : } // namespace v8
|