LCOV - code coverage report
Current view: top level - src/parsing - scanner-character-streams.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 232 257 90.3 %
Date: 2019-04-18 Functions: 52 76 68.4 %

          Line data    Source code
       1             : // Copyright 2011 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/parsing/scanner-character-streams.h"
       6             : 
       7             : #include <memory>
       8             : #include <vector>
       9             : 
      10             : #include "include/v8.h"
      11             : #include "src/counters.h"
      12             : #include "src/globals.h"
      13             : #include "src/handles.h"
      14             : #include "src/objects-inl.h"
      15             : #include "src/parsing/scanner.h"
      16             : #include "src/unicode-inl.h"
      17             : 
      18             : namespace v8 {
      19             : namespace internal {
      20             : 
      21             : class ScopedExternalStringLock {  // Keeps an external string's resource Lock()ed for as long as this object lives.
      22             :  public:
      23        3768 :   explicit ScopedExternalStringLock(ExternalString string) {  // Picks the one-byte or two-byte resource of |string| and locks it.
      24             :     DCHECK(!string.is_null());
      25        3770 :     if (string->IsExternalOneByteString()) {
      26        3708 :       resource_ = ExternalOneByteString::cast(string)->resource();
      27             :     } else {
      28             :       DCHECK(string->IsExternalTwoByteString());
      29          62 :       resource_ = ExternalTwoByteString::cast(string)->resource();
      30             :     }
      31             :     DCHECK(resource_);
      32        3770 :     resource_->Lock();
      33        3770 :   }
      34             : 
      35             :   // Copying a lock increases the locking depth.
      36             :   ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT
      37          95 :       : resource_(other.resource_) {
      38          95 :     resource_->Lock();  // Each copy takes its own Lock(); its own destructor releases it.
      39             :   }
      40             : 
      41        3860 :   ~ScopedExternalStringLock() { resource_->Unlock(); }  // Undoes the Lock() taken by whichever constructor ran.
      42             : 
      43             :  private:
      44             :   // Not nullptr. NOTE(review): no copy assignment is declared; the implicit one would copy this pointer without Lock()/Unlock() - confirm it is never used.
      45             :   const v8::String::ExternalStringResourceBase* resource_;
      46             : };
      47             : 
      48             : namespace {
      49             : const unibrow::uchar kUtf8Bom = 0xFEFF;  // Decoded code points are compared against this to detect a BOM at the start of a utf-8 stream.
      50             : }  // namespace
      51             : 
      52             : template <typename Char>
      53             : struct CharTraits;  // Only declared; the specializations below map a character type to its sequential and external string classes.
      54             : 
      55             : template <>
      56             : struct CharTraits<uint8_t> {  // One-byte characters.
      57             :   using String = SeqOneByteString;
      58             :   using ExternalString = ExternalOneByteString;
      59             : };
      60             : 
      61             : template <>
      62             : struct CharTraits<uint16_t> {  // Two-byte characters.
      63             :   using String = SeqTwoByteString;
      64             :   using ExternalString = ExternalTwoByteString;
      65             : };
      66             : 
      67             : template <typename Char>
      68             : struct Range {  // A pair of pointers delimiting a run of Chars; length() is end - start.
      69             :   const Char* start;
      70             :   const Char* end;
      71             : 
      72     8981744 :   size_t length() { return static_cast<size_t>(end - start); }  // Number of Chars from start up to end. NOTE(review): not const-qualified, unlike unaligned_start().
      73             :   bool unaligned_start() const {  // True when start sits exactly one byte past a multiple of sizeof(Char).
      74             :     return reinterpret_cast<intptr_t>(start) % sizeof(Char) == 1;  // NOTE(review): only remainder 1 is detected; that covers every misalignment when sizeof(Char) == 2.
      75             :   }
      76             : };
      77             : 
      78             : // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
      79             : template <typename Char>
      80             : class OnHeapStream {
      81             :  public:
      82             :   using String = typename CharTraits<Char>::String;
      83             : 
      84             :   OnHeapStream(Handle<String> string, size_t start_offset, size_t end)  // Note: |end| is stored as length_ and later added to start_offset_.
      85     2975881 :       : string_(string), start_offset_(start_offset), length_(end) {}
      86             : 
      87           0 :   OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) {  // Copying is unsupported (kCanBeCloned is false); reaching this is a fatal error.
      88           0 :     UNREACHABLE();
      89             :   }
      90             : 
      91             :   // Returns the characters from pos (clamped to length_) up to length_, both
      92             :   // offset by start_offset_. no_gc is passed to GetChars(); stats is unused here.
      93             :   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
      94             :                         DisallowHeapAllocation* no_gc) {
      95     8849376 :     return {&string_->GetChars(*no_gc)[start_offset_ + Min(length_, pos)],  // Clamping pos keeps start <= end, so a pos past the end gives an empty range.
      96     8849376 :             &string_->GetChars(*no_gc)[start_offset_ + length_]};
      97             :   }
      98             : 
      99             :   static const bool kCanBeCloned = false;
     100             :   static const bool kCanAccessHeap = true;
     101             : 
     102             :  private:
     103             :   Handle<String> string_;
     104             :   const size_t start_offset_;
     105             :   const size_t length_;
     106             : };
     107             : 
     108             : // A Char stream backed by an off-heap ExternalOneByteString or
     109             : // ExternalTwoByteString.
     110             : template <typename Char>
     111        3861 : class ExternalStringStream {
     112             :   using ExternalString = typename CharTraits<Char>::ExternalString;
     113             : 
     114             :  public:
     115        3768 :   ExternalStringStream(ExternalString string, size_t start_offset,
     116             :                        size_t length)
     117             :       : lock_(string),  // lock_ is declared first, so the resource is locked before data_ is read from the string.
     118        3769 :         data_(string->GetChars() + start_offset),
     119        7537 :         length_(length) {}
     120             : 
     121             :   ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT
     122             :       : lock_(other.lock_),  // Copying the lock takes another Lock() on the same resource.
     123             :         data_(other.data_),
     124          95 :         length_(other.length_) {}
     125             : 
     126             :   // The no_gc argument is only here because of the templated way this class
     127             :   // is used along with other implementations that require V8 heap access.
     128             :   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
     129             :                         DisallowHeapAllocation* no_gc = nullptr) {
     130       64973 :     return {&data_[Min(length_, pos)], &data_[length_]};  // stats and no_gc are unused; pos at or past length_ gives an empty range.
     131             :   }
     132             : 
     133             :   static const bool kCanBeCloned = true;
     134             :   static const bool kCanAccessHeap = false;
     135             : 
     136             :  private:
     137             :   ScopedExternalStringLock lock_;
     138             :   const Char* const data_;  // Already offset by start_offset, so positions passed to GetDataAt() are relative to it.
     139             :   const size_t length_;
     140             : };
     141             : 
     142             : // A Char stream backed by a C array. Testing only.
     143             : template <typename Char>
     144             : class TestingStream {
     145             :  public:
     146             :   TestingStream(const Char* data, size_t length)  // Stores the pointer only; the array is neither copied nor freed by this class.
     147        1746 :       : data_(data), length_(length) {}
     148             :   // The no_gc argument is only here because of the templated way this class
     149             :   // is used along with other implementations that require V8 heap access.
     150             :   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
     151             :                         DisallowHeapAllocation* no_gc = nullptr) {
     152        7150 :     return {&data_[Min(length_, pos)], &data_[length_]};  // stats and no_gc are unused; pos at or past length_ gives an empty range.
     153             :   }
     154             : 
     155             :   static const bool kCanBeCloned = true;
     156             :   static const bool kCanAccessHeap = false;
     157             : 
     158             :  private:
     159             :   const Char* const data_;
     160             :   const size_t length_;
     161             : };
     162             : 
     163             : // A Char stream backed by multiple source-stream provided off-heap chunks.
     164             : template <typename Char>
     165             : class ChunkedStream {
     166             :  public:
     167             :   explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)  // No data is fetched here; chunks are pulled on demand by FindChunk().
     168         150 :       : source_(source) {}
     169             : 
     170           0 :   ChunkedStream(const ChunkedStream&) V8_NOEXCEPT {
     171             :     // TODO(rmcilroy): Implement cloning for chunked streams.
     172           0 :     UNREACHABLE();
     173             :   }
     174             : 
     175             :   // The no_gc argument is only here because of the templated way this class
     176             :   // is used along with other implementations that require V8 heap access.
     177       63820 :   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
     178             :                         DisallowHeapAllocation* no_gc = nullptr) {
     179       63820 :     Chunk chunk = FindChunk(pos, stats);  // May call into source_ to fetch further chunks.
     180       63820 :     size_t buffer_end = chunk.length;
     181       63820 :     size_t buffer_pos = Min(buffer_end, pos - chunk.position);  // Offset of pos inside the chunk, clamped to the chunk's length.
     182       63820 :     return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
     183             :   }
     184             : 
     185         150 :   ~ChunkedStream() {
     186        1435 :     for (Chunk& chunk : chunks_) delete[] chunk.data;  // Frees every buffer received from GetMoreData(). NOTE(review): for Char != uint8_t this delete[]s through the reinterpret_cast'ed pointer type - confirm this is intended.
     187         300 :   }
     188             : 
     189             :   static const bool kCanBeCloned = false;
     190             :   static const bool kCanAccessHeap = false;
     191             : 
     192             :  private:
     193             :   struct Chunk {
     194             :     Chunk(const Char* const data, size_t position, size_t length)
     195        1285 :         : data(data), position(position), length(length) {}
     196             :     const Char* const data;
     197             :     // The logical position of data.
     198             :     const size_t position;
     199             :     const size_t length;  // In Chars, not bytes (ProcessChunk divides the byte length by sizeof(Char)).
     200       64965 :     size_t end_position() const { return position + length; }
     201             :   };
     202             : 
     203       63820 :   Chunk FindChunk(size_t position, RuntimeCallStats* stats) {  // Returns a copy of the chunk covering |position|, fetching more data as needed.
     204       63960 :     while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats);  // First use: fetch the initial chunk at logical position 0.
     205             : 
     206             :     // Walk forwards while the position is in front of the current chunk.
     207       66110 :     while (position >= chunks_.back().end_position() &&
     208             :            chunks_.back().length > 0) {  // A zero-length chunk stops further fetching (presumably it marks end of data).
     209        1145 :       FetchChunk(chunks_.back().end_position(), stats);
     210             :     }
     211             : 
     212             :     // Walk backwards.
     213      763800 :     for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
     214             :          ++reverse_it) {
     215      763800 :       if (reverse_it->position <= position) return *reverse_it;  // Last chunk that starts at or before position.
     216             :     }
     217             : 
     218           0 :     UNREACHABLE();  // The first chunk starts at position 0, so the search above always returns.
     219             :   }
     220             : 
     221        1285 :   virtual void ProcessChunk(const uint8_t* data, size_t position,  // NOTE(review): virtual, yet the destructor is non-virtual and no subclass is visible in this view.
     222             :                             size_t length) {
     223             :     // Incoming data has to be aligned to Char size.
     224             :     DCHECK_EQ(0, length % sizeof(Char));
     225        1285 :     chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
     226             :                          length / sizeof(Char));  // Converts the byte count into a Char count.
     227        1285 :   }
     228             : 
     229        1285 :   void FetchChunk(size_t position, RuntimeCallStats* stats) {  // Pulls one more chunk from source_ and records it at logical |position|.
     230        1285 :     const uint8_t* data = nullptr;
     231             :     size_t length;
     232             :     {
     233             :       RuntimeCallTimerScope scope(stats,
     234        1285 :                                   RuntimeCallCounterId::kGetMoreDataCallback);  // Scope covers only the embedder callback below.
     235        1285 :       length = source_->GetMoreData(&data);
     236             :     }
     237        1285 :     ProcessChunk(data, position, length);
     238        1285 :   }
     239             : 
     240             :   ScriptCompiler::ExternalSourceStream* source_;
     241             : 
     242             :  protected:
     243             :   std::vector<struct Chunk> chunks_;  // In fetch order; each chunk after the first starts at the previous chunk's end_position().
     244             : };
     245             : 
     246             : // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
     247             : // Chars are buffered if either the underlying stream isn't utf-16 or the
     248             : // underlying utf-16 stream might move (is on-heap).
     249             : template <template <typename T> class ByteStream>
     250     5945929 : class BufferedCharacterStream : public Utf16CharacterStream {
     251             :  public:
     252             :   template <class... TArgs>
     253     2972892 :   BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {  // args go to the ByteStream<uint8_t> constructor; the stream starts at pos.
     254     2971050 :     buffer_pos_ = pos;
     255             :   }
     256             : 
     257          70 :   bool can_be_cloned() const final {
     258          70 :     return ByteStream<uint16_t>::kCanBeCloned;  // NOTE(review): reads the uint16_t instantiation although byte_stream_ is ByteStream<uint8_t>; the streams in this file define the flag independently of Char.
     259             :   }
     260             : 
     261          85 :   std::unique_ptr<Utf16CharacterStream> Clone() const override {
     262           0 :     CHECK(can_be_cloned());
     263             :     return std::unique_ptr<Utf16CharacterStream>(
     264         170 :         new BufferedCharacterStream<ByteStream>(*this));  // Uses the private copy constructor below.
     265             :   }
     266             : 
     267             :  protected:
     268     8904508 :   bool ReadBlock() final {  // Refills buffer_ from the byte stream at pos(); returns false when no data is left there.
     269             :     size_t position = pos();
     270     8904508 :     buffer_pos_ = position;
     271     8904508 :     buffer_start_ = &buffer_[0];
     272     8904508 :     buffer_cursor_ = buffer_start_;
     273             : 
     274             :     DisallowHeapAllocation no_gc;
     275             :     Range<uint8_t> range =
     276       32010 :         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
     277     8904508 :     if (range.length() == 0) {
     278     2396714 :       buffer_end_ = buffer_start_;  // Empty buffer: start, cursor and end all coincide.
     279     2396714 :       return false;
     280             :     }
     281             : 
     282             :     size_t length = Min(kBufferSize, range.length());  // Never copy more than buffer_ can hold.
     283             :     i::CopyCharsUnsigned(buffer_, range.start, length);  // Copies uint8_t source chars into the uc16 buffer.
     284     6507794 :     buffer_end_ = &buffer_[length];
     285     6507794 :     return true;
     286             :   }
     287             : 
     288        3422 :   bool can_access_heap() const final {
     289        3422 :     return ByteStream<uint8_t>::kCanAccessHeap;
     290             :   }
     291             : 
     292             :  private:
     293             :   BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)  // Copies only the byte stream; buffer_ contents are not carried over.
     294          85 :       : byte_stream_(other.byte_stream_) {}
     295             : 
     296             :   static const size_t kBufferSize = 512;
     297             :   uc16 buffer_[kBufferSize];
     298             :   ByteStream<uint8_t> byte_stream_;
     299             : };
     300             : 
     301             : // Provides an unbuffered utf-16 view on the bytes from the underlying
     302             : // ByteStream.
     303             : template <template <typename T> class ByteStream>
     304       17330 : class UnbufferedCharacterStream : public Utf16CharacterStream {
     305             :  public:
     306             :   template <class... TArgs>
     307        8655 :   UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {  // args go to the ByteStream<uint16_t> constructor; the stream starts at pos.
     308        8600 :     buffer_pos_ = pos;
     309             :   }
     310             : 
     311         310 :   bool can_access_heap() const final {
     312         310 :     return ByteStream<uint16_t>::kCanAccessHeap;
     313             :   }
     314             : 
     315           5 :   bool can_be_cloned() const final {
     316           5 :     return ByteStream<uint16_t>::kCanBeCloned;
     317             :   }
     318             : 
     319          10 :   std::unique_ptr<Utf16CharacterStream> Clone() const override {  // Unlike BufferedCharacterStream::Clone(), there is no CHECK(can_be_cloned()) here.
     320             :     return std::unique_ptr<Utf16CharacterStream>(
     321          20 :         new UnbufferedCharacterStream<ByteStream>(*this));
     322             :   }
     323             : 
     324             :  protected:
     325       77236 :   bool ReadBlock() final {  // Points the buffer pointers directly at the byte stream's data; nothing is copied.
     326             :     size_t position = pos();
     327       77236 :     buffer_pos_ = position;
     328             :     DisallowHeapAllocation no_gc;
     329             :     Range<uint16_t> range =
     330       31810 :         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
     331       77236 :     buffer_start_ = range.start;
     332       77236 :     buffer_end_ = range.end;
     333       77236 :     buffer_cursor_ = buffer_start_;
     334       77236 :     if (range.length() == 0) return false;  // The pointers are already updated; false reports that no data is available at position.
     335             : 
     336             :     DCHECK(!range.unaligned_start());
     337             :     DCHECK_LE(buffer_start_, buffer_end_);
     338       68344 :     return true;
     339             :   }
     340             : 
     341             :   UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)  // Copies only the byte stream; used by Clone().
     342          10 :       : byte_stream_(other.byte_stream_) {}
     343             : 
     344             :   ByteStream<uint16_t> byte_stream_;
     345             : };
     346             : 
     347             : // An unbuffered utf-16 view on an on-heap string (OnHeapStream) that registers
     348             : // a GC epilogue callback to re-derive its buffer pointers from the string.
     349             : class RelocatingCharacterStream
     350             :     : public UnbufferedCharacterStream<OnHeapStream> {
     351             :  public:
     352             :   template <class... TArgs>
     353             :   RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
     354             :       : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
     355        8538 :         isolate_(isolate) {
     356        8538 :     isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,  // Registered for v8::kGCTypeAll with |this| as the callback data.
     357             :                                            v8::kGCTypeAll, this);
     358             :   }
     359             : 
     360             :  private:
     361       17076 :   ~RelocatingCharacterStream() final {  // Unregisters the callback so the heap does not keep a pointer to a destroyed stream.
     362        8538 :     isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
     363        8538 :                                                this);
     364        8538 :   }
     365             : 
     366           0 :   static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,  // Trampoline: |stream| is the |this| pointer passed at registration.
     367             :                                            v8::GCType type,
     368             :                                            v8::GCCallbackFlags flags,
     369             :                                            void* stream) {
     370             :     reinterpret_cast<RelocatingCharacterStream*>(stream)
     371             :         ->UpdateBufferPointers();
     372           0 :   }
     373             : 
     374             :   void UpdateBufferPointers() {  // Rebases start/cursor/end onto the string's current character data, keeping the cursor offset.
     375             :     DisallowHeapAllocation no_gc;
     376             :     Range<uint16_t> range =
     377             :         byte_stream_.GetDataAt(0, runtime_call_stats(), &no_gc);  // NOTE(review): queries position 0, while ReadBlock() queries pos(); with a non-zero buffer_pos_ the start pointers would differ even if the string did not move - confirm.
     378           0 :     if (range.start != buffer_start_) {
     379           0 :       buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
     380           0 :       buffer_start_ = range.start;
     381           0 :       buffer_end_ = range.end;
     382             :     }
     383             :   }
     384             : 
     385             :   Isolate* isolate_;
     386             : };
     387             : 
     388             : // ----------------------------------------------------------------------------
     389             : // BufferedUtf16CharacterStreams
     390             : //
     391             : // A buffered character stream based on a random access character
     392             : // source (ReadBlock can be called with pos() pointing to any position,
     393             : // even positions before the current).
     394             : //
     395             : // TODO(verwaest): Remove together with Utf8 external streaming streams.
     396       27514 : class BufferedUtf16CharacterStream : public Utf16CharacterStream {
     397             :  public:
     398             :   BufferedUtf16CharacterStream();
     399             : 
     400             :  protected:
     401             :   static const size_t kBufferSize = 512;  // Capacity of buffer_, in uc16 units.
     402             : 
     403             :   bool ReadBlock() final;  // Refills buffer_ through FillBuffer(); subclasses customize only FillBuffer().
     404             : 
     405             :   // FillBuffer should read up to kBufferSize characters at position and store
     406             :   // them into buffer_[0..]. It returns the number of characters stored.
     407             :   virtual size_t FillBuffer(size_t position) = 0;
     408             : 
     409             :   // Fixed sized buffer that this class reads from.
     410             :   // The base class' buffer_start_ should always point to buffer_.
     411             :   uc16 buffer_[kBufferSize];
     412             : };
     413             : 
     414           0 : BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()  // Starts with an empty buffer: all three pointer arguments are buffer_.
     415       27514 :     : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}  // Presumably (start, cursor, end, position); the base constructor is not visible here - confirm.
     416             : 
     417      428066 : bool BufferedUtf16CharacterStream::ReadBlock() {  // Refills buffer_ at pos(); returns true iff FillBuffer() stored at least one character.
     418             :   DCHECK_EQ(buffer_start_, buffer_);
     419             : 
     420             :   size_t position = pos();  // Read before buffer_pos_ and buffer_cursor_ are reset below.
     421      428066 :   buffer_pos_ = position;
     422      428066 :   buffer_cursor_ = buffer_;
     423      428066 :   buffer_end_ = buffer_ + FillBuffer(position);  // FillBuffer() returns the number of characters it stored.
     424             :   DCHECK_EQ(pos(), position);
     425             :   DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
     426      428066 :   return buffer_cursor_ < buffer_end_;
     427             : }
     428             : 
     429             : // ----------------------------------------------------------------------------
     430             : // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
     431             : //
     432             : // This implementation is fairly complex, since data arrives in chunks which
     433             : // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
     434             : // character position is tricky because the byte position cannot be derived
     435             : // from the character position.
     436             : //
     437             : // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
     438             : // instead so we don't need to buffer.
     439             : 
     440             : class Utf8ExternalStreamingStream : public BufferedUtf16CharacterStream {
     441             :  public:
     442             :   Utf8ExternalStreamingStream(
     443             :       ScriptCompiler::ExternalSourceStream* source_stream)
     444             :       : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),  // Chunk 0, byte 0, char 0, no incomplete char, decoder state kAccept.
     445       27514 :         source_stream_(source_stream) {}
     446       41271 :   ~Utf8ExternalStreamingStream() final {
     447       41998 :     for (const Chunk& chunk : chunks_) delete[] chunk.data;  // Frees the data buffer of every stored chunk.
     448       27514 :   }
     449             : 
     450          83 :   bool can_access_heap() const final { return false; }
     451             : 
     452          16 :   bool can_be_cloned() const final { return false; }
     453             : 
     454           0 :   std::unique_ptr<Utf16CharacterStream> Clone() const override {  // Must not be called: can_be_cloned() is always false.
     455           0 :     UNREACHABLE();
     456             :   }
     457             : 
     458             :  protected:
     459             :   size_t FillBuffer(size_t position) final;
     460             : 
     461             :  private:
     462             :   // A position within the data stream. It stores:
     463             :   // - The 'physical' position (# of bytes in the stream),
     464             :   // - the 'logical' position (# of ucs-2 characters, also within the stream),
     465             :   // - a possibly incomplete utf-8 char at the current 'physical' position.
     466             :   struct StreamPosition {
     467             :     size_t bytes;
     468             :     size_t chars;
     469             :     uint32_t incomplete_char;
     470             :     unibrow::Utf8::State state;  // Incremental decoder state paired with incomplete_char.
     471             :   };
     472             : 
     473             :   // Position contains a StreamPosition and the index of the chunk the position
     474             :   // points into. (The chunk_no could be derived from pos, but that'd be
     475             :   // an expensive search through all chunks.)
     476             :   struct Position {
     477             :     size_t chunk_no;
     478             :     StreamPosition pos;
     479             :   };
     480             : 
     481             :   // A chunk in the list of chunks, containing:
     482             :   // - The chunk data (data pointer and length), and
     483             :   // - the position at the first byte of the chunk.
     484             :   struct Chunk {
     485             :     const uint8_t* data;
     486             :     size_t length;  // In bytes; a zero-length chunk is treated as the last chunk of the stream.
     487             :     StreamPosition start;
     488             :   };
     489             : 
     490             :   // Within the current chunk, skip forward from current_ towards position.
     491             :   bool SkipToPosition(size_t position);
     492             :   // Within the current chunk, fill the buffer_ (while it has capacity).
     493             :   void FillBufferFromCurrentChunk();
     494             :   // Fetch a new chunk (assuming current_ is at the end of the current data).
     495             :   bool FetchChunk();
     496             :   // Search through the chunks and set current_ to point to the given position.
     497             :   // (This call is potentially expensive.)
     498             :   void SearchPosition(size_t position);
     499             : 
     500             :   std::vector<Chunk> chunks_;
     501             :   Position current_;
     502             :   ScriptCompiler::ExternalSourceStream* source_stream_;
     503             : };
     504             : 
     505          40 : bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {  // Advances current_ within its chunk; returns true iff it ends exactly at |position| (in utf-16 units).
     506             :   DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.
     507             : 
     508             :   // Already there? Then return immediately.
     509          40 :   if (current_.pos.chars == position) return true;
     510             : 
     511           5 :   const Chunk& chunk = chunks_[current_.chunk_no];
     512             :   DCHECK(current_.pos.bytes >= chunk.start.bytes);
     513             : 
     514           5 :   unibrow::Utf8::State state = chunk.start.state;  // NOTE(review): decoder state comes from the chunk start, yet decoding resumes at current_.pos.bytes (FillBufferFromCurrentChunk uses current_.pos.state) - confirm callers arrive with current_ at the chunk start.
     515           5 :   uint32_t incomplete_char = chunk.start.incomplete_char;
     516           5 :   size_t it = current_.pos.bytes - chunk.start.bytes;  // Byte offset of current_ inside this chunk.
     517           5 :   const uint8_t* cursor = &chunk.data[it];
     518           5 :   const uint8_t* end = &chunk.data[chunk.length];
     519             : 
     520             :   size_t chars = current_.pos.chars;
     521             : 
     522           5 :   if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {  // Possible BOM: within the first 3 bytes and no character counted yet.
     523          15 :     while (cursor < end) {
     524             :       unibrow::uchar t =
     525          15 :           unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
     526          15 :       if (t == unibrow::Utf8::kIncomplete) continue;  // Keep feeding bytes until one code point is complete.
     527           5 :       if (t != kUtf8Bom) {  // A BOM is consumed without being counted.
     528           0 :         chars++;
     529           0 :         if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;  // Counted twice: needs a surrogate pair in utf-16.
     530             :       }
     531             :       break;
     532             :     }
     533             :   }
     534             : 
     535          45 :   while (cursor < end && chars < position) {  // Decode until the chunk is exhausted or enough utf-16 units are counted.
     536             :     unibrow::uchar t =
     537          40 :         unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
     538          40 :     if (t != unibrow::Utf8::kIncomplete) {
     539          25 :       chars++;
     540          25 :       if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
     541             :     }
     542             :   }
     543             : 
     544           5 :   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);  // Convert the in-chunk cursor back to a stream-wide byte position.
     545           5 :   current_.pos.chars = chars;
     546           5 :   current_.pos.incomplete_char = incomplete_char;
     547           5 :   current_.pos.state = state;
     548           5 :   current_.chunk_no += (cursor == end);  // Move on to the next chunk when this one was consumed completely.
     549             : 
     550           5 :   return current_.pos.chars == position;
     551             : }
     552             : // Decodes unread bytes of chunks_[current_.chunk_no] to UTF-16 at buffer_end_.
     553      427886 : void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
     554             :   DCHECK_LT(current_.chunk_no, chunks_.size());
     555             :   DCHECK_EQ(buffer_start_, buffer_cursor_);
     556             :   DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
     557             : 
     558      427886 :   const Chunk& chunk = chunks_[current_.chunk_no];
     559             : 
     560             :   // The buffer_ is writable, but buffer_*_ members are const. So we get a
     561             :   // non-const pointer into buffer that points to the same char as buffer_end_.
     562      427886 :   uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
     563             :   DCHECK_EQ(output_cursor, buffer_end_);
     564             : 
     565      427886 :   unibrow::Utf8::State state = current_.pos.state;
     566      427886 :   uint32_t incomplete_char = current_.pos.incomplete_char;
     567             : 
     568             :   // If the current chunk is the last (empty) chunk we'll have to process
     569             :   // any left-over, partial characters.
     570      427886 :   if (chunk.length == 0) {
     571       13579 :     unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
     572       13579 :     if (t != unibrow::Utf8::kBufferEmpty) {
     573             :       DCHECK_EQ(t, unibrow::Utf8::kBadChar);
     574          25 :       *output_cursor = static_cast<uc16>(t);
     575          25 :       buffer_end_++;
     576          25 :       current_.pos.chars++;
     577          25 :       current_.pos.incomplete_char = 0;
     578          25 :       current_.pos.state = state;
     579             :     }
     580       13579 :     return;
     581             :   }
     582             :   // Resume at the byte offset inside this chunk that current_ points to.
     583      414307 :   size_t it = current_.pos.bytes - chunk.start.bytes;
     584      414307 :   const uint8_t* cursor = chunk.data + it;
     585      414307 :   const uint8_t* end = chunk.data + chunk.length;
     586             : 
     587             :   // Deal with possible BOM.
     588      414307 :   if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
     589       13987 :     while (cursor < end) {
     590             :       unibrow::uchar t =
     591       13947 :           unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
     592       13947 :       if (V8_LIKELY(t < kUtf8Bom)) {
     593       13797 :         *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
     594         150 :       } else if (t == unibrow::Utf8::kIncomplete) {
     595             :         continue;
     596          40 :       } else if (t == kUtf8Bom) {
     597             :         // BOM detected at beginning of the stream. Don't copy it.
     598          15 :       } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
     599           5 :         *(output_cursor++) = static_cast<uc16>(t);
     600             :       } else {
     601          10 :         *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
     602          20 :         *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
     603             :       }
     604             :       break;  // First complete character handled; the BOM check is done.
     605             :     }
     606             :   }
     607             :   // Main decode loop; stops one slot early so a surrogate pair always fits.
     608   200123159 :   while (cursor < end && output_cursor + 1 < buffer_start_ + kBufferSize) {
     609             :     unibrow::uchar t =
     610   199708852 :         unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
     611   199708852 :     if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
     612   199431744 :       *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
     613      277108 :     } else if (t == unibrow::Utf8::kIncomplete) {
     614             :       continue;
     615             :     } else {
     616         347 :       *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
     617         694 :       *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
     618             :     }
     619             :   }
     620             :   // Save progress; advance to the next chunk once this one is fully consumed.
     621      414307 :   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
     622      414307 :   current_.pos.chars += (output_cursor - buffer_end_);
     623      414307 :   current_.pos.incomplete_char = incomplete_char;
     624      414307 :   current_.pos.state = state;
     625      414307 :   current_.chunk_no += (cursor == end);
     626             : 
     627      414307 :   buffer_end_ = output_cursor;
     628             : }
     629             : // Pulls one more chunk from source_stream_; returns true iff it is non-empty.
     630       28241 : bool Utf8ExternalStreamingStream::FetchChunk() {
     631             :   RuntimeCallTimerScope scope(runtime_call_stats(),
     632       28241 :                               RuntimeCallCounterId::kGetMoreDataCallback);
     633             :   DCHECK_EQ(current_.chunk_no, chunks_.size());
     634             :   DCHECK(chunks_.empty() || chunks_.back().length != 0);
     635             :   // The chunk is appended even when empty, recording current_.pos with it.
     636       28241 :   const uint8_t* chunk = nullptr;
     637       28241 :   size_t length = source_stream_->GetMoreData(&chunk);
     638       56482 :   chunks_.push_back({chunk, length, current_.pos});
     639       56482 :   return length > 0;
     640             : }
     641             : // Moves current_ to char |position|, or to the terminating chunk if past the end.
     642      428066 : void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
     643             :   // If current_ already points to the right position, we're done.
     644             :   //
     645             :   // This is expected to be the common case, since we typically call
     646             :   // FillBuffer right after the current buffer.
     647      428066 :   if (current_.pos.chars == position) return;
     648             : 
     649             :   // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
     650       31165 :   if (chunks_.empty()) {
     651             :     DCHECK_EQ(current_.chunk_no, 0u);
     652             :     DCHECK_EQ(current_.pos.bytes, 0u);
     653             :     DCHECK_EQ(current_.pos.chars, 0u);
     654           0 :     FetchChunk();
     655             :   }
     656             : 
     657             :   // Search for the last chunk whose start position is less or equal to
     658             :   // position.
     659       31165 :   size_t chunk_no = chunks_.size() - 1;
     660     1091390 :   while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
     661      348230 :     chunk_no--;
     662             :   }
     663             : 
     664             :   // Did we find the terminating (zero-length) chunk? Then we're seeking
     665             :   // behind the end of the data, and position does not exist.
     666             :   // Set current_ to point to the terminating chunk.
     667       31165 :   if (chunks_[chunk_no].length == 0) {
     668         100 :     current_ = {chunk_no, chunks_[chunk_no].start};
     669         100 :     return;
     670             :   }
     671             : 
     672             :   // Did we find the non-last chunk? Then our position must be within chunk_no.
     673       31065 :   if (chunk_no + 1 < chunks_.size()) {
     674             :     // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
     675             :     // (Many web sites declare utf-8 encoding, but use only (or almost only) the
     676             :     //  ASCII subset for their JavaScript sources. We can exploit this, by
     677             :     //  checking whether the # bytes in a chunk are equal to the # chars, and if
     678             :     //  so avoid the expensive SkipToPosition.)
     679             :     bool ascii_only_chunk =
     680       62110 :         chunks_[chunk_no].start.incomplete_char == 0 &&
     681       31045 :         (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
     682       31045 :             (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
     683       31065 :     if (ascii_only_chunk) {
     684       31025 :       size_t skip = position - chunks_[chunk_no].start.chars;  // 1 byte per char here.
     685             :       current_ = {chunk_no,
     686       31025 :                   {chunks_[chunk_no].start.bytes + skip,
     687             :                    chunks_[chunk_no].start.chars + skip, 0,
     688       31025 :                    unibrow::Utf8::State::kAccept}};
     689             :     } else {
     690          40 :       current_ = {chunk_no, chunks_[chunk_no].start};
     691          40 :       SkipToPosition(position);
     692             :     }
     693             : 
     694             :     // Since position was within the chunk, SkipToPosition should have found
     695             :     // something.
     696             :     DCHECK_EQ(position, current_.pos.chars);
     697             :     return;
     698             :   }
     699             : 
     700             :   // What's left: We're in the last, non-terminating chunk. Our position
     701             :   // may be in the chunk, but it may also be in 'future' chunks, which we'll
     702             :   // have to obtain.
     703             :   DCHECK_EQ(chunk_no, chunks_.size() - 1);
     704           0 :   current_ = {chunk_no, chunks_[chunk_no].start};
     705             :   bool have_more_data = true;
     706           0 :   bool found = SkipToPosition(position);
     707           0 :   while (have_more_data && !found) {
     708             :     DCHECK_EQ(current_.chunk_no, chunks_.size());
     709           0 :     have_more_data = FetchChunk();
     710           0 :     found = have_more_data && SkipToPosition(position);
     711             :   }
     712             : 
     713             :   // We'll return with a position != the desired position only if we're out
     714             :   // of data. In that case, we'll point to the terminating chunk.
     715             :   DCHECK_EQ(found, current_.pos.chars == position);
     716             :   DCHECK_EQ(have_more_data, chunks_.back().length != 0);
     717             :   DCHECK_IMPLIES(!found, !have_more_data);
     718             :   DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
     719             : }
     720             : // Refills buffer_ from char |position|; returns the number of units buffered.
     721      428066 : size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
     722      428066 :   buffer_cursor_ = buffer_;
     723      428066 :   buffer_end_ = buffer_;
     724             :   // Out of data: at the empty terminating chunk with no partial char pending.
     725      428066 :   SearchPosition(position);
     726      827976 :   bool out_of_data = current_.chunk_no != chunks_.size() &&
     727      828251 :                      chunks_[current_.chunk_no].length == 0 &&
     728         275 :                      current_.pos.incomplete_char == 0;
     729             : 
     730      428066 :   if (out_of_data) return 0;
     731             : 
     732             :   // Fill the buffer, until we have at least one char (or are out of data).
     733             :   // (The embedder might give us 1-byte blocks within a utf-8 char, so we
     734             :   //  can't guarantee progress with one chunk. Thus we iterate.)
     735     1283573 :   while (!out_of_data && buffer_cursor_ == buffer_end_) {
     736             :     // At end of current data, but there might be more? Then fetch it.
     737      855772 :     if (current_.chunk_no == chunks_.size()) {
     738       28241 :       out_of_data = !FetchChunk();
     739             :     }
     740      427886 :     FillBufferFromCurrentChunk();
     741             :   }
     742             : 
     743             :   DCHECK_EQ(current_.pos.chars - position,
     744             :             static_cast<size_t>(buffer_end_ - buffer_cursor_));
     745      427801 :   return buffer_end_ - buffer_cursor_;
     746             : }
     747             : 
     748             : // ----------------------------------------------------------------------------
     749             : // ScannerStream: Create stream instances.
     750             : // Convenience overload: start_pos = 0 and end_pos = data->length().
     751     2259303 : Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
     752             :                                          Handle<String> data) {
     753     2259303 :   return ScannerStream::For(isolate, data, 0, data->length());
     754             : }
     755             : // Returns a new stream over |data|, picked by the string's representation.
     756     2979643 : Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
     757             :                                          int start_pos, int end_pos) {
     758             :   DCHECK_GE(start_pos, 0);
     759             :   DCHECK_LE(start_pos, end_pos);
     760             :   DCHECK_LE(end_pos, data->length());
     761             :   size_t start_offset = 0;
     762     2979643 :   if (data->IsSlicedString()) {  // Stream from the parent at the slice offset.
     763             :     SlicedString string = SlicedString::cast(*data);
     764         542 :     start_offset = string->offset();
     765             :     String parent = string->parent();
     766         542 :     if (parent->IsThinString()) parent = ThinString::cast(parent)->actual();
     767             :     data = handle(parent, isolate);
     768             :   } else {
     769     2979101 :     data = String::Flatten(isolate, data);
     770             :   }
     771     2979645 :   if (data->IsExternalOneByteString()) {  // One-byte sources use a buffered stream.
     772             :     return new BufferedCharacterStream<ExternalStringStream>(
     773             :         static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data),
     774        3707 :         start_offset, static_cast<size_t>(end_pos));
     775     2975938 :   } else if (data->IsExternalTwoByteString()) {
     776             :     return new UnbufferedCharacterStream<ExternalStringStream>(
     777             :         static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data),
     778          62 :         start_offset, static_cast<size_t>(end_pos));
     779     2975876 :   } else if (data->IsSeqOneByteString()) {
     780             :     return new BufferedCharacterStream<OnHeapStream>(
     781             :         static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
     782     2967338 :         start_offset, static_cast<size_t>(end_pos));
     783        8538 :   } else if (data->IsSeqTwoByteString()) {
     784             :     return new RelocatingCharacterStream(
     785             :         isolate, static_cast<size_t>(start_pos),
     786             :         Handle<SeqTwoByteString>::cast(data), start_offset,
     787        8538 :         static_cast<size_t>(end_pos));
     788             :   } else {
     789           0 :     UNREACHABLE();  // None of the four representations handled above matched.
     790             :   }
     791             : }
     792             : // Test helper for NUL-terminated |data|; the length is taken with strlen().
     793         451 : std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
     794             :     const char* data) {
     795         451 :   return ScannerStream::ForTesting(data, strlen(data));
     796             : }
     797             : // Test helper: buffered TestingStream over |length| bytes at |data|.
     798        1746 : std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
     799             :     const char* data, size_t length) {
     800             :   return std::unique_ptr<Utf16CharacterStream>(
     801             :       new BufferedCharacterStream<TestingStream>(
     802             :           static_cast<size_t>(0), reinterpret_cast<const uint8_t*>(data),
     803        3492 :           static_cast<size_t>(length)));
     804             : }
     805             : // Returns a new stream reading from |source_stream|, picked by |encoding|.
     806       13907 : Utf16CharacterStream* ScannerStream::For(
     807             :     ScriptCompiler::ExternalSourceStream* source_stream,
     808             :     v8::ScriptCompiler::StreamedSource::Encoding encoding) {
     809       13907 :   switch (encoding) {
     810             :     case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
     811             :       return new UnbufferedCharacterStream<ChunkedStream>(
     812         110 :           static_cast<size_t>(0), source_stream);
     813             :     case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
     814             :       return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
     815         190 :                                                         source_stream);
     816             :     case v8::ScriptCompiler::StreamedSource::UTF8:
     817       27514 :       return new Utf8ExternalStreamingStream(source_stream);
     818             :   }
     819           0 :   UNREACHABLE();  // Not reached for the three encodings handled above.
     820             : }
     821             : 
     822             : }  // namespace internal
     823      122036 : }  // namespace v8

Generated by: LCOV version 1.10