LCOV - code coverage report
Current view: top level - src/regexp - jsregexp.h (source / functions) Hit Total Coverage
Test: app.info Lines: 142 166 85.5 %
Date: 2017-10-20 Functions: 25 57 43.9 %

          Line data    Source code
       1             : // Copyright 2012 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_REGEXP_JSREGEXP_H_
       6             : #define V8_REGEXP_JSREGEXP_H_
       7             : 
       8             : #include "src/allocation.h"
       9             : #include "src/assembler.h"
      10             : #include "src/objects/js-regexp.h"
      11             : #include "src/regexp/regexp-ast.h"
      12             : #include "src/regexp/regexp-macro-assembler.h"
      13             : 
      14             : namespace v8 {
      15             : namespace internal {
      16             : 
      17             : class NodeVisitor;
      18             : class RegExpCompiler;
      19             : class RegExpMacroAssembler;
      20             : class RegExpNode;
      21             : class RegExpTree;
      22             : class BoyerMooreLookahead;
      23             : 
      24             : class RegExpImpl {
      25             :  public:
      26             :   // Whether V8 is compiled with native regexp support or not.
      27             :   static bool UsesNativeRegExp() {
      28             : #ifdef V8_INTERPRETED_REGEXP
      29             :     return false;
      30             : #else
      31             :     return true;
      32             : #endif
      33             :   }
      34             : 
      35             :   // Returns a string representation of a regular expression.
      36             :   // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
      37             :   // This function calls the garbage collector if necessary.
      38             :   static Handle<String> ToString(Handle<Object> value);
      39             : 
      40             :   // Parses the RegExp pattern and prepares the JSRegExp object with
      41             :   // generic data and choice of implementation - as well as what
      42             :   // the implementation wants to store in the data field.
      43             :   // Returns false if compilation fails.
      44             :   MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
      45             :                                                      Handle<String> pattern,
      46             :                                                      JSRegExp::Flags flags);
      47             : 
      48             :   // See ECMA-262 section 15.10.6.2.
      49             :   // This function calls the garbage collector if necessary.
      50             :   V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
      51             :       Handle<JSRegExp> regexp, Handle<String> subject, int index,
      52             :       Handle<RegExpMatchInfo> last_match_info);
      53             : 
      54             :   // Prepares a JSRegExp object with Irregexp-specific data.
      55             :   static void IrregexpInitialize(Handle<JSRegExp> re,
      56             :                                  Handle<String> pattern,
      57             :                                  JSRegExp::Flags flags,
      58             :                                  int capture_register_count);
      59             : 
      60             : 
      61             :   static void AtomCompile(Handle<JSRegExp> re,
      62             :                           Handle<String> pattern,
      63             :                           JSRegExp::Flags flags,
      64             :                           Handle<String> match_pattern);
      65             : 
      66             : 
      67             :   static int AtomExecRaw(Handle<JSRegExp> regexp,
      68             :                          Handle<String> subject,
      69             :                          int index,
      70             :                          int32_t* output,
      71             :                          int output_size);
      72             : 
      73             :   static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
      74             :                                  Handle<String> subject, int index,
      75             :                                  Handle<RegExpMatchInfo> last_match_info);
      76             : 
      77             :   enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
      78             : 
      79             :   // Prepare a RegExp for being executed one or more times (using
      80             :   // IrregexpExecOnce) on the subject.
      81             :   // This ensures that the regexp is compiled for the subject, and that
      82             :   // the subject is flat.
      83             :   // Returns the number of integer spaces required by IrregexpExecOnce
      84             :   // as its "registers" argument.  If the regexp cannot be compiled,
      85             :   // an exception is set as pending, and this function returns negative.
      86             :   static int IrregexpPrepare(Handle<JSRegExp> regexp,
      87             :                              Handle<String> subject);
      88             : 
      89             :   // Execute a regular expression on the subject, starting from index.
      90             :   // If matching succeeds, return the number of matches.  This can be larger
      91             :   // than one in the case of global regular expressions.
      92             :   // The captures and subcaptures are stored into the registers vector.
      93             :   // If matching fails, returns RE_FAILURE.
      94             :   // If execution fails, sets a pending exception and returns RE_EXCEPTION.
      95             :   static int IrregexpExecRaw(Handle<JSRegExp> regexp,
      96             :                              Handle<String> subject,
      97             :                              int index,
      98             :                              int32_t* output,
      99             :                              int output_size);
     100             : 
     101             :   // Execute an Irregexp bytecode pattern.
     102             :   // On a successful match, the result is a JSArray containing
     103             :   // captured positions.  On a failure, the result is the null value.
     104             :   // Returns an empty handle in case of an exception.
     105             :   MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
     106             :       Handle<JSRegExp> regexp, Handle<String> subject, int index,
     107             :       Handle<RegExpMatchInfo> last_match_info);
     108             : 
     109             :   // Set last match info.  If match is nullptr, then setting captures is
     110             :   // omitted.
     111             :   static Handle<RegExpMatchInfo> SetLastMatchInfo(
     112             :       Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
     113             :       int capture_count, int32_t* match);
     114             : 
     115             :   class GlobalCache {
     116             :    public:
     117             :     GlobalCache(Handle<JSRegExp> regexp,
     118             :                 Handle<String> subject,
     119             :                 Isolate* isolate);
     120             : 
     121             :     INLINE(~GlobalCache());
     122             : 
     123             :     // Fetch the next entry in the cache for global regexp match results.
     124             :     // This does not set the last match info.  Upon failure, nullptr is
     125             :     // returned. The cause can be checked with Result().  The previous result is
     126             :     // still in available in memory when a failure happens.
     127             :     INLINE(int32_t* FetchNext());
     128             : 
     129             :     INLINE(int32_t* LastSuccessfulMatch());
     130             : 
     131             :     INLINE(bool HasException()) { return num_matches_ < 0; }
     132             : 
     133             :    private:
     134             :     int AdvanceZeroLength(int last_index);
     135             : 
     136             :     int num_matches_;
     137             :     int max_matches_;
     138             :     int current_match_index_;
     139             :     int registers_per_match_;
     140             :     // Pointer to the last set of captures.
     141             :     int32_t* register_array_;
     142             :     int register_array_size_;
     143             :     Handle<JSRegExp> regexp_;
     144             :     Handle<String> subject_;
     145             :   };
     146             : 
     147             :   // For acting on the JSRegExp data FixedArray.
     148             :   static int IrregexpMaxRegisterCount(FixedArray* re);
     149             :   static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
     150             :   static void SetIrregexpCaptureNameMap(FixedArray* re,
     151             :                                         Handle<FixedArray> value);
     152             :   static int IrregexpNumberOfCaptures(FixedArray* re);
     153             :   static int IrregexpNumberOfRegisters(FixedArray* re);
     154             :   static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
     155             :   static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
     156             : 
     157             :   // Limit the space regexps take up on the heap.  In order to limit this we
     158             :   // would like to keep track of the amount of regexp code on the heap.  This
     159             :   // is not tracked, however.  As a conservative approximation we track the
     160             :   // total regexp code compiled including code that has subsequently been freed
     161             :   // and the total executable memory at any point.
     162             :   static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
     163             :   static const size_t kRegExpCompiledLimit = 1 * MB;
     164             :   static const int kRegExpTooLargeToOptimize = 20 * KB;
     165             : 
     166             :  private:
     167             :   static bool CompileIrregexp(Handle<JSRegExp> re,
     168             :                               Handle<String> sample_subject, bool is_one_byte);
     169             :   static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
     170             :                                             Handle<String> sample_subject,
     171             :                                             bool is_one_byte);
     172             : };
     173             : 
     174             : 
     175             : // Represents the location of one element relative to the intersection of
     176             : // two sets. Corresponds to the four areas of a Venn diagram.
     177             : enum ElementInSetsRelation {
     178             :   kInsideNone = 0,
     179             :   kInsideFirst = 1,
     180             :   kInsideSecond = 2,
     181             :   kInsideBoth = 3
     182             : };
     183             : 
     184             : 
     185             : // A set of unsigned integers that behaves especially well on small
     186             : // integers (< 32).  May do zone-allocation.
     187             : class OutSet: public ZoneObject {
     188             :  public:
     189     1073102 :   OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
     190             :   OutSet* Extend(unsigned value, Zone* zone);
     191             :   bool Get(unsigned value) const;
     192             :   static const unsigned kFirstLimit = 32;
     193             : 
     194             :  private:
     195             :   // Destructively set a value in this set.  In most cases you want
     196             :   // to use Extend instead to ensure that only one instance exists
     197             :   // that contains the same values.
     198             :   void Set(unsigned value, Zone* zone);
     199             : 
     200             :   // The successors are a list of sets that contain the same values
     201             :   // as this set and the one more value that is not present in this
     202             :   // set.
     203             :   ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
     204             : 
     205             :   OutSet(uint32_t first, ZoneList<unsigned>* remaining)
     206       17565 :       : first_(first), remaining_(remaining), successors_(nullptr) {}
     207             :   uint32_t first_;
     208             :   ZoneList<unsigned>* remaining_;
     209             :   ZoneList<OutSet*>* successors_;
     210             :   friend class Trace;
     211             : };
     212             : 
     213             : 
     214             : // A mapping from integers, specified as ranges, to a set of integers.
     215             : // Used for mapping character ranges to choices.
     216             : class DispatchTable : public ZoneObject {
     217             :  public:
     218             :   explicit DispatchTable(Zone* zone) : tree_(zone) { }
     219             : 
     220             :   class Entry {
     221             :    public:
     222             :     Entry() : from_(0), to_(0), out_set_(nullptr) {}
     223             :     Entry(uc32 from, uc32 to, OutSet* out_set)
     224             :         : from_(from), to_(to), out_set_(out_set) {
     225             :       DCHECK(from <= to);
     226             :     }
     227             :     uc32 from() { return from_; }
     228             :     uc32 to() { return to_; }
     229        5398 :     void set_to(uc32 value) { to_ = value; }
     230             :     void AddValue(int value, Zone* zone) {
     231       78219 :       out_set_ = out_set_->Extend(value, zone);
     232             :     }
     233             :     OutSet* out_set() { return out_set_; }
     234             :    private:
     235             :     uc32 from_;
     236             :     uc32 to_;
     237             :     OutSet* out_set_;
     238             :   };
     239             : 
     240             :   class Config {
     241             :    public:
     242             :     typedef uc32 Key;
     243             :     typedef Entry Value;
     244             :     static const uc32 kNoKey;
     245             :     static const Entry NoValue() { return Value(); }
     246             :     static inline int Compare(uc32 a, uc32 b) {
     247     1558716 :       if (a == b)
     248             :         return 0;
     249     1521641 :       else if (a < b)
     250             :         return -1;
     251             :       else
     252             :         return 1;
     253             :     }
     254             :   };
     255             : 
     256             :   void AddRange(CharacterRange range, int value, Zone* zone);
     257             :   OutSet* Get(uc32 value);
     258             :   void Dump();
     259             : 
     260             :   template <typename Callback>
     261             :   void ForEach(Callback* callback) {
     262        2632 :     return tree()->ForEach(callback);
     263             :   }
     264             : 
     265             :  private:
     266             :   // There can't be a static empty set since it allocates its
     267             :   // successors in a zone and caches them.
     268             :   OutSet* empty() { return &empty_; }
     269             :   OutSet empty_;
     270             :   ZoneSplayTree<Config>* tree() { return &tree_; }
     271             :   ZoneSplayTree<Config> tree_;
     272             : };
     273             : 
     274             : 
     275             : // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
     276             : class UnicodeRangeSplitter {
     277             :  public:
     278             :   UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
     279             :   void Call(uc32 from, DispatchTable::Entry entry);
     280             : 
     281             :   ZoneList<CharacterRange>* bmp() { return bmp_; }
     282             :   ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
     283             :   ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
     284             :   ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
     285             : 
     286             :  private:
     287             :   static const int kBase = 0;
     288             :   // Separate ranges into
     289             :   static const int kBmpCodePoints = 1;
     290             :   static const int kLeadSurrogates = 2;
     291             :   static const int kTrailSurrogates = 3;
     292             :   static const int kNonBmpCodePoints = 4;
     293             : 
     294             :   Zone* zone_;
     295             :   DispatchTable table_;
     296             :   ZoneList<CharacterRange>* bmp_;
     297             :   ZoneList<CharacterRange>* lead_surrogates_;
     298             :   ZoneList<CharacterRange>* trail_surrogates_;
     299             :   ZoneList<CharacterRange>* non_bmp_;
     300             : };
     301             : 
     302             : 
     303             : #define FOR_EACH_NODE_TYPE(VISIT)                                    \
     304             :   VISIT(End)                                                         \
     305             :   VISIT(Action)                                                      \
     306             :   VISIT(Choice)                                                      \
     307             :   VISIT(BackReference)                                               \
     308             :   VISIT(Assertion)                                                   \
     309             :   VISIT(Text)
     310             : 
     311             : 
     312             : class Trace;
     313             : struct PreloadState;
     314             : class GreedyLoopState;
     315             : class AlternativeGenerationList;
     316             : 
     317             : struct NodeInfo {
     318             :   NodeInfo()
     319             :       : being_analyzed(false),
     320             :         been_analyzed(false),
     321             :         follows_word_interest(false),
     322             :         follows_newline_interest(false),
     323             :         follows_start_interest(false),
     324             :         at_end(false),
     325             :         visited(false),
     326     4880080 :         replacement_calculated(false) { }
     327             : 
     328             :   // Returns true if the interests and assumptions of this node
     329             :   // matches the given one.
     330             :   bool Matches(NodeInfo* that) {
     331             :     return (at_end == that->at_end) &&
     332             :            (follows_word_interest == that->follows_word_interest) &&
     333             :            (follows_newline_interest == that->follows_newline_interest) &&
     334             :            (follows_start_interest == that->follows_start_interest);
     335             :   }
     336             : 
     337             :   // Updates the interests of this node given the interests of the
     338             :   // node preceding it.
     339             :   void AddFromPreceding(NodeInfo* that) {
     340             :     at_end |= that->at_end;
     341             :     follows_word_interest |= that->follows_word_interest;
     342             :     follows_newline_interest |= that->follows_newline_interest;
     343             :     follows_start_interest |= that->follows_start_interest;
     344             :   }
     345             : 
     346             :   bool HasLookbehind() {
     347             :     return follows_word_interest ||
     348             :            follows_newline_interest ||
     349             :            follows_start_interest;
     350             :   }
     351             : 
     352             :   // Sets the interests of this node to include the interests of the
     353             :   // following node.
     354             :   void AddFromFollowing(NodeInfo* that) {
     355      632164 :     follows_word_interest |= that->follows_word_interest;
     356      632164 :     follows_newline_interest |= that->follows_newline_interest;
     357      632164 :     follows_start_interest |= that->follows_start_interest;
     358             :   }
     359             : 
     360             :   void ResetCompilationState() {
     361             :     being_analyzed = false;
     362             :     been_analyzed = false;
     363             :   }
     364             : 
     365             :   bool being_analyzed: 1;
     366             :   bool been_analyzed: 1;
     367             : 
     368             :   // These bits are set of this node has to know what the preceding
     369             :   // character was.
     370             :   bool follows_word_interest: 1;
     371             :   bool follows_newline_interest: 1;
     372             :   bool follows_start_interest: 1;
     373             : 
     374             :   bool at_end: 1;
     375             :   bool visited: 1;
     376             :   bool replacement_calculated: 1;
     377             : };
     378             : 
     379             : 
     380             : // Details of a quick mask-compare check that can look ahead in the
     381             : // input stream.
     382             : class QuickCheckDetails {
     383             :  public:
     384             :   QuickCheckDetails()
     385             :       : characters_(0),
     386             :         mask_(0),
     387             :         value_(0),
     388    15963540 :         cannot_match_(false) { }
     389             :   explicit QuickCheckDetails(int characters)
     390             :       : characters_(characters),
     391             :         mask_(0),
     392             :         value_(0),
     393      868115 :         cannot_match_(false) { }
     394             :   bool Rationalize(bool one_byte);
     395             :   // Merge in the information from another branch of an alternation.
     396             :   void Merge(QuickCheckDetails* other, int from_index);
     397             :   // Advance the current position by some amount.
     398             :   void Advance(int by, bool one_byte);
     399             :   void Clear();
     400             :   bool cannot_match() { return cannot_match_; }
     401         491 :   void set_cannot_match() { cannot_match_ = true; }
     402             :   struct Position {
     403    13465324 :     Position() : mask(0), value(0), determines_perfectly(false) { }
     404             :     uc16 mask;
     405             :     uc16 value;
     406             :     bool determines_perfectly;
     407             :   };
     408             :   int characters() { return characters_; }
     409      652878 :   void set_characters(int characters) { characters_ = characters; }
     410             :   Position* positions(int index) {
     411             :     DCHECK_LE(0, index);
     412             :     DCHECK_GT(characters_, index);
     413     1890514 :     return positions_ + index;
     414             :   }
     415             :   uint32_t mask() { return mask_; }
     416             :   uint32_t value() { return value_; }
     417             : 
     418             :  private:
     419             :   // How many characters do we have quick check information from.  This is
     420             :   // the same for all branches of a choice node.
     421             :   int characters_;
     422             :   Position positions_[4];
     423             :   // These values are the condensate of the above array after Rationalize().
     424             :   uint32_t mask_;
     425             :   uint32_t value_;
     426             :   // If set to true, there is no way this quick check can match at all.
     427             :   // E.g., if it requires to be at the start of the input, and isn't.
     428             :   bool cannot_match_;
     429             : };
     430             : 
     431             : 
     432             : extern int kUninitializedRegExpNodePlaceHolder;
     433             : 
     434             : 
     435             : class RegExpNode: public ZoneObject {
     436             :  public:
     437             :   explicit RegExpNode(Zone* zone)
     438             :       : replacement_(nullptr),
     439             :         on_work_list_(false),
     440             :         trace_count_(0),
     441    14640240 :         zone_(zone) {
     442     4880080 :     bm_info_[0] = bm_info_[1] = nullptr;
     443             :   }
     444             :   virtual ~RegExpNode();
     445             :   virtual void Accept(NodeVisitor* visitor) = 0;
     446             :   // Generates a goto to this node or actually generates the code at this point.
     447             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
     448             :   // How many characters must this node consume at a minimum in order to
     449             :   // succeed.  If we have found at least 'still_to_find' characters that
     450             :   // must be consumed there is no need to ask any following nodes whether
     451             :   // they are sure to eat any more characters.  The not_at_start argument is
     452             :   // used to indicate that we know we are not at the start of the input.  In
     453             :   // this case anchored branches will always fail and can be ignored when
     454             :   // determining how many characters are consumed on success.
     455             :   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
     456             :   // Emits some quick code that checks whether the preloaded characters match.
     457             :   // Falls through on certain failure, jumps to the label on possible success.
     458             :   // If the node cannot make a quick check it does nothing and returns false.
     459             :   bool EmitQuickCheck(RegExpCompiler* compiler,
     460             :                       Trace* bounds_check_trace,
     461             :                       Trace* trace,
     462             :                       bool preload_has_checked_bounds,
     463             :                       Label* on_possible_success,
     464             :                       QuickCheckDetails* details_return,
     465             :                       bool fall_through_on_failure);
     466             :   // For a given number of characters this returns a mask and a value.  The
     467             :   // next n characters are anded with the mask and compared with the value.
     468             :   // A comparison failure indicates the node cannot match the next n characters.
     469             :   // A comparison success indicates the node may match.
     470             :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     471             :                                     RegExpCompiler* compiler,
     472             :                                     int characters_filled_in,
     473             :                                     bool not_at_start) = 0;
     474             :   static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
     475       98187 :   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
     476             :   // Only returns the successor for a text node of length 1 that matches any
     477             :   // character and that has no guards on it.
     478       95615 :   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
     479             :       RegExpCompiler* compiler) {
     480       95615 :     return nullptr;
     481             :   }
     482             : 
     483             :   // Collects information on the possible code units (mod 128) that can match if
     484             :   // we look forward.  This is used for a Boyer-Moore-like string searching
     485             :   // implementation.  TODO(erikcorry):  This should share more code with
     486             :   // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
     487             :   // the number of nodes we are willing to look at in order to create this data.
     488             :   static const int kRecursionBudget = 200;
     489             :   bool KeepRecursing(RegExpCompiler* compiler);
     490           0 :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     491             :                             BoyerMooreLookahead* bm, bool not_at_start) {
     492           0 :     UNREACHABLE();
     493             :   }
     494             : 
     495             :   // If we know that the input is one-byte then there are some nodes that can
     496             :   // never match.  This method returns a node that can be substituted for
     497             :   // itself, or nullptr if the node can never match.
     498       22309 :   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
     499       22309 :     return this;
     500             :   }
     501             :   // Helper for FilterOneByte.
     502             :   RegExpNode* replacement() {
     503             :     DCHECK(info()->replacement_calculated);
     504             :     return replacement_;
     505             :   }
     506             :   RegExpNode* set_replacement(RegExpNode* replacement) {
     507      243130 :     info()->replacement_calculated = true;
     508      243130 :     replacement_ =  replacement;
     509             :     return replacement;  // For convenience.
     510             :   }
     511             : 
     512             :   // We want to avoid recalculating the lookahead info, so we store it on the
     513             :   // node.  Only info that is for this node is stored.  We can tell that the
     514             :   // info is for this node when offset == 0, so the information is calculated
     515             :   // relative to this node.
     516             :   void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
     517      133622 :     if (offset == 0) set_bm_info(not_at_start, bm);
     518             :   }
     519             : 
     520             :   Label* label() { return &label_; }
     521             :   // If non-generic code is generated for a node (i.e. the node is not at the
     522             :   // start of the trace) then it cannot be reused.  This variable sets a limit
     523             :   // on how often we allow that to happen before we insist on starting a new
     524             :   // trace and generating generic code for a node that can be reused by flushing
     525             :   // the deferred actions in the current trace and generating a goto.
     526             :   static const int kMaxCopiesCodeGenerated = 10;
     527             : 
     528             :   bool on_work_list() { return on_work_list_; }
     529      430832 :   void set_on_work_list(bool value) { on_work_list_ = value; }
     530             : 
     531             :   NodeInfo* info() { return &info_; }
     532             : 
     533             :   BoyerMooreLookahead* bm_info(bool not_at_start) {
     534       89442 :     return bm_info_[not_at_start ? 1 : 0];
     535             :   }
     536             : 
     537             :   Zone* zone() const { return zone_; }
     538             : 
     539             :  protected:
     540             :   enum LimitResult { DONE, CONTINUE };
     541             :   RegExpNode* replacement_;
     542             : 
     543             :   LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
     544             : 
     545             :   void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
     546      199753 :     bm_info_[not_at_start ? 1 : 0] = bm;
     547             :   }
     548             : 
     549             :  private:
     550             :   static const int kFirstCharBudget = 10;
     551             :   Label label_;
     552             :   bool on_work_list_;
     553             :   NodeInfo info_;
     554             :   // This variable keeps track of how many times code has been generated for
     555             :   // this node (in different traces).  We don't keep track of where the
     556             :   // generated code is located unless the code is generated at the start of
     557             :   // a trace, in which case it is generic and can be reused by flushing the
     558             :   // deferred operations in the current trace and generating a goto.
     559             :   int trace_count_;
     560             :   BoyerMooreLookahead* bm_info_[2];
     561             : 
     562             :   Zone* zone_;
     563             : };
     564             : 
     565             : 
     566           0 : class SeqRegExpNode: public RegExpNode {
     567             :  public:
     568     3638916 :   explicit SeqRegExpNode(RegExpNode* on_success)
     569     3638916 :       : RegExpNode(on_success->zone()), on_success_(on_success) { }
     570             :   RegExpNode* on_success() { return on_success_; }
     571             :   void set_on_success(RegExpNode* node) { on_success_ = node; }
     572             :   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
     573           0 :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     574             :                             BoyerMooreLookahead* bm, bool not_at_start) {
     575           0 :     on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
     576           0 :     if (offset == 0) set_bm_info(not_at_start, bm);
     577           0 :   }
     578             : 
     579             :  protected:
     580             :   RegExpNode* FilterSuccessor(int depth, bool ignore_case);
     581             : 
     582             :  private:
     583             :   RegExpNode* on_success_;
     584             : };
     585             : 
     586             : 
     587           0 : class ActionNode: public SeqRegExpNode {
     588             :  public:
     589             :   enum ActionType {
     590             :     SET_REGISTER,
     591             :     INCREMENT_REGISTER,
     592             :     STORE_POSITION,
     593             :     BEGIN_SUBMATCH,
     594             :     POSITIVE_SUBMATCH_SUCCESS,
     595             :     EMPTY_MATCH_CHECK,
     596             :     CLEAR_CAPTURES
     597             :   };
     598             :   static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
     599             :   static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
     600             :   static ActionNode* StorePosition(int reg,
     601             :                                    bool is_capture,
     602             :                                    RegExpNode* on_success);
     603             :   static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
     604             :   static ActionNode* BeginSubmatch(int stack_pointer_reg,
     605             :                                    int position_reg,
     606             :                                    RegExpNode* on_success);
     607             :   static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
     608             :                                              int restore_reg,
     609             :                                              int clear_capture_count,
     610             :                                              int clear_capture_from,
     611             :                                              RegExpNode* on_success);
     612             :   static ActionNode* EmptyMatchCheck(int start_register,
     613             :                                      int repetition_register,
     614             :                                      int repetition_limit,
     615             :                                      RegExpNode* on_success);
     616             :   virtual void Accept(NodeVisitor* visitor);
     617             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     618             :   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
     619       84910 :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     620             :                                     RegExpCompiler* compiler,
     621             :                                     int filled_in,
     622             :                                     bool not_at_start) {
     623       84910 :     return on_success()->GetQuickCheckDetails(
     624       84910 :         details, compiler, filled_in, not_at_start);
     625             :   }
     626             :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     627             :                             BoyerMooreLookahead* bm, bool not_at_start);
     628             :   ActionType action_type() { return action_type_; }
     629             :   // TODO(erikcorry): We should allow some action nodes in greedy loops.
     630      132786 :   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
     631             : 
     632             :  private:
     633             :   union {
     634             :     struct {
     635             :       int reg;
     636             :       int value;
     637             :     } u_store_register;
     638             :     struct {
     639             :       int reg;
     640             :     } u_increment_register;
     641             :     struct {
     642             :       int reg;
     643             :       bool is_capture;
     644             :     } u_position_register;
     645             :     struct {
     646             :       int stack_pointer_register;
     647             :       int current_position_register;
     648             :       int clear_register_count;
     649             :       int clear_register_from;
     650             :     } u_submatch;
     651             :     struct {
     652             :       int start_register;
     653             :       int repetition_register;
     654             :       int repetition_limit;
     655             :     } u_empty_match_check;
     656             :     struct {
     657             :       int range_from;
     658             :       int range_to;
     659             :     } u_clear_captures;
     660             :   } data_;
     661             :   ActionNode(ActionType action_type, RegExpNode* on_success)
     662             :       : SeqRegExpNode(on_success),
     663     2285655 :         action_type_(action_type) { }
     664             :   ActionType action_type_;
     665             :   friend class DotPrinter;
     666             : };
     667             : 
     668             : 
     669           0 : class TextNode: public SeqRegExpNode {
     670             :  public:
     671             :   TextNode(ZoneList<TextElement>* elms, bool read_backward,
     672             :            RegExpNode* on_success)
     673     1146916 :       : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
     674      194874 :   TextNode(RegExpCharacterClass* that, bool read_backward,
     675             :            RegExpNode* on_success)
     676             :       : SeqRegExpNode(on_success),
     677      389748 :         elms_(new (zone()) ZoneList<TextElement>(1, zone())),
     678      389748 :         read_backward_(read_backward) {
     679      194874 :     elms_->Add(TextElement::CharClass(that), zone());
     680      194874 :   }
     681             :   // Create TextNode for a single character class for the given ranges.
     682             :   static TextNode* CreateForCharacterRanges(Zone* zone,
     683             :                                             ZoneList<CharacterRange>* ranges,
     684             :                                             bool read_backward,
     685             :                                             RegExpNode* on_success);
     686             :   // Create TextNode for a surrogate pair with a range given for the
     687             :   // lead and the trail surrogate each.
     688             :   static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
     689             :                                           CharacterRange trail,
     690             :                                           bool read_backward,
     691             :                                           RegExpNode* on_success);
     692             :   virtual void Accept(NodeVisitor* visitor);
     693             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     694             :   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
     695             :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     696             :                                     RegExpCompiler* compiler,
     697             :                                     int characters_filled_in,
     698             :                                     bool not_at_start);
     699             :   ZoneList<TextElement>* elements() { return elms_; }
     700             :   bool read_backward() { return read_backward_; }
     701             :   void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
     702             :   virtual int GreedyLoopTextLength();
     703             :   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
     704             :       RegExpCompiler* compiler);
     705             :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     706             :                             BoyerMooreLookahead* bm, bool not_at_start);
     707             :   void CalculateOffsets();
     708             :   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
     709             : 
     710             :  private:
     711             :   enum TextEmitPassType {
     712             :     NON_LATIN1_MATCH,            // Check for characters that can't match.
     713             :     SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
     714             :     NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
     715             :     CASE_CHARACTER_MATCH,        // Case-independent single character check.
     716             :     CHARACTER_CLASS_MATCH        // Character class.
     717             :   };
     718             :   static bool SkipPass(int pass, bool ignore_case);
     719             :   static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
     720             :   static const int kLastPass = CHARACTER_CLASS_MATCH;
     721             :   void TextEmitPass(RegExpCompiler* compiler,
     722             :                     TextEmitPassType pass,
     723             :                     bool preloaded,
     724             :                     Trace* trace,
     725             :                     bool first_element_checked,
     726             :                     int* checked_up_to);
     727             :   int Length();
     728             :   ZoneList<TextElement>* elms_;
     729             :   bool read_backward_;
     730             : };
     731             : 
     732             : 
     733           0 : class AssertionNode: public SeqRegExpNode {
     734             :  public:
     735             :   enum AssertionType {
     736             :     AT_END,
     737             :     AT_START,
     738             :     AT_BOUNDARY,
     739             :     AT_NON_BOUNDARY,
     740             :     AFTER_NEWLINE
     741             :   };
     742        3474 :   static AssertionNode* AtEnd(RegExpNode* on_success) {
     743        3474 :     return new(on_success->zone()) AssertionNode(AT_END, on_success);
     744             :   }
     745        4277 :   static AssertionNode* AtStart(RegExpNode* on_success) {
     746        4277 :     return new(on_success->zone()) AssertionNode(AT_START, on_success);
     747             :   }
     748         145 :   static AssertionNode* AtBoundary(RegExpNode* on_success) {
     749         145 :     return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
     750             :   }
     751         124 :   static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
     752         124 :     return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
     753             :   }
     754         782 :   static AssertionNode* AfterNewline(RegExpNode* on_success) {
     755         782 :     return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
     756             :   }
     757             :   virtual void Accept(NodeVisitor* visitor);
     758             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     759             :   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
     760             :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     761             :                                     RegExpCompiler* compiler,
     762             :                                     int filled_in,
     763             :                                     bool not_at_start);
     764             :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     765             :                             BoyerMooreLookahead* bm, bool not_at_start);
     766             :   AssertionType assertion_type() { return assertion_type_; }
     767             : 
     768             :  private:
     769             :   void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
     770             :   enum IfPrevious { kIsNonWord, kIsWord };
     771             :   void BacktrackIfPrevious(RegExpCompiler* compiler,
     772             :                            Trace* trace,
     773             :                            IfPrevious backtrack_if_previous);
     774             :   AssertionNode(AssertionType t, RegExpNode* on_success)
     775        8802 :       : SeqRegExpNode(on_success), assertion_type_(t) { }
     776             :   AssertionType assertion_type_;
     777             : };
     778             : 
     779             : 
     780           0 : class BackReferenceNode: public SeqRegExpNode {
     781             :  public:
     782             :   BackReferenceNode(int start_reg, int end_reg, bool read_backward,
     783             :                     RegExpNode* on_success)
     784             :       : SeqRegExpNode(on_success),
     785             :         start_reg_(start_reg),
     786             :         end_reg_(end_reg),
     787        2669 :         read_backward_(read_backward) {}
     788             :   virtual void Accept(NodeVisitor* visitor);
     789             :   int start_register() { return start_reg_; }
     790             :   int end_register() { return end_reg_; }
     791             :   bool read_backward() { return read_backward_; }
     792             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     793             :   virtual int EatsAtLeast(int still_to_find,
     794             :                           int recursion_depth,
     795             :                           bool not_at_start);
     796         615 :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     797             :                                     RegExpCompiler* compiler,
     798             :                                     int characters_filled_in,
     799             :                                     bool not_at_start) {
     800         615 :     return;
     801             :   }
     802             :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     803             :                             BoyerMooreLookahead* bm, bool not_at_start);
     804             : 
     805             :  private:
     806             :   int start_reg_;
     807             :   int end_reg_;
     808             :   bool read_backward_;
     809             : };
     810             : 
     811             : 
     812           0 : class EndNode: public RegExpNode {
     813             :  public:
     814             :   enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
     815       97200 :   EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
     816             :   virtual void Accept(NodeVisitor* visitor);
     817             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     818      130683 :   virtual int EatsAtLeast(int still_to_find,
     819             :                           int recursion_depth,
     820      130683 :                           bool not_at_start) { return 0; }
     821           0 :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     822             :                                     RegExpCompiler* compiler,
     823             :                                     int characters_filled_in,
     824             :                                     bool not_at_start) {
     825             :     // Returning 0 from EatsAtLeast should ensure we never get here.
     826           0 :     UNREACHABLE();
     827             :   }
     828           0 :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     829             :                             BoyerMooreLookahead* bm, bool not_at_start) {
     830             :     // Returning 0 from EatsAtLeast should ensure we never get here.
     831           0 :     UNREACHABLE();
     832             :   }
     833             : 
     834             :  private:
     835             :   Action action_;
     836             : };
     837             : 
     838             : 
     839           0 : class NegativeSubmatchSuccess: public EndNode {
     840             :  public:
     841             :   NegativeSubmatchSuccess(int stack_pointer_reg,
     842             :                           int position_reg,
     843             :                           int clear_capture_count,
     844             :                           int clear_capture_start,
     845             :                           Zone* zone)
     846             :       : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
     847             :         stack_pointer_register_(stack_pointer_reg),
     848             :         current_position_register_(position_reg),
     849             :         clear_capture_count_(clear_capture_count),
     850        2928 :         clear_capture_start_(clear_capture_start) { }
     851             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     852             : 
     853             :  private:
     854             :   int stack_pointer_register_;
     855             :   int current_position_register_;
     856             :   int clear_capture_count_;
     857             :   int clear_capture_start_;
     858             : };
     859             : 
     860             : 
     861             : class Guard: public ZoneObject {
     862             :  public:
     863             :   enum Relation { LT, GEQ };
     864             :   Guard(int reg, Relation op, int value)
     865             :       : reg_(reg),
     866             :         op_(op),
     867     1007347 :         value_(value) { }
     868             :   int reg() { return reg_; }
     869             :   Relation op() { return op_; }
     870             :   int value() { return value_; }
     871             : 
     872             :  private:
     873             :   int reg_;
     874             :   Relation op_;
     875             :   int value_;
     876             : };
     877             : 
     878             : 
     879             : class GuardedAlternative {
     880             :  public:
     881             :   explicit GuardedAlternative(RegExpNode* node)
     882     2227726 :       : node_(node), guards_(nullptr) {}
     883             :   void AddGuard(Guard* guard, Zone* zone);
     884             :   RegExpNode* node() { return node_; }
     885       87022 :   void set_node(RegExpNode* node) { node_ = node; }
     886             :   ZoneList<Guard*>* guards() { return guards_; }
     887             : 
     888             :  private:
     889             :   RegExpNode* node_;
     890             :   ZoneList<Guard*>* guards_;
     891             : };
     892             : 
     893             : 
     894             : class AlternativeGeneration;
     895             : 
     896             : 
     897           0 : class ChoiceNode: public RegExpNode {
     898             :  public:
     899     1143964 :   explicit ChoiceNode(int expected_size, Zone* zone)
     900             :       : RegExpNode(zone),
     901             :         alternatives_(new (zone)
     902     1143964 :                           ZoneList<GuardedAlternative>(expected_size, zone)),
     903             :         table_(nullptr),
     904             :         not_at_start_(false),
     905     2287928 :         being_calculated_(false) {}
     906             :   virtual void Accept(NodeVisitor* visitor);
     907     2354582 :   void AddAlternative(GuardedAlternative node) {
     908     4693935 :     alternatives()->Add(node, zone());
     909             :   }
     910             :   ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
     911             :   DispatchTable* GetTable(bool ignore_case);
     912             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
     913             :   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
     914             :   int EatsAtLeastHelper(int still_to_find,
     915             :                         int budget,
     916             :                         RegExpNode* ignore_this_node,
     917             :                         bool not_at_start);
     918             :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     919             :                                     RegExpCompiler* compiler,
     920             :                                     int characters_filled_in,
     921             :                                     bool not_at_start);
     922             :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     923             :                             BoyerMooreLookahead* bm, bool not_at_start);
     924             : 
     925             :   bool being_calculated() { return being_calculated_; }
     926             :   bool not_at_start() { return not_at_start_; }
     927        7831 :   void set_not_at_start() { not_at_start_ = true; }
     928           0 :   void set_being_calculated(bool b) { being_calculated_ = b; }
     929      520155 :   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
     930      520155 :     return true;
     931             :   }
     932             :   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
     933           0 :   virtual bool read_backward() { return false; }
     934             : 
     935             :  protected:
     936             :   int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
     937             :   ZoneList<GuardedAlternative>* alternatives_;
     938             : 
     939             :  private:
     940             :   friend class DispatchTableConstructor;
     941             :   friend class Analysis;
     942             :   void GenerateGuard(RegExpMacroAssembler* macro_assembler,
     943             :                      Guard* guard,
     944             :                      Trace* trace);
     945             :   int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
     946             :   void EmitOutOfLineContinuation(RegExpCompiler* compiler,
     947             :                                  Trace* trace,
     948             :                                  GuardedAlternative alternative,
     949             :                                  AlternativeGeneration* alt_gen,
     950             :                                  int preload_characters,
     951             :                                  bool next_expects_preload);
     952             :   void SetUpPreLoad(RegExpCompiler* compiler,
     953             :                     Trace* current_trace,
     954             :                     PreloadState* preloads);
     955             :   void AssertGuardsMentionRegisters(Trace* trace);
     956             :   int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
     957             :   Trace* EmitGreedyLoop(RegExpCompiler* compiler,
     958             :                         Trace* trace,
     959             :                         AlternativeGenerationList* alt_gens,
     960             :                         PreloadState* preloads,
     961             :                         GreedyLoopState* greedy_loop_state,
     962             :                         int text_length);
     963             :   void EmitChoices(RegExpCompiler* compiler,
     964             :                    AlternativeGenerationList* alt_gens,
     965             :                    int first_choice,
     966             :                    Trace* trace,
     967             :                    PreloadState* preloads);
     968             :   DispatchTable* table_;
     969             :   // If true, this node is never checked at the start of the input.
     970             :   // Allows a new trace to start with at_start() set to false.
     971             :   bool not_at_start_;
     972             :   bool being_calculated_;
     973             : };
     974             : 
     975             : 
     976           0 : class NegativeLookaroundChoiceNode : public ChoiceNode {
     977             :  public:
     978        2928 :   explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
     979             :                                         GuardedAlternative then_do_this,
     980             :                                         Zone* zone)
     981        2928 :       : ChoiceNode(2, zone) {
     982             :     AddAlternative(this_must_fail);
     983             :     AddAlternative(then_do_this);
     984        2928 :   }
     985             :   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
     986             :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
     987             :                                     RegExpCompiler* compiler,
     988             :                                     int characters_filled_in,
     989             :                                     bool not_at_start);
     990        1187 :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
     991             :                             BoyerMooreLookahead* bm, bool not_at_start) {
     992        2374 :     alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
     993        1187 :                                               not_at_start);
     994        1187 :     if (offset == 0) set_bm_info(not_at_start, bm);
     995        1187 :   }
     996             :   // For a negative lookahead we don't emit the quick check for the
     997             :   // alternative that is expected to fail.  This is because quick check code
     998             :   // starts by loading enough characters for the alternative that takes fewest
     999             :   // characters, but on a negative lookahead the negative branch did not take
    1000             :   // part in that calculation (EatsAtLeast) so the assumptions don't hold.
    1001        5832 :   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
    1002        5832 :     return !is_first;
    1003             :   }
    1004             :   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
    1005             : };
    1006             : 
    1007             : 
    1008           0 : class LoopChoiceNode: public ChoiceNode {
    1009             :  public:
    1010             :   LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
    1011             :       : ChoiceNode(2, zone),
    1012             :         loop_node_(nullptr),
    1013             :         continue_node_(nullptr),
    1014             :         body_can_be_zero_length_(body_can_be_zero_length),
    1015     1112399 :         read_backward_(read_backward) {}
    1016             :   void AddLoopAlternative(GuardedAlternative alt);
    1017             :   void AddContinueAlternative(GuardedAlternative alt);
    1018             :   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
    1019             :   virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
    1020             :   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
    1021             :                                     RegExpCompiler* compiler,
    1022             :                                     int characters_filled_in,
    1023             :                                     bool not_at_start);
    1024             :   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
    1025             :                             BoyerMooreLookahead* bm, bool not_at_start);
    1026             :   RegExpNode* loop_node() { return loop_node_; }
    1027             :   RegExpNode* continue_node() { return continue_node_; }
    1028             :   bool body_can_be_zero_length() { return body_can_be_zero_length_; }
    1029       29618 :   virtual bool read_backward() { return read_backward_; }
    1030             :   virtual void Accept(NodeVisitor* visitor);
    1031             :   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
    1032             : 
    1033             :  private:
    1034             :   // AddAlternative is made private for loop nodes because alternatives
    1035             :   // should not be added freely, we need to keep track of which node
    1036             :   // goes back to the node itself.
    1037             :   void AddAlternative(GuardedAlternative node) {
    1038             :     ChoiceNode::AddAlternative(node);
    1039             :   }
    1040             : 
    1041             :   RegExpNode* loop_node_;
    1042             :   RegExpNode* continue_node_;
    1043             :   bool body_can_be_zero_length_;
    1044             :   bool read_backward_;
    1045             : };
    1046             : 
    1047             : 
    1048             : // Improve the speed that we scan for an initial point where a non-anchored
    1049             : // regexp can match by using a Boyer-Moore-like table. This is done by
    1050             : // identifying non-greedy non-capturing loops in the nodes that eat any
    1051             : // character one at a time.  For example in the middle of the regexp
    1052             : // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
    1053             : // inserted at the start of any non-anchored regexp.
    1054             : //
    1055             : // When we have found such a loop we look ahead in the nodes to find the set of
    1056             : // characters that can come at given distances. For example for the regexp
    1057             : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
    1058             : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
    1059             : // the lookahead info where the set of characters is reasonably constrained. In
    1060             : // our example this is from index 1 to 2 (0 is not constrained). We can now
    1061             : // look 3 characters ahead and if we don't find one of [f, o] (the union of
    1062             : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
    1063             : //
    1064             : // For Unicode input strings we do the same, but modulo 128.
    1065             : //
    1066             : // We also look at the first string fed to the regexp and use that to get a hint
    1067             : // of the character frequencies in the inputs. This affects the assessment of
    1068             : // whether the set of characters is 'reasonably constrained'.
    1069             : //
    1070             : // We also have another lookahead mechanism (called quick check in the code),
    1071             : // which uses a wide load of multiple characters followed by a mask and compare
    1072             : // to determine whether a match is possible at this point.
    1073             : enum ContainedInLattice {
    1074             :   kNotYet = 0,
    1075             :   kLatticeIn = 1,
    1076             :   kLatticeOut = 2,
    1077             :   kLatticeUnknown = 3  // Can also mean both in and out.
    1078             : };
    1079             : 
    1080             : 
    1081             : inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
    1082      885017 :   return static_cast<ContainedInLattice>(a | b);
    1083             : }
    1084             : 
    1085             : 
    1086             : ContainedInLattice AddRange(ContainedInLattice a,
    1087             :                             const int* ranges,
    1088             :                             int ranges_size,
    1089             :                             Interval new_range);
    1090             : 
    1091             : 
    1092             : class BoyerMoorePositionInfo : public ZoneObject {
    1093             :  public:
    1094      130495 :   explicit BoyerMoorePositionInfo(Zone* zone)
    1095      130495 :       : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
    1096             :         map_count_(0),
    1097             :         w_(kNotYet),
    1098             :         s_(kNotYet),
    1099             :         d_(kNotYet),
    1100      130495 :         surrogate_(kNotYet) {
    1101    16833855 :      for (int i = 0; i < kMapSize; i++) {
    1102    16703360 :        map_->Add(false, zone);
    1103             :      }
    1104      130495 :   }
    1105             : 
    1106             :   bool& at(int i) { return map_->at(i); }
    1107             : 
    1108             :   static const int kMapSize = 128;
    1109             :   static const int kMask = kMapSize - 1;
    1110             : 
    1111             :   int map_count() const { return map_count_; }
    1112             : 
    1113             :   void Set(int character);
    1114             :   void SetInterval(const Interval& interval);
    1115             :   void SetAll();
    1116             :   bool is_non_word() { return w_ == kLatticeOut; }
    1117             :   bool is_word() { return w_ == kLatticeIn; }
    1118             : 
    1119             :  private:
    1120             :   ZoneList<bool>* map_;
    1121             :   int map_count_;  // Number of set bits in the map.
    1122             :   ContainedInLattice w_;  // The \w character class.
    1123             :   ContainedInLattice s_;  // The \s character class.
    1124             :   ContainedInLattice d_;  // The \d character class.
    1125             :   ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
    1126             : };
    1127             : 
    1128             : 
    1129             : class BoyerMooreLookahead : public ZoneObject {
    1130             :  public:
    1131             :   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
    1132             : 
    1133             :   int length() { return length_; }
    1134             :   int max_char() { return max_char_; }
    1135             :   RegExpCompiler* compiler() { return compiler_; }
    1136             : 
    1137             :   int Count(int map_number) {
    1138      653831 :     return bitmaps_->at(map_number)->map_count();
    1139             :   }
    1140             : 
    1141         167 :   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
    1142             : 
    1143       65072 :   void Set(int map_number, int character) {
    1144      130144 :     if (character > max_char_) return;
    1145      130144 :     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    1146             :     info->Set(character);
    1147             :   }
    1148             : 
    1149      402942 :   void SetInterval(int map_number, const Interval& interval) {
    1150      402942 :     if (interval.from() > max_char_) return;
    1151      402942 :     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
    1152      201471 :     if (interval.to() > max_char_) {
    1153           0 :       info->SetInterval(Interval(interval.from(), max_char_));
    1154             :     } else {
    1155      201471 :       info->SetInterval(interval);
    1156             :     }
    1157             :   }
    1158             : 
    1159        5675 :   void SetAll(int map_number) {
    1160       11350 :     bitmaps_->at(map_number)->SetAll();
    1161        5675 :   }
    1162             : 
    1163        1145 :   void SetRest(int from_map) {
    1164        1145 :     for (int i = from_map; i < length_; i++) SetAll(i);
    1165             :   }
    1166             :   void EmitSkipInstructions(RegExpMacroAssembler* masm);
    1167             : 
    1168             :  private:
    1169             :   // This is the value obtained by EatsAtLeast.  If we do not have at least this
    1170             :   // many characters left in the sample string then the match is bound to fail.
    1171             :   // Therefore it is OK to read a character this far ahead of the current match
    1172             :   // point.
    1173             :   int length_;
    1174             :   RegExpCompiler* compiler_;
    1175             :   // 0xff for Latin1, 0xffff for UTF-16.
    1176             :   int max_char_;
    1177             :   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
    1178             : 
    1179             :   int GetSkipTable(int min_lookahead,
    1180             :                    int max_lookahead,
    1181             :                    Handle<ByteArray> boolean_skip_table);
    1182             :   bool FindWorthwhileInterval(int* from, int* to);
    1183             :   int FindBestInterval(
    1184             :     int max_number_of_chars, int old_biggest_points, int* from, int* to);
    1185             : };
    1186             : 
    1187             : 
    1188             : // There are many ways to generate code for a node.  This class encapsulates
    1189             : // the current way we should be generating.  In other words it encapsulates
    1190             : // the current state of the code generator.  The effect of this is that we
    1191             : // generate code for paths that the matcher can take through the regular
    1192             : // expression.  A given node in the regexp can be code-generated several times
    1193             : // as it can be part of several traces.  For example for the regexp:
    1194             : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
    1195             : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
    1196             : // to match foo is generated only once (the traces have a common prefix).  The
    1197             : // code to store the capture is deferred and generated (twice) after the places
    1198             : // where baz has been matched.
    1199             : class Trace {
    1200             :  public:
    1201             :   // A value for a property that is either known to be true, know to be false,
    1202             :   // or not known.
    1203             :   enum TriBool {
    1204             :     UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
    1205             :   };
    1206             : 
    1207             :   class DeferredAction {
    1208             :    public:
    1209             :     DeferredAction(ActionNode::ActionType action_type, int reg)
    1210      309577 :         : action_type_(action_type), reg_(reg), next_(nullptr) {}
    1211             :     DeferredAction* next() { return next_; }
    1212             :     bool Mentions(int reg);
    1213             :     int reg() { return reg_; }
    1214             :     ActionNode::ActionType action_type() { return action_type_; }
    1215             :    private:
    1216             :     ActionNode::ActionType action_type_;
    1217             :     int reg_;
    1218             :     DeferredAction* next_;
    1219             :     friend class Trace;
    1220             :   };
    1221             : 
    1222             :   class DeferredCapture : public DeferredAction {
    1223             :    public:
    1224      292851 :     DeferredCapture(int reg, bool is_capture, Trace* trace)
    1225             :         : DeferredAction(ActionNode::STORE_POSITION, reg),
    1226             :           cp_offset_(trace->cp_offset()),
    1227      292851 :           is_capture_(is_capture) { }
    1228             :     int cp_offset() { return cp_offset_; }
    1229             :     bool is_capture() { return is_capture_; }
    1230             :    private:
    1231             :     int cp_offset_;
    1232             :     bool is_capture_;
    1233             :     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
    1234             :   };
    1235             : 
    1236             :   class DeferredSetRegister : public DeferredAction {
    1237             :    public:
    1238             :     DeferredSetRegister(int reg, int value)
    1239             :         : DeferredAction(ActionNode::SET_REGISTER, reg),
    1240        6013 :           value_(value) { }
    1241             :     int value() { return value_; }
    1242             :    private:
    1243             :     int value_;
    1244             :   };
    1245             : 
    1246             :   class DeferredClearCaptures : public DeferredAction {
    1247             :    public:
    1248             :     explicit DeferredClearCaptures(Interval range)
    1249             :         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
    1250        3803 :           range_(range) { }
    1251             :     Interval range() { return range_; }
    1252             :    private:
    1253             :     Interval range_;
    1254             :   };
    1255             : 
    1256             :   class DeferredIncrementRegister : public DeferredAction {
    1257             :    public:
    1258             :     explicit DeferredIncrementRegister(int reg)
    1259             :         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
    1260             :   };
    1261             : 
    1262      711490 :   Trace()
    1263             :       : cp_offset_(0),
    1264             :         actions_(nullptr),
    1265             :         backtrack_(nullptr),
    1266             :         stop_node_(nullptr),
    1267             :         loop_label_(nullptr),
    1268             :         characters_preloaded_(0),
    1269             :         bound_checked_up_to_(0),
    1270             :         flush_budget_(100),
    1271     1422980 :         at_start_(UNKNOWN) {}
    1272             : 
    1273             :   // End the trace.  This involves flushing the deferred actions in the trace
    1274             :   // and pushing a backtrack location onto the backtrack stack.  Once this is
    1275             :   // done we can start a new trace or go to one that has already been
    1276             :   // generated.
    1277             :   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
    1278             :   int cp_offset() { return cp_offset_; }
    1279             :   DeferredAction* actions() { return actions_; }
    1280             :   // A trivial trace is one that has no deferred actions or other state that
    1281             :   // affects the assumptions used when generating code.  There is no recorded
    1282             :   // backtrack location in a trivial trace, so with a trivial trace we will
    1283             :   // generate code that, on a failure to match, gets the backtrack location
    1284             :   // from the backtrack stack rather than using a direct jump instruction.  We
    1285             :   // always start code generation with a trivial trace and non-trivial traces
    1286             :   // are created as we emit code for nodes or add to the list of deferred
    1287             :   // actions in the trace.  The location of the code generated for a node using
    1288             :   // a trivial trace is recorded in a label in the node so that gotos can be
    1289             :   // generated to that code.
    1290     2416014 :   bool is_trivial() {
    1291     3810184 :     return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
    1292     2010312 :            characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
    1293     4385510 :            quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
    1294             :   }
    1295             :   TriBool at_start() { return at_start_; }
    1296      583270 :   void set_at_start(TriBool at_start) { at_start_ = at_start; }
    1297             :   Label* backtrack() { return backtrack_; }
    1298             :   Label* loop_label() { return loop_label_; }
    1299             :   RegExpNode* stop_node() { return stop_node_; }
    1300             :   int characters_preloaded() { return characters_preloaded_; }
    1301             :   int bound_checked_up_to() { return bound_checked_up_to_; }
    1302             :   int flush_budget() { return flush_budget_; }
    1303             :   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
    1304             :   bool mentions_reg(int reg);
    1305             :   // Returns true if a deferred position store exists to the specified
    1306             :   // register and stores the offset in the out-parameter.  Otherwise
    1307             :   // returns false.
    1308             :   bool GetStoredPosition(int reg, int* cp_offset);
    1309             :   // These set methods and AdvanceCurrentPositionInTrace should be used only on
    1310             :   // new traces - the intention is that traces are immutable after creation.
    1311             :   void add_action(DeferredAction* new_action) {
    1312             :     DCHECK(new_action->next_ == nullptr);
    1313      309577 :     new_action->next_ = actions_;
    1314      309577 :     actions_ = new_action;
    1315             :   }
    1316      854423 :   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
    1317       14809 :   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
    1318       14809 :   void set_loop_label(Label* label) { loop_label_ = label; }
    1319      877736 :   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
    1320      474434 :   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
    1321      216902 :   void set_flush_budget(int to) { flush_budget_ = to; }
    1322             :   void set_quick_check_performed(QuickCheckDetails* d) {
    1323      224858 :     quick_check_performed_ = *d;
    1324             :   }
    1325             :   void InvalidateCurrentCharacter();
    1326             :   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
    1327             : 
    1328             :  private:
    1329             :   int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
    1330             :   void PerformDeferredActions(RegExpMacroAssembler* macro,
    1331             :                               int max_register,
    1332             :                               const OutSet& affected_registers,
    1333             :                               OutSet* registers_to_pop,
    1334             :                               OutSet* registers_to_clear,
    1335             :                               Zone* zone);
    1336             :   void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
    1337             :                                 int max_register,
    1338             :                                 const OutSet& registers_to_pop,
    1339             :                                 const OutSet& registers_to_clear);
    1340             :   int cp_offset_;
    1341             :   DeferredAction* actions_;
    1342             :   Label* backtrack_;
    1343             :   RegExpNode* stop_node_;
    1344             :   Label* loop_label_;
    1345             :   int characters_preloaded_;
    1346             :   int bound_checked_up_to_;
    1347             :   QuickCheckDetails quick_check_performed_;
    1348             :   int flush_budget_;
    1349             :   TriBool at_start_;
    1350             : };
    1351             : 
    1352             : 
    1353             : class GreedyLoopState {
    1354             :  public:
    1355             :   explicit GreedyLoopState(bool not_at_start);
    1356             : 
    1357             :   Label* label() { return &label_; }
    1358             :   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
    1359             : 
    1360             :  private:
    1361             :   Label label_;
    1362             :   Trace counter_backtrack_trace_;
    1363             : };
    1364             : 
    1365             : 
    1366             : struct PreloadState {
    1367             :   static const int kEatsAtLeastNotYetInitialized = -1;
    1368             :   bool preload_is_current_;
    1369             :   bool preload_has_checked_bounds_;
    1370             :   int preload_characters_;
    1371             :   int eats_at_least_;
    1372             :   void init() {
    1373      245782 :     eats_at_least_ = kEatsAtLeastNotYetInitialized;
    1374             :   }
    1375             : };
    1376             : 
    1377             : 
    1378       94032 : class NodeVisitor {
    1379             :  public:
    1380       94032 :   virtual ~NodeVisitor() { }
    1381             : #define DECLARE_VISIT(Type)                                          \
    1382             :   virtual void Visit##Type(Type##Node* that) = 0;
    1383             : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
    1384             : #undef DECLARE_VISIT
    1385           0 :   virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
    1386             : };
    1387             : 
    1388             : 
    1389             : // Node visitor used to add the start set of the alternatives to the
    1390             : // dispatch table of a choice node.
    1391          66 : class DispatchTableConstructor: public NodeVisitor {
    1392             :  public:
    1393             :   DispatchTableConstructor(DispatchTable* table, bool ignore_case,
    1394             :                            Zone* zone)
    1395             :       : table_(table),
    1396             :         choice_index_(-1),
    1397             :         ignore_case_(ignore_case),
    1398          66 :         zone_(zone) { }
    1399             : 
    1400             :   void BuildTable(ChoiceNode* node);
    1401             : 
    1402         696 :   void AddRange(CharacterRange range) {
    1403        1392 :     table()->AddRange(range, choice_index_, zone_);
    1404             :   }
    1405             : 
    1406             :   void AddInverse(ZoneList<CharacterRange>* ranges);
    1407             : 
    1408             : #define DECLARE_VISIT(Type)                                          \
    1409             :   virtual void Visit##Type(Type##Node* that);
    1410             : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
    1411             : #undef DECLARE_VISIT
    1412             : 
    1413             :   DispatchTable* table() { return table_; }
    1414          66 :   void set_choice_index(int value) { choice_index_ = value; }
    1415             : 
    1416             :  protected:
    1417             :   DispatchTable* table_;
    1418             :   int choice_index_;
    1419             :   bool ignore_case_;
    1420             :   Zone* zone_;
    1421             : };
    1422             : 
    1423             : 
    1424             : // Assertion propagation moves information about assertions such as
    1425             : // \b to the affected nodes.  For instance, in /.\b./ information must
    1426             : // be propagated to the first '.' that whatever follows needs to know
    1427             : // if it matched a word or a non-word, and to the second '.' that it
    1428             : // has to check if it succeeds a word or non-word.  In this case the
    1429             : // result will be something like:
    1430             : //
    1431             : //   +-------+        +------------+
    1432             : //   |   .   |        |      .     |
    1433             : //   +-------+  --->  +------------+
    1434             : //   | word? |        | check word |
    1435             : //   +-------+        +------------+
    1436       93966 : class Analysis: public NodeVisitor {
    1437             :  public:
    1438             :   Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
    1439             :       : isolate_(isolate),
    1440             :         flags_(flags),
    1441             :         is_one_byte_(is_one_byte),
    1442       93966 :         error_message_(nullptr) {}
    1443             :   void EnsureAnalyzed(RegExpNode* node);
    1444             : 
    1445             : #define DECLARE_VISIT(Type)                                          \
    1446             :   virtual void Visit##Type(Type##Node* that);
    1447             : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
    1448             : #undef DECLARE_VISIT
    1449             :   virtual void VisitLoopChoice(LoopChoiceNode* that);
    1450             : 
    1451             :   bool has_failed() { return error_message_ != nullptr; }
    1452             :   const char* error_message() {
    1453             :     DCHECK(error_message_ != nullptr);
    1454             :     return error_message_;
    1455             :   }
    1456             :   void fail(const char* error_message) {
    1457         448 :     error_message_ = error_message;
    1458             :   }
    1459             : 
    1460             :   Isolate* isolate() const { return isolate_; }
    1461             : 
    1462             :   bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
    1463             :   bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
    1464             : 
    1465             :  private:
    1466             :   Isolate* isolate_;
    1467             :   JSRegExp::Flags flags_;
    1468             :   bool is_one_byte_;
    1469             :   const char* error_message_;
    1470             : 
    1471             :   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
    1472             : };
    1473             : 
    1474             : 
    1475             : struct RegExpCompileData {
    1476             :   RegExpCompileData()
    1477             :       : tree(nullptr),
    1478             :         node(nullptr),
    1479             :         simple(true),
    1480             :         contains_anchor(false),
    1481      795978 :         capture_count(0) {}
    1482             :   RegExpTree* tree;
    1483             :   RegExpNode* node;
    1484             :   bool simple;
    1485             :   bool contains_anchor;
    1486             :   Handle<FixedArray> capture_name_map;
    1487             :   Handle<String> error;
    1488             :   int capture_count;
    1489             : };
    1490             : 
    1491             : 
    1492             : class RegExpEngine: public AllStatic {
    1493             :  public:
    1494             :   struct CompilationResult {
    1495             :     CompilationResult(Isolate* isolate, const char* error_message)
    1496             :         : error_message(error_message),
    1497         458 :           code(isolate->heap()->the_hole_value()),
    1498         916 :           num_registers(0) {}
    1499             :     CompilationResult(Object* code, int registers)
    1500       93518 :         : error_message(nullptr), code(code), num_registers(registers) {}
    1501             :     const char* error_message;
    1502             :     Object* code;
    1503             :     int num_registers;
    1504             :   };
    1505             : 
    1506             :   static CompilationResult Compile(Isolate* isolate, Zone* zone,
    1507             :                                    RegExpCompileData* input,
    1508             :                                    JSRegExp::Flags flags,
    1509             :                                    Handle<String> pattern,
    1510             :                                    Handle<String> sample_subject,
    1511             :                                    bool is_one_byte);
    1512             : 
    1513             :   static bool TooMuchRegExpCode(Handle<String> pattern);
    1514             : 
    1515             :   static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
    1516             : };
    1517             : 
    1518             : 
    1519             : class RegExpResultsCache : public AllStatic {
    1520             :  public:
    1521             :   enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
    1522             : 
    1523             :   // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
    1524             :   // On success, the returned result is guaranteed to be a COW-array.
    1525             :   static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
    1526             :                         FixedArray** last_match_out, ResultsCacheType type);
    1527             :   // Attempt to add value_array to the cache specified by type.  On success,
    1528             :   // value_array is turned into a COW-array.
    1529             :   static void Enter(Isolate* isolate, Handle<String> key_string,
    1530             :                     Handle<Object> key_pattern, Handle<FixedArray> value_array,
    1531             :                     Handle<FixedArray> last_match_cache, ResultsCacheType type);
    1532             :   static void Clear(FixedArray* cache);
    1533             :   static const int kRegExpResultsCacheSize = 0x100;
    1534             : 
    1535             :  private:
    1536             :   static const int kArrayEntriesPerCacheEntry = 4;
    1537             :   static const int kStringOffset = 0;
    1538             :   static const int kPatternOffset = 1;
    1539             :   static const int kArrayOffset = 2;
    1540             :   static const int kLastMatchOffset = 3;
    1541             : };
    1542             : 
    1543             : }  // namespace internal
    1544             : }  // namespace v8
    1545             : 
    1546             : #endif  // V8_REGEXP_JSREGEXP_H_

Generated by: LCOV version 1.10