|           Line data    Source code 
       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_REGEXP_REGEXP_PARSER_H_
       6             : #define V8_REGEXP_REGEXP_PARSER_H_
       7             : 
       8             : #include "src/objects.h"
       9             : #include "src/regexp/regexp-ast.h"
      10             : #include "src/zone/zone.h"
      11             : 
      12             : namespace v8 {
      13             : namespace internal {
      14             : 
      15             : struct RegExpCompileData;
      16             : 
      17             : 
      18             : // A BufferedZoneList is an automatically growing list, just like (and backed
      19             : // by) a ZoneList, that is optimized for the case of adding and removing
      20             : // a single element. The last element added is stored outside the backing list,
      21             : // and if no more than one element is ever added, the ZoneList isn't even
      22             : // allocated.
      23             : // Elements must not be NULL pointers.
      24             : template <typename T, int initial_size>
      25             : class BufferedZoneList {
      26             :  public:
      27     2143904 :   BufferedZoneList() : list_(NULL), last_(NULL) {}
      28             : 
      29             :   // Adds element at end of list. This element is buffered and can
      30             :   // be read using last() or removed using RemoveLast until a new Add or until
      31             :   // RemoveLast or GetList has been called.
      32     7714029 :   void Add(T* value, Zone* zone) {
      33     7714029 :     if (last_ != NULL) {
      34     3046784 :       if (list_ == NULL) {
      35      131400 :         list_ = new (zone) ZoneList<T*>(initial_size, zone);
      36             :       }
      37     3046784 :       list_->Add(last_, zone);
      38             :     }
      39     7714029 :     last_ = value;
      40     7714029 :   }
      41             : 
      42             :   T* last() {
      43             :     DCHECK(last_ != NULL);
      44             :     return last_;
      45             :   }
      46             : 
      47       60734 :   T* RemoveLast() {
      48             :     DCHECK(last_ != NULL);
      49       60734 :     T* result = last_;
      50       60734 :     if ((list_ != NULL) && (list_->length() > 0))
      51       75848 :       last_ = list_->RemoveLast();
      52             :     else
      53       22810 :       last_ = NULL;
      54       60734 :     return result;
      55             :   }
      56             : 
      57             :   T* Get(int i) {
      58             :     DCHECK((0 <= i) && (i < length()));
      59       20323 :     if (list_ == NULL) {
      60             :       DCHECK_EQ(0, i);
      61           0 :       return last_;
      62             :     } else {
      63       20323 :       if (i == list_->length()) {
      64             :         DCHECK(last_ != NULL);
      65        5990 :         return last_;
      66             :       } else {
      67       14333 :         return list_->at(i);
      68             :       }
      69             :     }
      70             :   }
      71             : 
      72             :   void Clear() {
      73     2965603 :     list_ = NULL;
      74     2965603 :     last_ = NULL;
      75             :   }
      76             : 
      77             :   int length() {
      78    10834474 :     int length = (list_ == NULL) ? 0 : list_->length();
      79    10834474 :     return length + ((last_ == NULL) ? 0 : 1);
      80             :   }
      81             : 
      82      106909 :   ZoneList<T*>* GetList(Zone* zone) {
      83      106909 :     if (list_ == NULL) {
      84           0 :       list_ = new (zone) ZoneList<T*>(initial_size, zone);
      85             :     }
      86      106909 :     if (last_ != NULL) {
      87      106909 :       list_->Add(last_, zone);
      88      106909 :       last_ = NULL;
      89             :     }
      90      106909 :     return list_;
      91             :   }
      92             : 
      93             :  private:
      94             :   ZoneList<T*>* list_;
      95             :   T* last_;
      96             : };
      97             : 
      98             : 
      99             : // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
     100             : class RegExpBuilder : public ZoneObject {
     101             :  public:
     102             :   RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
     103             :   void AddCharacter(uc16 character);
     104             :   void AddUnicodeCharacter(uc32 character);
     105             :   void AddEscapedUnicodeCharacter(uc32 character);
     106             :   // "Adds" an empty expression. Does nothing except consume a
     107             :   // following quantifier
     108             :   void AddEmpty();
     109             :   void AddCharacterClass(RegExpCharacterClass* cc);
     110             :   void AddCharacterClassForDesugaring(uc32 c);
     111             :   void AddAtom(RegExpTree* tree);
     112             :   void AddTerm(RegExpTree* tree);
     113             :   void AddAssertion(RegExpTree* tree);
     114             :   void NewAlternative();  // '|'
     115             :   bool AddQuantifierToAtom(int min, int max,
     116             :                            RegExpQuantifier::QuantifierType type);
     117             :   RegExpTree* ToRegExp();
     118             : 
     119             :  private:
     120             :   static const uc16 kNoPendingSurrogate = 0;
     121             :   void AddLeadSurrogate(uc16 lead_surrogate);
     122             :   void AddTrailSurrogate(uc16 trail_surrogate);
     123             :   void FlushPendingSurrogate();
     124             :   void FlushCharacters();
     125             :   void FlushText();
     126             :   void FlushTerms();
     127             :   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
     128             :   bool NeedsDesugaringForIgnoreCase(uc32 c);
     129             :   Zone* zone() const { return zone_; }
     130             :   bool ignore_case() const { return ignore_case_; }
     131             :   bool unicode() const { return unicode_; }
     132             : 
     133             :   Zone* zone_;
     134             :   bool pending_empty_;
     135             :   bool ignore_case_;
     136             :   bool unicode_;
     137             :   ZoneList<uc16>* characters_;
     138             :   uc16 pending_surrogate_;
     139             :   BufferedZoneList<RegExpTree, 2> terms_;
     140             :   BufferedZoneList<RegExpTree, 2> text_;
     141             :   BufferedZoneList<RegExpTree, 2> alternatives_;
     142             : #ifdef DEBUG
     143             :   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
     144             : #define LAST(x) last_added_ = x;
     145             : #else
     146             : #define LAST(x)
     147             : #endif
     148             : };
     149             : 
     150             : 
     151             : class RegExpParser BASE_EMBEDDED {
     152             :  public:
     153             :   RegExpParser(FlatStringReader* in, Handle<String>* error,
     154             :                JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
     155             : 
     156             :   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
     157             :                           JSRegExp::Flags flags, RegExpCompileData* result);
     158             : 
     159             :   RegExpTree* ParsePattern();
     160             :   RegExpTree* ParseDisjunction();
     161             :   RegExpTree* ParseGroup();
     162             :   RegExpTree* ParseCharacterClass();
     163             : 
     164             :   // Parses a {...,...} quantifier and stores the range in the given
     165             :   // out parameters.
     166             :   bool ParseIntervalQuantifier(int* min_out, int* max_out);
     167             : 
     168             :   // Parses and returns a single escaped character.  The character
     169             :   // must not be 'b' or 'B' since they are usually handle specially.
     170             :   uc32 ParseClassCharacterEscape();
     171             : 
     172             :   // Checks whether the following is a length-digit hexadecimal number,
     173             :   // and sets the value if it is.
     174             :   bool ParseHexEscape(int length, uc32* value);
     175             :   bool ParseUnicodeEscape(uc32* value);
     176             :   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
     177             :   bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
     178             : 
     179             :   uc32 ParseOctalLiteral();
     180             : 
     181             :   // Tries to parse the input as a back reference.  If successful it
     182             :   // stores the result in the output parameter and returns true.  If
     183             :   // it fails it will push back the characters read so the same characters
     184             :   // can be reparsed.
     185             :   bool ParseBackReferenceIndex(int* index_out);
     186             : 
     187             :   // The default behavior is to combine surrogate pairs in unicode mode and
     188             :   // don't combine them otherwise (a quantifier after a surrogate pair would
     189             :   // then apply only to the trailing surrogate). Forcing combination is required
     190             :   // when parsing capture names since they can always legally contain surrogate
     191             :   // pairs.
     192             :   enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };
     193             : 
     194             :   bool ParseClassProperty(ZoneList<CharacterRange>* result);
     195             :   CharacterRange ParseClassAtom(uc16* char_class);
     196             :   RegExpTree* ReportError(Vector<const char> message);
     197             :   void Advance(ScanMode mode = ScanMode::DEFAULT);
     198             :   void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
     199             :   void Reset(int pos);
     200             : 
     201             :   // Reports whether the pattern might be used as a literal search string.
     202             :   // Only use if the result of the parse is a single atom node.
     203             :   bool simple();
     204             :   bool contains_anchor() { return contains_anchor_; }
     205       14806 :   void set_contains_anchor() { contains_anchor_ = true; }
     206             :   int captures_started() { return captures_started_; }
     207       41537 :   int position() { return next_pos_ - 1; }
     208             :   bool failed() { return failed_; }
     209             :   bool dotall() const { return dotall_; }
     210             :   bool ignore_case() const { return ignore_case_; }
     211             :   bool multiline() const { return multiline_; }
     212             :   bool unicode() const { return unicode_; }
     213             : 
     214             :   static bool IsSyntaxCharacterOrSlash(uc32 c);
     215             : 
     216             :   static const int kMaxCaptures = 1 << 16;
     217             :   static const uc32 kEndMarker = (1 << 21);
     218             : 
     219             :  private:
     220             :   enum SubexpressionType {
     221             :     INITIAL,
     222             :     CAPTURE,  // All positive values represent captures.
     223             :     POSITIVE_LOOKAROUND,
     224             :     NEGATIVE_LOOKAROUND,
     225             :     GROUPING
     226             :   };
     227             : 
     228             :   class RegExpParserState : public ZoneObject {
     229             :    public:
     230     2143904 :     RegExpParserState(RegExpParserState* previous_state,
     231             :                       SubexpressionType group_type,
     232             :                       RegExpLookaround::Type lookaround_type,
     233             :                       int disjunction_capture_index,
     234             :                       const ZoneVector<uc16>* capture_name, bool ignore_case,
     235             :                       bool unicode, Zone* zone)
     236             :         : previous_state_(previous_state),
     237             :           builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
     238             :           group_type_(group_type),
     239             :           lookaround_type_(lookaround_type),
     240             :           disjunction_capture_index_(disjunction_capture_index),
     241     4287808 :           capture_name_(capture_name) {}
     242             :     // Parser state of containing expression, if any.
     243             :     RegExpParserState* previous_state() { return previous_state_; }
     244             :     bool IsSubexpression() { return previous_state_ != NULL; }
     245             :     // RegExpBuilder building this regexp's AST.
     246             :     RegExpBuilder* builder() { return builder_; }
     247             :     // Type of regexp being parsed (parenthesized group or entire regexp).
     248             :     SubexpressionType group_type() { return group_type_; }
     249             :     // Lookahead or Lookbehind.
     250             :     RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
     251             :     // Index in captures array of first capture in this sub-expression, if any.
     252             :     // Also the capture index of this sub-expression itself, if group_type
     253             :     // is CAPTURE.
     254             :     int capture_index() { return disjunction_capture_index_; }
     255             :     // The name of the current sub-expression, if group_type is CAPTURE. Only
     256             :     // used for named captures.
     257             :     const ZoneVector<uc16>* capture_name() { return capture_name_; }
     258             : 
     259             :     bool IsNamedCapture() const { return capture_name_ != nullptr; }
     260             : 
     261             :     // Check whether the parser is inside a capture group with the given index.
     262             :     bool IsInsideCaptureGroup(int index);
     263             :     // Check whether the parser is inside a capture group with the given name.
     264             :     bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
     265             : 
     266             :    private:
     267             :     // Linked list implementation of stack of states.
     268             :     RegExpParserState* previous_state_;
     269             :     // Builder for the stored disjunction.
     270             :     RegExpBuilder* builder_;
     271             :     // Stored disjunction type (capture, look-ahead or grouping), if any.
     272             :     SubexpressionType group_type_;
     273             :     // Stored read direction.
     274             :     RegExpLookaround::Type lookaround_type_;
     275             :     // Stored disjunction's capture index (if any).
     276             :     int disjunction_capture_index_;
     277             :     // Stored capture name (if any).
     278             :     const ZoneVector<uc16>* capture_name_;
     279             :   };
     280             : 
     281             :   // Return the 1-indexed RegExpCapture object, allocate if necessary.
     282             :   RegExpCapture* GetCapture(int index);
     283             : 
     284             :   // Creates a new named capture at the specified index. Must be called exactly
     285             :   // once for each named capture. Fails if a capture with the same name is
     286             :   // encountered.
     287             :   bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
     288             : 
     289             :   // Parses the name of a capture group (?<name>pattern). The name must adhere
     290             :   // to IdentifierName in the ECMAScript standard.
     291             :   const ZoneVector<uc16>* ParseCaptureGroupName();
     292             : 
     293             :   bool ParseNamedBackReference(RegExpBuilder* builder,
     294             :                                RegExpParserState* state);
     295             : 
     296             :   // After the initial parsing pass, patch corresponding RegExpCapture objects
     297             :   // into all RegExpBackReferences. This is done after initial parsing in order
     298             :   // to avoid complicating cases in which references comes before the capture.
     299             :   void PatchNamedBackReferences();
     300             : 
     301             :   Handle<FixedArray> CreateCaptureNameMap();
     302             : 
     303             :   // Returns true iff the pattern contains named captures. May call
     304             :   // ScanForCaptures to look ahead at the remaining pattern.
     305             :   bool HasNamedCaptures();
     306             : 
     307             :   Isolate* isolate() { return isolate_; }
     308             :   Zone* zone() const { return zone_; }
     309             : 
     310             :   uc32 current() { return current_; }
     311             :   bool has_more() { return has_more_; }
     312    15924220 :   bool has_next() { return next_pos_ < in()->length(); }
     313             :   uc32 Next();
     314             :   uc32 ReadNext(bool update_position, ScanMode mode);
     315             :   FlatStringReader* in() { return in_; }
     316             :   void ScanForCaptures();
     317             : 
     318             :   Isolate* isolate_;
     319             :   Zone* zone_;
     320             :   Handle<String>* error_;
     321             :   ZoneList<RegExpCapture*>* captures_;
     322             :   ZoneList<RegExpCapture*>* named_captures_;
     323             :   ZoneList<RegExpBackReference*>* named_back_references_;
     324             :   FlatStringReader* in_;
     325             :   uc32 current_;
     326             :   bool dotall_;
     327             :   bool ignore_case_;
     328             :   bool multiline_;
     329             :   bool unicode_;
     330             :   int next_pos_;
     331             :   int captures_started_;
     332             :   int capture_count_;  // Only valid after we have scanned for captures.
     333             :   bool has_more_;
     334             :   bool simple_;
     335             :   bool contains_anchor_;
     336             :   bool is_scanned_for_captures_;
     337             :   bool has_named_captures_;  // Only valid after we have scanned for captures.
     338             :   bool failed_;
     339             : };
     340             : 
     341             : }  // namespace internal
     342             : }  // namespace v8
     343             : 
     344             : #endif  // V8_REGEXP_REGEXP_PARSER_H_
 |