LCOV - code coverage report
Current view: top level - src/regexp - regexp-parser.h (source / functions) Hit Total Coverage
Test: app.info Lines: 34 36 94.4 %
Date: 2019-04-17 Functions: 4 4 100.0 %

          Line data    Source code
       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #ifndef V8_REGEXP_REGEXP_PARSER_H_
       6             : #define V8_REGEXP_REGEXP_PARSER_H_
       7             : 
       8             : #include "src/objects.h"
       9             : #include "src/objects/js-regexp.h"
      10             : #include "src/regexp/regexp-ast.h"
      11             : #include "src/zone/zone.h"
      12             : 
      13             : namespace v8 {
      14             : namespace internal {
      15             : 
      16             : struct RegExpCompileData;
      17             : 
      18             : // A BufferedZoneList is an automatically growing list, just like (and backed
      19             : // by) a ZoneList, that is optimized for the case of adding and removing
      20             : // a single element. The last element added is stored outside the backing list,
      21             : // and if no more than one element is ever added, the ZoneList isn't even
      22             : // allocated.
      23             : // Elements must not be nullptr pointers.
      24             : template <typename T, int initial_size>
      25             : class BufferedZoneList {
      26             :  public:
      27     1486350 :   BufferedZoneList() : list_(nullptr), last_(nullptr) {}
      28             : 
      29             :   // Adds element at end of list. This element is buffered and can
      30             :   // be read using last() or removed using RemoveLast until a new Add or until
      31             :   // RemoveLast or GetList has been called.
      32     5439958 :   void Add(T* value, Zone* zone) {
      33     5439958 :     if (last_ != nullptr) {
      34     2072408 :       if (list_ == nullptr) {
      35      109006 :         list_ = new (zone) ZoneList<T*>(initial_size, zone);
      36             :       }
      37     2072408 :       list_->Add(last_, zone);
      38             :     }
      39     5439958 :     last_ = value;
      40     5439958 :   }
      41             : 
      42             :   T* last() {
      43             :     DCHECK(last_ != nullptr);
      44             :     return last_;
      45             :   }
      46             : 
      47       23771 :   T* RemoveLast() {
      48             :     DCHECK(last_ != nullptr);
      49       23771 :     T* result = last_;
      50       38106 :     if ((list_ != nullptr) && (list_->length() > 0))
      51       14335 :       last_ = list_->RemoveLast();
      52             :     else
      53        9436 :       last_ = nullptr;
      54       23771 :     return result;
      55             :   }
      56             : 
      57             :   T* Get(int i) {
      58             :     DCHECK((0 <= i) && (i < length()));
      59      107006 :     if (list_ == nullptr) {
      60             :       DCHECK_EQ(0, i);
      61           0 :       return last_;
      62             :     } else {
      63      107006 :       if (i == list_->length()) {
      64             :         DCHECK(last_ != nullptr);
      65       37536 :         return last_;
      66             :       } else {
      67       69470 :         return list_->at(i);
      68             :       }
      69             :     }
      70             :   }
      71             : 
      72             :   void Clear() {
      73     2204426 :     list_ = nullptr;
      74     2204426 :     last_ = nullptr;
      75             :   }
      76             : 
      77             :   int length() {
      78     7368369 :     int length = (list_ == nullptr) ? 0 : list_->length();
      79     7368369 :     return length + ((last_ == nullptr) ? 0 : 1);
      80             :   }
      81             : 
      82       62326 :   ZoneList<T*>* GetList(Zone* zone) {
      83       62326 :     if (list_ == nullptr) {
      84           0 :       list_ = new (zone) ZoneList<T*>(initial_size, zone);
      85             :     }
      86       62326 :     if (last_ != nullptr) {
      87       62326 :       list_->Add(last_, zone);
      88       62326 :       last_ = nullptr;
      89             :     }
      90       62326 :     return list_;
      91             :   }
      92             : 
      93             :  private:
      94             :   ZoneList<T*>* list_;
      95             :   T* last_;
      96             : };
      97             : 
      98             : 
      99             : // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
     100             : class RegExpBuilder : public ZoneObject {
     101             :  public:
     102             :   RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
     103             :   void AddCharacter(uc16 character);
     104             :   void AddUnicodeCharacter(uc32 character);
     105             :   void AddEscapedUnicodeCharacter(uc32 character);
     106             :   // "Adds" an empty expression. Does nothing except consume a
     107             :   // following quantifier
     108             :   void AddEmpty();
     109             :   void AddCharacterClass(RegExpCharacterClass* cc);
     110             :   void AddCharacterClassForDesugaring(uc32 c);
     111             :   void AddAtom(RegExpTree* tree);
     112             :   void AddTerm(RegExpTree* tree);
     113             :   void AddAssertion(RegExpTree* tree);
     114             :   void NewAlternative();  // '|'
     115             :   bool AddQuantifierToAtom(int min, int max,
     116             :                            RegExpQuantifier::QuantifierType type);
     117             :   void FlushText();
     118             :   RegExpTree* ToRegExp();
     119             :   JSRegExp::Flags flags() const { return flags_; }
     120        1075 :   void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
     121             : 
     122             :   bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
     123             :   bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
     124             :   bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
     125             : 
     126             :  private:
     127             :   static const uc16 kNoPendingSurrogate = 0;
     128             :   void AddLeadSurrogate(uc16 lead_surrogate);
     129             :   void AddTrailSurrogate(uc16 trail_surrogate);
     130             :   void FlushPendingSurrogate();
     131             :   void FlushCharacters();
     132             :   void FlushTerms();
     133             :   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
     134             :   bool NeedsDesugaringForIgnoreCase(uc32 c);
     135             :   Zone* zone() const { return zone_; }
     136             :   bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
     137             : 
     138             :   Zone* zone_;
     139             :   bool pending_empty_;
     140             :   JSRegExp::Flags flags_;
     141             :   ZoneList<uc16>* characters_;
     142             :   uc16 pending_surrogate_;
     143             :   BufferedZoneList<RegExpTree, 2> terms_;
     144             :   BufferedZoneList<RegExpTree, 2> text_;
     145             :   BufferedZoneList<RegExpTree, 2> alternatives_;
     146             : #ifdef DEBUG
     147             :   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
     148             : #define LAST(x) last_added_ = x;
     149             : #else
     150             : #define LAST(x)
     151             : #endif
     152             : };
     153             : 
     154             : class V8_EXPORT_PRIVATE RegExpParser {
     155             :  public:
     156             :   RegExpParser(FlatStringReader* in, Handle<String>* error,
     157             :                JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
     158             : 
     159             :   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
     160             :                           JSRegExp::Flags flags, RegExpCompileData* result);
     161             : 
     162             :   RegExpTree* ParsePattern();
     163             :   RegExpTree* ParseDisjunction();
     164             :   RegExpTree* ParseGroup();
     165             : 
     166             :   // Parses a {...,...} quantifier and stores the range in the given
     167             :   // out parameters.
     168             :   bool ParseIntervalQuantifier(int* min_out, int* max_out);
     169             : 
     170             :   // Parses and returns a single escaped character.  The character
     171             :   // must not be 'b' or 'B' since they are usually handle specially.
     172             :   uc32 ParseClassCharacterEscape();
     173             : 
     174             :   // Checks whether the following is a length-digit hexadecimal number,
     175             :   // and sets the value if it is.
     176             :   bool ParseHexEscape(int length, uc32* value);
     177             :   bool ParseUnicodeEscape(uc32* value);
     178             :   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
     179             : 
     180             :   bool ParsePropertyClassName(std::vector<char>* name_1,
     181             :                               std::vector<char>* name_2);
     182             :   bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
     183             :                              const std::vector<char>& name_1,
     184             :                              const std::vector<char>& name_2);
     185             : 
     186             :   RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
     187             :   RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
     188             : 
     189             :   uc32 ParseOctalLiteral();
     190             : 
     191             :   // Tries to parse the input as a back reference.  If successful it
     192             :   // stores the result in the output parameter and returns true.  If
     193             :   // it fails it will push back the characters read so the same characters
     194             :   // can be reparsed.
     195             :   bool ParseBackReferenceIndex(int* index_out);
     196             : 
     197             :   // Parse inside a class. Either add escaped class to the range, or return
     198             :   // false and pass parsed single character through |char_out|.
     199             :   void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
     200             :                         bool add_unicode_case_equivalents, uc32* char_out,
     201             :                         bool* is_class_escape);
     202             : 
     203             :   char ParseClassEscape();
     204             : 
     205             :   RegExpTree* ReportError(Vector<const char> message);
     206             :   void Advance();
     207             :   void Advance(int dist);
     208             :   void Reset(int pos);
     209             : 
     210             :   // Reports whether the pattern might be used as a literal search string.
     211             :   // Only use if the result of the parse is a single atom node.
     212             :   bool simple();
     213             :   bool contains_anchor() { return contains_anchor_; }
     214        6184 :   void set_contains_anchor() { contains_anchor_ = true; }
     215             :   int captures_started() { return captures_started_; }
     216       96289 :   int position() { return next_pos_ - 1; }
     217             :   bool failed() { return failed_; }
     218             :   // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
     219             :   // just read the initial flag value here.
     220             :   bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
     221             : 
     222             :   static bool IsSyntaxCharacterOrSlash(uc32 c);
     223             : 
     224             :   static const int kMaxCaptures = 1 << 16;
     225             :   static const uc32 kEndMarker = (1 << 21);
     226             : 
     227             :  private:
     228             :   enum SubexpressionType {
     229             :     INITIAL,
     230             :     CAPTURE,  // All positive values represent captures.
     231             :     POSITIVE_LOOKAROUND,
     232             :     NEGATIVE_LOOKAROUND,
     233             :     GROUPING
     234             :   };
     235             : 
     236             :   class RegExpParserState : public ZoneObject {
     237             :    public:
     238             :     // Push a state on the stack.
     239     1486200 :     RegExpParserState(RegExpParserState* previous_state,
     240             :                       SubexpressionType group_type,
     241             :                       RegExpLookaround::Type lookaround_type,
     242             :                       int disjunction_capture_index,
     243             :                       const ZoneVector<uc16>* capture_name,
     244             :                       JSRegExp::Flags flags, Zone* zone)
     245             :         : previous_state_(previous_state),
     246             :           builder_(new (zone) RegExpBuilder(zone, flags)),
     247             :           group_type_(group_type),
     248             :           lookaround_type_(lookaround_type),
     249             :           disjunction_capture_index_(disjunction_capture_index),
     250     2972400 :           capture_name_(capture_name) {}
     251             :     // Parser state of containing expression, if any.
     252             :     RegExpParserState* previous_state() const { return previous_state_; }
     253             :     bool IsSubexpression() { return previous_state_ != nullptr; }
     254             :     // RegExpBuilder building this regexp's AST.
     255             :     RegExpBuilder* builder() const { return builder_; }
     256             :     // Type of regexp being parsed (parenthesized group or entire regexp).
     257             :     SubexpressionType group_type() const { return group_type_; }
     258             :     // Lookahead or Lookbehind.
     259             :     RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
     260             :     // Index in captures array of first capture in this sub-expression, if any.
     261             :     // Also the capture index of this sub-expression itself, if group_type
     262             :     // is CAPTURE.
     263             :     int capture_index() const { return disjunction_capture_index_; }
     264             :     // The name of the current sub-expression, if group_type is CAPTURE. Only
     265             :     // used for named captures.
     266             :     const ZoneVector<uc16>* capture_name() const { return capture_name_; }
     267             : 
     268             :     bool IsNamedCapture() const { return capture_name_ != nullptr; }
     269             : 
     270             :     // Check whether the parser is inside a capture group with the given index.
     271             :     bool IsInsideCaptureGroup(int index);
     272             :     // Check whether the parser is inside a capture group with the given name.
     273             :     bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
     274             : 
     275             :    private:
     276             :     // Linked list implementation of stack of states.
     277             :     RegExpParserState* const previous_state_;
     278             :     // Builder for the stored disjunction.
     279             :     RegExpBuilder* const builder_;
     280             :     // Stored disjunction type (capture, look-ahead or grouping), if any.
     281             :     const SubexpressionType group_type_;
     282             :     // Stored read direction.
     283             :     const RegExpLookaround::Type lookaround_type_;
     284             :     // Stored disjunction's capture index (if any).
     285             :     const int disjunction_capture_index_;
     286             :     // Stored capture name (if any).
     287             :     const ZoneVector<uc16>* const capture_name_;
     288             :   };
     289             : 
     290             :   // Return the 1-indexed RegExpCapture object, allocate if necessary.
     291             :   RegExpCapture* GetCapture(int index);
     292             : 
     293             :   // Creates a new named capture at the specified index. Must be called exactly
     294             :   // once for each named capture. Fails if a capture with the same name is
     295             :   // encountered.
     296             :   bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
     297             : 
     298             :   // Parses the name of a capture group (?<name>pattern). The name must adhere
     299             :   // to IdentifierName in the ECMAScript standard.
     300             :   const ZoneVector<uc16>* ParseCaptureGroupName();
     301             : 
     302             :   bool ParseNamedBackReference(RegExpBuilder* builder,
     303             :                                RegExpParserState* state);
     304             :   RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
     305             : 
     306             :   // After the initial parsing pass, patch corresponding RegExpCapture objects
     307             :   // into all RegExpBackReferences. This is done after initial parsing in order
     308             :   // to avoid complicating cases in which references comes before the capture.
     309             :   void PatchNamedBackReferences();
     310             : 
     311             :   Handle<FixedArray> CreateCaptureNameMap();
     312             : 
     313             :   // Returns true iff the pattern contains named captures. May call
     314             :   // ScanForCaptures to look ahead at the remaining pattern.
     315             :   bool HasNamedCaptures();
     316             : 
     317             :   Isolate* isolate() { return isolate_; }
     318             :   Zone* zone() const { return zone_; }
     319             : 
     320             :   uc32 current() { return current_; }
     321             :   bool has_more() { return has_more_; }
     322   389298900 :   bool has_next() { return next_pos_ < in()->length(); }
     323             :   uc32 Next();
     324             :   template <bool update_position>
     325             :   uc32 ReadNext();
     326             :   FlatStringReader* in() { return in_; }
     327             :   void ScanForCaptures();
     328             : 
     329             :   Isolate* isolate_;
     330             :   Zone* zone_;
     331             :   Handle<String>* error_;
     332             :   ZoneList<RegExpCapture*>* captures_;
     333             :   ZoneList<RegExpCapture*>* named_captures_;
     334             :   ZoneList<RegExpBackReference*>* named_back_references_;
     335             :   FlatStringReader* in_;
     336             :   uc32 current_;
     337             :   // These are the flags specified outside the regexp syntax ie after the
     338             :   // terminating '/' or in the second argument to the constructor.  The current
     339             :   // flags are stored on the RegExpBuilder.
     340             :   JSRegExp::Flags top_level_flags_;
     341             :   int next_pos_;
     342             :   int captures_started_;
     343             :   int capture_count_;  // Only valid after we have scanned for captures.
     344             :   bool has_more_;
     345             :   bool simple_;
     346             :   bool contains_anchor_;
     347             :   bool is_scanned_for_captures_;
     348             :   bool has_named_captures_;  // Only valid after we have scanned for captures.
     349             :   bool failed_;
     350             : };
     351             : 
     352             : }  // namespace internal
     353             : }  // namespace v8
     354             : 
     355             : #endif  // V8_REGEXP_REGEXP_PARSER_H_

Generated by: LCOV version 1.10