LCOV - app.info - src/regexp/regexp-parser.cc

LCOV - code coverage report

Current view:	top level - src/regexp - regexp-parser.cc (source / functions)		Hit	Total	Coverage
Test:	app.info	Lines:	785	819	95.8 %
Date:	2019-04-17	Functions:	50	64	78.1 %

          Line data    Source code

       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-parser.h"
       6             : 
       7             : #include <vector>
       8             : 
       9             : #include "src/char-predicates-inl.h"
      10             : #include "src/heap/factory.h"
      11             : #include "src/isolate.h"
      12             : #include "src/objects-inl.h"
      13             : #include "src/ostreams.h"
      14             : #include "src/regexp/jsregexp.h"
      15             : #include "src/regexp/property-sequences.h"
      16             : #include "src/utils.h"
      17             : #include "src/zone/zone-list-inl.h"
      18             : 
      19             : #ifdef V8_INTL_SUPPORT
      20             : #include "unicode/uniset.h"
      21             : #endif  // V8_INTL_SUPPORT
      22             : 
      23             : namespace v8 {
      24             : namespace internal {
      25             : 
      26           0 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
      27             :                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
      28             :     : isolate_(isolate),
      29             :       zone_(zone),
      30             :       error_(error),
      31             :       captures_(nullptr),
      32             :       named_captures_(nullptr),
      33             :       named_back_references_(nullptr),
      34             :       in_(in),
      35             :       current_(kEndMarker),
      36             :       top_level_flags_(flags),
      37             :       next_pos_(0),
      38             :       captures_started_(0),
      39             :       capture_count_(0),
      40             :       has_more_(true),
      41             :       simple_(false),
      42             :       contains_anchor_(false),
      43             :       is_scanned_for_captures_(false),
      44             :       has_named_captures_(false),
      45      380302 :       failed_(false) {
      46      380302 :   Advance();
      47           0 : }
      48             : 
      49             : template <bool update_position>
      50   388900300 : inline uc32 RegExpParser::ReadNext() {
      51   388900300 :   int position = next_pos_;
      52             :   uc32 c0 = in()->Get(position);
      53   388900300 :   position++;
      54             :   // Read the whole surrogate pair in case of unicode flag, if possible.
      55   389598593 :   if (unicode() && position < in()->length() &&
      56             :       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
      57             :     uc16 c1 = in()->Get(position);
      58         586 :     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
      59         389 :       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
      60         371 :       position++;
      61             :     }
      62             :   }
      63   388727589 :   if (update_position) next_pos_ = position;
      64   388900300 :   return c0;
      65             : }
      66             : 
      67             : 
      68           0 : uc32 RegExpParser::Next() {
      69      172735 :   if (has_next()) {
      70      172711 :     return ReadNext<false>();
      71             :   } else {
      72             :     return kEndMarker;
      73             :   }
      74             : }
      75             : 
      76   389107580 : void RegExpParser::Advance() {
      77   389107580 :   if (has_next()) {
      78             :     StackLimitCheck check(isolate());
      79   388727976 :     if (check.HasOverflowed()) {
      80         378 :       if (FLAG_abort_on_stack_or_string_length_overflow) {
      81           0 :         FATAL("Aborting on stack overflow");
      82             :       }
      83         378 :       ReportError(CStrVector(
      84         378 :           MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
      85   388727598 :     } else if (zone()->excess_allocation()) {
      86           9 :       ReportError(CStrVector("Regular expression too large"));
      87             :     } else {
      88   388727589 :       current_ = ReadNext<true>();
      89             :     }
      90             :   } else {
      91      379604 :     current_ = kEndMarker;
      92             :     // Advance so that position() points to 1-after-the-last-character. This is
      93             :     // important so that Reset() to this position works correctly.
      94      379604 :     next_pos_ = in()->length() + 1;
      95      379604 :     has_more_ = false;
      96             :   }
      97   389107580 : }
      98             : 
      99             : 
     100           0 : void RegExpParser::Reset(int pos) {
     101        2609 :   next_pos_ = pos;
     102        2609 :   has_more_ = (pos < in()->length());
     103        2609 :   Advance();
     104           0 : }
     105             : 
     106           0 : void RegExpParser::Advance(int dist) {
     107       46407 :   next_pos_ += dist - 1;
     108       46407 :   Advance();
     109           0 : }
     110             : 
     111             : 
     112      206333 : bool RegExpParser::simple() { return simple_; }
     113             : 
     114           0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
     115         177 :   switch (c) {
     116             :     case '^':
     117             :     case '$':
     118             :     case '\\':
     119             :     case '.':
     120             :     case '*':
     121             :     case '+':
     122             :     case '?':
     123             :     case '(':
     124             :     case ')':
     125             :     case '[':
     126             :     case ']':
     127             :     case '{':
     128             :     case '}':
     129             :     case '|':
     130             :     case '/':
     131             :       return true;
     132             :     default:
     133             :       break;
     134             :   }
     135           0 :   return false;
     136             : }
     137             : 
     138             : 
     139        2971 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
     140        2971 :   if (failed_) return nullptr;  // Do not overwrite any existing error.
     141        2966 :   failed_ = true;
     142        2966 :   *error_ = isolate()
     143             :                 ->factory()
     144        5932 :                 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
     145        2966 :                 .ToHandleChecked();
     146             :   // Zip to the end to make sure the no more input is read.
     147        2966 :   current_ = kEndMarker;
     148        2966 :   next_pos_ = in()->length();
     149        2966 :   return nullptr;
     150             : }
     151             : 
     152             : #define CHECK_FAILED /**/);    \
     153             :   if (failed_) return nullptr; \
     154             :   ((void)0
     155             : 
     156             : // Pattern ::
     157             : //   Disjunction
     158      380302 : RegExpTree* RegExpParser::ParsePattern() {
     159      380302 :   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
     160      377436 :   PatchNamedBackReferences(CHECK_FAILED);
     161             :   DCHECK(!has_more());
     162             :   // If the result of parsing is a literal string atom, and it has the
     163             :   // same length as the input, then the atom is identical to the input.
     164      583669 :   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
     165      199368 :     simple_ = true;
     166             :   }
     167             :   return result;
     168             : }
     169             : 
     170             : 
     171             : // Disjunction ::
     172             : //   Alternative
     173             : //   Alternative | Disjunction
     174             : // Alternative ::
     175             : //   [empty]
     176             : //   Term Alternative
     177             : // Term ::
     178             : //   Assertion
     179             : //   Atom
     180             : //   Atom Quantifier
     181      380302 : RegExpTree* RegExpParser::ParseDisjunction() {
     182             :   // Used to store current state while parsing subexpressions.
     183             :   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
     184      380302 :                                   0, nullptr, top_level_flags_, zone());
     185             :   RegExpParserState* state = &initial_state;
     186             :   // Cache the builder in a local variable for quick access.
     187             :   RegExpBuilder* builder = initial_state.builder();
     188             :   while (true) {
     189   386265666 :     switch (current()) {
     190             :       case kEndMarker:
     191      377771 :         if (state->IsSubexpression()) {
     192             :           // Inside a parenthesized group when hitting end of input.
     193      380365 :           return ReportError(CStrVector("Unterminated group"));
     194             :         }
     195             :         DCHECK_EQ(INITIAL, state->group_type());
     196             :         // Parsing completed successfully.
     197      377708 :         return builder->ToRegExp();
     198             :       case ')': {
     199     1105695 :         if (!state->IsSubexpression()) {
     200          14 :           return ReportError(CStrVector("Unmatched ')'"));
     201             :         }
     202             :         DCHECK_NE(INITIAL, state->group_type());
     203             : 
     204     1105681 :         Advance();
     205             :         // End disjunction parsing and convert builder content to new single
     206             :         // regexp atom.
     207     1105681 :         RegExpTree* body = builder->ToRegExp();
     208             : 
     209             :         int end_capture_index = captures_started();
     210             : 
     211             :         int capture_index = state->capture_index();
     212             :         SubexpressionType group_type = state->group_type();
     213             : 
     214             :         // Build result of subexpression.
     215     1105681 :         if (group_type == CAPTURE) {
     216     1082087 :           if (state->IsNamedCapture()) {
     217             :             CreateNamedCaptureAtIndex(state->capture_name(),
     218        1257 :                                       capture_index CHECK_FAILED);
     219             :           }
     220     1082046 :           RegExpCapture* capture = GetCapture(capture_index);
     221             :           capture->set_body(body);
     222             :           body = capture;
     223       23594 :         } else if (group_type == GROUPING) {
     224             :           body = new (zone()) RegExpGroup(body);
     225             :         } else {
     226             :           DCHECK(group_type == POSITIVE_LOOKAROUND ||
     227             :                  group_type == NEGATIVE_LOOKAROUND);
     228        3611 :           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
     229             :           body = new (zone()) RegExpLookaround(
     230             :               body, is_positive, end_capture_index - capture_index,
     231        3611 :               capture_index, state->lookaround_type());
     232             :         }
     233             : 
     234             :         // Restore previous state.
     235             :         state = state->previous_state();
     236             :         builder = state->builder();
     237             : 
     238     1105640 :         builder->AddAtom(body);
     239             :         // For compatibility with JSC and ES3, we allow quantifiers after
     240             :         // lookaheads, and break in all cases.
     241     1105640 :         break;
     242             :       }
     243             :       case '|': {
     244      117326 :         Advance();
     245             :         builder->NewAlternative();
     246   384433421 :         continue;
     247             :       }
     248             :       case '*':
     249             :       case '+':
     250             :       case '?':
     251          95 :         return ReportError(CStrVector("Nothing to repeat"));
     252             :       case '^': {
     253        6447 :         Advance();
     254        6447 :         if (builder->multiline()) {
     255             :           builder->AddAssertion(new (zone()) RegExpAssertion(
     256         263 :               RegExpAssertion::START_OF_LINE, builder->flags()));
     257             :         } else {
     258             :           builder->AddAssertion(new (zone()) RegExpAssertion(
     259        6184 :               RegExpAssertion::START_OF_INPUT, builder->flags()));
     260             :           set_contains_anchor();
     261             :         }
     262             :         continue;
     263             :       }
     264             :       case '$': {
     265        4041 :         Advance();
     266             :         RegExpAssertion::AssertionType assertion_type =
     267             :             builder->multiline() ? RegExpAssertion::END_OF_LINE
     268        4041 :                                  : RegExpAssertion::END_OF_INPUT;
     269             :         builder->AddAssertion(
     270        4041 :             new (zone()) RegExpAssertion(assertion_type, builder->flags()));
     271        4041 :         continue;
     272             :       }
     273             :       case '.': {
     274       19037 :         Advance();
     275             :         ZoneList<CharacterRange>* ranges =
     276             :             new (zone()) ZoneList<CharacterRange>(2, zone());
     277             : 
     278       19037 :         if (builder->dotall()) {
     279             :           // Everything.
     280         210 :           CharacterRange::AddClassEscape('*', ranges, false, zone());
     281             :         } else {
     282             :           // Everything except \x0A, \x0D, \u2028 and \u2029
     283       18827 :           CharacterRange::AddClassEscape('.', ranges, false, zone());
     284             :         }
     285             : 
     286             :         RegExpCharacterClass* cc =
     287       38074 :             new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
     288       19037 :         builder->AddCharacterClass(cc);
     289       19037 :         break;
     290             :       }
     291             :       case '(': {
     292     1107381 :         state = ParseOpenParenthesis(state CHECK_FAILED);
     293             :         builder = state->builder();
     294     1106888 :         continue;
     295             :       }
     296             :       case '[': {
     297      155232 :         RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
     298      154856 :         builder->AddCharacterClass(cc->AsCharacterClass());
     299      154856 :         break;
     300             :       }
     301             :       // Atom ::
     302             :       //   \ AtomEscape
     303             :       case '\\':
     304       31048 :         switch (Next()) {
     305             :           case kEndMarker:
     306           5 :             return ReportError(CStrVector("\\ at end of pattern"));
     307             :           case 'b':
     308             :             Advance(2);
     309             :             builder->AddAssertion(new (zone()) RegExpAssertion(
     310         431 :                 RegExpAssertion::BOUNDARY, builder->flags()));
     311         431 :             continue;
     312             :           case 'B':
     313             :             Advance(2);
     314             :             builder->AddAssertion(new (zone()) RegExpAssertion(
     315         327 :                 RegExpAssertion::NON_BOUNDARY, builder->flags()));
     316         327 :             continue;
     317             :           // AtomEscape ::
     318             :           //   CharacterClassEscape
     319             :           //
     320             :           // CharacterClassEscape :: one of
     321             :           //   d D s S w W
     322             :           case 'd':
     323             :           case 'D':
     324             :           case 's':
     325             :           case 'S':
     326             :           case 'w':
     327             :           case 'W': {
     328             :             uc32 c = Next();
     329             :             Advance(2);
     330             :             ZoneList<CharacterRange>* ranges =
     331             :                 new (zone()) ZoneList<CharacterRange>(2, zone());
     332        6785 :             CharacterRange::AddClassEscape(
     333       14023 :                 c, ranges, unicode() && builder->ignore_case(), zone());
     334             :             RegExpCharacterClass* cc = new (zone())
     335       13570 :                 RegExpCharacterClass(zone(), ranges, builder->flags());
     336        6785 :             builder->AddCharacterClass(cc);
     337        6785 :             break;
     338             :           }
     339             :           case 'p':
     340             :           case 'P': {
     341             :             uc32 p = Next();
     342             :             Advance(2);
     343        4187 :             if (unicode()) {
     344             :               ZoneList<CharacterRange>* ranges =
     345             :                   new (zone()) ZoneList<CharacterRange>(2, zone());
     346             :               std::vector<char> name_1, name_2;
     347        3989 :               if (ParsePropertyClassName(&name_1, &name_2)) {
     348        3863 :                 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
     349             :                   RegExpCharacterClass* cc = new (zone())
     350        6040 :                       RegExpCharacterClass(zone(), ranges, builder->flags());
     351        3020 :                   builder->AddCharacterClass(cc);
     352        3020 :                   break;
     353             :                 }
     354         843 :                 if (p == 'p' && name_2.empty()) {
     355         510 :                   RegExpTree* sequence = GetPropertySequence(name_1);
     356         510 :                   if (sequence != nullptr) {
     357         150 :                     builder->AddAtom(sequence);
     358         150 :                     break;
     359             :                   }
     360             :                 }
     361             :               }
     362         819 :               return ReportError(CStrVector("Invalid property name"));
     363             :             } else {
     364         198 :               builder->AddCharacter(p);
     365             :             }
     366         198 :             break;
     367             :           }
     368             :           case '1':
     369             :           case '2':
     370             :           case '3':
     371             :           case '4':
     372             :           case '5':
     373             :           case '6':
     374             :           case '7':
     375             :           case '8':
     376             :           case '9': {
     377        5078 :             int index = 0;
     378        5155 :             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
     379        5073 :             if (is_backref) {
     380        9190 :               if (state->IsInsideCaptureGroup(index)) {
     381             :                 // The back reference is inside the capture group it refers to.
     382             :                 // Nothing can possibly have been captured yet, so we use empty
     383             :                 // instead. This ensures that, when checking a back reference,
     384             :                 // the capture registers of the referenced capture are either
     385             :                 // both set or both cleared.
     386             :                 builder->AddEmpty();
     387             :               } else {
     388        4502 :                 RegExpCapture* capture = GetCapture(index);
     389             :                 RegExpTree* atom =
     390             :                     new (zone()) RegExpBackReference(capture, builder->flags());
     391        4502 :                 builder->AddAtom(atom);
     392             :               }
     393        4632 :               break;
     394             :             }
     395             :             // With /u, no identity escapes except for syntax characters
     396             :             // are allowed. Otherwise, all identity escapes are allowed.
     397         478 :             if (unicode()) {
     398          72 :               return ReportError(CStrVector("Invalid escape"));
     399             :             }
     400             :             uc32 first_digit = Next();
     401         406 :             if (first_digit == '8' || first_digit == '9') {
     402          37 :               builder->AddCharacter(first_digit);
     403             :               Advance(2);
     404             :               break;
     405             :             }
     406             :             V8_FALLTHROUGH;
     407             :           }
     408             :           case '0': {
     409        3210 :             Advance();
     410        3248 :             if (unicode() && Next() >= '0' && Next() <= '9') {
     411             :               // With /u, decimal escape with leading 0 are not parsed as octal.
     412          19 :               return ReportError(CStrVector("Invalid decimal escape"));
     413             :             }
     414        3191 :             uc32 octal = ParseOctalLiteral();
     415        3191 :             builder->AddCharacter(octal);
     416        3191 :             break;
     417             :           }
     418             :           // ControlEscape :: one of
     419             :           //   f n r t v
     420             :           case 'f':
     421             :             Advance(2);
     422           6 :             builder->AddCharacter('\f');
     423           6 :             break;
     424             :           case 'n':
     425             :             Advance(2);
     426         724 :             builder->AddCharacter('\n');
     427         724 :             break;
     428             :           case 'r':
     429             :             Advance(2);
     430          71 :             builder->AddCharacter('\r');
     431          71 :             break;
     432             :           case 't':
     433             :             Advance(2);
     434           8 :             builder->AddCharacter('\t');
     435           8 :             break;
     436             :           case 'v':
     437             :             Advance(2);
     438          28 :             builder->AddCharacter('\v');
     439          28 :             break;
     440             :           case 'c': {
     441         149 :             Advance();
     442             :             uc32 controlLetter = Next();
     443             :             // Special case if it is an ASCII letter.
     444             :             // Convert lower case letters to uppercase.
     445         149 :             uc32 letter = controlLetter & ~('a' ^ 'A');
     446         149 :             if (letter < 'A' || 'Z' < letter) {
     447             :               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
     448             :               // Read the backslash as a literal character instead of as
     449             :               // starting an escape.
     450             :               // ES#prod-annexB-ExtendedPatternCharacter
     451          89 :               if (unicode()) {
     452             :                 // With /u, invalid escapes are not treated as identity escapes.
     453          19 :                 return ReportError(CStrVector("Invalid unicode escape"));
     454             :               }
     455          70 :               builder->AddCharacter('\\');
     456             :             } else {
     457             :               Advance(2);
     458          60 :               builder->AddCharacter(controlLetter & 0x1F);
     459             :             }
     460             :             break;
     461             :           }
     462             :           case 'x': {
     463             :             Advance(2);
     464             :             uc32 value;
     465         246 :             if (ParseHexEscape(2, &value)) {
     466         147 :               builder->AddCharacter(value);
     467          99 :             } else if (!unicode()) {
     468          63 :               builder->AddCharacter('x');
     469             :             } else {
     470             :               // With /u, invalid escapes are not treated as identity escapes.
     471          36 :               return ReportError(CStrVector("Invalid escape"));
     472             :             }
     473         210 :             break;
     474             :           }
     475             :           case 'u': {
     476             :             Advance(2);
     477             :             uc32 value;
     478        2007 :             if (ParseUnicodeEscape(&value)) {
     479        1819 :               builder->AddEscapedUnicodeCharacter(value);
     480         188 :             } else if (!unicode()) {
     481         116 :               builder->AddCharacter('u');
     482             :             } else {
     483             :               // With /u, invalid escapes are not treated as identity escapes.
     484          72 :               return ReportError(CStrVector("Invalid Unicode escape"));
     485             :             }
     486        1935 :             break;
     487             :           }
     488             :           case 'k':
     489             :             // Either an identity escape or a named back-reference.  The two
     490             :             // interpretations are mutually exclusive: '\k' is interpreted as
     491             :             // an identity escape for non-Unicode patterns without named
     492             :             // capture groups, and as the beginning of a named back-reference
     493             :             // in all other cases.
     494         672 :             if (unicode() || HasNamedCaptures()) {
     495             :               Advance(2);
     496         396 :               ParseNamedBackReference(builder, state CHECK_FAILED);
     497             :               break;
     498             :             }
     499             :             V8_FALLTHROUGH;
     500             :           default:
     501        7759 :             Advance();
     502             :             // With /u, no identity escapes except for syntax characters
     503             :             // are allowed. Otherwise, all identity escapes are allowed.
     504        7796 :             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
     505        7732 :               builder->AddCharacter(current());
     506        7732 :               Advance();
     507             :             } else {
     508          27 :               return ReportError(CStrVector("Invalid escape"));
     509             :             }
     510        7732 :             break;
     511             :         }
     512             :         break;
     513             :       case '{': {
     514             :         int dummy;
     515         509 :         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
     516         444 :         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
     517             :         V8_FALLTHROUGH;
     518             :       }
     519             :       case '}':
     520             :       case ']':
     521         903 :         if (unicode()) {
     522          28 :           return ReportError(CStrVector("Lone quantifier brackets"));
     523             :         }
     524             :         V8_FALLTHROUGH;
     525             :       default:
     526   383341520 :         builder->AddUnicodeCharacter(current());
     527   383341520 :         Advance();
     528   383341520 :         break;
     529             :     }  // end switch(current())
     530             : 
     531             :     int min;
     532             :     int max;
     533   384650178 :     switch (current()) {
     534             :       // QuantifierPrefix ::
     535             :       //   *
     536             :       //   +
     537             :       //   ?
     538             :       //   {
     539             :       case '*':
     540       19616 :         min = 0;
     541       19616 :         max = RegExpTree::kInfinity;
     542       19616 :         Advance();
     543       19616 :         break;
     544             :       case '+':
     545        7967 :         min = 1;
     546        7967 :         max = RegExpTree::kInfinity;
     547        7967 :         Advance();
     548        7967 :         break;
     549             :       case '?':
     550     1421676 :         min = 0;
     551     1421676 :         max = 1;
     552     1421676 :         Advance();
     553     1421676 :         break;
     554             :       case '{':
     555        3266 :         if (ParseIntervalQuantifier(&min, &max)) {
     556        2759 :           if (max < min) {
     557             :             return ReportError(
     558           2 :                 CStrVector("numbers out of order in {} quantifier"));
     559             :           }
     560             :           break;
     561         507 :         } else if (unicode()) {
     562             :           // With /u, incomplete quantifiers are not allowed.
     563         199 :           return ReportError(CStrVector("Incomplete quantifier"));
     564             :         }
     565             :         continue;
     566             :       default:
     567             :         continue;
     568             :     }
     569             :     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
     570     1452016 :     if (current() == '?') {
     571             :       quantifier_type = RegExpQuantifier::NON_GREEDY;
     572        1713 :       Advance();
     573             :     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
     574             :       // FLAG_regexp_possessive_quantifier is a debug-only flag.
     575             :       quantifier_type = RegExpQuantifier::POSSESSIVE;
     576             :       Advance();
     577             :     }
     578     1452016 :     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
     579          73 :       return ReportError(CStrVector("Invalid quantifier"));
     580             :     }
     581             :   }
     582             : }
     583             : 
     584     1107381 : RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
     585             :     RegExpParserState* state) {
     586             :   RegExpLookaround::Type lookaround_type = state->lookaround_type();
     587             :   bool is_named_capture = false;
     588             :   JSRegExp::Flags switch_on = JSRegExp::kNone;
     589             :   JSRegExp::Flags switch_off = JSRegExp::kNone;
     590             :   const ZoneVector<uc16>* capture_name = nullptr;
     591             :   SubexpressionType subexpr_type = CAPTURE;
     592     1107381 :   Advance();
     593     1107381 :   if (current() == '?') {
     594       26384 :     switch (Next()) {
     595             :       case ':':
     596             :         Advance(2);
     597             :         subexpr_type = GROUPING;
     598       19753 :         break;
     599             :       case '=':
     600             :         Advance(2);
     601             :         lookaround_type = RegExpLookaround::LOOKAHEAD;
     602             :         subexpr_type = POSITIVE_LOOKAROUND;
     603        1709 :         break;
     604             :       case '!':
     605             :         Advance(2);
     606             :         lookaround_type = RegExpLookaround::LOOKAHEAD;
     607             :         subexpr_type = NEGATIVE_LOOKAROUND;
     608         457 :         break;
     609             :       case '-':
     610             :       case 'i':
     611             :       case 's':
     612             :       case 'm': {
     613        1373 :         if (!FLAG_regexp_mode_modifiers) {
     614           4 :           ReportError(CStrVector("Invalid group"));
     615           4 :           return nullptr;
     616             :         }
     617        1369 :         Advance();
     618             :         bool flags_sense = true;  // Switching on flags.
     619        3384 :         while (subexpr_type != GROUPING) {
     620        3126 :           switch (current()) {
     621             :             case '-':
     622         344 :               if (!flags_sense) {
     623          18 :                 ReportError(CStrVector("Multiple dashes in flag group"));
     624          18 :                 return nullptr;
     625             :               }
     626             :               flags_sense = false;
     627         326 :               Advance();
     628         326 :               continue;
     629             :             case 's':
     630             :             case 'i':
     631             :             case 'm': {
     632             :               JSRegExp::Flags bit = JSRegExp::kUnicode;
     633        1449 :               if (current() == 'i') bit = JSRegExp::kIgnoreCase;
     634        1449 :               if (current() == 'm') bit = JSRegExp::kMultiline;
     635        1449 :               if (current() == 's') bit = JSRegExp::kDotAll;
     636        1449 :               if (((switch_on | switch_off) & bit) != 0) {
     637          18 :                 ReportError(CStrVector("Repeated flag in flag group"));
     638             :                 return nullptr;
     639             :               }
     640        1431 :               if (flags_sense) {
     641             :                 switch_on |= bit;
     642             :               } else {
     643             :                 switch_off |= bit;
     644             :               }
     645        1431 :               Advance();
     646             :               continue;
     647             :             }
     648             :             case ')': {
     649        1075 :               Advance();
     650             :               state->builder()
     651        1075 :                   ->FlushText();  // Flush pending text using old flags.
     652             :               // These (?i)-style flag switches don't put us in a subexpression
     653             :               // at all, they just modify the flags in the rest of the current
     654             :               // subexpression.
     655             :               JSRegExp::Flags flags =
     656             :                   (state->builder()->flags() | switch_on) & ~switch_off;
     657             :               state->builder()->set_flags(flags);
     658             :               return state;
     659             :             }
     660             :             case ':':
     661         258 :               Advance();
     662             :               subexpr_type = GROUPING;  // Will break us out of the outer loop.
     663         258 :               continue;
     664             :             default:
     665           0 :               ReportError(CStrVector("Invalid flag group"));
     666           0 :               return nullptr;
     667             :           }
     668             :         }
     669             :         break;
     670             :       }
     671             :       case '<':
     672        3073 :         Advance();
     673        3073 :         if (Next() == '=') {
     674             :           Advance(2);
     675             :           lookaround_type = RegExpLookaround::LOOKBEHIND;
     676             :           subexpr_type = POSITIVE_LOOKAROUND;
     677        1218 :           break;
     678        1855 :         } else if (Next() == '!') {
     679             :           Advance(2);
     680             :           lookaround_type = RegExpLookaround::LOOKBEHIND;
     681             :           subexpr_type = NEGATIVE_LOOKAROUND;
     682         245 :           break;
     683             :         }
     684             :         is_named_capture = true;
     685        1610 :         has_named_captures_ = true;
     686        1610 :         Advance();
     687        1610 :         break;
     688             :       default:
     689          19 :         ReportError(CStrVector("Invalid group"));
     690          19 :         return nullptr;
     691             :     }
     692             :   }
     693     1106247 :   if (subexpr_type == CAPTURE) {
     694     1082607 :     if (captures_started_ >= kMaxCaptures) {
     695           5 :       ReportError(CStrVector("Too many captures"));
     696           5 :       return nullptr;
     697             :     }
     698     1082602 :     captures_started_++;
     699             : 
     700     1082602 :     if (is_named_capture) {
     701        1610 :       capture_name = ParseCaptureGroupName(CHECK_FAILED);
     702             :     }
     703             :   }
     704             :   JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
     705             :   // Store current state and begin new disjunction parsing.
     706             :   return new (zone())
     707             :       RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
     708     2211796 :                         capture_name, flags, zone());
     709             : }
     710             : 
     711             : #ifdef DEBUG
     712             : // Currently only used in an DCHECK.
     713             : static bool IsSpecialClassEscape(uc32 c) {
     714             :   switch (c) {
     715             :     case 'd':
     716             :     case 'D':
     717             :     case 's':
     718             :     case 'S':
     719             :     case 'w':
     720             :     case 'W':
     721             :       return true;
     722             :     default:
     723             :       return false;
     724             :   }
     725             : }
     726             : #endif
     727             : 
     728             : 
     729             : // In order to know whether an escape is a backreference or not we have to scan
     730             : // the entire regexp and find the number of capturing parentheses.  However we
     731             : // don't want to scan the regexp twice unless it is necessary.  This mini-parser
     732             : // is called when needed.  It can see the difference between capturing and
     733             : // noncapturing parentheses and can skip character classes and backslash-escaped
     734             : // characters.
     735         754 : void RegExpParser::ScanForCaptures() {
     736             :   DCHECK(!is_scanned_for_captures_);
     737             :   const int saved_position = position();
     738             :   // Start with captures started previous to current position
     739             :   int capture_count = captures_started();
     740             :   // Add count of captures after this position.
     741             :   int n;
     742        5348 :   while ((n = current()) != kEndMarker) {
     743        4594 :     Advance();
     744        4594 :     switch (n) {
     745             :       case '\\':
     746         605 :         Advance();
     747         605 :         break;
     748             :       case '[': {
     749             :         int c;
     750         165 :         while ((c = current()) != kEndMarker) {
     751         156 :           Advance();
     752         156 :           if (c == '\\') {
     753           0 :             Advance();
     754             :           } else {
     755         156 :             if (c == ']') break;
     756             :           }
     757             :         }
     758             :         break;
     759             :       }
     760             :       case '(':
     761         571 :         if (current() == '?') {
     762             :           // At this point we could be in
     763             :           // * a non-capturing group '(:',
     764             :           // * a lookbehind assertion '(?<=' '(?<!'
     765             :           // * or a named capture '(?<'.
     766             :           //
     767             :           // Of these, only named captures are capturing groups.
     768             : 
     769         145 :           Advance();
     770         145 :           if (current() != '<') break;
     771             : 
     772         121 :           Advance();
     773         121 :           if (current() == '=' || current() == '!') break;
     774             : 
     775             :           // Found a possible named capture. It could turn out to be a syntax
     776             :           // error (e.g. an unterminated or invalid name), but that distinction
     777             :           // does not matter for our purposes.
     778          75 :           has_named_captures_ = true;
     779             :         }
     780         501 :         capture_count++;
     781         501 :         break;
     782             :     }
     783             :   }
     784         754 :   capture_count_ = capture_count;
     785         754 :   is_scanned_for_captures_ = true;
     786             :   Reset(saved_position);
     787         754 : }
     788             : 
     789             : 
     790        5078 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
     791             :   DCHECK_EQ('\\', current());
     792             :   DCHECK('1' <= Next() && Next() <= '9');
     793             :   // Try to parse a decimal literal that is no greater than the total number
     794             :   // of left capturing parentheses in the input.
     795             :   int start = position();
     796        5078 :   int value = Next() - '0';
     797             :   Advance(2);
     798         617 :   while (true) {
     799             :     uc32 c = current();
     800        5695 :     if (IsDecimalDigit(c)) {
     801         637 :       value = 10 * value + (c - '0');
     802         637 :       if (value > kMaxCaptures) {
     803             :         Reset(start);
     804          20 :         return false;
     805             :       }
     806         617 :       Advance();
     807             :     } else {
     808             :       break;
     809             :     }
     810             :   }
     811        5058 :   if (value > captures_started()) {
     812         847 :     if (!is_scanned_for_captures_) ScanForCaptures();
     813         847 :     if (value > capture_count_) {
     814             :       Reset(start);
     815         463 :       return false;
     816             :     }
     817             :   }
     818        4595 :   *index_out = value;
     819        4595 :   return true;
     820             : }
     821             : 
     822        2425 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
     823        2425 :   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
     824        4770 :     v->push_back(code_unit);
     825             :   } else {
     826          80 :     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
     827          80 :     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
     828             :   }
     829        2425 : }
     830             : 
     831        1965 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
     832             :   ZoneVector<uc16>* name =
     833        1965 :       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
     834             : 
     835             :   bool at_start = true;
     836        2425 :   while (true) {
     837        4390 :     uc32 c = current();
     838        4390 :     Advance();
     839             : 
     840             :     // Convert unicode escapes.
     841        4390 :     if (c == '\\' && current() == 'u') {
     842         230 :       Advance();
     843         230 :       if (!ParseUnicodeEscape(&c)) {
     844          59 :         ReportError(CStrVector("Invalid Unicode escape sequence"));
     845          59 :         return nullptr;
     846             :       }
     847             :     }
     848             : 
     849             :     // The backslash char is misclassified as both ID_Start and ID_Continue.
     850        4331 :     if (c == '\\') {
     851          18 :       ReportError(CStrVector("Invalid capture group name"));
     852          18 :       return nullptr;
     853             :     }
     854             : 
     855        4313 :     if (at_start) {
     856        1933 :       if (!IsIdentifierStart(c)) {
     857         100 :         ReportError(CStrVector("Invalid capture group name"));
     858         100 :         return nullptr;
     859             :       }
     860        1833 :       push_code_unit(name, c);
     861             :       at_start = false;
     862             :     } else {
     863        2380 :       if (c == '>') {
     864             :         break;
     865         809 :       } else if (IsIdentifierPart(c)) {
     866         592 :         push_code_unit(name, c);
     867             :       } else {
     868         217 :         ReportError(CStrVector("Invalid capture group name"));
     869         217 :         return nullptr;
     870             :       }
     871             :     }
     872             :   }
     873             : 
     874        1571 :   return name;
     875             : }
     876             : 
     877        1257 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
     878             :                                              int index) {
     879             :   DCHECK(0 < index && index <= captures_started_);
     880             :   DCHECK_NOT_NULL(name);
     881             : 
     882        1257 :   if (named_captures_ == nullptr) {
     883         933 :     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
     884             :   } else {
     885             :     // Check for duplicates and bail if we find any.
     886             :     // TODO(jgruber): O(n^2).
     887        1020 :     for (const auto& named_capture : *named_captures_) {
     888         389 :       if (*named_capture->name() == *name) {
     889          41 :         ReportError(CStrVector("Duplicate capture group name"));
     890          41 :         return false;
     891             :       }
     892             :     }
     893             :   }
     894             : 
     895        1216 :   RegExpCapture* capture = GetCapture(index);
     896             :   DCHECK_NULL(capture->name());
     897             : 
     898             :   capture->set_name(name);
     899        1216 :   named_captures_->Add(capture, zone());
     900             : 
     901        1216 :   return true;
     902             : }
     903             : 
     904         396 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
     905             :                                            RegExpParserState* state) {
     906             :   // The parser is assumed to be on the '<' in \k<name>.
     907         396 :   if (current() != '<') {
     908          41 :     ReportError(CStrVector("Invalid named reference"));
     909          41 :     return false;
     910             :   }
     911             : 
     912         355 :   Advance();
     913         355 :   const ZoneVector<uc16>* name = ParseCaptureGroupName();
     914         355 :   if (name == nullptr) {
     915             :     return false;
     916             :   }
     917             : 
     918         305 :   if (state->IsInsideCaptureGroup(name)) {
     919             :     builder->AddEmpty();
     920             :   } else {
     921             :     RegExpBackReference* atom =
     922         285 :         new (zone()) RegExpBackReference(builder->flags());
     923             :     atom->set_name(name);
     924             : 
     925         285 :     builder->AddAtom(atom);
     926             : 
     927         285 :     if (named_back_references_ == nullptr) {
     928             :       named_back_references_ =
     929         220 :           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
     930             :     }
     931         285 :     named_back_references_->Add(atom, zone());
     932             :   }
     933             : 
     934             :   return true;
     935             : }
     936             : 
     937      377436 : void RegExpParser::PatchNamedBackReferences() {
     938      377436 :   if (named_back_references_ == nullptr) return;
     939             : 
     940         220 :   if (named_captures_ == nullptr) {
     941          14 :     ReportError(CStrVector("Invalid named capture referenced"));
     942          14 :     return;
     943             :   }
     944             : 
     945             :   // Look up and patch the actual capture for each named back reference.
     946             :   // TODO(jgruber): O(n^2), optimize if necessary.
     947             : 
     948         967 :   for (int i = 0; i < named_back_references_->length(); i++) {
     949         271 :     RegExpBackReference* ref = named_back_references_->at(i);
     950             : 
     951             :     int index = -1;
     952         854 :     for (const auto& capture : *named_captures_) {
     953         341 :       if (*capture->name() == *ref->name()) {
     954             :         index = capture->index();
     955         185 :         break;
     956             :       }
     957             :     }
     958             : 
     959         271 :     if (index == -1) {
     960          86 :       ReportError(CStrVector("Invalid named capture referenced"));
     961          86 :       return;
     962             :     }
     963             : 
     964         185 :     ref->set_capture(GetCapture(index));
     965             :   }
     966             : }
     967             : 
     968     1087949 : RegExpCapture* RegExpParser::GetCapture(int index) {
     969             :   // The index for the capture groups are one-based. Its index in the list is
     970             :   // zero-based.
     971             :   int know_captures =
     972     1087949 :       is_scanned_for_captures_ ? capture_count_ : captures_started_;
     973             :   DCHECK(index <= know_captures);
     974     1087949 :   if (captures_ == nullptr) {
     975       13667 :     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
     976             :   }
     977     5422063 :   while (captures_->length() < know_captures) {
     978     2164110 :     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
     979             :   }
     980     2175898 :   return captures_->at(index - 1);
     981             : }
     982             : 
     983      377336 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
     984      378106 :   if (named_captures_ == nullptr || named_captures_->is_empty())
     985      376566 :     return Handle<FixedArray>();
     986             : 
     987             :   Factory* factory = isolate()->factory();
     988             : 
     989         770 :   int len = named_captures_->length() * 2;
     990         770 :   Handle<FixedArray> array = factory->NewFixedArray(len);
     991             : 
     992        4645 :   for (int i = 0; i < named_captures_->length(); i++) {
     993        1035 :     RegExpCapture* capture = named_captures_->at(i);
     994             :     Vector<const uc16> capture_name(capture->name()->data(),
     995             :                                     capture->name()->size());
     996             :     // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
     997             :     // internalized so they can be used as property names in the 'exec' results.
     998        1035 :     Handle<String> name = factory->InternalizeTwoByteString(capture_name);
     999        2070 :     array->set(i * 2, *name);
    1000             :     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
    1001             :   }
    1002             : 
    1003         770 :   return array;
    1004             : }
    1005             : 
    1006           0 : bool RegExpParser::HasNamedCaptures() {
    1007         223 :   if (has_named_captures_ || is_scanned_for_captures_) {
    1008             :     return has_named_captures_;
    1009             :   }
    1010             : 
    1011         118 :   ScanForCaptures();
    1012             :   DCHECK(is_scanned_for_captures_);
    1013         118 :   return has_named_captures_;
    1014             : }
    1015             : 
    1016           0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
    1017        9564 :   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
    1018        5337 :     if (s->group_type() != CAPTURE) continue;
    1019             :     // Return true if we found the matching capture index.
    1020         570 :     if (index == s->capture_index()) return true;
    1021             :     // Abort if index is larger than what has been parsed up till this state.
    1022         477 :     if (index > s->capture_index()) return false;
    1023             :   }
    1024             :   return false;
    1025             : }
    1026             : 
    1027         305 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
    1028             :     const ZoneVector<uc16>* name) {
    1029             :   DCHECK_NOT_NULL(name);
    1030         895 :   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
    1031         315 :     if (s->capture_name() == nullptr) continue;
    1032          30 :     if (*s->capture_name() == *name) return true;
    1033             :   }
    1034             :   return false;
    1035             : }
    1036             : 
    1037             : // QuantifierPrefix ::
    1038             : //   { DecimalDigits }
    1039             : //   { DecimalDigits , }
    1040             : //   { DecimalDigits , DecimalDigits }
    1041             : //
    1042             : // Returns true if parsing succeeds, and set the min_out and max_out
    1043             : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
    1044        3730 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
    1045             :   DCHECK_EQ(current(), '{');
    1046             :   int start = position();
    1047        3730 :   Advance();
    1048             :   int min = 0;
    1049        3730 :   if (!IsDecimalDigit(current())) {
    1050             :     Reset(start);
    1051         834 :     return false;
    1052             :   }
    1053       11558 :   while (IsDecimalDigit(current())) {
    1054        4456 :     int next = current() - '0';
    1055        4456 :     if (min > (RegExpTree::kInfinity - next) / 10) {
    1056             :       // Overflow. Skip past remaining decimal digits and return -1.
    1057         815 :       do {
    1058         815 :         Advance();
    1059             :       } while (IsDecimalDigit(current()));
    1060             :       min = RegExpTree::kInfinity;
    1061             :       break;
    1062             :     }
    1063        4331 :     min = 10 * min + next;
    1064        4331 :     Advance();
    1065             :   }
    1066             :   int max = 0;
    1067        2896 :   if (current() == '}') {
    1068             :     max = min;
    1069        1172 :     Advance();
    1070        1724 :   } else if (current() == ',') {
    1071        1686 :     Advance();
    1072        1686 :     if (current() == '}') {
    1073             :       max = RegExpTree::kInfinity;
    1074         334 :       Advance();
    1075             :     } else {
    1076        5618 :       while (IsDecimalDigit(current())) {
    1077        2193 :         int next = current() - '0';
    1078        2193 :         if (max > (RegExpTree::kInfinity - next) / 10) {
    1079         750 :           do {
    1080         750 :             Advance();
    1081             :           } while (IsDecimalDigit(current()));
    1082             :           max = RegExpTree::kInfinity;
    1083             :           break;
    1084             :         }
    1085        2133 :         max = 10 * max + next;
    1086        2133 :         Advance();
    1087             :       }
    1088        1352 :       if (current() != '}') {
    1089             :         Reset(start);
    1090          74 :         return false;
    1091             :       }
    1092        1278 :       Advance();
    1093             :     }
    1094             :   } else {
    1095             :     Reset(start);
    1096          38 :     return false;
    1097             :   }
    1098        2784 :   *min_out = min;
    1099        2784 :   *max_out = max;
    1100        2784 :   return true;
    1101             : }
    1102             : 
    1103             : 
    1104        4253 : uc32 RegExpParser::ParseOctalLiteral() {
    1105             :   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
    1106             :   // For compatibility with some other browsers (not all), we parse
    1107             :   // up to three octal digits with a value below 256.
    1108             :   // ES#prod-annexB-LegacyOctalEscapeSequence
    1109        4253 :   uc32 value = current() - '0';
    1110        4253 :   Advance();
    1111        4253 :   if ('0' <= current() && current() <= '7') {
    1112        1388 :     value = value * 8 + current() - '0';
    1113        1388 :     Advance();
    1114        1388 :     if (value < 32 && '0' <= current() && current() <= '7') {
    1115        1290 :       value = value * 8 + current() - '0';
    1116        1290 :       Advance();
    1117             :     }
    1118             :   }
    1119        4253 :   return value;
    1120             : }
    1121             : 
    1122             : 
    1123       79064 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
    1124             :   int start = position();
    1125             :   uc32 val = 0;
    1126      699946 :   for (int i = 0; i < length; ++i) {
    1127             :     uc32 c = current();
    1128             :     int d = HexValue(c);
    1129      310815 :     if (d < 0) {
    1130             :       Reset(start);
    1131         374 :       return false;
    1132             :     }
    1133      310441 :     val = val * 16 + d;
    1134      310441 :     Advance();
    1135             :   }
    1136       78690 :   *value = val;
    1137       78690 :   return true;
    1138             : }
    1139             : 
    1140             : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
    1141       83916 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
    1142             :   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
    1143             :   // allowed). In the latter case, the number of hex digits between { } is
    1144             :   // arbitrary. \ and u have already been read.
    1145       91474 :   if (current() == '{' && unicode()) {
    1146             :     int start = position();
    1147        7458 :     Advance();
    1148        7458 :     if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
    1149        7431 :       if (current() == '}') {
    1150        7426 :         Advance();
    1151        7426 :         return true;
    1152             :       }
    1153             :     }
    1154             :     Reset(start);
    1155          32 :     return false;
    1156             :   }
    1157             :   // \u but no {, or \u{...} escapes not allowed.
    1158       76458 :   bool result = ParseHexEscape(4, value);
    1159      226665 :   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
    1160             :       current() == '\\') {
    1161             :     // Attempt to read trail surrogate.
    1162             :     int start = position();
    1163         205 :     if (Next() == 'u') {
    1164             :       Advance(2);
    1165             :       uc32 trail;
    1166         380 :       if (ParseHexEscape(4, &trail) &&
    1167         185 :           unibrow::Utf16::IsTrailSurrogate(trail)) {
    1168         185 :         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
    1169         185 :                                                       static_cast<uc16>(trail));
    1170         185 :         return true;
    1171             :       }
    1172             :     }
    1173             :     Reset(start);
    1174             :   }
    1175             :   return result;
    1176             : }
    1177             : 
    1178             : #ifdef V8_INTL_SUPPORT
    1179             : 
    1180             : namespace {
    1181             : 
    1182        2708 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
    1183        2708 :   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
    1184        2708 :   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
    1185             :     return true;
    1186             :   for (int i = 0;; i++) {
    1187         806 :     const char* long_name = u_getPropertyName(
    1188        1612 :         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1189         806 :     if (long_name == nullptr) break;
    1190         743 :     if (strcmp(property_name, long_name) == 0) return true;
    1191             :   }
    1192             :   return false;
    1193             : }
    1194             : 
    1195        3153 : bool IsExactPropertyValueAlias(const char* property_value_name,
    1196             :                                UProperty property, int32_t property_value) {
    1197             :   const char* short_name =
    1198        3153 :       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
    1199        3153 :   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
    1200             :     return true;
    1201             :   }
    1202             :   for (int i = 0;; i++) {
    1203         310 :     const char* long_name = u_getPropertyValueName(
    1204             :         property, property_value,
    1205         620 :         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1206         310 :     if (long_name == nullptr) break;
    1207         310 :     if (strcmp(property_value_name, long_name) == 0) return true;
    1208             :   }
    1209             :   return false;
    1210             : }
    1211             : 
    1212        5216 : bool LookupPropertyValueName(UProperty property,
    1213             :                              const char* property_value_name, bool negate,
    1214             :                              ZoneList<CharacterRange>* result, Zone* zone) {
    1215             :   UProperty property_for_lookup = property;
    1216        5216 :   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
    1217             :     // For the property Script_Extensions, we have to do the property value
    1218             :     // name lookup as if the property is Script.
    1219             :     property_for_lookup = UCHAR_SCRIPT;
    1220             :   }
    1221             :   int32_t property_value =
    1222        5216 :       u_getPropertyValueEnum(property_for_lookup, property_value_name);
    1223        5216 :   if (property_value == UCHAR_INVALID_CODE) return false;
    1224             : 
    1225             :   // We require the property name to match exactly to one of the property value
    1226             :   // aliases. However, u_getPropertyValueEnum uses loose matching.
    1227        3153 :   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
    1228             :                                  property_value)) {
    1229             :     return false;
    1230             :   }
    1231             : 
    1232        3153 :   UErrorCode ec = U_ZERO_ERROR;
    1233        6306 :   icu::UnicodeSet set;
    1234        3153 :   set.applyIntPropertyValue(property, property_value, ec);
    1235        3153 :   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
    1236             : 
    1237        3153 :   if (success) {
    1238        3153 :     set.removeAllStrings();
    1239        3153 :     if (negate) set.complement();
    1240     1170897 :     for (int i = 0; i < set.getRangeCount(); i++) {
    1241             :       result->Add(
    1242     1167744 :           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
    1243      583872 :           zone);
    1244             :     }
    1245             :   }
    1246             :   return success;
    1247             : }
    1248             : 
    1249             : template <size_t N>
    1250             : inline bool NameEquals(const char* name, const char (&literal)[N]) {
    1251        6194 :   return strncmp(name, literal, N + 1) == 0;
    1252             : }
    1253             : 
    1254        2063 : bool LookupSpecialPropertyValueName(const char* name,
    1255             :                                     ZoneList<CharacterRange>* result,
    1256             :                                     bool negate, Zone* zone) {
    1257        2063 :   if (NameEquals(name, "Any")) {
    1258         135 :     if (negate) {
    1259             :       // Leave the list of character ranges empty, since the negation of 'Any'
    1260             :       // is the empty set.
    1261             :     } else {
    1262          40 :       result->Add(CharacterRange::Everything(), zone);
    1263             :     }
    1264        1928 :   } else if (NameEquals(name, "ASCII")) {
    1265         210 :     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
    1266             :                        : CharacterRange::Range(0x0, 0x7F),
    1267         105 :                 zone);
    1268        1823 :   } else if (NameEquals(name, "Assigned")) {
    1269          75 :     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
    1270         150 :                                    !negate, result, zone);
    1271             :   } else {
    1272             :     return false;
    1273             :   }
    1274             :   return true;
    1275             : }
    1276             : 
    1277             : // Explicitly whitelist supported binary properties. The spec forbids supporting
    1278             : // properties outside of this set to ensure interoperability.
    1279        1748 : bool IsSupportedBinaryProperty(UProperty property) {
    1280        1748 :   switch (property) {
    1281             :     case UCHAR_ALPHABETIC:
    1282             :     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
    1283             :     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
    1284             :     case UCHAR_ASCII_HEX_DIGIT:
    1285             :     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
    1286             :     case UCHAR_BIDI_CONTROL:
    1287             :     case UCHAR_BIDI_MIRRORED:
    1288             :     case UCHAR_CASE_IGNORABLE:
    1289             :     case UCHAR_CASED:
    1290             :     case UCHAR_CHANGES_WHEN_CASEFOLDED:
    1291             :     case UCHAR_CHANGES_WHEN_CASEMAPPED:
    1292             :     case UCHAR_CHANGES_WHEN_LOWERCASED:
    1293             :     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    1294             :     case UCHAR_CHANGES_WHEN_TITLECASED:
    1295             :     case UCHAR_CHANGES_WHEN_UPPERCASED:
    1296             :     case UCHAR_DASH:
    1297             :     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    1298             :     case UCHAR_DEPRECATED:
    1299             :     case UCHAR_DIACRITIC:
    1300             :     case UCHAR_EMOJI:
    1301             :     case UCHAR_EMOJI_COMPONENT:
    1302             :     case UCHAR_EMOJI_MODIFIER_BASE:
    1303             :     case UCHAR_EMOJI_MODIFIER:
    1304             :     case UCHAR_EMOJI_PRESENTATION:
    1305             :     case UCHAR_EXTENDED_PICTOGRAPHIC:
    1306             :     case UCHAR_EXTENDER:
    1307             :     case UCHAR_GRAPHEME_BASE:
    1308             :     case UCHAR_GRAPHEME_EXTEND:
    1309             :     case UCHAR_HEX_DIGIT:
    1310             :     case UCHAR_ID_CONTINUE:
    1311             :     case UCHAR_ID_START:
    1312             :     case UCHAR_IDEOGRAPHIC:
    1313             :     case UCHAR_IDS_BINARY_OPERATOR:
    1314             :     case UCHAR_IDS_TRINARY_OPERATOR:
    1315             :     case UCHAR_JOIN_CONTROL:
    1316             :     case UCHAR_LOGICAL_ORDER_EXCEPTION:
    1317             :     case UCHAR_LOWERCASE:
    1318             :     case UCHAR_MATH:
    1319             :     case UCHAR_NONCHARACTER_CODE_POINT:
    1320             :     case UCHAR_PATTERN_SYNTAX:
    1321             :     case UCHAR_PATTERN_WHITE_SPACE:
    1322             :     case UCHAR_QUOTATION_MARK:
    1323             :     case UCHAR_RADICAL:
    1324             :     case UCHAR_REGIONAL_INDICATOR:
    1325             :     case UCHAR_S_TERM:
    1326             :     case UCHAR_SOFT_DOTTED:
    1327             :     case UCHAR_TERMINAL_PUNCTUATION:
    1328             :     case UCHAR_UNIFIED_IDEOGRAPH:
    1329             :     case UCHAR_UPPERCASE:
    1330             :     case UCHAR_VARIATION_SELECTOR:
    1331             :     case UCHAR_WHITE_SPACE:
    1332             :     case UCHAR_XID_CONTINUE:
    1333             :     case UCHAR_XID_START:
    1334             :       return true;
    1335             :     default:
    1336             :       break;
    1337             :   }
    1338         843 :   return false;
    1339             : }
    1340             : 
    1341             : bool IsUnicodePropertyValueCharacter(char c) {
    1342             :   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
    1343             :   //
    1344             :   // Note that using this to validate each parsed char is quite conservative.
    1345             :   // A possible alternative solution would be to only ensure the parsed
    1346             :   // property name/value candidate string does not contain '\0' characters and
    1347             :   // let ICU lookups trigger the final failure.
    1348       39082 :   if ('a' <= c && c <= 'z') return true;
    1349       10168 :   if ('A' <= c && c <= 'Z') return true;
    1350        1989 :   if ('0' <= c && c <= '9') return true;
    1351        1971 :   return (c == '_');
    1352             : }
    1353             : 
    1354             : }  // anonymous namespace
    1355             : 
    1356        4547 : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
    1357             :                                           std::vector<char>* name_2) {
    1358             :   DCHECK(name_1->empty());
    1359             :   DCHECK(name_2->empty());
    1360             :   // Parse the property class as follows:
    1361             :   // - In \p{name}, 'name' is interpreted
    1362             :   //   - either as a general category property value name.
    1363             :   //   - or as a binary property name.
    1364             :   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
    1365             :   //   and 'value' is interpreted as one of the available property value names.
    1366             :   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
    1367             :   // - Loose matching is not applied.
    1368        4547 :   if (current() == '{') {
    1369             :     // Parse \p{[PropertyName=]PropertyNameValue}
    1370       35602 :     for (Advance(); current() != '}' && current() != '='; Advance()) {
    1371       62398 :       if (!IsUnicodePropertyValueCharacter(current())) return false;
    1372       31136 :       if (!has_next()) return false;
    1373       62272 :       name_1->push_back(static_cast<char>(current()));
    1374             :     }
    1375        4403 :     if (current() == '=') {
    1376        9686 :       for (Advance(); current() != '}'; Advance()) {
    1377       15766 :         if (!IsUnicodePropertyValueCharacter(current())) return false;
    1378        7874 :         if (!has_next()) return false;
    1379       15748 :         name_2->push_back(static_cast<char>(current()));
    1380             :       }
    1381        3606 :       name_2->push_back(0);  // null-terminate string.
    1382             :     }
    1383             :   } else {
    1384             :     return false;
    1385             :   }
    1386        4394 :   Advance();
    1387        8788 :   name_1->push_back(0);  // null-terminate string.
    1388             : 
    1389             :   DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
    1390             :   DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
    1391        4394 :   return true;
    1392             : }
    1393             : 
    1394        4394 : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
    1395             :                                          bool negate,
    1396             :                                          const std::vector<char>& name_1,
    1397             :                                          const std::vector<char>& name_2) {
    1398        4394 :   if (name_2.empty()) {
    1399             :     // First attempt to interpret as general category property value name.
    1400             :     const char* name = name_1.data();
    1401        2591 :     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
    1402             :                                 add_to, zone())) {
    1403             :       return true;
    1404             :     }
    1405             :     // Interpret "Any", "ASCII", and "Assigned".
    1406        2063 :     if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
    1407             :       return true;
    1408             :     }
    1409             :     // Then attempt to interpret as binary property name with value name 'Y'.
    1410        1748 :     UProperty property = u_getPropertyEnum(name);
    1411        1748 :     if (!IsSupportedBinaryProperty(property)) return false;
    1412         905 :     if (!IsExactPropertyAlias(name, property)) return false;
    1413         905 :     return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
    1414         905 :                                    zone());
    1415             :   } else {
    1416             :     // Both property name and value name are specified. Attempt to interpret
    1417             :     // the property name as enumerated property.
    1418             :     const char* property_name = name_1.data();
    1419             :     const char* value_name = name_2.data();
    1420        1803 :     UProperty property = u_getPropertyEnum(property_name);
    1421        1803 :     if (!IsExactPropertyAlias(property_name, property)) return false;
    1422        1740 :     if (property == UCHAR_GENERAL_CATEGORY) {
    1423             :       // We want to allow aggregate value names such as "Letter".
    1424             :       property = UCHAR_GENERAL_CATEGORY_MASK;
    1425        3400 :     } else if (property != UCHAR_SCRIPT &&
    1426        1700 :                property != UCHAR_SCRIPT_EXTENSIONS) {
    1427             :       return false;
    1428             :     }
    1429        1605 :     return LookupPropertyValueName(property, value_name, negate, add_to,
    1430        1605 :                                    zone());
    1431             :   }
    1432             : }
    1433             : 
    1434         510 : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
    1435         510 :   if (!FLAG_harmony_regexp_sequence) return nullptr;
    1436             :   const char* name = name_1.data();
    1437             :   const uc32* sequence_list = nullptr;
    1438             :   JSRegExp::Flags flags = JSRegExp::kUnicode;
    1439         150 :   if (NameEquals(name, "Emoji_Flag_Sequence")) {
    1440             :     sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
    1441          90 :   } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
    1442             :     sequence_list = UnicodePropertySequences::kEmojiTagSequences;
    1443          70 :   } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
    1444             :     sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
    1445             :   }
    1446         150 :   if (sequence_list != nullptr) {
    1447             :     // TODO(yangguo): this creates huge regexp code. Alternative to this is
    1448             :     // to create a new operator that checks for these sequences at runtime.
    1449             :     RegExpBuilder builder(zone(), flags);
    1450             :     while (true) {                   // Iterate through list of sequences.
    1451      267120 :       while (*sequence_list != 0) {  // Iterate through sequence.
    1452      116760 :         builder.AddUnicodeCharacter(*sequence_list);
    1453      116760 :         sequence_list++;
    1454             :       }
    1455       33600 :       sequence_list++;
    1456       33600 :       if (*sequence_list == 0) break;
    1457             :       builder.NewAlternative();
    1458             :     }
    1459         100 :     return builder.ToRegExp();
    1460             :   }
    1461             : 
    1462          50 :   if (NameEquals(name, "Emoji_Keycap_Sequence")) {
    1463             :     // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
    1464             :     // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
    1465             :     RegExpBuilder builder(zone(), flags);
    1466             :     ZoneList<CharacterRange>* prefix_ranges =
    1467             :         new (zone()) ZoneList<CharacterRange>(2, zone());
    1468          30 :     prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
    1469          30 :     prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
    1470          30 :     prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
    1471          60 :     builder.AddCharacterClass(
    1472          30 :         new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
    1473          30 :     builder.AddCharacter(0xFE0F);
    1474          30 :     builder.AddCharacter(0x20E3);
    1475          30 :     return builder.ToRegExp();
    1476          20 :   } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
    1477             :     // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
    1478             :     // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
    1479             :     RegExpBuilder builder(zone(), flags);
    1480             :     ZoneList<CharacterRange>* modifier_base_ranges =
    1481             :         new (zone()) ZoneList<CharacterRange>(2, zone());
    1482             :     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
    1483          20 :                             modifier_base_ranges, zone());
    1484          40 :     builder.AddCharacterClass(
    1485          20 :         new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
    1486             :     ZoneList<CharacterRange>* modifier_ranges =
    1487             :         new (zone()) ZoneList<CharacterRange>(2, zone());
    1488             :     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
    1489          20 :                             zone());
    1490          40 :     builder.AddCharacterClass(
    1491          20 :         new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
    1492          20 :     return builder.ToRegExp();
    1493             :   }
    1494             : 
    1495             :   return nullptr;
    1496             : }
    1497             : 
    1498             : #else  // V8_INTL_SUPPORT
    1499             : 
    1500             : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
    1501             :                                           std::vector<char>* name_2) {
    1502             :   return false;
    1503             : }
    1504             : 
    1505             : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
    1506             :                                          bool negate,
    1507             :                                          const std::vector<char>& name_1,
    1508             :                                          const std::vector<char>& name_2) {
    1509             :   return false;
    1510             : }
    1511             : 
    1512             : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
    1513             :   return nullptr;
    1514             : }
    1515             : 
    1516             : #endif  // V8_INTL_SUPPORT
    1517             : 
    1518        7458 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
    1519             :   uc32 x = 0;
    1520             :   int d = HexValue(current());
    1521        7458 :   if (d < 0) {
    1522             :     return false;
    1523             :   }
    1524       81648 :   while (d >= 0) {
    1525       37122 :     x = x * 16 + d;
    1526       37122 :     if (x > max_value) {
    1527             :       return false;
    1528             :     }
    1529       37095 :     Advance();
    1530             :     d = HexValue(current());
    1531             :   }
    1532        7431 :   *value = x;
    1533        7431 :   return true;
    1534             : }
    1535             : 
    1536             : 
    1537       88535 : uc32 RegExpParser::ParseClassCharacterEscape() {
    1538             :   DCHECK_EQ('\\', current());
    1539             :   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
    1540       88535 :   Advance();
    1541       88535 :   switch (current()) {
    1542             :     case 'b':
    1543          14 :       Advance();
    1544          14 :       return '\b';
    1545             :     // ControlEscape :: one of
    1546             :     //   f n r t v
    1547             :     case 'f':
    1548           0 :       Advance();
    1549           0 :       return '\f';
    1550             :     case 'n':
    1551         471 :       Advance();
    1552         471 :       return '\n';
    1553             :     case 'r':
    1554          48 :       Advance();
    1555          48 :       return '\r';
    1556             :     case 't':
    1557         186 :       Advance();
    1558         186 :       return '\t';
    1559             :     case 'v':
    1560           1 :       Advance();
    1561           1 :       return '\v';
    1562             :     case 'c': {
    1563             :       uc32 controlLetter = Next();
    1564         290 :       uc32 letter = controlLetter & ~('A' ^ 'a');
    1565             :       // Inside a character class, we also accept digits and underscore as
    1566             :       // control characters, unless with /u. See Annex B:
    1567             :       // ES#prod-annexB-ClassControlLetter
    1568         290 :       if (letter >= 'A' && letter <= 'Z') {
    1569             :         Advance(2);
    1570             :         // Control letters mapped to ASCII control characters in the range
    1571             :         // 0x00-0x1F.
    1572          30 :         return controlLetter & 0x1F;
    1573             :       }
    1574         260 :       if (unicode()) {
    1575             :         // With /u, invalid escapes are not treated as identity escapes.
    1576          18 :         ReportError(CStrVector("Invalid class escape"));
    1577          18 :         return 0;
    1578             :       }
    1579         484 :       if ((controlLetter >= '0' && controlLetter <= '9') ||
    1580         242 :           controlLetter == '_') {
    1581             :         Advance(2);
    1582         160 :         return controlLetter & 0x1F;
    1583             :       }
    1584             :       // We match JSC in reading the backslash as a literal
    1585             :       // character instead of as starting an escape.
    1586             :       // TODO(v8:6201): Not yet covered by the spec.
    1587             :       return '\\';
    1588             :     }
    1589             :     case '0':
    1590             :       // With /u, \0 is interpreted as NUL if not followed by another digit.
    1591        1270 :       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
    1592         130 :         Advance();
    1593         130 :         return 0;
    1594             :       }
    1595             :       V8_FALLTHROUGH;
    1596             :     case '1':
    1597             :     case '2':
    1598             :     case '3':
    1599             :     case '4':
    1600             :     case '5':
    1601             :     case '6':
    1602             :     case '7':
    1603             :       // For compatibility, we interpret a decimal escape that isn't
    1604             :       // a back reference (and therefore either \0 or not valid according
    1605             :       // to the specification) as a 1..3 digit octal character code.
    1606             :       // ES#prod-annexB-LegacyOctalEscapeSequence
    1607        1098 :       if (unicode()) {
    1608             :         // With /u, decimal escape is not interpreted as octal character code.
    1609          36 :         ReportError(CStrVector("Invalid class escape"));
    1610          36 :         return 0;
    1611             :       }
    1612        1062 :       return ParseOctalLiteral();
    1613             :     case 'x': {
    1614        2165 :       Advance();
    1615             :       uc32 value;
    1616        2165 :       if (ParseHexEscape(2, &value)) return value;
    1617          30 :       if (unicode()) {
    1618             :         // With /u, invalid escapes are not treated as identity escapes.
    1619           0 :         ReportError(CStrVector("Invalid escape"));
    1620           0 :         return 0;
    1621             :       }
    1622             :       // If \x is not followed by a two-digit hexadecimal, treat it
    1623             :       // as an identity escape.
    1624             :       return 'x';
    1625             :     }
    1626             :     case 'u': {
    1627       81679 :       Advance();
    1628             :       uc32 value;
    1629       81679 :       if (ParseUnicodeEscape(&value)) return value;
    1630          20 :       if (unicode()) {
    1631             :         // With /u, invalid escapes are not treated as identity escapes.
    1632           0 :         ReportError(CStrVector("Invalid unicode escape"));
    1633           0 :         return 0;
    1634             :       }
    1635             :       // If \u is not followed by a two-digit hexadecimal, treat it
    1636             :       // as an identity escape.
    1637             :       return 'u';
    1638             :     }
    1639             :     default: {
    1640             :       uc32 result = current();
    1641             :       // With /u, no identity escapes except for syntax characters and '-' are
    1642             :       // allowed. Otherwise, all identity escapes are allowed.
    1643        2593 :       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
    1644        2453 :         Advance();
    1645        2453 :         return result;
    1646             :       }
    1647           0 :       ReportError(CStrVector("Invalid escape"));
    1648           0 :       return 0;
    1649             :     }
    1650             :   }
    1651             :   return 0;
    1652             : }
    1653             : 
    1654      471188 : void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
    1655             :                                     Zone* zone,
    1656             :                                     bool add_unicode_case_equivalents,
    1657             :                                     uc32* char_out, bool* is_class_escape) {
    1658             :   uc32 current_char = current();
    1659      471188 :   if (current_char == '\\') {
    1660       90784 :     switch (Next()) {
    1661             :       case 'w':
    1662             :       case 'W':
    1663             :       case 'd':
    1664             :       case 'D':
    1665             :       case 's':
    1666             :       case 'S': {
    1667        3382 :         CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
    1668        1691 :                                        add_unicode_case_equivalents, zone);
    1669             :         Advance(2);
    1670        1691 :         *is_class_escape = true;
    1671        1691 :         return;
    1672             :       }
    1673             :       case kEndMarker:
    1674           0 :         ReportError(CStrVector("\\ at end of pattern"));
    1675           0 :         return;
    1676             :       case 'p':
    1677             :       case 'P':
    1678         620 :         if (unicode()) {
    1679         558 :           bool negate = Next() == 'P';
    1680             :           Advance(2);
    1681             :           std::vector<char> name_1, name_2;
    1682        1089 :           if (!ParsePropertyClassName(&name_1, &name_2) ||
    1683         531 :               !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
    1684         225 :             ReportError(CStrVector("Invalid property name in character class"));
    1685             :           }
    1686         558 :           *is_class_escape = true;
    1687             :           return;
    1688             :         }
    1689             :         break;
    1690             :       default:
    1691             :         break;
    1692             :     }
    1693       88535 :     *char_out = ParseClassCharacterEscape();
    1694       88535 :     *is_class_escape = false;
    1695             :   } else {
    1696      380404 :     Advance();
    1697      380404 :     *char_out = current_char;
    1698      380404 :     *is_class_escape = false;
    1699             :   }
    1700             : }
    1701             : 
    1702      155232 : RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
    1703             :   static const char* kUnterminated = "Unterminated character class";
    1704             :   static const char* kRangeInvalid = "Invalid character class";
    1705             :   static const char* kRangeOutOfOrder = "Range out of order in character class";
    1706             : 
    1707             :   DCHECK_EQ(current(), '[');
    1708      155232 :   Advance();
    1709             :   bool is_negated = false;
    1710      155232 :   if (current() == '^') {
    1711             :     is_negated = true;
    1712       10400 :     Advance();
    1713             :   }
    1714             :   ZoneList<CharacterRange>* ranges =
    1715             :       new (zone()) ZoneList<CharacterRange>(2, zone());
    1716      156929 :   bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
    1717      451248 :   while (has_more() && current() != ']') {
    1718             :     uc32 char_1, char_2;
    1719             :     bool is_class_1, is_class_2;
    1720      296444 :     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
    1721      296800 :                      &is_class_1 CHECK_FAILED);
    1722      296165 :     if (current() == '-') {
    1723      174816 :       Advance();
    1724      174816 :       if (current() == kEndMarker) {
    1725             :         // If we reach the end we break out of the loop and let the
    1726             :         // following code report an error.
    1727             :         break;
    1728      174811 :       } else if (current() == ']') {
    1729          67 :         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1730          67 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1731          67 :         break;
    1732             :       }
    1733             :       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
    1734      174744 :                        &is_class_2 CHECK_FAILED);
    1735      174744 :       if (is_class_1 || is_class_2) {
    1736             :         // Either end is an escaped character class. Treat the '-' verbatim.
    1737         176 :         if (unicode()) {
    1738             :           // ES2015 21.2.2.15.1 step 1.
    1739          54 :           return ReportError(CStrVector(kRangeInvalid));
    1740             :         }
    1741         149 :         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1742         149 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1743         149 :         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
    1744         149 :         continue;
    1745             :       }
    1746             :       // ES2015 21.2.2.15.1 step 6.
    1747      174568 :       if (char_1 > char_2) {
    1748         100 :         return ReportError(CStrVector(kRangeOutOfOrder));
    1749             :       }
    1750      174518 :       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
    1751             :     } else {
    1752      121349 :       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1753             :     }
    1754             :   }
    1755      154876 :   if (!has_more()) {
    1756          40 :     return ReportError(CStrVector(kUnterminated));
    1757             :   }
    1758      154856 :   Advance();
    1759             :   RegExpCharacterClass::CharacterClassFlags character_class_flags;
    1760      154856 :   if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
    1761      309712 :   return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
    1762      309712 :                                            character_class_flags);
    1763             : }
    1764             : 
    1765             : 
    1766             : #undef CHECK_FAILED
    1767             : 
    1768             : 
    1769      380302 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
    1770             :                                FlatStringReader* input, JSRegExp::Flags flags,
    1771             :                                RegExpCompileData* result) {
    1772             :   DCHECK(result != nullptr);
    1773      380302 :   RegExpParser parser(input, &result->error, flags, isolate, zone);
    1774      380302 :   RegExpTree* tree = parser.ParsePattern();
    1775      380302 :   if (parser.failed()) {
    1776             :     DCHECK(tree == nullptr);
    1777             :     DCHECK(!result->error.is_null());
    1778             :   } else {
    1779             :     DCHECK(tree != nullptr);
    1780             :     DCHECK(result->error.is_null());
    1781             :     if (FLAG_trace_regexp_parser) {
    1782             :       StdoutStream os;
    1783             :       tree->Print(os, zone);
    1784             :       os << "\n";
    1785             :     }
    1786      377336 :     result->tree = tree;
    1787             :     int capture_count = parser.captures_started();
    1788      583669 :     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
    1789      377336 :     result->contains_anchor = parser.contains_anchor();
    1790      377336 :     result->capture_name_map = parser.CreateCaptureNameMap();
    1791      377336 :     result->capture_count = capture_count;
    1792             :   }
    1793      380302 :   return !parser.failed();
    1794             : }
    1795             : 
    1796           0 : RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
    1797             :     : zone_(zone),
    1798             :       pending_empty_(false),
    1799             :       flags_(flags),
    1800             :       characters_(nullptr),
    1801             :       pending_surrogate_(kNoPendingSurrogate),
    1802             :       terms_(),
    1803     1486350 :       alternatives_()
    1804             : #ifdef DEBUG
    1805             :       ,
    1806             :       last_added_(ADD_NONE)
    1807             : #endif
    1808             : {
    1809           0 : }
    1810             : 
    1811             : 
    1812           0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
    1813             :   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1814             :   FlushPendingSurrogate();
    1815             :   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
    1816       73918 :   pending_surrogate_ = lead_surrogate;
    1817           0 : }
    1818             : 
    1819             : 
    1820       73858 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
    1821             :   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
    1822       73858 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1823       73668 :     uc16 lead_surrogate = pending_surrogate_;
    1824       73668 :     pending_surrogate_ = kNoPendingSurrogate;
    1825             :     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1826             :     uc32 combined =
    1827       73668 :         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
    1828       73668 :     if (NeedsDesugaringForIgnoreCase(combined)) {
    1829          60 :       AddCharacterClassForDesugaring(combined);
    1830             :     } else {
    1831             :       ZoneList<uc16> surrogate_pair(2, zone());
    1832       73608 :       surrogate_pair.Add(lead_surrogate, zone());
    1833       73608 :       surrogate_pair.Add(trail_surrogate, zone());
    1834             :       RegExpAtom* atom =
    1835       73608 :           new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
    1836       73608 :       AddAtom(atom);
    1837             :     }
    1838             :   } else {
    1839         190 :     pending_surrogate_ = trail_surrogate;
    1840             :     FlushPendingSurrogate();
    1841             :   }
    1842       73858 : }
    1843             : 
    1844             : 
    1845           0 : void RegExpBuilder::FlushPendingSurrogate() {
    1846   389388818 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1847             :     DCHECK(unicode());
    1848         440 :     uc32 c = pending_surrogate_;
    1849         440 :     pending_surrogate_ = kNoPendingSurrogate;
    1850         440 :     AddCharacterClassForDesugaring(c);
    1851             :   }
    1852           0 : }
    1853             : 
    1854             : 
    1855     4460554 : void RegExpBuilder::FlushCharacters() {
    1856             :   FlushPendingSurrogate();
    1857     4460554 :   pending_empty_ = false;
    1858     4460554 :   if (characters_ != nullptr) {
    1859             :     RegExpTree* atom =
    1860      402644 :         new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
    1861      402644 :     characters_ = nullptr;
    1862      402644 :     text_.Add(atom, zone());
    1863             :     LAST(ADD_ATOM);
    1864             :   }
    1865     4460554 : }
    1866             : 
    1867             : 
    1868     4207944 : void RegExpBuilder::FlushText() {
    1869     4207944 :   FlushCharacters();
    1870             :   int num_text = text_.length();
    1871     4207944 :   if (num_text == 0) {
    1872             :     return;
    1873      570061 :   } else if (num_text == 1) {
    1874      532525 :     terms_.Add(text_.last(), zone());
    1875             :   } else {
    1876             :     RegExpText* text = new (zone()) RegExpText(zone());
    1877      251548 :     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
    1878       37536 :     terms_.Add(text, zone());
    1879             :   }
    1880             :   text_.Clear();
    1881             : }
    1882             : 
    1883             : 
    1884   383398502 : void RegExpBuilder::AddCharacter(uc16 c) {
    1885             :   FlushPendingSurrogate();
    1886   383398502 :   pending_empty_ = false;
    1887   383398502 :   if (NeedsDesugaringForIgnoreCase(c)) {
    1888         905 :     AddCharacterClassForDesugaring(c);
    1889             :   } else {
    1890   383397597 :     if (characters_ == nullptr) {
    1891     1831190 :       characters_ = new (zone()) ZoneList<uc16>(4, zone());
    1892             :     }
    1893   383397597 :     characters_->Add(c, zone());
    1894             :     LAST(ADD_CHAR);
    1895             :   }
    1896   383398502 : }
    1897             : 
    1898             : 
    1899   383460099 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
    1900   383460099 :   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    1901             :     DCHECK(unicode());
    1902       73668 :     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
    1903       73668 :     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
    1904   383433383 :   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
    1905             :     AddLeadSurrogate(c);
    1906   383432883 :   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
    1907         190 :     AddTrailSurrogate(c);
    1908             :   } else {
    1909   383385991 :     AddCharacter(static_cast<uc16>(c));
    1910             :   }
    1911   383460099 : }
    1912             : 
    1913        1819 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
    1914             :   // A lead or trail surrogate parsed via escape sequence will not
    1915             :   // pair up with any preceding lead or following trail surrogate.
    1916             :   FlushPendingSurrogate();
    1917        1819 :   AddUnicodeCharacter(character);
    1918             :   FlushPendingSurrogate();
    1919        1819 : }
    1920             : 
    1921         113 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
    1922             : 
    1923             : 
    1924      183768 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
    1925      183768 :   if (NeedsDesugaringForUnicode(cc)) {
    1926             :     // With /u, character class needs to be desugared, so it
    1927             :     // must be a standalone term instead of being part of a RegExpText.
    1928             :     AddTerm(cc);
    1929             :   } else {
    1930      178972 :     AddAtom(cc);
    1931             :   }
    1932      183768 : }
    1933             : 
    1934        1405 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
    1935        2810 :   AddTerm(new (zone()) RegExpCharacterClass(
    1936             :       zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
    1937        1405 :       flags_));
    1938        1405 : }
    1939             : 
    1940             : 
    1941     1363157 : void RegExpBuilder::AddAtom(RegExpTree* term) {
    1942     1363157 :   if (term->IsEmpty()) {
    1943             :     AddEmpty();
    1944             :     return;
    1945             :   }
    1946     1363157 :   if (term->IsTextElement()) {
    1947      252610 :     FlushCharacters();
    1948      252610 :     text_.Add(term, zone());
    1949             :   } else {
    1950     1110547 :     FlushText();
    1951     1110547 :     terms_.Add(term, zone());
    1952             :   }
    1953             :   LAST(ADD_ATOM);
    1954             : }
    1955             : 
    1956             : 
    1957           0 : void RegExpBuilder::AddTerm(RegExpTree* term) {
    1958        6201 :   FlushText();
    1959        6201 :   terms_.Add(term, zone());
    1960             :   LAST(ADD_ATOM);
    1961           0 : }
    1962             : 
    1963             : 
    1964       11246 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
    1965       11246 :   FlushText();
    1966       11246 :   if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
    1967             :     // Omit repeated assertions of the same type.
    1968         199 :     RegExpAssertion* last = terms_.last()->AsAssertion();
    1969         199 :     RegExpAssertion* next = assert->AsAssertion();
    1970         199 :     if (last->assertion_type() == next->assertion_type()) return;
    1971             :   }
    1972       11166 :   terms_.Add(assert, zone());
    1973             :   LAST(ADD_ASSERT);
    1974             : }
    1975             : 
    1976             : 
    1977      150826 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
    1978             : 
    1979             : 
    1980     1634365 : void RegExpBuilder::FlushTerms() {
    1981     1634365 :   FlushText();
    1982             :   int num_terms = terms_.length();
    1983             :   RegExpTree* alternative;
    1984     1634365 :   if (num_terms == 0) {
    1985      329988 :     alternative = new (zone()) RegExpEmpty();
    1986     1304377 :   } else if (num_terms == 1) {
    1987             :     alternative = terms_.last();
    1988             :   } else {
    1989       81794 :     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
    1990             :   }
    1991     1634365 :   alternatives_.Add(alternative, zone());
    1992             :   terms_.Clear();
    1993             :   LAST(ADD_NONE);
    1994     1634365 : }
    1995             : 
    1996             : 
    1997      183768 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
    1998      183768 :   if (!unicode()) return false;
    1999             :   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
    2000             :   // necessarily mean that we need to desugar. It's probably nicer to have a
    2001             :   // separate pass to figure out unicode desugarings.
    2002        6084 :   if (ignore_case()) return true;
    2003             :   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    2004        5256 :   CharacterRange::Canonicalize(ranges);
    2005       13589 :   for (int i = ranges->length() - 1; i >= 0; i--) {
    2006             :     uc32 from = ranges->at(i).from();
    2007             :     uc32 to = ranges->at(i).to();
    2008             :     // Check for non-BMP characters.
    2009       12301 :     if (to >= kNonBmpStart) return true;
    2010             :     // Check for lone surrogates.
    2011        8443 :     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
    2012             :   }
    2013             :   return false;
    2014             : }
    2015             : 
    2016             : 
    2017   383472170 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
    2018             : #ifdef V8_INTL_SUPPORT
    2019   383592420 :   if (unicode() && ignore_case()) {
    2020        2180 :     icu::UnicodeSet set(c, c);
    2021        1090 :     set.closeOver(USET_CASE_INSENSITIVE);
    2022        1090 :     set.removeAllStrings();
    2023        1090 :     return set.size() > 1;
    2024             :   }
    2025             :   // In the case where ICU is not included, we act as if the unicode flag is
    2026             :   // not set, and do not desugar.
    2027             : #endif  // V8_INTL_SUPPORT
    2028             :   return false;
    2029             : }
    2030             : 
    2031             : 
    2032     1483539 : RegExpTree* RegExpBuilder::ToRegExp() {
    2033     1483539 :   FlushTerms();
    2034             :   int num_alternatives = alternatives_.length();
    2035     1483539 :   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
    2036     1483539 :   if (num_alternatives == 1) return alternatives_.last();
    2037       42858 :   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
    2038             : }
    2039             : 
    2040     1452016 : bool RegExpBuilder::AddQuantifierToAtom(
    2041             :     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
    2042             :   FlushPendingSurrogate();
    2043     1452016 :   if (pending_empty_) {
    2044           2 :     pending_empty_ = false;
    2045           2 :     return true;
    2046             :   }
    2047             :   RegExpTree* atom;
    2048     1452014 :   if (characters_ != nullptr) {
    2049             :     DCHECK(last_added_ == ADD_CHAR);
    2050             :     // Last atom was character.
    2051             :     Vector<const uc16> char_vector = characters_->ToConstVector();
    2052             :     int num_chars = char_vector.length();
    2053     1428243 :     if (num_chars > 1) {
    2054         562 :       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
    2055        1124 :       text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
    2056             :       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
    2057             :     }
    2058     1428243 :     characters_ = nullptr;
    2059     1428243 :     atom = new (zone()) RegExpAtom(char_vector, flags_);
    2060     1428243 :     FlushText();
    2061       23771 :   } else if (text_.length() > 0) {
    2062             :     DCHECK(last_added_ == ADD_ATOM);
    2063       16267 :     atom = text_.RemoveLast();
    2064       16267 :     FlushText();
    2065        7504 :   } else if (terms_.length() > 0) {
    2066             :     DCHECK(last_added_ == ADD_ATOM);
    2067        7504 :     atom = terms_.RemoveLast();
    2068        7504 :     if (atom->IsLookaround()) {
    2069             :       // With /u, lookarounds are not quantifiable.
    2070         239 :       if (unicode()) return false;
    2071             :       // Lookbehinds are not quantifiable.
    2072         193 :       if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
    2073             :         return false;
    2074             :       }
    2075             :     }
    2076        7431 :     if (atom->max_match() == 0) {
    2077             :       // Guaranteed to only match an empty string.
    2078             :       LAST(ADD_TERM);
    2079         236 :       if (min == 0) {
    2080             :         return true;
    2081             :       }
    2082          97 :       terms_.Add(atom, zone());
    2083          97 :       return true;
    2084             :     }
    2085             :   } else {
    2086             :     // Only call immediately after adding an atom or character!
    2087           0 :     UNREACHABLE();
    2088             :   }
    2089     1451705 :   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
    2090     1451705 :              zone());
    2091             :   LAST(ADD_TERM);
    2092     1451705 :   return true;
    2093             : }
    2094             : 
    2095             : }  // namespace internal
    2096      121996 : }  // namespace v8

Generated by: LCOV version 1.10