LCOV - code coverage report
Current view: top level - src/regexp - regexp-parser.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 719 744 96.6 %
Date: 2017-10-20 Functions: 47 59 79.7 %

          Line data    Source code
       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-parser.h"
       6             : 
       7             : #include <vector>
       8             : 
       9             : #include "src/char-predicates-inl.h"
      10             : #include "src/factory.h"
      11             : #include "src/isolate.h"
      12             : #include "src/objects-inl.h"
      13             : #include "src/ostreams.h"
      14             : #include "src/regexp/jsregexp.h"
      15             : #include "src/utils.h"
      16             : 
      17             : #ifdef V8_INTL_SUPPORT
      18             : #include "unicode/uniset.h"
      19             : #endif  // V8_INTL_SUPPORT
      20             : 
      21             : namespace v8 {
      22             : namespace internal {
      23             : 
      24      397989 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
      25             :                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
      26             :     : isolate_(isolate),
      27             :       zone_(zone),
      28             :       error_(error),
      29             :       captures_(nullptr),
      30             :       named_captures_(nullptr),
      31             :       named_back_references_(nullptr),
      32             :       in_(in),
      33             :       current_(kEndMarker),
      34             :       dotall_(flags & JSRegExp::kDotAll),
      35             :       ignore_case_(flags & JSRegExp::kIgnoreCase),
      36             :       multiline_(flags & JSRegExp::kMultiline),
      37             :       unicode_(flags & JSRegExp::kUnicode),
      38             :       next_pos_(0),
      39             :       captures_started_(0),
      40             :       capture_count_(0),
      41             :       has_more_(true),
      42             :       simple_(false),
      43             :       contains_anchor_(false),
      44             :       is_scanned_for_captures_(false),
      45             :       has_named_captures_(false),
      46     1989945 :       failed_(false) {
      47             :   DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
      48      397989 :   Advance();
      49      397989 : }
      50             : 
      51             : template <bool update_position>
      52  1294796829 : inline uc32 RegExpParser::ReadNext() {
      53   431598943 :   int position = next_pos_;
      54             :   uc32 c0 = in()->Get(position);
      55   431598943 :   position++;
      56             :   // Read the whole surrogate pair in case of unicode flag, if possible.
      57   431783516 :   if (unicode() && position < in()->length() &&
      58             :       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
      59             :     uc16 c1 = in()->Get(position);
      60         603 :     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
      61         406 :       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
      62         388 :       position++;
      63             :     }
      64             :   }
      65   431436095 :   if (update_position) next_pos_ = position;
      66   431598943 :   return c0;
      67             : }
      68             : 
      69             : 
      70           0 : uc32 RegExpParser::Next() {
      71      162881 :   if (has_next()) {
      72      162848 :     return ReadNext<false>();
      73             :   } else {
      74             :     return kEndMarker;
      75             :   }
      76             : }
      77             : 
      78  1294707007 : void RegExpParser::Advance() {
      79   431834398 :   if (has_next()) {
      80             :     StackLimitCheck check(isolate());
      81   431436504 :     if (check.HasOverflowed()) {
      82         399 :       if (FLAG_abort_on_stack_or_string_length_overflow) {
      83           0 :         FATAL("Aborting on stack overflow");
      84             :       }
      85             :       ReportError(CStrVector(
      86         798 :           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
      87   431436105 :     } else if (zone()->excess_allocation()) {
      88          10 :       ReportError(CStrVector("Regular expression too large"));
      89             :     } else {
      90   431436095 :       current_ = ReadNext<true>();
      91             :     }
      92             :   } else {
      93      397894 :     current_ = kEndMarker;
      94             :     // Advance so that position() points to 1-after-the-last-character. This is
      95             :     // important so that Reset() to this position works correctly.
      96      397894 :     next_pos_ = in()->length() + 1;
      97      397894 :     has_more_ = false;
      98             :   }
      99   431834398 : }
     100             : 
     101             : 
     102        3276 : void RegExpParser::Reset(int pos) {
     103        3276 :   next_pos_ = pos;
     104        3276 :   has_more_ = (pos < in()->length());
     105        3276 :   Advance();
     106           0 : }
     107             : 
     108           0 : void RegExpParser::Advance(int dist) {
     109       71136 :   next_pos_ += dist - 1;
     110       71136 :   Advance();
     111           0 : }
     112             : 
     113             : 
     114      211472 : bool RegExpParser::simple() { return simple_; }
     115             : 
     116           0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
     117          96 :   switch (c) {
     118             :     case '^':
     119             :     case '$':
     120             :     case '\\':
     121             :     case '.':
     122             :     case '*':
     123             :     case '+':
     124             :     case '?':
     125             :     case '(':
     126             :     case ')':
     127             :     case '[':
     128             :     case ']':
     129             :     case '{':
     130             :     case '}':
     131             :     case '|':
     132             :     case '/':
     133             :       return true;
     134             :     default:
     135             :       break;
     136             :   }
     137           0 :   return false;
     138             : }
     139             : 
     140             : 
     141        8130 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
     142        2774 :   if (failed_) return nullptr;  // Do not overwrite any existing error.
     143        2678 :   failed_ = true;
     144             :   *error_ = isolate()
     145             :                 ->factory()
     146             :                 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
     147        8034 :                 .ToHandleChecked();
     148             :   // Zip to the end to make sure the no more input is read.
     149        2678 :   current_ = kEndMarker;
     150        2678 :   next_pos_ = in()->length();
     151        2678 :   return nullptr;
     152             : }
     153             : 
     154             : #define CHECK_FAILED /**/);    \
     155             :   if (failed_) return nullptr; \
     156             :   ((void)0
     157             : 
     158             : // Pattern ::
     159             : //   Disjunction
     160      609461 : RegExpTree* RegExpParser::ParsePattern() {
     161      397989 :   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
     162      395413 :   PatchNamedBackReferences(CHECK_FAILED);
     163             :   DCHECK(!has_more());
     164             :   // If the result of parsing is a literal string atom, and it has the
     165             :   // same length as the input, then the atom is identical to the input.
     166      606783 :   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
     167      201442 :     simple_ = true;
     168             :   }
     169      395311 :   return result;
     170             : }
     171             : 
     172             : 
     173             : // Disjunction ::
     174             : //   Alternative
     175             : //   Alternative | Disjunction
     176             : // Alternative ::
     177             : //   [empty]
     178             : //   Term Alternative
     179             : // Term ::
     180             : //   Assertion
     181             : //   Atom
     182             : //   Atom Quantifier
     183  1291314343 : RegExpTree* RegExpParser::ParseDisjunction() {
     184             :   // Used to store current state while parsing subexpressions.
     185             :   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
     186      795978 :                                   0, nullptr, ignore_case(), unicode(), zone());
     187     8173322 :   RegExpParserState* state = &initial_state;
     188             :   // Cache the builder in a local variable for quick access.
     189      397989 :   RegExpBuilder* builder = initial_state.builder();
     190             :   while (true) {
     191   429090050 :     switch (current()) {
     192             :       case kEndMarker:
     193      395824 :         if (state->IsSubexpression()) {
     194             :           // Inside a parenthesized group when hitting end of input.
     195      398117 :           return ReportError(CStrVector("Unterminated group"));
     196             :         }
     197             :         DCHECK_EQ(INITIAL, state->group_type());
     198             :         // Parsing completed successfully.
     199      395696 :         return builder->ToRegExp();
     200             :       case ')': {
     201     1300846 :         if (!state->IsSubexpression()) {
     202          14 :           return ReportError(CStrVector("Unmatched ')'"));
     203             :         }
     204             :         DCHECK_NE(INITIAL, state->group_type());
     205             : 
     206     1300832 :         Advance();
     207             :         // End disjunction parsing and convert builder content to new single
     208             :         // regexp atom.
     209     1300832 :         RegExpTree* body = builder->ToRegExp();
     210             : 
     211             :         int end_capture_index = captures_started();
     212             : 
     213             :         int capture_index = state->capture_index();
     214             :         SubexpressionType group_type = state->group_type();
     215             : 
     216             :         // Build result of subexpression.
     217     1300832 :         if (group_type == CAPTURE) {
     218     1268838 :           if (state->IsNamedCapture()) {
     219             :             CreateNamedCaptureAtIndex(state->capture_name(),
     220        1359 :                                       capture_index CHECK_FAILED);
     221             :           }
     222     1268796 :           RegExpCapture* capture = GetCapture(capture_index);
     223             :           capture->set_body(body);
     224             :           body = capture;
     225       31994 :         } else if (group_type == GROUPING) {
     226             :           body = new (zone()) RegExpGroup(body);
     227             :         } else {
     228             :           DCHECK(group_type == POSITIVE_LOOKAROUND ||
     229             :                  group_type == NEGATIVE_LOOKAROUND);
     230        3768 :           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
     231             :           body = new (zone()) RegExpLookaround(
     232             :               body, is_positive, end_capture_index - capture_index,
     233        3768 :               capture_index, state->lookaround_type());
     234             :         }
     235             : 
     236             :         // Restore previous state.
     237             :         state = state->previous_state();
     238             :         builder = state->builder();
     239             : 
     240     1300790 :         builder->AddAtom(body);
     241             :         // For compatibility with JSC and ES3, we allow quantifiers after
     242             :         // lookaheads, and break in all cases.
     243     1300790 :         break;
     244             :       }
     245             :       case '|': {
     246      132851 :         Advance();
     247             :         builder->NewAlternative();
     248   427004231 :         continue;
     249             :       }
     250             :       case '*':
     251             :       case '+':
     252             :       case '?':
     253         110 :         return ReportError(CStrVector("Nothing to repeat"));
     254             :       case '^': {
     255       14145 :         Advance();
     256       14145 :         if (multiline()) {
     257             :           builder->AddAssertion(
     258        1569 :               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
     259             :         } else {
     260             :           builder->AddAssertion(
     261       12576 :               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
     262             :           set_contains_anchor();
     263             :         }
     264             :         continue;
     265             :       }
     266             :       case '$': {
     267        6998 :         Advance();
     268             :         RegExpAssertion::AssertionType assertion_type =
     269             :             multiline() ? RegExpAssertion::END_OF_LINE
     270        6998 :                         : RegExpAssertion::END_OF_INPUT;
     271        6998 :         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
     272        6998 :         continue;
     273             :       }
     274             :       case '.': {
     275       24291 :         Advance();
     276             :         ZoneList<CharacterRange>* ranges =
     277       24291 :             new (zone()) ZoneList<CharacterRange>(2, zone());
     278             : 
     279       24291 :         if (dotall()) {
     280             :           // Everything.
     281             :           DCHECK(FLAG_harmony_regexp_dotall);
     282          40 :           CharacterRange::AddClassEscape('*', ranges, false, zone());
     283             :         } else {
     284             :           // Everything except \x0a, \x0d, \u2028 and \u2029
     285       24251 :           CharacterRange::AddClassEscape('.', ranges, false, zone());
     286             :         }
     287             : 
     288             :         RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
     289       24291 :         builder->AddCharacterClass(cc);
     290       24291 :         break;
     291             :       }
     292             :       case '(': {
     293             :         SubexpressionType subexpr_type = CAPTURE;
     294             :         RegExpLookaround::Type lookaround_type = state->lookaround_type();
     295             :         bool is_named_capture = false;
     296     1301401 :         Advance();
     297     1301401 :         if (current() == '?') {
     298       33765 :           switch (Next()) {
     299             :             case ':':
     300             :               subexpr_type = GROUPING;
     301             :               Advance(2);
     302             :               break;
     303             :             case '=':
     304             :               lookaround_type = RegExpLookaround::LOOKAHEAD;
     305             :               subexpr_type = POSITIVE_LOOKAROUND;
     306             :               Advance(2);
     307             :               break;
     308             :             case '!':
     309             :               lookaround_type = RegExpLookaround::LOOKAHEAD;
     310             :               subexpr_type = NEGATIVE_LOOKAROUND;
     311             :               Advance(2);
     312             :               break;
     313             :             case '<':
     314        3137 :               Advance();
     315        3137 :               if (FLAG_harmony_regexp_lookbehind) {
     316        3137 :                 if (Next() == '=') {
     317             :                   subexpr_type = POSITIVE_LOOKAROUND;
     318             :                   lookaround_type = RegExpLookaround::LOOKBEHIND;
     319             :                   Advance(2);
     320             :                   break;
     321        1968 :                 } else if (Next() == '!') {
     322             :                   subexpr_type = NEGATIVE_LOOKAROUND;
     323             :                   lookaround_type = RegExpLookaround::LOOKBEHIND;
     324             :                   Advance(2);
     325             :                   break;
     326             :                 }
     327             :               }
     328        1716 :               if (FLAG_harmony_regexp_named_captures) {
     329        1716 :                 has_named_captures_ = true;
     330             :                 is_named_capture = true;
     331        1716 :                 Advance();
     332        1716 :                 break;
     333             :               }
     334             :             // Fall through.
     335             :             default:
     336          24 :               return ReportError(CStrVector("Invalid group"));
     337             :           }
     338             :         }
     339             : 
     340             :         const ZoneVector<uc16>* capture_name = nullptr;
     341     1301377 :         if (subexpr_type == CAPTURE) {
     342     1269352 :           if (captures_started_ >= kMaxCaptures) {
     343           6 :             return ReportError(CStrVector("Too many captures"));
     344             :           }
     345     1269346 :           captures_started_++;
     346             : 
     347     1269346 :           if (is_named_capture) {
     348        1716 :             capture_name = ParseCaptureGroupName(CHECK_FAILED);
     349             :           }
     350             :         }
     351             :         // Store current state and begin new disjunction parsing.
     352             :         state = new (zone()) RegExpParserState(
     353             :             state, subexpr_type, lookaround_type, captures_started_,
     354     3903069 :             capture_name, ignore_case(), unicode(), zone());
     355             :         builder = state->builder();
     356     1301023 :         continue;
     357             :       }
     358             :       case '[': {
     359      164331 :         RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
     360      164127 :         builder->AddCharacterClass(cc->AsCharacterClass());
     361      164127 :         break;
     362             :       }
     363             :       // Atom ::
     364             :       //   \ AtomEscape
     365             :       case '\\':
     366       66591 :         switch (Next()) {
     367             :           case kEndMarker:
     368           6 :             return ReportError(CStrVector("\\ at end of pattern"));
     369             :           case 'b':
     370             :             Advance(2);
     371             :             builder->AddAssertion(
     372         540 :                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
     373         540 :             continue;
     374             :           case 'B':
     375             :             Advance(2);
     376             :             builder->AddAssertion(
     377         356 :                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
     378         356 :             continue;
     379             :           // AtomEscape ::
     380             :           //   CharacterClassEscape
     381             :           //
     382             :           // CharacterClassEscape :: one of
     383             :           //   d D s S w W
     384             :           case 'd':
     385             :           case 'D':
     386             :           case 's':
     387             :           case 'S':
     388             :           case 'w':
     389             :           case 'W': {
     390             :             uc32 c = Next();
     391             :             Advance(2);
     392             :             ZoneList<CharacterRange>* ranges =
     393       21353 :                 new (zone()) ZoneList<CharacterRange>(2, zone());
     394             :             CharacterRange::AddClassEscape(c, ranges,
     395       21761 :                                            unicode() && ignore_case(), zone());
     396             :             RegExpCharacterClass* cc =
     397             :                 new (zone()) RegExpCharacterClass(ranges);
     398       21353 :             builder->AddCharacterClass(cc);
     399       21353 :             break;
     400             :           }
     401             :           case 'p':
     402             :           case 'P': {
     403             :             uc32 p = Next();
     404             :             Advance(2);
     405        4869 :             if (unicode()) {
     406        4660 :               if (FLAG_harmony_regexp_property) {
     407             :                 ZoneList<CharacterRange>* ranges =
     408        4651 :                     new (zone()) ZoneList<CharacterRange>(2, zone());
     409        4651 :                 if (!ParsePropertyClass(ranges, p == 'P')) {
     410         756 :                   return ReportError(CStrVector("Invalid property name"));
     411             :                 }
     412             :                 RegExpCharacterClass* cc =
     413             :                     new (zone()) RegExpCharacterClass(ranges);
     414        3895 :                 builder->AddCharacterClass(cc);
     415             :               } else {
     416             :                 // With /u, no identity escapes except for syntax characters
     417             :                 // are allowed. Otherwise, all identity escapes are allowed.
     418           9 :                 return ReportError(CStrVector("Invalid escape"));
     419             :               }
     420             :             } else {
     421         209 :               builder->AddCharacter(p);
     422             :             }
     423             :             break;
     424             :           }
     425             :           case '1':
     426             :           case '2':
     427             :           case '3':
     428             :           case '4':
     429             :           case '5':
     430             :           case '6':
     431             :           case '7':
     432             :           case '8':
     433             :           case '9': {
     434        5998 :             int index = 0;
     435        6082 :             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
     436        5986 :             if (is_backref) {
     437       10760 :               if (state->IsInsideCaptureGroup(index)) {
     438             :                 // The back reference is inside the capture group it refers to.
     439             :                 // Nothing can possibly have been captured yet, so we use empty
     440             :                 // instead. This ensures that, when checking a back reference,
     441             :                 // the capture registers of the referenced capture are either
     442             :                 // both set or both cleared.
     443             :                 builder->AddEmpty();
     444             :               } else {
     445        5217 :                 RegExpCapture* capture = GetCapture(index);
     446             :                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
     447        5217 :                 builder->AddAtom(atom);
     448             :               }
     449        5448 :               break;
     450             :             }
     451             :             // With /u, no identity escapes except for syntax characters
     452             :             // are allowed. Otherwise, all identity escapes are allowed.
     453         606 :             if (unicode()) {
     454          72 :               return ReportError(CStrVector("Invalid escape"));
     455             :             }
     456             :             uc32 first_digit = Next();
     457         534 :             if (first_digit == '8' || first_digit == '9') {
     458          68 :               builder->AddCharacter(first_digit);
     459             :               Advance(2);
     460             :               break;
     461             :             }
     462             :           }
     463             :           // Fall through.
     464             :           case '0': {
     465        3386 :             Advance();
     466        3424 :             if (unicode() && Next() >= '0' && Next() <= '9') {
     467             :               // With /u, decimal escape with leading 0 are not parsed as octal.
     468          19 :               return ReportError(CStrVector("Invalid decimal escape"));
     469             :             }
     470        3367 :             uc32 octal = ParseOctalLiteral();
     471        3367 :             builder->AddCharacter(octal);
     472        3367 :             break;
     473             :           }
     474             :           // ControlEscape :: one of
     475             :           //   f n r t v
     476             :           case 'f':
     477             :             Advance(2);
     478          14 :             builder->AddCharacter('\f');
     479          14 :             break;
     480             :           case 'n':
     481             :             Advance(2);
     482         599 :             builder->AddCharacter('\n');
     483         599 :             break;
     484             :           case 'r':
     485             :             Advance(2);
     486          84 :             builder->AddCharacter('\r');
     487          84 :             break;
     488             :           case 't':
     489             :             Advance(2);
     490          16 :             builder->AddCharacter('\t');
     491          16 :             break;
     492             :           case 'v':
     493             :             Advance(2);
     494          36 :             builder->AddCharacter('\v');
     495          36 :             break;
     496             :           case 'c': {
     497         251 :             Advance();
     498             :             uc32 controlLetter = Next();
     499             :             // Special case if it is an ASCII letter.
     500             :             // Convert lower case letters to uppercase.
     501         251 :             uc32 letter = controlLetter & ~('a' ^ 'A');
     502         251 :             if (letter < 'A' || 'Z' < letter) {
     503             :               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
     504             :               // Read the backslash as a literal character instead of as
     505             :               // starting an escape.
     506             :               // ES#prod-annexB-ExtendedPatternCharacter
     507         137 :               if (unicode()) {
     508             :                 // With /u, invalid escapes are not treated as identity escapes.
     509          19 :                 return ReportError(CStrVector("Invalid unicode escape"));
     510             :               }
     511         118 :               builder->AddCharacter('\\');
     512             :             } else {
     513             :               Advance(2);
     514         114 :               builder->AddCharacter(controlLetter & 0x1f);
     515             :             }
     516             :             break;
     517             :           }
     518             :           case 'x': {
     519             :             Advance(2);
     520             :             uc32 value;
     521         286 :             if (ParseHexEscape(2, &value)) {
     522         169 :               builder->AddCharacter(value);
     523         117 :             } else if (!unicode()) {
     524          81 :               builder->AddCharacter('x');
     525             :             } else {
     526             :               // With /u, invalid escapes are not treated as identity escapes.
     527          36 :               return ReportError(CStrVector("Invalid escape"));
     528             :             }
     529         250 :             break;
     530             :           }
     531             :           case 'u': {
     532             :             Advance(2);
     533             :             uc32 value;
     534        1818 :             if (ParseUnicodeEscape(&value)) {
     535        1599 :               builder->AddEscapedUnicodeCharacter(value);
     536         219 :             } else if (!unicode()) {
     537         147 :               builder->AddCharacter('u');
     538             :             } else {
     539             :               // With /u, invalid escapes are not treated as identity escapes.
     540          72 :               return ReportError(CStrVector("Invalid unicode escape"));
     541             :             }
     542        1746 :             break;
     543             :           }
     544             :           case 'k':
     545             :             // Either an identity escape or a named back-reference.  The two
     546             :             // interpretations are mutually exclusive: '\k' is interpreted as
     547             :             // an identity escape for non-unicode patterns without named
     548             :             // capture groups, and as the beginning of a named back-reference
     549             :             // in all other cases.
     550        1593 :             if (FLAG_harmony_regexp_named_captures &&
     551         238 :                 (unicode() || HasNamedCaptures())) {
     552             :               Advance(2);
     553         463 :               ParseNamedBackReference(builder, state CHECK_FAILED);
     554             :               break;
     555             :             }
     556             :           // Fall through.
     557             :           default:
     558       26982 :             Advance();
     559             :             // With /u, no identity escapes except for syntax characters
     560             :             // are allowed. Otherwise, all identity escapes are allowed.
     561       27024 :             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
     562       26955 :               builder->AddCharacter(current());
     563       26955 :               Advance();
     564             :             } else {
     565          27 :               return ReportError(CStrVector("Invalid escape"));
     566             :             }
     567       26955 :             break;
     568             :         }
     569             :         break;
     570             :       case '{': {
     571             :         int dummy;
     572         684 :         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
     573         619 :         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
     574             :         // Fall through.
     575             :       }
     576             :       case '}':
     577             :       case ']':
     578        1182 :         if (unicode()) {
     579          28 :           return ReportError(CStrVector("Lone quantifier brackets"));
     580             :         }
     581             :       // Fall through.
     582             :       default:
     583   425682587 :         builder->AddUnicodeCharacter(current());
     584   425682587 :         Advance();
     585   425682587 :         break;
     586             :     }  // end switch(current())
     587             : 
     588             :     int min;
     589             :     int max;
     590   427236369 :     switch (current()) {
     591             :       // QuantifierPrefix ::
     592             :       //   *
     593             :       //   +
     594             :       //   ?
     595             :       //   {
     596             :       case '*':
     597       38233 :         min = 0;
     598       38233 :         max = RegExpTree::kInfinity;
     599       38233 :         Advance();
     600       38233 :         break;
     601             :       case '+':
     602       13086 :         min = 1;
     603       13086 :         max = RegExpTree::kInfinity;
     604       13086 :         Advance();
     605       13086 :         break;
     606             :       case '?':
     607     1625542 :         min = 0;
     608     1625542 :         max = 1;
     609     1625542 :         Advance();
     610     1625542 :         break;
     611             :       case '{':
     612       11602 :         if (ParseIntervalQuantifier(&min, &max)) {
     613       10991 :           if (max < min) {
     614             :             return ReportError(
     615           2 :                 CStrVector("numbers out of order in {} quantifier"));
     616             :           }
     617             :           break;
     618         611 :         } else if (unicode()) {
     619             :           // With /u, incomplete quantifiers are not allowed.
     620         199 :           return ReportError(CStrVector("Incomplete quantifier"));
     621             :         }
     622             :         continue;
     623             :       default:
     624             :         continue;
     625             :     }
     626             :     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
     627     1687850 :     if (current() == '?') {
     628             :       quantifier_type = RegExpQuantifier::NON_GREEDY;
     629        5827 :       Advance();
     630             :     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
     631             :       // FLAG_regexp_possessive_quantifier is a debug-only flag.
     632             :       quantifier_type = RegExpQuantifier::POSSESSIVE;
     633             :       Advance();
     634             :     }
     635     1687850 :     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
     636          20 :       return ReportError(CStrVector("Invalid quantifier"));
     637             :     }
     638             :   }
     639             : }
     640             : 
     641             : 
     642             : #ifdef DEBUG
     643             : // Currently only used in an DCHECK.
     644             : static bool IsSpecialClassEscape(uc32 c) {
     645             :   switch (c) {
     646             :     case 'd':
     647             :     case 'D':
     648             :     case 's':
     649             :     case 'S':
     650             :     case 'w':
     651             :     case 'W':
     652             :       return true;
     653             :     default:
     654             :       return false;
     655             :   }
     656             : }
     657             : #endif
     658             : 
     659             : 
     660             : // In order to know whether an escape is a backreference or not we have to scan
     661             : // the entire regexp and find the number of capturing parentheses.  However we
     662             : // don't want to scan the regexp twice unless it is necessary.  This mini-parser
     663             : // is called when needed.  It can see the difference between capturing and
     664             : // noncapturing parentheses and can skip character classes and backslash-escaped
     665             : // characters.
     666        9291 : void RegExpParser::ScanForCaptures() {
     667             :   DCHECK(!is_scanned_for_captures_);
     668             :   const int saved_position = position();
     669             :   // Start with captures started previous to current position
     670             :   int capture_count = captures_started();
     671             :   // Add count of captures after this position.
     672             :   int n;
     673        7226 :   while ((n = current()) != kEndMarker) {
     674        5300 :     Advance();
     675        5300 :     switch (n) {
     676             :       case '\\':
     677         750 :         Advance();
     678         750 :         break;
     679             :       case '[': {
     680             :         int c;
     681         124 :         while ((c = current()) != kEndMarker) {
     682         124 :           Advance();
     683         124 :           if (c == '\\') {
     684           0 :             Advance();
     685             :           } else {
     686         124 :             if (c == ']') break;
     687             :           }
     688             :         }
     689             :         break;
     690             :       }
     691             :       case '(':
     692         689 :         if (current() == '?') {
     693             :           // At this point we could be in
     694             :           // * a non-capturing group '(:',
     695             :           // * a lookbehind assertion '(?<=' '(?<!'
     696             :           // * or a named capture '(?<'.
     697             :           //
     698             :           // Of these, only named captures are capturing groups.
     699         161 :           if (!FLAG_harmony_regexp_named_captures) break;
     700             : 
     701         161 :           Advance();
     702         161 :           if (current() != '<') break;
     703             : 
     704         128 :           if (FLAG_harmony_regexp_lookbehind) {
     705         128 :             Advance();
     706         128 :             if (current() == '=' || current() == '!') break;
     707             :           }
     708             : 
     709             :           // Found a possible named capture. It could turn out to be a syntax
     710             :           // error (e.g. an unterminated or invalid name), but that distinction
     711             :           // does not matter for our purposes.
     712          75 :           has_named_captures_ = true;
     713             :         }
     714         603 :         capture_count++;
     715         603 :         break;
     716             :     }
     717             :   }
     718         963 :   capture_count_ = capture_count;
     719         963 :   is_scanned_for_captures_ = true;
     720             :   Reset(saved_position);
     721         963 : }
     722             : 
     723             : 
     724       18752 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
     725             :   DCHECK_EQ('\\', current());
     726             :   DCHECK('1' <= Next() && Next() <= '9');
     727             :   // Try to parse a decimal literal that is no greater than the total number
     728             :   // of left capturing parentheses in the input.
     729             :   int start = position();
     730        5998 :   int value = Next() - '0';
     731             :   Advance(2);
     732             :   while (true) {
     733             :     uc32 c = current();
     734        6798 :     if (IsDecimalDigit(c)) {
     735         842 :       value = 10 * value + (c - '0');
     736         842 :       if (value > kMaxCaptures) {
     737             :         Reset(start);
     738          42 :         return false;
     739             :       }
     740         800 :       Advance();
     741             :     } else {
     742             :       break;
     743             :     }
     744             :   }
     745        5956 :   if (value > captures_started()) {
     746        1060 :     if (!is_scanned_for_captures_) ScanForCaptures();
     747        1060 :     if (value > capture_count_) {
     748             :       Reset(start);
     749         576 :       return false;
     750             :     }
     751             :   }
     752        5380 :   *index_out = value;
     753        6180 :   return true;
     754             : }
     755             : 
     756        2594 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
     757        2594 :   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
     758        5108 :     v->push_back(code_unit);
     759             :   } else {
     760          80 :     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
     761          80 :     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
     762             :   }
     763        2594 : }
     764             : 
     765        7131 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
     766             :   DCHECK(FLAG_harmony_regexp_named_captures);
     767             : 
     768             :   ZoneVector<uc16>* name =
     769        2137 :       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
     770             : 
     771             :   bool at_start = true;
     772             :   while (true) {
     773        4731 :     uc32 c = current();
     774        4731 :     Advance();
     775             : 
     776             :     // Convert unicode escapes.
     777        4994 :     if (c == '\\' && current() == 'u') {
     778         245 :       Advance();
     779         245 :       if (!ParseUnicodeEscape(&c)) {
     780          60 :         ReportError(CStrVector("Invalid Unicode escape sequence"));
     781          60 :         return nullptr;
     782             :       }
     783             :     }
     784             : 
     785             :     // The backslash char is misclassified as both ID_Start and ID_Continue.
     786        4671 :     if (c == '\\') {
     787          18 :       ReportError(CStrVector("Invalid capture group name"));
     788          18 :       return nullptr;
     789             :     }
     790             : 
     791        4653 :     if (at_start) {
     792        2104 :       if (!IdentifierStart::Is(c)) {
     793         102 :         ReportError(CStrVector("Invalid capture group name"));
     794         102 :         return nullptr;
     795             :       }
     796        2002 :       push_code_unit(name, c);
     797             :       at_start = false;
     798             :     } else {
     799        2549 :       if (c == '>') {
     800             :         break;
     801         811 :       } else if (IdentifierPart::Is(c)) {
     802         592 :         push_code_unit(name, c);
     803             :       } else {
     804         219 :         ReportError(CStrVector("Invalid capture group name"));
     805         219 :         return nullptr;
     806             :       }
     807             :     }
     808             :   }
     809             : 
     810        4332 :   return name;
     811             : }
     812             : 
     813        1359 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
     814        2312 :                                              int index) {
     815             :   DCHECK(FLAG_harmony_regexp_named_captures);
     816             :   DCHECK(0 < index && index <= captures_started_);
     817             :   DCHECK_NOT_NULL(name);
     818             : 
     819        1359 :   if (named_captures_ == nullptr) {
     820         995 :     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
     821             :   } else {
     822             :     // Check for duplicates and bail if we find any.
     823             :     // TODO(jgruber): O(n^2).
     824        1136 :     for (const auto& named_capture : *named_captures_) {
     825         450 :       if (*named_capture->name() == *name) {
     826          42 :         ReportError(CStrVector("Duplicate capture group name"));
     827          42 :         return false;
     828             :       }
     829             :     }
     830             :   }
     831             : 
     832        1317 :   RegExpCapture* capture = GetCapture(index);
     833             :   DCHECK_NULL(capture->name());
     834             : 
     835             :   capture->set_name(name);
     836        1317 :   named_captures_->Add(capture, zone());
     837             : 
     838        1317 :   return true;
     839             : }
     840             : 
     841         463 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
     842        1399 :                                            RegExpParserState* state) {
     843             :   // The parser is assumed to be on the '<' in \k<name>.
     844         463 :   if (current() != '<') {
     845          42 :     ReportError(CStrVector("Invalid named reference"));
     846          42 :     return false;
     847             :   }
     848             : 
     849         421 :   Advance();
     850         421 :   const ZoneVector<uc16>* name = ParseCaptureGroupName();
     851         421 :   if (name == nullptr) {
     852             :     return false;
     853             :   }
     854             : 
     855         370 :   if (state->IsInsideCaptureGroup(name)) {
     856             :     builder->AddEmpty();
     857             :   } else {
     858         336 :     RegExpBackReference* atom = new (zone()) RegExpBackReference();
     859             :     atom->set_name(name);
     860             : 
     861         336 :     builder->AddAtom(atom);
     862             : 
     863         336 :     if (named_back_references_ == nullptr) {
     864             :       named_back_references_ =
     865         264 :           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
     866             :     }
     867         336 :     named_back_references_->Add(atom, zone());
     868             :   }
     869             : 
     870             :   return true;
     871             : }
     872             : 
     873      395413 : void RegExpParser::PatchNamedBackReferences() {
     874      395896 :   if (named_back_references_ == nullptr) return;
     875             : 
     876         264 :   if (named_captures_ == nullptr) {
     877          15 :     ReportError(CStrVector("Invalid named capture referenced"));
     878          15 :     return;
     879             :   }
     880             : 
     881             :   // Look up and patch the actual capture for each named back reference.
     882             :   // TODO(jgruber): O(n^2), optimize if necessary.
     883             : 
     884         717 :   for (int i = 0; i < named_back_references_->length(); i++) {
     885         740 :     RegExpBackReference* ref = named_back_references_->at(i);
     886             : 
     887             :     int index = -1;
     888         827 :     for (const auto& capture : *named_captures_) {
     889         419 :       if (*capture->name() == *ref->name()) {
     890             :         index = capture->index();
     891         234 :         break;
     892             :       }
     893             :     }
     894             : 
     895         321 :     if (index == -1) {
     896          87 :       ReportError(CStrVector("Invalid named capture referenced"));
     897          87 :       return;
     898             :     }
     899             : 
     900         234 :     ref->set_capture(GetCapture(index));
     901             :   }
     902             : }
     903             : 
     904     2561172 : RegExpCapture* RegExpParser::GetCapture(int index) {
     905             :   // The index for the capture groups are one-based. Its index in the list is
     906             :   // zero-based.
     907             :   int know_captures =
     908     1275564 :       is_scanned_for_captures_ ? capture_count_ : captures_started_;
     909             :   DCHECK(index <= know_captures);
     910     3819924 :   if (captures_ == nullptr) {
     911       16812 :     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
     912             :   }
     913     5088720 :   while (captures_->length() < know_captures) {
     914     2537592 :     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
     915             :   }
     916     2551128 :   return captures_->at(index - 1);
     917             : }
     918             : 
     919      396141 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
     920      398105 :   if (named_captures_ == nullptr || named_captures_->is_empty())
     921      394481 :     return Handle<FixedArray>();
     922             : 
     923             :   Factory* factory = isolate()->factory();
     924             : 
     925         830 :   int len = named_captures_->length() * 2;
     926         830 :   Handle<FixedArray> array = factory->NewFixedArray(len);
     927             : 
     928        3928 :   for (int i = 0; i < named_captures_->length(); i++) {
     929        2268 :     RegExpCapture* capture = named_captures_->at(i);
     930        1134 :     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
     931        2268 :     array->set(i * 2, *name.ToHandleChecked());
     932             :     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
     933             :   }
     934             : 
     935         830 :   return array;
     936             : }
     937             : 
     938           0 : bool RegExpParser::HasNamedCaptures() {
     939         238 :   if (has_named_captures_ || is_scanned_for_captures_) {
     940             :     return has_named_captures_;
     941             :   }
     942             : 
     943         133 :   ScanForCaptures();
     944             :   DCHECK(is_scanned_for_captures_);
     945         133 :   return has_named_captures_;
     946             : }
     947             : 
     948           0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
     949       12651 :   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
     950        6209 :     if (s->group_type() != CAPTURE) continue;
     951             :     // Return true if we found the matching capture index.
     952         751 :     if (index == s->capture_index()) return true;
     953             :     // Abort if index is larger than what has been parsed up till this state.
     954         588 :     if (index > s->capture_index()) return false;
     955             :   }
     956             :   return false;
     957             : }
     958             : 
     959         370 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
     960             :     const ZoneVector<uc16>* name) {
     961             :   DCHECK_NOT_NULL(name);
     962         730 :   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
     963         394 :     if (s->capture_name() == nullptr) continue;
     964          58 :     if (*s->capture_name() == *name) return true;
     965             :   }
     966             :   return false;
     967             : }
     968             : 
     969             : // QuantifierPrefix ::
     970             : //   { DecimalDigits }
     971             : //   { DecimalDigits , }
     972             : //   { DecimalDigits , DecimalDigits }
     973             : //
     974             : // Returns true if parsing succeeds, and set the min_out and max_out
     975             : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
     976       87088 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
     977             :   DCHECK_EQ(current(), '{');
     978             :   int start = position();
     979       12239 :   Advance();
     980             :   int min = 0;
     981       12239 :   if (!IsDecimalDigit(current())) {
     982             :     Reset(start);
     983        1010 :     return false;
     984             :   }
     985       24082 :   while (IsDecimalDigit(current())) {
     986             :     int next = current() - '0';
     987       12991 :     if (min > (RegExpTree::kInfinity - next) / 10) {
     988             :       // Overflow. Skip past remaining decimal digits and return -1.
     989         966 :       do {
     990         966 :         Advance();
     991             :       } while (IsDecimalDigit(current()));
     992             :       min = RegExpTree::kInfinity;
     993             :       break;
     994             :     }
     995       12853 :     min = 10 * min + next;
     996       12853 :     Advance();
     997             :   }
     998             :   int max = 0;
     999       11229 :   if (current() == '}') {
    1000             :     max = min;
    1001        4780 :     Advance();
    1002        6449 :   } else if (current() == ',') {
    1003        6367 :     Advance();
    1004        6367 :     if (current() == '}') {
    1005             :       max = RegExpTree::kInfinity;
    1006         352 :       Advance();
    1007             :     } else {
    1008       13051 :       while (IsDecimalDigit(current())) {
    1009             :         int next = current() - '0';
    1010        7108 :         if (max > (RegExpTree::kInfinity - next) / 10) {
    1011         900 :           do {
    1012         900 :             Advance();
    1013             :           } while (IsDecimalDigit(current()));
    1014             :           max = RegExpTree::kInfinity;
    1015             :           break;
    1016             :         }
    1017        7036 :         max = 10 * max + next;
    1018        7036 :         Advance();
    1019             :       }
    1020        6015 :       if (current() != '}') {
    1021             :         Reset(start);
    1022         127 :         return false;
    1023             :       }
    1024        5888 :       Advance();
    1025             :     }
    1026             :   } else {
    1027             :     Reset(start);
    1028          82 :     return false;
    1029             :   }
    1030       11020 :   *min_out = min;
    1031       11020 :   *max_out = max;
    1032       11020 :   return true;
    1033             : }
    1034             : 
    1035             : 
    1036       11855 : uc32 RegExpParser::ParseOctalLiteral() {
    1037             :   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
    1038             :   // For compatibility with some other browsers (not all), we parse
    1039             :   // up to three octal digits with a value below 256.
    1040             :   // ES#prod-annexB-LegacyOctalEscapeSequence
    1041        4497 :   uc32 value = current() - '0';
    1042        4497 :   Advance();
    1043        4497 :   if ('0' <= current() && current() <= '7') {
    1044        1528 :     value = value * 8 + current() - '0';
    1045        1528 :     Advance();
    1046        3017 :     if (value < 32 && '0' <= current() && current() <= '7') {
    1047        2744 :       value = value * 8 + current() - '0';
    1048        1372 :       Advance();
    1049             :     }
    1050             :   }
    1051        4497 :   return value;
    1052             : }
    1053             : 
    1054             : 
    1055       72550 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
    1056             :   int start = position();
    1057             :   uc32 val = 0;
    1058       72127 :   for (int i = 0; i < length; ++i) {
    1059             :     uc32 c = current();
    1060             :     int d = HexValue(c);
    1061       57071 :     if (d < 0) {
    1062             :       Reset(start);
    1063         423 :       return false;
    1064             :     }
    1065       56648 :     val = val * 16 + d;
    1066       56648 :     Advance();
    1067             :   }
    1068       15056 :   *value = val;
    1069       15056 :   return true;
    1070             : }
    1071             : 
    1072             : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
    1073       33251 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
    1074             :   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
    1075             :   // allowed). In the latter case, the number of hex digits between { } is
    1076             :   // arbitrary. \ and u have already been read.
    1077       16506 :   if (current() == '{' && unicode()) {
    1078             :     int start = position();
    1079        1609 :     Advance();
    1080        1609 :     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
    1081        1582 :       if (current() == '}') {
    1082        1576 :         Advance();
    1083        1576 :         return true;
    1084             :       }
    1085             :     }
    1086             :     Reset(start);
    1087          33 :     return false;
    1088             :   }
    1089             :   // \u but no {, or \u{...} escapes not allowed.
    1090       13198 :   bool result = ParseHexEscape(4, value);
    1091       37374 :   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
    1092             :       current() == '\\') {
    1093             :     // Attempt to read trail surrogate.
    1094             :     int start = position();
    1095         206 :     if (Next() == 'u') {
    1096             :       Advance(2);
    1097             :       uc32 trail;
    1098         382 :       if (ParseHexEscape(4, &trail) &&
    1099         186 :           unibrow::Utf16::IsTrailSurrogate(trail)) {
    1100             :         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
    1101         372 :                                                       static_cast<uc16>(trail));
    1102         186 :         return true;
    1103             :       }
    1104             :     }
    1105             :     Reset(start);
    1106             :   }
    1107       13012 :   return result;
    1108             : }
    1109             : 
    1110             : #ifdef V8_INTL_SUPPORT
    1111             : 
    1112             : namespace {
    1113             : 
    1114        3796 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
    1115        3796 :   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
    1116        3796 :   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
    1117             :     return true;
    1118             :   for (int i = 0;; i++) {
    1119             :     const char* long_name = u_getPropertyName(
    1120         758 :         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1121         758 :     if (long_name == nullptr) break;
    1122         686 :     if (strcmp(property_name, long_name) == 0) return true;
    1123             :   }
    1124             :   return false;
    1125             : }
    1126             : 
    1127        4037 : bool IsExactPropertyValueAlias(const char* property_value_name,
    1128             :                                UProperty property, int32_t property_value) {
    1129             :   const char* short_name =
    1130        4037 :       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
    1131        4037 :   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
    1132             :     return true;
    1133             :   }
    1134             :   for (int i = 0;; i++) {
    1135             :     const char* long_name = u_getPropertyValueName(
    1136             :         property, property_value,
    1137         337 :         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1138         337 :     if (long_name == nullptr) break;
    1139         328 :     if (strcmp(property_value_name, long_name) == 0) return true;
    1140             :   }
    1141             :   return false;
    1142             : }
    1143             : 
    1144        5620 : bool LookupPropertyValueName(UProperty property,
    1145             :                              const char* property_value_name, bool negate,
    1146             :                              ZoneList<CharacterRange>* result, Zone* zone) {
    1147             :   UProperty property_for_lookup = property;
    1148        5620 :   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
    1149             :     // For the property Script_Extensions, we have to do the property value
    1150             :     // name lookup as if the property is Script.
    1151             :     property_for_lookup = UCHAR_SCRIPT;
    1152             :   }
    1153             :   int32_t property_value =
    1154        5620 :       u_getPropertyValueEnum(property_for_lookup, property_value_name);
    1155        5620 :   if (property_value == UCHAR_INVALID_CODE) return false;
    1156             : 
    1157             :   // We require the property name to match exactly to one of the property value
    1158             :   // aliases. However, u_getPropertyValueEnum uses loose matching.
    1159        4037 :   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
    1160        4037 :                                  property_value)) {
    1161             :     return false;
    1162             :   }
    1163             : 
    1164        4028 :   UErrorCode ec = U_ZERO_ERROR;
    1165        4028 :   icu::UnicodeSet set;
    1166        4028 :   set.applyIntPropertyValue(property, property_value, ec);
    1167        4028 :   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
    1168             : 
    1169        4028 :   if (success) {
    1170        4028 :     set.removeAllStrings();
    1171        4028 :     if (negate) set.complement();
    1172      480345 :     for (int i = 0; i < set.getRangeCount(); i++) {
    1173             :       result->Add(
    1174      480345 :           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
    1175      480345 :           zone);
    1176             :     }
    1177             :   }
    1178        4028 :   return success;
    1179             : }
    1180             : 
    1181             : template <size_t N>
    1182             : inline bool NameEquals(const char* name, const char (&literal)[N]) {
    1183        4551 :   return strncmp(name, literal, N + 1) == 0;
    1184             : }
    1185             : 
    1186        1592 : bool LookupSpecialPropertyValueName(const char* name,
    1187             :                                     ZoneList<CharacterRange>* result,
    1188             :                                     bool negate, Zone* zone) {
    1189        1592 :   if (NameEquals(name, "Any")) {
    1190          60 :     if (!negate) result->Add(CharacterRange::Everything(), zone);
    1191        1532 :   } else if (NameEquals(name, "ASCII")) {
    1192             :     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
    1193             :                        : CharacterRange::Range(0x0, 0x7f),
    1194         105 :                 zone);
    1195        1427 :   } else if (NameEquals(name, "Assigned")) {
    1196             :     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
    1197          75 :                                    !negate, result, zone);
    1198             :   } else {
    1199             :     return false;
    1200             :   }
    1201             :   return true;
    1202             : }
    1203             : 
    1204             : // Explicitly whitelist supported binary properties. The spec forbids supporting
    1205             : // properties outside of this set to ensure interoperability.
    1206             : bool IsSupportedBinaryProperty(UProperty property) {
    1207             :   switch (property) {
    1208             :     case UCHAR_ALPHABETIC:
    1209             :     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
    1210             :     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
    1211             :     case UCHAR_ASCII_HEX_DIGIT:
    1212             :     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
    1213             :     case UCHAR_BIDI_CONTROL:
    1214             :     case UCHAR_BIDI_MIRRORED:
    1215             :     case UCHAR_CASE_IGNORABLE:
    1216             :     case UCHAR_CASED:
    1217             :     case UCHAR_CHANGES_WHEN_CASEFOLDED:
    1218             :     case UCHAR_CHANGES_WHEN_CASEMAPPED:
    1219             :     case UCHAR_CHANGES_WHEN_LOWERCASED:
    1220             :     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    1221             :     case UCHAR_CHANGES_WHEN_TITLECASED:
    1222             :     case UCHAR_CHANGES_WHEN_UPPERCASED:
    1223             :     case UCHAR_DASH:
    1224             :     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    1225             :     case UCHAR_DEPRECATED:
    1226             :     case UCHAR_DIACRITIC:
    1227             :     case UCHAR_EMOJI:
    1228             :     // TODO(yangguo): Uncomment this once we upgrade to ICU 60.
    1229             :     //                See https://ssl.icu-project.org/trac/ticket/13062
    1230             :     // case UCHAR_EMOJI_COMPONENT:
    1231             :     case UCHAR_EMOJI_MODIFIER_BASE:
    1232             :     case UCHAR_EMOJI_MODIFIER:
    1233             :     case UCHAR_EMOJI_PRESENTATION:
    1234             :     case UCHAR_EXTENDER:
    1235             :     case UCHAR_GRAPHEME_BASE:
    1236             :     case UCHAR_GRAPHEME_EXTEND:
    1237             :     case UCHAR_HEX_DIGIT:
    1238             :     case UCHAR_ID_CONTINUE:
    1239             :     case UCHAR_ID_START:
    1240             :     case UCHAR_IDEOGRAPHIC:
    1241             :     case UCHAR_IDS_BINARY_OPERATOR:
    1242             :     case UCHAR_IDS_TRINARY_OPERATOR:
    1243             :     case UCHAR_JOIN_CONTROL:
    1244             :     case UCHAR_LOGICAL_ORDER_EXCEPTION:
    1245             :     case UCHAR_LOWERCASE:
    1246             :     case UCHAR_MATH:
    1247             :     case UCHAR_NONCHARACTER_CODE_POINT:
    1248             :     case UCHAR_PATTERN_SYNTAX:
    1249             :     case UCHAR_PATTERN_WHITE_SPACE:
    1250             :     case UCHAR_QUOTATION_MARK:
    1251             :     case UCHAR_RADICAL:
    1252             :     case UCHAR_S_TERM:
    1253             :     case UCHAR_SOFT_DOTTED:
    1254             :     case UCHAR_TERMINAL_PUNCTUATION:
    1255             :     case UCHAR_UNIFIED_IDEOGRAPH:
    1256             :     case UCHAR_UPPERCASE:
    1257             :     case UCHAR_VARIATION_SELECTOR:
    1258             :     case UCHAR_WHITE_SPACE:
    1259             :     case UCHAR_XID_CONTINUE:
    1260             :     case UCHAR_XID_START:
    1261             :       return true;
    1262             :     default:
    1263             :       break;
    1264             :   }
    1265             :   return false;
    1266             : }
    1267             : 
    1268             : }  // anonymous namespace
    1269             : 
    1270        4994 : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
    1271       58406 :                                       bool negate) {
    1272             :   // Parse the property class as follows:
    1273             :   // - In \p{name}, 'name' is interpreted
    1274             :   //   - either as a general category property value name.
    1275             :   //   - or as a binary property name.
    1276             :   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
    1277             :   //   and 'value' is interpreted as one of the available property value names.
    1278             :   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
    1279             :   // - Loose matching is not applied.
    1280             :   std::vector<char> first_part;
    1281             :   std::vector<char> second_part;
    1282        4994 :   if (current() == '{') {
    1283             :     // Parse \p{[PropertyName=]PropertyNameValue}
    1284       62070 :     for (Advance(); current() != '}' && current() != '='; Advance()) {
    1285       26149 :       if (!has_next()) return false;
    1286       52244 :       first_part.push_back(static_cast<char>(current()));
    1287             :     }
    1288        4886 :     if (current() == '=') {
    1289       30480 :       for (Advance(); current() != '}'; Advance()) {
    1290       12319 :         if (!has_next()) return false;
    1291       24638 :         second_part.push_back(static_cast<char>(current()));
    1292             :       }
    1293        5842 :       second_part.push_back(0);  // null-terminate string.
    1294             :     }
    1295             :   } else {
    1296             :     return false;
    1297             :   }
    1298        4886 :   Advance();
    1299        9772 :   first_part.push_back(0);  // null-terminate string.
    1300             : 
    1301        4886 :   if (second_part.empty()) {
    1302             :     // First attempt to interpret as general category property value name.
    1303             :     const char* name = first_part.data();
    1304        1965 :     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
    1305        1965 :                                 result, zone())) {
    1306             :       return true;
    1307             :     }
    1308             :     // Interpret "Any", "ASCII", and "Assigned".
    1309        1592 :     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
    1310             :       return true;
    1311             :     }
    1312             :     // Then attempt to interpret as binary property name with value name 'Y'.
    1313        1352 :     UProperty property = u_getPropertyEnum(name);
    1314        1352 :     if (!IsSupportedBinaryProperty(property)) return false;
    1315         875 :     if (!IsExactPropertyAlias(name, property)) return false;
    1316             :     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
    1317         875 :                                    zone());
    1318             :   } else {
    1319             :     // Both property name and value name are specified. Attempt to interpret
    1320             :     // the property name as enumerated property.
    1321             :     const char* property_name = first_part.data();
    1322             :     const char* value_name = second_part.data();
    1323        2921 :     UProperty property = u_getPropertyEnum(property_name);
    1324        2921 :     if (!IsExactPropertyAlias(property_name, property)) return false;
    1325        2849 :     if (property == UCHAR_GENERAL_CATEGORY) {
    1326             :       // We want to allow aggregate value names such as "Letter".
    1327             :       property = UCHAR_GENERAL_CATEGORY_MASK;
    1328        5618 :     } else if (property != UCHAR_SCRIPT &&
    1329        2809 :                property != UCHAR_SCRIPT_EXTENSIONS) {
    1330             :       return false;
    1331             :     }
    1332             :     return LookupPropertyValueName(property, value_name, negate, result,
    1333        2705 :                                    zone());
    1334             :   }
    1335             : }
    1336             : 
    1337             : #else  // V8_INTL_SUPPORT
    1338             : 
    1339             : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
    1340             :                                       bool negate) {
    1341             :   return false;
    1342             : }
    1343             : 
    1344             : #endif  // V8_INTL_SUPPORT
    1345             : 
    1346        9334 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
    1347             :   uc32 x = 0;
    1348             :   int d = HexValue(current());
    1349        1609 :   if (d < 0) {
    1350             :     return false;
    1351             :   }
    1352        9334 :   while (d >= 0) {
    1353        7752 :     x = x * 16 + d;
    1354        7752 :     if (x > max_value) {
    1355             :       return false;
    1356             :     }
    1357        7725 :     Advance();
    1358             :     d = HexValue(current());
    1359             :   }
    1360        1582 :   *value = x;
    1361        1582 :   return true;
    1362             : }
    1363             : 
    1364             : 
    1365       44462 : uc32 RegExpParser::ParseClassCharacterEscape() {
    1366             :   DCHECK_EQ('\\', current());
    1367             :   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
    1368       19598 :   Advance();
    1369       19598 :   switch (current()) {
    1370             :     case 'b':
    1371          14 :       Advance();
    1372          14 :       return '\b';
    1373             :     // ControlEscape :: one of
    1374             :     //   f n r t v
    1375             :     case 'f':
    1376           0 :       Advance();
    1377           0 :       return '\f';
    1378             :     case 'n':
    1379         475 :       Advance();
    1380         475 :       return '\n';
    1381             :     case 'r':
    1382          50 :       Advance();
    1383          50 :       return '\r';
    1384             :     case 't':
    1385         186 :       Advance();
    1386         186 :       return '\t';
    1387             :     case 'v':
    1388          11 :       Advance();
    1389          11 :       return '\v';
    1390             :     case 'c': {
    1391             :       uc32 controlLetter = Next();
    1392         386 :       uc32 letter = controlLetter & ~('A' ^ 'a');
    1393             :       // Inside a character class, we also accept digits and underscore as
    1394             :       // control characters, unless with /u. See Annex B:
    1395             :       // ES#prod-annexB-ClassControlLetter
    1396         386 :       if (letter >= 'A' && letter <= 'Z') {
    1397             :         Advance(2);
    1398             :         // Control letters mapped to ASCII control characters in the range
    1399             :         // 0x00-0x1f.
    1400          60 :         return controlLetter & 0x1f;
    1401             :       }
    1402         326 :       if (unicode()) {
    1403             :         // With /u, invalid escapes are not treated as identity escapes.
    1404          18 :         ReportError(CStrVector("Invalid class escape"));
    1405          18 :         return 0;
    1406             :       }
    1407         616 :       if ((controlLetter >= '0' && controlLetter <= '9') ||
    1408         308 :           controlLetter == '_') {
    1409             :         Advance(2);
    1410         202 :         return controlLetter & 0x1f;
    1411             :       }
    1412             :       // We match JSC in reading the backslash as a literal
    1413             :       // character instead of as starting an escape.
    1414             :       // TODO(v8:6201): Not yet covered by the spec.
    1415             :       return '\\';
    1416             :     }
    1417             :     case '0':
    1418             :       // With /u, \0 is interpreted as NUL if not followed by another digit.
    1419        1101 :       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
    1420          35 :         Advance();
    1421          35 :         return 0;
    1422             :       }
    1423             :     // Fall through.
    1424             :     case '1':
    1425             :     case '2':
    1426             :     case '3':
    1427             :     case '4':
    1428             :     case '5':
    1429             :     case '6':
    1430             :     case '7':
    1431             :       // For compatibility, we interpret a decimal escape that isn't
    1432             :       // a back reference (and therefore either \0 or not valid according
    1433             :       // to the specification) as a 1..3 digit octal character code.
    1434             :       // ES#prod-annexB-LegacyOctalEscapeSequence
    1435        1166 :       if (unicode()) {
    1436             :         // With /u, decimal escape is not interpreted as octal character code.
    1437          36 :         ReportError(CStrVector("Invalid class escape"));
    1438          36 :         return 0;
    1439             :       }
    1440        1130 :       return ParseOctalLiteral();
    1441             :     case 'x': {
    1442        1799 :       Advance();
    1443             :       uc32 value;
    1444        1799 :       if (ParseHexEscape(2, &value)) return value;
    1445          30 :       if (unicode()) {
    1446             :         // With /u, invalid escapes are not treated as identity escapes.
    1447           0 :         ReportError(CStrVector("Invalid escape"));
    1448           0 :         return 0;
    1449             :       }
    1450             :       // If \x is not followed by a two-digit hexadecimal, treat it
    1451             :       // as an identity escape.
    1452             :       return 'x';
    1453             :     }
    1454             :     case 'u': {
    1455       12744 :       Advance();
    1456             :       uc32 value;
    1457       12744 :       if (ParseUnicodeEscape(&value)) return value;
    1458          20 :       if (unicode()) {
    1459             :         // With /u, invalid escapes are not treated as identity escapes.
    1460           0 :         ReportError(CStrVector("Invalid unicode escape"));
    1461           0 :         return 0;
    1462             :       }
    1463             :       // If \u is not followed by a two-digit hexadecimal, treat it
    1464             :       // as an identity escape.
    1465             :       return 'u';
    1466             :     }
    1467             :     default: {
    1468             :       uc32 result = current();
    1469             :       // With /u, no identity escapes except for syntax characters and '-' are
    1470             :       // allowed. Otherwise, all identity escapes are allowed.
    1471        2786 :       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
    1472        2723 :         Advance();
    1473        2723 :         return result;
    1474             :       }
    1475           9 :       ReportError(CStrVector("Invalid escape"));
    1476           9 :       return 0;
    1477             :     }
    1478             :   }
    1479             :   return 0;
    1480             : }
    1481             : 
    1482      428021 : void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
    1483             :                                     Zone* zone,
    1484             :                                     bool add_unicode_case_equivalents,
    1485      428364 :                                     uc32* char_out, bool* is_class_escape) {
    1486             :   uc32 current_char = current();
    1487      428021 :   if (current_char == '\\') {
    1488       21637 :     switch (Next()) {
    1489             :       case 'w':
    1490             :       case 'W':
    1491             :       case 'd':
    1492             :       case 'D':
    1493             :       case 's':
    1494             :       case 'S': {
    1495             :         CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
    1496        3392 :                                        add_unicode_case_equivalents, zone);
    1497             :         Advance(2);
    1498        1696 :         *is_class_escape = true;
    1499        1696 :         return;
    1500             :       }
    1501             :       case kEndMarker:
    1502           0 :         ReportError(CStrVector("\\ at end of pattern"));
    1503           0 :         return;
    1504             :       case 'p':
    1505             :       case 'P':
    1506         757 :         if (FLAG_harmony_regexp_property && unicode()) {
    1507         343 :           bool negate = Next() == 'P';
    1508             :           Advance(2);
    1509         343 :           if (!ParsePropertyClass(ranges, negate)) {
    1510          45 :             ReportError(CStrVector("Invalid property name in character class"));
    1511             :           }
    1512         343 :           *is_class_escape = true;
    1513         343 :           return;
    1514             :         }
    1515             :         break;
    1516             :       default:
    1517             :         break;
    1518             :     }
    1519       19598 :     *char_out = ParseClassCharacterEscape();
    1520       19598 :     *is_class_escape = false;
    1521             :   } else {
    1522      406384 :     Advance();
    1523      406384 :     *char_out = current_char;
    1524      406384 :     *is_class_escape = false;
    1525             :   }
    1526             : }
    1527             : 
    1528     2913280 : RegExpTree* RegExpParser::ParseCharacterClass() {
    1529             :   static const char* kUnterminated = "Unterminated character class";
    1530             :   static const char* kRangeInvalid = "Invalid character class";
    1531             :   static const char* kRangeOutOfOrder = "Range out of order in character class";
    1532             : 
    1533             :   DCHECK_EQ(current(), '[');
    1534      164331 :   Advance();
    1535             :   bool is_negated = false;
    1536      164331 :   if (current() == '^') {
    1537             :     is_negated = true;
    1538       10674 :     Advance();
    1539             :   }
    1540      164127 :   ZoneList<CharacterRange>* ranges =
    1541      164331 :       new (zone()) ZoneList<CharacterRange>(2, zone());
    1542      165515 :   bool add_unicode_case_equivalents = unicode() && ignore_case();
    1543      987713 :   while (has_more() && current() != ']') {
    1544             :     uc32 char_1, char_2;
    1545             :     bool is_class_1, is_class_2;
    1546             :     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
    1547      495886 :                      &is_class_1 CHECK_FAILED);
    1548      247742 :     if (current() == '-') {
    1549      180469 :       Advance();
    1550      180469 :       if (current() == kEndMarker) {
    1551             :         // If we reach the end we break out of the loop and let the
    1552             :         // following code report an error.
    1553             :         break;
    1554      180463 :       } else if (current() == ']') {
    1555         562 :         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1556         292 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1557         292 :         break;
    1558             :       }
    1559             :       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
    1560      180171 :                        &is_class_2 CHECK_FAILED);
    1561      180171 :       if (is_class_1 || is_class_2) {
    1562             :         // Either end is an escaped character class. Treat the '-' verbatim.
    1563         211 :         if (unicode()) {
    1564             :           // ES2015 21.2.2.15.1 step 1.
    1565          54 :           return ReportError(CStrVector(kRangeInvalid));
    1566             :         }
    1567         216 :         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1568         184 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1569         314 :         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
    1570         184 :         continue;
    1571             :       }
    1572             :       // ES2015 21.2.2.15.1 step 6.
    1573      179960 :       if (char_1 > char_2) {
    1574         102 :         return ReportError(CStrVector(kRangeOutOfOrder));
    1575             :       }
    1576      179909 :       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
    1577             :     } else {
    1578      132816 :       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1579             :     }
    1580             :   }
    1581      164145 :   if (!has_more()) {
    1582          36 :     return ReportError(CStrVector(kUnterminated));
    1583             :   }
    1584      164127 :   Advance();
    1585      164127 :   if (ranges->length() == 0) {
    1586         315 :     ranges->Add(CharacterRange::Everything(), zone());
    1587         315 :     is_negated = !is_negated;
    1588             :   }
    1589             :   RegExpCharacterClass::Flags flags;
    1590      164127 :   if (is_negated) flags = RegExpCharacterClass::NEGATED;
    1591             :   return new (zone()) RegExpCharacterClass(ranges, flags);
    1592             : }
    1593             : 
    1594             : 
    1595             : #undef CHECK_FAILED
    1596             : 
    1597             : 
    1598      397989 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
    1599             :                                FlatStringReader* input, JSRegExp::Flags flags,
    1600             :                                RegExpCompileData* result) {
    1601             :   DCHECK(result != nullptr);
    1602      397989 :   RegExpParser parser(input, &result->error, flags, isolate, zone);
    1603      397989 :   RegExpTree* tree = parser.ParsePattern();
    1604      397989 :   if (parser.failed()) {
    1605             :     DCHECK(tree == nullptr);
    1606             :     DCHECK(!result->error.is_null());
    1607             :   } else {
    1608             :     DCHECK(tree != nullptr);
    1609             :     DCHECK(result->error.is_null());
    1610             :     if (FLAG_trace_regexp_parser) {
    1611             :       OFStream os(stdout);
    1612             :       tree->Print(os, zone);
    1613             :       os << "\n";
    1614             :     }
    1615      395311 :     result->tree = tree;
    1616      395311 :     int capture_count = parser.captures_started();
    1617      606783 :     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
    1618      395311 :     result->contains_anchor = parser.contains_anchor();
    1619      395311 :     result->capture_name_map = parser.CreateCaptureNameMap();
    1620      395311 :     result->capture_count = capture_count;
    1621             :   }
    1622      397989 :   return !parser.failed();
    1623             : }
    1624             : 
    1625           0 : RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
    1626             :     : zone_(zone),
    1627             :       pending_empty_(false),
    1628             :       ignore_case_(ignore_case),
    1629             :       unicode_(unicode),
    1630             :       characters_(nullptr),
    1631             :       pending_surrogate_(kNoPendingSurrogate),
    1632             :       terms_(),
    1633     1699012 :       alternatives_()
    1634             : #ifdef DEBUG
    1635             :       ,
    1636             :       last_added_(ADD_NONE)
    1637             : #endif
    1638             : {
    1639           0 : }
    1640             : 
    1641             : 
    1642           0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
    1643             :   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1644             :   FlushPendingSurrogate();
    1645             :   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
    1646         785 :   pending_surrogate_ = lead_surrogate;
    1647           0 : }
    1648             : 
    1649             : 
    1650        2725 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
    1651             :   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
    1652         721 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1653         531 :     uc16 lead_surrogate = pending_surrogate_;
    1654         531 :     pending_surrogate_ = kNoPendingSurrogate;
    1655             :     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1656             :     uc32 combined =
    1657         531 :         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
    1658         531 :     if (NeedsDesugaringForIgnoreCase(combined)) {
    1659          30 :       AddCharacterClassForDesugaring(combined);
    1660             :     } else {
    1661         501 :       ZoneList<uc16> surrogate_pair(2, zone());
    1662         501 :       surrogate_pair.Add(lead_surrogate, zone());
    1663         501 :       surrogate_pair.Add(trail_surrogate, zone());
    1664             :       RegExpAtom* atom =
    1665         501 :           new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
    1666         501 :       AddAtom(atom);
    1667             :     }
    1668             :   } else {
    1669         190 :     pending_surrogate_ = trail_surrogate;
    1670             :     FlushPendingSurrogate();
    1671             :   }
    1672         721 : }
    1673             : 
    1674             : 
    1675           0 : void RegExpBuilder::FlushPendingSurrogate() {
    1676   432448584 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1677             :     DCHECK(unicode());
    1678         444 :     uc32 c = pending_surrogate_;
    1679         444 :     pending_surrogate_ = kNoPendingSurrogate;
    1680         444 :     AddCharacterClassForDesugaring(c);
    1681             :   }
    1682           0 : }
    1683             : 
    1684             : 
    1685     5879995 : void RegExpBuilder::FlushCharacters() {
    1686             :   FlushPendingSurrogate();
    1687     5041373 :   pending_empty_ = false;
    1688     5041373 :   if (characters_ != nullptr) {
    1689             :     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
    1690      419311 :     characters_ = nullptr;
    1691      419311 :     text_.Add(atom, zone());
    1692             :     LAST(ADD_ATOM);
    1693             :   }
    1694     5041373 : }
    1695             : 
    1696             : 
    1697     5438067 : void RegExpBuilder::FlushText() {
    1698     4832029 :   FlushCharacters();
    1699             :   int num_text = text_.length();
    1700     4832029 :   if (num_text == 0) {
    1701     4832029 :     return;
    1702      584270 :   } else if (num_text == 1) {
    1703      579026 :     terms_.Add(text_.last(), zone());
    1704             :   } else {
    1705             :     RegExpText* text = new (zone()) RegExpText(zone());
    1706       38292 :     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
    1707        5244 :     terms_.Add(text, zone());
    1708             :   }
    1709             :   text_.Clear();
    1710             : }
    1711             : 
    1712             : 
    1713   853484715 : void RegExpBuilder::AddCharacter(uc16 c) {
    1714             :   FlushPendingSurrogate();
    1715   425715188 :   pending_empty_ = false;
    1716   425715188 :   if (NeedsDesugaringForIgnoreCase(c)) {
    1717         215 :     AddCharacterClassForDesugaring(c);
    1718             :   } else {
    1719   425714973 :     if (characters_ == nullptr) {
    1720     2054554 :       characters_ = new (zone()) ZoneList<uc16>(4, zone());
    1721             :     }
    1722   425714973 :     characters_->Add(c, zone());
    1723             :     LAST(ADD_CHAR);
    1724             :   }
    1725   425715188 : }
    1726             : 
    1727             : 
    1728   851367841 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
    1729   425684186 :   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    1730             :     DCHECK(unicode());
    1731         531 :     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
    1732         531 :     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
    1733   425686100 :   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
    1734         254 :     AddLeadSurrogate(c);
    1735   425685592 :   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
    1736         190 :     AddTrailSurrogate(c);
    1737             :   } else {
    1738   425683211 :     AddCharacter(static_cast<uc16>(c));
    1739             :   }
    1740   425684186 : }
    1741             : 
    1742        1599 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
    1743             :   // A lead or trail surrogate parsed via escape sequence will not
    1744             :   // pair up with any preceding lead or following trail surrogate.
    1745             :   FlushPendingSurrogate();
    1746        1599 :   AddUnicodeCharacter(character);
    1747             :   FlushPendingSurrogate();
    1748        1599 : }
    1749             : 
    1750         197 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
    1751             : 
    1752             : 
    1753      213666 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
    1754      213666 :   if (NeedsDesugaringForUnicode(cc)) {
    1755             :     // With /u, character class needs to be desugared, so it
    1756             :     // must be a standalone term instead of being part of a RegExpText.
    1757        4823 :     AddTerm(cc);
    1758             :   } else {
    1759      208843 :     AddAtom(cc);
    1760             :   }
    1761      213666 : }
    1762             : 
    1763        1378 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
    1764             :   AddTerm(new (zone()) RegExpCharacterClass(
    1765        1378 :       CharacterRange::List(zone(), CharacterRange::Singleton(c))));
    1766         689 : }
    1767             : 
    1768             : 
    1769     3031374 : void RegExpBuilder::AddAtom(RegExpTree* term) {
    1770     1515687 :   if (term->IsEmpty()) {
    1771             :     AddEmpty();
    1772     1515687 :     return;
    1773             :   }
    1774     1515687 :   if (term->IsTextElement()) {
    1775      209344 :     FlushCharacters();
    1776      209344 :     text_.Add(term, zone());
    1777             :   } else {
    1778     1306343 :     FlushText();
    1779     1306343 :     terms_.Add(term, zone());
    1780             :   }
    1781             :   LAST(ADD_ATOM);
    1782             : }
    1783             : 
    1784             : 
    1785       11024 : void RegExpBuilder::AddTerm(RegExpTree* term) {
    1786        5512 :   FlushText();
    1787        5512 :   terms_.Add(term, zone());
    1788             :   LAST(ADD_ATOM);
    1789        5512 : }
    1790             : 
    1791             : 
    1792       43886 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
    1793       22039 :   FlushText();
    1794       22039 :   if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
    1795             :     // Omit repeated assertions of the same type.
    1796         738 :     RegExpAssertion* last = terms_.last()->AsAssertion();
    1797         738 :     RegExpAssertion* next = assert->AsAssertion();
    1798       22408 :     if (last->assertion_type() == next->assertion_type()) return;
    1799             :   }
    1800       21847 :   terms_.Add(assert, zone());
    1801             :   LAST(ADD_ASSERT);
    1802             : }
    1803             : 
    1804             : 
    1805      132851 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
    1806             : 
    1807             : 
    1808     4185624 : void RegExpBuilder::FlushTerms() {
    1809     1829379 :   FlushText();
    1810             :   int num_terms = terms_.length();
    1811             :   RegExpTree* alternative;
    1812     1829379 :   if (num_terms == 0) {
    1813             :     alternative = new (zone()) RegExpEmpty();
    1814     1433721 :   } else if (num_terms == 1) {
    1815             :     alternative = terms_.last();
    1816             :   } else {
    1817      131208 :     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
    1818             :   }
    1819     1829379 :   alternatives_.Add(alternative, zone());
    1820             :   terms_.Clear();
    1821             :   LAST(ADD_NONE);
    1822     1829379 : }
    1823             : 
    1824             : 
    1825      225956 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
    1826      213666 :   if (!unicode()) return false;
    1827             :   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
    1828             :   // necessarily mean that we need to desugar. It's probably nicer to have a
    1829             :   // separate pass to figure out unicode desugarings.
    1830        6302 :   if (ignore_case()) return true;
    1831        5988 :   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    1832        5988 :   CharacterRange::Canonicalize(ranges);
    1833       16628 :   for (int i = ranges->length() - 1; i >= 0; i--) {
    1834       15149 :     uc32 from = ranges->at(i).from();
    1835       15149 :     uc32 to = ranges->at(i).to();
    1836             :     // Check for non-BMP characters.
    1837       15149 :     if (to >= kNonBmpStart) return true;
    1838             :     // Check for lone surrogates.
    1839       10750 :     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
    1840             :   }
    1841             :   return false;
    1842             : }
    1843             : 
    1844             : 
    1845   425718266 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
    1846             : #ifdef V8_INTL_SUPPORT
    1847   425718266 :   if (unicode() && ignore_case()) {
    1848         260 :     icu::UnicodeSet set(c, c);
    1849         260 :     set.closeOver(USET_CASE_INSENSITIVE);
    1850         260 :     set.removeAllStrings();
    1851         260 :     return set.size() > 1;
    1852             :   }
    1853             :   // In the case where ICU is not included, we act as if the unicode flag is
    1854             :   // not set, and do not desugar.
    1855             : #endif  // V8_INTL_SUPPORT
    1856             :   return false;
    1857             : }
    1858             : 
    1859             : 
    1860     1749078 : RegExpTree* RegExpBuilder::ToRegExp() {
    1861     1696528 :   FlushTerms();
    1862             :   int num_alternatives = alternatives_.length();
    1863     1696528 :   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
    1864     1696528 :   if (num_alternatives == 1) return alternatives_.last();
    1865       52550 :   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
    1866             : }
    1867             : 
    1868     1687850 : bool RegExpBuilder::AddQuantifierToAtom(
    1869     3342405 :     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
    1870             :   FlushPendingSurrogate();
    1871     1687850 :   if (pending_empty_) {
    1872           2 :     pending_empty_ = false;
    1873           2 :     return true;
    1874             :   }
    1875             :   RegExpTree* atom;
    1876     1687848 :   if (characters_ != nullptr) {
    1877             :     DCHECK(last_added_ == ADD_CHAR);
    1878             :     // Last atom was character.
    1879             :     Vector<const uc16> char_vector = characters_->ToConstVector();
    1880             :     int num_chars = char_vector.length();
    1881     1634995 :     if (num_chars > 1) {
    1882         674 :       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
    1883         674 :       text_.Add(new (zone()) RegExpAtom(prefix), zone());
    1884             :       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
    1885             :     }
    1886     1634995 :     characters_ = nullptr;
    1887             :     atom = new (zone()) RegExpAtom(char_vector);
    1888     1634995 :     FlushText();
    1889       52853 :   } else if (text_.length() > 0) {
    1890             :     DCHECK(last_added_ == ADD_ATOM);
    1891       33761 :     atom = text_.RemoveLast();
    1892       33761 :     FlushText();
    1893       19092 :   } else if (terms_.length() > 0) {
    1894             :     DCHECK(last_added_ == ADD_ATOM);
    1895       19092 :     atom = terms_.RemoveLast();
    1896             :     // With /u, lookarounds are not quantifiable.
    1897       19092 :     if (unicode() && atom->IsLookaround()) return false;
    1898       19072 :     if (atom->max_match() == 0) {
    1899             :       // Guaranteed to only match an empty string.
    1900             :       LAST(ADD_TERM);
    1901         297 :       if (min == 0) {
    1902             :         return true;
    1903             :       }
    1904         113 :       terms_.Add(atom, zone());
    1905         113 :       return true;
    1906             :     }
    1907             :   } else {
    1908             :     // Only call immediately after adding an atom or character!
    1909           0 :     UNREACHABLE();
    1910             :   }
    1911     1687531 :   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
    1912     1687531 :              zone());
    1913             :   LAST(ADD_TERM);
    1914     1687531 :   return true;
    1915             : }
    1916             : 
    1917             : }  // namespace internal
    1918             : }  // namespace v8

Generated by: LCOV version 1.10