LCOV - code coverage report
Current view: top level - src/regexp - regexp-parser.cc (source / functions) Hit Total Coverage
Test: app.info Lines: 798 829 96.3 %
Date: 2019-01-20 Functions: 52 65 80.0 %

          Line data    Source code
       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-parser.h"
       6             : 
       7             : #include <vector>
       8             : 
       9             : #include "src/char-predicates-inl.h"
      10             : #include "src/heap/factory.h"
      11             : #include "src/isolate.h"
      12             : #include "src/objects-inl.h"
      13             : #include "src/ostreams.h"
      14             : #include "src/regexp/jsregexp.h"
      15             : #include "src/regexp/property-sequences.h"
      16             : #include "src/utils.h"
      17             : 
      18             : #ifdef V8_INTL_SUPPORT
      19             : #include "unicode/uniset.h"
      20             : #endif  // V8_INTL_SUPPORT
      21             : 
      22             : namespace v8 {
      23             : namespace internal {
      24             : 
      25           0 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
      26             :                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
      27             :     : isolate_(isolate),
      28             :       zone_(zone),
      29             :       error_(error),
      30             :       captures_(nullptr),
      31             :       named_captures_(nullptr),
      32             :       named_back_references_(nullptr),
      33             :       in_(in),
      34             :       current_(kEndMarker),
      35             :       top_level_flags_(flags),
      36             :       next_pos_(0),
      37             :       captures_started_(0),
      38             :       capture_count_(0),
      39             :       has_more_(true),
      40             :       simple_(false),
      41             :       contains_anchor_(false),
      42             :       is_scanned_for_captures_(false),
      43             :       has_named_captures_(false),
      44      380740 :       failed_(false) {
      45      380740 :   Advance();
      46           0 : }
      47             : 
      48             : template <bool update_position>
      49   777820408 : inline uc32 RegExpParser::ReadNext() {
      50   388910204 :   int position = next_pos_;
      51             :   uc32 c0 = in()->Get(position);
      52   388910204 :   position++;
      53             :   // Read the whole surrogate pair in case of unicode flag, if possible.
      54   389608447 :   if (unicode() && position < in()->length() &&
      55             :       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
      56             :     uc16 c1 = in()->Get(position);
      57         586 :     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
      58         389 :       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
      59         371 :       position++;
      60             :     }
      61             :   }
      62   388737044 :   if (update_position) next_pos_ = position;
      63   388910204 :   return c0;
      64             : }
      65             : 
      66             : 
      67           0 : uc32 RegExpParser::Next() {
      68      173184 :   if (has_next()) {
      69      173160 :     return ReadNext<false>();
      70             :   } else {
      71             :     return kEndMarker;
      72             :   }
      73             : }
      74             : 
      75  1166592010 : void RegExpParser::Advance() {
      76   389117516 :   if (has_next()) {
      77             :     StackLimitCheck check(isolate());
      78   388737441 :     if (check.HasOverflowed()) {
      79         388 :       if (FLAG_abort_on_stack_or_string_length_overflow) {
      80           0 :         FATAL("Aborting on stack overflow");
      81             :       }
      82             :       ReportError(CStrVector(
      83         776 :           MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
      84   388737053 :     } else if (zone()->excess_allocation()) {
      85           9 :       ReportError(CStrVector("Regular expression too large"));
      86             :     } else {
      87   388737044 :       current_ = ReadNext<true>();
      88             :     }
      89             :   } else {
      90      380075 :     current_ = kEndMarker;
      91             :     // Advance so that position() points to 1-after-the-last-character. This is
      92             :     // important so that Reset() to this position works correctly.
      93      380075 :     next_pos_ = in()->length() + 1;
      94      380075 :     has_more_ = false;
      95             :   }
      96   389117516 : }
      97             : 
      98             : 
      99        2639 : void RegExpParser::Reset(int pos) {
     100        2639 :   next_pos_ = pos;
     101        2639 :   has_more_ = (pos < in()->length());
     102        2639 :   Advance();
     103           0 : }
     104             : 
     105           0 : void RegExpParser::Advance(int dist) {
     106       46501 :   next_pos_ += dist - 1;
     107       46501 :   Advance();
     108           0 : }
     109             : 
     110             : 
     111      206841 : bool RegExpParser::simple() { return simple_; }
     112             : 
     113           0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
     114         177 :   switch (c) {
     115             :     case '^':
     116             :     case '$':
     117             :     case '\\':
     118             :     case '.':
     119             :     case '*':
     120             :     case '+':
     121             :     case '?':
     122             :     case '(':
     123             :     case ')':
     124             :     case '[':
     125             :     case ']':
     126             :     case '{':
     127             :     case '}':
     128             :     case '|':
     129             :     case '/':
     130             :       return true;
     131             :     default:
     132             :       break;
     133             :   }
     134           0 :   return false;
     135             : }
     136             : 
     137             : 
     138        8943 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
     139        2981 :   if (failed_) return nullptr;  // Do not overwrite any existing error.
     140        2981 :   failed_ = true;
     141             :   *error_ = isolate()
     142             :                 ->factory()
     143             :                 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
     144        8943 :                 .ToHandleChecked();
     145             :   // Zip to the end to make sure the no more input is read.
     146        2981 :   current_ = kEndMarker;
     147        2981 :   next_pos_ = in()->length();
     148        2981 :   return nullptr;
     149             : }
     150             : 
     151             : #define CHECK_FAILED /**/);    \
     152             :   if (failed_) return nullptr; \
     153             :   ((void)0
     154             : 
     155             : // Pattern ::
     156             : //   Disjunction
     157      587581 : RegExpTree* RegExpParser::ParsePattern() {
     158      380740 :   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
     159      377859 :   PatchNamedBackReferences(CHECK_FAILED);
     160             :   DCHECK(!has_more());
     161             :   // If the result of parsing is a literal string atom, and it has the
     162             :   // same length as the input, then the atom is identical to the input.
     163      584600 :   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
     164      199836 :     simple_ = true;
     165             :   }
     166      377759 :   return result;
     167             : }
     168             : 
     169             : 
     170             : // Disjunction ::
     171             : //   Alternative
     172             : //   Alternative | Disjunction
     173             : // Alternative ::
     174             : //   [empty]
     175             : //   Term Alternative
     176             : // Term ::
     177             : //   Assertion
     178             : //   Atom
     179             : //   Atom Quantifier
     180  1157354042 : RegExpTree* RegExpParser::ParseDisjunction() {
     181             :   // Used to store current state while parsing subexpressions.
     182             :   RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
     183      380740 :                                   0, nullptr, top_level_flags_, zone());
     184     5887000 :   RegExpParserState* state = &initial_state;
     185             :   // Cache the builder in a local variable for quick access.
     186      380740 :   RegExpBuilder* builder = initial_state.builder();
     187             :   while (true) {
     188   386275177 :     switch (current()) {
     189             :       case kEndMarker:
     190      378186 :         if (state->IsSubexpression()) {
     191             :           // Inside a parenthesized group when hitting end of input.
     192      380803 :           return ReportError(CStrVector("Unterminated group"));
     193             :         }
     194             :         DCHECK_EQ(INITIAL, state->group_type());
     195             :         // Parsing completed successfully.
     196      378123 :         return builder->ToRegExp();
     197             :       case ')': {
     198     1105546 :         if (!state->IsSubexpression()) {
     199          14 :           return ReportError(CStrVector("Unmatched ')'"));
     200             :         }
     201             :         DCHECK_NE(INITIAL, state->group_type());
     202             : 
     203     1105532 :         Advance();
     204             :         // End disjunction parsing and convert builder content to new single
     205             :         // regexp atom.
     206     1105532 :         RegExpTree* body = builder->ToRegExp();
     207             : 
     208             :         int end_capture_index = captures_started();
     209             : 
     210             :         int capture_index = state->capture_index();
     211             :         SubexpressionType group_type = state->group_type();
     212             : 
     213             :         // Build result of subexpression.
     214     1105532 :         if (group_type == CAPTURE) {
     215     1081900 :           if (state->IsNamedCapture()) {
     216             :             CreateNamedCaptureAtIndex(state->capture_name(),
     217        1257 :                                       capture_index CHECK_FAILED);
     218             :           }
     219     1081859 :           RegExpCapture* capture = GetCapture(capture_index);
     220             :           capture->set_body(body);
     221             :           body = capture;
     222       23632 :         } else if (group_type == GROUPING) {
     223             :           body = new (zone()) RegExpGroup(body);
     224             :         } else {
     225             :           DCHECK(group_type == POSITIVE_LOOKAROUND ||
     226             :                  group_type == NEGATIVE_LOOKAROUND);
     227        3611 :           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
     228             :           body = new (zone()) RegExpLookaround(
     229             :               body, is_positive, end_capture_index - capture_index,
     230        3611 :               capture_index, state->lookaround_type());
     231             :         }
     232             : 
     233             :         // Restore previous state.
     234             :         state = state->previous_state();
     235             :         builder = state->builder();
     236             : 
     237     1105491 :         builder->AddAtom(body);
     238             :         // For compatibility with JSC and ES3, we allow quantifiers after
     239             :         // lookaheads, and break in all cases.
     240     1105491 :         break;
     241             :       }
     242             :       case '|': {
     243      117380 :         Advance();
     244             :         builder->NewAlternative();
     245   384442511 :         continue;
     246             :       }
     247             :       case '*':
     248             :       case '+':
     249             :       case '?':
     250          95 :         return ReportError(CStrVector("Nothing to repeat"));
     251             :       case '^': {
     252        6467 :         Advance();
     253        6467 :         if (builder->multiline()) {
     254             :           builder->AddAssertion(new (zone()) RegExpAssertion(
     255         263 :               RegExpAssertion::START_OF_LINE, builder->flags()));
     256             :         } else {
     257             :           builder->AddAssertion(new (zone()) RegExpAssertion(
     258        6204 :               RegExpAssertion::START_OF_INPUT, builder->flags()));
     259             :           set_contains_anchor();
     260             :         }
     261             :         continue;
     262             :       }
     263             :       case '$': {
     264        4117 :         Advance();
     265             :         RegExpAssertion::AssertionType assertion_type =
     266             :             builder->multiline() ? RegExpAssertion::END_OF_LINE
     267        4117 :                                  : RegExpAssertion::END_OF_INPUT;
     268             :         builder->AddAssertion(
     269        4117 :             new (zone()) RegExpAssertion(assertion_type, builder->flags()));
     270        4117 :         continue;
     271             :       }
     272             :       case '.': {
     273       18500 :         Advance();
     274             :         ZoneList<CharacterRange>* ranges =
     275       18500 :             new (zone()) ZoneList<CharacterRange>(2, zone());
     276             : 
     277       18500 :         if (builder->dotall()) {
     278             :           // Everything.
     279         210 :           CharacterRange::AddClassEscape('*', ranges, false, zone());
     280             :         } else {
     281             :           // Everything except \x0A, \x0D, \u2028 and \u2029
     282       18290 :           CharacterRange::AddClassEscape('.', ranges, false, zone());
     283             :         }
     284             : 
     285             :         RegExpCharacterClass* cc =
     286       37000 :             new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
     287       18500 :         builder->AddCharacterClass(cc);
     288       18500 :         break;
     289             :       }
     290             :       case '(': {
     291     1107247 :         state = ParseOpenParenthesis(state CHECK_FAILED);
     292             :         builder = state->builder();
     293     1106734 :         continue;
     294             :       }
     295             :       case '[': {
     296      155307 :         RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
     297      154923 :         builder->AddCharacterClass(cc->AsCharacterClass());
     298      154923 :         break;
     299             :       }
     300             :       // Atom ::
     301             :       //   \ AtomEscape
     302             :       case '\\':
     303       31247 :         switch (Next()) {
     304             :           case kEndMarker:
     305           5 :             return ReportError(CStrVector("\\ at end of pattern"));
     306             :           case 'b':
     307             :             Advance(2);
     308             :             builder->AddAssertion(new (zone()) RegExpAssertion(
     309         431 :                 RegExpAssertion::BOUNDARY, builder->flags()));
     310         431 :             continue;
     311             :           case 'B':
     312             :             Advance(2);
     313             :             builder->AddAssertion(new (zone()) RegExpAssertion(
     314         327 :                 RegExpAssertion::NON_BOUNDARY, builder->flags()));
     315         327 :             continue;
     316             :           // AtomEscape ::
     317             :           //   CharacterClassEscape
     318             :           //
     319             :           // CharacterClassEscape :: one of
     320             :           //   d D s S w W
     321             :           case 'd':
     322             :           case 'D':
     323             :           case 's':
     324             :           case 'S':
     325             :           case 'w':
     326             :           case 'W': {
     327             :             uc32 c = Next();
     328             :             Advance(2);
     329             :             ZoneList<CharacterRange>* ranges =
     330        6797 :                 new (zone()) ZoneList<CharacterRange>(2, zone());
     331             :             CharacterRange::AddClassEscape(
     332        7250 :                 c, ranges, unicode() && builder->ignore_case(), zone());
     333             :             RegExpCharacterClass* cc = new (zone())
     334       13594 :                 RegExpCharacterClass(zone(), ranges, builder->flags());
     335        6797 :             builder->AddCharacterClass(cc);
     336        6797 :             break;
     337             :           }
     338             :           case 'p':
     339             :           case 'P': {
     340             :             uc32 p = Next();
     341             :             Advance(2);
     342        4187 :             if (unicode()) {
     343             :               ZoneList<CharacterRange>* ranges =
     344        3989 :                   new (zone()) ZoneList<CharacterRange>(2, zone());
     345             :               std::vector<char> name_1, name_2;
     346        3989 :               if (ParsePropertyClassName(&name_1, &name_2)) {
     347        3863 :                 if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
     348             :                   RegExpCharacterClass* cc = new (zone())
     349        6040 :                       RegExpCharacterClass(zone(), ranges, builder->flags());
     350        3020 :                   builder->AddCharacterClass(cc);
     351        3020 :                   break;
     352             :                 }
     353        1542 :                 if (p == 'p' && name_2.empty()) {
     354         510 :                   RegExpTree* sequence = GetPropertySequence(name_1);
     355         510 :                   if (sequence != nullptr) {
     356         150 :                     builder->AddAtom(sequence);
     357         150 :                     break;
     358             :                   }
     359             :                 }
     360             :               }
     361         819 :               return ReportError(CStrVector("Invalid property name"));
     362             :             } else {
     363         198 :               builder->AddCharacter(p);
     364             :             }
     365         198 :             break;
     366             :           }
     367             :           case '1':
     368             :           case '2':
     369             :           case '3':
     370             :           case '4':
     371             :           case '5':
     372             :           case '6':
     373             :           case '7':
     374             :           case '8':
     375             :           case '9': {
     376        5111 :             int index = 0;
     377        5183 :             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
     378        5111 :             if (is_backref) {
     379        9266 :               if (state->IsInsideCaptureGroup(index)) {
     380             :                 // The back reference is inside the capture group it refers to.
     381             :                 // Nothing can possibly have been captured yet, so we use empty
     382             :                 // instead. This ensures that, when checking a back reference,
     383             :                 // the capture registers of the referenced capture are either
     384             :                 // both set or both cleared.
     385             :                 builder->AddEmpty();
     386             :               } else {
     387        4540 :                 RegExpCapture* capture = GetCapture(index);
     388             :                 RegExpTree* atom =
     389             :                     new (zone()) RegExpBackReference(capture, builder->flags());
     390        4540 :                 builder->AddAtom(atom);
     391             :               }
     392        4670 :               break;
     393             :             }
     394             :             // With /u, no identity escapes except for syntax characters
     395             :             // are allowed. Otherwise, all identity escapes are allowed.
     396         478 :             if (unicode()) {
     397          72 :               return ReportError(CStrVector("Invalid escape"));
     398             :             }
     399             :             uc32 first_digit = Next();
     400         406 :             if (first_digit == '8' || first_digit == '9') {
     401          37 :               builder->AddCharacter(first_digit);
     402             :               Advance(2);
     403             :               break;
     404             :             }
     405             :             V8_FALLTHROUGH;
     406             :           }
     407             :           case '0': {
     408        3241 :             Advance();
     409        3279 :             if (unicode() && Next() >= '0' && Next() <= '9') {
     410             :               // With /u, decimal escape with leading 0 are not parsed as octal.
     411          19 :               return ReportError(CStrVector("Invalid decimal escape"));
     412             :             }
     413        3222 :             uc32 octal = ParseOctalLiteral();
     414        3222 :             builder->AddCharacter(octal);
     415        3222 :             break;
     416             :           }
     417             :           // ControlEscape :: one of
     418             :           //   f n r t v
     419             :           case 'f':
     420             :             Advance(2);
     421           6 :             builder->AddCharacter('\f');
     422           6 :             break;
     423             :           case 'n':
     424             :             Advance(2);
     425         711 :             builder->AddCharacter('\n');
     426         711 :             break;
     427             :           case 'r':
     428             :             Advance(2);
     429          73 :             builder->AddCharacter('\r');
     430          73 :             break;
     431             :           case 't':
     432             :             Advance(2);
     433           8 :             builder->AddCharacter('\t');
     434           8 :             break;
     435             :           case 'v':
     436             :             Advance(2);
     437          28 :             builder->AddCharacter('\v');
     438          28 :             break;
     439             :           case 'c': {
     440         149 :             Advance();
     441             :             uc32 controlLetter = Next();
     442             :             // Special case if it is an ASCII letter.
     443             :             // Convert lower case letters to uppercase.
     444         149 :             uc32 letter = controlLetter & ~('a' ^ 'A');
     445         149 :             if (letter < 'A' || 'Z' < letter) {
     446             :               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
     447             :               // Read the backslash as a literal character instead of as
     448             :               // starting an escape.
     449             :               // ES#prod-annexB-ExtendedPatternCharacter
     450          89 :               if (unicode()) {
     451             :                 // With /u, invalid escapes are not treated as identity escapes.
     452          19 :                 return ReportError(CStrVector("Invalid unicode escape"));
     453             :               }
     454          70 :               builder->AddCharacter('\\');
     455             :             } else {
     456             :               Advance(2);
     457          60 :               builder->AddCharacter(controlLetter & 0x1F);
     458             :             }
     459             :             break;
     460             :           }
     461             :           case 'x': {
     462             :             Advance(2);
     463             :             uc32 value;
     464         246 :             if (ParseHexEscape(2, &value)) {
     465         147 :               builder->AddCharacter(value);
     466          99 :             } else if (!unicode()) {
     467          63 :               builder->AddCharacter('x');
     468             :             } else {
     469             :               // With /u, invalid escapes are not treated as identity escapes.
     470          36 :               return ReportError(CStrVector("Invalid escape"));
     471             :             }
     472         210 :             break;
     473             :           }
     474             :           case 'u': {
     475             :             Advance(2);
     476             :             uc32 value;
     477        1997 :             if (ParseUnicodeEscape(&value)) {
     478        1809 :               builder->AddEscapedUnicodeCharacter(value);
     479         188 :             } else if (!unicode()) {
     480         116 :               builder->AddCharacter('u');
     481             :             } else {
     482             :               // With /u, invalid escapes are not treated as identity escapes.
     483          72 :               return ReportError(CStrVector("Invalid Unicode escape"));
     484             :             }
     485        1925 :             break;
     486             :           }
     487             :           case 'k':
     488             :             // Either an identity escape or a named back-reference.  The two
     489             :             // interpretations are mutually exclusive: '\k' is interpreted as
     490             :             // an identity escape for non-Unicode patterns without named
     491             :             // capture groups, and as the beginning of a named back-reference
     492             :             // in all other cases.
     493         672 :             if (unicode() || HasNamedCaptures()) {
     494             :               Advance(2);
     495         396 :               ParseNamedBackReference(builder, state CHECK_FAILED);
     496             :               break;
     497             :             }
     498             :             V8_FALLTHROUGH;
     499             :           default:
     500        7903 :             Advance();
     501             :             // With /u, no identity escapes except for syntax characters
     502             :             // are allowed. Otherwise, all identity escapes are allowed.
     503        7940 :             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
     504        7876 :               builder->AddCharacter(current());
     505        7876 :               Advance();
     506             :             } else {
     507          27 :               return ReportError(CStrVector("Invalid escape"));
     508             :             }
     509        7876 :             break;
     510             :         }
     511             :         break;
     512             :       case '{': {
     513             :         int dummy;
     514         509 :         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
     515         444 :         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
     516             :         V8_FALLTHROUGH;
     517             :       }
     518             :       case '}':
     519             :       case ']':
     520         903 :         if (unicode()) {
     521          28 :           return ReportError(CStrVector("Lone quantifier brackets"));
     522             :         }
     523             :         V8_FALLTHROUGH;
     524             :       default:
     525   383351012 :         builder->AddUnicodeCharacter(current());
     526   383351012 :         Advance();
     527   383351012 :         break;
     528             :     }  // end switch(current())
     529             : 
     530             :     int min;
     531             :     int max;
     532   384659255 :     switch (current()) {
     533             :       // QuantifierPrefix ::
     534             :       //   *
     535             :       //   +
     536             :       //   ?
     537             :       //   {
     538             :       case '*':
     539       19551 :         min = 0;
     540       19551 :         max = RegExpTree::kInfinity;
     541       19551 :         Advance();
     542       19551 :         break;
     543             :       case '+':
     544        8002 :         min = 1;
     545        8002 :         max = RegExpTree::kInfinity;
     546        8002 :         Advance();
     547        8002 :         break;
     548             :       case '?':
     549     1421675 :         min = 0;
     550     1421675 :         max = 1;
     551     1421675 :         Advance();
     552     1421675 :         break;
     553             :       case '{':
     554        3280 :         if (ParseIntervalQuantifier(&min, &max)) {
     555        2773 :           if (max < min) {
     556             :             return ReportError(
     557           2 :                 CStrVector("numbers out of order in {} quantifier"));
     558             :           }
     559             :           break;
     560         507 :         } else if (unicode()) {
     561             :           // With /u, incomplete quantifiers are not allowed.
     562         199 :           return ReportError(CStrVector("Incomplete quantifier"));
     563             :         }
     564             :         continue;
     565             :       default:
     566             :         continue;
     567             :     }
     568             :     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
     569     1451999 :     if (current() == '?') {
     570             :       quantifier_type = RegExpQuantifier::NON_GREEDY;
     571        1745 :       Advance();
     572             :     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
     573             :       // FLAG_regexp_possessive_quantifier is a debug-only flag.
     574             :       quantifier_type = RegExpQuantifier::POSSESSIVE;
     575             :       Advance();
     576             :     }
     577     1451999 :     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
     578          73 :       return ReportError(CStrVector("Invalid quantifier"));
     579             :     }
     580             :   }
     581             : }
     582             : 
     583     1107247 : RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
     584     4431298 :     RegExpParserState* state) {
     585             :   RegExpLookaround::Type lookaround_type = state->lookaround_type();
     586             :   bool is_named_capture = false;
     587             :   JSRegExp::Flags switch_on = JSRegExp::kNone;
     588             :   JSRegExp::Flags switch_off = JSRegExp::kNone;
     589             :   const ZoneVector<uc16>* capture_name = nullptr;
     590             :   SubexpressionType subexpr_type = CAPTURE;
     591     1107247 :   Advance();
     592     1107247 :   if (current() == '?') {
     593       26422 :     switch (Next()) {
     594             :       case ':':
     595             :         Advance(2);
     596             :         subexpr_type = GROUPING;
     597       19791 :         break;
     598             :       case '=':
     599             :         Advance(2);
     600             :         lookaround_type = RegExpLookaround::LOOKAHEAD;
     601             :         subexpr_type = POSITIVE_LOOKAROUND;
     602        1709 :         break;
     603             :       case '!':
     604             :         Advance(2);
     605             :         lookaround_type = RegExpLookaround::LOOKAHEAD;
     606             :         subexpr_type = NEGATIVE_LOOKAROUND;
     607         457 :         break;
     608             :       case '-':
     609             :       case 'i':
     610             :       case 's':
     611             :       case 'm': {
     612        1373 :         if (!FLAG_regexp_mode_modifiers) {
     613           4 :           ReportError(CStrVector("Invalid group"));
     614           4 :           return nullptr;
     615             :         }
     616        1369 :         Advance();
     617             :         bool flags_sense = true;  // Switching on flags.
     618        4753 :         while (subexpr_type != GROUPING) {
     619        3126 :           switch (current()) {
     620             :             case '-':
     621         344 :               if (!flags_sense) {
     622          18 :                 ReportError(CStrVector("Multiple dashes in flag group"));
     623          18 :                 return nullptr;
     624             :               }
     625             :               flags_sense = false;
     626         326 :               Advance();
     627         326 :               continue;
     628             :             case 's':
     629             :             case 'i':
     630             :             case 'm': {
     631             :               JSRegExp::Flags bit = JSRegExp::kUnicode;
     632        1449 :               if (current() == 'i') bit = JSRegExp::kIgnoreCase;
     633        1449 :               if (current() == 'm') bit = JSRegExp::kMultiline;
     634        1449 :               if (current() == 's') bit = JSRegExp::kDotAll;
     635        1449 :               if (((switch_on | switch_off) & bit) != 0) {
     636          18 :                 ReportError(CStrVector("Repeated flag in flag group"));
     637             :                 return nullptr;
     638             :               }
     639        1431 :               if (flags_sense) {
     640             :                 switch_on |= bit;
     641             :               } else {
     642             :                 switch_off |= bit;
     643             :               }
     644        1431 :               Advance();
     645             :               continue;
     646             :             }
     647             :             case ')': {
     648        1075 :               Advance();
     649             :               state->builder()
     650        1075 :                   ->FlushText();  // Flush pending text using old flags.
     651             :               // These (?i)-style flag switches don't put us in a subexpression
     652             :               // at all, they just modify the flags in the rest of the current
     653             :               // subexpression.
     654             :               JSRegExp::Flags flags =
     655             :                   (state->builder()->flags() | switch_on) & ~switch_off;
     656             :               state->builder()->set_flags(flags);
     657             :               return state;
     658             :             }
     659             :             case ':':
     660         258 :               Advance();
     661             :               subexpr_type = GROUPING;  // Will break us out of the outer loop.
     662         258 :               continue;
     663             :             default:
     664           0 :               ReportError(CStrVector("Invalid flag group"));
     665           0 :               return nullptr;
     666             :           }
     667             :         }
     668             :         break;
     669             :       }
     670             :       case '<':
     671        3073 :         Advance();
     672        3073 :         if (Next() == '=') {
     673             :           Advance(2);
     674             :           lookaround_type = RegExpLookaround::LOOKBEHIND;
     675             :           subexpr_type = POSITIVE_LOOKAROUND;
     676        1218 :           break;
     677        1855 :         } else if (Next() == '!') {
     678             :           Advance(2);
     679             :           lookaround_type = RegExpLookaround::LOOKBEHIND;
     680             :           subexpr_type = NEGATIVE_LOOKAROUND;
     681         245 :           break;
     682             :         }
     683             :         is_named_capture = true;
     684        1610 :         has_named_captures_ = true;
     685        1610 :         Advance();
     686        1610 :         break;
     687             :       default:
     688          19 :         ReportError(CStrVector("Invalid group"));
     689          19 :         return nullptr;
     690             :     }
     691             :   }
     692     1106113 :   if (subexpr_type == CAPTURE) {
     693     1082435 :     if (captures_started_ >= kMaxCaptures) {
     694           5 :       ReportError(CStrVector("Too many captures"));
     695           5 :       return nullptr;
     696             :     }
     697     1082430 :     captures_started_++;
     698             : 
     699     1082430 :     if (is_named_capture) {
     700        1610 :       capture_name = ParseCaptureGroupName(CHECK_FAILED);
     701             :     }
     702             :   }
     703             :   JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
     704             :   // Store current state and begin new disjunction parsing.
     705             :   return new (zone())
     706             :       RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
     707     2211528 :                         capture_name, flags, zone());
     708             : }
     709             : 
     710             : #ifdef DEBUG
     711             : // Currently only used in an DCHECK.
     712             : static bool IsSpecialClassEscape(uc32 c) {
     713             :   switch (c) {
     714             :     case 'd':
     715             :     case 'D':
     716             :     case 's':
     717             :     case 'S':
     718             :     case 'w':
     719             :     case 'W':
     720             :       return true;
     721             :     default:
     722             :       return false;
     723             :   }
     724             : }
     725             : #endif
     726             : 
     727             : 
     728             : // In order to know whether an escape is a backreference or not we have to scan
     729             : // the entire regexp and find the number of capturing parentheses.  However we
     730             : // don't want to scan the regexp twice unless it is necessary.  This mini-parser
     731             : // is called when needed.  It can see the difference between capturing and
     732             : // noncapturing parentheses and can skip character classes and backslash-escaped
     733             : // characters.
     734        8158 : void RegExpParser::ScanForCaptures() {
     735             :   DCHECK(!is_scanned_for_captures_);
     736             :   const int saved_position = position();
     737             :   // Start with captures started previous to current position
     738             :   int capture_count = captures_started();
     739             :   // Add count of captures after this position.
     740             :   int n;
     741        6327 :   while ((n = current()) != kEndMarker) {
     742        4749 :     Advance();
     743        4749 :     switch (n) {
     744             :       case '\\':
     745         605 :         Advance();
     746         605 :         break;
     747             :       case '[': {
     748             :         int c;
     749         165 :         while ((c = current()) != kEndMarker) {
     750         156 :           Advance();
     751         156 :           if (c == '\\') {
     752           0 :             Advance();
     753             :           } else {
     754         156 :             if (c == ']') break;
     755             :           }
     756             :         }
     757             :         break;
     758             :       }
     759             :       case '(':
     760         611 :         if (current() == '?') {
     761             :           // At this point we could be in
     762             :           // * a non-capturing group '(:',
     763             :           // * a lookbehind assertion '(?<=' '(?<!'
     764             :           // * or a named capture '(?<'.
     765             :           //
     766             :           // Of these, only named captures are capturing groups.
     767             : 
     768         145 :           Advance();
     769         145 :           if (current() != '<') break;
     770             : 
     771         121 :           Advance();
     772         121 :           if (current() == '=' || current() == '!') break;
     773             : 
     774             :           // Found a possible named capture. It could turn out to be a syntax
     775             :           // error (e.g. an unterminated or invalid name), but that distinction
     776             :           // does not matter for our purposes.
     777          75 :           has_named_captures_ = true;
     778             :         }
     779         541 :         capture_count++;
     780         541 :         break;
     781             :     }
     782             :   }
     783         789 :   capture_count_ = capture_count;
     784         789 :   is_scanned_for_captures_ = true;
     785             :   Reset(saved_position);
     786         789 : }
     787             : 
     788             : 
     789       15930 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
     790             :   DCHECK_EQ('\\', current());
     791             :   DCHECK('1' <= Next() && Next() <= '9');
     792             :   // Try to parse a decimal literal that is no greater than the total number
     793             :   // of left capturing parentheses in the input.
     794             :   int start = position();
     795        5111 :   int value = Next() - '0';
     796             :   Advance(2);
     797             :   while (true) {
     798             :     uc32 c = current();
     799        5728 :     if (IsDecimalDigit(c)) {
     800         637 :       value = 10 * value + (c - '0');
     801         637 :       if (value > kMaxCaptures) {
     802             :         Reset(start);
     803          20 :         return false;
     804             :       }
     805         617 :       Advance();
     806             :     } else {
     807             :       break;
     808             :     }
     809             :   }
     810        5091 :   if (value > captures_started()) {
     811         882 :     if (!is_scanned_for_captures_) ScanForCaptures();
     812         882 :     if (value > capture_count_) {
     813             :       Reset(start);
     814         458 :       return false;
     815             :     }
     816             :   }
     817        4633 :   *index_out = value;
     818        5250 :   return true;
     819             : }
     820             : 
     821        2425 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
     822        2425 :   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
     823        4770 :     v->push_back(code_unit);
     824             :   } else {
     825          80 :     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
     826          80 :     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
     827             :   }
     828        2425 : }
     829             : 
     830        6603 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
     831             :   ZoneVector<uc16>* name =
     832        1965 :       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
     833             : 
     834             :   bool at_start = true;
     835             :   while (true) {
     836        4390 :     uc32 c = current();
     837        4390 :     Advance();
     838             : 
     839             :     // Convert unicode escapes.
     840        4638 :     if (c == '\\' && current() == 'u') {
     841         230 :       Advance();
     842         230 :       if (!ParseUnicodeEscape(&c)) {
     843          59 :         ReportError(CStrVector("Invalid Unicode escape sequence"));
     844          59 :         return nullptr;
     845             :       }
     846             :     }
     847             : 
     848             :     // The backslash char is misclassified as both ID_Start and ID_Continue.
     849        4331 :     if (c == '\\') {
     850          18 :       ReportError(CStrVector("Invalid capture group name"));
     851          18 :       return nullptr;
     852             :     }
     853             : 
     854        4313 :     if (at_start) {
     855        1933 :       if (!IsIdentifierStart(c)) {
     856         100 :         ReportError(CStrVector("Invalid capture group name"));
     857         100 :         return nullptr;
     858             :       }
     859        1833 :       push_code_unit(name, c);
     860             :       at_start = false;
     861             :     } else {
     862        2380 :       if (c == '>') {
     863             :         break;
     864         809 :       } else if (IsIdentifierPart(c)) {
     865         592 :         push_code_unit(name, c);
     866             :       } else {
     867         217 :         ReportError(CStrVector("Invalid capture group name"));
     868         217 :         return nullptr;
     869             :       }
     870             :     }
     871             :   }
     872             : 
     873        3996 :   return name;
     874             : }
     875             : 
     876        1257 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
     877        2149 :                                              int index) {
     878             :   DCHECK(0 < index && index <= captures_started_);
     879             :   DCHECK_NOT_NULL(name);
     880             : 
     881        1257 :   if (named_captures_ == nullptr) {
     882         933 :     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
     883             :   } else {
     884             :     // Check for duplicates and bail if we find any.
     885             :     // TODO(jgruber): O(n^2).
     886         996 :     for (const auto& named_capture : *named_captures_) {
     887         389 :       if (*named_capture->name() == *name) {
     888          41 :         ReportError(CStrVector("Duplicate capture group name"));
     889          41 :         return false;
     890             :       }
     891             :     }
     892             :   }
     893             : 
     894        1216 :   RegExpCapture* capture = GetCapture(index);
     895             :   DCHECK_NULL(capture->name());
     896             : 
     897             :   capture->set_name(name);
     898        1216 :   named_captures_->Add(capture, zone());
     899             : 
     900        1216 :   return true;
     901             : }
     902             : 
     903         396 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
     904        1186 :                                            RegExpParserState* state) {
     905             :   // The parser is assumed to be on the '<' in \k<name>.
     906         396 :   if (current() != '<') {
     907          41 :     ReportError(CStrVector("Invalid named reference"));
     908          41 :     return false;
     909             :   }
     910             : 
     911         355 :   Advance();
     912         355 :   const ZoneVector<uc16>* name = ParseCaptureGroupName();
     913         355 :   if (name == nullptr) {
     914             :     return false;
     915             :   }
     916             : 
     917         305 :   if (state->IsInsideCaptureGroup(name)) {
     918             :     builder->AddEmpty();
     919             :   } else {
     920             :     RegExpBackReference* atom =
     921         285 :         new (zone()) RegExpBackReference(builder->flags());
     922             :     atom->set_name(name);
     923             : 
     924         285 :     builder->AddAtom(atom);
     925             : 
     926         285 :     if (named_back_references_ == nullptr) {
     927             :       named_back_references_ =
     928         220 :           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
     929             :     }
     930         285 :     named_back_references_->Add(atom, zone());
     931             :   }
     932             : 
     933             :   return true;
     934             : }
     935             : 
     936      377859 : void RegExpParser::PatchNamedBackReferences() {
     937      378250 :   if (named_back_references_ == nullptr) return;
     938             : 
     939         220 :   if (named_captures_ == nullptr) {
     940          14 :     ReportError(CStrVector("Invalid named capture referenced"));
     941          14 :     return;
     942             :   }
     943             : 
     944             :   // Look up and patch the actual capture for each named back reference.
     945             :   // TODO(jgruber): O(n^2), optimize if necessary.
     946             : 
     947         576 :   for (int i = 0; i < named_back_references_->length(); i++) {
     948         612 :     RegExpBackReference* ref = named_back_references_->at(i);
     949             : 
     950             :     int index = -1;
     951         698 :     for (const auto& capture : *named_captures_) {
     952         341 :       if (*capture->name() == *ref->name()) {
     953             :         index = capture->index();
     954         185 :         break;
     955             :       }
     956             :     }
     957             : 
     958         271 :     if (index == -1) {
     959          86 :       ReportError(CStrVector("Invalid named capture referenced"));
     960          86 :       return;
     961             :     }
     962             : 
     963         185 :     ref->set_capture(GetCapture(index));
     964             :   }
     965             : }
     966             : 
     967     2183327 : RegExpCapture* RegExpParser::GetCapture(int index) {
     968             :   // The index for the capture groups are one-based. Its index in the list is
     969             :   // zero-based.
     970             :   int know_captures =
     971     1087800 :       is_scanned_for_captures_ ? capture_count_ : captures_started_;
     972             :   DCHECK(index <= know_captures);
     973     3257468 :   if (captures_ == nullptr) {
     974       13659 :     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
     975             :   }
     976     4339336 :   while (captures_->length() < know_captures) {
     977     2163736 :     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
     978             :   }
     979     2175600 :   return captures_->at(index - 1);
     980             : }
     981             : 
     982      378529 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
     983      380334 :   if (named_captures_ == nullptr || named_captures_->is_empty())
     984      376989 :     return Handle<FixedArray>();
     985             : 
     986             :   Factory* factory = isolate()->factory();
     987             : 
     988         770 :   int len = named_captures_->length() * 2;
     989         770 :   Handle<FixedArray> array = factory->NewFixedArray(len);
     990             : 
     991        3610 :   for (int i = 0; i < named_captures_->length(); i++) {
     992        2070 :     RegExpCapture* capture = named_captures_->at(i);
     993        1035 :     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
     994        2070 :     array->set(i * 2, *name.ToHandleChecked());
     995             :     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
     996             :   }
     997             : 
     998         770 :   return array;
     999             : }
    1000             : 
    1001           0 : bool RegExpParser::HasNamedCaptures() {
    1002         223 :   if (has_named_captures_ || is_scanned_for_captures_) {
    1003             :     return has_named_captures_;
    1004             :   }
    1005             : 
    1006         118 :   ScanForCaptures();
    1007             :   DCHECK(is_scanned_for_captures_);
    1008         118 :   return has_named_captures_;
    1009             : }
    1010             : 
    1011           0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
    1012       10952 :   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
    1013        5375 :     if (s->group_type() != CAPTURE) continue;
    1014             :     // Return true if we found the matching capture index.
    1015         610 :     if (index == s->capture_index()) return true;
    1016             :     // Abort if index is larger than what has been parsed up till this state.
    1017         517 :     if (index > s->capture_index()) return false;
    1018             :   }
    1019             :   return false;
    1020             : }
    1021             : 
    1022         305 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
    1023             :     const ZoneVector<uc16>* name) {
    1024             :   DCHECK_NOT_NULL(name);
    1025         600 :   for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
    1026         315 :     if (s->capture_name() == nullptr) continue;
    1027          30 :     if (*s->capture_name() == *name) return true;
    1028             :   }
    1029             :   return false;
    1030             : }
    1031             : 
    1032             : // QuantifierPrefix ::
    1033             : //   { DecimalDigits }
    1034             : //   { DecimalDigits , }
    1035             : //   { DecimalDigits , DecimalDigits }
    1036             : //
    1037             : // Returns true if parsing succeeds, and set the min_out and max_out
    1038             : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
    1039       25771 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
    1040             :   DCHECK_EQ(current(), '{');
    1041             :   int start = position();
    1042        3744 :   Advance();
    1043             :   int min = 0;
    1044        3744 :   if (!IsDecimalDigit(current())) {
    1045             :     Reset(start);
    1046         834 :     return false;
    1047             :   }
    1048        7217 :   while (IsDecimalDigit(current())) {
    1049        4427 :     int next = current() - '0';
    1050        4427 :     if (min > (RegExpTree::kInfinity - next) / 10) {
    1051             :       // Overflow. Skip past remaining decimal digits and return -1.
    1052         810 :       do {
    1053         810 :         Advance();
    1054             :       } while (IsDecimalDigit(current()));
    1055             :       min = RegExpTree::kInfinity;
    1056             :       break;
    1057             :     }
    1058        4307 :     min = 10 * min + next;
    1059        4307 :     Advance();
    1060             :   }
    1061             :   int max = 0;
    1062        2910 :   if (current() == '}') {
    1063             :     max = min;
    1064        1172 :     Advance();
    1065        1738 :   } else if (current() == ',') {
    1066        1700 :     Advance();
    1067        1700 :     if (current() == '}') {
    1068             :       max = RegExpTree::kInfinity;
    1069         329 :       Advance();
    1070             :     } else {
    1071        3525 :       while (IsDecimalDigit(current())) {
    1072        2214 :         int next = current() - '0';
    1073        2214 :         if (max > (RegExpTree::kInfinity - next) / 10) {
    1074         750 :           do {
    1075         750 :             Advance();
    1076             :           } while (IsDecimalDigit(current()));
    1077             :           max = RegExpTree::kInfinity;
    1078             :           break;
    1079             :         }
    1080        2154 :         max = 10 * max + next;
    1081        2154 :         Advance();
    1082             :       }
    1083        1371 :       if (current() != '}') {
    1084             :         Reset(start);
    1085          74 :         return false;
    1086             :       }
    1087        1297 :       Advance();
    1088             :     }
    1089             :   } else {
    1090             :     Reset(start);
    1091          38 :     return false;
    1092             :   }
    1093        2798 :   *min_out = min;
    1094        2798 :   *max_out = max;
    1095        2798 :   return true;
    1096             : }
    1097             : 
    1098             : 
    1099       11214 : uc32 RegExpParser::ParseOctalLiteral() {
    1100             :   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
    1101             :   // For compatibility with some other browsers (not all), we parse
    1102             :   // up to three octal digits with a value below 256.
    1103             :   // ES#prod-annexB-LegacyOctalEscapeSequence
    1104        4284 :   uc32 value = current() - '0';
    1105        4284 :   Advance();
    1106        4284 :   if ('0' <= current() && current() <= '7') {
    1107        1388 :     value = value * 8 + current() - '0';
    1108        1388 :     Advance();
    1109        2744 :     if (value < 32 && '0' <= current() && current() <= '7') {
    1110        2580 :       value = value * 8 + current() - '0';
    1111        1290 :       Advance();
    1112             :     }
    1113             :   }
    1114        4284 :   return value;
    1115             : }
    1116             : 
    1117             : 
    1118      389704 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
    1119             :   int start = position();
    1120             :   uc32 val = 0;
    1121      389330 :   for (int i = 0; i < length; ++i) {
    1122             :     uc32 c = current();
    1123             :     int d = HexValue(c);
    1124      310675 :     if (d < 0) {
    1125             :       Reset(start);
    1126         374 :       return false;
    1127             :     }
    1128      310301 :     val = val * 16 + d;
    1129      310301 :     Advance();
    1130             :   }
    1131       78655 :   *value = val;
    1132       78655 :   return true;
    1133             : }
    1134             : 
    1135             : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
    1136       99386 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
    1137             :   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
    1138             :   // allowed). In the latter case, the number of hex digits between { } is
    1139             :   // arbitrary. \ and u have already been read.
    1140       91439 :   if (current() == '{' && unicode()) {
    1141             :     int start = position();
    1142        7458 :     Advance();
    1143        7458 :     if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
    1144        7431 :       if (current() == '}') {
    1145        7426 :         Advance();
    1146        7426 :         return true;
    1147             :       }
    1148             :     }
    1149             :     Reset(start);
    1150          32 :     return false;
    1151             :   }
    1152             :   // \u but no {, or \u{...} escapes not allowed.
    1153       76423 :   bool result = ParseHexEscape(4, value);
    1154      226996 :   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
    1155             :       current() == '\\') {
    1156             :     // Attempt to read trail surrogate.
    1157             :     int start = position();
    1158         205 :     if (Next() == 'u') {
    1159             :       Advance(2);
    1160             :       uc32 trail;
    1161         380 :       if (ParseHexEscape(4, &trail) &&
    1162         185 :           unibrow::Utf16::IsTrailSurrogate(trail)) {
    1163             :         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
    1164         370 :                                                       static_cast<uc16>(trail));
    1165         185 :         return true;
    1166             :       }
    1167             :     }
    1168             :     Reset(start);
    1169             :   }
    1170       76238 :   return result;
    1171             : }
    1172             : 
    1173             : #ifdef V8_INTL_SUPPORT
    1174             : 
    1175             : namespace {
    1176             : 
    1177        2708 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
    1178        2708 :   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
    1179        2708 :   if (short_name != nullptr && strcmp(property_name, short_name) == 0)
    1180             :     return true;
    1181             :   for (int i = 0;; i++) {
    1182             :     const char* long_name = u_getPropertyName(
    1183         806 :         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1184         806 :     if (long_name == nullptr) break;
    1185         743 :     if (strcmp(property_name, long_name) == 0) return true;
    1186             :   }
    1187             :   return false;
    1188             : }
    1189             : 
    1190        3153 : bool IsExactPropertyValueAlias(const char* property_value_name,
    1191             :                                UProperty property, int32_t property_value) {
    1192             :   const char* short_name =
    1193        3153 :       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
    1194        3153 :   if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
    1195             :     return true;
    1196             :   }
    1197             :   for (int i = 0;; i++) {
    1198             :     const char* long_name = u_getPropertyValueName(
    1199             :         property, property_value,
    1200         310 :         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1201         310 :     if (long_name == nullptr) break;
    1202         310 :     if (strcmp(property_value_name, long_name) == 0) return true;
    1203             :   }
    1204             :   return false;
    1205             : }
    1206             : 
    1207        5216 : bool LookupPropertyValueName(UProperty property,
    1208             :                              const char* property_value_name, bool negate,
    1209             :                              ZoneList<CharacterRange>* result, Zone* zone) {
    1210             :   UProperty property_for_lookup = property;
    1211        5216 :   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
    1212             :     // For the property Script_Extensions, we have to do the property value
    1213             :     // name lookup as if the property is Script.
    1214             :     property_for_lookup = UCHAR_SCRIPT;
    1215             :   }
    1216             :   int32_t property_value =
    1217        5216 :       u_getPropertyValueEnum(property_for_lookup, property_value_name);
    1218        5216 :   if (property_value == UCHAR_INVALID_CODE) return false;
    1219             : 
    1220             :   // We require the property name to match exactly to one of the property value
    1221             :   // aliases. However, u_getPropertyValueEnum uses loose matching.
    1222        3153 :   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
    1223        3153 :                                  property_value)) {
    1224             :     return false;
    1225             :   }
    1226             : 
    1227        3153 :   UErrorCode ec = U_ZERO_ERROR;
    1228        3153 :   icu::UnicodeSet set;
    1229        3153 :   set.applyIntPropertyValue(property, property_value, ec);
    1230        3153 :   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
    1231             : 
    1232        3153 :   if (success) {
    1233        3153 :     set.removeAllStrings();
    1234        3153 :     if (negate) set.complement();
    1235      578061 :     for (int i = 0; i < set.getRangeCount(); i++) {
    1236             :       result->Add(
    1237      578061 :           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
    1238      578061 :           zone);
    1239             :     }
    1240             :   }
    1241        3153 :   return success;
    1242             : }
    1243             : 
    1244             : template <size_t N>
    1245             : inline bool NameEquals(const char* name, const char (&literal)[N]) {
    1246        6194 :   return strncmp(name, literal, N + 1) == 0;
    1247             : }
    1248             : 
    1249        2063 : bool LookupSpecialPropertyValueName(const char* name,
    1250             :                                     ZoneList<CharacterRange>* result,
    1251             :                                     bool negate, Zone* zone) {
    1252        2063 :   if (NameEquals(name, "Any")) {
    1253         135 :     if (negate) {
    1254             :       // Leave the list of character ranges empty, since the negation of 'Any'
    1255             :       // is the empty set.
    1256             :     } else {
    1257          40 :       result->Add(CharacterRange::Everything(), zone);
    1258             :     }
    1259        1928 :   } else if (NameEquals(name, "ASCII")) {
    1260             :     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
    1261             :                        : CharacterRange::Range(0x0, 0x7F),
    1262         105 :                 zone);
    1263        1823 :   } else if (NameEquals(name, "Assigned")) {
    1264             :     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
    1265          75 :                                    !negate, result, zone);
    1266             :   } else {
    1267             :     return false;
    1268             :   }
    1269             :   return true;
    1270             : }
    1271             : 
    1272             : // Explicitly whitelist supported binary properties. The spec forbids supporting
    1273             : // properties outside of this set to ensure interoperability.
    1274        1748 : bool IsSupportedBinaryProperty(UProperty property) {
    1275        1748 :   switch (property) {
    1276             :     case UCHAR_ALPHABETIC:
    1277             :     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
    1278             :     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
    1279             :     case UCHAR_ASCII_HEX_DIGIT:
    1280             :     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
    1281             :     case UCHAR_BIDI_CONTROL:
    1282             :     case UCHAR_BIDI_MIRRORED:
    1283             :     case UCHAR_CASE_IGNORABLE:
    1284             :     case UCHAR_CASED:
    1285             :     case UCHAR_CHANGES_WHEN_CASEFOLDED:
    1286             :     case UCHAR_CHANGES_WHEN_CASEMAPPED:
    1287             :     case UCHAR_CHANGES_WHEN_LOWERCASED:
    1288             :     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    1289             :     case UCHAR_CHANGES_WHEN_TITLECASED:
    1290             :     case UCHAR_CHANGES_WHEN_UPPERCASED:
    1291             :     case UCHAR_DASH:
    1292             :     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    1293             :     case UCHAR_DEPRECATED:
    1294             :     case UCHAR_DIACRITIC:
    1295             :     case UCHAR_EMOJI:
    1296             :     case UCHAR_EMOJI_COMPONENT:
    1297             :     case UCHAR_EMOJI_MODIFIER_BASE:
    1298             :     case UCHAR_EMOJI_MODIFIER:
    1299             :     case UCHAR_EMOJI_PRESENTATION:
    1300             :     case UCHAR_EXTENDED_PICTOGRAPHIC:
    1301             :     case UCHAR_EXTENDER:
    1302             :     case UCHAR_GRAPHEME_BASE:
    1303             :     case UCHAR_GRAPHEME_EXTEND:
    1304             :     case UCHAR_HEX_DIGIT:
    1305             :     case UCHAR_ID_CONTINUE:
    1306             :     case UCHAR_ID_START:
    1307             :     case UCHAR_IDEOGRAPHIC:
    1308             :     case UCHAR_IDS_BINARY_OPERATOR:
    1309             :     case UCHAR_IDS_TRINARY_OPERATOR:
    1310             :     case UCHAR_JOIN_CONTROL:
    1311             :     case UCHAR_LOGICAL_ORDER_EXCEPTION:
    1312             :     case UCHAR_LOWERCASE:
    1313             :     case UCHAR_MATH:
    1314             :     case UCHAR_NONCHARACTER_CODE_POINT:
    1315             :     case UCHAR_PATTERN_SYNTAX:
    1316             :     case UCHAR_PATTERN_WHITE_SPACE:
    1317             :     case UCHAR_QUOTATION_MARK:
    1318             :     case UCHAR_RADICAL:
    1319             :     case UCHAR_REGIONAL_INDICATOR:
    1320             :     case UCHAR_S_TERM:
    1321             :     case UCHAR_SOFT_DOTTED:
    1322             :     case UCHAR_TERMINAL_PUNCTUATION:
    1323             :     case UCHAR_UNIFIED_IDEOGRAPH:
    1324             :     case UCHAR_UPPERCASE:
    1325             :     case UCHAR_VARIATION_SELECTOR:
    1326             :     case UCHAR_WHITE_SPACE:
    1327             :     case UCHAR_XID_CONTINUE:
    1328             :     case UCHAR_XID_START:
    1329             :       return true;
    1330             :     default:
    1331             :       break;
    1332             :   }
    1333         843 :   return false;
    1334             : }
    1335             : 
    1336             : bool IsUnicodePropertyValueCharacter(char c) {
    1337             :   // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
    1338             :   //
    1339             :   // Note that using this to validate each parsed char is quite conservative.
    1340             :   // A possible alternative solution would be to only ensure the parsed
    1341             :   // property name/value candidate string does not contain '\0' characters and
    1342             :   // let ICU lookups trigger the final failure.
    1343       39082 :   if ('a' <= c && c <= 'z') return true;
    1344       10168 :   if ('A' <= c && c <= 'Z') return true;
    1345        1989 :   if ('0' <= c && c <= '9') return true;
    1346        1971 :   return (c == '_');
    1347             : }
    1348             : 
    1349             : }  // anonymous namespace
    1350             : 
    1351        4547 : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
    1352       49835 :                                           std::vector<char>* name_2) {
    1353             :   DCHECK(name_1->empty());
    1354             :   DCHECK(name_2->empty());
    1355             :   // Parse the property class as follows:
    1356             :   // - In \p{name}, 'name' is interpreted
    1357             :   //   - either as a general category property value name.
    1358             :   //   - or as a binary property name.
    1359             :   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
    1360             :   //   and 'value' is interpreted as one of the available property value names.
    1361             :   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
    1362             :   // - Loose matching is not applied.
    1363        4547 :   if (current() == '{') {
    1364             :     // Parse \p{[PropertyName=]PropertyNameValue}
    1365       71204 :     for (Advance(); current() != '}' && current() != '='; Advance()) {
    1366       62398 :       if (!IsUnicodePropertyValueCharacter(current())) return false;
    1367       31136 :       if (!has_next()) return false;
    1368       62272 :       name_1->push_back(static_cast<char>(current()));
    1369             :     }
    1370        4403 :     if (current() == '=') {
    1371       19372 :       for (Advance(); current() != '}'; Advance()) {
    1372       15766 :         if (!IsUnicodePropertyValueCharacter(current())) return false;
    1373        7874 :         if (!has_next()) return false;
    1374       15748 :         name_2->push_back(static_cast<char>(current()));
    1375             :       }
    1376        3606 :       name_2->push_back(0);  // null-terminate string.
    1377             :     }
    1378             :   } else {
    1379             :     return false;
    1380             :   }
    1381        4394 :   Advance();
    1382        8788 :   name_1->push_back(0);  // null-terminate string.
    1383             : 
    1384             :   DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
    1385             :   DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
    1386        4394 :   return true;
    1387             : }
    1388             : 
    1389        4394 : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
    1390             :                                          bool negate,
    1391             :                                          const std::vector<char>& name_1,
    1392        7164 :                                          const std::vector<char>& name_2) {
    1393        4394 :   if (name_2.empty()) {
    1394             :     // First attempt to interpret as general category property value name.
    1395             :     const char* name = name_1.data();
    1396        2591 :     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
    1397        2591 :                                 add_to, zone())) {
    1398             :       return true;
    1399             :     }
    1400             :     // Interpret "Any", "ASCII", and "Assigned".
    1401        2063 :     if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
    1402             :       return true;
    1403             :     }
    1404             :     // Then attempt to interpret as binary property name with value name 'Y'.
    1405        1748 :     UProperty property = u_getPropertyEnum(name);
    1406        1748 :     if (!IsSupportedBinaryProperty(property)) return false;
    1407         905 :     if (!IsExactPropertyAlias(name, property)) return false;
    1408             :     return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
    1409         905 :                                    zone());
    1410             :   } else {
    1411             :     // Both property name and value name are specified. Attempt to interpret
    1412             :     // the property name as enumerated property.
    1413             :     const char* property_name = name_1.data();
    1414             :     const char* value_name = name_2.data();
    1415        1803 :     UProperty property = u_getPropertyEnum(property_name);
    1416        1803 :     if (!IsExactPropertyAlias(property_name, property)) return false;
    1417        1740 :     if (property == UCHAR_GENERAL_CATEGORY) {
    1418             :       // We want to allow aggregate value names such as "Letter".
    1419             :       property = UCHAR_GENERAL_CATEGORY_MASK;
    1420        3400 :     } else if (property != UCHAR_SCRIPT &&
    1421        1700 :                property != UCHAR_SCRIPT_EXTENSIONS) {
    1422             :       return false;
    1423             :     }
    1424             :     return LookupPropertyValueName(property, value_name, negate, add_to,
    1425        1605 :                                    zone());
    1426             :   }
    1427             : }
    1428             : 
    1429         880 : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
    1430         510 :   if (!FLAG_harmony_regexp_sequence) return nullptr;
    1431             :   const char* name = name_1.data();
    1432             :   const uc32* sequence_list = nullptr;
    1433             :   JSRegExp::Flags flags = JSRegExp::kUnicode;
    1434         150 :   if (NameEquals(name, "Emoji_Flag_Sequence")) {
    1435             :     sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
    1436          90 :   } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
    1437             :     sequence_list = UnicodePropertySequences::kEmojiTagSequences;
    1438          70 :   } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
    1439             :     sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
    1440             :   }
    1441         150 :   if (sequence_list != nullptr) {
    1442             :     // TODO(yangguo): this creates huge regexp code. Alternative to this is
    1443             :     // to create a new operator that checks for these sequences at runtime.
    1444             :     RegExpBuilder builder(zone(), flags);
    1445             :     while (true) {                   // Iterate through list of sequences.
    1446      133680 :       while (*sequence_list != 0) {  // Iterate through sequence.
    1447      102660 :         builder.AddUnicodeCharacter(*sequence_list);
    1448      102660 :         sequence_list++;
    1449             :       }
    1450       31020 :       sequence_list++;
    1451       31020 :       if (*sequence_list == 0) break;
    1452             :       builder.NewAlternative();
    1453             :     }
    1454         100 :     return builder.ToRegExp();
    1455             :   }
    1456             : 
    1457          50 :   if (NameEquals(name, "Emoji_Keycap_Sequence")) {
    1458             :     // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
    1459             :     // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
    1460             :     RegExpBuilder builder(zone(), flags);
    1461             :     ZoneList<CharacterRange>* prefix_ranges =
    1462          30 :         new (zone()) ZoneList<CharacterRange>(2, zone());
    1463          30 :     prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
    1464          30 :     prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
    1465          30 :     prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
    1466             :     builder.AddCharacterClass(
    1467          60 :         new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
    1468          30 :     builder.AddCharacter(0xFE0F);
    1469          30 :     builder.AddCharacter(0x20E3);
    1470          30 :     return builder.ToRegExp();
    1471          20 :   } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
    1472             :     // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
    1473             :     // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
    1474             :     RegExpBuilder builder(zone(), flags);
    1475             :     ZoneList<CharacterRange>* modifier_base_ranges =
    1476          20 :         new (zone()) ZoneList<CharacterRange>(2, zone());
    1477             :     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
    1478          20 :                             modifier_base_ranges, zone());
    1479             :     builder.AddCharacterClass(
    1480          40 :         new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
    1481             :     ZoneList<CharacterRange>* modifier_ranges =
    1482          20 :         new (zone()) ZoneList<CharacterRange>(2, zone());
    1483             :     LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
    1484          20 :                             zone());
    1485             :     builder.AddCharacterClass(
    1486          40 :         new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
    1487          20 :     return builder.ToRegExp();
    1488             :   }
    1489             : 
    1490             :   return nullptr;
    1491             : }
    1492             : 
    1493             : #else  // V8_INTL_SUPPORT
    1494             : 
    1495             : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
    1496             :                                           std::vector<char>* name_2) {
    1497             :   return false;
    1498             : }
    1499             : 
    1500             : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
    1501             :                                          bool negate,
    1502             :                                          const std::vector<char>& name_1,
    1503             :                                          const std::vector<char>& name_2) {
    1504             :   return false;
    1505             : }
    1506             : 
    1507             : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
    1508             :   return nullptr;
    1509             : }
    1510             : 
    1511             : #endif  // V8_INTL_SUPPORT
    1512             : 
    1513       44553 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
    1514             :   uc32 x = 0;
    1515             :   int d = HexValue(current());
    1516        7458 :   if (d < 0) {
    1517             :     return false;
    1518             :   }
    1519       44553 :   while (d >= 0) {
    1520       37122 :     x = x * 16 + d;
    1521       37122 :     if (x > max_value) {
    1522             :       return false;
    1523             :     }
    1524       37095 :     Advance();
    1525             :     d = HexValue(current());
    1526             :   }
    1527        7431 :   *value = x;
    1528        7431 :   return true;
    1529             : }
    1530             : 
    1531             : 
    1532      177276 : uc32 RegExpParser::ParseClassCharacterEscape() {
    1533             :   DCHECK_EQ('\\', current());
    1534             :   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
    1535       88638 :   Advance();
    1536       88638 :   switch (current()) {
    1537             :     case 'b':
    1538          14 :       Advance();
    1539          14 :       return '\b';
    1540             :     // ControlEscape :: one of
    1541             :     //   f n r t v
    1542             :     case 'f':
    1543           0 :       Advance();
    1544           0 :       return '\f';
    1545             :     case 'n':
    1546         471 :       Advance();
    1547         471 :       return '\n';
    1548             :     case 'r':
    1549          48 :       Advance();
    1550          48 :       return '\r';
    1551             :     case 't':
    1552         186 :       Advance();
    1553         186 :       return '\t';
    1554             :     case 'v':
    1555           1 :       Advance();
    1556           1 :       return '\v';
    1557             :     case 'c': {
    1558             :       uc32 controlLetter = Next();
    1559         290 :       uc32 letter = controlLetter & ~('A' ^ 'a');
    1560             :       // Inside a character class, we also accept digits and underscore as
    1561             :       // control characters, unless with /u. See Annex B:
    1562             :       // ES#prod-annexB-ClassControlLetter
    1563         290 :       if (letter >= 'A' && letter <= 'Z') {
    1564             :         Advance(2);
    1565             :         // Control letters mapped to ASCII control characters in the range
    1566             :         // 0x00-0x1F.
    1567          30 :         return controlLetter & 0x1F;
    1568             :       }
    1569         260 :       if (unicode()) {
    1570             :         // With /u, invalid escapes are not treated as identity escapes.
    1571          18 :         ReportError(CStrVector("Invalid class escape"));
    1572          18 :         return 0;
    1573             :       }
    1574         484 :       if ((controlLetter >= '0' && controlLetter <= '9') ||
    1575         242 :           controlLetter == '_') {
    1576             :         Advance(2);
    1577         160 :         return controlLetter & 0x1F;
    1578             :       }
    1579             :       // We match JSC in reading the backslash as a literal
    1580             :       // character instead of as starting an escape.
    1581             :       // TODO(v8:6201): Not yet covered by the spec.
    1582             :       return '\\';
    1583             :     }
    1584             :     case '0':
    1585             :       // With /u, \0 is interpreted as NUL if not followed by another digit.
    1586        1270 :       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
    1587         130 :         Advance();
    1588         130 :         return 0;
    1589             :       }
    1590             :       V8_FALLTHROUGH;
    1591             :     case '1':
    1592             :     case '2':
    1593             :     case '3':
    1594             :     case '4':
    1595             :     case '5':
    1596             :     case '6':
    1597             :     case '7':
    1598             :       // For compatibility, we interpret a decimal escape that isn't
    1599             :       // a back reference (and therefore either \0 or not valid according
    1600             :       // to the specification) as a 1..3 digit octal character code.
    1601             :       // ES#prod-annexB-LegacyOctalEscapeSequence
    1602        1098 :       if (unicode()) {
    1603             :         // With /u, decimal escape is not interpreted as octal character code.
    1604          36 :         ReportError(CStrVector("Invalid class escape"));
    1605          36 :         return 0;
    1606             :       }
    1607        1062 :       return ParseOctalLiteral();
    1608             :     case 'x': {
    1609        2165 :       Advance();
    1610             :       uc32 value;
    1611        2165 :       if (ParseHexEscape(2, &value)) return value;
    1612          30 :       if (unicode()) {
    1613             :         // With /u, invalid escapes are not treated as identity escapes.
    1614           0 :         ReportError(CStrVector("Invalid escape"));
    1615           0 :         return 0;
    1616             :       }
    1617             :       // If \x is not followed by a two-digit hexadecimal, treat it
    1618             :       // as an identity escape.
    1619             :       return 'x';
    1620             :     }
    1621             :     case 'u': {
    1622       81654 :       Advance();
    1623             :       uc32 value;
    1624       81654 :       if (ParseUnicodeEscape(&value)) return value;
    1625          20 :       if (unicode()) {
    1626             :         // With /u, invalid escapes are not treated as identity escapes.
    1627           0 :         ReportError(CStrVector("Invalid unicode escape"));
    1628           0 :         return 0;
    1629             :       }
    1630             :       // If \u is not followed by a two-digit hexadecimal, treat it
    1631             :       // as an identity escape.
    1632             :       return 'u';
    1633             :     }
    1634             :     default: {
    1635             :       uc32 result = current();
    1636             :       // With /u, no identity escapes except for syntax characters and '-' are
    1637             :       // allowed. Otherwise, all identity escapes are allowed.
    1638        2721 :       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
    1639        2581 :         Advance();
    1640        2581 :         return result;
    1641             :       }
    1642           0 :       ReportError(CStrVector("Invalid escape"));
    1643           0 :       return 0;
    1644             :     }
    1645             :   }
    1646             :   return 0;
    1647             : }
    1648             : 
    1649      471205 : void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
    1650             :                                     Zone* zone,
    1651             :                                     bool add_unicode_case_equivalents,
    1652      471205 :                                     uc32* char_out, bool* is_class_escape) {
    1653             :   uc32 current_char = current();
    1654      471205 :   if (current_char == '\\') {
    1655       90919 :     switch (Next()) {
    1656             :       case 'w':
    1657             :       case 'W':
    1658             :       case 'd':
    1659             :       case 'D':
    1660             :       case 's':
    1661             :       case 'S': {
    1662             :         CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
    1663        3446 :                                        add_unicode_case_equivalents, zone);
    1664             :         Advance(2);
    1665        1723 :         *is_class_escape = true;
    1666        1723 :         return;
    1667             :       }
    1668             :       case kEndMarker:
    1669           0 :         ReportError(CStrVector("\\ at end of pattern"));
    1670           0 :         return;
    1671             :       case 'p':
    1672             :       case 'P':
    1673         620 :         if (unicode()) {
    1674         558 :           bool negate = Next() == 'P';
    1675             :           Advance(2);
    1676             :           std::vector<char> name_1, name_2;
    1677        1089 :           if (!ParsePropertyClassName(&name_1, &name_2) ||
    1678         531 :               !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
    1679         225 :             ReportError(CStrVector("Invalid property name in character class"));
    1680             :           }
    1681         558 :           *is_class_escape = true;
    1682             :           return;
    1683             :         }
    1684             :         break;
    1685             :       default:
    1686             :         break;
    1687             :     }
    1688       88638 :     *char_out = ParseClassCharacterEscape();
    1689       88638 :     *is_class_escape = false;
    1690             :   } else {
    1691      380286 :     Advance();
    1692      380286 :     *char_out = current_char;
    1693      380286 :     *is_class_escape = false;
    1694             :   }
    1695             : }
    1696             : 
    1697     2915476 : RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
    1698             :   static const char* kUnterminated = "Unterminated character class";
    1699             :   static const char* kRangeInvalid = "Invalid character class";
    1700             :   static const char* kRangeOutOfOrder = "Range out of order in character class";
    1701             : 
    1702             :   DCHECK_EQ(current(), '[');
    1703      155307 :   Advance();
    1704             :   bool is_negated = false;
    1705      155307 :   if (current() == '^') {
    1706             :     is_negated = true;
    1707       10458 :     Advance();
    1708             :   }
    1709             :   ZoneList<CharacterRange>* ranges =
    1710      155307 :       new (zone()) ZoneList<CharacterRange>(2, zone());
    1711      157004 :   bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
    1712     1058228 :   while (has_more() && current() != ']') {
    1713             :     uc32 char_1, char_2;
    1714             :     bool is_class_1, is_class_2;
    1715             :     ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
    1716      593558 :                      &is_class_1 CHECK_FAILED);
    1717      296310 :     if (current() == '-') {
    1718      174680 :       Advance();
    1719      174680 :       if (current() == kEndMarker) {
    1720             :         // If we reach the end we break out of the loop and let the
    1721             :         // following code report an error.
    1722             :         break;
    1723      174675 :       } else if (current() == ']') {
    1724         112 :         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1725          67 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1726          67 :         break;
    1727             :       }
    1728             :       ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
    1729      174608 :                        &is_class_2 CHECK_FAILED);
    1730      174608 :       if (is_class_1 || is_class_2) {
    1731             :         // Either end is an escaped character class. Treat the '-' verbatim.
    1732         176 :         if (unicode()) {
    1733             :           // ES2015 21.2.2.15.1 step 1.
    1734          54 :           return ReportError(CStrVector(kRangeInvalid));
    1735             :         }
    1736         174 :         if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1737         149 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1738         258 :         if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
    1739         149 :         continue;
    1740             :       }
    1741             :       // ES2015 21.2.2.15.1 step 6.
    1742      174432 :       if (char_1 > char_2) {
    1743         100 :         return ReportError(CStrVector(kRangeOutOfOrder));
    1744             :       }
    1745      174382 :       ranges->Add(CharacterRange::Range(char_1, char_2), zone());
    1746             :     } else {
    1747      241426 :       if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
    1748             :     }
    1749             :   }
    1750      154943 :   if (!has_more()) {
    1751          40 :     return ReportError(CStrVector(kUnterminated));
    1752             :   }
    1753      154923 :   Advance();
    1754             :   RegExpCharacterClass::CharacterClassFlags character_class_flags;
    1755      154923 :   if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
    1756             :   return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
    1757      309846 :                                            character_class_flags);
    1758             : }
    1759             : 
    1760             : 
    1761             : #undef CHECK_FAILED
    1762             : 
    1763             : 
    1764      380740 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
    1765             :                                FlatStringReader* input, JSRegExp::Flags flags,
    1766             :                                RegExpCompileData* result) {
    1767             :   DCHECK(result != nullptr);
    1768      380740 :   RegExpParser parser(input, &result->error, flags, isolate, zone);
    1769      380740 :   RegExpTree* tree = parser.ParsePattern();
    1770      380740 :   if (parser.failed()) {
    1771             :     DCHECK(tree == nullptr);
    1772             :     DCHECK(!result->error.is_null());
    1773             :   } else {
    1774             :     DCHECK(tree != nullptr);
    1775             :     DCHECK(result->error.is_null());
    1776             :     if (FLAG_trace_regexp_parser) {
    1777             :       StdoutStream os;
    1778             :       tree->Print(os, zone);
    1779             :       os << "\n";
    1780             :     }
    1781      377759 :     result->tree = tree;
    1782      377759 :     int capture_count = parser.captures_started();
    1783      584600 :     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
    1784      377759 :     result->contains_anchor = parser.contains_anchor();
    1785      377759 :     result->capture_name_map = parser.CreateCaptureNameMap();
    1786      377759 :     result->capture_count = capture_count;
    1787             :   }
    1788      380740 :   return !parser.failed();
    1789             : }
    1790             : 
    1791           0 : RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
    1792             :     : zone_(zone),
    1793             :       pending_empty_(false),
    1794             :       flags_(flags),
    1795             :       characters_(nullptr),
    1796             :       pending_surrogate_(kNoPendingSurrogate),
    1797             :       terms_(),
    1798     1486654 :       alternatives_()
    1799             : #ifdef DEBUG
    1800             :       ,
    1801             :       last_added_(ADD_NONE)
    1802             : #endif
    1803             : {
    1804           0 : }
    1805             : 
    1806             : 
    1807           0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
    1808             :   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1809             :   FlushPendingSurrogate();
    1810             :   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
    1811       64958 :   pending_surrogate_ = lead_surrogate;
    1812           0 : }
    1813             : 
    1814             : 
    1815      323490 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
    1816             :   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
    1817       64898 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1818       64708 :     uc16 lead_surrogate = pending_surrogate_;
    1819       64708 :     pending_surrogate_ = kNoPendingSurrogate;
    1820             :     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1821             :     uc32 combined =
    1822       64708 :         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
    1823       64708 :     if (NeedsDesugaringForIgnoreCase(combined)) {
    1824          60 :       AddCharacterClassForDesugaring(combined);
    1825             :     } else {
    1826       64648 :       ZoneList<uc16> surrogate_pair(2, zone());
    1827       64648 :       surrogate_pair.Add(lead_surrogate, zone());
    1828       64648 :       surrogate_pair.Add(trail_surrogate, zone());
    1829             :       RegExpAtom* atom =
    1830      129296 :           new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
    1831       64648 :       AddAtom(atom);
    1832             :     }
    1833             :   } else {
    1834         190 :     pending_surrogate_ = trail_surrogate;
    1835             :     FlushPendingSurrogate();
    1836             :   }
    1837       64898 : }
    1838             : 
    1839             : 
    1840           0 : void RegExpBuilder::FlushPendingSurrogate() {
    1841   389372674 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1842             :     DCHECK(unicode());
    1843         440 :     uc32 c = pending_surrogate_;
    1844         440 :     pending_surrogate_ = kNoPendingSurrogate;
    1845         440 :     AddCharacterClassForDesugaring(c);
    1846             :   }
    1847           0 : }
    1848             : 
    1849             : 
    1850     5247759 : void RegExpBuilder::FlushCharacters() {
    1851             :   FlushPendingSurrogate();
    1852     4448901 :   pending_empty_ = false;
    1853     4448901 :   if (characters_ != nullptr) {
    1854             :     RegExpTree* atom =
    1855      399429 :         new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
    1856      399429 :     characters_ = nullptr;
    1857      399429 :     text_.Add(atom, zone());
    1858             :     LAST(ADD_ATOM);
    1859             :   }
    1860     4448901 : }
    1861             : 
    1862             : 
    1863     4902449 : void RegExpBuilder::FlushText() {
    1864     4205709 :   FlushCharacters();
    1865             :   int num_text = text_.length();
    1866     4205709 :   if (num_text == 0) {
    1867     4205709 :     return;
    1868      567442 :   } else if (num_text == 1) {
    1869      532515 :     terms_.Add(text_.last(), zone());
    1870             :   } else {
    1871             :     RegExpText* text = new (zone()) RegExpText(zone());
    1872      223669 :     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
    1873       34927 :     terms_.Add(text, zone());
    1874             :   }
    1875             :   text_.Clear();
    1876             : }
    1877             : 
    1878             : 
    1879   768633129 : void RegExpBuilder::AddCharacter(uc16 c) {
    1880             :   FlushPendingSurrogate();
    1881   383403008 :   pending_empty_ = false;
    1882   383403008 :   if (NeedsDesugaringForIgnoreCase(c)) {
    1883         895 :     AddCharacterClassForDesugaring(c);
    1884             :   } else {
    1885   383402113 :     if (characters_ == nullptr) {
    1886     1828008 :       characters_ = new (zone()) ZoneList<uc16>(4, zone());
    1887             :     }
    1888   383402113 :     characters_->Add(c, zone());
    1889             :     LAST(ADD_CHAR);
    1890             :   }
    1891   383403008 : }
    1892             : 
    1893             : 
    1894   383455481 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
    1895   383455481 :   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    1896             :     DCHECK(unicode());
    1897       64708 :     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
    1898       64708 :     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
    1899   383432575 :   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
    1900         250 :     AddLeadSurrogate(c);
    1901   383432075 :   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
    1902         190 :     AddTrailSurrogate(c);
    1903             :   } else {
    1904   383390333 :     AddCharacter(static_cast<uc16>(c));
    1905             :   }
    1906   383455481 : }
    1907             : 
    1908        1809 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
    1909             :   // A lead or trail surrogate parsed via escape sequence will not
    1910             :   // pair up with any preceding lead or following trail surrogate.
    1911             :   FlushPendingSurrogate();
    1912        1809 :   AddUnicodeCharacter(character);
    1913             :   FlushPendingSurrogate();
    1914        1809 : }
    1915             : 
    1916         113 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
    1917             : 
    1918             : 
    1919      183310 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
    1920      183310 :   if (NeedsDesugaringForUnicode(cc)) {
    1921             :     // With /u, character class needs to be desugared, so it
    1922             :     // must be a standalone term instead of being part of a RegExpText.
    1923        4796 :     AddTerm(cc);
    1924             :   } else {
    1925      178514 :     AddAtom(cc);
    1926             :   }
    1927      183310 : }
    1928             : 
    1929        2790 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
    1930             :   AddTerm(new (zone()) RegExpCharacterClass(
    1931             :       zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
    1932        2790 :       flags_));
    1933        1395 : }
    1934             : 
    1935             : 
    1936     2707256 : void RegExpBuilder::AddAtom(RegExpTree* term) {
    1937     1353628 :   if (term->IsEmpty()) {
    1938             :     AddEmpty();
    1939     1353628 :     return;
    1940             :   }
    1941     1353628 :   if (term->IsTextElement()) {
    1942      243192 :     FlushCharacters();
    1943      243192 :     text_.Add(term, zone());
    1944             :   } else {
    1945     1110436 :     FlushText();
    1946     1110436 :     terms_.Add(term, zone());
    1947             :   }
    1948             :   LAST(ADD_ATOM);
    1949             : }
    1950             : 
    1951             : 
    1952       12382 : void RegExpBuilder::AddTerm(RegExpTree* term) {
    1953        6191 :   FlushText();
    1954        6191 :   terms_.Add(term, zone());
    1955             :   LAST(ADD_ATOM);
    1956        6191 : }
    1957             : 
    1958             : 
    1959       22604 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
    1960       11342 :   FlushText();
    1961       11342 :   if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
    1962             :     // Omit repeated assertions of the same type.
    1963         398 :     RegExpAssertion* last = terms_.last()->AsAssertion();
    1964         398 :     RegExpAssertion* next = assert->AsAssertion();
    1965       11541 :     if (last->assertion_type() == next->assertion_type()) return;
    1966             :   }
    1967       11262 :   terms_.Add(assert, zone());
    1968             :   LAST(ADD_ASSERT);
    1969             : }
    1970             : 
    1971             : 
    1972      148300 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
    1973             : 
    1974             : 
    1975     3675932 : void RegExpBuilder::FlushTerms() {
    1976     1632105 :   FlushText();
    1977             :   int num_terms = terms_.length();
    1978             :   RegExpTree* alternative;
    1979     1632105 :   if (num_terms == 0) {
    1980      330014 :     alternative = new (zone()) RegExpEmpty();
    1981     1302091 :   } else if (num_terms == 1) {
    1982             :     alternative = terms_.last();
    1983             :   } else {
    1984       81708 :     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
    1985             :   }
    1986     1632105 :   alternatives_.Add(alternative, zone());
    1987             :   terms_.Clear();
    1988             :   LAST(ADD_NONE);
    1989     1632105 : }
    1990             : 
    1991             : 
    1992      188566 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
    1993      183310 :   if (!unicode()) return false;
    1994             :   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
    1995             :   // necessarily mean that we need to desugar. It's probably nicer to have a
    1996             :   // separate pass to figure out unicode desugarings.
    1997        6084 :   if (ignore_case()) return true;
    1998        5256 :   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    1999        5256 :   CharacterRange::Canonicalize(ranges);
    2000       13449 :   for (int i = ranges->length() - 1; i >= 0; i--) {
    2001       12161 :     uc32 from = ranges->at(i).from();
    2002       12161 :     uc32 to = ranges->at(i).to();
    2003             :     // Check for non-BMP characters.
    2004       12161 :     if (to >= kNonBmpStart) return true;
    2005             :     // Check for lone surrogates.
    2006        8303 :     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
    2007             :   }
    2008             :   return false;
    2009             : }
    2010             : 
    2011             : 
    2012   383467716 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
    2013             : #ifdef V8_INTL_SUPPORT
    2014   383573856 :   if (unicode() && ignore_case()) {
    2015        1080 :     icu::UnicodeSet set(c, c);
    2016        1080 :     set.closeOver(USET_CASE_INSENSITIVE);
    2017        1080 :     set.removeAllStrings();
    2018        1080 :     return set.size() > 1;
    2019             :   }
    2020             :   // In the case where ICU is not included, we act as if the unicode flag is
    2021             :   // not set, and do not desugar.
    2022             : #endif  // V8_INTL_SUPPORT
    2023             :   return false;
    2024             : }
    2025             : 
    2026             : 
    2027     1526739 : RegExpTree* RegExpBuilder::ToRegExp() {
    2028     1483805 :   FlushTerms();
    2029             :   int num_alternatives = alternatives_.length();
    2030     1483805 :   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
    2031     1483805 :   if (num_alternatives == 1) return alternatives_.last();
    2032       42934 :   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
    2033             : }
    2034             : 
    2035     1451999 : bool RegExpBuilder::AddQuantifierToAtom(
    2036     2880628 :     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
    2037             :   FlushPendingSurrogate();
    2038     1451999 :   if (pending_empty_) {
    2039           2 :     pending_empty_ = false;
    2040           2 :     return true;
    2041             :   }
    2042             :   RegExpTree* atom;
    2043     1451997 :   if (characters_ != nullptr) {
    2044             :     DCHECK(last_added_ == ADD_CHAR);
    2045             :     // Last atom was character.
    2046             :     Vector<const uc16> char_vector = characters_->ToConstVector();
    2047             :     int num_chars = char_vector.length();
    2048     1428276 :     if (num_chars > 1) {
    2049         567 :       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
    2050        1134 :       text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
    2051             :       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
    2052             :     }
    2053     1428276 :     characters_ = nullptr;
    2054     1428276 :     atom = new (zone()) RegExpAtom(char_vector, flags_);
    2055     1428276 :     FlushText();
    2056       23721 :   } else if (text_.length() > 0) {
    2057             :     DCHECK(last_added_ == ADD_ATOM);
    2058       16284 :     atom = text_.RemoveLast();
    2059       16284 :     FlushText();
    2060        7437 :   } else if (terms_.length() > 0) {
    2061             :     DCHECK(last_added_ == ADD_ATOM);
    2062        7437 :     atom = terms_.RemoveLast();
    2063        7437 :     if (atom->IsLookaround()) {
    2064             :       // With /u, lookarounds are not quantifiable.
    2065         239 :       if (unicode()) return false;
    2066             :       // Lookbehinds are not quantifiable.
    2067         193 :       if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
    2068             :         return false;
    2069             :       }
    2070             :     }
    2071        7364 :     if (atom->max_match() == 0) {
    2072             :       // Guaranteed to only match an empty string.
    2073             :       LAST(ADD_TERM);
    2074         236 :       if (min == 0) {
    2075             :         return true;
    2076             :       }
    2077          97 :       terms_.Add(atom, zone());
    2078          97 :       return true;
    2079             :     }
    2080             :   } else {
    2081             :     // Only call immediately after adding an atom or character!
    2082           0 :     UNREACHABLE();
    2083             :   }
    2084     1451688 :   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
    2085     1451688 :              zone());
    2086             :   LAST(ADD_TERM);
    2087     1451688 :   return true;
    2088             : }
    2089             : 
    2090             : }  // namespace internal
    2091      183867 : }  // namespace v8

Generated by: LCOV version 1.10