LCOV - app.info - src/regexp/regexp-parser.cc

LCOV - code coverage report

Current view:	top level - src/regexp - regexp-parser.cc (source / functions)		Hit	Total	Coverage
Test:	app.info	Lines:	730	753	96.9 %
Date:	2017-04-26	Functions:	49	60	81.7 %

          Line data    Source code

       1             : // Copyright 2016 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/regexp-parser.h"
       6             : 
       7             : #include "src/char-predicates-inl.h"
       8             : #include "src/factory.h"
       9             : #include "src/isolate.h"
      10             : #include "src/objects-inl.h"
      11             : #include "src/ostreams.h"
      12             : #include "src/regexp/jsregexp.h"
      13             : #include "src/utils.h"
      14             : 
      15             : #ifdef V8_INTL_SUPPORT
      16             : #include "unicode/uniset.h"
      17             : #endif  // V8_INTL_SUPPORT
      18             : 
      19             : namespace v8 {
      20             : namespace internal {
      21             : 
      22      459845 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
      23             :                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
      24             :     : isolate_(isolate),
      25             :       zone_(zone),
      26             :       error_(error),
      27             :       captures_(NULL),
      28             :       named_captures_(NULL),
      29             :       named_back_references_(NULL),
      30             :       in_(in),
      31             :       current_(kEndMarker),
      32             :       dotall_(flags & JSRegExp::kDotAll),
      33             :       ignore_case_(flags & JSRegExp::kIgnoreCase),
      34             :       multiline_(flags & JSRegExp::kMultiline),
      35             :       unicode_(flags & JSRegExp::kUnicode),
      36             :       next_pos_(0),
      37             :       captures_started_(0),
      38             :       capture_count_(0),
      39             :       has_more_(true),
      40             :       simple_(false),
      41             :       contains_anchor_(false),
      42             :       is_scanned_for_captures_(false),
      43             :       has_named_captures_(false),
      44     2299225 :       failed_(false) {
      45             :   DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
      46      459845 :   Advance();
      47      459845 : }
      48             : 
      49    46246980 : inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
      50    15415660 :   int position = next_pos_;
      51             :   uc32 c0 = in()->Get(position);
      52    15415660 :   position++;
      53             :   const bool try_combine_surrogate_pairs =
      54    15415660 :       (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
      55    15656524 :   if (try_combine_surrogate_pairs && position < in()->length() &&
      56             :       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
      57             :     uc16 c1 = in()->Get(position);
      58         791 :     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
      59         531 :       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
      60         531 :       position++;
      61             :     }
      62             :   }
      63    15415660 :   if (update_position) next_pos_ = position;
      64    15415660 :   return c0;
      65             : }
      66             : 
      67             : 
      68      200090 : uc32 RegExpParser::Next() {
      69      200090 :   if (has_next()) {
      70      200048 :     return ReadNext(false, ScanMode::DEFAULT);
      71             :   } else {
      72             :     return kEndMarker;
      73             :   }
      74             : }
      75             : 
      76    46106943 : void RegExpParser::Advance(ScanMode mode) {
      77    15675370 :   if (has_next()) {
      78             :     StackLimitCheck check(isolate());
      79    15215961 :     if (check.HasOverflowed()) {
      80         349 :       if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
      81             :       ReportError(CStrVector(
      82         698 :           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
      83    15215612 :     } else if (zone()->excess_allocation()) {
      84           0 :       ReportError(CStrVector("Regular expression too large"));
      85             :     } else {
      86    15215612 :       current_ = ReadNext(true, mode);
      87             :     }
      88             :   } else {
      89      459409 :     current_ = kEndMarker;
      90             :     // Advance so that position() points to 1-after-the-last-character. This is
      91             :     // important so that Reset() to this position works correctly.
      92      459409 :     next_pos_ = in()->length() + 1;
      93      459409 :     has_more_ = false;
      94             :   }
      95    15675370 : }
      96             : 
      97             : 
      98        3679 : void RegExpParser::Reset(int pos) {
      99        3679 :   next_pos_ = pos;
     100        3679 :   has_more_ = (pos < in()->length());
     101        3679 :   Advance();
     102           0 : }
     103             : 
     104           0 : void RegExpParser::Advance(int dist, ScanMode mode) {
     105       83487 :   next_pos_ += dist - 1;
     106       83487 :   Advance(mode);
     107           0 : }
     108             : 
     109             : 
     110      264595 : bool RegExpParser::simple() { return simple_; }
     111             : 
     112           0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
     113         122 :   switch (c) {
     114             :     case '^':
     115             :     case '$':
     116             :     case '\\':
     117             :     case '.':
     118             :     case '*':
     119             :     case '+':
     120             :     case '?':
     121             :     case '(':
     122             :     case ')':
     123             :     case '[':
     124             :     case ']':
     125             :     case '{':
     126             :     case '}':
     127             :     case '|':
     128             :     case '/':
     129             :       return true;
     130             :     default:
     131             :       break;
     132             :   }
     133           0 :   return false;
     134             : }
     135             : 
     136             : 
     137       11415 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
     138        3853 :   if (failed_) return NULL;  // Do not overwrite any existing error.
     139        3781 :   failed_ = true;
     140             :   *error_ = isolate()
     141             :                 ->factory()
     142             :                 ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
     143       11343 :                 .ToHandleChecked();
     144             :   // Zip to the end to make sure the no more input is read.
     145        3781 :   current_ = kEndMarker;
     146        3781 :   next_pos_ = in()->length();
     147        3781 :   return NULL;
     148             : }
     149             : 
     150             : 
     151             : #define CHECK_FAILED /**/); \
     152             :   if (failed_) return NULL; \
     153             :   ((void)0
     154             : 
     155             : 
     156             : // Pattern ::
     157             : //   Disjunction
     158      724440 : RegExpTree* RegExpParser::ParsePattern() {
     159      459845 :   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
     160      456218 :   PatchNamedBackReferences(CHECK_FAILED);
     161             :   DCHECK(!has_more());
     162             :   // If the result of parsing is a literal string atom, and it has the
     163             :   // same length as the input, then the atom is identical to the input.
     164      720659 :   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
     165      256506 :     simple_ = true;
     166             :   }
     167      456064 :   return result;
     168             : }
     169             : 
     170             : 
     171             : // Disjunction ::
     172             : //   Alternative
     173             : //   Alternative | Disjunction
     174             : // Alternative ::
     175             : //   [empty]
     176             : //   Term Alternative
     177             : // Term ::
     178             : //   Assertion
     179             : //   Atom
     180             : //   Atom Quantifier
     181    42178989 : RegExpTree* RegExpParser::ParseDisjunction() {
     182             :   // Used to store current state while parsing subexpressions.
     183             :   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
     184      919690 :                                   nullptr, ignore_case(), unicode(), zone());
     185    10525672 :   RegExpParserState* state = &initial_state;
     186             :   // Cache the builder in a local variable for quick access.
     187      459845 :   RegExpBuilder* builder = initial_state.builder();
     188             :   while (true) {
     189    12253277 :     switch (current()) {
     190             :       case kEndMarker:
     191      456603 :         if (state->IsSubexpression()) {
     192             :           // Inside a parenthesized group when hitting end of input.
     193      459977 :           return ReportError(CStrVector("Unterminated group"));
     194             :         }
     195             :         DCHECK_EQ(INITIAL, state->group_type());
     196             :         // Parsing completed successfully.
     197      456471 :         return builder->ToRegExp();
     198             :       case ')': {
     199     1683861 :         if (!state->IsSubexpression()) {
     200          19 :           return ReportError(CStrVector("Unmatched ')'"));
     201             :         }
     202             :         DCHECK_NE(INITIAL, state->group_type());
     203             : 
     204     1683842 :         Advance();
     205             :         // End disjunction parsing and convert builder content to new single
     206             :         // regexp atom.
     207     1683842 :         RegExpTree* body = builder->ToRegExp();
     208             : 
     209             :         int end_capture_index = captures_started();
     210             : 
     211             :         int capture_index = state->capture_index();
     212             :         SubexpressionType group_type = state->group_type();
     213             : 
     214             :         // Build result of subexpression.
     215     1683842 :         if (group_type == CAPTURE) {
     216     1644448 :           if (state->IsNamedCapture()) {
     217             :             CreateNamedCaptureAtIndex(state->capture_name(),
     218        1721 :                                       capture_index CHECK_FAILED);
     219             :           }
     220     1644385 :           RegExpCapture* capture = GetCapture(capture_index);
     221             :           capture->set_body(body);
     222             :           body = capture;
     223       39394 :         } else if (group_type == GROUPING) {
     224             :           body = new (zone()) RegExpGroup(body);
     225             :         } else {
     226             :           DCHECK(group_type == POSITIVE_LOOKAROUND ||
     227             :                  group_type == NEGATIVE_LOOKAROUND);
     228        4471 :           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
     229             :           body = new (zone()) RegExpLookaround(
     230             :               body, is_positive, end_capture_index - capture_index,
     231        4471 :               capture_index, state->lookaround_type());
     232             :         }
     233             : 
     234             :         // Restore previous state.
     235             :         state = state->previous_state();
     236             :         builder = state->builder();
     237             : 
     238     1683779 :         builder->AddAtom(body);
     239             :         // For compatability with JSC and ES3, we allow quantifiers after
     240             :         // lookaheads, and break in all cases.
     241     1683779 :         break;
     242             :       }
     243             :       case '|': {
     244      152158 :         Advance();
     245             :         builder->NewAlternative();
     246     9492103 :         continue;
     247             :       }
     248             :       case '*':
     249             :       case '+':
     250             :       case '?':
     251         155 :         return ReportError(CStrVector("Nothing to repeat"));
     252             :       case '^': {
     253       16405 :         Advance();
     254       16405 :         if (multiline()) {
     255             :           builder->AddAssertion(
     256        1599 :               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
     257             :         } else {
     258             :           builder->AddAssertion(
     259       14806 :               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
     260             :           set_contains_anchor();
     261             :         }
     262             :         continue;
     263             :       }
     264             :       case '$': {
     265        7666 :         Advance();
     266             :         RegExpAssertion::AssertionType assertion_type =
     267             :             multiline() ? RegExpAssertion::END_OF_LINE
     268        7666 :                         : RegExpAssertion::END_OF_INPUT;
     269        7666 :         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
     270        7666 :         continue;
     271             :       }
     272             :       case '.': {
     273       30896 :         Advance();
     274             :         ZoneList<CharacterRange>* ranges =
     275       30896 :             new (zone()) ZoneList<CharacterRange>(2, zone());
     276             : 
     277       30896 :         if (dotall()) {
     278             :           // Everything.
     279             :           DCHECK(FLAG_harmony_regexp_dotall);
     280          48 :           CharacterRange::AddClassEscape('*', ranges, false, zone());
     281             :         } else {
     282             :           // Everything except \x0a, \x0d, \u2028 and \u2029
     283       30848 :           CharacterRange::AddClassEscape('.', ranges, false, zone());
     284             :         }
     285             : 
     286             :         RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
     287       30896 :         builder->AddCharacterClass(cc);
     288       30896 :         break;
     289             :       }
     290             :       case '(': {
     291             :         SubexpressionType subexpr_type = CAPTURE;
     292             :         RegExpLookaround::Type lookaround_type = state->lookaround_type();
     293             :         bool is_named_capture = false;
     294     1684609 :         Advance();
     295     1684609 :         if (current() == '?') {
     296       41718 :           switch (Next()) {
     297             :             case ':':
     298             :               subexpr_type = GROUPING;
     299             :               Advance(2);
     300             :               break;
     301             :             case '=':
     302             :               lookaround_type = RegExpLookaround::LOOKAHEAD;
     303             :               subexpr_type = POSITIVE_LOOKAROUND;
     304             :               Advance(2);
     305             :               break;
     306             :             case '!':
     307             :               lookaround_type = RegExpLookaround::LOOKAHEAD;
     308             :               subexpr_type = NEGATIVE_LOOKAROUND;
     309             :               Advance(2);
     310             :               break;
     311             :             case '<':
     312        3956 :               Advance();
     313        3956 :               if (FLAG_harmony_regexp_lookbehind) {
     314        3907 :                 if (Next() == '=') {
     315             :                   subexpr_type = POSITIVE_LOOKAROUND;
     316             :                   lookaround_type = RegExpLookaround::LOOKBEHIND;
     317             :                   Advance(2);
     318             :                   break;
     319        2506 :                 } else if (Next() == '!') {
     320             :                   subexpr_type = NEGATIVE_LOOKAROUND;
     321             :                   lookaround_type = RegExpLookaround::LOOKBEHIND;
     322             :                   Advance(2);
     323             :                   break;
     324             :                 }
     325             :               }
     326        2253 :               if (FLAG_harmony_regexp_named_captures) {
     327        2253 :                 has_named_captures_ = true;
     328             :                 is_named_capture = true;
     329        2253 :                 break;
     330             :               }
     331             :             // Fall through.
     332             :             default:
     333          25 :               return ReportError(CStrVector("Invalid group"));
     334             :           }
     335             :         }
     336             : 
     337             :         const ZoneVector<uc16>* capture_name = nullptr;
     338     1684584 :         if (subexpr_type == CAPTURE) {
     339     1645144 :           if (captures_started_ >= kMaxCaptures) {
     340           7 :             return ReportError(CStrVector("Too many captures"));
     341             :           }
     342     1645137 :           captures_started_++;
     343             : 
     344     1645137 :           if (is_named_capture) {
     345        2253 :             capture_name = ParseCaptureGroupName(CHECK_FAILED);
     346             :           }
     347             :         }
     348             :         // Store current state and begin new disjunction parsing.
     349             :         state = new (zone()) RegExpParserState(
     350             :             state, subexpr_type, lookaround_type, captures_started_,
     351     5052177 :             capture_name, ignore_case(), unicode(), zone());
     352             :         builder = state->builder();
     353     1684059 :         continue;
     354             :       }
     355             :       case '[': {
     356      167245 :         RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
     357      166951 :         builder->AddCharacterClass(cc->AsCharacterClass());
     358      166951 :         break;
     359             :       }
     360             :       // Atom ::
     361             :       //   \ AtomEscape
     362             :       case '\\':
     363       75112 :         switch (Next()) {
     364             :           case kEndMarker:
     365           7 :             return ReportError(CStrVector("\\ at end of pattern"));
     366             :           case 'b':
     367             :             Advance(2);
     368             :             builder->AddAssertion(
     369         627 :                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
     370         627 :             continue;
     371             :           case 'B':
     372             :             Advance(2);
     373             :             builder->AddAssertion(
     374         413 :                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
     375         413 :             continue;
     376             :           // AtomEscape ::
     377             :           //   CharacterClassEscape
     378             :           //
     379             :           // CharacterClassEscape :: one of
     380             :           //   d D s S w W
     381             :           case 'd':
     382             :           case 'D':
     383             :           case 's':
     384             :           case 'S':
     385             :           case 'w':
     386             :           case 'W': {
     387       22975 :             uc32 c = Next();
     388             :             Advance(2);
     389             :             ZoneList<CharacterRange>* ranges =
     390       22975 :                 new (zone()) ZoneList<CharacterRange>(2, zone());
     391             :             CharacterRange::AddClassEscape(c, ranges,
     392       23463 :                                            unicode() && ignore_case(), zone());
     393             :             RegExpCharacterClass* cc =
     394             :                 new (zone()) RegExpCharacterClass(ranges);
     395       22975 :             builder->AddCharacterClass(cc);
     396       22975 :             break;
     397             :           }
     398             :           case 'p':
     399             :           case 'P': {
     400        6057 :             uc32 p = Next();
     401             :             Advance(2);
     402        6057 :             if (unicode()) {
     403        5864 :               if (FLAG_harmony_regexp_property) {
     404             :                 ZoneList<CharacterRange>* ranges =
     405        5850 :                     new (zone()) ZoneList<CharacterRange>(2, zone());
     406        5850 :                 if (!ParsePropertyClass(ranges, p == 'P')) {
     407        1176 :                   return ReportError(CStrVector("Invalid property name"));
     408             :                 }
     409             :                 RegExpCharacterClass* cc =
     410             :                     new (zone()) RegExpCharacterClass(ranges);
     411        4674 :                 builder->AddCharacterClass(cc);
     412             :               } else {
     413             :                 // With /u, no identity escapes except for syntax characters
     414             :                 // are allowed. Otherwise, all identity escapes are allowed.
     415          14 :                 return ReportError(CStrVector("Invalid escape"));
     416             :               }
     417             :             } else {
     418         193 :               builder->AddCharacter(p);
     419             :             }
     420             :             break;
     421             :           }
     422             :           case '1':
     423             :           case '2':
     424             :           case '3':
     425             :           case '4':
     426             :           case '5':
     427             :           case '6':
     428             :           case '7':
     429             :           case '8':
     430             :           case '9': {
     431        6931 :             int index = 0;
     432        7052 :             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
     433        6922 :             if (is_backref) {
     434       12518 :               if (state->IsInsideCaptureGroup(index)) {
     435             :                 // The back reference is inside the capture group it refers to.
     436             :                 // Nothing can possibly have been captured yet, so we use empty
     437             :                 // instead. This ensures that, when checking a back reference,
     438             :                 // the capture registers of the referenced capture are either
     439             :                 // both set or both cleared.
     440             :                 builder->AddEmpty();
     441             :               } else {
     442        6070 :                 RegExpCapture* capture = GetCapture(index);
     443             :                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
     444        6070 :                 builder->AddAtom(atom);
     445             :               }
     446        6319 :               break;
     447             :             }
     448             :             // With /u, no identity escapes except for syntax characters
     449             :             // are allowed. Otherwise, all identity escapes are allowed.
     450         663 :             if (unicode()) {
     451         112 :               return ReportError(CStrVector("Invalid escape"));
     452             :             }
     453         551 :             uc32 first_digit = Next();
     454         551 :             if (first_digit == '8' || first_digit == '9') {
     455          60 :               builder->AddCharacter(first_digit);
     456             :               Advance(2);
     457             :               break;
     458             :             }
     459             :           }
     460             :           // Fall through.
     461             :           case '0': {
     462        3929 :             Advance();
     463        3929 :             if (unicode() && Next() >= '0' && Next() <= '9') {
     464             :               // With /u, decimal escape with leading 0 are not parsed as octal.
     465          29 :               return ReportError(CStrVector("Invalid decimal escape"));
     466             :             }
     467        3900 :             uc32 octal = ParseOctalLiteral();
     468        3900 :             builder->AddCharacter(octal);
     469        3900 :             break;
     470             :           }
     471             :           // ControlEscape :: one of
     472             :           //   f n r t v
     473             :           case 'f':
     474             :             Advance(2);
     475          15 :             builder->AddCharacter('\f');
     476          15 :             break;
     477             :           case 'n':
     478             :             Advance(2);
     479         563 :             builder->AddCharacter('\n');
     480         563 :             break;
     481             :           case 'r':
     482             :             Advance(2);
     483          97 :             builder->AddCharacter('\r');
     484          97 :             break;
     485             :           case 't':
     486             :             Advance(2);
     487          17 :             builder->AddCharacter('\t');
     488          17 :             break;
     489             :           case 'v':
     490             :             Advance(2);
     491          37 :             builder->AddCharacter('\v');
     492          37 :             break;
     493             :           case 'c': {
     494         294 :             Advance();
     495         294 :             uc32 controlLetter = Next();
     496             :             // Special case if it is an ASCII letter.
     497             :             // Convert lower case letters to uppercase.
     498         294 :             uc32 letter = controlLetter & ~('a' ^ 'A');
     499         294 :             if (letter < 'A' || 'Z' < letter) {
     500             :               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
     501             :               // Read the backslash as a literal character instead of as
     502             :               // starting an escape.
     503             :               // ES#prod-annexB-ExtendedPatternCharacter
     504         161 :               if (unicode()) {
     505             :                 // With /u, invalid escapes are not treated as identity escapes.
     506          29 :                 return ReportError(CStrVector("Invalid unicode escape"));
     507             :               }
     508         132 :               builder->AddCharacter('\\');
     509             :             } else {
     510             :               Advance(2);
     511         133 :               builder->AddCharacter(controlLetter & 0x1f);
     512             :             }
     513             :             break;
     514             :           }
     515             :           case 'x': {
     516             :             Advance(2);
     517             :             uc32 value;
     518         327 :             if (ParseHexEscape(2, &value)) {
     519         188 :               builder->AddCharacter(value);
     520         139 :             } else if (!unicode()) {
     521          83 :               builder->AddCharacter('x');
     522             :             } else {
     523             :               // With /u, invalid escapes are not treated as identity escapes.
     524          56 :               return ReportError(CStrVector("Invalid escape"));
     525             :             }
     526         271 :             break;
     527             :           }
     528             :           case 'u': {
     529             :             Advance(2);
     530             :             uc32 value;
     531        2143 :             if (ParseUnicodeEscape(&value)) {
     532        1891 :               builder->AddEscapedUnicodeCharacter(value);
     533         252 :             } else if (!unicode()) {
     534         140 :               builder->AddCharacter('u');
     535             :             } else {
     536             :               // With /u, invalid escapes are not treated as identity escapes.
     537         112 :               return ReportError(CStrVector("Invalid unicode escape"));
     538             :             }
     539        2031 :             break;
     540             :           }
     541             :           case 'k':
     542             :             // Either an identity escape or a named back-reference.  The two
     543             :             // interpretations are mutually exclusive: '\k' is interpreted as
     544             :             // an identity escape for non-unicode patterns without named
     545             :             // capture groups, and as the beginning of a named back-reference
     546             :             // in all other cases.
     547        2022 :             if (FLAG_harmony_regexp_named_captures &&
     548         296 :                 (unicode() || HasNamedCaptures())) {
     549             :               Advance(2);
     550         612 :               ParseNamedBackReference(builder, state CHECK_FAILED);
     551             :               break;
     552             :             }
     553             :           // Fall through.
     554             :           default:
     555       30559 :             Advance();
     556             :             // With /u, no identity escapes except for syntax characters
     557             :             // are allowed. Otherwise, all identity escapes are allowed.
     558       30613 :             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
     559       30517 :               builder->AddCharacter(current());
     560       30517 :               Advance();
     561             :             } else {
     562          42 :               return ReportError(CStrVector("Invalid escape"));
     563             :             }
     564       30517 :             break;
     565             :         }
     566             :         break;
     567             :       case '{': {
     568             :         int dummy;
     569         709 :         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
     570         644 :         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
     571             :         // Fall through.
     572             :       }
     573             :       case '}':
     574             :       case ']':
     575        1241 :         if (unicode()) {
     576          43 :           return ReportError(CStrVector("Lone quantifier brackets"));
     577             :         }
     578             :       // Fall through.
     579             :       default:
     580     7978473 :         builder->AddUnicodeCharacter(current());
     581     7978473 :         Advance();
     582     7978473 :         break;
     583             :     }  // end switch(current())
     584             : 
     585             :     int min;
     586             :     int max;
     587     9932445 :     switch (current()) {
     588             :       // QuantifierPrefix ::
     589             :       //   *
     590             :       //   +
     591             :       //   ?
     592             :       //   {
     593             :       case '*':
     594       47764 :         min = 0;
     595       47764 :         max = RegExpTree::kInfinity;
     596       47764 :         Advance();
     597       47764 :         break;
     598             :       case '+':
     599       12591 :         min = 1;
     600       12591 :         max = RegExpTree::kInfinity;
     601       12591 :         Advance();
     602       12591 :         break;
     603             :       case '?':
     604     2229230 :         min = 0;
     605     2229230 :         max = 1;
     606     2229230 :         Advance();
     607     2229230 :         break;
     608             :       case '{':
     609       12482 :         if (ParseIntervalQuantifier(&min, &max)) {
     610       11776 :           if (max < min) {
     611             :             return ReportError(
     612           2 :                 CStrVector("numbers out of order in {} quantifier"));
     613             :           }
     614             :           break;
     615         706 :         } else if (unicode()) {
     616             :           // With /u, incomplete quantifiers are not allowed.
     617         309 :           return ReportError(CStrVector("Incomplete quantifier"));
     618             :         }
     619             :         continue;
     620             :       default:
     621             :         continue;
     622             :     }
     623             :     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
     624     2301359 :     if (current() == '?') {
     625             :       quantifier_type = RegExpQuantifier::NON_GREEDY;
     626        7685 :       Advance();
     627             :     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
     628             :       // FLAG_regexp_possessive_quantifier is a debug-only flag.
     629             :       quantifier_type = RegExpQuantifier::POSSESSIVE;
     630             :       Advance();
     631             :     }
     632     2301359 :     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
     633          30 :       return ReportError(CStrVector("Invalid quantifier"));
     634             :     }
     635             :   }
     636             : }
     637             : 
     638             : 
     639             : #ifdef DEBUG
     640             : // Currently only used in an DCHECK.
     641             : static bool IsSpecialClassEscape(uc32 c) {
     642             :   switch (c) {
     643             :     case 'd':
     644             :     case 'D':
     645             :     case 's':
     646             :     case 'S':
     647             :     case 'w':
     648             :     case 'W':
     649             :       return true;
     650             :     default:
     651             :       return false;
     652             :   }
     653             : }
     654             : #endif
     655             : 
     656             : 
     657             : // In order to know whether an escape is a backreference or not we have to scan
     658             : // the entire regexp and find the number of capturing parentheses.  However we
     659             : // don't want to scan the regexp twice unless it is necessary.  This mini-parser
     660             : // is called when needed.  It can see the difference between capturing and
     661             : // noncapturing parentheses and can skip character classes and backslash-escaped
     662             : // characters.
     663       10707 : void RegExpParser::ScanForCaptures() {
     664             :   DCHECK(!is_scanned_for_captures_);
     665             :   const int saved_position = position();
     666             :   // Start with captures started previous to current position
     667             :   int capture_count = captures_started();
     668             :   // Add count of captures after this position.
     669             :   int n;
     670        8327 :   while ((n = current()) != kEndMarker) {
     671        6123 :     Advance();
     672        6123 :     switch (n) {
     673             :       case '\\':
     674         835 :         Advance();
     675         835 :         break;
     676             :       case '[': {
     677             :         int c;
     678         148 :         while ((c = current()) != kEndMarker) {
     679         148 :           Advance();
     680         148 :           if (c == '\\') {
     681           0 :             Advance();
     682             :           } else {
     683         148 :             if (c == ']') break;
     684             :           }
     685             :         }
     686             :         break;
     687             :       }
     688             :       case '(':
     689         800 :         if (current() == '?') {
     690             :           // At this point we could be in
     691             :           // * a non-capturing group '(:',
     692             :           // * a lookbehind assertion '(?<=' '(?<!'
     693             :           // * or a named capture '(?<'.
     694             :           //
     695             :           // Of these, only named captures are capturing groups.
     696         217 :           if (!FLAG_harmony_regexp_named_captures) break;
     697             : 
     698         172 :           Advance();
     699         172 :           if (current() != '<') break;
     700             : 
     701         158 :           if (FLAG_harmony_regexp_lookbehind) {
     702         158 :             Advance();
     703         158 :             if (current() == '=' || current() == '!') break;
     704             :           }
     705             : 
     706             :           // Found a possible named capture. It could turn out to be a syntax
     707             :           // error (e.g. an unterminated or invalid name), but that distinction
     708             :           // does not matter for our purposes.
     709         106 :           has_named_captures_ = true;
     710             :         }
     711         689 :         capture_count++;
     712         689 :         break;
     713             :     }
     714             :   }
     715        1102 :   capture_count_ = capture_count;
     716        1102 :   is_scanned_for_captures_ = true;
     717             :   Reset(saved_position);
     718        1102 : }
     719             : 
     720             : 
     721       21538 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
     722             :   DCHECK_EQ('\\', current());
     723             :   DCHECK('1' <= Next() && Next() <= '9');
     724             :   // Try to parse a decimal literal that is no greater than the total number
     725             :   // of left capturing parentheses in the input.
     726             :   int start = position();
     727        6931 :   int value = Next() - '0';
     728             :   Advance(2);
     729             :   while (true) {
     730             :     uc32 c = current();
     731        7701 :     if (IsDecimalDigit(c)) {
     732         795 :       value = 10 * value + (c - '0');
     733         795 :       if (value > kMaxCaptures) {
     734             :         Reset(start);
     735          25 :         return false;
     736             :       }
     737         770 :       Advance();
     738             :     } else {
     739             :       break;
     740             :     }
     741             :   }
     742        6906 :   if (value > captures_started()) {
     743        1178 :     if (!is_scanned_for_captures_) ScanForCaptures();
     744        1178 :     if (value > capture_count_) {
     745             :       Reset(start);
     746         647 :       return false;
     747             :     }
     748             :   }
     749        6259 :   *index_out = value;
     750        7029 :   return true;
     751             : }
     752             : 
     753        3359 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
     754        3359 :   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
     755        6574 :     v->push_back(code_unit);
     756             :   } else {
     757         144 :     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
     758         144 :     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
     759             :   }
     760        3359 : }
     761             : 
     762        9332 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
     763             :   DCHECK(FLAG_harmony_regexp_named_captures);
     764             :   DCHECK_EQ(current(), '<');
     765             : 
     766             :   ZoneVector<uc16>* name =
     767        2802 :       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
     768             : 
     769             :   // Capture names can always contain surrogate pairs, and we need to scan
     770             :   // accordingly.
     771             :   const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
     772        2802 :   Advance(scan_mode);
     773             : 
     774             :   bool at_start = true;
     775             :   while (true) {
     776        6161 :     uc32 c = current();
     777        6161 :     Advance(scan_mode);
     778             : 
     779             :     // Convert unicode escapes.
     780        6530 :     if (c == '\\' && current() == 'u') {
     781         341 :       Advance(scan_mode);
     782         341 :       if (!ParseUnicodeEscape(&c)) {
     783          91 :         ReportError(CStrVector("Invalid Unicode escape sequence"));
     784          91 :         return nullptr;
     785             :       }
     786             :     }
     787             : 
     788             :     // The backslash char is misclassified as both ID_Start and ID_Continue.
     789        6070 :     if (c == '\\') {
     790          28 :       ReportError(CStrVector("Invalid capture group name"));
     791          28 :       return nullptr;
     792             :     }
     793             : 
     794        6042 :     if (at_start) {
     795        2753 :       if (!IdentifierStart::Is(c)) {
     796         154 :         ReportError(CStrVector("Invalid capture group name"));
     797         154 :         return nullptr;
     798             :       }
     799        2599 :       push_code_unit(name, c);
     800             :       at_start = false;
     801             :     } else {
     802        3289 :       if (c == '>') {
     803             :         break;
     804        1082 :       } else if (IdentifierPart::Is(c)) {
     805         760 :         push_code_unit(name, c);
     806             :       } else {
     807         322 :         ReportError(CStrVector("Invalid capture group name"));
     808         322 :         return nullptr;
     809             :       }
     810             :     }
     811             :   }
     812             : 
     813        5566 :   return name;
     814             : }
     815             : 
     816        1721 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
     817        2926 :                                              int index) {
     818             :   DCHECK(FLAG_harmony_regexp_named_captures);
     819             :   DCHECK(0 < index && index <= captures_started_);
     820             :   DCHECK_NOT_NULL(name);
     821             : 
     822        1721 :   if (named_captures_ == nullptr) {
     823        1268 :     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
     824             :   } else {
     825             :     // Check for duplicates and bail if we find any.
     826             :     // TODO(jgruber): O(n^2).
     827        1398 :     for (const auto& named_capture : *named_captures_) {
     828         555 :       if (*named_capture->name() == *name) {
     829          63 :         ReportError(CStrVector("Duplicate capture group name"));
     830          63 :         return false;
     831             :       }
     832             :     }
     833             :   }
     834             : 
     835        1658 :   RegExpCapture* capture = GetCapture(index);
     836             :   DCHECK(capture->name() == nullptr);
     837             : 
     838             :   capture->set_name(name);
     839        1658 :   named_captures_->Add(capture, zone());
     840             : 
     841        1658 :   return true;
     842             : }
     843             : 
     844         612 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
     845        1822 :                                            RegExpParserState* state) {
     846             :   // The parser is assumed to be on the '<' in \k<name>.
     847         612 :   if (current() != '<') {
     848          63 :     ReportError(CStrVector("Invalid named reference"));
     849          63 :     return false;
     850             :   }
     851             : 
     852         549 :   const ZoneVector<uc16>* name = ParseCaptureGroupName();
     853         549 :   if (name == nullptr) {
     854             :     return false;
     855             :   }
     856             : 
     857         472 :   if (state->IsInsideCaptureGroup(name)) {
     858             :     builder->AddEmpty();
     859             :   } else {
     860             :     RegExpBackReference* atom = new (zone()) RegExpBackReference();
     861             :     atom->set_name(name);
     862             : 
     863         432 :     builder->AddAtom(atom);
     864             : 
     865         432 :     if (named_back_references_ == nullptr) {
     866             :       named_back_references_ =
     867         346 :           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
     868             :     }
     869         432 :     named_back_references_->Add(atom, zone());
     870             :   }
     871             : 
     872             :   return true;
     873             : }
     874             : 
     875      456218 : void RegExpParser::PatchNamedBackReferences() {
     876      456218 :   if (named_back_references_ == nullptr) return;
     877             : 
     878         346 :   if (named_captures_ == nullptr) {
     879          21 :     ReportError(CStrVector("Invalid named capture referenced"));
     880          21 :     return;
     881             :   }
     882             : 
     883             :   // Look up and patch the actual capture for each named back reference.
     884             :   // TODO(jgruber): O(n^2), optimize if necessary.
     885             : 
     886         881 :   for (int i = 0; i < named_back_references_->length(); i++) {
     887        1541 :     RegExpBackReference* ref = named_back_references_->at(i);
     888             : 
     889             :     int index = -1;
     890        1071 :     for (const auto& capture : *named_captures_) {
     891         527 :       if (*capture->name() == *ref->name()) {
     892             :         index = capture->index();
     893         278 :         break;
     894             :       }
     895             :     }
     896             : 
     897         411 :     if (index == -1) {
     898         133 :       ReportError(CStrVector("Invalid named capture referenced"));
     899         133 :       return;
     900             :     }
     901             : 
     902         278 :     ref->set_capture(GetCapture(index));
     903             :   }
     904             : }
     905             : 
     906     3315493 : RegExpCapture* RegExpParser::GetCapture(int index) {
     907             :   // The index for the capture groups are one-based. Its index in the list is
     908             :   // zero-based.
     909             :   int know_captures =
     910     1652391 :       is_scanned_for_captures_ ? capture_count_ : captures_started_;
     911             :   DCHECK(index <= know_captures);
     912     1652391 :   if (captures_ == NULL) {
     913       18717 :     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
     914             :   }
     915     6593552 :   while (captures_->length() < know_captures) {
     916     6585546 :     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
     917             :   }
     918     3304782 :   return captures_->at(index - 1);
     919             : }
     920             : 
     921      457080 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
     922      459474 :   if (named_captures_ == nullptr || named_captures_->is_empty())
     923      455048 :     return Handle<FixedArray>();
     924             : 
     925             :   Factory* factory = isolate()->factory();
     926             : 
     927        1016 :   int len = named_captures_->length() * 2;
     928        1016 :   Handle<FixedArray> array = factory->NewFixedArray(len);
     929             : 
     930        4788 :   for (int i = 0; i < named_captures_->length(); i++) {
     931        2756 :     RegExpCapture* capture = named_captures_->at(i);
     932        1378 :     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
     933        2756 :     array->set(i * 2, *name.ToHandleChecked());
     934             :     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
     935             :   }
     936             : 
     937        1016 :   return array;
     938             : }
     939             : 
     940           0 : bool RegExpParser::HasNamedCaptures() {
     941         296 :   if (has_named_captures_ || is_scanned_for_captures_) {
     942             :     return has_named_captures_;
     943             :   }
     944             : 
     945         154 :   ScanForCaptures();
     946             :   DCHECK(is_scanned_for_captures_);
     947         154 :   return has_named_captures_;
     948             : }
     949             : 
     950           0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
     951       14759 :   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
     952        7242 :     if (s->group_type() != CAPTURE) continue;
     953             :     // Return true if we found the matching capture index.
     954         842 :     if (index == s->capture_index()) return true;
     955             :     // Abort if index is larger than what has been parsed up till this state.
     956         653 :     if (index > s->capture_index()) return false;
     957             :   }
     958             :   return false;
     959             : }
     960             : 
     961         472 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
     962             :     const ZoneVector<uc16>* name) {
     963             :   DCHECK_NOT_NULL(name);
     964         932 :   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
     965         500 :     if (s->capture_name() == nullptr) continue;
     966          68 :     if (*s->capture_name() == *name) return true;
     967             :   }
     968             :   return false;
     969             : }
     970             : 
     971             : // QuantifierPrefix ::
     972             : //   { DecimalDigits }
     973             : //   { DecimalDigits , }
     974             : //   { DecimalDigits , DecimalDigits }
     975             : //
     976             : // Returns true if parsing succeeds, and set the min_out and max_out
     977             : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
     978       93806 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
     979             :   DCHECK_EQ(current(), '{');
     980             :   int start = position();
     981       13140 :   Advance();
     982             :   int min = 0;
     983       13140 :   if (!IsDecimalDigit(current())) {
     984             :     Reset(start);
     985        1090 :     return false;
     986             :   }
     987       25989 :   while (IsDecimalDigit(current())) {
     988             :     int next = current() - '0';
     989       14101 :     if (min > (RegExpTree::kInfinity - next) / 10) {
     990             :       // Overflow. Skip past remaining decimal digits and return -1.
     991        1128 :       do {
     992        1128 :         Advance();
     993             :       } while (IsDecimalDigit(current()));
     994             :       min = RegExpTree::kInfinity;
     995             :       break;
     996             :     }
     997       13939 :     min = 10 * min + next;
     998       13939 :     Advance();
     999             :   }
    1000             :   int max = 0;
    1001       12050 :   if (current() == '}') {
    1002             :     max = min;
    1003        5122 :     Advance();
    1004        6928 :   } else if (current() == ',') {
    1005        6836 :     Advance();
    1006        6836 :     if (current() == '}') {
    1007             :       max = RegExpTree::kInfinity;
    1008         403 :       Advance();
    1009             :     } else {
    1010       14040 :       while (IsDecimalDigit(current())) {
    1011             :         int next = current() - '0';
    1012        7691 :         if (max > (RegExpTree::kInfinity - next) / 10) {
    1013        1050 :           do {
    1014        1050 :             Advance();
    1015             :           } while (IsDecimalDigit(current()));
    1016             :           max = RegExpTree::kInfinity;
    1017             :           break;
    1018             :         }
    1019        7607 :         max = 10 * max + next;
    1020        7607 :         Advance();
    1021             :       }
    1022        6433 :       if (current() != '}') {
    1023             :         Reset(start);
    1024         145 :         return false;
    1025             :       }
    1026        6288 :       Advance();
    1027             :     }
    1028             :   } else {
    1029             :     Reset(start);
    1030          92 :     return false;
    1031             :   }
    1032       11813 :   *min_out = min;
    1033       11813 :   *max_out = max;
    1034       11813 :   return true;
    1035             : }
    1036             : 
    1037             : 
    1038       13070 : uc32 RegExpParser::ParseOctalLiteral() {
    1039             :   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
    1040             :   // For compatibility with some other browsers (not all), we parse
    1041             :   // up to three octal digits with a value below 256.
    1042             :   // ES#prod-annexB-LegacyOctalEscapeSequence
    1043        5072 :   uc32 value = current() - '0';
    1044        5072 :   Advance();
    1045        5072 :   if ('0' <= current() && current() <= '7') {
    1046        1571 :     value = value * 8 + current() - '0';
    1047        1571 :     Advance();
    1048        3102 :     if (value < 32 && '0' <= current() && current() <= '7') {
    1049        2790 :       value = value * 8 + current() - '0';
    1050        1395 :       Advance();
    1051             :     }
    1052             :   }
    1053        5072 :   return value;
    1054             : }
    1055             : 
    1056             : 
    1057       85772 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
    1058             :   int start = position();
    1059             :   uc32 val = 0;
    1060       85267 :   for (int i = 0; i < length; ++i) {
    1061             :     uc32 c = current();
    1062             :     int d = HexValue(c);
    1063       67587 :     if (d < 0) {
    1064             :       Reset(start);
    1065         505 :       return false;
    1066             :     }
    1067       67082 :     val = val * 16 + d;
    1068       67082 :     Advance();
    1069             :   }
    1070       17680 :   *value = val;
    1071       17680 :   return true;
    1072             : }
    1073             : 
    1074             : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
    1075       39812 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
    1076             :   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
    1077             :   // allowed). In the latter case, the number of hex digits between { } is
    1078             :   // arbitrary. \ and u have already been read.
    1079       19770 :   if (current() == '{' && unicode()) {
    1080             :     int start = position();
    1081        1933 :     Advance();
    1082        1933 :     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
    1083        1891 :       if (current() == '}') {
    1084        1884 :         Advance();
    1085        1884 :         return true;
    1086             :       }
    1087             :     }
    1088             :     Reset(start);
    1089          49 :     return false;
    1090             :   }
    1091             :   // \u but no {, or \u{...} escapes not allowed.
    1092       15780 :   bool result = ParseHexEscape(4, value);
    1093       44762 :   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
    1094             :       current() == '\\') {
    1095             :     // Attempt to read trail surrogate.
    1096             :     int start = position();
    1097         246 :     if (Next() == 'u') {
    1098             :       Advance(2);
    1099             :       uc32 trail;
    1100         456 :       if (ParseHexEscape(4, &trail) &&
    1101         222 :           unibrow::Utf16::IsTrailSurrogate(trail)) {
    1102             :         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
    1103         444 :                                                       static_cast<uc16>(trail));
    1104         222 :         return true;
    1105             :       }
    1106             :     }
    1107             :     Reset(start);
    1108             :   }
    1109       15558 :   return result;
    1110             : }
    1111             : 
    1112             : #ifdef V8_INTL_SUPPORT
    1113             : 
    1114             : namespace {
    1115             : 
    1116        4632 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
    1117        4632 :   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
    1118        4632 :   if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
    1119             :   for (int i = 0;; i++) {
    1120             :     const char* long_name = u_getPropertyName(
    1121         980 :         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1122         980 :     if (long_name == NULL) break;
    1123         868 :     if (strcmp(property_name, long_name) == 0) return true;
    1124             :   }
    1125             :   return false;
    1126             : }
    1127             : 
    1128        4838 : bool IsExactPropertyValueAlias(const char* property_value_name,
    1129             :                                UProperty property, int32_t property_value) {
    1130             :   const char* short_name =
    1131        4838 :       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
    1132        4838 :   if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
    1133             :     return true;
    1134             :   }
    1135             :   for (int i = 0;; i++) {
    1136             :     const char* long_name = u_getPropertyValueName(
    1137             :         property, property_value,
    1138         414 :         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
    1139         414 :     if (long_name == NULL) break;
    1140         400 :     if (strcmp(property_value_name, long_name) == 0) return true;
    1141             :   }
    1142             :   return false;
    1143             : }
    1144             : 
    1145        6904 : bool LookupPropertyValueName(UProperty property,
    1146             :                              const char* property_value_name, bool negate,
    1147             :                              ZoneList<CharacterRange>* result, Zone* zone) {
    1148             :   UProperty property_for_lookup = property;
    1149        6904 :   if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
    1150             :     // For the property Script_Extensions, we have to do the property value
    1151             :     // name lookup as if the property is Script.
    1152             :     property_for_lookup = UCHAR_SCRIPT;
    1153             :   }
    1154             :   int32_t property_value =
    1155        6904 :       u_getPropertyValueEnum(property_for_lookup, property_value_name);
    1156        6904 :   if (property_value == UCHAR_INVALID_CODE) return false;
    1157             : 
    1158             :   // We require the property name to match exactly to one of the property value
    1159             :   // aliases. However, u_getPropertyValueEnum uses loose matching.
    1160        4838 :   if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
    1161        4838 :                                  property_value)) {
    1162             :     return false;
    1163             :   }
    1164             : 
    1165        4824 :   UErrorCode ec = U_ZERO_ERROR;
    1166        4824 :   icu::UnicodeSet set;
    1167        4824 :   set.applyIntPropertyValue(property, property_value, ec);
    1168        4824 :   bool success = ec == U_ZERO_ERROR && !set.isEmpty();
    1169             : 
    1170        4824 :   if (success) {
    1171        4824 :     set.removeAllStrings();
    1172        4824 :     if (negate) set.complement();
    1173      570420 :     for (int i = 0; i < set.getRangeCount(); i++) {
    1174             :       result->Add(
    1175      570420 :           CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
    1176     1140840 :           zone);
    1177             :     }
    1178             :   }
    1179        4824 :   return success;
    1180             : }
    1181             : 
    1182             : template <size_t N>
    1183             : inline bool NameEquals(const char* name, const char (&literal)[N]) {
    1184        5970 :   return strncmp(name, literal, N + 1) == 0;
    1185             : }
    1186             : 
    1187        2080 : bool LookupSpecialPropertyValueName(const char* name,
    1188             :                                     ZoneList<CharacterRange>* result,
    1189             :                                     bool negate, Zone* zone) {
    1190        2080 :   if (NameEquals(name, "Any")) {
    1191          96 :     if (!negate) result->Add(CharacterRange::Everything(), zone);
    1192        2008 :   } else if (NameEquals(name, "ASCII")) {
    1193             :     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
    1194             :                        : CharacterRange::Range(0x0, 0x7f),
    1195         252 :                 zone);
    1196        1882 :   } else if (NameEquals(name, "Assigned")) {
    1197             :     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
    1198          90 :                                    !negate, result, zone);
    1199             :   } else {
    1200             :     return false;
    1201             :   }
    1202             :   return true;
    1203             : }
    1204             : 
    1205             : // Explicitly whitelist supported binary properties. The spec forbids supporting
    1206             : // properties outside of this set to ensure interoperability.
    1207             : bool IsSupportedBinaryProperty(UProperty property) {
    1208             :   switch (property) {
    1209             :     case UCHAR_ALPHABETIC:
    1210             :     // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
    1211             :     // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
    1212             :     case UCHAR_ASCII_HEX_DIGIT:
    1213             :     // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
    1214             :     case UCHAR_BIDI_CONTROL:
    1215             :     case UCHAR_BIDI_MIRRORED:
    1216             :     case UCHAR_CASE_IGNORABLE:
    1217             :     case UCHAR_CASED:
    1218             :     case UCHAR_CHANGES_WHEN_CASEFOLDED:
    1219             :     case UCHAR_CHANGES_WHEN_CASEMAPPED:
    1220             :     case UCHAR_CHANGES_WHEN_LOWERCASED:
    1221             :     case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
    1222             :     case UCHAR_CHANGES_WHEN_TITLECASED:
    1223             :     case UCHAR_CHANGES_WHEN_UPPERCASED:
    1224             :     case UCHAR_DASH:
    1225             :     case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
    1226             :     case UCHAR_DEPRECATED:
    1227             :     case UCHAR_DIACRITIC:
    1228             :     case UCHAR_EMOJI:
    1229             :     // TODO(yangguo): Uncomment this once we upgrade to ICU 60.
    1230             :     //                See https://ssl.icu-project.org/trac/ticket/13062
    1231             :     // case UCHAR_EMOJI_COMPONENT:
    1232             :     case UCHAR_EMOJI_MODIFIER_BASE:
    1233             :     case UCHAR_EMOJI_MODIFIER:
    1234             :     case UCHAR_EMOJI_PRESENTATION:
    1235             :     case UCHAR_EXTENDER:
    1236             :     case UCHAR_GRAPHEME_BASE:
    1237             :     case UCHAR_GRAPHEME_EXTEND:
    1238             :     case UCHAR_HEX_DIGIT:
    1239             :     case UCHAR_ID_CONTINUE:
    1240             :     case UCHAR_ID_START:
    1241             :     case UCHAR_IDEOGRAPHIC:
    1242             :     case UCHAR_IDS_BINARY_OPERATOR:
    1243             :     case UCHAR_IDS_TRINARY_OPERATOR:
    1244             :     case UCHAR_JOIN_CONTROL:
    1245             :     case UCHAR_LOGICAL_ORDER_EXCEPTION:
    1246             :     case UCHAR_LOWERCASE:
    1247             :     case UCHAR_MATH:
    1248             :     case UCHAR_NONCHARACTER_CODE_POINT:
    1249             :     case UCHAR_PATTERN_SYNTAX:
    1250             :     case UCHAR_PATTERN_WHITE_SPACE:
    1251             :     case UCHAR_QUOTATION_MARK:
    1252             :     case UCHAR_RADICAL:
    1253             :     case UCHAR_S_TERM:
    1254             :     case UCHAR_SOFT_DOTTED:
    1255             :     case UCHAR_TERMINAL_PUNCTUATION:
    1256             :     case UCHAR_UNIFIED_IDEOGRAPH:
    1257             :     case UCHAR_UPPERCASE:
    1258             :     case UCHAR_VARIATION_SELECTOR:
    1259             :     case UCHAR_WHITE_SPACE:
    1260             :     case UCHAR_XID_CONTINUE:
    1261             :     case UCHAR_XID_START:
    1262             :       return true;
    1263             :     default:
    1264             :       break;
    1265             :   }
    1266             :   return false;
    1267             : }
    1268             : 
    1269             : }  // anonymous namespace
    1270             : 
    1271        6268 : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
    1272       73604 :                                       bool negate) {
    1273             :   // Parse the property class as follows:
    1274             :   // - In \p{name}, 'name' is interpreted
    1275             :   //   - either as a general category property value name.
    1276             :   //   - or as a binary property name.
    1277             :   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
    1278             :   //   and 'value' is interpreted as one of the available property value names.
    1279             :   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
    1280             :   // - Loose matching is not applied.
    1281             :   List<char> first_part;
    1282             :   List<char> second_part;
    1283        6268 :   if (current() == '{') {
    1284             :     // Parse \p{[PropertyName=]PropertyNameValue}
    1285       79412 :     for (Advance(); current() != '}' && current() != '='; Advance()) {
    1286       33606 :       if (!has_next()) return false;
    1287       33564 :       first_part.Add(static_cast<char>(current()));
    1288             :     }
    1289        6100 :     if (current() == '=') {
    1290       37472 :       for (Advance(); current() != '}'; Advance()) {
    1291       15154 :         if (!has_next()) return false;
    1292       15154 :         second_part.Add(static_cast<char>(current()));
    1293             :       }
    1294        3582 :       second_part.Add(0);  // null-terminate string.
    1295             :     }
    1296             :   } else {
    1297             :     return false;
    1298             :   }
    1299        6100 :   Advance();
    1300        6100 :   first_part.Add(0);  // null-terminate string.
    1301             : 
    1302        6100 :   if (second_part.is_empty()) {
    1303             :     // First attempt to interpret as general category property value name.
    1304        2518 :     const char* name = first_part.ToConstVector().start();
    1305        2518 :     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
    1306        2518 :                                 result, zone())) {
    1307             :       return true;
    1308             :     }
    1309             :     // Interpret "Any", "ASCII", and "Assigned".
    1310        2080 :     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
    1311             :       return true;
    1312             :     }
    1313             :     // Then attempt to interpret as binary property name with value name 'Y'.
    1314        1792 :     UProperty property = u_getPropertyEnum(name);
    1315        1792 :     if (!IsSupportedBinaryProperty(property)) return false;
    1316        1050 :     if (!IsExactPropertyAlias(name, property)) return false;
    1317             :     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
    1318        1050 :                                    zone());
    1319             :   } else {
    1320             :     // Both property name and value name are specified. Attempt to interpret
    1321             :     // the property name as enumerated property.
    1322        3582 :     const char* property_name = first_part.ToConstVector().start();
    1323        3582 :     const char* value_name = second_part.ToConstVector().start();
    1324        3582 :     UProperty property = u_getPropertyEnum(property_name);
    1325        3582 :     if (!IsExactPropertyAlias(property_name, property)) return false;
    1326        3470 :     if (property == UCHAR_GENERAL_CATEGORY) {
    1327             :       // We want to allow aggregate value names such as "Letter".
    1328             :       property = UCHAR_GENERAL_CATEGORY_MASK;
    1329        6844 :     } else if (property != UCHAR_SCRIPT &&
    1330        3422 :                property != UCHAR_SCRIPT_EXTENSIONS) {
    1331             :       return false;
    1332             :     }
    1333             :     return LookupPropertyValueName(property, value_name, negate, result,
    1334        3246 :                                    zone());
    1335             :   }
    1336             : }
    1337             : 
    1338             : #else  // V8_INTL_SUPPORT
    1339             : 
    1340             : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
    1341             :                                       bool negate) {
    1342             :   return false;
    1343             : }
    1344             : 
    1345             : #endif  // V8_INTL_SUPPORT
    1346             : 
    1347       11219 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
    1348             :   uc32 x = 0;
    1349             :   int d = HexValue(current());
    1350        1933 :   if (d < 0) {
    1351             :     return false;
    1352             :   }
    1353       11219 :   while (d >= 0) {
    1354        9328 :     x = x * 16 + d;
    1355        9328 :     if (x > max_value) {
    1356             :       return false;
    1357             :     }
    1358        9286 :     Advance();
    1359             :     d = HexValue(current());
    1360             :   }
    1361        1891 :   *value = x;
    1362        1891 :   return true;
    1363             : }
    1364             : 
    1365             : 
    1366       51400 : uc32 RegExpParser::ParseClassCharacterEscape() {
    1367             :   DCHECK(current() == '\\');
    1368             :   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
    1369       22755 :   Advance();
    1370       22755 :   switch (current()) {
    1371             :     case 'b':
    1372          16 :       Advance();
    1373          16 :       return '\b';
    1374             :     // ControlEscape :: one of
    1375             :     //   f n r t v
    1376             :     case 'f':
    1377           0 :       Advance();
    1378           0 :       return '\f';
    1379             :     case 'n':
    1380         487 :       Advance();
    1381         487 :       return '\n';
    1382             :     case 'r':
    1383          56 :       Advance();
    1384          56 :       return '\r';
    1385             :     case 't':
    1386         190 :       Advance();
    1387         190 :       return '\t';
    1388             :     case 'v':
    1389          13 :       Advance();
    1390          13 :       return '\v';
    1391             :     case 'c': {
    1392         458 :       uc32 controlLetter = Next();
    1393         458 :       uc32 letter = controlLetter & ~('A' ^ 'a');
    1394             :       // Inside a character class, we also accept digits and underscore as
    1395             :       // control characters, unless with /u. See Annex B:
    1396             :       // ES#prod-annexB-ClassControlLetter
    1397         458 :       if (letter >= 'A' && letter <= 'Z') {
    1398             :         Advance(2);
    1399             :         // Control letters mapped to ASCII control characters in the range
    1400             :         // 0x00-0x1f.
    1401          70 :         return controlLetter & 0x1f;
    1402             :       }
    1403         388 :       if (unicode()) {
    1404             :         // With /u, invalid escapes are not treated as identity escapes.
    1405          28 :         ReportError(CStrVector("Invalid class escape"));
    1406          28 :         return 0;
    1407             :       }
    1408         720 :       if ((controlLetter >= '0' && controlLetter <= '9') ||
    1409         360 :           controlLetter == '_') {
    1410             :         Advance(2);
    1411         236 :         return controlLetter & 0x1f;
    1412             :       }
    1413             :       // We match JSC in reading the backslash as a literal
    1414             :       // character instead of as starting an escape.
    1415             :       // TODO(v8:6201): Not yet covered by the spec.
    1416             :       return '\\';
    1417             :     }
    1418             :     case '0':
    1419             :       // With /u, \0 is interpreted as NUL if not followed by another digit.
    1420        1022 :       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
    1421          42 :         Advance();
    1422          42 :         return 0;
    1423             :       }
    1424             :     // Fall through.
    1425             :     case '1':
    1426             :     case '2':
    1427             :     case '3':
    1428             :     case '4':
    1429             :     case '5':
    1430             :     case '6':
    1431             :     case '7':
    1432             :       // For compatibility, we interpret a decimal escape that isn't
    1433             :       // a back reference (and therefore either \0 or not valid according
    1434             :       // to the specification) as a 1..3 digit octal character code.
    1435             :       // ES#prod-annexB-LegacyOctalEscapeSequence
    1436        1228 :       if (unicode()) {
    1437             :         // With /u, decimal escape is not interpreted as octal character code.
    1438          56 :         ReportError(CStrVector("Invalid class escape"));
    1439          56 :         return 0;
    1440             :       }
    1441        1172 :       return ParseOctalLiteral();
    1442             :     case 'x': {
    1443        1844 :       Advance();
    1444             :       uc32 value;
    1445        1844 :       if (ParseHexEscape(2, &value)) return value;
    1446          36 :       if (unicode()) {
    1447             :         // With /u, invalid escapes are not treated as identity escapes.
    1448           0 :         ReportError(CStrVector("Invalid escape"));
    1449           0 :         return 0;
    1450             :       }
    1451             :       // If \x is not followed by a two-digit hexadecimal, treat it
    1452             :       // as an identity escape.
    1453             :       return 'x';
    1454             :     }
    1455             :     case 'u': {
    1456       15229 :       Advance();
    1457             :       uc32 value;
    1458       15229 :       if (ParseUnicodeEscape(&value)) return value;
    1459          24 :       if (unicode()) {
    1460             :         // With /u, invalid escapes are not treated as identity escapes.
    1461           0 :         ReportError(CStrVector("Invalid unicode escape"));
    1462           0 :         return 0;
    1463             :       }
    1464             :       // If \u is not followed by a two-digit hexadecimal, treat it
    1465             :       // as an identity escape.
    1466             :       return 'u';
    1467             :     }
    1468             :     default: {
    1469             :       uc32 result = current();
    1470             :       // With /u, no identity escapes except for syntax characters and '-' are
    1471             :       // allowed. Otherwise, all identity escapes are allowed.
    1472        3260 :       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
    1473        3178 :         Advance();
    1474        3178 :         return result;
    1475             :       }
    1476          14 :       ReportError(CStrVector("Invalid escape"));
    1477          14 :       return 0;
    1478             :     }
    1479             :   }
    1480             :   return 0;
    1481             : }
    1482             : 
    1483             : 
    1484      435098 : CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
    1485             :   DCHECK_EQ(0, *char_class);
    1486             :   uc32 first = current();
    1487      435098 :   if (first == '\\') {
    1488       24837 :     switch (Next()) {
    1489             :       case 'w':
    1490             :       case 'W':
    1491             :       case 'd':
    1492             :       case 'D':
    1493             :       case 's':
    1494             :       case 'S': {
    1495        2082 :         *char_class = Next();
    1496             :         Advance(2);
    1497             :         return CharacterRange::Singleton(0);  // Return dummy value.
    1498             :       }
    1499             :       case kEndMarker:
    1500           0 :         return ReportError(CStrVector("\\ at end of pattern"));
    1501             :       default:
    1502       22755 :         first = ParseClassCharacterEscape(CHECK_FAILED);
    1503             :     }
    1504             :   } else {
    1505      410261 :     Advance();
    1506             :   }
    1507             : 
    1508             :   return CharacterRange::Singleton(first);
    1509             : }
    1510             : 
    1511             : static const uc16 kNoCharClass = 0;
    1512             : 
    1513             : // Adds range or pre-defined character class to character ranges.
    1514             : // If char_class is not kInvalidClass, it's interpreted as a class
    1515             : // escape (i.e., 's' means whitespace, from '\s').
    1516       73061 : static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
    1517             :                                     uc16 char_class, CharacterRange range,
    1518             :                                     bool add_unicode_case_equivalents,
    1519             :                                     Zone* zone) {
    1520       73061 :   if (char_class != kNoCharClass) {
    1521             :     CharacterRange::AddClassEscape(char_class, ranges,
    1522        2054 :                                    add_unicode_case_equivalents, zone);
    1523             :   } else {
    1524             :     ranges->Add(range, zone);
    1525             :   }
    1526       73061 : }
    1527             : 
    1528      279111 : bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
    1529      254335 :   if (!FLAG_harmony_regexp_property) return false;
    1530       12544 :   if (!unicode()) return false;
    1531       12232 :   if (current() != '\\') return false;
    1532       12208 :   uc32 next = Next();
    1533             :   bool parse_success = false;
    1534       12208 :   if (next == 'p') {
    1535             :     Advance(2);
    1536         238 :     parse_success = ParsePropertyClass(ranges, false);
    1537       11970 :   } else if (next == 'P') {
    1538             :     Advance(2);
    1539         180 :     parse_success = ParsePropertyClass(ranges, true);
    1540             :   } else {
    1541             :     return false;
    1542             :   }
    1543         418 :   if (!parse_success)
    1544          70 :     ReportError(CStrVector("Invalid property name in character class"));
    1545         418 :   return parse_success;
    1546             : }
    1547             : 
    1548     2536696 : RegExpTree* RegExpParser::ParseCharacterClass() {
    1549             :   static const char* kUnterminated = "Unterminated character class";
    1550             :   static const char* kRangeInvalid = "Invalid character class";
    1551             :   static const char* kRangeOutOfOrder = "Range out of order in character class";
    1552             : 
    1553             :   DCHECK_EQ(current(), '[');
    1554      167245 :   Advance();
    1555             :   bool is_negated = false;
    1556      167245 :   if (current() == '^') {
    1557             :     is_negated = true;
    1558       12260 :     Advance();
    1559             :   }
    1560             :   ZoneList<CharacterRange>* ranges =
    1561      167245 :       new (zone()) ZoneList<CharacterRange>(2, zone());
    1562      168689 :   bool add_unicode_case_equivalents = unicode() && ignore_case();
    1563     1009269 :   while (has_more() && current() != ']') {
    1564      254608 :     bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
    1565      254828 :     if (parsed_property) continue;
    1566      253916 :     uc16 char_class = kNoCharClass;
    1567      253916 :     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
    1568      253818 :     if (current() == '-') {
    1569      181470 :       Advance();
    1570      181470 :       if (current() == kEndMarker) {
    1571             :         // If we reach the end we break out of the loop and let the
    1572             :         // following code report an error.
    1573             :         break;
    1574      181463 :       } else if (current() == ']') {
    1575             :         AddRangeOrEscape(ranges, char_class, first,
    1576         281 :                          add_unicode_case_equivalents, zone());
    1577         562 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1578         281 :         break;
    1579             :       }
    1580      181182 :       uc16 char_class_2 = kNoCharClass;
    1581      181286 :       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
    1582      181182 :       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
    1583             :         // Either end is an escaped character class. Treat the '-' verbatim.
    1584         244 :         if (unicode()) {
    1585             :           // ES2015 21.2.2.15.1 step 1.
    1586          56 :           return ReportError(CStrVector(kRangeInvalid));
    1587             :         }
    1588             :         AddRangeOrEscape(ranges, char_class, first,
    1589         216 :                          add_unicode_case_equivalents, zone());
    1590         432 :         ranges->Add(CharacterRange::Singleton('-'), zone());
    1591             :         AddRangeOrEscape(ranges, char_class_2, next,
    1592         216 :                          add_unicode_case_equivalents, zone());
    1593         216 :         continue;
    1594             :       }
    1595             :       // ES2015 21.2.2.15.1 step 6.
    1596      180938 :       if (first.from() > next.to()) {
    1597         152 :         return ReportError(CStrVector(kRangeOutOfOrder));
    1598             :       }
    1599      361724 :       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
    1600             :     } else {
    1601             :       AddRangeOrEscape(ranges, char_class, first, add_unicode_case_equivalents,
    1602       72348 :                        zone());
    1603             :     }
    1604             :   }
    1605      166972 :   if (!has_more()) {
    1606          42 :     return ReportError(CStrVector(kUnterminated));
    1607             :   }
    1608      166951 :   Advance();
    1609      166951 :   if (ranges->length() == 0) {
    1610         746 :     ranges->Add(CharacterRange::Everything(), zone());
    1611         373 :     is_negated = !is_negated;
    1612             :   }
    1613             :   RegExpCharacterClass::Flags flags;
    1614      166951 :   if (is_negated) flags = RegExpCharacterClass::NEGATED;
    1615             :   return new (zone()) RegExpCharacterClass(ranges, flags);
    1616             : }
    1617             : 
    1618             : 
    1619             : #undef CHECK_FAILED
    1620             : 
    1621             : 
    1622      459845 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
    1623             :                                FlatStringReader* input, JSRegExp::Flags flags,
    1624             :                                RegExpCompileData* result) {
    1625             :   DCHECK(result != NULL);
    1626      459845 :   RegExpParser parser(input, &result->error, flags, isolate, zone);
    1627      459845 :   RegExpTree* tree = parser.ParsePattern();
    1628      459845 :   if (parser.failed()) {
    1629             :     DCHECK(tree == NULL);
    1630             :     DCHECK(!result->error.is_null());
    1631             :   } else {
    1632             :     DCHECK(tree != NULL);
    1633             :     DCHECK(result->error.is_null());
    1634             :     if (FLAG_trace_regexp_parser) {
    1635             :       OFStream os(stdout);
    1636             :       tree->Print(os, zone);
    1637             :       os << "\n";
    1638             :     }
    1639      456064 :     result->tree = tree;
    1640      456064 :     int capture_count = parser.captures_started();
    1641      720659 :     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
    1642      456064 :     result->contains_anchor = parser.contains_anchor();
    1643      456064 :     result->capture_name_map = parser.CreateCaptureNameMap();
    1644      456064 :     result->capture_count = capture_count;
    1645             :   }
    1646      459845 :   return !parser.failed();
    1647             : }
    1648             : 
    1649           0 : RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
    1650             :     : zone_(zone),
    1651             :       pending_empty_(false),
    1652             :       ignore_case_(ignore_case),
    1653             :       unicode_(unicode),
    1654             :       characters_(NULL),
    1655             :       pending_surrogate_(kNoPendingSurrogate),
    1656             :       terms_(),
    1657     2143904 :       alternatives_()
    1658             : #ifdef DEBUG
    1659             :       ,
    1660             :       last_added_(ADD_NONE)
    1661             : #endif
    1662             : {
    1663           0 : }
    1664             : 
    1665             : 
    1666           0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
    1667             :   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1668             :   FlushPendingSurrogate();
    1669             :   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
    1670         933 :   pending_surrogate_ = lead_surrogate;
    1671           0 : }
    1672             : 
    1673             : 
    1674        3229 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
    1675             :   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
    1676         857 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1677             :     uc16 lead_surrogate = pending_surrogate_;
    1678         629 :     pending_surrogate_ = kNoPendingSurrogate;
    1679             :     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
    1680             :     uc32 combined =
    1681             :         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
    1682         629 :     if (NeedsDesugaringForIgnoreCase(combined)) {
    1683          36 :       AddCharacterClassForDesugaring(combined);
    1684             :     } else {
    1685         593 :       ZoneList<uc16> surrogate_pair(2, zone());
    1686             :       surrogate_pair.Add(lead_surrogate, zone());
    1687             :       surrogate_pair.Add(trail_surrogate, zone());
    1688             :       RegExpAtom* atom =
    1689         593 :           new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
    1690         593 :       AddAtom(atom);
    1691             :     }
    1692             :   } else {
    1693         228 :     pending_surrogate_ = trail_surrogate;
    1694             :     FlushPendingSurrogate();
    1695             :   }
    1696         857 : }
    1697             : 
    1698             : 
    1699           0 : void RegExpBuilder::FlushPendingSurrogate() {
    1700    16835002 :   if (pending_surrogate_ != kNoPendingSurrogate) {
    1701             :     DCHECK(unicode());
    1702         532 :     uc32 c = pending_surrogate_;
    1703         532 :     pending_surrogate_ = kNoPendingSurrogate;
    1704         532 :     AddCharacterClassForDesugaring(c);
    1705             :   }
    1706           0 : }
    1707             : 
    1708             : 
    1709     7522416 : void RegExpBuilder::FlushCharacters() {
    1710             :   FlushPendingSurrogate();
    1711     6513422 :   pending_empty_ = false;
    1712     6513422 :   if (characters_ != NULL) {
    1713      504497 :     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
    1714      504497 :     characters_ = NULL;
    1715      504497 :     text_.Add(atom, zone());
    1716             :     LAST(ADD_ATOM);
    1717             :   }
    1718     6513422 : }
    1719             : 
    1720             : 
    1721     6992585 : void RegExpBuilder::FlushText() {
    1722     6293140 :   FlushCharacters();
    1723             :   int num_text = text_.length();
    1724     6293140 :   if (num_text == 0) {
    1725     6293140 :     return;
    1726      673132 :   } else if (num_text == 1) {
    1727      667142 :     terms_.Add(text_.last(), zone());
    1728             :   } else {
    1729             :     RegExpText* text = new (zone()) RegExpText(zone());
    1730       46636 :     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
    1731        5990 :     terms_.Add(text, zone());
    1732             :   }
    1733             :   text_.Clear();
    1734             : }
    1735             : 
    1736             : 
    1737    18775796 : void RegExpBuilder::AddCharacter(uc16 c) {
    1738             :   FlushPendingSurrogate();
    1739     8015278 :   pending_empty_ = false;
    1740     8015278 :   if (NeedsDesugaringForIgnoreCase(c)) {
    1741         250 :     AddCharacterClassForDesugaring(c);
    1742             :   } else {
    1743     8015028 :     if (characters_ == NULL) {
    1744     2745490 :       characters_ = new (zone()) ZoneList<uc16>(4, zone());
    1745             :     }
    1746     8015028 :     characters_->Add(c, zone());
    1747             :     LAST(ADD_CHAR);
    1748             :   }
    1749     8015278 : }
    1750             : 
    1751             : 
    1752    15960099 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
    1753     7980364 :   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
    1754             :     DCHECK(unicode());
    1755         629 :     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
    1756         629 :     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
    1757     7982715 :   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
    1758         304 :     AddLeadSurrogate(c);
    1759     7982107 :   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
    1760         228 :     AddTrailSurrogate(c);
    1761             :   } else {
    1762     7979203 :     AddCharacter(static_cast<uc16>(c));
    1763             :   }
    1764     7980364 : }
    1765             : 
    1766        1891 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
    1767             :   // A lead or trail surrogate parsed via escape sequence will not
    1768             :   // pair up with any preceding lead or following trail surrogate.
    1769             :   FlushPendingSurrogate();
    1770        1891 :   AddUnicodeCharacter(character);
    1771             :   FlushPendingSurrogate();
    1772        1891 : }
    1773             : 
    1774         229 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
    1775             : 
    1776             : 
    1777      225496 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
    1778      225496 :   if (NeedsDesugaringForUnicode(cc)) {
    1779             :     // With /u, character class needs to be desugared, so it
    1780             :     // must be a standalone term instead of being part of a RegExpText.
    1781        5807 :     AddTerm(cc);
    1782             :   } else {
    1783      219689 :     AddAtom(cc);
    1784             :   }
    1785      225496 : }
    1786             : 
    1787        1636 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
    1788             :   AddTerm(new (zone()) RegExpCharacterClass(
    1789        1636 :       CharacterRange::List(zone(), CharacterRange::Singleton(c))));
    1790         818 : }
    1791             : 
    1792             : 
    1793     3821126 : void RegExpBuilder::AddAtom(RegExpTree* term) {
    1794     1910563 :   if (term->IsEmpty()) {
    1795             :     AddEmpty();
    1796     1910563 :     return;
    1797             :   }
    1798     1910563 :   if (term->IsTextElement()) {
    1799      220282 :     FlushCharacters();
    1800      220282 :     text_.Add(term, zone());
    1801             :   } else {
    1802     1690281 :     FlushText();
    1803     1690281 :     terms_.Add(term, zone());
    1804             :   }
    1805             :   LAST(ADD_ATOM);
    1806             : }
    1807             : 
    1808             : 
    1809       13250 : void RegExpBuilder::AddTerm(RegExpTree* term) {
    1810        6625 :   FlushText();
    1811        6625 :   terms_.Add(term, zone());
    1812             :   LAST(ADD_ATOM);
    1813        6625 : }
    1814             : 
    1815             : 
    1816       49998 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
    1817       25111 :   FlushText();
    1818       25111 :   if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
    1819             :     // Omit repeated assertions of the same type.
    1820         860 :     RegExpAssertion* last = terms_.last()->AsAssertion();
    1821         860 :     RegExpAssertion* next = assert->AsAssertion();
    1822       25541 :     if (last->assertion_type() == next->assertion_type()) return;
    1823             :   }
    1824       24887 :   terms_.Add(assert, zone());
    1825             :   LAST(ADD_ASSERT);
    1826             : }
    1827             : 
    1828             : 
    1829      152158 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
    1830             : 
    1831             : 
    1832     5199347 : void RegExpBuilder::FlushTerms() {
    1833     2292471 :   FlushText();
    1834             :   int num_terms = terms_.length();
    1835             :   RegExpTree* alternative;
    1836     2292471 :   if (num_terms == 0) {
    1837             :     alternative = new (zone()) RegExpEmpty();
    1838     1830812 :   } else if (num_terms == 1) {
    1839             :     alternative = terms_.last();
    1840             :   } else {
    1841      152746 :     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
    1842             :   }
    1843     2292471 :   alternatives_.Add(alternative, zone());
    1844             :   terms_.Clear();
    1845             :   LAST(ADD_NONE);
    1846     2292471 : }
    1847             : 
    1848             : 
    1849      240289 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
    1850      225496 :   if (!unicode()) return false;
    1851             :   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
    1852             :   // necessarily mean that we need to desugar. It's probably nicer to have a
    1853             :   // separate pass to figure out unicode desugarings.
    1854        7580 :   if (ignore_case()) return true;
    1855             :   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    1856        7213 :   CharacterRange::Canonicalize(ranges);
    1857       19978 :   for (int i = ranges->length() - 1; i >= 0; i--) {
    1858       18205 :     uc32 from = ranges->at(i).from();
    1859       18205 :     uc32 to = ranges->at(i).to();
    1860             :     // Check for non-BMP characters.
    1861       18205 :     if (to >= kNonBmpStart) return true;
    1862             :     // Check for lone surrogates.
    1863       12897 :     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
    1864             :   }
    1865             :   return false;
    1866             : }
    1867             : 
    1868             : 
    1869     8018996 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
    1870             : #ifdef V8_INTL_SUPPORT
    1871     8018996 :   if (unicode() && ignore_case()) {
    1872         301 :     icu::UnicodeSet set(c, c);
    1873         301 :     set.closeOver(USET_CASE_INSENSITIVE);
    1874         301 :     set.removeAllStrings();
    1875         301 :     return set.size() > 1;
    1876             :   }
    1877             :   // In the case where ICU is not included, we act as if the unicode flag is
    1878             :   // not set, and do not desugar.
    1879             : #endif  // V8_INTL_SUPPORT
    1880             :   return false;
    1881             : }
    1882             : 
    1883             : 
    1884     2201385 : RegExpTree* RegExpBuilder::ToRegExp() {
    1885     2140313 :   FlushTerms();
    1886             :   int num_alternatives = alternatives_.length();
    1887     2140313 :   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
    1888     2140313 :   if (num_alternatives == 1) return alternatives_.last();
    1889       61072 :   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
    1890             : }
    1891             : 
    1892     2301359 : bool RegExpBuilder::AddQuantifierToAtom(
    1893     4565182 :     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
    1894             :   FlushPendingSurrogate();
    1895     2301359 :   if (pending_empty_) {
    1896           2 :     pending_empty_ = false;
    1897           2 :     return true;
    1898             :   }
    1899             :   RegExpTree* atom;
    1900     2301357 :   if (characters_ != NULL) {
    1901             :     DCHECK(last_added_ == ADD_CHAR);
    1902             :     // Last atom was character.
    1903     2240623 :     Vector<const uc16> char_vector = characters_->ToConstVector();
    1904             :     int num_chars = char_vector.length();
    1905     2240623 :     if (num_chars > 1) {
    1906         743 :       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
    1907         743 :       text_.Add(new (zone()) RegExpAtom(prefix), zone());
    1908             :       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
    1909             :     }
    1910     2240623 :     characters_ = NULL;
    1911             :     atom = new (zone()) RegExpAtom(char_vector);
    1912     2240623 :     FlushText();
    1913       60734 :   } else if (text_.length() > 0) {
    1914             :     DCHECK(last_added_ == ADD_ATOM);
    1915       38029 :     atom = text_.RemoveLast();
    1916       38029 :     FlushText();
    1917       22705 :   } else if (terms_.length() > 0) {
    1918             :     DCHECK(last_added_ == ADD_ATOM);
    1919       22705 :     atom = terms_.RemoveLast();
    1920             :     // With /u, lookarounds are not quantifiable.
    1921       22705 :     if (unicode() && atom->IsLookaround()) return false;
    1922       22675 :     if (atom->max_match() == 0) {
    1923             :       // Guaranteed to only match an empty string.
    1924             :       LAST(ADD_TERM);
    1925         349 :       if (min == 0) {
    1926             :         return true;
    1927             :       }
    1928         133 :       terms_.Add(atom, zone());
    1929         133 :       return true;
    1930             :     }
    1931             :   } else {
    1932             :     // Only call immediately after adding an atom or character!
    1933           0 :     UNREACHABLE();
    1934             :     return false;
    1935             :   }
    1936     2300978 :   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
    1937     2300978 :              zone());
    1938             :   LAST(ADD_TERM);
    1939     2300978 :   return true;
    1940             : }
    1941             : 
    1942             : }  // namespace internal
    1943             : }  // namespace v8

Generated by: LCOV version 1.10