Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/regexp-parser.h"
6 :
7 : #include <vector>
8 :
9 : #include "src/char-predicates-inl.h"
10 : #include "src/factory.h"
11 : #include "src/isolate.h"
12 : #include "src/objects-inl.h"
13 : #include "src/ostreams.h"
14 : #include "src/regexp/jsregexp.h"
15 : #include "src/utils.h"
16 :
17 : #ifdef V8_INTL_SUPPORT
18 : #include "unicode/uniset.h"
19 : #endif // V8_INTL_SUPPORT
20 :
21 : namespace v8 {
22 : namespace internal {
23 :
24 397989 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
25 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
26 : : isolate_(isolate),
27 : zone_(zone),
28 : error_(error),
29 : captures_(nullptr),
30 : named_captures_(nullptr),
31 : named_back_references_(nullptr),
32 : in_(in),
33 : current_(kEndMarker),
34 : dotall_(flags & JSRegExp::kDotAll),
35 : ignore_case_(flags & JSRegExp::kIgnoreCase),
36 : multiline_(flags & JSRegExp::kMultiline),
37 : unicode_(flags & JSRegExp::kUnicode),
38 : next_pos_(0),
39 : captures_started_(0),
40 : capture_count_(0),
41 : has_more_(true),
42 : simple_(false),
43 : contains_anchor_(false),
44 : is_scanned_for_captures_(false),
45 : has_named_captures_(false),
46 1989945 : failed_(false) {
47 : DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
48 397989 : Advance();
49 397989 : }
50 :
51 : template <bool update_position>
52 1294796829 : inline uc32 RegExpParser::ReadNext() {
53 431598943 : int position = next_pos_;
54 : uc32 c0 = in()->Get(position);
55 431598943 : position++;
56 : // Read the whole surrogate pair in case of unicode flag, if possible.
57 431783516 : if (unicode() && position < in()->length() &&
58 : unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
59 : uc16 c1 = in()->Get(position);
60 603 : if (unibrow::Utf16::IsTrailSurrogate(c1)) {
61 406 : c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
62 388 : position++;
63 : }
64 : }
65 431436095 : if (update_position) next_pos_ = position;
66 431598943 : return c0;
67 : }
68 :
69 :
70 0 : uc32 RegExpParser::Next() {
71 162881 : if (has_next()) {
72 162848 : return ReadNext<false>();
73 : } else {
74 : return kEndMarker;
75 : }
76 : }
77 :
78 1294707007 : void RegExpParser::Advance() {
79 431834398 : if (has_next()) {
80 : StackLimitCheck check(isolate());
81 431436504 : if (check.HasOverflowed()) {
82 399 : if (FLAG_abort_on_stack_or_string_length_overflow) {
83 0 : FATAL("Aborting on stack overflow");
84 : }
85 : ReportError(CStrVector(
86 798 : MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
87 431436105 : } else if (zone()->excess_allocation()) {
88 10 : ReportError(CStrVector("Regular expression too large"));
89 : } else {
90 431436095 : current_ = ReadNext<true>();
91 : }
92 : } else {
93 397894 : current_ = kEndMarker;
94 : // Advance so that position() points to 1-after-the-last-character. This is
95 : // important so that Reset() to this position works correctly.
96 397894 : next_pos_ = in()->length() + 1;
97 397894 : has_more_ = false;
98 : }
99 431834398 : }
100 :
101 :
102 3276 : void RegExpParser::Reset(int pos) {
103 3276 : next_pos_ = pos;
104 3276 : has_more_ = (pos < in()->length());
105 3276 : Advance();
106 0 : }
107 :
108 0 : void RegExpParser::Advance(int dist) {
109 71136 : next_pos_ += dist - 1;
110 71136 : Advance();
111 0 : }
112 :
113 :
114 211472 : bool RegExpParser::simple() { return simple_; }
115 :
116 0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
117 96 : switch (c) {
118 : case '^':
119 : case '$':
120 : case '\\':
121 : case '.':
122 : case '*':
123 : case '+':
124 : case '?':
125 : case '(':
126 : case ')':
127 : case '[':
128 : case ']':
129 : case '{':
130 : case '}':
131 : case '|':
132 : case '/':
133 : return true;
134 : default:
135 : break;
136 : }
137 0 : return false;
138 : }
139 :
140 :
141 8130 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
142 2774 : if (failed_) return nullptr; // Do not overwrite any existing error.
143 2678 : failed_ = true;
144 : *error_ = isolate()
145 : ->factory()
146 : ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
147 8034 : .ToHandleChecked();
148 : // Zip to the end to make sure the no more input is read.
149 2678 : current_ = kEndMarker;
150 2678 : next_pos_ = in()->length();
151 2678 : return nullptr;
152 : }
153 :
154 : #define CHECK_FAILED /**/); \
155 : if (failed_) return nullptr; \
156 : ((void)0
157 :
158 : // Pattern ::
159 : // Disjunction
160 609461 : RegExpTree* RegExpParser::ParsePattern() {
161 397989 : RegExpTree* result = ParseDisjunction(CHECK_FAILED);
162 395413 : PatchNamedBackReferences(CHECK_FAILED);
163 : DCHECK(!has_more());
164 : // If the result of parsing is a literal string atom, and it has the
165 : // same length as the input, then the atom is identical to the input.
166 606783 : if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
167 201442 : simple_ = true;
168 : }
169 395311 : return result;
170 : }
171 :
172 :
173 : // Disjunction ::
174 : // Alternative
175 : // Alternative | Disjunction
176 : // Alternative ::
177 : // [empty]
178 : // Term Alternative
179 : // Term ::
180 : // Assertion
181 : // Atom
182 : // Atom Quantifier
183 1291314343 : RegExpTree* RegExpParser::ParseDisjunction() {
184 : // Used to store current state while parsing subexpressions.
185 : RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
186 795978 : 0, nullptr, ignore_case(), unicode(), zone());
187 8173322 : RegExpParserState* state = &initial_state;
188 : // Cache the builder in a local variable for quick access.
189 397989 : RegExpBuilder* builder = initial_state.builder();
190 : while (true) {
191 429090050 : switch (current()) {
192 : case kEndMarker:
193 395824 : if (state->IsSubexpression()) {
194 : // Inside a parenthesized group when hitting end of input.
195 398117 : return ReportError(CStrVector("Unterminated group"));
196 : }
197 : DCHECK_EQ(INITIAL, state->group_type());
198 : // Parsing completed successfully.
199 395696 : return builder->ToRegExp();
200 : case ')': {
201 1300846 : if (!state->IsSubexpression()) {
202 14 : return ReportError(CStrVector("Unmatched ')'"));
203 : }
204 : DCHECK_NE(INITIAL, state->group_type());
205 :
206 1300832 : Advance();
207 : // End disjunction parsing and convert builder content to new single
208 : // regexp atom.
209 1300832 : RegExpTree* body = builder->ToRegExp();
210 :
211 : int end_capture_index = captures_started();
212 :
213 : int capture_index = state->capture_index();
214 : SubexpressionType group_type = state->group_type();
215 :
216 : // Build result of subexpression.
217 1300832 : if (group_type == CAPTURE) {
218 1268838 : if (state->IsNamedCapture()) {
219 : CreateNamedCaptureAtIndex(state->capture_name(),
220 1359 : capture_index CHECK_FAILED);
221 : }
222 1268796 : RegExpCapture* capture = GetCapture(capture_index);
223 : capture->set_body(body);
224 : body = capture;
225 31994 : } else if (group_type == GROUPING) {
226 : body = new (zone()) RegExpGroup(body);
227 : } else {
228 : DCHECK(group_type == POSITIVE_LOOKAROUND ||
229 : group_type == NEGATIVE_LOOKAROUND);
230 3768 : bool is_positive = (group_type == POSITIVE_LOOKAROUND);
231 : body = new (zone()) RegExpLookaround(
232 : body, is_positive, end_capture_index - capture_index,
233 3768 : capture_index, state->lookaround_type());
234 : }
235 :
236 : // Restore previous state.
237 : state = state->previous_state();
238 : builder = state->builder();
239 :
240 1300790 : builder->AddAtom(body);
241 : // For compatibility with JSC and ES3, we allow quantifiers after
242 : // lookaheads, and break in all cases.
243 1300790 : break;
244 : }
245 : case '|': {
246 132851 : Advance();
247 : builder->NewAlternative();
248 427004231 : continue;
249 : }
250 : case '*':
251 : case '+':
252 : case '?':
253 110 : return ReportError(CStrVector("Nothing to repeat"));
254 : case '^': {
255 14145 : Advance();
256 14145 : if (multiline()) {
257 : builder->AddAssertion(
258 1569 : new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
259 : } else {
260 : builder->AddAssertion(
261 12576 : new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
262 : set_contains_anchor();
263 : }
264 : continue;
265 : }
266 : case '$': {
267 6998 : Advance();
268 : RegExpAssertion::AssertionType assertion_type =
269 : multiline() ? RegExpAssertion::END_OF_LINE
270 6998 : : RegExpAssertion::END_OF_INPUT;
271 6998 : builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
272 6998 : continue;
273 : }
274 : case '.': {
275 24291 : Advance();
276 : ZoneList<CharacterRange>* ranges =
277 24291 : new (zone()) ZoneList<CharacterRange>(2, zone());
278 :
279 24291 : if (dotall()) {
280 : // Everything.
281 : DCHECK(FLAG_harmony_regexp_dotall);
282 40 : CharacterRange::AddClassEscape('*', ranges, false, zone());
283 : } else {
284 : // Everything except \x0a, \x0d, \u2028 and \u2029
285 24251 : CharacterRange::AddClassEscape('.', ranges, false, zone());
286 : }
287 :
288 : RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
289 24291 : builder->AddCharacterClass(cc);
290 24291 : break;
291 : }
292 : case '(': {
293 : SubexpressionType subexpr_type = CAPTURE;
294 : RegExpLookaround::Type lookaround_type = state->lookaround_type();
295 : bool is_named_capture = false;
296 1301401 : Advance();
297 1301401 : if (current() == '?') {
298 33765 : switch (Next()) {
299 : case ':':
300 : subexpr_type = GROUPING;
301 : Advance(2);
302 : break;
303 : case '=':
304 : lookaround_type = RegExpLookaround::LOOKAHEAD;
305 : subexpr_type = POSITIVE_LOOKAROUND;
306 : Advance(2);
307 : break;
308 : case '!':
309 : lookaround_type = RegExpLookaround::LOOKAHEAD;
310 : subexpr_type = NEGATIVE_LOOKAROUND;
311 : Advance(2);
312 : break;
313 : case '<':
314 3137 : Advance();
315 3137 : if (FLAG_harmony_regexp_lookbehind) {
316 3137 : if (Next() == '=') {
317 : subexpr_type = POSITIVE_LOOKAROUND;
318 : lookaround_type = RegExpLookaround::LOOKBEHIND;
319 : Advance(2);
320 : break;
321 1968 : } else if (Next() == '!') {
322 : subexpr_type = NEGATIVE_LOOKAROUND;
323 : lookaround_type = RegExpLookaround::LOOKBEHIND;
324 : Advance(2);
325 : break;
326 : }
327 : }
328 1716 : if (FLAG_harmony_regexp_named_captures) {
329 1716 : has_named_captures_ = true;
330 : is_named_capture = true;
331 1716 : Advance();
332 1716 : break;
333 : }
334 : // Fall through.
335 : default:
336 24 : return ReportError(CStrVector("Invalid group"));
337 : }
338 : }
339 :
340 : const ZoneVector<uc16>* capture_name = nullptr;
341 1301377 : if (subexpr_type == CAPTURE) {
342 1269352 : if (captures_started_ >= kMaxCaptures) {
343 6 : return ReportError(CStrVector("Too many captures"));
344 : }
345 1269346 : captures_started_++;
346 :
347 1269346 : if (is_named_capture) {
348 1716 : capture_name = ParseCaptureGroupName(CHECK_FAILED);
349 : }
350 : }
351 : // Store current state and begin new disjunction parsing.
352 : state = new (zone()) RegExpParserState(
353 : state, subexpr_type, lookaround_type, captures_started_,
354 3903069 : capture_name, ignore_case(), unicode(), zone());
355 : builder = state->builder();
356 1301023 : continue;
357 : }
358 : case '[': {
359 164331 : RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
360 164127 : builder->AddCharacterClass(cc->AsCharacterClass());
361 164127 : break;
362 : }
363 : // Atom ::
364 : // \ AtomEscape
365 : case '\\':
366 66591 : switch (Next()) {
367 : case kEndMarker:
368 6 : return ReportError(CStrVector("\\ at end of pattern"));
369 : case 'b':
370 : Advance(2);
371 : builder->AddAssertion(
372 540 : new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
373 540 : continue;
374 : case 'B':
375 : Advance(2);
376 : builder->AddAssertion(
377 356 : new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
378 356 : continue;
379 : // AtomEscape ::
380 : // CharacterClassEscape
381 : //
382 : // CharacterClassEscape :: one of
383 : // d D s S w W
384 : case 'd':
385 : case 'D':
386 : case 's':
387 : case 'S':
388 : case 'w':
389 : case 'W': {
390 : uc32 c = Next();
391 : Advance(2);
392 : ZoneList<CharacterRange>* ranges =
393 21353 : new (zone()) ZoneList<CharacterRange>(2, zone());
394 : CharacterRange::AddClassEscape(c, ranges,
395 21761 : unicode() && ignore_case(), zone());
396 : RegExpCharacterClass* cc =
397 : new (zone()) RegExpCharacterClass(ranges);
398 21353 : builder->AddCharacterClass(cc);
399 21353 : break;
400 : }
401 : case 'p':
402 : case 'P': {
403 : uc32 p = Next();
404 : Advance(2);
405 4869 : if (unicode()) {
406 4660 : if (FLAG_harmony_regexp_property) {
407 : ZoneList<CharacterRange>* ranges =
408 4651 : new (zone()) ZoneList<CharacterRange>(2, zone());
409 4651 : if (!ParsePropertyClass(ranges, p == 'P')) {
410 756 : return ReportError(CStrVector("Invalid property name"));
411 : }
412 : RegExpCharacterClass* cc =
413 : new (zone()) RegExpCharacterClass(ranges);
414 3895 : builder->AddCharacterClass(cc);
415 : } else {
416 : // With /u, no identity escapes except for syntax characters
417 : // are allowed. Otherwise, all identity escapes are allowed.
418 9 : return ReportError(CStrVector("Invalid escape"));
419 : }
420 : } else {
421 209 : builder->AddCharacter(p);
422 : }
423 : break;
424 : }
425 : case '1':
426 : case '2':
427 : case '3':
428 : case '4':
429 : case '5':
430 : case '6':
431 : case '7':
432 : case '8':
433 : case '9': {
434 5998 : int index = 0;
435 6082 : bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
436 5986 : if (is_backref) {
437 10760 : if (state->IsInsideCaptureGroup(index)) {
438 : // The back reference is inside the capture group it refers to.
439 : // Nothing can possibly have been captured yet, so we use empty
440 : // instead. This ensures that, when checking a back reference,
441 : // the capture registers of the referenced capture are either
442 : // both set or both cleared.
443 : builder->AddEmpty();
444 : } else {
445 5217 : RegExpCapture* capture = GetCapture(index);
446 : RegExpTree* atom = new (zone()) RegExpBackReference(capture);
447 5217 : builder->AddAtom(atom);
448 : }
449 5448 : break;
450 : }
451 : // With /u, no identity escapes except for syntax characters
452 : // are allowed. Otherwise, all identity escapes are allowed.
453 606 : if (unicode()) {
454 72 : return ReportError(CStrVector("Invalid escape"));
455 : }
456 : uc32 first_digit = Next();
457 534 : if (first_digit == '8' || first_digit == '9') {
458 68 : builder->AddCharacter(first_digit);
459 : Advance(2);
460 : break;
461 : }
462 : }
463 : // Fall through.
464 : case '0': {
465 3386 : Advance();
466 3424 : if (unicode() && Next() >= '0' && Next() <= '9') {
467 : // With /u, decimal escape with leading 0 are not parsed as octal.
468 19 : return ReportError(CStrVector("Invalid decimal escape"));
469 : }
470 3367 : uc32 octal = ParseOctalLiteral();
471 3367 : builder->AddCharacter(octal);
472 3367 : break;
473 : }
474 : // ControlEscape :: one of
475 : // f n r t v
476 : case 'f':
477 : Advance(2);
478 14 : builder->AddCharacter('\f');
479 14 : break;
480 : case 'n':
481 : Advance(2);
482 599 : builder->AddCharacter('\n');
483 599 : break;
484 : case 'r':
485 : Advance(2);
486 84 : builder->AddCharacter('\r');
487 84 : break;
488 : case 't':
489 : Advance(2);
490 16 : builder->AddCharacter('\t');
491 16 : break;
492 : case 'v':
493 : Advance(2);
494 36 : builder->AddCharacter('\v');
495 36 : break;
496 : case 'c': {
497 251 : Advance();
498 : uc32 controlLetter = Next();
499 : // Special case if it is an ASCII letter.
500 : // Convert lower case letters to uppercase.
501 251 : uc32 letter = controlLetter & ~('a' ^ 'A');
502 251 : if (letter < 'A' || 'Z' < letter) {
503 : // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
504 : // Read the backslash as a literal character instead of as
505 : // starting an escape.
506 : // ES#prod-annexB-ExtendedPatternCharacter
507 137 : if (unicode()) {
508 : // With /u, invalid escapes are not treated as identity escapes.
509 19 : return ReportError(CStrVector("Invalid unicode escape"));
510 : }
511 118 : builder->AddCharacter('\\');
512 : } else {
513 : Advance(2);
514 114 : builder->AddCharacter(controlLetter & 0x1f);
515 : }
516 : break;
517 : }
518 : case 'x': {
519 : Advance(2);
520 : uc32 value;
521 286 : if (ParseHexEscape(2, &value)) {
522 169 : builder->AddCharacter(value);
523 117 : } else if (!unicode()) {
524 81 : builder->AddCharacter('x');
525 : } else {
526 : // With /u, invalid escapes are not treated as identity escapes.
527 36 : return ReportError(CStrVector("Invalid escape"));
528 : }
529 250 : break;
530 : }
531 : case 'u': {
532 : Advance(2);
533 : uc32 value;
534 1818 : if (ParseUnicodeEscape(&value)) {
535 1599 : builder->AddEscapedUnicodeCharacter(value);
536 219 : } else if (!unicode()) {
537 147 : builder->AddCharacter('u');
538 : } else {
539 : // With /u, invalid escapes are not treated as identity escapes.
540 72 : return ReportError(CStrVector("Invalid unicode escape"));
541 : }
542 1746 : break;
543 : }
544 : case 'k':
545 : // Either an identity escape or a named back-reference. The two
546 : // interpretations are mutually exclusive: '\k' is interpreted as
547 : // an identity escape for non-unicode patterns without named
548 : // capture groups, and as the beginning of a named back-reference
549 : // in all other cases.
550 1593 : if (FLAG_harmony_regexp_named_captures &&
551 238 : (unicode() || HasNamedCaptures())) {
552 : Advance(2);
553 463 : ParseNamedBackReference(builder, state CHECK_FAILED);
554 : break;
555 : }
556 : // Fall through.
557 : default:
558 26982 : Advance();
559 : // With /u, no identity escapes except for syntax characters
560 : // are allowed. Otherwise, all identity escapes are allowed.
561 27024 : if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
562 26955 : builder->AddCharacter(current());
563 26955 : Advance();
564 : } else {
565 27 : return ReportError(CStrVector("Invalid escape"));
566 : }
567 26955 : break;
568 : }
569 : break;
570 : case '{': {
571 : int dummy;
572 684 : bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
573 619 : if (parsed) return ReportError(CStrVector("Nothing to repeat"));
574 : // Fall through.
575 : }
576 : case '}':
577 : case ']':
578 1182 : if (unicode()) {
579 28 : return ReportError(CStrVector("Lone quantifier brackets"));
580 : }
581 : // Fall through.
582 : default:
583 425682587 : builder->AddUnicodeCharacter(current());
584 425682587 : Advance();
585 425682587 : break;
586 : } // end switch(current())
587 :
588 : int min;
589 : int max;
590 427236369 : switch (current()) {
591 : // QuantifierPrefix ::
592 : // *
593 : // +
594 : // ?
595 : // {
596 : case '*':
597 38233 : min = 0;
598 38233 : max = RegExpTree::kInfinity;
599 38233 : Advance();
600 38233 : break;
601 : case '+':
602 13086 : min = 1;
603 13086 : max = RegExpTree::kInfinity;
604 13086 : Advance();
605 13086 : break;
606 : case '?':
607 1625542 : min = 0;
608 1625542 : max = 1;
609 1625542 : Advance();
610 1625542 : break;
611 : case '{':
612 11602 : if (ParseIntervalQuantifier(&min, &max)) {
613 10991 : if (max < min) {
614 : return ReportError(
615 2 : CStrVector("numbers out of order in {} quantifier"));
616 : }
617 : break;
618 611 : } else if (unicode()) {
619 : // With /u, incomplete quantifiers are not allowed.
620 199 : return ReportError(CStrVector("Incomplete quantifier"));
621 : }
622 : continue;
623 : default:
624 : continue;
625 : }
626 : RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
627 1687850 : if (current() == '?') {
628 : quantifier_type = RegExpQuantifier::NON_GREEDY;
629 5827 : Advance();
630 : } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
631 : // FLAG_regexp_possessive_quantifier is a debug-only flag.
632 : quantifier_type = RegExpQuantifier::POSSESSIVE;
633 : Advance();
634 : }
635 1687850 : if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
636 20 : return ReportError(CStrVector("Invalid quantifier"));
637 : }
638 : }
639 : }
640 :
641 :
642 : #ifdef DEBUG
643 : // Currently only used in an DCHECK.
644 : static bool IsSpecialClassEscape(uc32 c) {
645 : switch (c) {
646 : case 'd':
647 : case 'D':
648 : case 's':
649 : case 'S':
650 : case 'w':
651 : case 'W':
652 : return true;
653 : default:
654 : return false;
655 : }
656 : }
657 : #endif
658 :
659 :
660 : // In order to know whether an escape is a backreference or not we have to scan
661 : // the entire regexp and find the number of capturing parentheses. However we
662 : // don't want to scan the regexp twice unless it is necessary. This mini-parser
663 : // is called when needed. It can see the difference between capturing and
664 : // noncapturing parentheses and can skip character classes and backslash-escaped
665 : // characters.
666 9291 : void RegExpParser::ScanForCaptures() {
667 : DCHECK(!is_scanned_for_captures_);
668 : const int saved_position = position();
669 : // Start with captures started previous to current position
670 : int capture_count = captures_started();
671 : // Add count of captures after this position.
672 : int n;
673 7226 : while ((n = current()) != kEndMarker) {
674 5300 : Advance();
675 5300 : switch (n) {
676 : case '\\':
677 750 : Advance();
678 750 : break;
679 : case '[': {
680 : int c;
681 124 : while ((c = current()) != kEndMarker) {
682 124 : Advance();
683 124 : if (c == '\\') {
684 0 : Advance();
685 : } else {
686 124 : if (c == ']') break;
687 : }
688 : }
689 : break;
690 : }
691 : case '(':
692 689 : if (current() == '?') {
693 : // At this point we could be in
694 : // * a non-capturing group '(:',
695 : // * a lookbehind assertion '(?<=' '(?<!'
696 : // * or a named capture '(?<'.
697 : //
698 : // Of these, only named captures are capturing groups.
699 161 : if (!FLAG_harmony_regexp_named_captures) break;
700 :
701 161 : Advance();
702 161 : if (current() != '<') break;
703 :
704 128 : if (FLAG_harmony_regexp_lookbehind) {
705 128 : Advance();
706 128 : if (current() == '=' || current() == '!') break;
707 : }
708 :
709 : // Found a possible named capture. It could turn out to be a syntax
710 : // error (e.g. an unterminated or invalid name), but that distinction
711 : // does not matter for our purposes.
712 75 : has_named_captures_ = true;
713 : }
714 603 : capture_count++;
715 603 : break;
716 : }
717 : }
718 963 : capture_count_ = capture_count;
719 963 : is_scanned_for_captures_ = true;
720 : Reset(saved_position);
721 963 : }
722 :
723 :
724 18752 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
725 : DCHECK_EQ('\\', current());
726 : DCHECK('1' <= Next() && Next() <= '9');
727 : // Try to parse a decimal literal that is no greater than the total number
728 : // of left capturing parentheses in the input.
729 : int start = position();
730 5998 : int value = Next() - '0';
731 : Advance(2);
732 : while (true) {
733 : uc32 c = current();
734 6798 : if (IsDecimalDigit(c)) {
735 842 : value = 10 * value + (c - '0');
736 842 : if (value > kMaxCaptures) {
737 : Reset(start);
738 42 : return false;
739 : }
740 800 : Advance();
741 : } else {
742 : break;
743 : }
744 : }
745 5956 : if (value > captures_started()) {
746 1060 : if (!is_scanned_for_captures_) ScanForCaptures();
747 1060 : if (value > capture_count_) {
748 : Reset(start);
749 576 : return false;
750 : }
751 : }
752 5380 : *index_out = value;
753 6180 : return true;
754 : }
755 :
756 2594 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
757 2594 : if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
758 5108 : v->push_back(code_unit);
759 : } else {
760 80 : v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
761 80 : v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
762 : }
763 2594 : }
764 :
765 7131 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
766 : DCHECK(FLAG_harmony_regexp_named_captures);
767 :
768 : ZoneVector<uc16>* name =
769 2137 : new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
770 :
771 : bool at_start = true;
772 : while (true) {
773 4731 : uc32 c = current();
774 4731 : Advance();
775 :
776 : // Convert unicode escapes.
777 4994 : if (c == '\\' && current() == 'u') {
778 245 : Advance();
779 245 : if (!ParseUnicodeEscape(&c)) {
780 60 : ReportError(CStrVector("Invalid Unicode escape sequence"));
781 60 : return nullptr;
782 : }
783 : }
784 :
785 : // The backslash char is misclassified as both ID_Start and ID_Continue.
786 4671 : if (c == '\\') {
787 18 : ReportError(CStrVector("Invalid capture group name"));
788 18 : return nullptr;
789 : }
790 :
791 4653 : if (at_start) {
792 2104 : if (!IdentifierStart::Is(c)) {
793 102 : ReportError(CStrVector("Invalid capture group name"));
794 102 : return nullptr;
795 : }
796 2002 : push_code_unit(name, c);
797 : at_start = false;
798 : } else {
799 2549 : if (c == '>') {
800 : break;
801 811 : } else if (IdentifierPart::Is(c)) {
802 592 : push_code_unit(name, c);
803 : } else {
804 219 : ReportError(CStrVector("Invalid capture group name"));
805 219 : return nullptr;
806 : }
807 : }
808 : }
809 :
810 4332 : return name;
811 : }
812 :
813 1359 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
814 2312 : int index) {
815 : DCHECK(FLAG_harmony_regexp_named_captures);
816 : DCHECK(0 < index && index <= captures_started_);
817 : DCHECK_NOT_NULL(name);
818 :
819 1359 : if (named_captures_ == nullptr) {
820 995 : named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
821 : } else {
822 : // Check for duplicates and bail if we find any.
823 : // TODO(jgruber): O(n^2).
824 1136 : for (const auto& named_capture : *named_captures_) {
825 450 : if (*named_capture->name() == *name) {
826 42 : ReportError(CStrVector("Duplicate capture group name"));
827 42 : return false;
828 : }
829 : }
830 : }
831 :
832 1317 : RegExpCapture* capture = GetCapture(index);
833 : DCHECK_NULL(capture->name());
834 :
835 : capture->set_name(name);
836 1317 : named_captures_->Add(capture, zone());
837 :
838 1317 : return true;
839 : }
840 :
841 463 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
842 1399 : RegExpParserState* state) {
843 : // The parser is assumed to be on the '<' in \k<name>.
844 463 : if (current() != '<') {
845 42 : ReportError(CStrVector("Invalid named reference"));
846 42 : return false;
847 : }
848 :
849 421 : Advance();
850 421 : const ZoneVector<uc16>* name = ParseCaptureGroupName();
851 421 : if (name == nullptr) {
852 : return false;
853 : }
854 :
855 370 : if (state->IsInsideCaptureGroup(name)) {
856 : builder->AddEmpty();
857 : } else {
858 336 : RegExpBackReference* atom = new (zone()) RegExpBackReference();
859 : atom->set_name(name);
860 :
861 336 : builder->AddAtom(atom);
862 :
863 336 : if (named_back_references_ == nullptr) {
864 : named_back_references_ =
865 264 : new (zone()) ZoneList<RegExpBackReference*>(1, zone());
866 : }
867 336 : named_back_references_->Add(atom, zone());
868 : }
869 :
870 : return true;
871 : }
872 :
873 395413 : void RegExpParser::PatchNamedBackReferences() {
874 395896 : if (named_back_references_ == nullptr) return;
875 :
876 264 : if (named_captures_ == nullptr) {
877 15 : ReportError(CStrVector("Invalid named capture referenced"));
878 15 : return;
879 : }
880 :
881 : // Look up and patch the actual capture for each named back reference.
882 : // TODO(jgruber): O(n^2), optimize if necessary.
883 :
884 717 : for (int i = 0; i < named_back_references_->length(); i++) {
885 740 : RegExpBackReference* ref = named_back_references_->at(i);
886 :
887 : int index = -1;
888 827 : for (const auto& capture : *named_captures_) {
889 419 : if (*capture->name() == *ref->name()) {
890 : index = capture->index();
891 234 : break;
892 : }
893 : }
894 :
895 321 : if (index == -1) {
896 87 : ReportError(CStrVector("Invalid named capture referenced"));
897 87 : return;
898 : }
899 :
900 234 : ref->set_capture(GetCapture(index));
901 : }
902 : }
903 :
904 2561172 : RegExpCapture* RegExpParser::GetCapture(int index) {
905 : // The index for the capture groups are one-based. Its index in the list is
906 : // zero-based.
907 : int know_captures =
908 1275564 : is_scanned_for_captures_ ? capture_count_ : captures_started_;
909 : DCHECK(index <= know_captures);
910 3819924 : if (captures_ == nullptr) {
911 16812 : captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
912 : }
913 5088720 : while (captures_->length() < know_captures) {
914 2537592 : captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
915 : }
916 2551128 : return captures_->at(index - 1);
917 : }
918 :
919 396141 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
920 398105 : if (named_captures_ == nullptr || named_captures_->is_empty())
921 394481 : return Handle<FixedArray>();
922 :
923 : Factory* factory = isolate()->factory();
924 :
925 830 : int len = named_captures_->length() * 2;
926 830 : Handle<FixedArray> array = factory->NewFixedArray(len);
927 :
928 3928 : for (int i = 0; i < named_captures_->length(); i++) {
929 2268 : RegExpCapture* capture = named_captures_->at(i);
930 1134 : MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
931 2268 : array->set(i * 2, *name.ToHandleChecked());
932 : array->set(i * 2 + 1, Smi::FromInt(capture->index()));
933 : }
934 :
935 830 : return array;
936 : }
937 :
938 0 : bool RegExpParser::HasNamedCaptures() {
939 238 : if (has_named_captures_ || is_scanned_for_captures_) {
940 : return has_named_captures_;
941 : }
942 :
943 133 : ScanForCaptures();
944 : DCHECK(is_scanned_for_captures_);
945 133 : return has_named_captures_;
946 : }
947 :
948 0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
949 12651 : for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
950 6209 : if (s->group_type() != CAPTURE) continue;
951 : // Return true if we found the matching capture index.
952 751 : if (index == s->capture_index()) return true;
953 : // Abort if index is larger than what has been parsed up till this state.
954 588 : if (index > s->capture_index()) return false;
955 : }
956 : return false;
957 : }
958 :
959 370 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
960 : const ZoneVector<uc16>* name) {
961 : DCHECK_NOT_NULL(name);
962 730 : for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
963 394 : if (s->capture_name() == nullptr) continue;
964 58 : if (*s->capture_name() == *name) return true;
965 : }
966 : return false;
967 : }
968 :
969 : // QuantifierPrefix ::
970 : // { DecimalDigits }
971 : // { DecimalDigits , }
972 : // { DecimalDigits , DecimalDigits }
973 : //
974 : // Returns true if parsing succeeds, and set the min_out and max_out
975 : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
976 87088 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
977 : DCHECK_EQ(current(), '{');
978 : int start = position();
979 12239 : Advance();
980 : int min = 0;
981 12239 : if (!IsDecimalDigit(current())) {
982 : Reset(start);
983 1010 : return false;
984 : }
985 24082 : while (IsDecimalDigit(current())) {
986 : int next = current() - '0';
987 12991 : if (min > (RegExpTree::kInfinity - next) / 10) {
988 : // Overflow. Skip past remaining decimal digits and return -1.
989 966 : do {
990 966 : Advance();
991 : } while (IsDecimalDigit(current()));
992 : min = RegExpTree::kInfinity;
993 : break;
994 : }
995 12853 : min = 10 * min + next;
996 12853 : Advance();
997 : }
998 : int max = 0;
999 11229 : if (current() == '}') {
1000 : max = min;
1001 4780 : Advance();
1002 6449 : } else if (current() == ',') {
1003 6367 : Advance();
1004 6367 : if (current() == '}') {
1005 : max = RegExpTree::kInfinity;
1006 352 : Advance();
1007 : } else {
1008 13051 : while (IsDecimalDigit(current())) {
1009 : int next = current() - '0';
1010 7108 : if (max > (RegExpTree::kInfinity - next) / 10) {
1011 900 : do {
1012 900 : Advance();
1013 : } while (IsDecimalDigit(current()));
1014 : max = RegExpTree::kInfinity;
1015 : break;
1016 : }
1017 7036 : max = 10 * max + next;
1018 7036 : Advance();
1019 : }
1020 6015 : if (current() != '}') {
1021 : Reset(start);
1022 127 : return false;
1023 : }
1024 5888 : Advance();
1025 : }
1026 : } else {
1027 : Reset(start);
1028 82 : return false;
1029 : }
1030 11020 : *min_out = min;
1031 11020 : *max_out = max;
1032 11020 : return true;
1033 : }
1034 :
1035 :
1036 11855 : uc32 RegExpParser::ParseOctalLiteral() {
1037 : DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1038 : // For compatibility with some other browsers (not all), we parse
1039 : // up to three octal digits with a value below 256.
1040 : // ES#prod-annexB-LegacyOctalEscapeSequence
1041 4497 : uc32 value = current() - '0';
1042 4497 : Advance();
1043 4497 : if ('0' <= current() && current() <= '7') {
1044 1528 : value = value * 8 + current() - '0';
1045 1528 : Advance();
1046 3017 : if (value < 32 && '0' <= current() && current() <= '7') {
1047 2744 : value = value * 8 + current() - '0';
1048 1372 : Advance();
1049 : }
1050 : }
1051 4497 : return value;
1052 : }
1053 :
1054 :
1055 72550 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1056 : int start = position();
1057 : uc32 val = 0;
1058 72127 : for (int i = 0; i < length; ++i) {
1059 : uc32 c = current();
1060 : int d = HexValue(c);
1061 57071 : if (d < 0) {
1062 : Reset(start);
1063 423 : return false;
1064 : }
1065 56648 : val = val * 16 + d;
1066 56648 : Advance();
1067 : }
1068 15056 : *value = val;
1069 15056 : return true;
1070 : }
1071 :
1072 : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1073 33251 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1074 : // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1075 : // allowed). In the latter case, the number of hex digits between { } is
1076 : // arbitrary. \ and u have already been read.
1077 16506 : if (current() == '{' && unicode()) {
1078 : int start = position();
1079 1609 : Advance();
1080 1609 : if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1081 1582 : if (current() == '}') {
1082 1576 : Advance();
1083 1576 : return true;
1084 : }
1085 : }
1086 : Reset(start);
1087 33 : return false;
1088 : }
1089 : // \u but no {, or \u{...} escapes not allowed.
1090 13198 : bool result = ParseHexEscape(4, value);
1091 37374 : if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1092 : current() == '\\') {
1093 : // Attempt to read trail surrogate.
1094 : int start = position();
1095 206 : if (Next() == 'u') {
1096 : Advance(2);
1097 : uc32 trail;
1098 382 : if (ParseHexEscape(4, &trail) &&
1099 186 : unibrow::Utf16::IsTrailSurrogate(trail)) {
1100 : *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1101 372 : static_cast<uc16>(trail));
1102 186 : return true;
1103 : }
1104 : }
1105 : Reset(start);
1106 : }
1107 13012 : return result;
1108 : }
1109 :
1110 : #ifdef V8_INTL_SUPPORT
1111 :
1112 : namespace {
1113 :
1114 3796 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1115 3796 : const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1116 3796 : if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1117 : return true;
1118 : for (int i = 0;; i++) {
1119 : const char* long_name = u_getPropertyName(
1120 758 : property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1121 758 : if (long_name == nullptr) break;
1122 686 : if (strcmp(property_name, long_name) == 0) return true;
1123 : }
1124 : return false;
1125 : }
1126 :
1127 4037 : bool IsExactPropertyValueAlias(const char* property_value_name,
1128 : UProperty property, int32_t property_value) {
1129 : const char* short_name =
1130 4037 : u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1131 4037 : if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1132 : return true;
1133 : }
1134 : for (int i = 0;; i++) {
1135 : const char* long_name = u_getPropertyValueName(
1136 : property, property_value,
1137 337 : static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1138 337 : if (long_name == nullptr) break;
1139 328 : if (strcmp(property_value_name, long_name) == 0) return true;
1140 : }
1141 : return false;
1142 : }
1143 :
1144 5620 : bool LookupPropertyValueName(UProperty property,
1145 : const char* property_value_name, bool negate,
1146 : ZoneList<CharacterRange>* result, Zone* zone) {
1147 : UProperty property_for_lookup = property;
1148 5620 : if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1149 : // For the property Script_Extensions, we have to do the property value
1150 : // name lookup as if the property is Script.
1151 : property_for_lookup = UCHAR_SCRIPT;
1152 : }
1153 : int32_t property_value =
1154 5620 : u_getPropertyValueEnum(property_for_lookup, property_value_name);
1155 5620 : if (property_value == UCHAR_INVALID_CODE) return false;
1156 :
1157 : // We require the property name to match exactly to one of the property value
1158 : // aliases. However, u_getPropertyValueEnum uses loose matching.
1159 4037 : if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1160 4037 : property_value)) {
1161 : return false;
1162 : }
1163 :
1164 4028 : UErrorCode ec = U_ZERO_ERROR;
1165 4028 : icu::UnicodeSet set;
1166 4028 : set.applyIntPropertyValue(property, property_value, ec);
1167 4028 : bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1168 :
1169 4028 : if (success) {
1170 4028 : set.removeAllStrings();
1171 4028 : if (negate) set.complement();
1172 480345 : for (int i = 0; i < set.getRangeCount(); i++) {
1173 : result->Add(
1174 480345 : CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1175 480345 : zone);
1176 : }
1177 : }
1178 4028 : return success;
1179 : }
1180 :
1181 : template <size_t N>
1182 : inline bool NameEquals(const char* name, const char (&literal)[N]) {
1183 4551 : return strncmp(name, literal, N + 1) == 0;
1184 : }
1185 :
1186 1592 : bool LookupSpecialPropertyValueName(const char* name,
1187 : ZoneList<CharacterRange>* result,
1188 : bool negate, Zone* zone) {
1189 1592 : if (NameEquals(name, "Any")) {
1190 60 : if (!negate) result->Add(CharacterRange::Everything(), zone);
1191 1532 : } else if (NameEquals(name, "ASCII")) {
1192 : result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1193 : : CharacterRange::Range(0x0, 0x7f),
1194 105 : zone);
1195 1427 : } else if (NameEquals(name, "Assigned")) {
1196 : return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1197 75 : !negate, result, zone);
1198 : } else {
1199 : return false;
1200 : }
1201 : return true;
1202 : }
1203 :
1204 : // Explicitly whitelist supported binary properties. The spec forbids supporting
1205 : // properties outside of this set to ensure interoperability.
1206 : bool IsSupportedBinaryProperty(UProperty property) {
1207 : switch (property) {
1208 : case UCHAR_ALPHABETIC:
1209 : // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1210 : // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1211 : case UCHAR_ASCII_HEX_DIGIT:
1212 : // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1213 : case UCHAR_BIDI_CONTROL:
1214 : case UCHAR_BIDI_MIRRORED:
1215 : case UCHAR_CASE_IGNORABLE:
1216 : case UCHAR_CASED:
1217 : case UCHAR_CHANGES_WHEN_CASEFOLDED:
1218 : case UCHAR_CHANGES_WHEN_CASEMAPPED:
1219 : case UCHAR_CHANGES_WHEN_LOWERCASED:
1220 : case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1221 : case UCHAR_CHANGES_WHEN_TITLECASED:
1222 : case UCHAR_CHANGES_WHEN_UPPERCASED:
1223 : case UCHAR_DASH:
1224 : case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1225 : case UCHAR_DEPRECATED:
1226 : case UCHAR_DIACRITIC:
1227 : case UCHAR_EMOJI:
1228 : // TODO(yangguo): Uncomment this once we upgrade to ICU 60.
1229 : // See https://ssl.icu-project.org/trac/ticket/13062
1230 : // case UCHAR_EMOJI_COMPONENT:
1231 : case UCHAR_EMOJI_MODIFIER_BASE:
1232 : case UCHAR_EMOJI_MODIFIER:
1233 : case UCHAR_EMOJI_PRESENTATION:
1234 : case UCHAR_EXTENDER:
1235 : case UCHAR_GRAPHEME_BASE:
1236 : case UCHAR_GRAPHEME_EXTEND:
1237 : case UCHAR_HEX_DIGIT:
1238 : case UCHAR_ID_CONTINUE:
1239 : case UCHAR_ID_START:
1240 : case UCHAR_IDEOGRAPHIC:
1241 : case UCHAR_IDS_BINARY_OPERATOR:
1242 : case UCHAR_IDS_TRINARY_OPERATOR:
1243 : case UCHAR_JOIN_CONTROL:
1244 : case UCHAR_LOGICAL_ORDER_EXCEPTION:
1245 : case UCHAR_LOWERCASE:
1246 : case UCHAR_MATH:
1247 : case UCHAR_NONCHARACTER_CODE_POINT:
1248 : case UCHAR_PATTERN_SYNTAX:
1249 : case UCHAR_PATTERN_WHITE_SPACE:
1250 : case UCHAR_QUOTATION_MARK:
1251 : case UCHAR_RADICAL:
1252 : case UCHAR_S_TERM:
1253 : case UCHAR_SOFT_DOTTED:
1254 : case UCHAR_TERMINAL_PUNCTUATION:
1255 : case UCHAR_UNIFIED_IDEOGRAPH:
1256 : case UCHAR_UPPERCASE:
1257 : case UCHAR_VARIATION_SELECTOR:
1258 : case UCHAR_WHITE_SPACE:
1259 : case UCHAR_XID_CONTINUE:
1260 : case UCHAR_XID_START:
1261 : return true;
1262 : default:
1263 : break;
1264 : }
1265 : return false;
1266 : }
1267 :
1268 : } // anonymous namespace
1269 :
1270 4994 : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1271 58406 : bool negate) {
1272 : // Parse the property class as follows:
1273 : // - In \p{name}, 'name' is interpreted
1274 : // - either as a general category property value name.
1275 : // - or as a binary property name.
1276 : // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1277 : // and 'value' is interpreted as one of the available property value names.
1278 : // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1279 : // - Loose matching is not applied.
1280 : std::vector<char> first_part;
1281 : std::vector<char> second_part;
1282 4994 : if (current() == '{') {
1283 : // Parse \p{[PropertyName=]PropertyNameValue}
1284 62070 : for (Advance(); current() != '}' && current() != '='; Advance()) {
1285 26149 : if (!has_next()) return false;
1286 52244 : first_part.push_back(static_cast<char>(current()));
1287 : }
1288 4886 : if (current() == '=') {
1289 30480 : for (Advance(); current() != '}'; Advance()) {
1290 12319 : if (!has_next()) return false;
1291 24638 : second_part.push_back(static_cast<char>(current()));
1292 : }
1293 5842 : second_part.push_back(0); // null-terminate string.
1294 : }
1295 : } else {
1296 : return false;
1297 : }
1298 4886 : Advance();
1299 9772 : first_part.push_back(0); // null-terminate string.
1300 :
1301 4886 : if (second_part.empty()) {
1302 : // First attempt to interpret as general category property value name.
1303 : const char* name = first_part.data();
1304 1965 : if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1305 1965 : result, zone())) {
1306 : return true;
1307 : }
1308 : // Interpret "Any", "ASCII", and "Assigned".
1309 1592 : if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1310 : return true;
1311 : }
1312 : // Then attempt to interpret as binary property name with value name 'Y'.
1313 1352 : UProperty property = u_getPropertyEnum(name);
1314 1352 : if (!IsSupportedBinaryProperty(property)) return false;
1315 875 : if (!IsExactPropertyAlias(name, property)) return false;
1316 : return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1317 875 : zone());
1318 : } else {
1319 : // Both property name and value name are specified. Attempt to interpret
1320 : // the property name as enumerated property.
1321 : const char* property_name = first_part.data();
1322 : const char* value_name = second_part.data();
1323 2921 : UProperty property = u_getPropertyEnum(property_name);
1324 2921 : if (!IsExactPropertyAlias(property_name, property)) return false;
1325 2849 : if (property == UCHAR_GENERAL_CATEGORY) {
1326 : // We want to allow aggregate value names such as "Letter".
1327 : property = UCHAR_GENERAL_CATEGORY_MASK;
1328 5618 : } else if (property != UCHAR_SCRIPT &&
1329 2809 : property != UCHAR_SCRIPT_EXTENSIONS) {
1330 : return false;
1331 : }
1332 : return LookupPropertyValueName(property, value_name, negate, result,
1333 2705 : zone());
1334 : }
1335 : }
1336 :
1337 : #else // V8_INTL_SUPPORT
1338 :
1339 : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1340 : bool negate) {
1341 : return false;
1342 : }
1343 :
1344 : #endif // V8_INTL_SUPPORT
1345 :
1346 9334 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1347 : uc32 x = 0;
1348 : int d = HexValue(current());
1349 1609 : if (d < 0) {
1350 : return false;
1351 : }
1352 9334 : while (d >= 0) {
1353 7752 : x = x * 16 + d;
1354 7752 : if (x > max_value) {
1355 : return false;
1356 : }
1357 7725 : Advance();
1358 : d = HexValue(current());
1359 : }
1360 1582 : *value = x;
1361 1582 : return true;
1362 : }
1363 :
1364 :
1365 44462 : uc32 RegExpParser::ParseClassCharacterEscape() {
1366 : DCHECK_EQ('\\', current());
1367 : DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1368 19598 : Advance();
1369 19598 : switch (current()) {
1370 : case 'b':
1371 14 : Advance();
1372 14 : return '\b';
1373 : // ControlEscape :: one of
1374 : // f n r t v
1375 : case 'f':
1376 0 : Advance();
1377 0 : return '\f';
1378 : case 'n':
1379 475 : Advance();
1380 475 : return '\n';
1381 : case 'r':
1382 50 : Advance();
1383 50 : return '\r';
1384 : case 't':
1385 186 : Advance();
1386 186 : return '\t';
1387 : case 'v':
1388 11 : Advance();
1389 11 : return '\v';
1390 : case 'c': {
1391 : uc32 controlLetter = Next();
1392 386 : uc32 letter = controlLetter & ~('A' ^ 'a');
1393 : // Inside a character class, we also accept digits and underscore as
1394 : // control characters, unless with /u. See Annex B:
1395 : // ES#prod-annexB-ClassControlLetter
1396 386 : if (letter >= 'A' && letter <= 'Z') {
1397 : Advance(2);
1398 : // Control letters mapped to ASCII control characters in the range
1399 : // 0x00-0x1f.
1400 60 : return controlLetter & 0x1f;
1401 : }
1402 326 : if (unicode()) {
1403 : // With /u, invalid escapes are not treated as identity escapes.
1404 18 : ReportError(CStrVector("Invalid class escape"));
1405 18 : return 0;
1406 : }
1407 616 : if ((controlLetter >= '0' && controlLetter <= '9') ||
1408 308 : controlLetter == '_') {
1409 : Advance(2);
1410 202 : return controlLetter & 0x1f;
1411 : }
1412 : // We match JSC in reading the backslash as a literal
1413 : // character instead of as starting an escape.
1414 : // TODO(v8:6201): Not yet covered by the spec.
1415 : return '\\';
1416 : }
1417 : case '0':
1418 : // With /u, \0 is interpreted as NUL if not followed by another digit.
1419 1101 : if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1420 35 : Advance();
1421 35 : return 0;
1422 : }
1423 : // Fall through.
1424 : case '1':
1425 : case '2':
1426 : case '3':
1427 : case '4':
1428 : case '5':
1429 : case '6':
1430 : case '7':
1431 : // For compatibility, we interpret a decimal escape that isn't
1432 : // a back reference (and therefore either \0 or not valid according
1433 : // to the specification) as a 1..3 digit octal character code.
1434 : // ES#prod-annexB-LegacyOctalEscapeSequence
1435 1166 : if (unicode()) {
1436 : // With /u, decimal escape is not interpreted as octal character code.
1437 36 : ReportError(CStrVector("Invalid class escape"));
1438 36 : return 0;
1439 : }
1440 1130 : return ParseOctalLiteral();
1441 : case 'x': {
1442 1799 : Advance();
1443 : uc32 value;
1444 1799 : if (ParseHexEscape(2, &value)) return value;
1445 30 : if (unicode()) {
1446 : // With /u, invalid escapes are not treated as identity escapes.
1447 0 : ReportError(CStrVector("Invalid escape"));
1448 0 : return 0;
1449 : }
1450 : // If \x is not followed by a two-digit hexadecimal, treat it
1451 : // as an identity escape.
1452 : return 'x';
1453 : }
1454 : case 'u': {
1455 12744 : Advance();
1456 : uc32 value;
1457 12744 : if (ParseUnicodeEscape(&value)) return value;
1458 20 : if (unicode()) {
1459 : // With /u, invalid escapes are not treated as identity escapes.
1460 0 : ReportError(CStrVector("Invalid unicode escape"));
1461 0 : return 0;
1462 : }
1463 : // If \u is not followed by a two-digit hexadecimal, treat it
1464 : // as an identity escape.
1465 : return 'u';
1466 : }
1467 : default: {
1468 : uc32 result = current();
1469 : // With /u, no identity escapes except for syntax characters and '-' are
1470 : // allowed. Otherwise, all identity escapes are allowed.
1471 2786 : if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1472 2723 : Advance();
1473 2723 : return result;
1474 : }
1475 9 : ReportError(CStrVector("Invalid escape"));
1476 9 : return 0;
1477 : }
1478 : }
1479 : return 0;
1480 : }
1481 :
1482 428021 : void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1483 : Zone* zone,
1484 : bool add_unicode_case_equivalents,
1485 428364 : uc32* char_out, bool* is_class_escape) {
1486 : uc32 current_char = current();
1487 428021 : if (current_char == '\\') {
1488 21637 : switch (Next()) {
1489 : case 'w':
1490 : case 'W':
1491 : case 'd':
1492 : case 'D':
1493 : case 's':
1494 : case 'S': {
1495 : CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1496 3392 : add_unicode_case_equivalents, zone);
1497 : Advance(2);
1498 1696 : *is_class_escape = true;
1499 1696 : return;
1500 : }
1501 : case kEndMarker:
1502 0 : ReportError(CStrVector("\\ at end of pattern"));
1503 0 : return;
1504 : case 'p':
1505 : case 'P':
1506 757 : if (FLAG_harmony_regexp_property && unicode()) {
1507 343 : bool negate = Next() == 'P';
1508 : Advance(2);
1509 343 : if (!ParsePropertyClass(ranges, negate)) {
1510 45 : ReportError(CStrVector("Invalid property name in character class"));
1511 : }
1512 343 : *is_class_escape = true;
1513 343 : return;
1514 : }
1515 : break;
1516 : default:
1517 : break;
1518 : }
1519 19598 : *char_out = ParseClassCharacterEscape();
1520 19598 : *is_class_escape = false;
1521 : } else {
1522 406384 : Advance();
1523 406384 : *char_out = current_char;
1524 406384 : *is_class_escape = false;
1525 : }
1526 : }
1527 :
1528 2913280 : RegExpTree* RegExpParser::ParseCharacterClass() {
1529 : static const char* kUnterminated = "Unterminated character class";
1530 : static const char* kRangeInvalid = "Invalid character class";
1531 : static const char* kRangeOutOfOrder = "Range out of order in character class";
1532 :
1533 : DCHECK_EQ(current(), '[');
1534 164331 : Advance();
1535 : bool is_negated = false;
1536 164331 : if (current() == '^') {
1537 : is_negated = true;
1538 10674 : Advance();
1539 : }
1540 164127 : ZoneList<CharacterRange>* ranges =
1541 164331 : new (zone()) ZoneList<CharacterRange>(2, zone());
1542 165515 : bool add_unicode_case_equivalents = unicode() && ignore_case();
1543 987713 : while (has_more() && current() != ']') {
1544 : uc32 char_1, char_2;
1545 : bool is_class_1, is_class_2;
1546 : ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1547 495886 : &is_class_1 CHECK_FAILED);
1548 247742 : if (current() == '-') {
1549 180469 : Advance();
1550 180469 : if (current() == kEndMarker) {
1551 : // If we reach the end we break out of the loop and let the
1552 : // following code report an error.
1553 : break;
1554 180463 : } else if (current() == ']') {
1555 562 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1556 292 : ranges->Add(CharacterRange::Singleton('-'), zone());
1557 292 : break;
1558 : }
1559 : ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1560 180171 : &is_class_2 CHECK_FAILED);
1561 180171 : if (is_class_1 || is_class_2) {
1562 : // Either end is an escaped character class. Treat the '-' verbatim.
1563 211 : if (unicode()) {
1564 : // ES2015 21.2.2.15.1 step 1.
1565 54 : return ReportError(CStrVector(kRangeInvalid));
1566 : }
1567 216 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1568 184 : ranges->Add(CharacterRange::Singleton('-'), zone());
1569 314 : if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1570 184 : continue;
1571 : }
1572 : // ES2015 21.2.2.15.1 step 6.
1573 179960 : if (char_1 > char_2) {
1574 102 : return ReportError(CStrVector(kRangeOutOfOrder));
1575 : }
1576 179909 : ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1577 : } else {
1578 132816 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1579 : }
1580 : }
1581 164145 : if (!has_more()) {
1582 36 : return ReportError(CStrVector(kUnterminated));
1583 : }
1584 164127 : Advance();
1585 164127 : if (ranges->length() == 0) {
1586 315 : ranges->Add(CharacterRange::Everything(), zone());
1587 315 : is_negated = !is_negated;
1588 : }
1589 : RegExpCharacterClass::Flags flags;
1590 164127 : if (is_negated) flags = RegExpCharacterClass::NEGATED;
1591 : return new (zone()) RegExpCharacterClass(ranges, flags);
1592 : }
1593 :
1594 :
1595 : #undef CHECK_FAILED
1596 :
1597 :
1598 397989 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1599 : FlatStringReader* input, JSRegExp::Flags flags,
1600 : RegExpCompileData* result) {
1601 : DCHECK(result != nullptr);
1602 397989 : RegExpParser parser(input, &result->error, flags, isolate, zone);
1603 397989 : RegExpTree* tree = parser.ParsePattern();
1604 397989 : if (parser.failed()) {
1605 : DCHECK(tree == nullptr);
1606 : DCHECK(!result->error.is_null());
1607 : } else {
1608 : DCHECK(tree != nullptr);
1609 : DCHECK(result->error.is_null());
1610 : if (FLAG_trace_regexp_parser) {
1611 : OFStream os(stdout);
1612 : tree->Print(os, zone);
1613 : os << "\n";
1614 : }
1615 395311 : result->tree = tree;
1616 395311 : int capture_count = parser.captures_started();
1617 606783 : result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1618 395311 : result->contains_anchor = parser.contains_anchor();
1619 395311 : result->capture_name_map = parser.CreateCaptureNameMap();
1620 395311 : result->capture_count = capture_count;
1621 : }
1622 397989 : return !parser.failed();
1623 : }
1624 :
1625 0 : RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1626 : : zone_(zone),
1627 : pending_empty_(false),
1628 : ignore_case_(ignore_case),
1629 : unicode_(unicode),
1630 : characters_(nullptr),
1631 : pending_surrogate_(kNoPendingSurrogate),
1632 : terms_(),
1633 1699012 : alternatives_()
1634 : #ifdef DEBUG
1635 : ,
1636 : last_added_(ADD_NONE)
1637 : #endif
1638 : {
1639 0 : }
1640 :
1641 :
1642 0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1643 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1644 : FlushPendingSurrogate();
1645 : // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1646 785 : pending_surrogate_ = lead_surrogate;
1647 0 : }
1648 :
1649 :
1650 2725 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1651 : DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1652 721 : if (pending_surrogate_ != kNoPendingSurrogate) {
1653 531 : uc16 lead_surrogate = pending_surrogate_;
1654 531 : pending_surrogate_ = kNoPendingSurrogate;
1655 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1656 : uc32 combined =
1657 531 : unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1658 531 : if (NeedsDesugaringForIgnoreCase(combined)) {
1659 30 : AddCharacterClassForDesugaring(combined);
1660 : } else {
1661 501 : ZoneList<uc16> surrogate_pair(2, zone());
1662 501 : surrogate_pair.Add(lead_surrogate, zone());
1663 501 : surrogate_pair.Add(trail_surrogate, zone());
1664 : RegExpAtom* atom =
1665 501 : new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1666 501 : AddAtom(atom);
1667 : }
1668 : } else {
1669 190 : pending_surrogate_ = trail_surrogate;
1670 : FlushPendingSurrogate();
1671 : }
1672 721 : }
1673 :
1674 :
1675 0 : void RegExpBuilder::FlushPendingSurrogate() {
1676 432448584 : if (pending_surrogate_ != kNoPendingSurrogate) {
1677 : DCHECK(unicode());
1678 444 : uc32 c = pending_surrogate_;
1679 444 : pending_surrogate_ = kNoPendingSurrogate;
1680 444 : AddCharacterClassForDesugaring(c);
1681 : }
1682 0 : }
1683 :
1684 :
1685 5879995 : void RegExpBuilder::FlushCharacters() {
1686 : FlushPendingSurrogate();
1687 5041373 : pending_empty_ = false;
1688 5041373 : if (characters_ != nullptr) {
1689 : RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1690 419311 : characters_ = nullptr;
1691 419311 : text_.Add(atom, zone());
1692 : LAST(ADD_ATOM);
1693 : }
1694 5041373 : }
1695 :
1696 :
1697 5438067 : void RegExpBuilder::FlushText() {
1698 4832029 : FlushCharacters();
1699 : int num_text = text_.length();
1700 4832029 : if (num_text == 0) {
1701 4832029 : return;
1702 584270 : } else if (num_text == 1) {
1703 579026 : terms_.Add(text_.last(), zone());
1704 : } else {
1705 : RegExpText* text = new (zone()) RegExpText(zone());
1706 38292 : for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1707 5244 : terms_.Add(text, zone());
1708 : }
1709 : text_.Clear();
1710 : }
1711 :
1712 :
1713 853484715 : void RegExpBuilder::AddCharacter(uc16 c) {
1714 : FlushPendingSurrogate();
1715 425715188 : pending_empty_ = false;
1716 425715188 : if (NeedsDesugaringForIgnoreCase(c)) {
1717 215 : AddCharacterClassForDesugaring(c);
1718 : } else {
1719 425714973 : if (characters_ == nullptr) {
1720 2054554 : characters_ = new (zone()) ZoneList<uc16>(4, zone());
1721 : }
1722 425714973 : characters_->Add(c, zone());
1723 : LAST(ADD_CHAR);
1724 : }
1725 425715188 : }
1726 :
1727 :
1728 851367841 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1729 425684186 : if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1730 : DCHECK(unicode());
1731 531 : AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1732 531 : AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1733 425686100 : } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1734 254 : AddLeadSurrogate(c);
1735 425685592 : } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1736 190 : AddTrailSurrogate(c);
1737 : } else {
1738 425683211 : AddCharacter(static_cast<uc16>(c));
1739 : }
1740 425684186 : }
1741 :
1742 1599 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1743 : // A lead or trail surrogate parsed via escape sequence will not
1744 : // pair up with any preceding lead or following trail surrogate.
1745 : FlushPendingSurrogate();
1746 1599 : AddUnicodeCharacter(character);
1747 : FlushPendingSurrogate();
1748 1599 : }
1749 :
1750 197 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1751 :
1752 :
1753 213666 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1754 213666 : if (NeedsDesugaringForUnicode(cc)) {
1755 : // With /u, character class needs to be desugared, so it
1756 : // must be a standalone term instead of being part of a RegExpText.
1757 4823 : AddTerm(cc);
1758 : } else {
1759 208843 : AddAtom(cc);
1760 : }
1761 213666 : }
1762 :
1763 1378 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1764 : AddTerm(new (zone()) RegExpCharacterClass(
1765 1378 : CharacterRange::List(zone(), CharacterRange::Singleton(c))));
1766 689 : }
1767 :
1768 :
1769 3031374 : void RegExpBuilder::AddAtom(RegExpTree* term) {
1770 1515687 : if (term->IsEmpty()) {
1771 : AddEmpty();
1772 1515687 : return;
1773 : }
1774 1515687 : if (term->IsTextElement()) {
1775 209344 : FlushCharacters();
1776 209344 : text_.Add(term, zone());
1777 : } else {
1778 1306343 : FlushText();
1779 1306343 : terms_.Add(term, zone());
1780 : }
1781 : LAST(ADD_ATOM);
1782 : }
1783 :
1784 :
1785 11024 : void RegExpBuilder::AddTerm(RegExpTree* term) {
1786 5512 : FlushText();
1787 5512 : terms_.Add(term, zone());
1788 : LAST(ADD_ATOM);
1789 5512 : }
1790 :
1791 :
1792 43886 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1793 22039 : FlushText();
1794 22039 : if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1795 : // Omit repeated assertions of the same type.
1796 738 : RegExpAssertion* last = terms_.last()->AsAssertion();
1797 738 : RegExpAssertion* next = assert->AsAssertion();
1798 22408 : if (last->assertion_type() == next->assertion_type()) return;
1799 : }
1800 21847 : terms_.Add(assert, zone());
1801 : LAST(ADD_ASSERT);
1802 : }
1803 :
1804 :
1805 132851 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
1806 :
1807 :
1808 4185624 : void RegExpBuilder::FlushTerms() {
1809 1829379 : FlushText();
1810 : int num_terms = terms_.length();
1811 : RegExpTree* alternative;
1812 1829379 : if (num_terms == 0) {
1813 : alternative = new (zone()) RegExpEmpty();
1814 1433721 : } else if (num_terms == 1) {
1815 : alternative = terms_.last();
1816 : } else {
1817 131208 : alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1818 : }
1819 1829379 : alternatives_.Add(alternative, zone());
1820 : terms_.Clear();
1821 : LAST(ADD_NONE);
1822 1829379 : }
1823 :
1824 :
1825 225956 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1826 213666 : if (!unicode()) return false;
1827 : // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1828 : // necessarily mean that we need to desugar. It's probably nicer to have a
1829 : // separate pass to figure out unicode desugarings.
1830 6302 : if (ignore_case()) return true;
1831 5988 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1832 5988 : CharacterRange::Canonicalize(ranges);
1833 16628 : for (int i = ranges->length() - 1; i >= 0; i--) {
1834 15149 : uc32 from = ranges->at(i).from();
1835 15149 : uc32 to = ranges->at(i).to();
1836 : // Check for non-BMP characters.
1837 15149 : if (to >= kNonBmpStart) return true;
1838 : // Check for lone surrogates.
1839 10750 : if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1840 : }
1841 : return false;
1842 : }
1843 :
1844 :
1845 425718266 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1846 : #ifdef V8_INTL_SUPPORT
1847 425718266 : if (unicode() && ignore_case()) {
1848 260 : icu::UnicodeSet set(c, c);
1849 260 : set.closeOver(USET_CASE_INSENSITIVE);
1850 260 : set.removeAllStrings();
1851 260 : return set.size() > 1;
1852 : }
1853 : // In the case where ICU is not included, we act as if the unicode flag is
1854 : // not set, and do not desugar.
1855 : #endif // V8_INTL_SUPPORT
1856 : return false;
1857 : }
1858 :
1859 :
1860 1749078 : RegExpTree* RegExpBuilder::ToRegExp() {
1861 1696528 : FlushTerms();
1862 : int num_alternatives = alternatives_.length();
1863 1696528 : if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1864 1696528 : if (num_alternatives == 1) return alternatives_.last();
1865 52550 : return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1866 : }
1867 :
1868 1687850 : bool RegExpBuilder::AddQuantifierToAtom(
1869 3342405 : int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1870 : FlushPendingSurrogate();
1871 1687850 : if (pending_empty_) {
1872 2 : pending_empty_ = false;
1873 2 : return true;
1874 : }
1875 : RegExpTree* atom;
1876 1687848 : if (characters_ != nullptr) {
1877 : DCHECK(last_added_ == ADD_CHAR);
1878 : // Last atom was character.
1879 : Vector<const uc16> char_vector = characters_->ToConstVector();
1880 : int num_chars = char_vector.length();
1881 1634995 : if (num_chars > 1) {
1882 674 : Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1883 674 : text_.Add(new (zone()) RegExpAtom(prefix), zone());
1884 : char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1885 : }
1886 1634995 : characters_ = nullptr;
1887 : atom = new (zone()) RegExpAtom(char_vector);
1888 1634995 : FlushText();
1889 52853 : } else if (text_.length() > 0) {
1890 : DCHECK(last_added_ == ADD_ATOM);
1891 33761 : atom = text_.RemoveLast();
1892 33761 : FlushText();
1893 19092 : } else if (terms_.length() > 0) {
1894 : DCHECK(last_added_ == ADD_ATOM);
1895 19092 : atom = terms_.RemoveLast();
1896 : // With /u, lookarounds are not quantifiable.
1897 19092 : if (unicode() && atom->IsLookaround()) return false;
1898 19072 : if (atom->max_match() == 0) {
1899 : // Guaranteed to only match an empty string.
1900 : LAST(ADD_TERM);
1901 297 : if (min == 0) {
1902 : return true;
1903 : }
1904 113 : terms_.Add(atom, zone());
1905 113 : return true;
1906 : }
1907 : } else {
1908 : // Only call immediately after adding an atom or character!
1909 0 : UNREACHABLE();
1910 : }
1911 1687531 : terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1912 1687531 : zone());
1913 : LAST(ADD_TERM);
1914 1687531 : return true;
1915 : }
1916 :
1917 : } // namespace internal
1918 : } // namespace v8
|