Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/regexp-parser.h"
6 :
7 : #include <vector>
8 :
9 : #include "src/char-predicates-inl.h"
10 : #include "src/heap/factory.h"
11 : #include "src/isolate.h"
12 : #include "src/objects-inl.h"
13 : #include "src/ostreams.h"
14 : #include "src/regexp/jsregexp.h"
15 : #include "src/regexp/property-sequences.h"
16 : #include "src/utils.h"
17 :
18 : #ifdef V8_INTL_SUPPORT
19 : #include "unicode/uniset.h"
20 : #endif // V8_INTL_SUPPORT
21 :
22 : namespace v8 {
23 : namespace internal {
24 :
25 0 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
26 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
27 : : isolate_(isolate),
28 : zone_(zone),
29 : error_(error),
30 : captures_(nullptr),
31 : named_captures_(nullptr),
32 : named_back_references_(nullptr),
33 : in_(in),
34 : current_(kEndMarker),
35 : top_level_flags_(flags),
36 : next_pos_(0),
37 : captures_started_(0),
38 : capture_count_(0),
39 : has_more_(true),
40 : simple_(false),
41 : contains_anchor_(false),
42 : is_scanned_for_captures_(false),
43 : has_named_captures_(false),
44 380740 : failed_(false) {
45 380740 : Advance();
46 0 : }
47 :
48 : template <bool update_position>
49 777820408 : inline uc32 RegExpParser::ReadNext() {
50 388910204 : int position = next_pos_;
51 : uc32 c0 = in()->Get(position);
52 388910204 : position++;
53 : // Read the whole surrogate pair in case of unicode flag, if possible.
54 389608447 : if (unicode() && position < in()->length() &&
55 : unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
56 : uc16 c1 = in()->Get(position);
57 586 : if (unibrow::Utf16::IsTrailSurrogate(c1)) {
58 389 : c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
59 371 : position++;
60 : }
61 : }
62 388737044 : if (update_position) next_pos_ = position;
63 388910204 : return c0;
64 : }
65 :
66 :
67 0 : uc32 RegExpParser::Next() {
68 173184 : if (has_next()) {
69 173160 : return ReadNext<false>();
70 : } else {
71 : return kEndMarker;
72 : }
73 : }
74 :
75 1166592010 : void RegExpParser::Advance() {
76 389117516 : if (has_next()) {
77 : StackLimitCheck check(isolate());
78 388737441 : if (check.HasOverflowed()) {
79 388 : if (FLAG_abort_on_stack_or_string_length_overflow) {
80 0 : FATAL("Aborting on stack overflow");
81 : }
82 : ReportError(CStrVector(
83 776 : MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
84 388737053 : } else if (zone()->excess_allocation()) {
85 9 : ReportError(CStrVector("Regular expression too large"));
86 : } else {
87 388737044 : current_ = ReadNext<true>();
88 : }
89 : } else {
90 380075 : current_ = kEndMarker;
91 : // Advance so that position() points to 1-after-the-last-character. This is
92 : // important so that Reset() to this position works correctly.
93 380075 : next_pos_ = in()->length() + 1;
94 380075 : has_more_ = false;
95 : }
96 389117516 : }
97 :
98 :
99 2639 : void RegExpParser::Reset(int pos) {
100 2639 : next_pos_ = pos;
101 2639 : has_more_ = (pos < in()->length());
102 2639 : Advance();
103 0 : }
104 :
105 0 : void RegExpParser::Advance(int dist) {
106 46501 : next_pos_ += dist - 1;
107 46501 : Advance();
108 0 : }
109 :
110 :
111 206841 : bool RegExpParser::simple() { return simple_; }
112 :
113 0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
114 177 : switch (c) {
115 : case '^':
116 : case '$':
117 : case '\\':
118 : case '.':
119 : case '*':
120 : case '+':
121 : case '?':
122 : case '(':
123 : case ')':
124 : case '[':
125 : case ']':
126 : case '{':
127 : case '}':
128 : case '|':
129 : case '/':
130 : return true;
131 : default:
132 : break;
133 : }
134 0 : return false;
135 : }
136 :
137 :
138 8943 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
139 2981 : if (failed_) return nullptr; // Do not overwrite any existing error.
140 2981 : failed_ = true;
141 : *error_ = isolate()
142 : ->factory()
143 : ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
144 8943 : .ToHandleChecked();
145 : // Zip to the end to make sure the no more input is read.
146 2981 : current_ = kEndMarker;
147 2981 : next_pos_ = in()->length();
148 2981 : return nullptr;
149 : }
150 :
151 : #define CHECK_FAILED /**/); \
152 : if (failed_) return nullptr; \
153 : ((void)0
154 :
155 : // Pattern ::
156 : // Disjunction
157 587581 : RegExpTree* RegExpParser::ParsePattern() {
158 380740 : RegExpTree* result = ParseDisjunction(CHECK_FAILED);
159 377859 : PatchNamedBackReferences(CHECK_FAILED);
160 : DCHECK(!has_more());
161 : // If the result of parsing is a literal string atom, and it has the
162 : // same length as the input, then the atom is identical to the input.
163 584600 : if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
164 199836 : simple_ = true;
165 : }
166 377759 : return result;
167 : }
168 :
169 :
170 : // Disjunction ::
171 : // Alternative
172 : // Alternative | Disjunction
173 : // Alternative ::
174 : // [empty]
175 : // Term Alternative
176 : // Term ::
177 : // Assertion
178 : // Atom
179 : // Atom Quantifier
180 1157354042 : RegExpTree* RegExpParser::ParseDisjunction() {
181 : // Used to store current state while parsing subexpressions.
182 : RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
183 380740 : 0, nullptr, top_level_flags_, zone());
184 5887000 : RegExpParserState* state = &initial_state;
185 : // Cache the builder in a local variable for quick access.
186 380740 : RegExpBuilder* builder = initial_state.builder();
187 : while (true) {
188 386275177 : switch (current()) {
189 : case kEndMarker:
190 378186 : if (state->IsSubexpression()) {
191 : // Inside a parenthesized group when hitting end of input.
192 380803 : return ReportError(CStrVector("Unterminated group"));
193 : }
194 : DCHECK_EQ(INITIAL, state->group_type());
195 : // Parsing completed successfully.
196 378123 : return builder->ToRegExp();
197 : case ')': {
198 1105546 : if (!state->IsSubexpression()) {
199 14 : return ReportError(CStrVector("Unmatched ')'"));
200 : }
201 : DCHECK_NE(INITIAL, state->group_type());
202 :
203 1105532 : Advance();
204 : // End disjunction parsing and convert builder content to new single
205 : // regexp atom.
206 1105532 : RegExpTree* body = builder->ToRegExp();
207 :
208 : int end_capture_index = captures_started();
209 :
210 : int capture_index = state->capture_index();
211 : SubexpressionType group_type = state->group_type();
212 :
213 : // Build result of subexpression.
214 1105532 : if (group_type == CAPTURE) {
215 1081900 : if (state->IsNamedCapture()) {
216 : CreateNamedCaptureAtIndex(state->capture_name(),
217 1257 : capture_index CHECK_FAILED);
218 : }
219 1081859 : RegExpCapture* capture = GetCapture(capture_index);
220 : capture->set_body(body);
221 : body = capture;
222 23632 : } else if (group_type == GROUPING) {
223 : body = new (zone()) RegExpGroup(body);
224 : } else {
225 : DCHECK(group_type == POSITIVE_LOOKAROUND ||
226 : group_type == NEGATIVE_LOOKAROUND);
227 3611 : bool is_positive = (group_type == POSITIVE_LOOKAROUND);
228 : body = new (zone()) RegExpLookaround(
229 : body, is_positive, end_capture_index - capture_index,
230 3611 : capture_index, state->lookaround_type());
231 : }
232 :
233 : // Restore previous state.
234 : state = state->previous_state();
235 : builder = state->builder();
236 :
237 1105491 : builder->AddAtom(body);
238 : // For compatibility with JSC and ES3, we allow quantifiers after
239 : // lookaheads, and break in all cases.
240 1105491 : break;
241 : }
242 : case '|': {
243 117380 : Advance();
244 : builder->NewAlternative();
245 384442511 : continue;
246 : }
247 : case '*':
248 : case '+':
249 : case '?':
250 95 : return ReportError(CStrVector("Nothing to repeat"));
251 : case '^': {
252 6467 : Advance();
253 6467 : if (builder->multiline()) {
254 : builder->AddAssertion(new (zone()) RegExpAssertion(
255 263 : RegExpAssertion::START_OF_LINE, builder->flags()));
256 : } else {
257 : builder->AddAssertion(new (zone()) RegExpAssertion(
258 6204 : RegExpAssertion::START_OF_INPUT, builder->flags()));
259 : set_contains_anchor();
260 : }
261 : continue;
262 : }
263 : case '$': {
264 4117 : Advance();
265 : RegExpAssertion::AssertionType assertion_type =
266 : builder->multiline() ? RegExpAssertion::END_OF_LINE
267 4117 : : RegExpAssertion::END_OF_INPUT;
268 : builder->AddAssertion(
269 4117 : new (zone()) RegExpAssertion(assertion_type, builder->flags()));
270 4117 : continue;
271 : }
272 : case '.': {
273 18500 : Advance();
274 : ZoneList<CharacterRange>* ranges =
275 18500 : new (zone()) ZoneList<CharacterRange>(2, zone());
276 :
277 18500 : if (builder->dotall()) {
278 : // Everything.
279 210 : CharacterRange::AddClassEscape('*', ranges, false, zone());
280 : } else {
281 : // Everything except \x0A, \x0D, \u2028 and \u2029
282 18290 : CharacterRange::AddClassEscape('.', ranges, false, zone());
283 : }
284 :
285 : RegExpCharacterClass* cc =
286 37000 : new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
287 18500 : builder->AddCharacterClass(cc);
288 18500 : break;
289 : }
290 : case '(': {
291 1107247 : state = ParseOpenParenthesis(state CHECK_FAILED);
292 : builder = state->builder();
293 1106734 : continue;
294 : }
295 : case '[': {
296 155307 : RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
297 154923 : builder->AddCharacterClass(cc->AsCharacterClass());
298 154923 : break;
299 : }
300 : // Atom ::
301 : // \ AtomEscape
302 : case '\\':
303 31247 : switch (Next()) {
304 : case kEndMarker:
305 5 : return ReportError(CStrVector("\\ at end of pattern"));
306 : case 'b':
307 : Advance(2);
308 : builder->AddAssertion(new (zone()) RegExpAssertion(
309 431 : RegExpAssertion::BOUNDARY, builder->flags()));
310 431 : continue;
311 : case 'B':
312 : Advance(2);
313 : builder->AddAssertion(new (zone()) RegExpAssertion(
314 327 : RegExpAssertion::NON_BOUNDARY, builder->flags()));
315 327 : continue;
316 : // AtomEscape ::
317 : // CharacterClassEscape
318 : //
319 : // CharacterClassEscape :: one of
320 : // d D s S w W
321 : case 'd':
322 : case 'D':
323 : case 's':
324 : case 'S':
325 : case 'w':
326 : case 'W': {
327 : uc32 c = Next();
328 : Advance(2);
329 : ZoneList<CharacterRange>* ranges =
330 6797 : new (zone()) ZoneList<CharacterRange>(2, zone());
331 : CharacterRange::AddClassEscape(
332 7250 : c, ranges, unicode() && builder->ignore_case(), zone());
333 : RegExpCharacterClass* cc = new (zone())
334 13594 : RegExpCharacterClass(zone(), ranges, builder->flags());
335 6797 : builder->AddCharacterClass(cc);
336 6797 : break;
337 : }
338 : case 'p':
339 : case 'P': {
340 : uc32 p = Next();
341 : Advance(2);
342 4187 : if (unicode()) {
343 : ZoneList<CharacterRange>* ranges =
344 3989 : new (zone()) ZoneList<CharacterRange>(2, zone());
345 : std::vector<char> name_1, name_2;
346 3989 : if (ParsePropertyClassName(&name_1, &name_2)) {
347 3863 : if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
348 : RegExpCharacterClass* cc = new (zone())
349 6040 : RegExpCharacterClass(zone(), ranges, builder->flags());
350 3020 : builder->AddCharacterClass(cc);
351 3020 : break;
352 : }
353 1542 : if (p == 'p' && name_2.empty()) {
354 510 : RegExpTree* sequence = GetPropertySequence(name_1);
355 510 : if (sequence != nullptr) {
356 150 : builder->AddAtom(sequence);
357 150 : break;
358 : }
359 : }
360 : }
361 819 : return ReportError(CStrVector("Invalid property name"));
362 : } else {
363 198 : builder->AddCharacter(p);
364 : }
365 198 : break;
366 : }
367 : case '1':
368 : case '2':
369 : case '3':
370 : case '4':
371 : case '5':
372 : case '6':
373 : case '7':
374 : case '8':
375 : case '9': {
376 5111 : int index = 0;
377 5183 : bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
378 5111 : if (is_backref) {
379 9266 : if (state->IsInsideCaptureGroup(index)) {
380 : // The back reference is inside the capture group it refers to.
381 : // Nothing can possibly have been captured yet, so we use empty
382 : // instead. This ensures that, when checking a back reference,
383 : // the capture registers of the referenced capture are either
384 : // both set or both cleared.
385 : builder->AddEmpty();
386 : } else {
387 4540 : RegExpCapture* capture = GetCapture(index);
388 : RegExpTree* atom =
389 : new (zone()) RegExpBackReference(capture, builder->flags());
390 4540 : builder->AddAtom(atom);
391 : }
392 4670 : break;
393 : }
394 : // With /u, no identity escapes except for syntax characters
395 : // are allowed. Otherwise, all identity escapes are allowed.
396 478 : if (unicode()) {
397 72 : return ReportError(CStrVector("Invalid escape"));
398 : }
399 : uc32 first_digit = Next();
400 406 : if (first_digit == '8' || first_digit == '9') {
401 37 : builder->AddCharacter(first_digit);
402 : Advance(2);
403 : break;
404 : }
405 : V8_FALLTHROUGH;
406 : }
407 : case '0': {
408 3241 : Advance();
409 3279 : if (unicode() && Next() >= '0' && Next() <= '9') {
410 : // With /u, decimal escape with leading 0 are not parsed as octal.
411 19 : return ReportError(CStrVector("Invalid decimal escape"));
412 : }
413 3222 : uc32 octal = ParseOctalLiteral();
414 3222 : builder->AddCharacter(octal);
415 3222 : break;
416 : }
417 : // ControlEscape :: one of
418 : // f n r t v
419 : case 'f':
420 : Advance(2);
421 6 : builder->AddCharacter('\f');
422 6 : break;
423 : case 'n':
424 : Advance(2);
425 711 : builder->AddCharacter('\n');
426 711 : break;
427 : case 'r':
428 : Advance(2);
429 73 : builder->AddCharacter('\r');
430 73 : break;
431 : case 't':
432 : Advance(2);
433 8 : builder->AddCharacter('\t');
434 8 : break;
435 : case 'v':
436 : Advance(2);
437 28 : builder->AddCharacter('\v');
438 28 : break;
439 : case 'c': {
440 149 : Advance();
441 : uc32 controlLetter = Next();
442 : // Special case if it is an ASCII letter.
443 : // Convert lower case letters to uppercase.
444 149 : uc32 letter = controlLetter & ~('a' ^ 'A');
445 149 : if (letter < 'A' || 'Z' < letter) {
446 : // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
447 : // Read the backslash as a literal character instead of as
448 : // starting an escape.
449 : // ES#prod-annexB-ExtendedPatternCharacter
450 89 : if (unicode()) {
451 : // With /u, invalid escapes are not treated as identity escapes.
452 19 : return ReportError(CStrVector("Invalid unicode escape"));
453 : }
454 70 : builder->AddCharacter('\\');
455 : } else {
456 : Advance(2);
457 60 : builder->AddCharacter(controlLetter & 0x1F);
458 : }
459 : break;
460 : }
461 : case 'x': {
462 : Advance(2);
463 : uc32 value;
464 246 : if (ParseHexEscape(2, &value)) {
465 147 : builder->AddCharacter(value);
466 99 : } else if (!unicode()) {
467 63 : builder->AddCharacter('x');
468 : } else {
469 : // With /u, invalid escapes are not treated as identity escapes.
470 36 : return ReportError(CStrVector("Invalid escape"));
471 : }
472 210 : break;
473 : }
474 : case 'u': {
475 : Advance(2);
476 : uc32 value;
477 1997 : if (ParseUnicodeEscape(&value)) {
478 1809 : builder->AddEscapedUnicodeCharacter(value);
479 188 : } else if (!unicode()) {
480 116 : builder->AddCharacter('u');
481 : } else {
482 : // With /u, invalid escapes are not treated as identity escapes.
483 72 : return ReportError(CStrVector("Invalid Unicode escape"));
484 : }
485 1925 : break;
486 : }
487 : case 'k':
488 : // Either an identity escape or a named back-reference. The two
489 : // interpretations are mutually exclusive: '\k' is interpreted as
490 : // an identity escape for non-Unicode patterns without named
491 : // capture groups, and as the beginning of a named back-reference
492 : // in all other cases.
493 672 : if (unicode() || HasNamedCaptures()) {
494 : Advance(2);
495 396 : ParseNamedBackReference(builder, state CHECK_FAILED);
496 : break;
497 : }
498 : V8_FALLTHROUGH;
499 : default:
500 7903 : Advance();
501 : // With /u, no identity escapes except for syntax characters
502 : // are allowed. Otherwise, all identity escapes are allowed.
503 7940 : if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
504 7876 : builder->AddCharacter(current());
505 7876 : Advance();
506 : } else {
507 27 : return ReportError(CStrVector("Invalid escape"));
508 : }
509 7876 : break;
510 : }
511 : break;
512 : case '{': {
513 : int dummy;
514 509 : bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
515 444 : if (parsed) return ReportError(CStrVector("Nothing to repeat"));
516 : V8_FALLTHROUGH;
517 : }
518 : case '}':
519 : case ']':
520 903 : if (unicode()) {
521 28 : return ReportError(CStrVector("Lone quantifier brackets"));
522 : }
523 : V8_FALLTHROUGH;
524 : default:
525 383351012 : builder->AddUnicodeCharacter(current());
526 383351012 : Advance();
527 383351012 : break;
528 : } // end switch(current())
529 :
530 : int min;
531 : int max;
532 384659255 : switch (current()) {
533 : // QuantifierPrefix ::
534 : // *
535 : // +
536 : // ?
537 : // {
538 : case '*':
539 19551 : min = 0;
540 19551 : max = RegExpTree::kInfinity;
541 19551 : Advance();
542 19551 : break;
543 : case '+':
544 8002 : min = 1;
545 8002 : max = RegExpTree::kInfinity;
546 8002 : Advance();
547 8002 : break;
548 : case '?':
549 1421675 : min = 0;
550 1421675 : max = 1;
551 1421675 : Advance();
552 1421675 : break;
553 : case '{':
554 3280 : if (ParseIntervalQuantifier(&min, &max)) {
555 2773 : if (max < min) {
556 : return ReportError(
557 2 : CStrVector("numbers out of order in {} quantifier"));
558 : }
559 : break;
560 507 : } else if (unicode()) {
561 : // With /u, incomplete quantifiers are not allowed.
562 199 : return ReportError(CStrVector("Incomplete quantifier"));
563 : }
564 : continue;
565 : default:
566 : continue;
567 : }
568 : RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
569 1451999 : if (current() == '?') {
570 : quantifier_type = RegExpQuantifier::NON_GREEDY;
571 1745 : Advance();
572 : } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
573 : // FLAG_regexp_possessive_quantifier is a debug-only flag.
574 : quantifier_type = RegExpQuantifier::POSSESSIVE;
575 : Advance();
576 : }
577 1451999 : if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
578 73 : return ReportError(CStrVector("Invalid quantifier"));
579 : }
580 : }
581 : }
582 :
583 1107247 : RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
584 4431298 : RegExpParserState* state) {
585 : RegExpLookaround::Type lookaround_type = state->lookaround_type();
586 : bool is_named_capture = false;
587 : JSRegExp::Flags switch_on = JSRegExp::kNone;
588 : JSRegExp::Flags switch_off = JSRegExp::kNone;
589 : const ZoneVector<uc16>* capture_name = nullptr;
590 : SubexpressionType subexpr_type = CAPTURE;
591 1107247 : Advance();
592 1107247 : if (current() == '?') {
593 26422 : switch (Next()) {
594 : case ':':
595 : Advance(2);
596 : subexpr_type = GROUPING;
597 19791 : break;
598 : case '=':
599 : Advance(2);
600 : lookaround_type = RegExpLookaround::LOOKAHEAD;
601 : subexpr_type = POSITIVE_LOOKAROUND;
602 1709 : break;
603 : case '!':
604 : Advance(2);
605 : lookaround_type = RegExpLookaround::LOOKAHEAD;
606 : subexpr_type = NEGATIVE_LOOKAROUND;
607 457 : break;
608 : case '-':
609 : case 'i':
610 : case 's':
611 : case 'm': {
612 1373 : if (!FLAG_regexp_mode_modifiers) {
613 4 : ReportError(CStrVector("Invalid group"));
614 4 : return nullptr;
615 : }
616 1369 : Advance();
617 : bool flags_sense = true; // Switching on flags.
618 4753 : while (subexpr_type != GROUPING) {
619 3126 : switch (current()) {
620 : case '-':
621 344 : if (!flags_sense) {
622 18 : ReportError(CStrVector("Multiple dashes in flag group"));
623 18 : return nullptr;
624 : }
625 : flags_sense = false;
626 326 : Advance();
627 326 : continue;
628 : case 's':
629 : case 'i':
630 : case 'm': {
631 : JSRegExp::Flags bit = JSRegExp::kUnicode;
632 1449 : if (current() == 'i') bit = JSRegExp::kIgnoreCase;
633 1449 : if (current() == 'm') bit = JSRegExp::kMultiline;
634 1449 : if (current() == 's') bit = JSRegExp::kDotAll;
635 1449 : if (((switch_on | switch_off) & bit) != 0) {
636 18 : ReportError(CStrVector("Repeated flag in flag group"));
637 : return nullptr;
638 : }
639 1431 : if (flags_sense) {
640 : switch_on |= bit;
641 : } else {
642 : switch_off |= bit;
643 : }
644 1431 : Advance();
645 : continue;
646 : }
647 : case ')': {
648 1075 : Advance();
649 : state->builder()
650 1075 : ->FlushText(); // Flush pending text using old flags.
651 : // These (?i)-style flag switches don't put us in a subexpression
652 : // at all, they just modify the flags in the rest of the current
653 : // subexpression.
654 : JSRegExp::Flags flags =
655 : (state->builder()->flags() | switch_on) & ~switch_off;
656 : state->builder()->set_flags(flags);
657 : return state;
658 : }
659 : case ':':
660 258 : Advance();
661 : subexpr_type = GROUPING; // Will break us out of the outer loop.
662 258 : continue;
663 : default:
664 0 : ReportError(CStrVector("Invalid flag group"));
665 0 : return nullptr;
666 : }
667 : }
668 : break;
669 : }
670 : case '<':
671 3073 : Advance();
672 3073 : if (Next() == '=') {
673 : Advance(2);
674 : lookaround_type = RegExpLookaround::LOOKBEHIND;
675 : subexpr_type = POSITIVE_LOOKAROUND;
676 1218 : break;
677 1855 : } else if (Next() == '!') {
678 : Advance(2);
679 : lookaround_type = RegExpLookaround::LOOKBEHIND;
680 : subexpr_type = NEGATIVE_LOOKAROUND;
681 245 : break;
682 : }
683 : is_named_capture = true;
684 1610 : has_named_captures_ = true;
685 1610 : Advance();
686 1610 : break;
687 : default:
688 19 : ReportError(CStrVector("Invalid group"));
689 19 : return nullptr;
690 : }
691 : }
692 1106113 : if (subexpr_type == CAPTURE) {
693 1082435 : if (captures_started_ >= kMaxCaptures) {
694 5 : ReportError(CStrVector("Too many captures"));
695 5 : return nullptr;
696 : }
697 1082430 : captures_started_++;
698 :
699 1082430 : if (is_named_capture) {
700 1610 : capture_name = ParseCaptureGroupName(CHECK_FAILED);
701 : }
702 : }
703 : JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
704 : // Store current state and begin new disjunction parsing.
705 : return new (zone())
706 : RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
707 2211528 : capture_name, flags, zone());
708 : }
709 :
710 : #ifdef DEBUG
711 : // Currently only used in an DCHECK.
712 : static bool IsSpecialClassEscape(uc32 c) {
713 : switch (c) {
714 : case 'd':
715 : case 'D':
716 : case 's':
717 : case 'S':
718 : case 'w':
719 : case 'W':
720 : return true;
721 : default:
722 : return false;
723 : }
724 : }
725 : #endif
726 :
727 :
728 : // In order to know whether an escape is a backreference or not we have to scan
729 : // the entire regexp and find the number of capturing parentheses. However we
730 : // don't want to scan the regexp twice unless it is necessary. This mini-parser
731 : // is called when needed. It can see the difference between capturing and
732 : // noncapturing parentheses and can skip character classes and backslash-escaped
733 : // characters.
734 8158 : void RegExpParser::ScanForCaptures() {
735 : DCHECK(!is_scanned_for_captures_);
736 : const int saved_position = position();
737 : // Start with captures started previous to current position
738 : int capture_count = captures_started();
739 : // Add count of captures after this position.
740 : int n;
741 6327 : while ((n = current()) != kEndMarker) {
742 4749 : Advance();
743 4749 : switch (n) {
744 : case '\\':
745 605 : Advance();
746 605 : break;
747 : case '[': {
748 : int c;
749 165 : while ((c = current()) != kEndMarker) {
750 156 : Advance();
751 156 : if (c == '\\') {
752 0 : Advance();
753 : } else {
754 156 : if (c == ']') break;
755 : }
756 : }
757 : break;
758 : }
759 : case '(':
760 611 : if (current() == '?') {
761 : // At this point we could be in
762 : // * a non-capturing group '(:',
763 : // * a lookbehind assertion '(?<=' '(?<!'
764 : // * or a named capture '(?<'.
765 : //
766 : // Of these, only named captures are capturing groups.
767 :
768 145 : Advance();
769 145 : if (current() != '<') break;
770 :
771 121 : Advance();
772 121 : if (current() == '=' || current() == '!') break;
773 :
774 : // Found a possible named capture. It could turn out to be a syntax
775 : // error (e.g. an unterminated or invalid name), but that distinction
776 : // does not matter for our purposes.
777 75 : has_named_captures_ = true;
778 : }
779 541 : capture_count++;
780 541 : break;
781 : }
782 : }
783 789 : capture_count_ = capture_count;
784 789 : is_scanned_for_captures_ = true;
785 : Reset(saved_position);
786 789 : }
787 :
788 :
789 15930 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
790 : DCHECK_EQ('\\', current());
791 : DCHECK('1' <= Next() && Next() <= '9');
792 : // Try to parse a decimal literal that is no greater than the total number
793 : // of left capturing parentheses in the input.
794 : int start = position();
795 5111 : int value = Next() - '0';
796 : Advance(2);
797 : while (true) {
798 : uc32 c = current();
799 5728 : if (IsDecimalDigit(c)) {
800 637 : value = 10 * value + (c - '0');
801 637 : if (value > kMaxCaptures) {
802 : Reset(start);
803 20 : return false;
804 : }
805 617 : Advance();
806 : } else {
807 : break;
808 : }
809 : }
810 5091 : if (value > captures_started()) {
811 882 : if (!is_scanned_for_captures_) ScanForCaptures();
812 882 : if (value > capture_count_) {
813 : Reset(start);
814 458 : return false;
815 : }
816 : }
817 4633 : *index_out = value;
818 5250 : return true;
819 : }
820 :
821 2425 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
822 2425 : if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
823 4770 : v->push_back(code_unit);
824 : } else {
825 80 : v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
826 80 : v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
827 : }
828 2425 : }
829 :
830 6603 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
831 : ZoneVector<uc16>* name =
832 1965 : new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
833 :
834 : bool at_start = true;
835 : while (true) {
836 4390 : uc32 c = current();
837 4390 : Advance();
838 :
839 : // Convert unicode escapes.
840 4638 : if (c == '\\' && current() == 'u') {
841 230 : Advance();
842 230 : if (!ParseUnicodeEscape(&c)) {
843 59 : ReportError(CStrVector("Invalid Unicode escape sequence"));
844 59 : return nullptr;
845 : }
846 : }
847 :
848 : // The backslash char is misclassified as both ID_Start and ID_Continue.
849 4331 : if (c == '\\') {
850 18 : ReportError(CStrVector("Invalid capture group name"));
851 18 : return nullptr;
852 : }
853 :
854 4313 : if (at_start) {
855 1933 : if (!IsIdentifierStart(c)) {
856 100 : ReportError(CStrVector("Invalid capture group name"));
857 100 : return nullptr;
858 : }
859 1833 : push_code_unit(name, c);
860 : at_start = false;
861 : } else {
862 2380 : if (c == '>') {
863 : break;
864 809 : } else if (IsIdentifierPart(c)) {
865 592 : push_code_unit(name, c);
866 : } else {
867 217 : ReportError(CStrVector("Invalid capture group name"));
868 217 : return nullptr;
869 : }
870 : }
871 : }
872 :
873 3996 : return name;
874 : }
875 :
876 1257 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
877 2149 : int index) {
878 : DCHECK(0 < index && index <= captures_started_);
879 : DCHECK_NOT_NULL(name);
880 :
881 1257 : if (named_captures_ == nullptr) {
882 933 : named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
883 : } else {
884 : // Check for duplicates and bail if we find any.
885 : // TODO(jgruber): O(n^2).
886 996 : for (const auto& named_capture : *named_captures_) {
887 389 : if (*named_capture->name() == *name) {
888 41 : ReportError(CStrVector("Duplicate capture group name"));
889 41 : return false;
890 : }
891 : }
892 : }
893 :
894 1216 : RegExpCapture* capture = GetCapture(index);
895 : DCHECK_NULL(capture->name());
896 :
897 : capture->set_name(name);
898 1216 : named_captures_->Add(capture, zone());
899 :
900 1216 : return true;
901 : }
902 :
903 396 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
904 1186 : RegExpParserState* state) {
905 : // The parser is assumed to be on the '<' in \k<name>.
906 396 : if (current() != '<') {
907 41 : ReportError(CStrVector("Invalid named reference"));
908 41 : return false;
909 : }
910 :
911 355 : Advance();
912 355 : const ZoneVector<uc16>* name = ParseCaptureGroupName();
913 355 : if (name == nullptr) {
914 : return false;
915 : }
916 :
917 305 : if (state->IsInsideCaptureGroup(name)) {
918 : builder->AddEmpty();
919 : } else {
920 : RegExpBackReference* atom =
921 285 : new (zone()) RegExpBackReference(builder->flags());
922 : atom->set_name(name);
923 :
924 285 : builder->AddAtom(atom);
925 :
926 285 : if (named_back_references_ == nullptr) {
927 : named_back_references_ =
928 220 : new (zone()) ZoneList<RegExpBackReference*>(1, zone());
929 : }
930 285 : named_back_references_->Add(atom, zone());
931 : }
932 :
933 : return true;
934 : }
935 :
936 377859 : void RegExpParser::PatchNamedBackReferences() {
937 378250 : if (named_back_references_ == nullptr) return;
938 :
939 220 : if (named_captures_ == nullptr) {
940 14 : ReportError(CStrVector("Invalid named capture referenced"));
941 14 : return;
942 : }
943 :
944 : // Look up and patch the actual capture for each named back reference.
945 : // TODO(jgruber): O(n^2), optimize if necessary.
946 :
947 576 : for (int i = 0; i < named_back_references_->length(); i++) {
948 612 : RegExpBackReference* ref = named_back_references_->at(i);
949 :
950 : int index = -1;
951 698 : for (const auto& capture : *named_captures_) {
952 341 : if (*capture->name() == *ref->name()) {
953 : index = capture->index();
954 185 : break;
955 : }
956 : }
957 :
958 271 : if (index == -1) {
959 86 : ReportError(CStrVector("Invalid named capture referenced"));
960 86 : return;
961 : }
962 :
963 185 : ref->set_capture(GetCapture(index));
964 : }
965 : }
966 :
967 2183327 : RegExpCapture* RegExpParser::GetCapture(int index) {
968 : // The index for the capture groups are one-based. Its index in the list is
969 : // zero-based.
970 : int know_captures =
971 1087800 : is_scanned_for_captures_ ? capture_count_ : captures_started_;
972 : DCHECK(index <= know_captures);
973 3257468 : if (captures_ == nullptr) {
974 13659 : captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
975 : }
976 4339336 : while (captures_->length() < know_captures) {
977 2163736 : captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
978 : }
979 2175600 : return captures_->at(index - 1);
980 : }
981 :
982 378529 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
983 380334 : if (named_captures_ == nullptr || named_captures_->is_empty())
984 376989 : return Handle<FixedArray>();
985 :
986 : Factory* factory = isolate()->factory();
987 :
988 770 : int len = named_captures_->length() * 2;
989 770 : Handle<FixedArray> array = factory->NewFixedArray(len);
990 :
991 3610 : for (int i = 0; i < named_captures_->length(); i++) {
992 2070 : RegExpCapture* capture = named_captures_->at(i);
993 1035 : MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
994 2070 : array->set(i * 2, *name.ToHandleChecked());
995 : array->set(i * 2 + 1, Smi::FromInt(capture->index()));
996 : }
997 :
998 770 : return array;
999 : }
1000 :
1001 0 : bool RegExpParser::HasNamedCaptures() {
1002 223 : if (has_named_captures_ || is_scanned_for_captures_) {
1003 : return has_named_captures_;
1004 : }
1005 :
1006 118 : ScanForCaptures();
1007 : DCHECK(is_scanned_for_captures_);
1008 118 : return has_named_captures_;
1009 : }
1010 :
1011 0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1012 10952 : for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1013 5375 : if (s->group_type() != CAPTURE) continue;
1014 : // Return true if we found the matching capture index.
1015 610 : if (index == s->capture_index()) return true;
1016 : // Abort if index is larger than what has been parsed up till this state.
1017 517 : if (index > s->capture_index()) return false;
1018 : }
1019 : return false;
1020 : }
1021 :
1022 305 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1023 : const ZoneVector<uc16>* name) {
1024 : DCHECK_NOT_NULL(name);
1025 600 : for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1026 315 : if (s->capture_name() == nullptr) continue;
1027 30 : if (*s->capture_name() == *name) return true;
1028 : }
1029 : return false;
1030 : }
1031 :
1032 : // QuantifierPrefix ::
1033 : // { DecimalDigits }
1034 : // { DecimalDigits , }
1035 : // { DecimalDigits , DecimalDigits }
1036 : //
1037 : // Returns true if parsing succeeds, and set the min_out and max_out
1038 : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1039 25771 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1040 : DCHECK_EQ(current(), '{');
1041 : int start = position();
1042 3744 : Advance();
1043 : int min = 0;
1044 3744 : if (!IsDecimalDigit(current())) {
1045 : Reset(start);
1046 834 : return false;
1047 : }
1048 7217 : while (IsDecimalDigit(current())) {
1049 4427 : int next = current() - '0';
1050 4427 : if (min > (RegExpTree::kInfinity - next) / 10) {
1051 : // Overflow. Skip past remaining decimal digits and return -1.
1052 810 : do {
1053 810 : Advance();
1054 : } while (IsDecimalDigit(current()));
1055 : min = RegExpTree::kInfinity;
1056 : break;
1057 : }
1058 4307 : min = 10 * min + next;
1059 4307 : Advance();
1060 : }
1061 : int max = 0;
1062 2910 : if (current() == '}') {
1063 : max = min;
1064 1172 : Advance();
1065 1738 : } else if (current() == ',') {
1066 1700 : Advance();
1067 1700 : if (current() == '}') {
1068 : max = RegExpTree::kInfinity;
1069 329 : Advance();
1070 : } else {
1071 3525 : while (IsDecimalDigit(current())) {
1072 2214 : int next = current() - '0';
1073 2214 : if (max > (RegExpTree::kInfinity - next) / 10) {
1074 750 : do {
1075 750 : Advance();
1076 : } while (IsDecimalDigit(current()));
1077 : max = RegExpTree::kInfinity;
1078 : break;
1079 : }
1080 2154 : max = 10 * max + next;
1081 2154 : Advance();
1082 : }
1083 1371 : if (current() != '}') {
1084 : Reset(start);
1085 74 : return false;
1086 : }
1087 1297 : Advance();
1088 : }
1089 : } else {
1090 : Reset(start);
1091 38 : return false;
1092 : }
1093 2798 : *min_out = min;
1094 2798 : *max_out = max;
1095 2798 : return true;
1096 : }
1097 :
1098 :
1099 11214 : uc32 RegExpParser::ParseOctalLiteral() {
1100 : DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1101 : // For compatibility with some other browsers (not all), we parse
1102 : // up to three octal digits with a value below 256.
1103 : // ES#prod-annexB-LegacyOctalEscapeSequence
1104 4284 : uc32 value = current() - '0';
1105 4284 : Advance();
1106 4284 : if ('0' <= current() && current() <= '7') {
1107 1388 : value = value * 8 + current() - '0';
1108 1388 : Advance();
1109 2744 : if (value < 32 && '0' <= current() && current() <= '7') {
1110 2580 : value = value * 8 + current() - '0';
1111 1290 : Advance();
1112 : }
1113 : }
1114 4284 : return value;
1115 : }
1116 :
1117 :
1118 389704 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1119 : int start = position();
1120 : uc32 val = 0;
1121 389330 : for (int i = 0; i < length; ++i) {
1122 : uc32 c = current();
1123 : int d = HexValue(c);
1124 310675 : if (d < 0) {
1125 : Reset(start);
1126 374 : return false;
1127 : }
1128 310301 : val = val * 16 + d;
1129 310301 : Advance();
1130 : }
1131 78655 : *value = val;
1132 78655 : return true;
1133 : }
1134 :
1135 : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1136 99386 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1137 : // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1138 : // allowed). In the latter case, the number of hex digits between { } is
1139 : // arbitrary. \ and u have already been read.
1140 91439 : if (current() == '{' && unicode()) {
1141 : int start = position();
1142 7458 : Advance();
1143 7458 : if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1144 7431 : if (current() == '}') {
1145 7426 : Advance();
1146 7426 : return true;
1147 : }
1148 : }
1149 : Reset(start);
1150 32 : return false;
1151 : }
1152 : // \u but no {, or \u{...} escapes not allowed.
1153 76423 : bool result = ParseHexEscape(4, value);
1154 226996 : if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1155 : current() == '\\') {
1156 : // Attempt to read trail surrogate.
1157 : int start = position();
1158 205 : if (Next() == 'u') {
1159 : Advance(2);
1160 : uc32 trail;
1161 380 : if (ParseHexEscape(4, &trail) &&
1162 185 : unibrow::Utf16::IsTrailSurrogate(trail)) {
1163 : *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1164 370 : static_cast<uc16>(trail));
1165 185 : return true;
1166 : }
1167 : }
1168 : Reset(start);
1169 : }
1170 76238 : return result;
1171 : }
1172 :
1173 : #ifdef V8_INTL_SUPPORT
1174 :
1175 : namespace {
1176 :
1177 2708 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1178 2708 : const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1179 2708 : if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1180 : return true;
1181 : for (int i = 0;; i++) {
1182 : const char* long_name = u_getPropertyName(
1183 806 : property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1184 806 : if (long_name == nullptr) break;
1185 743 : if (strcmp(property_name, long_name) == 0) return true;
1186 : }
1187 : return false;
1188 : }
1189 :
1190 3153 : bool IsExactPropertyValueAlias(const char* property_value_name,
1191 : UProperty property, int32_t property_value) {
1192 : const char* short_name =
1193 3153 : u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1194 3153 : if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1195 : return true;
1196 : }
1197 : for (int i = 0;; i++) {
1198 : const char* long_name = u_getPropertyValueName(
1199 : property, property_value,
1200 310 : static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1201 310 : if (long_name == nullptr) break;
1202 310 : if (strcmp(property_value_name, long_name) == 0) return true;
1203 : }
1204 : return false;
1205 : }
1206 :
1207 5216 : bool LookupPropertyValueName(UProperty property,
1208 : const char* property_value_name, bool negate,
1209 : ZoneList<CharacterRange>* result, Zone* zone) {
1210 : UProperty property_for_lookup = property;
1211 5216 : if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1212 : // For the property Script_Extensions, we have to do the property value
1213 : // name lookup as if the property is Script.
1214 : property_for_lookup = UCHAR_SCRIPT;
1215 : }
1216 : int32_t property_value =
1217 5216 : u_getPropertyValueEnum(property_for_lookup, property_value_name);
1218 5216 : if (property_value == UCHAR_INVALID_CODE) return false;
1219 :
1220 : // We require the property name to match exactly to one of the property value
1221 : // aliases. However, u_getPropertyValueEnum uses loose matching.
1222 3153 : if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1223 3153 : property_value)) {
1224 : return false;
1225 : }
1226 :
1227 3153 : UErrorCode ec = U_ZERO_ERROR;
1228 3153 : icu::UnicodeSet set;
1229 3153 : set.applyIntPropertyValue(property, property_value, ec);
1230 3153 : bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1231 :
1232 3153 : if (success) {
1233 3153 : set.removeAllStrings();
1234 3153 : if (negate) set.complement();
1235 578061 : for (int i = 0; i < set.getRangeCount(); i++) {
1236 : result->Add(
1237 578061 : CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1238 578061 : zone);
1239 : }
1240 : }
1241 3153 : return success;
1242 : }
1243 :
1244 : template <size_t N>
1245 : inline bool NameEquals(const char* name, const char (&literal)[N]) {
1246 6194 : return strncmp(name, literal, N + 1) == 0;
1247 : }
1248 :
1249 2063 : bool LookupSpecialPropertyValueName(const char* name,
1250 : ZoneList<CharacterRange>* result,
1251 : bool negate, Zone* zone) {
1252 2063 : if (NameEquals(name, "Any")) {
1253 135 : if (negate) {
1254 : // Leave the list of character ranges empty, since the negation of 'Any'
1255 : // is the empty set.
1256 : } else {
1257 40 : result->Add(CharacterRange::Everything(), zone);
1258 : }
1259 1928 : } else if (NameEquals(name, "ASCII")) {
1260 : result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1261 : : CharacterRange::Range(0x0, 0x7F),
1262 105 : zone);
1263 1823 : } else if (NameEquals(name, "Assigned")) {
1264 : return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1265 75 : !negate, result, zone);
1266 : } else {
1267 : return false;
1268 : }
1269 : return true;
1270 : }
1271 :
1272 : // Explicitly whitelist supported binary properties. The spec forbids supporting
1273 : // properties outside of this set to ensure interoperability.
1274 1748 : bool IsSupportedBinaryProperty(UProperty property) {
1275 1748 : switch (property) {
1276 : case UCHAR_ALPHABETIC:
1277 : // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1278 : // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1279 : case UCHAR_ASCII_HEX_DIGIT:
1280 : // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1281 : case UCHAR_BIDI_CONTROL:
1282 : case UCHAR_BIDI_MIRRORED:
1283 : case UCHAR_CASE_IGNORABLE:
1284 : case UCHAR_CASED:
1285 : case UCHAR_CHANGES_WHEN_CASEFOLDED:
1286 : case UCHAR_CHANGES_WHEN_CASEMAPPED:
1287 : case UCHAR_CHANGES_WHEN_LOWERCASED:
1288 : case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1289 : case UCHAR_CHANGES_WHEN_TITLECASED:
1290 : case UCHAR_CHANGES_WHEN_UPPERCASED:
1291 : case UCHAR_DASH:
1292 : case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1293 : case UCHAR_DEPRECATED:
1294 : case UCHAR_DIACRITIC:
1295 : case UCHAR_EMOJI:
1296 : case UCHAR_EMOJI_COMPONENT:
1297 : case UCHAR_EMOJI_MODIFIER_BASE:
1298 : case UCHAR_EMOJI_MODIFIER:
1299 : case UCHAR_EMOJI_PRESENTATION:
1300 : case UCHAR_EXTENDED_PICTOGRAPHIC:
1301 : case UCHAR_EXTENDER:
1302 : case UCHAR_GRAPHEME_BASE:
1303 : case UCHAR_GRAPHEME_EXTEND:
1304 : case UCHAR_HEX_DIGIT:
1305 : case UCHAR_ID_CONTINUE:
1306 : case UCHAR_ID_START:
1307 : case UCHAR_IDEOGRAPHIC:
1308 : case UCHAR_IDS_BINARY_OPERATOR:
1309 : case UCHAR_IDS_TRINARY_OPERATOR:
1310 : case UCHAR_JOIN_CONTROL:
1311 : case UCHAR_LOGICAL_ORDER_EXCEPTION:
1312 : case UCHAR_LOWERCASE:
1313 : case UCHAR_MATH:
1314 : case UCHAR_NONCHARACTER_CODE_POINT:
1315 : case UCHAR_PATTERN_SYNTAX:
1316 : case UCHAR_PATTERN_WHITE_SPACE:
1317 : case UCHAR_QUOTATION_MARK:
1318 : case UCHAR_RADICAL:
1319 : case UCHAR_REGIONAL_INDICATOR:
1320 : case UCHAR_S_TERM:
1321 : case UCHAR_SOFT_DOTTED:
1322 : case UCHAR_TERMINAL_PUNCTUATION:
1323 : case UCHAR_UNIFIED_IDEOGRAPH:
1324 : case UCHAR_UPPERCASE:
1325 : case UCHAR_VARIATION_SELECTOR:
1326 : case UCHAR_WHITE_SPACE:
1327 : case UCHAR_XID_CONTINUE:
1328 : case UCHAR_XID_START:
1329 : return true;
1330 : default:
1331 : break;
1332 : }
1333 843 : return false;
1334 : }
1335 :
1336 : bool IsUnicodePropertyValueCharacter(char c) {
1337 : // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1338 : //
1339 : // Note that using this to validate each parsed char is quite conservative.
1340 : // A possible alternative solution would be to only ensure the parsed
1341 : // property name/value candidate string does not contain '\0' characters and
1342 : // let ICU lookups trigger the final failure.
1343 39082 : if ('a' <= c && c <= 'z') return true;
1344 10168 : if ('A' <= c && c <= 'Z') return true;
1345 1989 : if ('0' <= c && c <= '9') return true;
1346 1971 : return (c == '_');
1347 : }
1348 :
1349 : } // anonymous namespace
1350 :
1351 4547 : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1352 49835 : std::vector<char>* name_2) {
1353 : DCHECK(name_1->empty());
1354 : DCHECK(name_2->empty());
1355 : // Parse the property class as follows:
1356 : // - In \p{name}, 'name' is interpreted
1357 : // - either as a general category property value name.
1358 : // - or as a binary property name.
1359 : // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1360 : // and 'value' is interpreted as one of the available property value names.
1361 : // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1362 : // - Loose matching is not applied.
1363 4547 : if (current() == '{') {
1364 : // Parse \p{[PropertyName=]PropertyNameValue}
1365 71204 : for (Advance(); current() != '}' && current() != '='; Advance()) {
1366 62398 : if (!IsUnicodePropertyValueCharacter(current())) return false;
1367 31136 : if (!has_next()) return false;
1368 62272 : name_1->push_back(static_cast<char>(current()));
1369 : }
1370 4403 : if (current() == '=') {
1371 19372 : for (Advance(); current() != '}'; Advance()) {
1372 15766 : if (!IsUnicodePropertyValueCharacter(current())) return false;
1373 7874 : if (!has_next()) return false;
1374 15748 : name_2->push_back(static_cast<char>(current()));
1375 : }
1376 3606 : name_2->push_back(0); // null-terminate string.
1377 : }
1378 : } else {
1379 : return false;
1380 : }
1381 4394 : Advance();
1382 8788 : name_1->push_back(0); // null-terminate string.
1383 :
1384 : DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1385 : DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1386 4394 : return true;
1387 : }
1388 :
1389 4394 : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1390 : bool negate,
1391 : const std::vector<char>& name_1,
1392 7164 : const std::vector<char>& name_2) {
1393 4394 : if (name_2.empty()) {
1394 : // First attempt to interpret as general category property value name.
1395 : const char* name = name_1.data();
1396 2591 : if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1397 2591 : add_to, zone())) {
1398 : return true;
1399 : }
1400 : // Interpret "Any", "ASCII", and "Assigned".
1401 2063 : if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1402 : return true;
1403 : }
1404 : // Then attempt to interpret as binary property name with value name 'Y'.
1405 1748 : UProperty property = u_getPropertyEnum(name);
1406 1748 : if (!IsSupportedBinaryProperty(property)) return false;
1407 905 : if (!IsExactPropertyAlias(name, property)) return false;
1408 : return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1409 905 : zone());
1410 : } else {
1411 : // Both property name and value name are specified. Attempt to interpret
1412 : // the property name as enumerated property.
1413 : const char* property_name = name_1.data();
1414 : const char* value_name = name_2.data();
1415 1803 : UProperty property = u_getPropertyEnum(property_name);
1416 1803 : if (!IsExactPropertyAlias(property_name, property)) return false;
1417 1740 : if (property == UCHAR_GENERAL_CATEGORY) {
1418 : // We want to allow aggregate value names such as "Letter".
1419 : property = UCHAR_GENERAL_CATEGORY_MASK;
1420 3400 : } else if (property != UCHAR_SCRIPT &&
1421 1700 : property != UCHAR_SCRIPT_EXTENSIONS) {
1422 : return false;
1423 : }
1424 : return LookupPropertyValueName(property, value_name, negate, add_to,
1425 1605 : zone());
1426 : }
1427 : }
1428 :
1429 880 : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
1430 510 : if (!FLAG_harmony_regexp_sequence) return nullptr;
1431 : const char* name = name_1.data();
1432 : const uc32* sequence_list = nullptr;
1433 : JSRegExp::Flags flags = JSRegExp::kUnicode;
1434 150 : if (NameEquals(name, "Emoji_Flag_Sequence")) {
1435 : sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1436 90 : } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1437 : sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1438 70 : } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1439 : sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1440 : }
1441 150 : if (sequence_list != nullptr) {
1442 : // TODO(yangguo): this creates huge regexp code. Alternative to this is
1443 : // to create a new operator that checks for these sequences at runtime.
1444 : RegExpBuilder builder(zone(), flags);
1445 : while (true) { // Iterate through list of sequences.
1446 133680 : while (*sequence_list != 0) { // Iterate through sequence.
1447 102660 : builder.AddUnicodeCharacter(*sequence_list);
1448 102660 : sequence_list++;
1449 : }
1450 31020 : sequence_list++;
1451 31020 : if (*sequence_list == 0) break;
1452 : builder.NewAlternative();
1453 : }
1454 100 : return builder.ToRegExp();
1455 : }
1456 :
1457 50 : if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1458 : // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1459 : // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1460 : RegExpBuilder builder(zone(), flags);
1461 : ZoneList<CharacterRange>* prefix_ranges =
1462 30 : new (zone()) ZoneList<CharacterRange>(2, zone());
1463 30 : prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1464 30 : prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1465 30 : prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1466 : builder.AddCharacterClass(
1467 60 : new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
1468 30 : builder.AddCharacter(0xFE0F);
1469 30 : builder.AddCharacter(0x20E3);
1470 30 : return builder.ToRegExp();
1471 20 : } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1472 : // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1473 : // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1474 : RegExpBuilder builder(zone(), flags);
1475 : ZoneList<CharacterRange>* modifier_base_ranges =
1476 20 : new (zone()) ZoneList<CharacterRange>(2, zone());
1477 : LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1478 20 : modifier_base_ranges, zone());
1479 : builder.AddCharacterClass(
1480 40 : new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
1481 : ZoneList<CharacterRange>* modifier_ranges =
1482 20 : new (zone()) ZoneList<CharacterRange>(2, zone());
1483 : LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1484 20 : zone());
1485 : builder.AddCharacterClass(
1486 40 : new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
1487 20 : return builder.ToRegExp();
1488 : }
1489 :
1490 : return nullptr;
1491 : }
1492 :
1493 : #else // V8_INTL_SUPPORT
1494 :
1495 : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1496 : std::vector<char>* name_2) {
1497 : return false;
1498 : }
1499 :
1500 : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1501 : bool negate,
1502 : const std::vector<char>& name_1,
1503 : const std::vector<char>& name_2) {
1504 : return false;
1505 : }
1506 :
1507 : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
1508 : return nullptr;
1509 : }
1510 :
1511 : #endif // V8_INTL_SUPPORT
1512 :
1513 44553 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1514 : uc32 x = 0;
1515 : int d = HexValue(current());
1516 7458 : if (d < 0) {
1517 : return false;
1518 : }
1519 44553 : while (d >= 0) {
1520 37122 : x = x * 16 + d;
1521 37122 : if (x > max_value) {
1522 : return false;
1523 : }
1524 37095 : Advance();
1525 : d = HexValue(current());
1526 : }
1527 7431 : *value = x;
1528 7431 : return true;
1529 : }
1530 :
1531 :
1532 177276 : uc32 RegExpParser::ParseClassCharacterEscape() {
1533 : DCHECK_EQ('\\', current());
1534 : DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1535 88638 : Advance();
1536 88638 : switch (current()) {
1537 : case 'b':
1538 14 : Advance();
1539 14 : return '\b';
1540 : // ControlEscape :: one of
1541 : // f n r t v
1542 : case 'f':
1543 0 : Advance();
1544 0 : return '\f';
1545 : case 'n':
1546 471 : Advance();
1547 471 : return '\n';
1548 : case 'r':
1549 48 : Advance();
1550 48 : return '\r';
1551 : case 't':
1552 186 : Advance();
1553 186 : return '\t';
1554 : case 'v':
1555 1 : Advance();
1556 1 : return '\v';
1557 : case 'c': {
1558 : uc32 controlLetter = Next();
1559 290 : uc32 letter = controlLetter & ~('A' ^ 'a');
1560 : // Inside a character class, we also accept digits and underscore as
1561 : // control characters, unless with /u. See Annex B:
1562 : // ES#prod-annexB-ClassControlLetter
1563 290 : if (letter >= 'A' && letter <= 'Z') {
1564 : Advance(2);
1565 : // Control letters mapped to ASCII control characters in the range
1566 : // 0x00-0x1F.
1567 30 : return controlLetter & 0x1F;
1568 : }
1569 260 : if (unicode()) {
1570 : // With /u, invalid escapes are not treated as identity escapes.
1571 18 : ReportError(CStrVector("Invalid class escape"));
1572 18 : return 0;
1573 : }
1574 484 : if ((controlLetter >= '0' && controlLetter <= '9') ||
1575 242 : controlLetter == '_') {
1576 : Advance(2);
1577 160 : return controlLetter & 0x1F;
1578 : }
1579 : // We match JSC in reading the backslash as a literal
1580 : // character instead of as starting an escape.
1581 : // TODO(v8:6201): Not yet covered by the spec.
1582 : return '\\';
1583 : }
1584 : case '0':
1585 : // With /u, \0 is interpreted as NUL if not followed by another digit.
1586 1270 : if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1587 130 : Advance();
1588 130 : return 0;
1589 : }
1590 : V8_FALLTHROUGH;
1591 : case '1':
1592 : case '2':
1593 : case '3':
1594 : case '4':
1595 : case '5':
1596 : case '6':
1597 : case '7':
1598 : // For compatibility, we interpret a decimal escape that isn't
1599 : // a back reference (and therefore either \0 or not valid according
1600 : // to the specification) as a 1..3 digit octal character code.
1601 : // ES#prod-annexB-LegacyOctalEscapeSequence
1602 1098 : if (unicode()) {
1603 : // With /u, decimal escape is not interpreted as octal character code.
1604 36 : ReportError(CStrVector("Invalid class escape"));
1605 36 : return 0;
1606 : }
1607 1062 : return ParseOctalLiteral();
1608 : case 'x': {
1609 2165 : Advance();
1610 : uc32 value;
1611 2165 : if (ParseHexEscape(2, &value)) return value;
1612 30 : if (unicode()) {
1613 : // With /u, invalid escapes are not treated as identity escapes.
1614 0 : ReportError(CStrVector("Invalid escape"));
1615 0 : return 0;
1616 : }
1617 : // If \x is not followed by a two-digit hexadecimal, treat it
1618 : // as an identity escape.
1619 : return 'x';
1620 : }
1621 : case 'u': {
1622 81654 : Advance();
1623 : uc32 value;
1624 81654 : if (ParseUnicodeEscape(&value)) return value;
1625 20 : if (unicode()) {
1626 : // With /u, invalid escapes are not treated as identity escapes.
1627 0 : ReportError(CStrVector("Invalid unicode escape"));
1628 0 : return 0;
1629 : }
1630 : // If \u is not followed by a two-digit hexadecimal, treat it
1631 : // as an identity escape.
1632 : return 'u';
1633 : }
1634 : default: {
1635 : uc32 result = current();
1636 : // With /u, no identity escapes except for syntax characters and '-' are
1637 : // allowed. Otherwise, all identity escapes are allowed.
1638 2721 : if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1639 2581 : Advance();
1640 2581 : return result;
1641 : }
1642 0 : ReportError(CStrVector("Invalid escape"));
1643 0 : return 0;
1644 : }
1645 : }
1646 : return 0;
1647 : }
1648 :
1649 471205 : void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1650 : Zone* zone,
1651 : bool add_unicode_case_equivalents,
1652 471205 : uc32* char_out, bool* is_class_escape) {
1653 : uc32 current_char = current();
1654 471205 : if (current_char == '\\') {
1655 90919 : switch (Next()) {
1656 : case 'w':
1657 : case 'W':
1658 : case 'd':
1659 : case 'D':
1660 : case 's':
1661 : case 'S': {
1662 : CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1663 3446 : add_unicode_case_equivalents, zone);
1664 : Advance(2);
1665 1723 : *is_class_escape = true;
1666 1723 : return;
1667 : }
1668 : case kEndMarker:
1669 0 : ReportError(CStrVector("\\ at end of pattern"));
1670 0 : return;
1671 : case 'p':
1672 : case 'P':
1673 620 : if (unicode()) {
1674 558 : bool negate = Next() == 'P';
1675 : Advance(2);
1676 : std::vector<char> name_1, name_2;
1677 1089 : if (!ParsePropertyClassName(&name_1, &name_2) ||
1678 531 : !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1679 225 : ReportError(CStrVector("Invalid property name in character class"));
1680 : }
1681 558 : *is_class_escape = true;
1682 : return;
1683 : }
1684 : break;
1685 : default:
1686 : break;
1687 : }
1688 88638 : *char_out = ParseClassCharacterEscape();
1689 88638 : *is_class_escape = false;
1690 : } else {
1691 380286 : Advance();
1692 380286 : *char_out = current_char;
1693 380286 : *is_class_escape = false;
1694 : }
1695 : }
1696 :
1697 2915476 : RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1698 : static const char* kUnterminated = "Unterminated character class";
1699 : static const char* kRangeInvalid = "Invalid character class";
1700 : static const char* kRangeOutOfOrder = "Range out of order in character class";
1701 :
1702 : DCHECK_EQ(current(), '[');
1703 155307 : Advance();
1704 : bool is_negated = false;
1705 155307 : if (current() == '^') {
1706 : is_negated = true;
1707 10458 : Advance();
1708 : }
1709 : ZoneList<CharacterRange>* ranges =
1710 155307 : new (zone()) ZoneList<CharacterRange>(2, zone());
1711 157004 : bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1712 1058228 : while (has_more() && current() != ']') {
1713 : uc32 char_1, char_2;
1714 : bool is_class_1, is_class_2;
1715 : ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1716 593558 : &is_class_1 CHECK_FAILED);
1717 296310 : if (current() == '-') {
1718 174680 : Advance();
1719 174680 : if (current() == kEndMarker) {
1720 : // If we reach the end we break out of the loop and let the
1721 : // following code report an error.
1722 : break;
1723 174675 : } else if (current() == ']') {
1724 112 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1725 67 : ranges->Add(CharacterRange::Singleton('-'), zone());
1726 67 : break;
1727 : }
1728 : ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1729 174608 : &is_class_2 CHECK_FAILED);
1730 174608 : if (is_class_1 || is_class_2) {
1731 : // Either end is an escaped character class. Treat the '-' verbatim.
1732 176 : if (unicode()) {
1733 : // ES2015 21.2.2.15.1 step 1.
1734 54 : return ReportError(CStrVector(kRangeInvalid));
1735 : }
1736 174 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1737 149 : ranges->Add(CharacterRange::Singleton('-'), zone());
1738 258 : if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1739 149 : continue;
1740 : }
1741 : // ES2015 21.2.2.15.1 step 6.
1742 174432 : if (char_1 > char_2) {
1743 100 : return ReportError(CStrVector(kRangeOutOfOrder));
1744 : }
1745 174382 : ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1746 : } else {
1747 241426 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1748 : }
1749 : }
1750 154943 : if (!has_more()) {
1751 40 : return ReportError(CStrVector(kUnterminated));
1752 : }
1753 154923 : Advance();
1754 : RegExpCharacterClass::CharacterClassFlags character_class_flags;
1755 154923 : if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1756 : return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1757 309846 : character_class_flags);
1758 : }
1759 :
1760 :
1761 : #undef CHECK_FAILED
1762 :
1763 :
1764 380740 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1765 : FlatStringReader* input, JSRegExp::Flags flags,
1766 : RegExpCompileData* result) {
1767 : DCHECK(result != nullptr);
1768 380740 : RegExpParser parser(input, &result->error, flags, isolate, zone);
1769 380740 : RegExpTree* tree = parser.ParsePattern();
1770 380740 : if (parser.failed()) {
1771 : DCHECK(tree == nullptr);
1772 : DCHECK(!result->error.is_null());
1773 : } else {
1774 : DCHECK(tree != nullptr);
1775 : DCHECK(result->error.is_null());
1776 : if (FLAG_trace_regexp_parser) {
1777 : StdoutStream os;
1778 : tree->Print(os, zone);
1779 : os << "\n";
1780 : }
1781 377759 : result->tree = tree;
1782 377759 : int capture_count = parser.captures_started();
1783 584600 : result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1784 377759 : result->contains_anchor = parser.contains_anchor();
1785 377759 : result->capture_name_map = parser.CreateCaptureNameMap();
1786 377759 : result->capture_count = capture_count;
1787 : }
1788 380740 : return !parser.failed();
1789 : }
1790 :
1791 0 : RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1792 : : zone_(zone),
1793 : pending_empty_(false),
1794 : flags_(flags),
1795 : characters_(nullptr),
1796 : pending_surrogate_(kNoPendingSurrogate),
1797 : terms_(),
1798 1486654 : alternatives_()
1799 : #ifdef DEBUG
1800 : ,
1801 : last_added_(ADD_NONE)
1802 : #endif
1803 : {
1804 0 : }
1805 :
1806 :
1807 0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1808 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1809 : FlushPendingSurrogate();
1810 : // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1811 64958 : pending_surrogate_ = lead_surrogate;
1812 0 : }
1813 :
1814 :
1815 323490 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1816 : DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1817 64898 : if (pending_surrogate_ != kNoPendingSurrogate) {
1818 64708 : uc16 lead_surrogate = pending_surrogate_;
1819 64708 : pending_surrogate_ = kNoPendingSurrogate;
1820 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1821 : uc32 combined =
1822 64708 : unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1823 64708 : if (NeedsDesugaringForIgnoreCase(combined)) {
1824 60 : AddCharacterClassForDesugaring(combined);
1825 : } else {
1826 64648 : ZoneList<uc16> surrogate_pair(2, zone());
1827 64648 : surrogate_pair.Add(lead_surrogate, zone());
1828 64648 : surrogate_pair.Add(trail_surrogate, zone());
1829 : RegExpAtom* atom =
1830 129296 : new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1831 64648 : AddAtom(atom);
1832 : }
1833 : } else {
1834 190 : pending_surrogate_ = trail_surrogate;
1835 : FlushPendingSurrogate();
1836 : }
1837 64898 : }
1838 :
1839 :
1840 0 : void RegExpBuilder::FlushPendingSurrogate() {
1841 389372674 : if (pending_surrogate_ != kNoPendingSurrogate) {
1842 : DCHECK(unicode());
1843 440 : uc32 c = pending_surrogate_;
1844 440 : pending_surrogate_ = kNoPendingSurrogate;
1845 440 : AddCharacterClassForDesugaring(c);
1846 : }
1847 0 : }
1848 :
1849 :
1850 5247759 : void RegExpBuilder::FlushCharacters() {
1851 : FlushPendingSurrogate();
1852 4448901 : pending_empty_ = false;
1853 4448901 : if (characters_ != nullptr) {
1854 : RegExpTree* atom =
1855 399429 : new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1856 399429 : characters_ = nullptr;
1857 399429 : text_.Add(atom, zone());
1858 : LAST(ADD_ATOM);
1859 : }
1860 4448901 : }
1861 :
1862 :
1863 4902449 : void RegExpBuilder::FlushText() {
1864 4205709 : FlushCharacters();
1865 : int num_text = text_.length();
1866 4205709 : if (num_text == 0) {
1867 4205709 : return;
1868 567442 : } else if (num_text == 1) {
1869 532515 : terms_.Add(text_.last(), zone());
1870 : } else {
1871 : RegExpText* text = new (zone()) RegExpText(zone());
1872 223669 : for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1873 34927 : terms_.Add(text, zone());
1874 : }
1875 : text_.Clear();
1876 : }
1877 :
1878 :
1879 768633129 : void RegExpBuilder::AddCharacter(uc16 c) {
1880 : FlushPendingSurrogate();
1881 383403008 : pending_empty_ = false;
1882 383403008 : if (NeedsDesugaringForIgnoreCase(c)) {
1883 895 : AddCharacterClassForDesugaring(c);
1884 : } else {
1885 383402113 : if (characters_ == nullptr) {
1886 1828008 : characters_ = new (zone()) ZoneList<uc16>(4, zone());
1887 : }
1888 383402113 : characters_->Add(c, zone());
1889 : LAST(ADD_CHAR);
1890 : }
1891 383403008 : }
1892 :
1893 :
1894 383455481 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1895 383455481 : if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1896 : DCHECK(unicode());
1897 64708 : AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1898 64708 : AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1899 383432575 : } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1900 250 : AddLeadSurrogate(c);
1901 383432075 : } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1902 190 : AddTrailSurrogate(c);
1903 : } else {
1904 383390333 : AddCharacter(static_cast<uc16>(c));
1905 : }
1906 383455481 : }
1907 :
1908 1809 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1909 : // A lead or trail surrogate parsed via escape sequence will not
1910 : // pair up with any preceding lead or following trail surrogate.
1911 : FlushPendingSurrogate();
1912 1809 : AddUnicodeCharacter(character);
1913 : FlushPendingSurrogate();
1914 1809 : }
1915 :
1916 113 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1917 :
1918 :
1919 183310 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1920 183310 : if (NeedsDesugaringForUnicode(cc)) {
1921 : // With /u, character class needs to be desugared, so it
1922 : // must be a standalone term instead of being part of a RegExpText.
1923 4796 : AddTerm(cc);
1924 : } else {
1925 178514 : AddAtom(cc);
1926 : }
1927 183310 : }
1928 :
1929 2790 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1930 : AddTerm(new (zone()) RegExpCharacterClass(
1931 : zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1932 2790 : flags_));
1933 1395 : }
1934 :
1935 :
1936 2707256 : void RegExpBuilder::AddAtom(RegExpTree* term) {
1937 1353628 : if (term->IsEmpty()) {
1938 : AddEmpty();
1939 1353628 : return;
1940 : }
1941 1353628 : if (term->IsTextElement()) {
1942 243192 : FlushCharacters();
1943 243192 : text_.Add(term, zone());
1944 : } else {
1945 1110436 : FlushText();
1946 1110436 : terms_.Add(term, zone());
1947 : }
1948 : LAST(ADD_ATOM);
1949 : }
1950 :
1951 :
1952 12382 : void RegExpBuilder::AddTerm(RegExpTree* term) {
1953 6191 : FlushText();
1954 6191 : terms_.Add(term, zone());
1955 : LAST(ADD_ATOM);
1956 6191 : }
1957 :
1958 :
1959 22604 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1960 11342 : FlushText();
1961 11342 : if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1962 : // Omit repeated assertions of the same type.
1963 398 : RegExpAssertion* last = terms_.last()->AsAssertion();
1964 398 : RegExpAssertion* next = assert->AsAssertion();
1965 11541 : if (last->assertion_type() == next->assertion_type()) return;
1966 : }
1967 11262 : terms_.Add(assert, zone());
1968 : LAST(ADD_ASSERT);
1969 : }
1970 :
1971 :
1972 148300 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
1973 :
1974 :
1975 3675932 : void RegExpBuilder::FlushTerms() {
1976 1632105 : FlushText();
1977 : int num_terms = terms_.length();
1978 : RegExpTree* alternative;
1979 1632105 : if (num_terms == 0) {
1980 330014 : alternative = new (zone()) RegExpEmpty();
1981 1302091 : } else if (num_terms == 1) {
1982 : alternative = terms_.last();
1983 : } else {
1984 81708 : alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1985 : }
1986 1632105 : alternatives_.Add(alternative, zone());
1987 : terms_.Clear();
1988 : LAST(ADD_NONE);
1989 1632105 : }
1990 :
1991 :
1992 188566 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1993 183310 : if (!unicode()) return false;
1994 : // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1995 : // necessarily mean that we need to desugar. It's probably nicer to have a
1996 : // separate pass to figure out unicode desugarings.
1997 6084 : if (ignore_case()) return true;
1998 5256 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1999 5256 : CharacterRange::Canonicalize(ranges);
2000 13449 : for (int i = ranges->length() - 1; i >= 0; i--) {
2001 12161 : uc32 from = ranges->at(i).from();
2002 12161 : uc32 to = ranges->at(i).to();
2003 : // Check for non-BMP characters.
2004 12161 : if (to >= kNonBmpStart) return true;
2005 : // Check for lone surrogates.
2006 8303 : if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2007 : }
2008 : return false;
2009 : }
2010 :
2011 :
2012 383467716 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2013 : #ifdef V8_INTL_SUPPORT
2014 383573856 : if (unicode() && ignore_case()) {
2015 1080 : icu::UnicodeSet set(c, c);
2016 1080 : set.closeOver(USET_CASE_INSENSITIVE);
2017 1080 : set.removeAllStrings();
2018 1080 : return set.size() > 1;
2019 : }
2020 : // In the case where ICU is not included, we act as if the unicode flag is
2021 : // not set, and do not desugar.
2022 : #endif // V8_INTL_SUPPORT
2023 : return false;
2024 : }
2025 :
2026 :
2027 1526739 : RegExpTree* RegExpBuilder::ToRegExp() {
2028 1483805 : FlushTerms();
2029 : int num_alternatives = alternatives_.length();
2030 1483805 : if (num_alternatives == 0) return new (zone()) RegExpEmpty();
2031 1483805 : if (num_alternatives == 1) return alternatives_.last();
2032 42934 : return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
2033 : }
2034 :
2035 1451999 : bool RegExpBuilder::AddQuantifierToAtom(
2036 2880628 : int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2037 : FlushPendingSurrogate();
2038 1451999 : if (pending_empty_) {
2039 2 : pending_empty_ = false;
2040 2 : return true;
2041 : }
2042 : RegExpTree* atom;
2043 1451997 : if (characters_ != nullptr) {
2044 : DCHECK(last_added_ == ADD_CHAR);
2045 : // Last atom was character.
2046 : Vector<const uc16> char_vector = characters_->ToConstVector();
2047 : int num_chars = char_vector.length();
2048 1428276 : if (num_chars > 1) {
2049 567 : Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2050 1134 : text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
2051 : char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2052 : }
2053 1428276 : characters_ = nullptr;
2054 1428276 : atom = new (zone()) RegExpAtom(char_vector, flags_);
2055 1428276 : FlushText();
2056 23721 : } else if (text_.length() > 0) {
2057 : DCHECK(last_added_ == ADD_ATOM);
2058 16284 : atom = text_.RemoveLast();
2059 16284 : FlushText();
2060 7437 : } else if (terms_.length() > 0) {
2061 : DCHECK(last_added_ == ADD_ATOM);
2062 7437 : atom = terms_.RemoveLast();
2063 7437 : if (atom->IsLookaround()) {
2064 : // With /u, lookarounds are not quantifiable.
2065 239 : if (unicode()) return false;
2066 : // Lookbehinds are not quantifiable.
2067 193 : if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2068 : return false;
2069 : }
2070 : }
2071 7364 : if (atom->max_match() == 0) {
2072 : // Guaranteed to only match an empty string.
2073 : LAST(ADD_TERM);
2074 236 : if (min == 0) {
2075 : return true;
2076 : }
2077 97 : terms_.Add(atom, zone());
2078 97 : return true;
2079 : }
2080 : } else {
2081 : // Only call immediately after adding an atom or character!
2082 0 : UNREACHABLE();
2083 : }
2084 1451688 : terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2085 1451688 : zone());
2086 : LAST(ADD_TERM);
2087 1451688 : return true;
2088 : }
2089 :
2090 : } // namespace internal
2091 183867 : } // namespace v8
|