Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/regexp-parser.h"
6 :
7 : #include <vector>
8 :
9 : #include "src/char-predicates-inl.h"
10 : #include "src/heap/factory.h"
11 : #include "src/isolate.h"
12 : #include "src/objects-inl.h"
13 : #include "src/ostreams.h"
14 : #include "src/regexp/jsregexp.h"
15 : #include "src/regexp/property-sequences.h"
16 : #include "src/utils.h"
17 : #include "src/zone/zone-list-inl.h"
18 :
19 : #ifdef V8_INTL_SUPPORT
20 : #include "unicode/uniset.h"
21 : #endif // V8_INTL_SUPPORT
22 :
23 : namespace v8 {
24 : namespace internal {
25 :
26 0 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
27 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
28 : : isolate_(isolate),
29 : zone_(zone),
30 : error_(error),
31 : captures_(nullptr),
32 : named_captures_(nullptr),
33 : named_back_references_(nullptr),
34 : in_(in),
35 : current_(kEndMarker),
36 : top_level_flags_(flags),
37 : next_pos_(0),
38 : captures_started_(0),
39 : capture_count_(0),
40 : has_more_(true),
41 : simple_(false),
42 : contains_anchor_(false),
43 : is_scanned_for_captures_(false),
44 : has_named_captures_(false),
45 380302 : failed_(false) {
46 380302 : Advance();
47 0 : }
48 :
49 : template <bool update_position>
50 388900300 : inline uc32 RegExpParser::ReadNext() {
51 388900300 : int position = next_pos_;
52 : uc32 c0 = in()->Get(position);
53 388900300 : position++;
54 : // Read the whole surrogate pair in case of unicode flag, if possible.
55 389598593 : if (unicode() && position < in()->length() &&
56 : unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57 : uc16 c1 = in()->Get(position);
58 586 : if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59 389 : c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60 371 : position++;
61 : }
62 : }
63 388727589 : if (update_position) next_pos_ = position;
64 388900300 : return c0;
65 : }
66 :
67 :
68 0 : uc32 RegExpParser::Next() {
69 172735 : if (has_next()) {
70 172711 : return ReadNext<false>();
71 : } else {
72 : return kEndMarker;
73 : }
74 : }
75 :
76 389107580 : void RegExpParser::Advance() {
77 389107580 : if (has_next()) {
78 : StackLimitCheck check(isolate());
79 388727976 : if (check.HasOverflowed()) {
80 378 : if (FLAG_abort_on_stack_or_string_length_overflow) {
81 0 : FATAL("Aborting on stack overflow");
82 : }
83 378 : ReportError(CStrVector(
84 378 : MessageFormatter::TemplateString(MessageTemplate::kStackOverflow)));
85 388727598 : } else if (zone()->excess_allocation()) {
86 9 : ReportError(CStrVector("Regular expression too large"));
87 : } else {
88 388727589 : current_ = ReadNext<true>();
89 : }
90 : } else {
91 379604 : current_ = kEndMarker;
92 : // Advance so that position() points to 1-after-the-last-character. This is
93 : // important so that Reset() to this position works correctly.
94 379604 : next_pos_ = in()->length() + 1;
95 379604 : has_more_ = false;
96 : }
97 389107580 : }
98 :
99 :
100 0 : void RegExpParser::Reset(int pos) {
101 2609 : next_pos_ = pos;
102 2609 : has_more_ = (pos < in()->length());
103 2609 : Advance();
104 0 : }
105 :
106 0 : void RegExpParser::Advance(int dist) {
107 46407 : next_pos_ += dist - 1;
108 46407 : Advance();
109 0 : }
110 :
111 :
112 206333 : bool RegExpParser::simple() { return simple_; }
113 :
114 0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
115 177 : switch (c) {
116 : case '^':
117 : case '$':
118 : case '\\':
119 : case '.':
120 : case '*':
121 : case '+':
122 : case '?':
123 : case '(':
124 : case ')':
125 : case '[':
126 : case ']':
127 : case '{':
128 : case '}':
129 : case '|':
130 : case '/':
131 : return true;
132 : default:
133 : break;
134 : }
135 0 : return false;
136 : }
137 :
138 :
139 2971 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
140 2971 : if (failed_) return nullptr; // Do not overwrite any existing error.
141 2966 : failed_ = true;
142 2966 : *error_ = isolate()
143 : ->factory()
144 5932 : ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
145 2966 : .ToHandleChecked();
146 : // Zip to the end to make sure the no more input is read.
147 2966 : current_ = kEndMarker;
148 2966 : next_pos_ = in()->length();
149 2966 : return nullptr;
150 : }
151 :
152 : #define CHECK_FAILED /**/); \
153 : if (failed_) return nullptr; \
154 : ((void)0
155 :
156 : // Pattern ::
157 : // Disjunction
158 380302 : RegExpTree* RegExpParser::ParsePattern() {
159 380302 : RegExpTree* result = ParseDisjunction(CHECK_FAILED);
160 377436 : PatchNamedBackReferences(CHECK_FAILED);
161 : DCHECK(!has_more());
162 : // If the result of parsing is a literal string atom, and it has the
163 : // same length as the input, then the atom is identical to the input.
164 583669 : if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
165 199368 : simple_ = true;
166 : }
167 : return result;
168 : }
169 :
170 :
171 : // Disjunction ::
172 : // Alternative
173 : // Alternative | Disjunction
174 : // Alternative ::
175 : // [empty]
176 : // Term Alternative
177 : // Term ::
178 : // Assertion
179 : // Atom
180 : // Atom Quantifier
181 380302 : RegExpTree* RegExpParser::ParseDisjunction() {
182 : // Used to store current state while parsing subexpressions.
183 : RegExpParserState initial_state(nullptr, INITIAL, RegExpLookaround::LOOKAHEAD,
184 380302 : 0, nullptr, top_level_flags_, zone());
185 : RegExpParserState* state = &initial_state;
186 : // Cache the builder in a local variable for quick access.
187 : RegExpBuilder* builder = initial_state.builder();
188 : while (true) {
189 386265666 : switch (current()) {
190 : case kEndMarker:
191 377771 : if (state->IsSubexpression()) {
192 : // Inside a parenthesized group when hitting end of input.
193 380365 : return ReportError(CStrVector("Unterminated group"));
194 : }
195 : DCHECK_EQ(INITIAL, state->group_type());
196 : // Parsing completed successfully.
197 377708 : return builder->ToRegExp();
198 : case ')': {
199 1105695 : if (!state->IsSubexpression()) {
200 14 : return ReportError(CStrVector("Unmatched ')'"));
201 : }
202 : DCHECK_NE(INITIAL, state->group_type());
203 :
204 1105681 : Advance();
205 : // End disjunction parsing and convert builder content to new single
206 : // regexp atom.
207 1105681 : RegExpTree* body = builder->ToRegExp();
208 :
209 : int end_capture_index = captures_started();
210 :
211 : int capture_index = state->capture_index();
212 : SubexpressionType group_type = state->group_type();
213 :
214 : // Build result of subexpression.
215 1105681 : if (group_type == CAPTURE) {
216 1082087 : if (state->IsNamedCapture()) {
217 : CreateNamedCaptureAtIndex(state->capture_name(),
218 1257 : capture_index CHECK_FAILED);
219 : }
220 1082046 : RegExpCapture* capture = GetCapture(capture_index);
221 : capture->set_body(body);
222 : body = capture;
223 23594 : } else if (group_type == GROUPING) {
224 : body = new (zone()) RegExpGroup(body);
225 : } else {
226 : DCHECK(group_type == POSITIVE_LOOKAROUND ||
227 : group_type == NEGATIVE_LOOKAROUND);
228 3611 : bool is_positive = (group_type == POSITIVE_LOOKAROUND);
229 : body = new (zone()) RegExpLookaround(
230 : body, is_positive, end_capture_index - capture_index,
231 3611 : capture_index, state->lookaround_type());
232 : }
233 :
234 : // Restore previous state.
235 : state = state->previous_state();
236 : builder = state->builder();
237 :
238 1105640 : builder->AddAtom(body);
239 : // For compatibility with JSC and ES3, we allow quantifiers after
240 : // lookaheads, and break in all cases.
241 1105640 : break;
242 : }
243 : case '|': {
244 117326 : Advance();
245 : builder->NewAlternative();
246 384433421 : continue;
247 : }
248 : case '*':
249 : case '+':
250 : case '?':
251 95 : return ReportError(CStrVector("Nothing to repeat"));
252 : case '^': {
253 6447 : Advance();
254 6447 : if (builder->multiline()) {
255 : builder->AddAssertion(new (zone()) RegExpAssertion(
256 263 : RegExpAssertion::START_OF_LINE, builder->flags()));
257 : } else {
258 : builder->AddAssertion(new (zone()) RegExpAssertion(
259 6184 : RegExpAssertion::START_OF_INPUT, builder->flags()));
260 : set_contains_anchor();
261 : }
262 : continue;
263 : }
264 : case '$': {
265 4041 : Advance();
266 : RegExpAssertion::AssertionType assertion_type =
267 : builder->multiline() ? RegExpAssertion::END_OF_LINE
268 4041 : : RegExpAssertion::END_OF_INPUT;
269 : builder->AddAssertion(
270 4041 : new (zone()) RegExpAssertion(assertion_type, builder->flags()));
271 4041 : continue;
272 : }
273 : case '.': {
274 19037 : Advance();
275 : ZoneList<CharacterRange>* ranges =
276 : new (zone()) ZoneList<CharacterRange>(2, zone());
277 :
278 19037 : if (builder->dotall()) {
279 : // Everything.
280 210 : CharacterRange::AddClassEscape('*', ranges, false, zone());
281 : } else {
282 : // Everything except \x0A, \x0D, \u2028 and \u2029
283 18827 : CharacterRange::AddClassEscape('.', ranges, false, zone());
284 : }
285 :
286 : RegExpCharacterClass* cc =
287 38074 : new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags());
288 19037 : builder->AddCharacterClass(cc);
289 19037 : break;
290 : }
291 : case '(': {
292 1107381 : state = ParseOpenParenthesis(state CHECK_FAILED);
293 : builder = state->builder();
294 1106888 : continue;
295 : }
296 : case '[': {
297 155232 : RegExpTree* cc = ParseCharacterClass(builder CHECK_FAILED);
298 154856 : builder->AddCharacterClass(cc->AsCharacterClass());
299 154856 : break;
300 : }
301 : // Atom ::
302 : // \ AtomEscape
303 : case '\\':
304 31048 : switch (Next()) {
305 : case kEndMarker:
306 5 : return ReportError(CStrVector("\\ at end of pattern"));
307 : case 'b':
308 : Advance(2);
309 : builder->AddAssertion(new (zone()) RegExpAssertion(
310 431 : RegExpAssertion::BOUNDARY, builder->flags()));
311 431 : continue;
312 : case 'B':
313 : Advance(2);
314 : builder->AddAssertion(new (zone()) RegExpAssertion(
315 327 : RegExpAssertion::NON_BOUNDARY, builder->flags()));
316 327 : continue;
317 : // AtomEscape ::
318 : // CharacterClassEscape
319 : //
320 : // CharacterClassEscape :: one of
321 : // d D s S w W
322 : case 'd':
323 : case 'D':
324 : case 's':
325 : case 'S':
326 : case 'w':
327 : case 'W': {
328 : uc32 c = Next();
329 : Advance(2);
330 : ZoneList<CharacterRange>* ranges =
331 : new (zone()) ZoneList<CharacterRange>(2, zone());
332 6785 : CharacterRange::AddClassEscape(
333 14023 : c, ranges, unicode() && builder->ignore_case(), zone());
334 : RegExpCharacterClass* cc = new (zone())
335 13570 : RegExpCharacterClass(zone(), ranges, builder->flags());
336 6785 : builder->AddCharacterClass(cc);
337 6785 : break;
338 : }
339 : case 'p':
340 : case 'P': {
341 : uc32 p = Next();
342 : Advance(2);
343 4187 : if (unicode()) {
344 : ZoneList<CharacterRange>* ranges =
345 : new (zone()) ZoneList<CharacterRange>(2, zone());
346 : std::vector<char> name_1, name_2;
347 3989 : if (ParsePropertyClassName(&name_1, &name_2)) {
348 3863 : if (AddPropertyClassRange(ranges, p == 'P', name_1, name_2)) {
349 : RegExpCharacterClass* cc = new (zone())
350 6040 : RegExpCharacterClass(zone(), ranges, builder->flags());
351 3020 : builder->AddCharacterClass(cc);
352 3020 : break;
353 : }
354 843 : if (p == 'p' && name_2.empty()) {
355 510 : RegExpTree* sequence = GetPropertySequence(name_1);
356 510 : if (sequence != nullptr) {
357 150 : builder->AddAtom(sequence);
358 150 : break;
359 : }
360 : }
361 : }
362 819 : return ReportError(CStrVector("Invalid property name"));
363 : } else {
364 198 : builder->AddCharacter(p);
365 : }
366 198 : break;
367 : }
368 : case '1':
369 : case '2':
370 : case '3':
371 : case '4':
372 : case '5':
373 : case '6':
374 : case '7':
375 : case '8':
376 : case '9': {
377 5078 : int index = 0;
378 5155 : bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
379 5073 : if (is_backref) {
380 9190 : if (state->IsInsideCaptureGroup(index)) {
381 : // The back reference is inside the capture group it refers to.
382 : // Nothing can possibly have been captured yet, so we use empty
383 : // instead. This ensures that, when checking a back reference,
384 : // the capture registers of the referenced capture are either
385 : // both set or both cleared.
386 : builder->AddEmpty();
387 : } else {
388 4502 : RegExpCapture* capture = GetCapture(index);
389 : RegExpTree* atom =
390 : new (zone()) RegExpBackReference(capture, builder->flags());
391 4502 : builder->AddAtom(atom);
392 : }
393 4632 : break;
394 : }
395 : // With /u, no identity escapes except for syntax characters
396 : // are allowed. Otherwise, all identity escapes are allowed.
397 478 : if (unicode()) {
398 72 : return ReportError(CStrVector("Invalid escape"));
399 : }
400 : uc32 first_digit = Next();
401 406 : if (first_digit == '8' || first_digit == '9') {
402 37 : builder->AddCharacter(first_digit);
403 : Advance(2);
404 : break;
405 : }
406 : V8_FALLTHROUGH;
407 : }
408 : case '0': {
409 3210 : Advance();
410 3248 : if (unicode() && Next() >= '0' && Next() <= '9') {
411 : // With /u, decimal escape with leading 0 are not parsed as octal.
412 19 : return ReportError(CStrVector("Invalid decimal escape"));
413 : }
414 3191 : uc32 octal = ParseOctalLiteral();
415 3191 : builder->AddCharacter(octal);
416 3191 : break;
417 : }
418 : // ControlEscape :: one of
419 : // f n r t v
420 : case 'f':
421 : Advance(2);
422 6 : builder->AddCharacter('\f');
423 6 : break;
424 : case 'n':
425 : Advance(2);
426 724 : builder->AddCharacter('\n');
427 724 : break;
428 : case 'r':
429 : Advance(2);
430 71 : builder->AddCharacter('\r');
431 71 : break;
432 : case 't':
433 : Advance(2);
434 8 : builder->AddCharacter('\t');
435 8 : break;
436 : case 'v':
437 : Advance(2);
438 28 : builder->AddCharacter('\v');
439 28 : break;
440 : case 'c': {
441 149 : Advance();
442 : uc32 controlLetter = Next();
443 : // Special case if it is an ASCII letter.
444 : // Convert lower case letters to uppercase.
445 149 : uc32 letter = controlLetter & ~('a' ^ 'A');
446 149 : if (letter < 'A' || 'Z' < letter) {
447 : // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
448 : // Read the backslash as a literal character instead of as
449 : // starting an escape.
450 : // ES#prod-annexB-ExtendedPatternCharacter
451 89 : if (unicode()) {
452 : // With /u, invalid escapes are not treated as identity escapes.
453 19 : return ReportError(CStrVector("Invalid unicode escape"));
454 : }
455 70 : builder->AddCharacter('\\');
456 : } else {
457 : Advance(2);
458 60 : builder->AddCharacter(controlLetter & 0x1F);
459 : }
460 : break;
461 : }
462 : case 'x': {
463 : Advance(2);
464 : uc32 value;
465 246 : if (ParseHexEscape(2, &value)) {
466 147 : builder->AddCharacter(value);
467 99 : } else if (!unicode()) {
468 63 : builder->AddCharacter('x');
469 : } else {
470 : // With /u, invalid escapes are not treated as identity escapes.
471 36 : return ReportError(CStrVector("Invalid escape"));
472 : }
473 210 : break;
474 : }
475 : case 'u': {
476 : Advance(2);
477 : uc32 value;
478 2007 : if (ParseUnicodeEscape(&value)) {
479 1819 : builder->AddEscapedUnicodeCharacter(value);
480 188 : } else if (!unicode()) {
481 116 : builder->AddCharacter('u');
482 : } else {
483 : // With /u, invalid escapes are not treated as identity escapes.
484 72 : return ReportError(CStrVector("Invalid Unicode escape"));
485 : }
486 1935 : break;
487 : }
488 : case 'k':
489 : // Either an identity escape or a named back-reference. The two
490 : // interpretations are mutually exclusive: '\k' is interpreted as
491 : // an identity escape for non-Unicode patterns without named
492 : // capture groups, and as the beginning of a named back-reference
493 : // in all other cases.
494 672 : if (unicode() || HasNamedCaptures()) {
495 : Advance(2);
496 396 : ParseNamedBackReference(builder, state CHECK_FAILED);
497 : break;
498 : }
499 : V8_FALLTHROUGH;
500 : default:
501 7759 : Advance();
502 : // With /u, no identity escapes except for syntax characters
503 : // are allowed. Otherwise, all identity escapes are allowed.
504 7796 : if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
505 7732 : builder->AddCharacter(current());
506 7732 : Advance();
507 : } else {
508 27 : return ReportError(CStrVector("Invalid escape"));
509 : }
510 7732 : break;
511 : }
512 : break;
513 : case '{': {
514 : int dummy;
515 509 : bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
516 444 : if (parsed) return ReportError(CStrVector("Nothing to repeat"));
517 : V8_FALLTHROUGH;
518 : }
519 : case '}':
520 : case ']':
521 903 : if (unicode()) {
522 28 : return ReportError(CStrVector("Lone quantifier brackets"));
523 : }
524 : V8_FALLTHROUGH;
525 : default:
526 383341520 : builder->AddUnicodeCharacter(current());
527 383341520 : Advance();
528 383341520 : break;
529 : } // end switch(current())
530 :
531 : int min;
532 : int max;
533 384650178 : switch (current()) {
534 : // QuantifierPrefix ::
535 : // *
536 : // +
537 : // ?
538 : // {
539 : case '*':
540 19616 : min = 0;
541 19616 : max = RegExpTree::kInfinity;
542 19616 : Advance();
543 19616 : break;
544 : case '+':
545 7967 : min = 1;
546 7967 : max = RegExpTree::kInfinity;
547 7967 : Advance();
548 7967 : break;
549 : case '?':
550 1421676 : min = 0;
551 1421676 : max = 1;
552 1421676 : Advance();
553 1421676 : break;
554 : case '{':
555 3266 : if (ParseIntervalQuantifier(&min, &max)) {
556 2759 : if (max < min) {
557 : return ReportError(
558 2 : CStrVector("numbers out of order in {} quantifier"));
559 : }
560 : break;
561 507 : } else if (unicode()) {
562 : // With /u, incomplete quantifiers are not allowed.
563 199 : return ReportError(CStrVector("Incomplete quantifier"));
564 : }
565 : continue;
566 : default:
567 : continue;
568 : }
569 : RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
570 1452016 : if (current() == '?') {
571 : quantifier_type = RegExpQuantifier::NON_GREEDY;
572 1713 : Advance();
573 : } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
574 : // FLAG_regexp_possessive_quantifier is a debug-only flag.
575 : quantifier_type = RegExpQuantifier::POSSESSIVE;
576 : Advance();
577 : }
578 1452016 : if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
579 73 : return ReportError(CStrVector("Invalid quantifier"));
580 : }
581 : }
582 : }
583 :
584 1107381 : RegExpParser::RegExpParserState* RegExpParser::ParseOpenParenthesis(
585 : RegExpParserState* state) {
586 : RegExpLookaround::Type lookaround_type = state->lookaround_type();
587 : bool is_named_capture = false;
588 : JSRegExp::Flags switch_on = JSRegExp::kNone;
589 : JSRegExp::Flags switch_off = JSRegExp::kNone;
590 : const ZoneVector<uc16>* capture_name = nullptr;
591 : SubexpressionType subexpr_type = CAPTURE;
592 1107381 : Advance();
593 1107381 : if (current() == '?') {
594 26384 : switch (Next()) {
595 : case ':':
596 : Advance(2);
597 : subexpr_type = GROUPING;
598 19753 : break;
599 : case '=':
600 : Advance(2);
601 : lookaround_type = RegExpLookaround::LOOKAHEAD;
602 : subexpr_type = POSITIVE_LOOKAROUND;
603 1709 : break;
604 : case '!':
605 : Advance(2);
606 : lookaround_type = RegExpLookaround::LOOKAHEAD;
607 : subexpr_type = NEGATIVE_LOOKAROUND;
608 457 : break;
609 : case '-':
610 : case 'i':
611 : case 's':
612 : case 'm': {
613 1373 : if (!FLAG_regexp_mode_modifiers) {
614 4 : ReportError(CStrVector("Invalid group"));
615 4 : return nullptr;
616 : }
617 1369 : Advance();
618 : bool flags_sense = true; // Switching on flags.
619 3384 : while (subexpr_type != GROUPING) {
620 3126 : switch (current()) {
621 : case '-':
622 344 : if (!flags_sense) {
623 18 : ReportError(CStrVector("Multiple dashes in flag group"));
624 18 : return nullptr;
625 : }
626 : flags_sense = false;
627 326 : Advance();
628 326 : continue;
629 : case 's':
630 : case 'i':
631 : case 'm': {
632 : JSRegExp::Flags bit = JSRegExp::kUnicode;
633 1449 : if (current() == 'i') bit = JSRegExp::kIgnoreCase;
634 1449 : if (current() == 'm') bit = JSRegExp::kMultiline;
635 1449 : if (current() == 's') bit = JSRegExp::kDotAll;
636 1449 : if (((switch_on | switch_off) & bit) != 0) {
637 18 : ReportError(CStrVector("Repeated flag in flag group"));
638 : return nullptr;
639 : }
640 1431 : if (flags_sense) {
641 : switch_on |= bit;
642 : } else {
643 : switch_off |= bit;
644 : }
645 1431 : Advance();
646 : continue;
647 : }
648 : case ')': {
649 1075 : Advance();
650 : state->builder()
651 1075 : ->FlushText(); // Flush pending text using old flags.
652 : // These (?i)-style flag switches don't put us in a subexpression
653 : // at all, they just modify the flags in the rest of the current
654 : // subexpression.
655 : JSRegExp::Flags flags =
656 : (state->builder()->flags() | switch_on) & ~switch_off;
657 : state->builder()->set_flags(flags);
658 : return state;
659 : }
660 : case ':':
661 258 : Advance();
662 : subexpr_type = GROUPING; // Will break us out of the outer loop.
663 258 : continue;
664 : default:
665 0 : ReportError(CStrVector("Invalid flag group"));
666 0 : return nullptr;
667 : }
668 : }
669 : break;
670 : }
671 : case '<':
672 3073 : Advance();
673 3073 : if (Next() == '=') {
674 : Advance(2);
675 : lookaround_type = RegExpLookaround::LOOKBEHIND;
676 : subexpr_type = POSITIVE_LOOKAROUND;
677 1218 : break;
678 1855 : } else if (Next() == '!') {
679 : Advance(2);
680 : lookaround_type = RegExpLookaround::LOOKBEHIND;
681 : subexpr_type = NEGATIVE_LOOKAROUND;
682 245 : break;
683 : }
684 : is_named_capture = true;
685 1610 : has_named_captures_ = true;
686 1610 : Advance();
687 1610 : break;
688 : default:
689 19 : ReportError(CStrVector("Invalid group"));
690 19 : return nullptr;
691 : }
692 : }
693 1106247 : if (subexpr_type == CAPTURE) {
694 1082607 : if (captures_started_ >= kMaxCaptures) {
695 5 : ReportError(CStrVector("Too many captures"));
696 5 : return nullptr;
697 : }
698 1082602 : captures_started_++;
699 :
700 1082602 : if (is_named_capture) {
701 1610 : capture_name = ParseCaptureGroupName(CHECK_FAILED);
702 : }
703 : }
704 : JSRegExp::Flags flags = (state->builder()->flags() | switch_on) & ~switch_off;
705 : // Store current state and begin new disjunction parsing.
706 : return new (zone())
707 : RegExpParserState(state, subexpr_type, lookaround_type, captures_started_,
708 2211796 : capture_name, flags, zone());
709 : }
710 :
711 : #ifdef DEBUG
712 : // Currently only used in an DCHECK.
713 : static bool IsSpecialClassEscape(uc32 c) {
714 : switch (c) {
715 : case 'd':
716 : case 'D':
717 : case 's':
718 : case 'S':
719 : case 'w':
720 : case 'W':
721 : return true;
722 : default:
723 : return false;
724 : }
725 : }
726 : #endif
727 :
728 :
729 : // In order to know whether an escape is a backreference or not we have to scan
730 : // the entire regexp and find the number of capturing parentheses. However we
731 : // don't want to scan the regexp twice unless it is necessary. This mini-parser
732 : // is called when needed. It can see the difference between capturing and
733 : // noncapturing parentheses and can skip character classes and backslash-escaped
734 : // characters.
735 754 : void RegExpParser::ScanForCaptures() {
736 : DCHECK(!is_scanned_for_captures_);
737 : const int saved_position = position();
738 : // Start with captures started previous to current position
739 : int capture_count = captures_started();
740 : // Add count of captures after this position.
741 : int n;
742 5348 : while ((n = current()) != kEndMarker) {
743 4594 : Advance();
744 4594 : switch (n) {
745 : case '\\':
746 605 : Advance();
747 605 : break;
748 : case '[': {
749 : int c;
750 165 : while ((c = current()) != kEndMarker) {
751 156 : Advance();
752 156 : if (c == '\\') {
753 0 : Advance();
754 : } else {
755 156 : if (c == ']') break;
756 : }
757 : }
758 : break;
759 : }
760 : case '(':
761 571 : if (current() == '?') {
762 : // At this point we could be in
763 : // * a non-capturing group '(:',
764 : // * a lookbehind assertion '(?<=' '(?<!'
765 : // * or a named capture '(?<'.
766 : //
767 : // Of these, only named captures are capturing groups.
768 :
769 145 : Advance();
770 145 : if (current() != '<') break;
771 :
772 121 : Advance();
773 121 : if (current() == '=' || current() == '!') break;
774 :
775 : // Found a possible named capture. It could turn out to be a syntax
776 : // error (e.g. an unterminated or invalid name), but that distinction
777 : // does not matter for our purposes.
778 75 : has_named_captures_ = true;
779 : }
780 501 : capture_count++;
781 501 : break;
782 : }
783 : }
784 754 : capture_count_ = capture_count;
785 754 : is_scanned_for_captures_ = true;
786 : Reset(saved_position);
787 754 : }
788 :
789 :
790 5078 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
791 : DCHECK_EQ('\\', current());
792 : DCHECK('1' <= Next() && Next() <= '9');
793 : // Try to parse a decimal literal that is no greater than the total number
794 : // of left capturing parentheses in the input.
795 : int start = position();
796 5078 : int value = Next() - '0';
797 : Advance(2);
798 617 : while (true) {
799 : uc32 c = current();
800 5695 : if (IsDecimalDigit(c)) {
801 637 : value = 10 * value + (c - '0');
802 637 : if (value > kMaxCaptures) {
803 : Reset(start);
804 20 : return false;
805 : }
806 617 : Advance();
807 : } else {
808 : break;
809 : }
810 : }
811 5058 : if (value > captures_started()) {
812 847 : if (!is_scanned_for_captures_) ScanForCaptures();
813 847 : if (value > capture_count_) {
814 : Reset(start);
815 463 : return false;
816 : }
817 : }
818 4595 : *index_out = value;
819 4595 : return true;
820 : }
821 :
822 2425 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
823 2425 : if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
824 4770 : v->push_back(code_unit);
825 : } else {
826 80 : v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
827 80 : v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
828 : }
829 2425 : }
830 :
831 1965 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
832 : ZoneVector<uc16>* name =
833 1965 : new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
834 :
835 : bool at_start = true;
836 2425 : while (true) {
837 4390 : uc32 c = current();
838 4390 : Advance();
839 :
840 : // Convert unicode escapes.
841 4390 : if (c == '\\' && current() == 'u') {
842 230 : Advance();
843 230 : if (!ParseUnicodeEscape(&c)) {
844 59 : ReportError(CStrVector("Invalid Unicode escape sequence"));
845 59 : return nullptr;
846 : }
847 : }
848 :
849 : // The backslash char is misclassified as both ID_Start and ID_Continue.
850 4331 : if (c == '\\') {
851 18 : ReportError(CStrVector("Invalid capture group name"));
852 18 : return nullptr;
853 : }
854 :
855 4313 : if (at_start) {
856 1933 : if (!IsIdentifierStart(c)) {
857 100 : ReportError(CStrVector("Invalid capture group name"));
858 100 : return nullptr;
859 : }
860 1833 : push_code_unit(name, c);
861 : at_start = false;
862 : } else {
863 2380 : if (c == '>') {
864 : break;
865 809 : } else if (IsIdentifierPart(c)) {
866 592 : push_code_unit(name, c);
867 : } else {
868 217 : ReportError(CStrVector("Invalid capture group name"));
869 217 : return nullptr;
870 : }
871 : }
872 : }
873 :
874 1571 : return name;
875 : }
876 :
877 1257 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
878 : int index) {
879 : DCHECK(0 < index && index <= captures_started_);
880 : DCHECK_NOT_NULL(name);
881 :
882 1257 : if (named_captures_ == nullptr) {
883 933 : named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
884 : } else {
885 : // Check for duplicates and bail if we find any.
886 : // TODO(jgruber): O(n^2).
887 1020 : for (const auto& named_capture : *named_captures_) {
888 389 : if (*named_capture->name() == *name) {
889 41 : ReportError(CStrVector("Duplicate capture group name"));
890 41 : return false;
891 : }
892 : }
893 : }
894 :
895 1216 : RegExpCapture* capture = GetCapture(index);
896 : DCHECK_NULL(capture->name());
897 :
898 : capture->set_name(name);
899 1216 : named_captures_->Add(capture, zone());
900 :
901 1216 : return true;
902 : }
903 :
904 396 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
905 : RegExpParserState* state) {
906 : // The parser is assumed to be on the '<' in \k<name>.
907 396 : if (current() != '<') {
908 41 : ReportError(CStrVector("Invalid named reference"));
909 41 : return false;
910 : }
911 :
912 355 : Advance();
913 355 : const ZoneVector<uc16>* name = ParseCaptureGroupName();
914 355 : if (name == nullptr) {
915 : return false;
916 : }
917 :
918 305 : if (state->IsInsideCaptureGroup(name)) {
919 : builder->AddEmpty();
920 : } else {
921 : RegExpBackReference* atom =
922 285 : new (zone()) RegExpBackReference(builder->flags());
923 : atom->set_name(name);
924 :
925 285 : builder->AddAtom(atom);
926 :
927 285 : if (named_back_references_ == nullptr) {
928 : named_back_references_ =
929 220 : new (zone()) ZoneList<RegExpBackReference*>(1, zone());
930 : }
931 285 : named_back_references_->Add(atom, zone());
932 : }
933 :
934 : return true;
935 : }
936 :
937 377436 : void RegExpParser::PatchNamedBackReferences() {
938 377436 : if (named_back_references_ == nullptr) return;
939 :
940 220 : if (named_captures_ == nullptr) {
941 14 : ReportError(CStrVector("Invalid named capture referenced"));
942 14 : return;
943 : }
944 :
945 : // Look up and patch the actual capture for each named back reference.
946 : // TODO(jgruber): O(n^2), optimize if necessary.
947 :
948 967 : for (int i = 0; i < named_back_references_->length(); i++) {
949 271 : RegExpBackReference* ref = named_back_references_->at(i);
950 :
951 : int index = -1;
952 854 : for (const auto& capture : *named_captures_) {
953 341 : if (*capture->name() == *ref->name()) {
954 : index = capture->index();
955 185 : break;
956 : }
957 : }
958 :
959 271 : if (index == -1) {
960 86 : ReportError(CStrVector("Invalid named capture referenced"));
961 86 : return;
962 : }
963 :
964 185 : ref->set_capture(GetCapture(index));
965 : }
966 : }
967 :
968 1087949 : RegExpCapture* RegExpParser::GetCapture(int index) {
969 : // The index for the capture groups are one-based. Its index in the list is
970 : // zero-based.
971 : int know_captures =
972 1087949 : is_scanned_for_captures_ ? capture_count_ : captures_started_;
973 : DCHECK(index <= know_captures);
974 1087949 : if (captures_ == nullptr) {
975 13667 : captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
976 : }
977 5422063 : while (captures_->length() < know_captures) {
978 2164110 : captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
979 : }
980 2175898 : return captures_->at(index - 1);
981 : }
982 :
983 377336 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
984 378106 : if (named_captures_ == nullptr || named_captures_->is_empty())
985 376566 : return Handle<FixedArray>();
986 :
987 : Factory* factory = isolate()->factory();
988 :
989 770 : int len = named_captures_->length() * 2;
990 770 : Handle<FixedArray> array = factory->NewFixedArray(len);
991 :
992 4645 : for (int i = 0; i < named_captures_->length(); i++) {
993 1035 : RegExpCapture* capture = named_captures_->at(i);
994 : Vector<const uc16> capture_name(capture->name()->data(),
995 : capture->name()->size());
996 : // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
997 : // internalized so they can be used as property names in the 'exec' results.
998 1035 : Handle<String> name = factory->InternalizeTwoByteString(capture_name);
999 2070 : array->set(i * 2, *name);
1000 : array->set(i * 2 + 1, Smi::FromInt(capture->index()));
1001 : }
1002 :
1003 770 : return array;
1004 : }
1005 :
1006 0 : bool RegExpParser::HasNamedCaptures() {
1007 223 : if (has_named_captures_ || is_scanned_for_captures_) {
1008 : return has_named_captures_;
1009 : }
1010 :
1011 118 : ScanForCaptures();
1012 : DCHECK(is_scanned_for_captures_);
1013 118 : return has_named_captures_;
1014 : }
1015 :
1016 0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
1017 9564 : for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1018 5337 : if (s->group_type() != CAPTURE) continue;
1019 : // Return true if we found the matching capture index.
1020 570 : if (index == s->capture_index()) return true;
1021 : // Abort if index is larger than what has been parsed up till this state.
1022 477 : if (index > s->capture_index()) return false;
1023 : }
1024 : return false;
1025 : }
1026 :
1027 305 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
1028 : const ZoneVector<uc16>* name) {
1029 : DCHECK_NOT_NULL(name);
1030 895 : for (RegExpParserState* s = this; s != nullptr; s = s->previous_state()) {
1031 315 : if (s->capture_name() == nullptr) continue;
1032 30 : if (*s->capture_name() == *name) return true;
1033 : }
1034 : return false;
1035 : }
1036 :
1037 : // QuantifierPrefix ::
1038 : // { DecimalDigits }
1039 : // { DecimalDigits , }
1040 : // { DecimalDigits , DecimalDigits }
1041 : //
1042 : // Returns true if parsing succeeds, and set the min_out and max_out
1043 : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
1044 3730 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
1045 : DCHECK_EQ(current(), '{');
1046 : int start = position();
1047 3730 : Advance();
1048 : int min = 0;
1049 3730 : if (!IsDecimalDigit(current())) {
1050 : Reset(start);
1051 834 : return false;
1052 : }
1053 11558 : while (IsDecimalDigit(current())) {
1054 4456 : int next = current() - '0';
1055 4456 : if (min > (RegExpTree::kInfinity - next) / 10) {
1056 : // Overflow. Skip past remaining decimal digits and return -1.
1057 815 : do {
1058 815 : Advance();
1059 : } while (IsDecimalDigit(current()));
1060 : min = RegExpTree::kInfinity;
1061 : break;
1062 : }
1063 4331 : min = 10 * min + next;
1064 4331 : Advance();
1065 : }
1066 : int max = 0;
1067 2896 : if (current() == '}') {
1068 : max = min;
1069 1172 : Advance();
1070 1724 : } else if (current() == ',') {
1071 1686 : Advance();
1072 1686 : if (current() == '}') {
1073 : max = RegExpTree::kInfinity;
1074 334 : Advance();
1075 : } else {
1076 5618 : while (IsDecimalDigit(current())) {
1077 2193 : int next = current() - '0';
1078 2193 : if (max > (RegExpTree::kInfinity - next) / 10) {
1079 750 : do {
1080 750 : Advance();
1081 : } while (IsDecimalDigit(current()));
1082 : max = RegExpTree::kInfinity;
1083 : break;
1084 : }
1085 2133 : max = 10 * max + next;
1086 2133 : Advance();
1087 : }
1088 1352 : if (current() != '}') {
1089 : Reset(start);
1090 74 : return false;
1091 : }
1092 1278 : Advance();
1093 : }
1094 : } else {
1095 : Reset(start);
1096 38 : return false;
1097 : }
1098 2784 : *min_out = min;
1099 2784 : *max_out = max;
1100 2784 : return true;
1101 : }
1102 :
1103 :
1104 4253 : uc32 RegExpParser::ParseOctalLiteral() {
1105 : DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1106 : // For compatibility with some other browsers (not all), we parse
1107 : // up to three octal digits with a value below 256.
1108 : // ES#prod-annexB-LegacyOctalEscapeSequence
1109 4253 : uc32 value = current() - '0';
1110 4253 : Advance();
1111 4253 : if ('0' <= current() && current() <= '7') {
1112 1388 : value = value * 8 + current() - '0';
1113 1388 : Advance();
1114 1388 : if (value < 32 && '0' <= current() && current() <= '7') {
1115 1290 : value = value * 8 + current() - '0';
1116 1290 : Advance();
1117 : }
1118 : }
1119 4253 : return value;
1120 : }
1121 :
1122 :
1123 79064 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1124 : int start = position();
1125 : uc32 val = 0;
1126 699946 : for (int i = 0; i < length; ++i) {
1127 : uc32 c = current();
1128 : int d = HexValue(c);
1129 310815 : if (d < 0) {
1130 : Reset(start);
1131 374 : return false;
1132 : }
1133 310441 : val = val * 16 + d;
1134 310441 : Advance();
1135 : }
1136 78690 : *value = val;
1137 78690 : return true;
1138 : }
1139 :
1140 : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1141 83916 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1142 : // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1143 : // allowed). In the latter case, the number of hex digits between { } is
1144 : // arbitrary. \ and u have already been read.
1145 91474 : if (current() == '{' && unicode()) {
1146 : int start = position();
1147 7458 : Advance();
1148 7458 : if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1149 7431 : if (current() == '}') {
1150 7426 : Advance();
1151 7426 : return true;
1152 : }
1153 : }
1154 : Reset(start);
1155 32 : return false;
1156 : }
1157 : // \u but no {, or \u{...} escapes not allowed.
1158 76458 : bool result = ParseHexEscape(4, value);
1159 226665 : if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1160 : current() == '\\') {
1161 : // Attempt to read trail surrogate.
1162 : int start = position();
1163 205 : if (Next() == 'u') {
1164 : Advance(2);
1165 : uc32 trail;
1166 380 : if (ParseHexEscape(4, &trail) &&
1167 185 : unibrow::Utf16::IsTrailSurrogate(trail)) {
1168 185 : *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1169 185 : static_cast<uc16>(trail));
1170 185 : return true;
1171 : }
1172 : }
1173 : Reset(start);
1174 : }
1175 : return result;
1176 : }
1177 :
1178 : #ifdef V8_INTL_SUPPORT
1179 :
1180 : namespace {
1181 :
1182 2708 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1183 2708 : const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1184 2708 : if (short_name != nullptr && strcmp(property_name, short_name) == 0)
1185 : return true;
1186 : for (int i = 0;; i++) {
1187 806 : const char* long_name = u_getPropertyName(
1188 1612 : property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1189 806 : if (long_name == nullptr) break;
1190 743 : if (strcmp(property_name, long_name) == 0) return true;
1191 : }
1192 : return false;
1193 : }
1194 :
1195 3153 : bool IsExactPropertyValueAlias(const char* property_value_name,
1196 : UProperty property, int32_t property_value) {
1197 : const char* short_name =
1198 3153 : u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1199 3153 : if (short_name != nullptr && strcmp(property_value_name, short_name) == 0) {
1200 : return true;
1201 : }
1202 : for (int i = 0;; i++) {
1203 310 : const char* long_name = u_getPropertyValueName(
1204 : property, property_value,
1205 620 : static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1206 310 : if (long_name == nullptr) break;
1207 310 : if (strcmp(property_value_name, long_name) == 0) return true;
1208 : }
1209 : return false;
1210 : }
1211 :
1212 5216 : bool LookupPropertyValueName(UProperty property,
1213 : const char* property_value_name, bool negate,
1214 : ZoneList<CharacterRange>* result, Zone* zone) {
1215 : UProperty property_for_lookup = property;
1216 5216 : if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1217 : // For the property Script_Extensions, we have to do the property value
1218 : // name lookup as if the property is Script.
1219 : property_for_lookup = UCHAR_SCRIPT;
1220 : }
1221 : int32_t property_value =
1222 5216 : u_getPropertyValueEnum(property_for_lookup, property_value_name);
1223 5216 : if (property_value == UCHAR_INVALID_CODE) return false;
1224 :
1225 : // We require the property name to match exactly to one of the property value
1226 : // aliases. However, u_getPropertyValueEnum uses loose matching.
1227 3153 : if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1228 : property_value)) {
1229 : return false;
1230 : }
1231 :
1232 3153 : UErrorCode ec = U_ZERO_ERROR;
1233 6306 : icu::UnicodeSet set;
1234 3153 : set.applyIntPropertyValue(property, property_value, ec);
1235 3153 : bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1236 :
1237 3153 : if (success) {
1238 3153 : set.removeAllStrings();
1239 3153 : if (negate) set.complement();
1240 1170897 : for (int i = 0; i < set.getRangeCount(); i++) {
1241 : result->Add(
1242 1167744 : CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1243 583872 : zone);
1244 : }
1245 : }
1246 : return success;
1247 : }
1248 :
1249 : template <size_t N>
1250 : inline bool NameEquals(const char* name, const char (&literal)[N]) {
1251 6194 : return strncmp(name, literal, N + 1) == 0;
1252 : }
1253 :
1254 2063 : bool LookupSpecialPropertyValueName(const char* name,
1255 : ZoneList<CharacterRange>* result,
1256 : bool negate, Zone* zone) {
1257 2063 : if (NameEquals(name, "Any")) {
1258 135 : if (negate) {
1259 : // Leave the list of character ranges empty, since the negation of 'Any'
1260 : // is the empty set.
1261 : } else {
1262 40 : result->Add(CharacterRange::Everything(), zone);
1263 : }
1264 1928 : } else if (NameEquals(name, "ASCII")) {
1265 210 : result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1266 : : CharacterRange::Range(0x0, 0x7F),
1267 105 : zone);
1268 1823 : } else if (NameEquals(name, "Assigned")) {
1269 75 : return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1270 150 : !negate, result, zone);
1271 : } else {
1272 : return false;
1273 : }
1274 : return true;
1275 : }
1276 :
1277 : // Explicitly whitelist supported binary properties. The spec forbids supporting
1278 : // properties outside of this set to ensure interoperability.
1279 1748 : bool IsSupportedBinaryProperty(UProperty property) {
1280 1748 : switch (property) {
1281 : case UCHAR_ALPHABETIC:
1282 : // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1283 : // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1284 : case UCHAR_ASCII_HEX_DIGIT:
1285 : // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1286 : case UCHAR_BIDI_CONTROL:
1287 : case UCHAR_BIDI_MIRRORED:
1288 : case UCHAR_CASE_IGNORABLE:
1289 : case UCHAR_CASED:
1290 : case UCHAR_CHANGES_WHEN_CASEFOLDED:
1291 : case UCHAR_CHANGES_WHEN_CASEMAPPED:
1292 : case UCHAR_CHANGES_WHEN_LOWERCASED:
1293 : case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1294 : case UCHAR_CHANGES_WHEN_TITLECASED:
1295 : case UCHAR_CHANGES_WHEN_UPPERCASED:
1296 : case UCHAR_DASH:
1297 : case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1298 : case UCHAR_DEPRECATED:
1299 : case UCHAR_DIACRITIC:
1300 : case UCHAR_EMOJI:
1301 : case UCHAR_EMOJI_COMPONENT:
1302 : case UCHAR_EMOJI_MODIFIER_BASE:
1303 : case UCHAR_EMOJI_MODIFIER:
1304 : case UCHAR_EMOJI_PRESENTATION:
1305 : case UCHAR_EXTENDED_PICTOGRAPHIC:
1306 : case UCHAR_EXTENDER:
1307 : case UCHAR_GRAPHEME_BASE:
1308 : case UCHAR_GRAPHEME_EXTEND:
1309 : case UCHAR_HEX_DIGIT:
1310 : case UCHAR_ID_CONTINUE:
1311 : case UCHAR_ID_START:
1312 : case UCHAR_IDEOGRAPHIC:
1313 : case UCHAR_IDS_BINARY_OPERATOR:
1314 : case UCHAR_IDS_TRINARY_OPERATOR:
1315 : case UCHAR_JOIN_CONTROL:
1316 : case UCHAR_LOGICAL_ORDER_EXCEPTION:
1317 : case UCHAR_LOWERCASE:
1318 : case UCHAR_MATH:
1319 : case UCHAR_NONCHARACTER_CODE_POINT:
1320 : case UCHAR_PATTERN_SYNTAX:
1321 : case UCHAR_PATTERN_WHITE_SPACE:
1322 : case UCHAR_QUOTATION_MARK:
1323 : case UCHAR_RADICAL:
1324 : case UCHAR_REGIONAL_INDICATOR:
1325 : case UCHAR_S_TERM:
1326 : case UCHAR_SOFT_DOTTED:
1327 : case UCHAR_TERMINAL_PUNCTUATION:
1328 : case UCHAR_UNIFIED_IDEOGRAPH:
1329 : case UCHAR_UPPERCASE:
1330 : case UCHAR_VARIATION_SELECTOR:
1331 : case UCHAR_WHITE_SPACE:
1332 : case UCHAR_XID_CONTINUE:
1333 : case UCHAR_XID_START:
1334 : return true;
1335 : default:
1336 : break;
1337 : }
1338 843 : return false;
1339 : }
1340 :
1341 : bool IsUnicodePropertyValueCharacter(char c) {
1342 : // https://tc39.github.io/proposal-regexp-unicode-property-escapes/
1343 : //
1344 : // Note that using this to validate each parsed char is quite conservative.
1345 : // A possible alternative solution would be to only ensure the parsed
1346 : // property name/value candidate string does not contain '\0' characters and
1347 : // let ICU lookups trigger the final failure.
1348 39082 : if ('a' <= c && c <= 'z') return true;
1349 10168 : if ('A' <= c && c <= 'Z') return true;
1350 1989 : if ('0' <= c && c <= '9') return true;
1351 1971 : return (c == '_');
1352 : }
1353 :
1354 : } // anonymous namespace
1355 :
1356 4547 : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1357 : std::vector<char>* name_2) {
1358 : DCHECK(name_1->empty());
1359 : DCHECK(name_2->empty());
1360 : // Parse the property class as follows:
1361 : // - In \p{name}, 'name' is interpreted
1362 : // - either as a general category property value name.
1363 : // - or as a binary property name.
1364 : // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1365 : // and 'value' is interpreted as one of the available property value names.
1366 : // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1367 : // - Loose matching is not applied.
1368 4547 : if (current() == '{') {
1369 : // Parse \p{[PropertyName=]PropertyNameValue}
1370 35602 : for (Advance(); current() != '}' && current() != '='; Advance()) {
1371 62398 : if (!IsUnicodePropertyValueCharacter(current())) return false;
1372 31136 : if (!has_next()) return false;
1373 62272 : name_1->push_back(static_cast<char>(current()));
1374 : }
1375 4403 : if (current() == '=') {
1376 9686 : for (Advance(); current() != '}'; Advance()) {
1377 15766 : if (!IsUnicodePropertyValueCharacter(current())) return false;
1378 7874 : if (!has_next()) return false;
1379 15748 : name_2->push_back(static_cast<char>(current()));
1380 : }
1381 3606 : name_2->push_back(0); // null-terminate string.
1382 : }
1383 : } else {
1384 : return false;
1385 : }
1386 4394 : Advance();
1387 8788 : name_1->push_back(0); // null-terminate string.
1388 :
1389 : DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
1390 : DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
1391 4394 : return true;
1392 : }
1393 :
1394 4394 : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1395 : bool negate,
1396 : const std::vector<char>& name_1,
1397 : const std::vector<char>& name_2) {
1398 4394 : if (name_2.empty()) {
1399 : // First attempt to interpret as general category property value name.
1400 : const char* name = name_1.data();
1401 2591 : if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1402 : add_to, zone())) {
1403 : return true;
1404 : }
1405 : // Interpret "Any", "ASCII", and "Assigned".
1406 2063 : if (LookupSpecialPropertyValueName(name, add_to, negate, zone())) {
1407 : return true;
1408 : }
1409 : // Then attempt to interpret as binary property name with value name 'Y'.
1410 1748 : UProperty property = u_getPropertyEnum(name);
1411 1748 : if (!IsSupportedBinaryProperty(property)) return false;
1412 905 : if (!IsExactPropertyAlias(name, property)) return false;
1413 905 : return LookupPropertyValueName(property, negate ? "N" : "Y", false, add_to,
1414 905 : zone());
1415 : } else {
1416 : // Both property name and value name are specified. Attempt to interpret
1417 : // the property name as enumerated property.
1418 : const char* property_name = name_1.data();
1419 : const char* value_name = name_2.data();
1420 1803 : UProperty property = u_getPropertyEnum(property_name);
1421 1803 : if (!IsExactPropertyAlias(property_name, property)) return false;
1422 1740 : if (property == UCHAR_GENERAL_CATEGORY) {
1423 : // We want to allow aggregate value names such as "Letter".
1424 : property = UCHAR_GENERAL_CATEGORY_MASK;
1425 3400 : } else if (property != UCHAR_SCRIPT &&
1426 1700 : property != UCHAR_SCRIPT_EXTENSIONS) {
1427 : return false;
1428 : }
1429 1605 : return LookupPropertyValueName(property, value_name, negate, add_to,
1430 1605 : zone());
1431 : }
1432 : }
1433 :
1434 510 : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name_1) {
1435 510 : if (!FLAG_harmony_regexp_sequence) return nullptr;
1436 : const char* name = name_1.data();
1437 : const uc32* sequence_list = nullptr;
1438 : JSRegExp::Flags flags = JSRegExp::kUnicode;
1439 150 : if (NameEquals(name, "Emoji_Flag_Sequence")) {
1440 : sequence_list = UnicodePropertySequences::kEmojiFlagSequences;
1441 90 : } else if (NameEquals(name, "Emoji_Tag_Sequence")) {
1442 : sequence_list = UnicodePropertySequences::kEmojiTagSequences;
1443 70 : } else if (NameEquals(name, "Emoji_ZWJ_Sequence")) {
1444 : sequence_list = UnicodePropertySequences::kEmojiZWJSequences;
1445 : }
1446 150 : if (sequence_list != nullptr) {
1447 : // TODO(yangguo): this creates huge regexp code. Alternative to this is
1448 : // to create a new operator that checks for these sequences at runtime.
1449 : RegExpBuilder builder(zone(), flags);
1450 : while (true) { // Iterate through list of sequences.
1451 267120 : while (*sequence_list != 0) { // Iterate through sequence.
1452 116760 : builder.AddUnicodeCharacter(*sequence_list);
1453 116760 : sequence_list++;
1454 : }
1455 33600 : sequence_list++;
1456 33600 : if (*sequence_list == 0) break;
1457 : builder.NewAlternative();
1458 : }
1459 100 : return builder.ToRegExp();
1460 : }
1461 :
1462 50 : if (NameEquals(name, "Emoji_Keycap_Sequence")) {
1463 : // https://unicode.org/reports/tr51/#def_emoji_keycap_sequence
1464 : // emoji_keycap_sequence := [0-9#*] \x{FE0F 20E3}
1465 : RegExpBuilder builder(zone(), flags);
1466 : ZoneList<CharacterRange>* prefix_ranges =
1467 : new (zone()) ZoneList<CharacterRange>(2, zone());
1468 30 : prefix_ranges->Add(CharacterRange::Range('0', '9'), zone());
1469 30 : prefix_ranges->Add(CharacterRange::Singleton('#'), zone());
1470 30 : prefix_ranges->Add(CharacterRange::Singleton('*'), zone());
1471 60 : builder.AddCharacterClass(
1472 30 : new (zone()) RegExpCharacterClass(zone(), prefix_ranges, flags));
1473 30 : builder.AddCharacter(0xFE0F);
1474 30 : builder.AddCharacter(0x20E3);
1475 30 : return builder.ToRegExp();
1476 20 : } else if (NameEquals(name, "Emoji_Modifier_Sequence")) {
1477 : // https://unicode.org/reports/tr51/#def_emoji_modifier_sequence
1478 : // emoji_modifier_sequence := emoji_modifier_base emoji_modifier
1479 : RegExpBuilder builder(zone(), flags);
1480 : ZoneList<CharacterRange>* modifier_base_ranges =
1481 : new (zone()) ZoneList<CharacterRange>(2, zone());
1482 : LookupPropertyValueName(UCHAR_EMOJI_MODIFIER_BASE, "Y", false,
1483 20 : modifier_base_ranges, zone());
1484 40 : builder.AddCharacterClass(
1485 20 : new (zone()) RegExpCharacterClass(zone(), modifier_base_ranges, flags));
1486 : ZoneList<CharacterRange>* modifier_ranges =
1487 : new (zone()) ZoneList<CharacterRange>(2, zone());
1488 : LookupPropertyValueName(UCHAR_EMOJI_MODIFIER, "Y", false, modifier_ranges,
1489 20 : zone());
1490 40 : builder.AddCharacterClass(
1491 20 : new (zone()) RegExpCharacterClass(zone(), modifier_ranges, flags));
1492 20 : return builder.ToRegExp();
1493 : }
1494 :
1495 : return nullptr;
1496 : }
1497 :
1498 : #else // V8_INTL_SUPPORT
1499 :
1500 : bool RegExpParser::ParsePropertyClassName(std::vector<char>* name_1,
1501 : std::vector<char>* name_2) {
1502 : return false;
1503 : }
1504 :
1505 : bool RegExpParser::AddPropertyClassRange(ZoneList<CharacterRange>* add_to,
1506 : bool negate,
1507 : const std::vector<char>& name_1,
1508 : const std::vector<char>& name_2) {
1509 : return false;
1510 : }
1511 :
1512 : RegExpTree* RegExpParser::GetPropertySequence(const std::vector<char>& name) {
1513 : return nullptr;
1514 : }
1515 :
1516 : #endif // V8_INTL_SUPPORT
1517 :
1518 7458 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1519 : uc32 x = 0;
1520 : int d = HexValue(current());
1521 7458 : if (d < 0) {
1522 : return false;
1523 : }
1524 81648 : while (d >= 0) {
1525 37122 : x = x * 16 + d;
1526 37122 : if (x > max_value) {
1527 : return false;
1528 : }
1529 37095 : Advance();
1530 : d = HexValue(current());
1531 : }
1532 7431 : *value = x;
1533 7431 : return true;
1534 : }
1535 :
1536 :
1537 88535 : uc32 RegExpParser::ParseClassCharacterEscape() {
1538 : DCHECK_EQ('\\', current());
1539 : DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1540 88535 : Advance();
1541 88535 : switch (current()) {
1542 : case 'b':
1543 14 : Advance();
1544 14 : return '\b';
1545 : // ControlEscape :: one of
1546 : // f n r t v
1547 : case 'f':
1548 0 : Advance();
1549 0 : return '\f';
1550 : case 'n':
1551 471 : Advance();
1552 471 : return '\n';
1553 : case 'r':
1554 48 : Advance();
1555 48 : return '\r';
1556 : case 't':
1557 186 : Advance();
1558 186 : return '\t';
1559 : case 'v':
1560 1 : Advance();
1561 1 : return '\v';
1562 : case 'c': {
1563 : uc32 controlLetter = Next();
1564 290 : uc32 letter = controlLetter & ~('A' ^ 'a');
1565 : // Inside a character class, we also accept digits and underscore as
1566 : // control characters, unless with /u. See Annex B:
1567 : // ES#prod-annexB-ClassControlLetter
1568 290 : if (letter >= 'A' && letter <= 'Z') {
1569 : Advance(2);
1570 : // Control letters mapped to ASCII control characters in the range
1571 : // 0x00-0x1F.
1572 30 : return controlLetter & 0x1F;
1573 : }
1574 260 : if (unicode()) {
1575 : // With /u, invalid escapes are not treated as identity escapes.
1576 18 : ReportError(CStrVector("Invalid class escape"));
1577 18 : return 0;
1578 : }
1579 484 : if ((controlLetter >= '0' && controlLetter <= '9') ||
1580 242 : controlLetter == '_') {
1581 : Advance(2);
1582 160 : return controlLetter & 0x1F;
1583 : }
1584 : // We match JSC in reading the backslash as a literal
1585 : // character instead of as starting an escape.
1586 : // TODO(v8:6201): Not yet covered by the spec.
1587 : return '\\';
1588 : }
1589 : case '0':
1590 : // With /u, \0 is interpreted as NUL if not followed by another digit.
1591 1270 : if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1592 130 : Advance();
1593 130 : return 0;
1594 : }
1595 : V8_FALLTHROUGH;
1596 : case '1':
1597 : case '2':
1598 : case '3':
1599 : case '4':
1600 : case '5':
1601 : case '6':
1602 : case '7':
1603 : // For compatibility, we interpret a decimal escape that isn't
1604 : // a back reference (and therefore either \0 or not valid according
1605 : // to the specification) as a 1..3 digit octal character code.
1606 : // ES#prod-annexB-LegacyOctalEscapeSequence
1607 1098 : if (unicode()) {
1608 : // With /u, decimal escape is not interpreted as octal character code.
1609 36 : ReportError(CStrVector("Invalid class escape"));
1610 36 : return 0;
1611 : }
1612 1062 : return ParseOctalLiteral();
1613 : case 'x': {
1614 2165 : Advance();
1615 : uc32 value;
1616 2165 : if (ParseHexEscape(2, &value)) return value;
1617 30 : if (unicode()) {
1618 : // With /u, invalid escapes are not treated as identity escapes.
1619 0 : ReportError(CStrVector("Invalid escape"));
1620 0 : return 0;
1621 : }
1622 : // If \x is not followed by a two-digit hexadecimal, treat it
1623 : // as an identity escape.
1624 : return 'x';
1625 : }
1626 : case 'u': {
1627 81679 : Advance();
1628 : uc32 value;
1629 81679 : if (ParseUnicodeEscape(&value)) return value;
1630 20 : if (unicode()) {
1631 : // With /u, invalid escapes are not treated as identity escapes.
1632 0 : ReportError(CStrVector("Invalid unicode escape"));
1633 0 : return 0;
1634 : }
1635 : // If \u is not followed by a two-digit hexadecimal, treat it
1636 : // as an identity escape.
1637 : return 'u';
1638 : }
1639 : default: {
1640 : uc32 result = current();
1641 : // With /u, no identity escapes except for syntax characters and '-' are
1642 : // allowed. Otherwise, all identity escapes are allowed.
1643 2593 : if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1644 2453 : Advance();
1645 2453 : return result;
1646 : }
1647 0 : ReportError(CStrVector("Invalid escape"));
1648 0 : return 0;
1649 : }
1650 : }
1651 : return 0;
1652 : }
1653 :
1654 471188 : void RegExpParser::ParseClassEscape(ZoneList<CharacterRange>* ranges,
1655 : Zone* zone,
1656 : bool add_unicode_case_equivalents,
1657 : uc32* char_out, bool* is_class_escape) {
1658 : uc32 current_char = current();
1659 471188 : if (current_char == '\\') {
1660 90784 : switch (Next()) {
1661 : case 'w':
1662 : case 'W':
1663 : case 'd':
1664 : case 'D':
1665 : case 's':
1666 : case 'S': {
1667 3382 : CharacterRange::AddClassEscape(static_cast<char>(Next()), ranges,
1668 1691 : add_unicode_case_equivalents, zone);
1669 : Advance(2);
1670 1691 : *is_class_escape = true;
1671 1691 : return;
1672 : }
1673 : case kEndMarker:
1674 0 : ReportError(CStrVector("\\ at end of pattern"));
1675 0 : return;
1676 : case 'p':
1677 : case 'P':
1678 620 : if (unicode()) {
1679 558 : bool negate = Next() == 'P';
1680 : Advance(2);
1681 : std::vector<char> name_1, name_2;
1682 1089 : if (!ParsePropertyClassName(&name_1, &name_2) ||
1683 531 : !AddPropertyClassRange(ranges, negate, name_1, name_2)) {
1684 225 : ReportError(CStrVector("Invalid property name in character class"));
1685 : }
1686 558 : *is_class_escape = true;
1687 : return;
1688 : }
1689 : break;
1690 : default:
1691 : break;
1692 : }
1693 88535 : *char_out = ParseClassCharacterEscape();
1694 88535 : *is_class_escape = false;
1695 : } else {
1696 380404 : Advance();
1697 380404 : *char_out = current_char;
1698 380404 : *is_class_escape = false;
1699 : }
1700 : }
1701 :
1702 155232 : RegExpTree* RegExpParser::ParseCharacterClass(const RegExpBuilder* builder) {
1703 : static const char* kUnterminated = "Unterminated character class";
1704 : static const char* kRangeInvalid = "Invalid character class";
1705 : static const char* kRangeOutOfOrder = "Range out of order in character class";
1706 :
1707 : DCHECK_EQ(current(), '[');
1708 155232 : Advance();
1709 : bool is_negated = false;
1710 155232 : if (current() == '^') {
1711 : is_negated = true;
1712 10400 : Advance();
1713 : }
1714 : ZoneList<CharacterRange>* ranges =
1715 : new (zone()) ZoneList<CharacterRange>(2, zone());
1716 156929 : bool add_unicode_case_equivalents = unicode() && builder->ignore_case();
1717 451248 : while (has_more() && current() != ']') {
1718 : uc32 char_1, char_2;
1719 : bool is_class_1, is_class_2;
1720 296444 : ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
1721 296800 : &is_class_1 CHECK_FAILED);
1722 296165 : if (current() == '-') {
1723 174816 : Advance();
1724 174816 : if (current() == kEndMarker) {
1725 : // If we reach the end we break out of the loop and let the
1726 : // following code report an error.
1727 : break;
1728 174811 : } else if (current() == ']') {
1729 67 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1730 67 : ranges->Add(CharacterRange::Singleton('-'), zone());
1731 67 : break;
1732 : }
1733 : ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
1734 174744 : &is_class_2 CHECK_FAILED);
1735 174744 : if (is_class_1 || is_class_2) {
1736 : // Either end is an escaped character class. Treat the '-' verbatim.
1737 176 : if (unicode()) {
1738 : // ES2015 21.2.2.15.1 step 1.
1739 54 : return ReportError(CStrVector(kRangeInvalid));
1740 : }
1741 149 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1742 149 : ranges->Add(CharacterRange::Singleton('-'), zone());
1743 149 : if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
1744 149 : continue;
1745 : }
1746 : // ES2015 21.2.2.15.1 step 6.
1747 174568 : if (char_1 > char_2) {
1748 100 : return ReportError(CStrVector(kRangeOutOfOrder));
1749 : }
1750 174518 : ranges->Add(CharacterRange::Range(char_1, char_2), zone());
1751 : } else {
1752 121349 : if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
1753 : }
1754 : }
1755 154876 : if (!has_more()) {
1756 40 : return ReportError(CStrVector(kUnterminated));
1757 : }
1758 154856 : Advance();
1759 : RegExpCharacterClass::CharacterClassFlags character_class_flags;
1760 154856 : if (is_negated) character_class_flags = RegExpCharacterClass::NEGATED;
1761 309712 : return new (zone()) RegExpCharacterClass(zone(), ranges, builder->flags(),
1762 309712 : character_class_flags);
1763 : }
1764 :
1765 :
1766 : #undef CHECK_FAILED
1767 :
1768 :
1769 380302 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1770 : FlatStringReader* input, JSRegExp::Flags flags,
1771 : RegExpCompileData* result) {
1772 : DCHECK(result != nullptr);
1773 380302 : RegExpParser parser(input, &result->error, flags, isolate, zone);
1774 380302 : RegExpTree* tree = parser.ParsePattern();
1775 380302 : if (parser.failed()) {
1776 : DCHECK(tree == nullptr);
1777 : DCHECK(!result->error.is_null());
1778 : } else {
1779 : DCHECK(tree != nullptr);
1780 : DCHECK(result->error.is_null());
1781 : if (FLAG_trace_regexp_parser) {
1782 : StdoutStream os;
1783 : tree->Print(os, zone);
1784 : os << "\n";
1785 : }
1786 377336 : result->tree = tree;
1787 : int capture_count = parser.captures_started();
1788 583669 : result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1789 377336 : result->contains_anchor = parser.contains_anchor();
1790 377336 : result->capture_name_map = parser.CreateCaptureNameMap();
1791 377336 : result->capture_count = capture_count;
1792 : }
1793 380302 : return !parser.failed();
1794 : }
1795 :
1796 0 : RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1797 : : zone_(zone),
1798 : pending_empty_(false),
1799 : flags_(flags),
1800 : characters_(nullptr),
1801 : pending_surrogate_(kNoPendingSurrogate),
1802 : terms_(),
1803 1486350 : alternatives_()
1804 : #ifdef DEBUG
1805 : ,
1806 : last_added_(ADD_NONE)
1807 : #endif
1808 : {
1809 0 : }
1810 :
1811 :
1812 0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1813 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1814 : FlushPendingSurrogate();
1815 : // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1816 73918 : pending_surrogate_ = lead_surrogate;
1817 0 : }
1818 :
1819 :
1820 73858 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1821 : DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1822 73858 : if (pending_surrogate_ != kNoPendingSurrogate) {
1823 73668 : uc16 lead_surrogate = pending_surrogate_;
1824 73668 : pending_surrogate_ = kNoPendingSurrogate;
1825 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1826 : uc32 combined =
1827 73668 : unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1828 73668 : if (NeedsDesugaringForIgnoreCase(combined)) {
1829 60 : AddCharacterClassForDesugaring(combined);
1830 : } else {
1831 : ZoneList<uc16> surrogate_pair(2, zone());
1832 73608 : surrogate_pair.Add(lead_surrogate, zone());
1833 73608 : surrogate_pair.Add(trail_surrogate, zone());
1834 : RegExpAtom* atom =
1835 73608 : new (zone()) RegExpAtom(surrogate_pair.ToConstVector(), flags_);
1836 73608 : AddAtom(atom);
1837 : }
1838 : } else {
1839 190 : pending_surrogate_ = trail_surrogate;
1840 : FlushPendingSurrogate();
1841 : }
1842 73858 : }
1843 :
1844 :
1845 0 : void RegExpBuilder::FlushPendingSurrogate() {
1846 389388818 : if (pending_surrogate_ != kNoPendingSurrogate) {
1847 : DCHECK(unicode());
1848 440 : uc32 c = pending_surrogate_;
1849 440 : pending_surrogate_ = kNoPendingSurrogate;
1850 440 : AddCharacterClassForDesugaring(c);
1851 : }
1852 0 : }
1853 :
1854 :
1855 4460554 : void RegExpBuilder::FlushCharacters() {
1856 : FlushPendingSurrogate();
1857 4460554 : pending_empty_ = false;
1858 4460554 : if (characters_ != nullptr) {
1859 : RegExpTree* atom =
1860 402644 : new (zone()) RegExpAtom(characters_->ToConstVector(), flags_);
1861 402644 : characters_ = nullptr;
1862 402644 : text_.Add(atom, zone());
1863 : LAST(ADD_ATOM);
1864 : }
1865 4460554 : }
1866 :
1867 :
1868 4207944 : void RegExpBuilder::FlushText() {
1869 4207944 : FlushCharacters();
1870 : int num_text = text_.length();
1871 4207944 : if (num_text == 0) {
1872 : return;
1873 570061 : } else if (num_text == 1) {
1874 532525 : terms_.Add(text_.last(), zone());
1875 : } else {
1876 : RegExpText* text = new (zone()) RegExpText(zone());
1877 251548 : for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1878 37536 : terms_.Add(text, zone());
1879 : }
1880 : text_.Clear();
1881 : }
1882 :
1883 :
1884 383398502 : void RegExpBuilder::AddCharacter(uc16 c) {
1885 : FlushPendingSurrogate();
1886 383398502 : pending_empty_ = false;
1887 383398502 : if (NeedsDesugaringForIgnoreCase(c)) {
1888 905 : AddCharacterClassForDesugaring(c);
1889 : } else {
1890 383397597 : if (characters_ == nullptr) {
1891 1831190 : characters_ = new (zone()) ZoneList<uc16>(4, zone());
1892 : }
1893 383397597 : characters_->Add(c, zone());
1894 : LAST(ADD_CHAR);
1895 : }
1896 383398502 : }
1897 :
1898 :
1899 383460099 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1900 383460099 : if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1901 : DCHECK(unicode());
1902 73668 : AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1903 73668 : AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1904 383433383 : } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1905 : AddLeadSurrogate(c);
1906 383432883 : } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1907 190 : AddTrailSurrogate(c);
1908 : } else {
1909 383385991 : AddCharacter(static_cast<uc16>(c));
1910 : }
1911 383460099 : }
1912 :
1913 1819 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1914 : // A lead or trail surrogate parsed via escape sequence will not
1915 : // pair up with any preceding lead or following trail surrogate.
1916 : FlushPendingSurrogate();
1917 1819 : AddUnicodeCharacter(character);
1918 : FlushPendingSurrogate();
1919 1819 : }
1920 :
1921 113 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1922 :
1923 :
1924 183768 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1925 183768 : if (NeedsDesugaringForUnicode(cc)) {
1926 : // With /u, character class needs to be desugared, so it
1927 : // must be a standalone term instead of being part of a RegExpText.
1928 : AddTerm(cc);
1929 : } else {
1930 178972 : AddAtom(cc);
1931 : }
1932 183768 : }
1933 :
1934 1405 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1935 2810 : AddTerm(new (zone()) RegExpCharacterClass(
1936 : zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c)),
1937 1405 : flags_));
1938 1405 : }
1939 :
1940 :
1941 1363157 : void RegExpBuilder::AddAtom(RegExpTree* term) {
1942 1363157 : if (term->IsEmpty()) {
1943 : AddEmpty();
1944 : return;
1945 : }
1946 1363157 : if (term->IsTextElement()) {
1947 252610 : FlushCharacters();
1948 252610 : text_.Add(term, zone());
1949 : } else {
1950 1110547 : FlushText();
1951 1110547 : terms_.Add(term, zone());
1952 : }
1953 : LAST(ADD_ATOM);
1954 : }
1955 :
1956 :
1957 0 : void RegExpBuilder::AddTerm(RegExpTree* term) {
1958 6201 : FlushText();
1959 6201 : terms_.Add(term, zone());
1960 : LAST(ADD_ATOM);
1961 0 : }
1962 :
1963 :
1964 11246 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1965 11246 : FlushText();
1966 11246 : if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1967 : // Omit repeated assertions of the same type.
1968 199 : RegExpAssertion* last = terms_.last()->AsAssertion();
1969 199 : RegExpAssertion* next = assert->AsAssertion();
1970 199 : if (last->assertion_type() == next->assertion_type()) return;
1971 : }
1972 11166 : terms_.Add(assert, zone());
1973 : LAST(ADD_ASSERT);
1974 : }
1975 :
1976 :
1977 150826 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
1978 :
1979 :
1980 1634365 : void RegExpBuilder::FlushTerms() {
1981 1634365 : FlushText();
1982 : int num_terms = terms_.length();
1983 : RegExpTree* alternative;
1984 1634365 : if (num_terms == 0) {
1985 329988 : alternative = new (zone()) RegExpEmpty();
1986 1304377 : } else if (num_terms == 1) {
1987 : alternative = terms_.last();
1988 : } else {
1989 81794 : alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1990 : }
1991 1634365 : alternatives_.Add(alternative, zone());
1992 : terms_.Clear();
1993 : LAST(ADD_NONE);
1994 1634365 : }
1995 :
1996 :
1997 183768 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1998 183768 : if (!unicode()) return false;
1999 : // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
2000 : // necessarily mean that we need to desugar. It's probably nicer to have a
2001 : // separate pass to figure out unicode desugarings.
2002 6084 : if (ignore_case()) return true;
2003 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2004 5256 : CharacterRange::Canonicalize(ranges);
2005 13589 : for (int i = ranges->length() - 1; i >= 0; i--) {
2006 : uc32 from = ranges->at(i).from();
2007 : uc32 to = ranges->at(i).to();
2008 : // Check for non-BMP characters.
2009 12301 : if (to >= kNonBmpStart) return true;
2010 : // Check for lone surrogates.
2011 8443 : if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
2012 : }
2013 : return false;
2014 : }
2015 :
2016 :
2017 383472170 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
2018 : #ifdef V8_INTL_SUPPORT
2019 383592420 : if (unicode() && ignore_case()) {
2020 2180 : icu::UnicodeSet set(c, c);
2021 1090 : set.closeOver(USET_CASE_INSENSITIVE);
2022 1090 : set.removeAllStrings();
2023 1090 : return set.size() > 1;
2024 : }
2025 : // In the case where ICU is not included, we act as if the unicode flag is
2026 : // not set, and do not desugar.
2027 : #endif // V8_INTL_SUPPORT
2028 : return false;
2029 : }
2030 :
2031 :
2032 1483539 : RegExpTree* RegExpBuilder::ToRegExp() {
2033 1483539 : FlushTerms();
2034 : int num_alternatives = alternatives_.length();
2035 1483539 : if (num_alternatives == 0) return new (zone()) RegExpEmpty();
2036 1483539 : if (num_alternatives == 1) return alternatives_.last();
2037 42858 : return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
2038 : }
2039 :
2040 1452016 : bool RegExpBuilder::AddQuantifierToAtom(
2041 : int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
2042 : FlushPendingSurrogate();
2043 1452016 : if (pending_empty_) {
2044 2 : pending_empty_ = false;
2045 2 : return true;
2046 : }
2047 : RegExpTree* atom;
2048 1452014 : if (characters_ != nullptr) {
2049 : DCHECK(last_added_ == ADD_CHAR);
2050 : // Last atom was character.
2051 : Vector<const uc16> char_vector = characters_->ToConstVector();
2052 : int num_chars = char_vector.length();
2053 1428243 : if (num_chars > 1) {
2054 562 : Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
2055 1124 : text_.Add(new (zone()) RegExpAtom(prefix, flags_), zone());
2056 : char_vector = char_vector.SubVector(num_chars - 1, num_chars);
2057 : }
2058 1428243 : characters_ = nullptr;
2059 1428243 : atom = new (zone()) RegExpAtom(char_vector, flags_);
2060 1428243 : FlushText();
2061 23771 : } else if (text_.length() > 0) {
2062 : DCHECK(last_added_ == ADD_ATOM);
2063 16267 : atom = text_.RemoveLast();
2064 16267 : FlushText();
2065 7504 : } else if (terms_.length() > 0) {
2066 : DCHECK(last_added_ == ADD_ATOM);
2067 7504 : atom = terms_.RemoveLast();
2068 7504 : if (atom->IsLookaround()) {
2069 : // With /u, lookarounds are not quantifiable.
2070 239 : if (unicode()) return false;
2071 : // Lookbehinds are not quantifiable.
2072 193 : if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
2073 : return false;
2074 : }
2075 : }
2076 7431 : if (atom->max_match() == 0) {
2077 : // Guaranteed to only match an empty string.
2078 : LAST(ADD_TERM);
2079 236 : if (min == 0) {
2080 : return true;
2081 : }
2082 97 : terms_.Add(atom, zone());
2083 97 : return true;
2084 : }
2085 : } else {
2086 : // Only call immediately after adding an atom or character!
2087 0 : UNREACHABLE();
2088 : }
2089 1451705 : terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
2090 1451705 : zone());
2091 : LAST(ADD_TERM);
2092 1451705 : return true;
2093 : }
2094 :
2095 : } // namespace internal
2096 121996 : } // namespace v8
|