Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/regexp-parser.h"
6 :
7 : #include "src/char-predicates-inl.h"
8 : #include "src/factory.h"
9 : #include "src/isolate.h"
10 : #include "src/objects-inl.h"
11 : #include "src/ostreams.h"
12 : #include "src/regexp/jsregexp.h"
13 : #include "src/utils.h"
14 :
15 : #ifdef V8_INTL_SUPPORT
16 : #include "unicode/uniset.h"
17 : #endif // V8_INTL_SUPPORT
18 :
19 : namespace v8 {
20 : namespace internal {
21 :
22 459845 : RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24 : : isolate_(isolate),
25 : zone_(zone),
26 : error_(error),
27 : captures_(NULL),
28 : named_captures_(NULL),
29 : named_back_references_(NULL),
30 : in_(in),
31 : current_(kEndMarker),
32 : dotall_(flags & JSRegExp::kDotAll),
33 : ignore_case_(flags & JSRegExp::kIgnoreCase),
34 : multiline_(flags & JSRegExp::kMultiline),
35 : unicode_(flags & JSRegExp::kUnicode),
36 : next_pos_(0),
37 : captures_started_(0),
38 : capture_count_(0),
39 : has_more_(true),
40 : simple_(false),
41 : contains_anchor_(false),
42 : is_scanned_for_captures_(false),
43 : has_named_captures_(false),
44 2299225 : failed_(false) {
45 : DCHECK_IMPLIES(dotall(), FLAG_harmony_regexp_dotall);
46 459845 : Advance();
47 459845 : }
48 :
49 46246980 : inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) {
50 15415660 : int position = next_pos_;
51 : uc32 c0 = in()->Get(position);
52 15415660 : position++;
53 : const bool try_combine_surrogate_pairs =
54 15415660 : (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS);
55 15656524 : if (try_combine_surrogate_pairs && position < in()->length() &&
56 : unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
57 : uc16 c1 = in()->Get(position);
58 791 : if (unibrow::Utf16::IsTrailSurrogate(c1)) {
59 531 : c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
60 531 : position++;
61 : }
62 : }
63 15415660 : if (update_position) next_pos_ = position;
64 15415660 : return c0;
65 : }
66 :
67 :
68 200090 : uc32 RegExpParser::Next() {
69 200090 : if (has_next()) {
70 200048 : return ReadNext(false, ScanMode::DEFAULT);
71 : } else {
72 : return kEndMarker;
73 : }
74 : }
75 :
76 46106943 : void RegExpParser::Advance(ScanMode mode) {
77 15675370 : if (has_next()) {
78 : StackLimitCheck check(isolate());
79 15215961 : if (check.HasOverflowed()) {
80 349 : if (FLAG_abort_on_stack_overflow) FATAL("Aborting on stack overflow");
81 : ReportError(CStrVector(
82 698 : MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
83 15215612 : } else if (zone()->excess_allocation()) {
84 0 : ReportError(CStrVector("Regular expression too large"));
85 : } else {
86 15215612 : current_ = ReadNext(true, mode);
87 : }
88 : } else {
89 459409 : current_ = kEndMarker;
90 : // Advance so that position() points to 1-after-the-last-character. This is
91 : // important so that Reset() to this position works correctly.
92 459409 : next_pos_ = in()->length() + 1;
93 459409 : has_more_ = false;
94 : }
95 15675370 : }
96 :
97 :
98 3679 : void RegExpParser::Reset(int pos) {
99 3679 : next_pos_ = pos;
100 3679 : has_more_ = (pos < in()->length());
101 3679 : Advance();
102 0 : }
103 :
104 0 : void RegExpParser::Advance(int dist, ScanMode mode) {
105 83487 : next_pos_ += dist - 1;
106 83487 : Advance(mode);
107 0 : }
108 :
109 :
110 264595 : bool RegExpParser::simple() { return simple_; }
111 :
112 0 : bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
113 122 : switch (c) {
114 : case '^':
115 : case '$':
116 : case '\\':
117 : case '.':
118 : case '*':
119 : case '+':
120 : case '?':
121 : case '(':
122 : case ')':
123 : case '[':
124 : case ']':
125 : case '{':
126 : case '}':
127 : case '|':
128 : case '/':
129 : return true;
130 : default:
131 : break;
132 : }
133 0 : return false;
134 : }
135 :
136 :
137 11415 : RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
138 3853 : if (failed_) return NULL; // Do not overwrite any existing error.
139 3781 : failed_ = true;
140 : *error_ = isolate()
141 : ->factory()
142 : ->NewStringFromOneByte(Vector<const uint8_t>::cast(message))
143 11343 : .ToHandleChecked();
144 : // Zip to the end to make sure the no more input is read.
145 3781 : current_ = kEndMarker;
146 3781 : next_pos_ = in()->length();
147 3781 : return NULL;
148 : }
149 :
150 :
151 : #define CHECK_FAILED /**/); \
152 : if (failed_) return NULL; \
153 : ((void)0
154 :
155 :
156 : // Pattern ::
157 : // Disjunction
158 724440 : RegExpTree* RegExpParser::ParsePattern() {
159 459845 : RegExpTree* result = ParseDisjunction(CHECK_FAILED);
160 456218 : PatchNamedBackReferences(CHECK_FAILED);
161 : DCHECK(!has_more());
162 : // If the result of parsing is a literal string atom, and it has the
163 : // same length as the input, then the atom is identical to the input.
164 720659 : if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
165 256506 : simple_ = true;
166 : }
167 456064 : return result;
168 : }
169 :
170 :
171 : // Disjunction ::
172 : // Alternative
173 : // Alternative | Disjunction
174 : // Alternative ::
175 : // [empty]
176 : // Term Alternative
177 : // Term ::
178 : // Assertion
179 : // Atom
180 : // Atom Quantifier
181 42178989 : RegExpTree* RegExpParser::ParseDisjunction() {
182 : // Used to store current state while parsing subexpressions.
183 : RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
184 919690 : nullptr, ignore_case(), unicode(), zone());
185 10525672 : RegExpParserState* state = &initial_state;
186 : // Cache the builder in a local variable for quick access.
187 459845 : RegExpBuilder* builder = initial_state.builder();
188 : while (true) {
189 12253277 : switch (current()) {
190 : case kEndMarker:
191 456603 : if (state->IsSubexpression()) {
192 : // Inside a parenthesized group when hitting end of input.
193 459977 : return ReportError(CStrVector("Unterminated group"));
194 : }
195 : DCHECK_EQ(INITIAL, state->group_type());
196 : // Parsing completed successfully.
197 456471 : return builder->ToRegExp();
198 : case ')': {
199 1683861 : if (!state->IsSubexpression()) {
200 19 : return ReportError(CStrVector("Unmatched ')'"));
201 : }
202 : DCHECK_NE(INITIAL, state->group_type());
203 :
204 1683842 : Advance();
205 : // End disjunction parsing and convert builder content to new single
206 : // regexp atom.
207 1683842 : RegExpTree* body = builder->ToRegExp();
208 :
209 : int end_capture_index = captures_started();
210 :
211 : int capture_index = state->capture_index();
212 : SubexpressionType group_type = state->group_type();
213 :
214 : // Build result of subexpression.
215 1683842 : if (group_type == CAPTURE) {
216 1644448 : if (state->IsNamedCapture()) {
217 : CreateNamedCaptureAtIndex(state->capture_name(),
218 1721 : capture_index CHECK_FAILED);
219 : }
220 1644385 : RegExpCapture* capture = GetCapture(capture_index);
221 : capture->set_body(body);
222 : body = capture;
223 39394 : } else if (group_type == GROUPING) {
224 : body = new (zone()) RegExpGroup(body);
225 : } else {
226 : DCHECK(group_type == POSITIVE_LOOKAROUND ||
227 : group_type == NEGATIVE_LOOKAROUND);
228 4471 : bool is_positive = (group_type == POSITIVE_LOOKAROUND);
229 : body = new (zone()) RegExpLookaround(
230 : body, is_positive, end_capture_index - capture_index,
231 4471 : capture_index, state->lookaround_type());
232 : }
233 :
234 : // Restore previous state.
235 : state = state->previous_state();
236 : builder = state->builder();
237 :
238 1683779 : builder->AddAtom(body);
239 : // For compatability with JSC and ES3, we allow quantifiers after
240 : // lookaheads, and break in all cases.
241 1683779 : break;
242 : }
243 : case '|': {
244 152158 : Advance();
245 : builder->NewAlternative();
246 9492103 : continue;
247 : }
248 : case '*':
249 : case '+':
250 : case '?':
251 155 : return ReportError(CStrVector("Nothing to repeat"));
252 : case '^': {
253 16405 : Advance();
254 16405 : if (multiline()) {
255 : builder->AddAssertion(
256 1599 : new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
257 : } else {
258 : builder->AddAssertion(
259 14806 : new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
260 : set_contains_anchor();
261 : }
262 : continue;
263 : }
264 : case '$': {
265 7666 : Advance();
266 : RegExpAssertion::AssertionType assertion_type =
267 : multiline() ? RegExpAssertion::END_OF_LINE
268 7666 : : RegExpAssertion::END_OF_INPUT;
269 7666 : builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
270 7666 : continue;
271 : }
272 : case '.': {
273 30896 : Advance();
274 : ZoneList<CharacterRange>* ranges =
275 30896 : new (zone()) ZoneList<CharacterRange>(2, zone());
276 :
277 30896 : if (dotall()) {
278 : // Everything.
279 : DCHECK(FLAG_harmony_regexp_dotall);
280 48 : CharacterRange::AddClassEscape('*', ranges, false, zone());
281 : } else {
282 : // Everything except \x0a, \x0d, \u2028 and \u2029
283 30848 : CharacterRange::AddClassEscape('.', ranges, false, zone());
284 : }
285 :
286 : RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(ranges);
287 30896 : builder->AddCharacterClass(cc);
288 30896 : break;
289 : }
290 : case '(': {
291 : SubexpressionType subexpr_type = CAPTURE;
292 : RegExpLookaround::Type lookaround_type = state->lookaround_type();
293 : bool is_named_capture = false;
294 1684609 : Advance();
295 1684609 : if (current() == '?') {
296 41718 : switch (Next()) {
297 : case ':':
298 : subexpr_type = GROUPING;
299 : Advance(2);
300 : break;
301 : case '=':
302 : lookaround_type = RegExpLookaround::LOOKAHEAD;
303 : subexpr_type = POSITIVE_LOOKAROUND;
304 : Advance(2);
305 : break;
306 : case '!':
307 : lookaround_type = RegExpLookaround::LOOKAHEAD;
308 : subexpr_type = NEGATIVE_LOOKAROUND;
309 : Advance(2);
310 : break;
311 : case '<':
312 3956 : Advance();
313 3956 : if (FLAG_harmony_regexp_lookbehind) {
314 3907 : if (Next() == '=') {
315 : subexpr_type = POSITIVE_LOOKAROUND;
316 : lookaround_type = RegExpLookaround::LOOKBEHIND;
317 : Advance(2);
318 : break;
319 2506 : } else if (Next() == '!') {
320 : subexpr_type = NEGATIVE_LOOKAROUND;
321 : lookaround_type = RegExpLookaround::LOOKBEHIND;
322 : Advance(2);
323 : break;
324 : }
325 : }
326 2253 : if (FLAG_harmony_regexp_named_captures) {
327 2253 : has_named_captures_ = true;
328 : is_named_capture = true;
329 2253 : break;
330 : }
331 : // Fall through.
332 : default:
333 25 : return ReportError(CStrVector("Invalid group"));
334 : }
335 : }
336 :
337 : const ZoneVector<uc16>* capture_name = nullptr;
338 1684584 : if (subexpr_type == CAPTURE) {
339 1645144 : if (captures_started_ >= kMaxCaptures) {
340 7 : return ReportError(CStrVector("Too many captures"));
341 : }
342 1645137 : captures_started_++;
343 :
344 1645137 : if (is_named_capture) {
345 2253 : capture_name = ParseCaptureGroupName(CHECK_FAILED);
346 : }
347 : }
348 : // Store current state and begin new disjunction parsing.
349 : state = new (zone()) RegExpParserState(
350 : state, subexpr_type, lookaround_type, captures_started_,
351 5052177 : capture_name, ignore_case(), unicode(), zone());
352 : builder = state->builder();
353 1684059 : continue;
354 : }
355 : case '[': {
356 167245 : RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
357 166951 : builder->AddCharacterClass(cc->AsCharacterClass());
358 166951 : break;
359 : }
360 : // Atom ::
361 : // \ AtomEscape
362 : case '\\':
363 75112 : switch (Next()) {
364 : case kEndMarker:
365 7 : return ReportError(CStrVector("\\ at end of pattern"));
366 : case 'b':
367 : Advance(2);
368 : builder->AddAssertion(
369 627 : new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
370 627 : continue;
371 : case 'B':
372 : Advance(2);
373 : builder->AddAssertion(
374 413 : new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
375 413 : continue;
376 : // AtomEscape ::
377 : // CharacterClassEscape
378 : //
379 : // CharacterClassEscape :: one of
380 : // d D s S w W
381 : case 'd':
382 : case 'D':
383 : case 's':
384 : case 'S':
385 : case 'w':
386 : case 'W': {
387 22975 : uc32 c = Next();
388 : Advance(2);
389 : ZoneList<CharacterRange>* ranges =
390 22975 : new (zone()) ZoneList<CharacterRange>(2, zone());
391 : CharacterRange::AddClassEscape(c, ranges,
392 23463 : unicode() && ignore_case(), zone());
393 : RegExpCharacterClass* cc =
394 : new (zone()) RegExpCharacterClass(ranges);
395 22975 : builder->AddCharacterClass(cc);
396 22975 : break;
397 : }
398 : case 'p':
399 : case 'P': {
400 6057 : uc32 p = Next();
401 : Advance(2);
402 6057 : if (unicode()) {
403 5864 : if (FLAG_harmony_regexp_property) {
404 : ZoneList<CharacterRange>* ranges =
405 5850 : new (zone()) ZoneList<CharacterRange>(2, zone());
406 5850 : if (!ParsePropertyClass(ranges, p == 'P')) {
407 1176 : return ReportError(CStrVector("Invalid property name"));
408 : }
409 : RegExpCharacterClass* cc =
410 : new (zone()) RegExpCharacterClass(ranges);
411 4674 : builder->AddCharacterClass(cc);
412 : } else {
413 : // With /u, no identity escapes except for syntax characters
414 : // are allowed. Otherwise, all identity escapes are allowed.
415 14 : return ReportError(CStrVector("Invalid escape"));
416 : }
417 : } else {
418 193 : builder->AddCharacter(p);
419 : }
420 : break;
421 : }
422 : case '1':
423 : case '2':
424 : case '3':
425 : case '4':
426 : case '5':
427 : case '6':
428 : case '7':
429 : case '8':
430 : case '9': {
431 6931 : int index = 0;
432 7052 : bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
433 6922 : if (is_backref) {
434 12518 : if (state->IsInsideCaptureGroup(index)) {
435 : // The back reference is inside the capture group it refers to.
436 : // Nothing can possibly have been captured yet, so we use empty
437 : // instead. This ensures that, when checking a back reference,
438 : // the capture registers of the referenced capture are either
439 : // both set or both cleared.
440 : builder->AddEmpty();
441 : } else {
442 6070 : RegExpCapture* capture = GetCapture(index);
443 : RegExpTree* atom = new (zone()) RegExpBackReference(capture);
444 6070 : builder->AddAtom(atom);
445 : }
446 6319 : break;
447 : }
448 : // With /u, no identity escapes except for syntax characters
449 : // are allowed. Otherwise, all identity escapes are allowed.
450 663 : if (unicode()) {
451 112 : return ReportError(CStrVector("Invalid escape"));
452 : }
453 551 : uc32 first_digit = Next();
454 551 : if (first_digit == '8' || first_digit == '9') {
455 60 : builder->AddCharacter(first_digit);
456 : Advance(2);
457 : break;
458 : }
459 : }
460 : // Fall through.
461 : case '0': {
462 3929 : Advance();
463 3929 : if (unicode() && Next() >= '0' && Next() <= '9') {
464 : // With /u, decimal escape with leading 0 are not parsed as octal.
465 29 : return ReportError(CStrVector("Invalid decimal escape"));
466 : }
467 3900 : uc32 octal = ParseOctalLiteral();
468 3900 : builder->AddCharacter(octal);
469 3900 : break;
470 : }
471 : // ControlEscape :: one of
472 : // f n r t v
473 : case 'f':
474 : Advance(2);
475 15 : builder->AddCharacter('\f');
476 15 : break;
477 : case 'n':
478 : Advance(2);
479 563 : builder->AddCharacter('\n');
480 563 : break;
481 : case 'r':
482 : Advance(2);
483 97 : builder->AddCharacter('\r');
484 97 : break;
485 : case 't':
486 : Advance(2);
487 17 : builder->AddCharacter('\t');
488 17 : break;
489 : case 'v':
490 : Advance(2);
491 37 : builder->AddCharacter('\v');
492 37 : break;
493 : case 'c': {
494 294 : Advance();
495 294 : uc32 controlLetter = Next();
496 : // Special case if it is an ASCII letter.
497 : // Convert lower case letters to uppercase.
498 294 : uc32 letter = controlLetter & ~('a' ^ 'A');
499 294 : if (letter < 'A' || 'Z' < letter) {
500 : // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
501 : // Read the backslash as a literal character instead of as
502 : // starting an escape.
503 : // ES#prod-annexB-ExtendedPatternCharacter
504 161 : if (unicode()) {
505 : // With /u, invalid escapes are not treated as identity escapes.
506 29 : return ReportError(CStrVector("Invalid unicode escape"));
507 : }
508 132 : builder->AddCharacter('\\');
509 : } else {
510 : Advance(2);
511 133 : builder->AddCharacter(controlLetter & 0x1f);
512 : }
513 : break;
514 : }
515 : case 'x': {
516 : Advance(2);
517 : uc32 value;
518 327 : if (ParseHexEscape(2, &value)) {
519 188 : builder->AddCharacter(value);
520 139 : } else if (!unicode()) {
521 83 : builder->AddCharacter('x');
522 : } else {
523 : // With /u, invalid escapes are not treated as identity escapes.
524 56 : return ReportError(CStrVector("Invalid escape"));
525 : }
526 271 : break;
527 : }
528 : case 'u': {
529 : Advance(2);
530 : uc32 value;
531 2143 : if (ParseUnicodeEscape(&value)) {
532 1891 : builder->AddEscapedUnicodeCharacter(value);
533 252 : } else if (!unicode()) {
534 140 : builder->AddCharacter('u');
535 : } else {
536 : // With /u, invalid escapes are not treated as identity escapes.
537 112 : return ReportError(CStrVector("Invalid unicode escape"));
538 : }
539 2031 : break;
540 : }
541 : case 'k':
542 : // Either an identity escape or a named back-reference. The two
543 : // interpretations are mutually exclusive: '\k' is interpreted as
544 : // an identity escape for non-unicode patterns without named
545 : // capture groups, and as the beginning of a named back-reference
546 : // in all other cases.
547 2022 : if (FLAG_harmony_regexp_named_captures &&
548 296 : (unicode() || HasNamedCaptures())) {
549 : Advance(2);
550 612 : ParseNamedBackReference(builder, state CHECK_FAILED);
551 : break;
552 : }
553 : // Fall through.
554 : default:
555 30559 : Advance();
556 : // With /u, no identity escapes except for syntax characters
557 : // are allowed. Otherwise, all identity escapes are allowed.
558 30613 : if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
559 30517 : builder->AddCharacter(current());
560 30517 : Advance();
561 : } else {
562 42 : return ReportError(CStrVector("Invalid escape"));
563 : }
564 30517 : break;
565 : }
566 : break;
567 : case '{': {
568 : int dummy;
569 709 : bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
570 644 : if (parsed) return ReportError(CStrVector("Nothing to repeat"));
571 : // Fall through.
572 : }
573 : case '}':
574 : case ']':
575 1241 : if (unicode()) {
576 43 : return ReportError(CStrVector("Lone quantifier brackets"));
577 : }
578 : // Fall through.
579 : default:
580 7978473 : builder->AddUnicodeCharacter(current());
581 7978473 : Advance();
582 7978473 : break;
583 : } // end switch(current())
584 :
585 : int min;
586 : int max;
587 9932445 : switch (current()) {
588 : // QuantifierPrefix ::
589 : // *
590 : // +
591 : // ?
592 : // {
593 : case '*':
594 47764 : min = 0;
595 47764 : max = RegExpTree::kInfinity;
596 47764 : Advance();
597 47764 : break;
598 : case '+':
599 12591 : min = 1;
600 12591 : max = RegExpTree::kInfinity;
601 12591 : Advance();
602 12591 : break;
603 : case '?':
604 2229230 : min = 0;
605 2229230 : max = 1;
606 2229230 : Advance();
607 2229230 : break;
608 : case '{':
609 12482 : if (ParseIntervalQuantifier(&min, &max)) {
610 11776 : if (max < min) {
611 : return ReportError(
612 2 : CStrVector("numbers out of order in {} quantifier"));
613 : }
614 : break;
615 706 : } else if (unicode()) {
616 : // With /u, incomplete quantifiers are not allowed.
617 309 : return ReportError(CStrVector("Incomplete quantifier"));
618 : }
619 : continue;
620 : default:
621 : continue;
622 : }
623 : RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
624 2301359 : if (current() == '?') {
625 : quantifier_type = RegExpQuantifier::NON_GREEDY;
626 7685 : Advance();
627 : } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
628 : // FLAG_regexp_possessive_quantifier is a debug-only flag.
629 : quantifier_type = RegExpQuantifier::POSSESSIVE;
630 : Advance();
631 : }
632 2301359 : if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
633 30 : return ReportError(CStrVector("Invalid quantifier"));
634 : }
635 : }
636 : }
637 :
638 :
639 : #ifdef DEBUG
640 : // Currently only used in an DCHECK.
641 : static bool IsSpecialClassEscape(uc32 c) {
642 : switch (c) {
643 : case 'd':
644 : case 'D':
645 : case 's':
646 : case 'S':
647 : case 'w':
648 : case 'W':
649 : return true;
650 : default:
651 : return false;
652 : }
653 : }
654 : #endif
655 :
656 :
657 : // In order to know whether an escape is a backreference or not we have to scan
658 : // the entire regexp and find the number of capturing parentheses. However we
659 : // don't want to scan the regexp twice unless it is necessary. This mini-parser
660 : // is called when needed. It can see the difference between capturing and
661 : // noncapturing parentheses and can skip character classes and backslash-escaped
662 : // characters.
663 10707 : void RegExpParser::ScanForCaptures() {
664 : DCHECK(!is_scanned_for_captures_);
665 : const int saved_position = position();
666 : // Start with captures started previous to current position
667 : int capture_count = captures_started();
668 : // Add count of captures after this position.
669 : int n;
670 8327 : while ((n = current()) != kEndMarker) {
671 6123 : Advance();
672 6123 : switch (n) {
673 : case '\\':
674 835 : Advance();
675 835 : break;
676 : case '[': {
677 : int c;
678 148 : while ((c = current()) != kEndMarker) {
679 148 : Advance();
680 148 : if (c == '\\') {
681 0 : Advance();
682 : } else {
683 148 : if (c == ']') break;
684 : }
685 : }
686 : break;
687 : }
688 : case '(':
689 800 : if (current() == '?') {
690 : // At this point we could be in
691 : // * a non-capturing group '(:',
692 : // * a lookbehind assertion '(?<=' '(?<!'
693 : // * or a named capture '(?<'.
694 : //
695 : // Of these, only named captures are capturing groups.
696 217 : if (!FLAG_harmony_regexp_named_captures) break;
697 :
698 172 : Advance();
699 172 : if (current() != '<') break;
700 :
701 158 : if (FLAG_harmony_regexp_lookbehind) {
702 158 : Advance();
703 158 : if (current() == '=' || current() == '!') break;
704 : }
705 :
706 : // Found a possible named capture. It could turn out to be a syntax
707 : // error (e.g. an unterminated or invalid name), but that distinction
708 : // does not matter for our purposes.
709 106 : has_named_captures_ = true;
710 : }
711 689 : capture_count++;
712 689 : break;
713 : }
714 : }
715 1102 : capture_count_ = capture_count;
716 1102 : is_scanned_for_captures_ = true;
717 : Reset(saved_position);
718 1102 : }
719 :
720 :
721 21538 : bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
722 : DCHECK_EQ('\\', current());
723 : DCHECK('1' <= Next() && Next() <= '9');
724 : // Try to parse a decimal literal that is no greater than the total number
725 : // of left capturing parentheses in the input.
726 : int start = position();
727 6931 : int value = Next() - '0';
728 : Advance(2);
729 : while (true) {
730 : uc32 c = current();
731 7701 : if (IsDecimalDigit(c)) {
732 795 : value = 10 * value + (c - '0');
733 795 : if (value > kMaxCaptures) {
734 : Reset(start);
735 25 : return false;
736 : }
737 770 : Advance();
738 : } else {
739 : break;
740 : }
741 : }
742 6906 : if (value > captures_started()) {
743 1178 : if (!is_scanned_for_captures_) ScanForCaptures();
744 1178 : if (value > capture_count_) {
745 : Reset(start);
746 647 : return false;
747 : }
748 : }
749 6259 : *index_out = value;
750 7029 : return true;
751 : }
752 :
753 3359 : static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
754 3359 : if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
755 6574 : v->push_back(code_unit);
756 : } else {
757 144 : v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
758 144 : v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
759 : }
760 3359 : }
761 :
762 9332 : const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
763 : DCHECK(FLAG_harmony_regexp_named_captures);
764 : DCHECK_EQ(current(), '<');
765 :
766 : ZoneVector<uc16>* name =
767 2802 : new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
768 :
769 : // Capture names can always contain surrogate pairs, and we need to scan
770 : // accordingly.
771 : const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS;
772 2802 : Advance(scan_mode);
773 :
774 : bool at_start = true;
775 : while (true) {
776 6161 : uc32 c = current();
777 6161 : Advance(scan_mode);
778 :
779 : // Convert unicode escapes.
780 6530 : if (c == '\\' && current() == 'u') {
781 341 : Advance(scan_mode);
782 341 : if (!ParseUnicodeEscape(&c)) {
783 91 : ReportError(CStrVector("Invalid Unicode escape sequence"));
784 91 : return nullptr;
785 : }
786 : }
787 :
788 : // The backslash char is misclassified as both ID_Start and ID_Continue.
789 6070 : if (c == '\\') {
790 28 : ReportError(CStrVector("Invalid capture group name"));
791 28 : return nullptr;
792 : }
793 :
794 6042 : if (at_start) {
795 2753 : if (!IdentifierStart::Is(c)) {
796 154 : ReportError(CStrVector("Invalid capture group name"));
797 154 : return nullptr;
798 : }
799 2599 : push_code_unit(name, c);
800 : at_start = false;
801 : } else {
802 3289 : if (c == '>') {
803 : break;
804 1082 : } else if (IdentifierPart::Is(c)) {
805 760 : push_code_unit(name, c);
806 : } else {
807 322 : ReportError(CStrVector("Invalid capture group name"));
808 322 : return nullptr;
809 : }
810 : }
811 : }
812 :
813 5566 : return name;
814 : }
815 :
816 1721 : bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
817 2926 : int index) {
818 : DCHECK(FLAG_harmony_regexp_named_captures);
819 : DCHECK(0 < index && index <= captures_started_);
820 : DCHECK_NOT_NULL(name);
821 :
822 1721 : if (named_captures_ == nullptr) {
823 1268 : named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
824 : } else {
825 : // Check for duplicates and bail if we find any.
826 : // TODO(jgruber): O(n^2).
827 1398 : for (const auto& named_capture : *named_captures_) {
828 555 : if (*named_capture->name() == *name) {
829 63 : ReportError(CStrVector("Duplicate capture group name"));
830 63 : return false;
831 : }
832 : }
833 : }
834 :
835 1658 : RegExpCapture* capture = GetCapture(index);
836 : DCHECK(capture->name() == nullptr);
837 :
838 : capture->set_name(name);
839 1658 : named_captures_->Add(capture, zone());
840 :
841 1658 : return true;
842 : }
843 :
844 612 : bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
845 1822 : RegExpParserState* state) {
846 : // The parser is assumed to be on the '<' in \k<name>.
847 612 : if (current() != '<') {
848 63 : ReportError(CStrVector("Invalid named reference"));
849 63 : return false;
850 : }
851 :
852 549 : const ZoneVector<uc16>* name = ParseCaptureGroupName();
853 549 : if (name == nullptr) {
854 : return false;
855 : }
856 :
857 472 : if (state->IsInsideCaptureGroup(name)) {
858 : builder->AddEmpty();
859 : } else {
860 : RegExpBackReference* atom = new (zone()) RegExpBackReference();
861 : atom->set_name(name);
862 :
863 432 : builder->AddAtom(atom);
864 :
865 432 : if (named_back_references_ == nullptr) {
866 : named_back_references_ =
867 346 : new (zone()) ZoneList<RegExpBackReference*>(1, zone());
868 : }
869 432 : named_back_references_->Add(atom, zone());
870 : }
871 :
872 : return true;
873 : }
874 :
875 456218 : void RegExpParser::PatchNamedBackReferences() {
876 456218 : if (named_back_references_ == nullptr) return;
877 :
878 346 : if (named_captures_ == nullptr) {
879 21 : ReportError(CStrVector("Invalid named capture referenced"));
880 21 : return;
881 : }
882 :
883 : // Look up and patch the actual capture for each named back reference.
884 : // TODO(jgruber): O(n^2), optimize if necessary.
885 :
886 881 : for (int i = 0; i < named_back_references_->length(); i++) {
887 1541 : RegExpBackReference* ref = named_back_references_->at(i);
888 :
889 : int index = -1;
890 1071 : for (const auto& capture : *named_captures_) {
891 527 : if (*capture->name() == *ref->name()) {
892 : index = capture->index();
893 278 : break;
894 : }
895 : }
896 :
897 411 : if (index == -1) {
898 133 : ReportError(CStrVector("Invalid named capture referenced"));
899 133 : return;
900 : }
901 :
902 278 : ref->set_capture(GetCapture(index));
903 : }
904 : }
905 :
906 3315493 : RegExpCapture* RegExpParser::GetCapture(int index) {
907 : // The index for the capture groups are one-based. Its index in the list is
908 : // zero-based.
909 : int know_captures =
910 1652391 : is_scanned_for_captures_ ? capture_count_ : captures_started_;
911 : DCHECK(index <= know_captures);
912 1652391 : if (captures_ == NULL) {
913 18717 : captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
914 : }
915 6593552 : while (captures_->length() < know_captures) {
916 6585546 : captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
917 : }
918 3304782 : return captures_->at(index - 1);
919 : }
920 :
921 457080 : Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
922 459474 : if (named_captures_ == nullptr || named_captures_->is_empty())
923 455048 : return Handle<FixedArray>();
924 :
925 : Factory* factory = isolate()->factory();
926 :
927 1016 : int len = named_captures_->length() * 2;
928 1016 : Handle<FixedArray> array = factory->NewFixedArray(len);
929 :
930 4788 : for (int i = 0; i < named_captures_->length(); i++) {
931 2756 : RegExpCapture* capture = named_captures_->at(i);
932 1378 : MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
933 2756 : array->set(i * 2, *name.ToHandleChecked());
934 : array->set(i * 2 + 1, Smi::FromInt(capture->index()));
935 : }
936 :
937 1016 : return array;
938 : }
939 :
940 0 : bool RegExpParser::HasNamedCaptures() {
941 296 : if (has_named_captures_ || is_scanned_for_captures_) {
942 : return has_named_captures_;
943 : }
944 :
945 154 : ScanForCaptures();
946 : DCHECK(is_scanned_for_captures_);
947 154 : return has_named_captures_;
948 : }
949 :
950 0 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
951 14759 : for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
952 7242 : if (s->group_type() != CAPTURE) continue;
953 : // Return true if we found the matching capture index.
954 842 : if (index == s->capture_index()) return true;
955 : // Abort if index is larger than what has been parsed up till this state.
956 653 : if (index > s->capture_index()) return false;
957 : }
958 : return false;
959 : }
960 :
961 472 : bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
962 : const ZoneVector<uc16>* name) {
963 : DCHECK_NOT_NULL(name);
964 932 : for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
965 500 : if (s->capture_name() == nullptr) continue;
966 68 : if (*s->capture_name() == *name) return true;
967 : }
968 : return false;
969 : }
970 :
971 : // QuantifierPrefix ::
972 : // { DecimalDigits }
973 : // { DecimalDigits , }
974 : // { DecimalDigits , DecimalDigits }
975 : //
976 : // Returns true if parsing succeeds, and set the min_out and max_out
977 : // values. Values are truncated to RegExpTree::kInfinity if they overflow.
978 93806 : bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
979 : DCHECK_EQ(current(), '{');
980 : int start = position();
981 13140 : Advance();
982 : int min = 0;
983 13140 : if (!IsDecimalDigit(current())) {
984 : Reset(start);
985 1090 : return false;
986 : }
987 25989 : while (IsDecimalDigit(current())) {
988 : int next = current() - '0';
989 14101 : if (min > (RegExpTree::kInfinity - next) / 10) {
990 : // Overflow. Skip past remaining decimal digits and return -1.
991 1128 : do {
992 1128 : Advance();
993 : } while (IsDecimalDigit(current()));
994 : min = RegExpTree::kInfinity;
995 : break;
996 : }
997 13939 : min = 10 * min + next;
998 13939 : Advance();
999 : }
1000 : int max = 0;
1001 12050 : if (current() == '}') {
1002 : max = min;
1003 5122 : Advance();
1004 6928 : } else if (current() == ',') {
1005 6836 : Advance();
1006 6836 : if (current() == '}') {
1007 : max = RegExpTree::kInfinity;
1008 403 : Advance();
1009 : } else {
1010 14040 : while (IsDecimalDigit(current())) {
1011 : int next = current() - '0';
1012 7691 : if (max > (RegExpTree::kInfinity - next) / 10) {
1013 1050 : do {
1014 1050 : Advance();
1015 : } while (IsDecimalDigit(current()));
1016 : max = RegExpTree::kInfinity;
1017 : break;
1018 : }
1019 7607 : max = 10 * max + next;
1020 7607 : Advance();
1021 : }
1022 6433 : if (current() != '}') {
1023 : Reset(start);
1024 145 : return false;
1025 : }
1026 6288 : Advance();
1027 : }
1028 : } else {
1029 : Reset(start);
1030 92 : return false;
1031 : }
1032 11813 : *min_out = min;
1033 11813 : *max_out = max;
1034 11813 : return true;
1035 : }
1036 :
1037 :
1038 13070 : uc32 RegExpParser::ParseOctalLiteral() {
1039 : DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
1040 : // For compatibility with some other browsers (not all), we parse
1041 : // up to three octal digits with a value below 256.
1042 : // ES#prod-annexB-LegacyOctalEscapeSequence
1043 5072 : uc32 value = current() - '0';
1044 5072 : Advance();
1045 5072 : if ('0' <= current() && current() <= '7') {
1046 1571 : value = value * 8 + current() - '0';
1047 1571 : Advance();
1048 3102 : if (value < 32 && '0' <= current() && current() <= '7') {
1049 2790 : value = value * 8 + current() - '0';
1050 1395 : Advance();
1051 : }
1052 : }
1053 5072 : return value;
1054 : }
1055 :
1056 :
1057 85772 : bool RegExpParser::ParseHexEscape(int length, uc32* value) {
1058 : int start = position();
1059 : uc32 val = 0;
1060 85267 : for (int i = 0; i < length; ++i) {
1061 : uc32 c = current();
1062 : int d = HexValue(c);
1063 67587 : if (d < 0) {
1064 : Reset(start);
1065 505 : return false;
1066 : }
1067 67082 : val = val * 16 + d;
1068 67082 : Advance();
1069 : }
1070 17680 : *value = val;
1071 17680 : return true;
1072 : }
1073 :
1074 : // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
1075 39812 : bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1076 : // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1077 : // allowed). In the latter case, the number of hex digits between { } is
1078 : // arbitrary. \ and u have already been read.
1079 19770 : if (current() == '{' && unicode()) {
1080 : int start = position();
1081 1933 : Advance();
1082 1933 : if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1083 1891 : if (current() == '}') {
1084 1884 : Advance();
1085 1884 : return true;
1086 : }
1087 : }
1088 : Reset(start);
1089 49 : return false;
1090 : }
1091 : // \u but no {, or \u{...} escapes not allowed.
1092 15780 : bool result = ParseHexEscape(4, value);
1093 44762 : if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1094 : current() == '\\') {
1095 : // Attempt to read trail surrogate.
1096 : int start = position();
1097 246 : if (Next() == 'u') {
1098 : Advance(2);
1099 : uc32 trail;
1100 456 : if (ParseHexEscape(4, &trail) &&
1101 222 : unibrow::Utf16::IsTrailSurrogate(trail)) {
1102 : *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1103 444 : static_cast<uc16>(trail));
1104 222 : return true;
1105 : }
1106 : }
1107 : Reset(start);
1108 : }
1109 15558 : return result;
1110 : }
1111 :
1112 : #ifdef V8_INTL_SUPPORT
1113 :
1114 : namespace {
1115 :
1116 4632 : bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1117 4632 : const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1118 4632 : if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1119 : for (int i = 0;; i++) {
1120 : const char* long_name = u_getPropertyName(
1121 980 : property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1122 980 : if (long_name == NULL) break;
1123 868 : if (strcmp(property_name, long_name) == 0) return true;
1124 : }
1125 : return false;
1126 : }
1127 :
1128 4838 : bool IsExactPropertyValueAlias(const char* property_value_name,
1129 : UProperty property, int32_t property_value) {
1130 : const char* short_name =
1131 4838 : u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1132 4838 : if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1133 : return true;
1134 : }
1135 : for (int i = 0;; i++) {
1136 : const char* long_name = u_getPropertyValueName(
1137 : property, property_value,
1138 414 : static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1139 414 : if (long_name == NULL) break;
1140 400 : if (strcmp(property_value_name, long_name) == 0) return true;
1141 : }
1142 : return false;
1143 : }
1144 :
1145 6904 : bool LookupPropertyValueName(UProperty property,
1146 : const char* property_value_name, bool negate,
1147 : ZoneList<CharacterRange>* result, Zone* zone) {
1148 : UProperty property_for_lookup = property;
1149 6904 : if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
1150 : // For the property Script_Extensions, we have to do the property value
1151 : // name lookup as if the property is Script.
1152 : property_for_lookup = UCHAR_SCRIPT;
1153 : }
1154 : int32_t property_value =
1155 6904 : u_getPropertyValueEnum(property_for_lookup, property_value_name);
1156 6904 : if (property_value == UCHAR_INVALID_CODE) return false;
1157 :
1158 : // We require the property name to match exactly to one of the property value
1159 : // aliases. However, u_getPropertyValueEnum uses loose matching.
1160 4838 : if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
1161 4838 : property_value)) {
1162 : return false;
1163 : }
1164 :
1165 4824 : UErrorCode ec = U_ZERO_ERROR;
1166 4824 : icu::UnicodeSet set;
1167 4824 : set.applyIntPropertyValue(property, property_value, ec);
1168 4824 : bool success = ec == U_ZERO_ERROR && !set.isEmpty();
1169 :
1170 4824 : if (success) {
1171 4824 : set.removeAllStrings();
1172 4824 : if (negate) set.complement();
1173 570420 : for (int i = 0; i < set.getRangeCount(); i++) {
1174 : result->Add(
1175 570420 : CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
1176 1140840 : zone);
1177 : }
1178 : }
1179 4824 : return success;
1180 : }
1181 :
1182 : template <size_t N>
1183 : inline bool NameEquals(const char* name, const char (&literal)[N]) {
1184 5970 : return strncmp(name, literal, N + 1) == 0;
1185 : }
1186 :
1187 2080 : bool LookupSpecialPropertyValueName(const char* name,
1188 : ZoneList<CharacterRange>* result,
1189 : bool negate, Zone* zone) {
1190 2080 : if (NameEquals(name, "Any")) {
1191 96 : if (!negate) result->Add(CharacterRange::Everything(), zone);
1192 2008 : } else if (NameEquals(name, "ASCII")) {
1193 : result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1194 : : CharacterRange::Range(0x0, 0x7f),
1195 252 : zone);
1196 1882 : } else if (NameEquals(name, "Assigned")) {
1197 : return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1198 90 : !negate, result, zone);
1199 : } else {
1200 : return false;
1201 : }
1202 : return true;
1203 : }
1204 :
1205 : // Explicitly whitelist supported binary properties. The spec forbids supporting
1206 : // properties outside of this set to ensure interoperability.
1207 : bool IsSupportedBinaryProperty(UProperty property) {
1208 : switch (property) {
1209 : case UCHAR_ALPHABETIC:
1210 : // 'Any' is not supported by ICU. See LookupSpecialPropertyValueName.
1211 : // 'ASCII' is not supported by ICU. See LookupSpecialPropertyValueName.
1212 : case UCHAR_ASCII_HEX_DIGIT:
1213 : // 'Assigned' is not supported by ICU. See LookupSpecialPropertyValueName.
1214 : case UCHAR_BIDI_CONTROL:
1215 : case UCHAR_BIDI_MIRRORED:
1216 : case UCHAR_CASE_IGNORABLE:
1217 : case UCHAR_CASED:
1218 : case UCHAR_CHANGES_WHEN_CASEFOLDED:
1219 : case UCHAR_CHANGES_WHEN_CASEMAPPED:
1220 : case UCHAR_CHANGES_WHEN_LOWERCASED:
1221 : case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
1222 : case UCHAR_CHANGES_WHEN_TITLECASED:
1223 : case UCHAR_CHANGES_WHEN_UPPERCASED:
1224 : case UCHAR_DASH:
1225 : case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
1226 : case UCHAR_DEPRECATED:
1227 : case UCHAR_DIACRITIC:
1228 : case UCHAR_EMOJI:
1229 : // TODO(yangguo): Uncomment this once we upgrade to ICU 60.
1230 : // See https://ssl.icu-project.org/trac/ticket/13062
1231 : // case UCHAR_EMOJI_COMPONENT:
1232 : case UCHAR_EMOJI_MODIFIER_BASE:
1233 : case UCHAR_EMOJI_MODIFIER:
1234 : case UCHAR_EMOJI_PRESENTATION:
1235 : case UCHAR_EXTENDER:
1236 : case UCHAR_GRAPHEME_BASE:
1237 : case UCHAR_GRAPHEME_EXTEND:
1238 : case UCHAR_HEX_DIGIT:
1239 : case UCHAR_ID_CONTINUE:
1240 : case UCHAR_ID_START:
1241 : case UCHAR_IDEOGRAPHIC:
1242 : case UCHAR_IDS_BINARY_OPERATOR:
1243 : case UCHAR_IDS_TRINARY_OPERATOR:
1244 : case UCHAR_JOIN_CONTROL:
1245 : case UCHAR_LOGICAL_ORDER_EXCEPTION:
1246 : case UCHAR_LOWERCASE:
1247 : case UCHAR_MATH:
1248 : case UCHAR_NONCHARACTER_CODE_POINT:
1249 : case UCHAR_PATTERN_SYNTAX:
1250 : case UCHAR_PATTERN_WHITE_SPACE:
1251 : case UCHAR_QUOTATION_MARK:
1252 : case UCHAR_RADICAL:
1253 : case UCHAR_S_TERM:
1254 : case UCHAR_SOFT_DOTTED:
1255 : case UCHAR_TERMINAL_PUNCTUATION:
1256 : case UCHAR_UNIFIED_IDEOGRAPH:
1257 : case UCHAR_UPPERCASE:
1258 : case UCHAR_VARIATION_SELECTOR:
1259 : case UCHAR_WHITE_SPACE:
1260 : case UCHAR_XID_CONTINUE:
1261 : case UCHAR_XID_START:
1262 : return true;
1263 : default:
1264 : break;
1265 : }
1266 : return false;
1267 : }
1268 :
1269 : } // anonymous namespace
1270 :
1271 6268 : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1272 73604 : bool negate) {
1273 : // Parse the property class as follows:
1274 : // - In \p{name}, 'name' is interpreted
1275 : // - either as a general category property value name.
1276 : // - or as a binary property name.
1277 : // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1278 : // and 'value' is interpreted as one of the available property value names.
1279 : // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1280 : // - Loose matching is not applied.
1281 : List<char> first_part;
1282 : List<char> second_part;
1283 6268 : if (current() == '{') {
1284 : // Parse \p{[PropertyName=]PropertyNameValue}
1285 79412 : for (Advance(); current() != '}' && current() != '='; Advance()) {
1286 33606 : if (!has_next()) return false;
1287 33564 : first_part.Add(static_cast<char>(current()));
1288 : }
1289 6100 : if (current() == '=') {
1290 37472 : for (Advance(); current() != '}'; Advance()) {
1291 15154 : if (!has_next()) return false;
1292 15154 : second_part.Add(static_cast<char>(current()));
1293 : }
1294 3582 : second_part.Add(0); // null-terminate string.
1295 : }
1296 : } else {
1297 : return false;
1298 : }
1299 6100 : Advance();
1300 6100 : first_part.Add(0); // null-terminate string.
1301 :
1302 6100 : if (second_part.is_empty()) {
1303 : // First attempt to interpret as general category property value name.
1304 2518 : const char* name = first_part.ToConstVector().start();
1305 2518 : if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1306 2518 : result, zone())) {
1307 : return true;
1308 : }
1309 : // Interpret "Any", "ASCII", and "Assigned".
1310 2080 : if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1311 : return true;
1312 : }
1313 : // Then attempt to interpret as binary property name with value name 'Y'.
1314 1792 : UProperty property = u_getPropertyEnum(name);
1315 1792 : if (!IsSupportedBinaryProperty(property)) return false;
1316 1050 : if (!IsExactPropertyAlias(name, property)) return false;
1317 : return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1318 1050 : zone());
1319 : } else {
1320 : // Both property name and value name are specified. Attempt to interpret
1321 : // the property name as enumerated property.
1322 3582 : const char* property_name = first_part.ToConstVector().start();
1323 3582 : const char* value_name = second_part.ToConstVector().start();
1324 3582 : UProperty property = u_getPropertyEnum(property_name);
1325 3582 : if (!IsExactPropertyAlias(property_name, property)) return false;
1326 3470 : if (property == UCHAR_GENERAL_CATEGORY) {
1327 : // We want to allow aggregate value names such as "Letter".
1328 : property = UCHAR_GENERAL_CATEGORY_MASK;
1329 6844 : } else if (property != UCHAR_SCRIPT &&
1330 3422 : property != UCHAR_SCRIPT_EXTENSIONS) {
1331 : return false;
1332 : }
1333 : return LookupPropertyValueName(property, value_name, negate, result,
1334 3246 : zone());
1335 : }
1336 : }
1337 :
1338 : #else // V8_INTL_SUPPORT
1339 :
1340 : bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1341 : bool negate) {
1342 : return false;
1343 : }
1344 :
1345 : #endif // V8_INTL_SUPPORT
1346 :
1347 11219 : bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1348 : uc32 x = 0;
1349 : int d = HexValue(current());
1350 1933 : if (d < 0) {
1351 : return false;
1352 : }
1353 11219 : while (d >= 0) {
1354 9328 : x = x * 16 + d;
1355 9328 : if (x > max_value) {
1356 : return false;
1357 : }
1358 9286 : Advance();
1359 : d = HexValue(current());
1360 : }
1361 1891 : *value = x;
1362 1891 : return true;
1363 : }
1364 :
1365 :
1366 51400 : uc32 RegExpParser::ParseClassCharacterEscape() {
1367 : DCHECK(current() == '\\');
1368 : DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1369 22755 : Advance();
1370 22755 : switch (current()) {
1371 : case 'b':
1372 16 : Advance();
1373 16 : return '\b';
1374 : // ControlEscape :: one of
1375 : // f n r t v
1376 : case 'f':
1377 0 : Advance();
1378 0 : return '\f';
1379 : case 'n':
1380 487 : Advance();
1381 487 : return '\n';
1382 : case 'r':
1383 56 : Advance();
1384 56 : return '\r';
1385 : case 't':
1386 190 : Advance();
1387 190 : return '\t';
1388 : case 'v':
1389 13 : Advance();
1390 13 : return '\v';
1391 : case 'c': {
1392 458 : uc32 controlLetter = Next();
1393 458 : uc32 letter = controlLetter & ~('A' ^ 'a');
1394 : // Inside a character class, we also accept digits and underscore as
1395 : // control characters, unless with /u. See Annex B:
1396 : // ES#prod-annexB-ClassControlLetter
1397 458 : if (letter >= 'A' && letter <= 'Z') {
1398 : Advance(2);
1399 : // Control letters mapped to ASCII control characters in the range
1400 : // 0x00-0x1f.
1401 70 : return controlLetter & 0x1f;
1402 : }
1403 388 : if (unicode()) {
1404 : // With /u, invalid escapes are not treated as identity escapes.
1405 28 : ReportError(CStrVector("Invalid class escape"));
1406 28 : return 0;
1407 : }
1408 720 : if ((controlLetter >= '0' && controlLetter <= '9') ||
1409 360 : controlLetter == '_') {
1410 : Advance(2);
1411 236 : return controlLetter & 0x1f;
1412 : }
1413 : // We match JSC in reading the backslash as a literal
1414 : // character instead of as starting an escape.
1415 : // TODO(v8:6201): Not yet covered by the spec.
1416 : return '\\';
1417 : }
1418 : case '0':
1419 : // With /u, \0 is interpreted as NUL if not followed by another digit.
1420 1022 : if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1421 42 : Advance();
1422 42 : return 0;
1423 : }
1424 : // Fall through.
1425 : case '1':
1426 : case '2':
1427 : case '3':
1428 : case '4':
1429 : case '5':
1430 : case '6':
1431 : case '7':
1432 : // For compatibility, we interpret a decimal escape that isn't
1433 : // a back reference (and therefore either \0 or not valid according
1434 : // to the specification) as a 1..3 digit octal character code.
1435 : // ES#prod-annexB-LegacyOctalEscapeSequence
1436 1228 : if (unicode()) {
1437 : // With /u, decimal escape is not interpreted as octal character code.
1438 56 : ReportError(CStrVector("Invalid class escape"));
1439 56 : return 0;
1440 : }
1441 1172 : return ParseOctalLiteral();
1442 : case 'x': {
1443 1844 : Advance();
1444 : uc32 value;
1445 1844 : if (ParseHexEscape(2, &value)) return value;
1446 36 : if (unicode()) {
1447 : // With /u, invalid escapes are not treated as identity escapes.
1448 0 : ReportError(CStrVector("Invalid escape"));
1449 0 : return 0;
1450 : }
1451 : // If \x is not followed by a two-digit hexadecimal, treat it
1452 : // as an identity escape.
1453 : return 'x';
1454 : }
1455 : case 'u': {
1456 15229 : Advance();
1457 : uc32 value;
1458 15229 : if (ParseUnicodeEscape(&value)) return value;
1459 24 : if (unicode()) {
1460 : // With /u, invalid escapes are not treated as identity escapes.
1461 0 : ReportError(CStrVector("Invalid unicode escape"));
1462 0 : return 0;
1463 : }
1464 : // If \u is not followed by a two-digit hexadecimal, treat it
1465 : // as an identity escape.
1466 : return 'u';
1467 : }
1468 : default: {
1469 : uc32 result = current();
1470 : // With /u, no identity escapes except for syntax characters and '-' are
1471 : // allowed. Otherwise, all identity escapes are allowed.
1472 3260 : if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1473 3178 : Advance();
1474 3178 : return result;
1475 : }
1476 14 : ReportError(CStrVector("Invalid escape"));
1477 14 : return 0;
1478 : }
1479 : }
1480 : return 0;
1481 : }
1482 :
1483 :
1484 435098 : CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1485 : DCHECK_EQ(0, *char_class);
1486 : uc32 first = current();
1487 435098 : if (first == '\\') {
1488 24837 : switch (Next()) {
1489 : case 'w':
1490 : case 'W':
1491 : case 'd':
1492 : case 'D':
1493 : case 's':
1494 : case 'S': {
1495 2082 : *char_class = Next();
1496 : Advance(2);
1497 : return CharacterRange::Singleton(0); // Return dummy value.
1498 : }
1499 : case kEndMarker:
1500 0 : return ReportError(CStrVector("\\ at end of pattern"));
1501 : default:
1502 22755 : first = ParseClassCharacterEscape(CHECK_FAILED);
1503 : }
1504 : } else {
1505 410261 : Advance();
1506 : }
1507 :
1508 : return CharacterRange::Singleton(first);
1509 : }
1510 :
1511 : static const uc16 kNoCharClass = 0;
1512 :
1513 : // Adds range or pre-defined character class to character ranges.
1514 : // If char_class is not kInvalidClass, it's interpreted as a class
1515 : // escape (i.e., 's' means whitespace, from '\s').
1516 73061 : static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1517 : uc16 char_class, CharacterRange range,
1518 : bool add_unicode_case_equivalents,
1519 : Zone* zone) {
1520 73061 : if (char_class != kNoCharClass) {
1521 : CharacterRange::AddClassEscape(char_class, ranges,
1522 2054 : add_unicode_case_equivalents, zone);
1523 : } else {
1524 : ranges->Add(range, zone);
1525 : }
1526 73061 : }
1527 :
1528 279111 : bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1529 254335 : if (!FLAG_harmony_regexp_property) return false;
1530 12544 : if (!unicode()) return false;
1531 12232 : if (current() != '\\') return false;
1532 12208 : uc32 next = Next();
1533 : bool parse_success = false;
1534 12208 : if (next == 'p') {
1535 : Advance(2);
1536 238 : parse_success = ParsePropertyClass(ranges, false);
1537 11970 : } else if (next == 'P') {
1538 : Advance(2);
1539 180 : parse_success = ParsePropertyClass(ranges, true);
1540 : } else {
1541 : return false;
1542 : }
1543 418 : if (!parse_success)
1544 70 : ReportError(CStrVector("Invalid property name in character class"));
1545 418 : return parse_success;
1546 : }
1547 :
1548 2536696 : RegExpTree* RegExpParser::ParseCharacterClass() {
1549 : static const char* kUnterminated = "Unterminated character class";
1550 : static const char* kRangeInvalid = "Invalid character class";
1551 : static const char* kRangeOutOfOrder = "Range out of order in character class";
1552 :
1553 : DCHECK_EQ(current(), '[');
1554 167245 : Advance();
1555 : bool is_negated = false;
1556 167245 : if (current() == '^') {
1557 : is_negated = true;
1558 12260 : Advance();
1559 : }
1560 : ZoneList<CharacterRange>* ranges =
1561 167245 : new (zone()) ZoneList<CharacterRange>(2, zone());
1562 168689 : bool add_unicode_case_equivalents = unicode() && ignore_case();
1563 1009269 : while (has_more() && current() != ']') {
1564 254608 : bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1565 254828 : if (parsed_property) continue;
1566 253916 : uc16 char_class = kNoCharClass;
1567 253916 : CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1568 253818 : if (current() == '-') {
1569 181470 : Advance();
1570 181470 : if (current() == kEndMarker) {
1571 : // If we reach the end we break out of the loop and let the
1572 : // following code report an error.
1573 : break;
1574 181463 : } else if (current() == ']') {
1575 : AddRangeOrEscape(ranges, char_class, first,
1576 281 : add_unicode_case_equivalents, zone());
1577 562 : ranges->Add(CharacterRange::Singleton('-'), zone());
1578 281 : break;
1579 : }
1580 181182 : uc16 char_class_2 = kNoCharClass;
1581 181286 : CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1582 181182 : if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1583 : // Either end is an escaped character class. Treat the '-' verbatim.
1584 244 : if (unicode()) {
1585 : // ES2015 21.2.2.15.1 step 1.
1586 56 : return ReportError(CStrVector(kRangeInvalid));
1587 : }
1588 : AddRangeOrEscape(ranges, char_class, first,
1589 216 : add_unicode_case_equivalents, zone());
1590 432 : ranges->Add(CharacterRange::Singleton('-'), zone());
1591 : AddRangeOrEscape(ranges, char_class_2, next,
1592 216 : add_unicode_case_equivalents, zone());
1593 216 : continue;
1594 : }
1595 : // ES2015 21.2.2.15.1 step 6.
1596 180938 : if (first.from() > next.to()) {
1597 152 : return ReportError(CStrVector(kRangeOutOfOrder));
1598 : }
1599 361724 : ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1600 : } else {
1601 : AddRangeOrEscape(ranges, char_class, first, add_unicode_case_equivalents,
1602 72348 : zone());
1603 : }
1604 : }
1605 166972 : if (!has_more()) {
1606 42 : return ReportError(CStrVector(kUnterminated));
1607 : }
1608 166951 : Advance();
1609 166951 : if (ranges->length() == 0) {
1610 746 : ranges->Add(CharacterRange::Everything(), zone());
1611 373 : is_negated = !is_negated;
1612 : }
1613 : RegExpCharacterClass::Flags flags;
1614 166951 : if (is_negated) flags = RegExpCharacterClass::NEGATED;
1615 : return new (zone()) RegExpCharacterClass(ranges, flags);
1616 : }
1617 :
1618 :
1619 : #undef CHECK_FAILED
1620 :
1621 :
1622 459845 : bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1623 : FlatStringReader* input, JSRegExp::Flags flags,
1624 : RegExpCompileData* result) {
1625 : DCHECK(result != NULL);
1626 459845 : RegExpParser parser(input, &result->error, flags, isolate, zone);
1627 459845 : RegExpTree* tree = parser.ParsePattern();
1628 459845 : if (parser.failed()) {
1629 : DCHECK(tree == NULL);
1630 : DCHECK(!result->error.is_null());
1631 : } else {
1632 : DCHECK(tree != NULL);
1633 : DCHECK(result->error.is_null());
1634 : if (FLAG_trace_regexp_parser) {
1635 : OFStream os(stdout);
1636 : tree->Print(os, zone);
1637 : os << "\n";
1638 : }
1639 456064 : result->tree = tree;
1640 456064 : int capture_count = parser.captures_started();
1641 720659 : result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1642 456064 : result->contains_anchor = parser.contains_anchor();
1643 456064 : result->capture_name_map = parser.CreateCaptureNameMap();
1644 456064 : result->capture_count = capture_count;
1645 : }
1646 459845 : return !parser.failed();
1647 : }
1648 :
1649 0 : RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1650 : : zone_(zone),
1651 : pending_empty_(false),
1652 : ignore_case_(ignore_case),
1653 : unicode_(unicode),
1654 : characters_(NULL),
1655 : pending_surrogate_(kNoPendingSurrogate),
1656 : terms_(),
1657 2143904 : alternatives_()
1658 : #ifdef DEBUG
1659 : ,
1660 : last_added_(ADD_NONE)
1661 : #endif
1662 : {
1663 0 : }
1664 :
1665 :
1666 0 : void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1667 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1668 : FlushPendingSurrogate();
1669 : // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1670 933 : pending_surrogate_ = lead_surrogate;
1671 0 : }
1672 :
1673 :
1674 3229 : void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1675 : DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1676 857 : if (pending_surrogate_ != kNoPendingSurrogate) {
1677 : uc16 lead_surrogate = pending_surrogate_;
1678 629 : pending_surrogate_ = kNoPendingSurrogate;
1679 : DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1680 : uc32 combined =
1681 : unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1682 629 : if (NeedsDesugaringForIgnoreCase(combined)) {
1683 36 : AddCharacterClassForDesugaring(combined);
1684 : } else {
1685 593 : ZoneList<uc16> surrogate_pair(2, zone());
1686 : surrogate_pair.Add(lead_surrogate, zone());
1687 : surrogate_pair.Add(trail_surrogate, zone());
1688 : RegExpAtom* atom =
1689 593 : new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1690 593 : AddAtom(atom);
1691 : }
1692 : } else {
1693 228 : pending_surrogate_ = trail_surrogate;
1694 : FlushPendingSurrogate();
1695 : }
1696 857 : }
1697 :
1698 :
1699 0 : void RegExpBuilder::FlushPendingSurrogate() {
1700 16835002 : if (pending_surrogate_ != kNoPendingSurrogate) {
1701 : DCHECK(unicode());
1702 532 : uc32 c = pending_surrogate_;
1703 532 : pending_surrogate_ = kNoPendingSurrogate;
1704 532 : AddCharacterClassForDesugaring(c);
1705 : }
1706 0 : }
1707 :
1708 :
1709 7522416 : void RegExpBuilder::FlushCharacters() {
1710 : FlushPendingSurrogate();
1711 6513422 : pending_empty_ = false;
1712 6513422 : if (characters_ != NULL) {
1713 504497 : RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1714 504497 : characters_ = NULL;
1715 504497 : text_.Add(atom, zone());
1716 : LAST(ADD_ATOM);
1717 : }
1718 6513422 : }
1719 :
1720 :
1721 6992585 : void RegExpBuilder::FlushText() {
1722 6293140 : FlushCharacters();
1723 : int num_text = text_.length();
1724 6293140 : if (num_text == 0) {
1725 6293140 : return;
1726 673132 : } else if (num_text == 1) {
1727 667142 : terms_.Add(text_.last(), zone());
1728 : } else {
1729 : RegExpText* text = new (zone()) RegExpText(zone());
1730 46636 : for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1731 5990 : terms_.Add(text, zone());
1732 : }
1733 : text_.Clear();
1734 : }
1735 :
1736 :
1737 18775796 : void RegExpBuilder::AddCharacter(uc16 c) {
1738 : FlushPendingSurrogate();
1739 8015278 : pending_empty_ = false;
1740 8015278 : if (NeedsDesugaringForIgnoreCase(c)) {
1741 250 : AddCharacterClassForDesugaring(c);
1742 : } else {
1743 8015028 : if (characters_ == NULL) {
1744 2745490 : characters_ = new (zone()) ZoneList<uc16>(4, zone());
1745 : }
1746 8015028 : characters_->Add(c, zone());
1747 : LAST(ADD_CHAR);
1748 : }
1749 8015278 : }
1750 :
1751 :
1752 15960099 : void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1753 7980364 : if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1754 : DCHECK(unicode());
1755 629 : AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1756 629 : AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1757 7982715 : } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1758 304 : AddLeadSurrogate(c);
1759 7982107 : } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1760 228 : AddTrailSurrogate(c);
1761 : } else {
1762 7979203 : AddCharacter(static_cast<uc16>(c));
1763 : }
1764 7980364 : }
1765 :
1766 1891 : void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1767 : // A lead or trail surrogate parsed via escape sequence will not
1768 : // pair up with any preceding lead or following trail surrogate.
1769 : FlushPendingSurrogate();
1770 1891 : AddUnicodeCharacter(character);
1771 : FlushPendingSurrogate();
1772 1891 : }
1773 :
1774 229 : void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1775 :
1776 :
1777 225496 : void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1778 225496 : if (NeedsDesugaringForUnicode(cc)) {
1779 : // With /u, character class needs to be desugared, so it
1780 : // must be a standalone term instead of being part of a RegExpText.
1781 5807 : AddTerm(cc);
1782 : } else {
1783 219689 : AddAtom(cc);
1784 : }
1785 225496 : }
1786 :
1787 1636 : void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1788 : AddTerm(new (zone()) RegExpCharacterClass(
1789 1636 : CharacterRange::List(zone(), CharacterRange::Singleton(c))));
1790 818 : }
1791 :
1792 :
1793 3821126 : void RegExpBuilder::AddAtom(RegExpTree* term) {
1794 1910563 : if (term->IsEmpty()) {
1795 : AddEmpty();
1796 1910563 : return;
1797 : }
1798 1910563 : if (term->IsTextElement()) {
1799 220282 : FlushCharacters();
1800 220282 : text_.Add(term, zone());
1801 : } else {
1802 1690281 : FlushText();
1803 1690281 : terms_.Add(term, zone());
1804 : }
1805 : LAST(ADD_ATOM);
1806 : }
1807 :
1808 :
1809 13250 : void RegExpBuilder::AddTerm(RegExpTree* term) {
1810 6625 : FlushText();
1811 6625 : terms_.Add(term, zone());
1812 : LAST(ADD_ATOM);
1813 6625 : }
1814 :
1815 :
1816 49998 : void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1817 25111 : FlushText();
1818 25111 : if (terms_.length() > 0 && terms_.last()->IsAssertion()) {
1819 : // Omit repeated assertions of the same type.
1820 860 : RegExpAssertion* last = terms_.last()->AsAssertion();
1821 860 : RegExpAssertion* next = assert->AsAssertion();
1822 25541 : if (last->assertion_type() == next->assertion_type()) return;
1823 : }
1824 24887 : terms_.Add(assert, zone());
1825 : LAST(ADD_ASSERT);
1826 : }
1827 :
1828 :
1829 152158 : void RegExpBuilder::NewAlternative() { FlushTerms(); }
1830 :
1831 :
1832 5199347 : void RegExpBuilder::FlushTerms() {
1833 2292471 : FlushText();
1834 : int num_terms = terms_.length();
1835 : RegExpTree* alternative;
1836 2292471 : if (num_terms == 0) {
1837 : alternative = new (zone()) RegExpEmpty();
1838 1830812 : } else if (num_terms == 1) {
1839 : alternative = terms_.last();
1840 : } else {
1841 152746 : alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1842 : }
1843 2292471 : alternatives_.Add(alternative, zone());
1844 : terms_.Clear();
1845 : LAST(ADD_NONE);
1846 2292471 : }
1847 :
1848 :
1849 240289 : bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1850 225496 : if (!unicode()) return false;
1851 : // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1852 : // necessarily mean that we need to desugar. It's probably nicer to have a
1853 : // separate pass to figure out unicode desugarings.
1854 7580 : if (ignore_case()) return true;
1855 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1856 7213 : CharacterRange::Canonicalize(ranges);
1857 19978 : for (int i = ranges->length() - 1; i >= 0; i--) {
1858 18205 : uc32 from = ranges->at(i).from();
1859 18205 : uc32 to = ranges->at(i).to();
1860 : // Check for non-BMP characters.
1861 18205 : if (to >= kNonBmpStart) return true;
1862 : // Check for lone surrogates.
1863 12897 : if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1864 : }
1865 : return false;
1866 : }
1867 :
1868 :
1869 8018996 : bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1870 : #ifdef V8_INTL_SUPPORT
1871 8018996 : if (unicode() && ignore_case()) {
1872 301 : icu::UnicodeSet set(c, c);
1873 301 : set.closeOver(USET_CASE_INSENSITIVE);
1874 301 : set.removeAllStrings();
1875 301 : return set.size() > 1;
1876 : }
1877 : // In the case where ICU is not included, we act as if the unicode flag is
1878 : // not set, and do not desugar.
1879 : #endif // V8_INTL_SUPPORT
1880 : return false;
1881 : }
1882 :
1883 :
1884 2201385 : RegExpTree* RegExpBuilder::ToRegExp() {
1885 2140313 : FlushTerms();
1886 : int num_alternatives = alternatives_.length();
1887 2140313 : if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1888 2140313 : if (num_alternatives == 1) return alternatives_.last();
1889 61072 : return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1890 : }
1891 :
1892 2301359 : bool RegExpBuilder::AddQuantifierToAtom(
1893 4565182 : int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1894 : FlushPendingSurrogate();
1895 2301359 : if (pending_empty_) {
1896 2 : pending_empty_ = false;
1897 2 : return true;
1898 : }
1899 : RegExpTree* atom;
1900 2301357 : if (characters_ != NULL) {
1901 : DCHECK(last_added_ == ADD_CHAR);
1902 : // Last atom was character.
1903 2240623 : Vector<const uc16> char_vector = characters_->ToConstVector();
1904 : int num_chars = char_vector.length();
1905 2240623 : if (num_chars > 1) {
1906 743 : Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1907 743 : text_.Add(new (zone()) RegExpAtom(prefix), zone());
1908 : char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1909 : }
1910 2240623 : characters_ = NULL;
1911 : atom = new (zone()) RegExpAtom(char_vector);
1912 2240623 : FlushText();
1913 60734 : } else if (text_.length() > 0) {
1914 : DCHECK(last_added_ == ADD_ATOM);
1915 38029 : atom = text_.RemoveLast();
1916 38029 : FlushText();
1917 22705 : } else if (terms_.length() > 0) {
1918 : DCHECK(last_added_ == ADD_ATOM);
1919 22705 : atom = terms_.RemoveLast();
1920 : // With /u, lookarounds are not quantifiable.
1921 22705 : if (unicode() && atom->IsLookaround()) return false;
1922 22675 : if (atom->max_match() == 0) {
1923 : // Guaranteed to only match an empty string.
1924 : LAST(ADD_TERM);
1925 349 : if (min == 0) {
1926 : return true;
1927 : }
1928 133 : terms_.Add(atom, zone());
1929 133 : return true;
1930 : }
1931 : } else {
1932 : // Only call immediately after adding an atom or character!
1933 0 : UNREACHABLE();
1934 : return false;
1935 : }
1936 2300978 : terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1937 2300978 : zone());
1938 : LAST(ADD_TERM);
1939 2300978 : return true;
1940 : }
1941 :
1942 : } // namespace internal
1943 : } // namespace v8
|