|           Line data    Source code 
       1             : // Copyright 2012 the V8 project authors. All rights reserved.
       2             : // Use of this source code is governed by a BSD-style license that can be
       3             : // found in the LICENSE file.
       4             : 
       5             : #include "src/regexp/jsregexp.h"
       6             : 
       7             : #include <memory>
       8             : 
       9             : #include "src/base/platform/platform.h"
      10             : #include "src/compilation-cache.h"
      11             : #include "src/elements.h"
      12             : #include "src/execution.h"
      13             : #include "src/factory.h"
      14             : #include "src/isolate-inl.h"
      15             : #include "src/messages.h"
      16             : #include "src/ostreams.h"
      17             : #include "src/regexp/interpreter-irregexp.h"
      18             : #include "src/regexp/jsregexp-inl.h"
      19             : #include "src/regexp/regexp-macro-assembler-irregexp.h"
      20             : #include "src/regexp/regexp-macro-assembler-tracer.h"
      21             : #include "src/regexp/regexp-macro-assembler.h"
      22             : #include "src/regexp/regexp-parser.h"
      23             : #include "src/regexp/regexp-stack.h"
      24             : #include "src/runtime/runtime.h"
      25             : #include "src/splay-tree-inl.h"
      26             : #include "src/string-search.h"
      27             : #include "src/unicode-decoder.h"
      28             : 
      29             : #ifdef V8_INTL_SUPPORT
      30             : #include "unicode/uniset.h"
      31             : #include "unicode/utypes.h"
      32             : #endif  // V8_INTL_SUPPORT
      33             : 
      34             : #ifndef V8_INTERPRETED_REGEXP
      35             : #if V8_TARGET_ARCH_IA32
      36             : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
      37             : #elif V8_TARGET_ARCH_X64
      38             : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
      39             : #elif V8_TARGET_ARCH_ARM64
      40             : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
      41             : #elif V8_TARGET_ARCH_ARM
      42             : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
      43             : #elif V8_TARGET_ARCH_PPC
      44             : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
      45             : #elif V8_TARGET_ARCH_S390
      46             : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
      47             : #elif V8_TARGET_ARCH_MIPS
      48             : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
      49             : #elif V8_TARGET_ARCH_MIPS64
      50             : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
      51             : #elif V8_TARGET_ARCH_X87
      52             : #include "src/regexp/x87/regexp-macro-assembler-x87.h"
      53             : #else
      54             : #error Unsupported target architecture.
      55             : #endif
      56             : #endif
      57             : 
      58             : 
      59             : namespace v8 {
      60             : namespace internal {
      61             : // Throws a SyntaxError built from MessageTemplate::kMalformedRegExp with |pattern| and |error_text| as arguments.
      62             : MUST_USE_RESULT
      63        4076 : static inline MaybeHandle<Object> ThrowRegExpException(
      64             :     Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
      65             :   Isolate* isolate = re->GetIsolate();  // |re| is used here only to find the isolate.
      66        8152 :   THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
      67             :                                           pattern, error_text),
      68             :                   Object);  // Object names the payload type of the returned MaybeHandle.
      69             : }
      70             : // Convenience overload: throws the malformed-regexp SyntaxError using the
      71             : // regexp's own source (re->Pattern()) as the pattern; returns nothing.
      72         456 : inline void ThrowRegExpException(Handle<JSRegExp> re,
      73             :                                  Handle<String> error_text) {
      74         456 :   USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));  // USE() discards the MUST_USE_RESULT value.
      75         456 : }
      76             : // Folds where |new_range| falls relative to |ranges| into |containment|. |ranges| is a boundary list of odd length
      77             : // ending in String::kMaxCodePoint + 1 (see DCHECKs); spans between boundaries alternate out/in, starting "out" at 0.
      78     1088516 : ContainedInLattice AddRange(ContainedInLattice containment,
      79             :                             const int* ranges,
      80             :                             int ranges_length,
      81             :                             Interval new_range) {
      82             :   DCHECK((ranges_length & 1) == 1);
      83             :   DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
      84     1088516 :   if (containment == kLatticeUnknown) return containment;  // Unknown is returned unchanged.
      85             :   bool inside = false;  // True when the span [last, ranges[i]) maps to kLatticeIn.
      86             :   int last = 0;  // Lower bound of the span currently examined.
      87     3700949 :   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
      88             :     // Consider the range from last to ranges[i].
      89             :     // We haven't got to the new range yet.
      90     4584555 :     if (ranges[i] <= new_range.from()) continue;
      91             :     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
      92             :     // inclusive, but the values in ranges are not.
      93      883606 :     if (last <= new_range.from() && new_range.to() < ranges[i]) {
      94     1723778 :       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
      95             :     }
      96             :     return kLatticeUnknown;  // |new_range| extends to or past the boundary ranges[i].
      97             :   }
      98             :   return containment;  // No boundary is greater than new_range.from().
      99             : }
     100             : 
     101             : // Tuning constants for HasFewDifferentCharacters(): the first caps how many leading pattern characters it examines.
     102             : // More makes code generation slower, less makes V8 benchmark score lower.
     103             : const int kMaxLookaheadForBoyerMoore = 8;
     104             : // Examined lengths <= this value are rejected. In a 3-character pattern you can maximally step forwards 3 characters
     105             : // at a time, which is not always enough to pay for the extra logic.
     106             : const int kPatternTooShortForBoyerMoore = 2;
     107             : 
     108             : // Returns true if the first Min(kMaxLookaheadForBoyerMoore, length) characters of |pattern| use few distinct values.
     109             : // Identifies the sort of regexps where the regexp engine is faster
     110             : // than the code used for atom matches.
     111      262562 : static bool HasFewDifferentCharacters(Handle<String> pattern) {
     112             :   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());  // Only this many leading characters are examined.
     113      262562 :   if (length <= kPatternTooShortForBoyerMoore) return false;
     114             :   const int kMod = 128;
     115             :   bool character_found[kMod];  // Indexed by character value modulo kMod.
     116             :   int different = 0;  // Number of distinct buckets seen so far.
     117             :   memset(&character_found[0], 0, sizeof(character_found));
     118      768996 :   for (int i = 0; i < length; i++) {
     119      768884 :     int ch = (pattern->Get(i) & (kMod - 1));  // Keep the low 7 bits; characters equal modulo 128 share a bucket.
     120      768884 :     if (!character_found[ch]) {
     121      768428 :       character_found[ch] = true;
     122      768428 :       different++;
     123             :       // We declare a regexp low-alphabet if it has at least 3 times as many
     124             :       // characters as it has different characters.
     125      768428 :       if (different * 3 > length) return false;
     126             :     }
     127             :   }
     128             :   return true;
     129             : }
     130             : 
     131             : 
     132             : // Generic RegExp methods. Dispatches to implementation specific methods.
     133             : // Parses |pattern| under |flags| and installs data on |re|: ATOM data for plain-string patterns, otherwise Irregexp
     134             : // data. Uses and fills the isolate's regexp compilation cache. Returns |re|, or the thrown error if parsing fails.
     135      719988 : MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
     136             :                                         Handle<String> pattern,
     137             :                                         JSRegExp::Flags flags) {
     138             :   DCHECK(pattern->IsFlat());
     139             : 
     140     1439976 :   Isolate* isolate = re->GetIsolate();
     141      719988 :   Zone zone(isolate->allocator(), ZONE_NAME);
     142             :   CompilationCache* compilation_cache = isolate->compilation_cache();
     143             :   MaybeHandle<FixedArray> maybe_cached =
     144      719988 :       compilation_cache->LookupRegExp(pattern, flags);
     145             :   Handle<FixedArray> cached;
     146      719988 :   if (maybe_cached.ToHandle(&cached)) {
     147      356415 :     re->set_data(*cached);  // Cache hit: reuse the cached data array as-is.
     148             :     return re;
     149             :   }
     150             :   // Cache miss: parse the pattern and decide between the ATOM and IRREGEXP representations.
     151             :   PostponeInterruptsScope postpone(isolate);
     152             :   RegExpCompileData parse_result;
     153      363573 :   FlatStringReader reader(isolate, pattern);
     154      363573 :   if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
     155      363573 :                                  &parse_result)) {
     156             :     // Throw an exception if we fail to parse the pattern.
     157        3562 :     return ThrowRegExpException(re, pattern, parse_result.error);
     158             :   }
     159             : 
     160             :   bool has_been_compiled = false;  // Set once ATOM data has been installed.
     161             :   // Both ATOM paths require that neither kIgnoreCase nor kSticky is set.
     162      870650 :   if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
     163      615020 :       !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
     164             :     // Parse-tree is a single atom that is equal to the pattern.
     165      254957 :     AtomCompile(re, pattern, flags, pattern);
     166             :     has_been_compiled = true;
     167      225918 :   } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
     168      112621 :              !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
     169        7553 :     RegExpAtom* atom = parse_result.tree->AsAtom();
     170        7553 :     Vector<const uc16> atom_pattern = atom->data();  // The atom's text, which is searched for instead of |pattern|.
     171             :     Handle<String> atom_string;
     172       15106 :     ASSIGN_RETURN_ON_EXCEPTION(
     173             :         isolate, atom_string,
     174             :         isolate->factory()->NewStringFromTwoByte(atom_pattern),
     175             :         Object);
     176        7553 :     if (!HasFewDifferentCharacters(atom_string)) {
     177        7493 :       AtomCompile(re, pattern, flags, atom_string);
     178             :       has_been_compiled = true;
     179             :     }
     180             :   }
     181      360011 :   if (!has_been_compiled) {
     182       97561 :     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);  // Only sets up data; no code is compiled here.
     183             :   }
     184             :   DCHECK(re->data()->IsFixedArray());
     185             :   // Compilation succeeded so the data is set on the regexp
     186             :   // and we can store it in the cache.
     187             :   Handle<FixedArray> data(FixedArray::cast(re->data()));
     188      360011 :   compilation_cache->PutRegExp(pattern, flags, data);
     189             : 
     190      719988 :   return re;
     191             : }
     192             : // Executes |regexp| on |subject| starting at |index|, dispatching on regexp->TypeTag() to AtomExec or IrregexpExec.
     193      611183 : MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
     194             :                                      Handle<String> subject, int index,
     195             :                                      Handle<RegExpMatchInfo> last_match_info) {
     196      611183 :   switch (regexp->TypeTag()) {
     197             :     case JSRegExp::ATOM:
     198      486154 :       return AtomExec(regexp, subject, index, last_match_info);
     199             :     case JSRegExp::IRREGEXP: {
     200      125029 :       return IrregexpExec(regexp, subject, index, last_match_info);
     201             :     }
     202             :     default:
     203           0 :       UNREACHABLE();  // Any other type tag is treated as unreachable.
     204             :       return MaybeHandle<Object>();
     205             :   }
     206             : }
     207             : 
     208             : 
     209             : // RegExp Atom implementation: Simple string search using indexOf.
     210             : 
     211             : // Stores ATOM data on |re| through the isolate factory: source |pattern|, |flags| and the literal |match_pattern|.
     212      262450 : void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
     213             :                              Handle<String> pattern,
     214             :                              JSRegExp::Flags flags,
     215             :                              Handle<String> match_pattern) {
     216             :   re->GetIsolate()->factory()->SetRegExpAtomData(re,
     217             :                                                  JSRegExp::ATOM,
     218             :                                                  pattern,
     219             :                                                  flags,
     220      262450 :                                                  match_pattern);
     221      262450 : }
     222             : // Records one atom match in |last_match_info|: two capture registers, capture 0 = |from|, capture 1 = |to|.
     223      330120 : static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,
     224             :                                String* subject, int from, int to) {
     225             :   SealHandleScope shs(last_match_info->GetIsolate());
     226             :   last_match_info->SetNumberOfCaptureRegisters(2);
     227             :   last_match_info->SetLastSubject(subject);  // |subject| is stored as both last subject and last input.
     228             :   last_match_info->SetLastInput(subject);
     229             :   last_match_info->SetCapture(0, from);
     230             :   last_match_info->SetCapture(1, to);
     231      330120 : }
     232             : // Searches |subject| from |index| for the atom pattern, writing consecutive matches as (start, end) pairs to |output|.
     233             : // Returns RE_FAILURE if the needle cannot fit at |index|, otherwise the match count (at most output_size / 2).
     234      626544 : int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
     235             :                             Handle<String> subject,
     236             :                             int index,
     237             :                             int32_t* output,
     238             :                             int output_size) {
     239             :   Isolate* isolate = regexp->GetIsolate();
     240             : 
     241             :   DCHECK(0 <= index);
     242             :   DCHECK(index <= subject->length());
     243             : 
     244      626544 :   subject = String::Flatten(subject);
     245             :   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
     246             : 
     247             :   String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));  // The literal string to search for.
     248             :   int needle_len = needle->length();
     249             :   DCHECK(needle->IsFlat());
     250             :   DCHECK_LT(0, needle_len);
     251             : 
     252     1253088 :   if (index + needle_len > subject->length()) {
     253             :     return RegExpImpl::RE_FAILURE;  // Not enough characters left for even one match.
     254             :   }
     255             :   // Each iteration finds one match and fills one pair of output slots.
     256      471738 :   for (int i = 0; i < output_size; i += 2) {
     257      767514 :     String::FlatContent needle_content = needle->GetFlatContent();
     258      767514 :     String::FlatContent subject_content = subject->GetFlatContent();
     259             :     DCHECK(needle_content.IsFlat());
     260             :     DCHECK(subject_content.IsFlat());
     261             :     // dispatch on type of strings (one-byte vs. UC16, for both needle and subject)
     262             :     index =
     263      767514 :         (needle_content.IsOneByte()
     264             :              ? (subject_content.IsOneByte()
     265             :                     ? SearchString(isolate, subject_content.ToOneByteVector(),
     266             :                                    needle_content.ToOneByteVector(), index)
     267             :                     : SearchString(isolate, subject_content.ToUC16Vector(),
     268             :                                    needle_content.ToOneByteVector(), index))
     269             :              : (subject_content.IsOneByte()
     270             :                     ? SearchString(isolate, subject_content.ToOneByteVector(),
     271             :                                    needle_content.ToUC16Vector(), index)
     272             :                     : SearchString(isolate, subject_content.ToUC16Vector(),
     273      767514 :                                    needle_content.ToUC16Vector(), index)));
     274      767514 :     if (index == -1) {
     275      295776 :       return i / 2;  // Return number of matches. NOTE(review): 0 here presumably equals RE_FAILURE - confirm.
     276             :     } else {
     277      471738 :       output[i] = index;
     278      471738 :       output[i+1] = index + needle_len;
     279             :       index += needle_len;  // Resume after this match, so reported matches do not overlap.
     280             :     }
     281             :   }
     282      330120 :   return output_size / 2;  // Every output pair was filled.
     283             : }
     284             : // Runs one atom match via AtomExecRaw; returns null_value on RE_FAILURE, else updates and returns |last_match_info|.
     285      486154 : Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
     286             :                                     int index,
     287             :                                     Handle<RegExpMatchInfo> last_match_info) {
     288             :   Isolate* isolate = re->GetIsolate();
     289             : 
     290             :   static const int kNumRegisters = 2;  // One (start, end) pair, i.e. at most one match is requested.
     291             :   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
     292      486154 :   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
     293             : 
     294      486154 :   int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
     295             : 
     296      642188 :   if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
     297             : 
     298             :   DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
     299             :   SealHandleScope shs(isolate);
     300             :   SetAtomLastCapture(last_match_info, *subject, output_registers[0],
     301      660240 :                      output_registers[1]);
     302      330120 :   return last_match_info;
     303             : }
     304             : 
     305             : 
     306             : // Irregexp implementation.
     307             : 
     308             : // Ensures that |re| holds compiled code for subjects of the width selected
     309             : // by |is_one_byte| (one-byte vs. two-byte subject strings).
     310             : // Returns true at once if code is already installed, or after reinstating
     311             : // a copy kept at the saved-code index; otherwise the source pattern is
     312             : // compiled via CompileIrregexp(), passing |sample_subject| along.
     313             : // If compilation fails, an exception is thrown and this function returns false.
     314     1057858 : bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
     315             :                                         Handle<String> sample_subject,
     316             :                                         bool is_one_byte) {
     317             :   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
     318             : #ifdef V8_INTERPRETED_REGEXP
     319             :   if (compiled_code->IsByteArray()) return true;
     320             : #else  // V8_INTERPRETED_REGEXP (RegExp native code)
     321     1057858 :   if (compiled_code->IsCode()) return true;
     322             : #endif
     323             :   // We could potentially have marked this as flushable, but have kept
     324             :   // a saved version if we did not flush it yet.
     325             :   Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
     326       93313 :   if (saved_code->IsCode()) {
     327             :     // Reinstate the code in the original place.
     328             :     re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
     329             :     DCHECK(compiled_code->IsSmi());  // The code slot being overwritten held a Smi marker.
     330         625 :     return true;
     331             :   }
     332       92688 :   return CompileIrregexp(re, sample_subject, is_one_byte);
     333             : }
     334             : // Parses and compiles |re|'s pattern for the subject width given by |is_one_byte|, storing code, capture-name map
     335             : // and max register count in |re|'s data array. Returns false, after throwing, on a previous or new failure.
     336       92688 : bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
     337             :                                  Handle<String> sample_subject,
     338             :                                  bool is_one_byte) {
     339             :   // Compile the RegExp.
     340       92688 :   Isolate* isolate = re->GetIsolate();
     341       92688 :   Zone zone(isolate->allocator(), ZONE_NAME);
     342             :   PostponeInterruptsScope postpone(isolate);
     343             :   // Read the marker in the code slot. If it records an earlier compilation
     344             :   // error, the error message is kept at the saved code index (see below).
     345             :   Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
     346             :   // When arriving here entry can only be a smi, either representing an
     347             :   // uncompiled regexp, a previous compilation error, or code that has
     348             :   // been flushed.
     349             :   DCHECK(entry->IsSmi());
     350             :   int entry_value = Smi::cast(entry)->value();
     351             :   DCHECK(entry_value == JSRegExp::kUninitializedValue ||
     352             :          entry_value == JSRegExp::kCompilationErrorValue ||
     353             :          (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
     354             : 
     355       92688 :   if (entry_value == JSRegExp::kCompilationErrorValue) {
     356             :     // A previous compilation failed and threw an error which we store in
     357             :     // the saved code index (we store the error message, not the actual
     358             :     // error). Recreate the error object and throw it.
     359             :     Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
     360             :     DCHECK(error_string->IsString());
     361             :     Handle<String> error_message(String::cast(error_string));
     362           0 :     ThrowRegExpException(re, error_message);
     363             :     return false;
     364             :   }
     365             : 
     366       92688 :   JSRegExp::Flags flags = re->GetFlags();
     367             : 
     368             :   Handle<String> pattern(re->Pattern());
     369       92688 :   pattern = String::Flatten(pattern);  // Flattened before being handed to FlatStringReader.
     370             :   RegExpCompileData compile_data;
     371       92688 :   FlatStringReader reader(isolate, pattern);
     372       92688 :   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
     373       92688 :                                  &compile_data)) {
     374             :     // Throw an exception if we fail to parse the pattern.
     375             :     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
     376          58 :     USE(ThrowRegExpException(re, pattern, compile_data.error));
     377          58 :     return false;
     378             :   }
     379             :   RegExpEngine::CompilationResult result =
     380             :       RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
     381       92630 :                             sample_subject, is_one_byte);
     382       92630 :   if (result.error_message != NULL) {
     383             :     // Unable to compile regexp. NOTE(review): nothing here records kCompilationErrorValue; presumably done elsewhere.
     384             :     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
     385         912 :         CStrVector(result.error_message)).ToHandleChecked();
     386         456 :     ThrowRegExpException(re, error_message);
     387             :     return false;
     388             :   }
     389             :   // Success: publish the generated code and its metadata in the regexp's data array.
     390             :   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
     391      184348 :   data->set(JSRegExp::code_index(is_one_byte), result.code);
     392       92174 :   SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
     393             :   int register_max = IrregexpMaxRegisterCount(*data);
     394       92174 :   if (result.num_registers > register_max) {
     395             :     SetIrregexpMaxRegisterCount(*data, result.num_registers);  // The stored count only ever grows here.
     396             :   }
     397             : 
     398       92688 :   return true;
     399             : }
     400             : 
     401             : // Returns the Smi stored at JSRegExp::kIrregexpMaxRegisterCountIndex of the regexp data array |re| as an int.
     402           0 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
     403             :   return Smi::cast(
     404           0 :       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
     405             : }
     406             : 
     407             : // Stores |value| as a Smi at JSRegExp::kIrregexpMaxRegisterCountIndex of the regexp data array |re|.
     408           0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
     409             :   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
     410           0 : }
     411             : // Stores the capture-name map |value| in the data array |re|; a null handle is recorded as Smi::kZero.
     412       92174 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
     413             :                                            Handle<FixedArray> value) {
     414       92174 :   if (value.is_null()) {
     415             :     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
     416             :   } else {
     417         438 :     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
     418             :   }
     419       92174 : }
     420             : // Returns the Smi stored at JSRegExp::kIrregexpCaptureCountIndex of the regexp data array |re| as an int.
     421           0 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
     422           0 :   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
     423             : }
     424             : 
     425             : // Returns the register count of data array |re|; reads the same slot as IrregexpMaxRegisterCount().
     426           0 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
     427           0 :   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
     428             : }
     429             : 
     430             : // Returns the entry at JSRegExp::code_index(is_one_byte) of data array |re|, cast to ByteArray.
     431           0 : ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
     432           0 :   return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
     433             : }
     434             : 
     435             : // Returns the entry at JSRegExp::code_index(is_one_byte) of data array |re|, cast to Code.
     436           0 : Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
     437           0 :   return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
     438             : }
     439             : 
     440             : // Stores IRREGEXP data on |re| through the isolate factory: |pattern|, |flags| and |capture_count|.
     441       97561 : void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
     442             :                                     Handle<String> pattern,
     443             :                                     JSRegExp::Flags flags,
     444             :                                     int capture_count) {
     445             :   // Initialize compiled code entries to null. NOTE(review): CompileIrregexp expects Smi markers there - confirm.
     446             :   re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
     447             :                                                      JSRegExp::IRREGEXP,
     448             :                                                      pattern,
     449             :                                                      flags,
     450       97561 :                                                      capture_count);
     451       97561 : }
     452             : // Ensures compiled code exists for |subject|'s representation and returns how many output slots a match needs.
     453             : // Returns -1 if EnsureCompiledIrregexp() fails. |subject| must already be flat (see DCHECK).
     454      357478 : int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
     455             :                                 Handle<String> subject) {
     456             :   DCHECK(subject->IsFlat());
     457             : 
     458             :   // Check representation of the underlying storage.
     459      357478 :   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
     460      357478 :   if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
     461             : 
     462             : #ifdef V8_INTERPRETED_REGEXP
     463             :   // Byte-code regexp needs space allocated for all its registers.
     464             :   // The result captures are copied to the start of the registers array
     465             :   // if the match succeeds.  This way those registers are not clobbered
     466             :   // when we set the last match info from last successful match.
     467             :   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
     468             :          (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
     469             : #else  // V8_INTERPRETED_REGEXP
     470             :   // Native regexp only needs room to output captures. Registers are handled
     471             :   // internally. Two slots per capture, plus two more (the "+ 1") beyond the capture count.
     472      356964 :   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
     473             : #endif  // V8_INTERPRETED_REGEXP
     474             : }
     475             : 
     476             : 
     477      700380 : int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
     478             :                                 Handle<String> subject,
     479             :                                 int index,
     480             :                                 int32_t* output,
     481             :                                 int output_size) {
     482             :   Isolate* isolate = regexp->GetIsolate();
     483             : 
     484             :   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
     485             : 
     486             :   DCHECK(index >= 0);
     487             :   DCHECK(index <= subject->length());
     488             :   DCHECK(subject->IsFlat());
     489             : 
     490      700380 :   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
     491             : 
     492             : #ifndef V8_INTERPRETED_REGEXP
     493             :   DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
     494             :   do {
     495      700380 :     EnsureCompiledIrregexp(regexp, subject, is_one_byte);
     496             :     Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
     497             :     // The stack is used to allocate registers for the compiled regexp code.
     498             :     // This means that in case of failure, the output registers array is left
     499             :     // untouched and contains the capture results from the previous successful
     500             :     // match.  We can use that to set the last match info lazily.
     501             :     NativeRegExpMacroAssembler::Result res =
     502             :         NativeRegExpMacroAssembler::Match(code,
     503             :                                           subject,
     504             :                                           output,
     505             :                                           output_size,
     506             :                                           index,
     507      700380 :                                           isolate);
     508      700380 :     if (res != NativeRegExpMacroAssembler::RETRY) {
     509             :       DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
     510             :              isolate->has_pending_exception());
     511             :       STATIC_ASSERT(
     512             :           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
     513             :       STATIC_ASSERT(
     514             :           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
     515             :       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
     516             :                     == RE_EXCEPTION);
     517      700380 :       return static_cast<IrregexpResult>(res);
     518             :     }
     519             :     // If result is RETRY, the string has changed representation, and we
     520             :     // must restart from scratch.
     521             :     // In this case, it means we must make sure we are prepared to handle
     522             :     // the, potentially, different subject (the string can switch between
     523             :     // being internal and external, and even between being Latin1 and UC16,
     524             :     // but the characters are always the same).
     525           0 :     IrregexpPrepare(regexp, subject);
     526           0 :     is_one_byte = subject->IsOneByteRepresentationUnderneath();
     527             :   } while (true);
     528             :   UNREACHABLE();
     529           0 :   return RE_EXCEPTION;
     530             : #else  // V8_INTERPRETED_REGEXP
     531             : 
     532             :   DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
     533             :   // We must have done EnsureCompiledIrregexp, so we can get the number of
     534             :   // registers.
     535             :   int number_of_capture_registers =
     536             :       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
     537             :   int32_t* raw_output = &output[number_of_capture_registers];
     538             :   // We do not touch the actual capture result registers until we know there
     539             :   // has been a match so that we can use those capture results to set the
     540             :   // last match info.
     541             :   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
     542             :     raw_output[i] = -1;
     543             :   }
     544             :   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
     545             :                                isolate);
     546             : 
     547             :   IrregexpResult result = IrregexpInterpreter::Match(isolate,
     548             :                                                      byte_codes,
     549             :                                                      subject,
     550             :                                                      raw_output,
     551             :                                                      index);
     552             :   if (result == RE_SUCCESS) {
     553             :     // Copy capture results to the start of the registers array.
     554             :     MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
     555             :   }
     556             :   if (result == RE_EXCEPTION) {
     557             :     DCHECK(!isolate->has_pending_exception());
     558             :     isolate->StackOverflow();
     559             :   }
     560             :   return result;
     561             : #endif  // V8_INTERPRETED_REGEXP
     562             : }
     563             : 
     564      125029 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
     565             :     Handle<JSRegExp> regexp, Handle<String> subject, int previous_index,
     566             :     Handle<RegExpMatchInfo> last_match_info) {
     567             :   Isolate* isolate = regexp->GetIsolate();
     568             :   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
     569             : 
     570      125029 :   subject = String::Flatten(subject);
     571             : 
     572             :   // Prepare space for the return values.
     573             : #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
     574             :   if (FLAG_trace_regexp_bytecodes) {
     575             :     String* pattern = regexp->Pattern();
     576             :     PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
     577             :     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
     578             :   }
     579             : #endif
     580      125029 :   int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
     581      125029 :   if (required_registers < 0) {
     582             :     // Compiling failed with an exception.
     583             :     DCHECK(isolate->has_pending_exception());
     584             :     return MaybeHandle<Object>();
     585             :   }
     586             : 
     587             :   int32_t* output_registers = NULL;
     588      124760 :   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
     589        2422 :     output_registers = NewArray<int32_t>(required_registers);
     590             :   }
     591             :   std::unique_ptr<int32_t[]> auto_release(output_registers);
     592      124760 :   if (output_registers == NULL) {
     593      122338 :     output_registers = isolate->jsregexp_static_offsets_vector();
     594             :   }
     595             : 
     596             :   int res = RegExpImpl::IrregexpExecRaw(
     597      124760 :       regexp, subject, previous_index, output_registers, required_registers);
     598      124760 :   if (res == RE_SUCCESS) {
     599             :     int capture_count =
     600             :         IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
     601             :     return SetLastMatchInfo(
     602       97638 :         last_match_info, subject, capture_count, output_registers);
     603             :   }
     604       27122 :   if (res == RE_EXCEPTION) {
     605             :     DCHECK(isolate->has_pending_exception());
     606             :     return MaybeHandle<Object>();
     607             :   }
     608             :   DCHECK(res == RE_FAILURE);
     609             :   return isolate->factory()->null_value();
     610             : }
     611             : 
     612      259838 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
     613             :     Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
     614             :     int capture_count, int32_t* match) {
     615             :   // This is the only place where match infos can grow. If, after executing the
     616             :   // regexp, RegExpExecStub finds that the match info is too small, it restarts
     617             :   // execution in RegExpImpl::Exec, which finally grows the match info right
     618             :   // here.
     619             : 
     620      259838 :   int capture_register_count = (capture_count + 1) * 2;
     621             :   Handle<RegExpMatchInfo> result =
     622      259838 :       RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count);
     623             :   result->SetNumberOfCaptureRegisters(capture_register_count);
     624             : 
     625      259838 :   if (*result != *last_match_info) {
     626             :     // The match info has been reallocated, update the corresponding reference
     627             :     // on the native context.
     628             :     Isolate* isolate = last_match_info->GetIsolate();
     629        9720 :     if (*last_match_info == *isolate->regexp_last_match_info()) {
     630        6188 :       isolate->native_context()->set_regexp_last_match_info(*result);
     631        3532 :     } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
     632        3532 :       isolate->native_context()->set_regexp_internal_match_info(*result);
     633             :     }
     634             :   }
     635             : 
     636             :   DisallowHeapAllocation no_allocation;
     637      259838 :   if (match != NULL) {
     638     1176454 :     for (int i = 0; i < capture_register_count; i += 2) {
     639     1176454 :       result->SetCapture(i, match[i]);
     640     1176454 :       result->SetCapture(i + 1, match[i + 1]);
     641             :     }
     642             :   }
     643             :   result->SetLastSubject(*subject);
     644             :   result->SetLastInput(*subject);
     645      259838 :   return result;
     646             : }
     647             : 
     648             : 
     649      361004 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
     650             :                                      Handle<String> subject,
     651             :                                      Isolate* isolate)
     652             :   : register_array_(NULL),
     653             :     register_array_size_(0),
     654             :     regexp_(regexp),
     655      361004 :     subject_(subject) {
     656             : #ifdef V8_INTERPRETED_REGEXP
     657             :   bool interpreted = true;
     658             : #else
     659             :   bool interpreted = false;
     660             : #endif  // V8_INTERPRETED_REGEXP
     661             : 
     662      361004 :   if (regexp_->TypeTag() == JSRegExp::ATOM) {
     663             :     static const int kAtomRegistersPerMatch = 2;
     664      140390 :     registers_per_match_ = kAtomRegistersPerMatch;
     665             :     // There is no distinction between interpreted and native for atom regexps.
     666             :     interpreted = false;
     667             :   } else {
     668      220614 :     registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
     669      220614 :     if (registers_per_match_ < 0) {
     670         117 :       num_matches_ = -1;  // Signal exception.
     671      361121 :       return;
     672             :     }
     673             :   }
     674             : 
     675             :   DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
     676             :   if (!interpreted) {
     677             :     register_array_size_ =
     678      721774 :         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
     679      360887 :     max_matches_ = register_array_size_ / registers_per_match_;
     680             :   } else {
     681             :     // Global loop in interpreted regexp is not implemented.  We choose
     682             :     // the size of the offsets vector so that it can only store one match.
     683             :     register_array_size_ = registers_per_match_;
     684             :     max_matches_ = 1;
     685             :   }
     686             : 
     687      360887 :   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
     688        1764 :     register_array_ = NewArray<int32_t>(register_array_size_);
     689             :   } else {
     690      359123 :     register_array_ = isolate->jsregexp_static_offsets_vector();
     691             :   }
     692             : 
     693             :   // Set state so that fetching the results the first time triggers a call
     694             :   // to the compiled regexp.
     695      360887 :   current_match_index_ = max_matches_ - 1;
     696      360887 :   num_matches_ = max_matches_;
     697             :   DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
     698             :   DCHECK_GE(register_array_size_, registers_per_match_);
     699             :   int32_t* last_match =
     700      360887 :       ®ister_array_[current_match_index_ * registers_per_match_];
     701      360887 :   last_match[0] = -1;
     702      360887 :   last_match[1] = 0;
     703             : }
     704             : 
     705         449 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
     706         883 :   if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
     707         868 :       last_index + 1 < subject_->length() &&
     708        1317 :       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
     709         434 :       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
     710             :     // Advance over the surrogate pair.
     711         434 :     return last_index + 2;
     712             :   }
     713          15 :   return last_index + 1;
     714             : }
     715             : 
     716             : // -------------------------------------------------------------------
     717             : // Implementation of the Irregexp regular expression engine.
     718             : //
     719             : // The Irregexp regular expression engine is intended to be a complete
     720             : // implementation of ECMAScript regular expressions.  It generates either
     721             : // bytecodes or native code.
     722             : 
     723             : //   The Irregexp regexp engine is structured in three steps.
     724             : //   1) The parser generates an abstract syntax tree.  See ast.cc.
     725             : //   2) From the AST a node network is created.  The nodes are all
     726             : //      subclasses of RegExpNode.  The nodes represent states when
     727             : //      executing a regular expression.  Several optimizations are
     728             : //      performed on the node network.
     729             : //   3) From the nodes we generate either byte codes or native code
     730             : //      that can actually execute the regular expression (perform
     731             : //      the search).  The code generation step is described in more
     732             : //      detail below.
     733             : 
     734             : // Code generation.
     735             : //
     736             : //   The nodes are divided into four main categories.
     737             : //   * Choice nodes
     738             : //        These represent places where the regular expression can
     739             : //        match in more than one way.  For example on entry to an
     740             : //        alternation (foo|bar) or a repetition (*, +, ? or {}).
     741             : //   * Action nodes
     742             : //        These represent places where some action should be
     743             : //        performed.  Examples include recording the current position
     744             : //        in the input string to a register (in order to implement
     745             : //        captures) or other actions on register for example in order
     746             : //        to implement the counters needed for {} repetitions.
     747             : //   * Matching nodes
     748             : //        These attempt to match some element part of the input string.
     749             : //        Examples of elements include character classes, plain strings
     750             : //        or back references.
     751             : //   * End nodes
     752             : //        These are used to implement the actions required on finding
     753             : //        a successful match or failing to find a match.
     754             : //
     755             : //   The code generated (whether as byte codes or native code) maintains
     756             : //   some state as it runs.  This consists of the following elements:
     757             : //
     758             : //   * The capture registers.  Used for string captures.
     759             : //   * Other registers.  Used for counters etc.
     760             : //   * The current position.
     761             : //   * The stack of backtracking information.  Used when a matching node
     762             : //     fails to find a match and needs to try an alternative.
     763             : //
     764             : // Conceptual regular expression execution model:
     765             : //
     766             : //   There is a simple conceptual model of regular expression execution
     767             : //   which will be presented first.  The actual code generated is a more
     768             : //   efficient simulation of the simple conceptual model:
     769             : //
     770             : //   * Choice nodes are implemented as follows:
     771             : //     For each choice except the last {
     772             : //       push current position
     773             : //       push backtrack code location
     774             : //       <generate code to test for choice>
     775             : //       backtrack code location:
     776             : //       pop current position
     777             : //     }
     778             : //     <generate code to test for last choice>
     779             : //
     780             : //   * Action nodes are generated as follows
     781             : //     <push affected registers on backtrack stack>
     782             : //     <generate code to perform action>
     783             : //     push backtrack code location
     784             : //     <generate code to test for following nodes>
     785             : //     backtrack code location:
     786             : //     <pop affected registers to restore their state>
     787             : //     <pop backtrack location from stack and go to it>
     788             : //
     789             : //   * Matching nodes are generated as follows:
     790             : //     if input string matches at current position
     791             : //       update current position
     792             : //       <generate code to test for following nodes>
     793             : //     else
     794             : //       <pop backtrack location from stack and go to it>
     795             : //
     796             : //   Thus it can be seen that the current position is saved and restored
     797             : //   by the choice nodes, whereas the registers are saved and restored by
     798             : //   the action nodes that manipulate them.
     799             : //
     800             : //   The other interesting aspect of this model is that nodes are generated
     801             : //   at the point where they are needed by a recursive call to Emit().  If
     802             : //   the node has already been code generated then the Emit() call will
     803             : //   generate a jump to the previously generated code instead.  In order to
     804             : //   limit recursion it is possible for the Emit() function to put the node
     805             : //   on a work list for later generation and instead generate a jump.  The
     806             : //   destination of the jump is resolved later when the code is generated.
     807             : //
     808             : // Actual regular expression code generation.
     809             : //
     810             : //   Code generation is actually more complicated than the above.  In order
     811             : //   to improve the efficiency of the generated code some optimizations are
     812             : //   performed
     813             : //
     814             : //   * Choice nodes have 1-character lookahead.
     815             : //     A choice node looks at the following character and eliminates some of
     816             : //     the choices immediately based on that character.  This is not yet
     817             : //     implemented.
     818             : //   * Simple greedy loops store reduced backtracking information.
     819             : //     A quantifier like /.*foo/m will greedily match the whole input.  It will
     820             : //     then need to backtrack to a point where it can match "foo".  The naive
     821             : //     implementation of this would push each character position onto the
     822             : //     backtracking stack, then pop them off one by one.  This would use space
     823             : //     proportional to the length of the input string.  However since the "."
     824             : //     can only match in one way and always has a constant length (in this case
     825             : //     of 1) it suffices to store the current position on the top of the stack
     826             : //     once.  Matching now becomes merely incrementing the current position and
     827             : //     backtracking becomes decrementing the current position and checking the
     828             : //     result against the stored current position.  This is faster and saves
     829             : //     space.
     830             : //   * The current state is virtualized.
     831             : //     This is used to defer expensive operations until it is clear that they
     832             : //     are needed and to generate code for a node more than once, allowing
     833             : //     specialized and efficient versions of the code to be created. This is
     834             : //     explained in the section below.
     835             : //
     836             : // Execution state virtualization.
     837             : //
     838             : //   Instead of emitting code, nodes that manipulate the state can record their
     839             : //   manipulation in an object called the Trace.  The Trace object can record a
     840             : //   current position offset, an optional backtrack code location on the top of
     841             : //   the virtualized backtrack stack and some register changes.  When a node is
     842             : //   to be emitted it can flush the Trace or update it.  Flushing the Trace
     843             : //   will emit code to bring the actual state into line with the virtual state.
     844             : //   Avoiding flushing the state can postpone some work (e.g. updates of capture
     845             : //   registers).  Postponing work can save time when executing the regular
     846             : //   expression since it may be found that the work never has to be done as a
     847             : //   failure to match can occur.  In addition it is much faster to jump to a
     848             : //   known backtrack code location than it is to pop an unknown backtrack
     849             : //   location from the stack and jump there.
     850             : //
     851             : //   The virtual state found in the Trace affects code generation.  For example
     852             : //   the virtual state contains the difference between the actual current
     853             : //   position and the virtual current position, and matching code needs to use
     854             : //   this offset to attempt a match in the correct location of the input
     855             : //   string.  Therefore code generated for a non-trivial trace is specialized
     856             : //   to that trace.  The code generator therefore has the ability to generate
     857             : //   code for each node several times.  In order to limit the size of the
     858             : //   generated code there is an arbitrary limit on how many specialized sets of
     859             : //   code may be generated for a given node.  If the limit is reached, the
     860             : //   trace is flushed and a generic version of the code for a node is emitted.
     861             : //   This is subsequently used for that node.  The code emitted for non-generic
     862             : //   trace is not recorded in the node and so it cannot currently be reused in
     863             : //   the event that code generation is requested for an identical trace.
     864             : 
     865             : 
     866           0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
     867           0 :   UNREACHABLE();
     868             : }
     869             : 
     870             : 
     871        9319 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
     872        9319 :   text->AddElement(TextElement::Atom(this), zone);
     873        9319 : }
     874             : 
     875             : 
     876       11004 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
     877       11004 :   text->AddElement(TextElement::CharClass(this), zone);
     878       11004 : }
     879             : 
     880             : 
     881           0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
     882           0 :   for (int i = 0; i < elements()->length(); i++)
     883           0 :     text->AddElement(elements()->at(i), zone);
     884           0 : }
     885             : 
     886             : 
     887           0 : TextElement TextElement::Atom(RegExpAtom* atom) {
     888           0 :   return TextElement(ATOM, atom);
     889             : }
     890             : 
     891             : 
     892           0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
     893           0 :   return TextElement(CHAR_CLASS, char_class);
     894             : }
     895             : 
     896             : 
     897     9570053 : int TextElement::length() const {
     898     9570053 :   switch (text_type()) {
     899             :     case ATOM:
     900     8627839 :       return atom()->length();
     901             : 
     902             :     case CHAR_CLASS:
     903             :       return 1;
     904             :   }
     905           0 :   UNREACHABLE();
     906             :   return 0;
     907             : }
     908             : 
     909             : 
     910           0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
     911           0 :   if (table_ == NULL) {
     912           0 :     table_ = new(zone()) DispatchTable(zone());
     913             :     DispatchTableConstructor cons(table_, ignore_case, zone());
     914           0 :     cons.BuildTable(this);
     915             :   }
     916           0 :   return table_;
     917             : }
     918             : 
     919             : 
     920             : class FrequencyCollator {
     921             :  public:
     922    11948238 :   FrequencyCollator() : total_samples_(0) {
     923    11855616 :     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
     924    11855616 :       frequencies_[i] = CharacterFrequency(i);
     925             :     }
     926             :   }
     927             : 
     928             :   void CountCharacter(int character) {
     929      640498 :     int index = (character & RegExpMacroAssembler::kTableMask);
     930      640498 :     frequencies_[index].Increment();
     931      640498 :     total_samples_++;
     932             :   }
     933             : 
     934             :   // Does not measure in percent, but rather per-128 (the table size from the
     935             :   // regexp macro assembler).
     936             :   int Frequency(int in_character) {
     937             :     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
     938      514944 :     if (total_samples_ < 1) return 1;  // Division by zero.
     939             :     int freq_in_per128 =
     940      514604 :         (frequencies_[in_character].counter() * 128) / total_samples_;
     941             :     return freq_in_per128;
     942             :   }
     943             : 
     944             :  private:
     945             :   class CharacterFrequency {
     946             :    public:
     947    11855616 :     CharacterFrequency() : counter_(0), character_(-1) { }
     948             :     explicit CharacterFrequency(int character)
     949             :         : counter_(0), character_(character) { }
     950             : 
     951      640498 :     void Increment() { counter_++; }
     952             :     int counter() { return counter_; }
     953             :     int character() { return character_; }
     954             : 
     955             :    private:
     956             :     int counter_;
     957             :     int character_;
     958             :   };
     959             : 
     960             : 
     961             :  private:
     962             :   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
     963             :   int total_samples_;
     964             : };
     965             : 
     966             : 
     967             : class RegExpCompiler {
     968             :  public:
     969             :   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
     970             :                  JSRegExp::Flags flags, bool is_one_byte);
     971             : 
     972             :   int AllocateRegister() {
     973     1513575 :     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
     974      517005 :       reg_exp_too_big_ = true;
     975             :       return next_register_;
     976             :     }
     977      996570 :     return next_register_++;
     978             :   }
     979             : 
     980             :   // Lookarounds to match lone surrogates for unicode character class matches
     981             :   // are never nested. We can therefore reuse registers.
     982             :   int UnicodeLookaroundStackRegister() {
     983        3148 :     if (unicode_lookaround_stack_register_ == kNoRegister) {
     984        1353 :       unicode_lookaround_stack_register_ = AllocateRegister();
     985             :     }
     986        3148 :     return unicode_lookaround_stack_register_;
     987             :   }
     988             : 
     989             :   int UnicodeLookaroundPositionRegister() {
     990        3148 :     if (unicode_lookaround_position_register_ == kNoRegister) {
     991        1353 :       unicode_lookaround_position_register_ = AllocateRegister();
     992             :     }
     993        3148 :     return unicode_lookaround_position_register_;
     994             :   }
     995             : 
     996             :   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
     997             :                                            RegExpNode* start,
     998             :                                            int capture_count,
     999             :                                            Handle<String> pattern);
    1000             : 
    1001      679160 :   inline void AddWork(RegExpNode* node) {
    1002      679160 :     if (!node->on_work_list() && !node->label()->is_bound()) {
    1003             :       node->set_on_work_list(true);
    1004      232894 :       work_list_->Add(node);
    1005             :     }
    1006      679160 :   }
    1007             : 
    1008             :   static const int kImplementationOffset = 0;
    1009             :   static const int kNumberOfRegistersOffset = 0;
    1010             :   static const int kCodeOffset = 1;
    1011             : 
    1012             :   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
    1013             :   EndNode* accept() { return accept_; }
    1014             : 
    1015             :   static const int kMaxRecursion = 100;
    1016             :   inline int recursion_depth() { return recursion_depth_; }
    1017     1222149 :   inline void IncrementRecursionDepth() { recursion_depth_++; }
    1018     1222149 :   inline void DecrementRecursionDepth() { recursion_depth_--; }
    1019             : 
    1020           0 :   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
    1021             : 
    1022             :   inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
    1023       14753 :   inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
    1024             :   // Both unicode and ignore_case flags are set. We need to use ICU to find
    1025             :   // the closure over case equivalents.
    1026             :   inline bool needs_unicode_case_equivalents() {
    1027      209589 :     return unicode() && ignore_case();
    1028             :   }
    1029             :   inline bool one_byte() { return one_byte_; }
    1030             :   inline bool optimize() { return optimize_; }
    1031       91379 :   inline void set_optimize(bool value) { optimize_ = value; }
    1032             :   inline bool limiting_recursion() { return limiting_recursion_; }
    1033             :   inline void set_limiting_recursion(bool value) {
    1034     1097654 :     limiting_recursion_ = value;
    1035             :   }
    1036             :   bool read_backward() { return read_backward_; }
    1037        4024 :   void set_read_backward(bool value) { read_backward_ = value; }
    1038             :   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
    1039             : 
    1040             :   int current_expansion_factor() { return current_expansion_factor_; }
    1041             :   void set_current_expansion_factor(int value) {
    1042      120061 :     current_expansion_factor_ = value;
    1043             :   }
    1044             : 
    1045             :   Isolate* isolate() const { return isolate_; }
    1046             :   Zone* zone() const { return zone_; }
    1047             : 
    1048             :   static const int kNoRegister = -1;
    1049             : 
    1050             :  private:
    1051             :   EndNode* accept_;
    1052             :   int next_register_;
    1053             :   int unicode_lookaround_stack_register_;
    1054             :   int unicode_lookaround_position_register_;
    1055             :   List<RegExpNode*>* work_list_;
    1056             :   int recursion_depth_;
    1057             :   RegExpMacroAssembler* macro_assembler_;
    1058             :   JSRegExp::Flags flags_;
    1059             :   bool one_byte_;
    1060             :   bool reg_exp_too_big_;
    1061             :   bool limiting_recursion_;
    1062             :   bool optimize_;
    1063             :   bool read_backward_;
    1064             :   int current_expansion_factor_;
    1065             :   FrequencyCollator frequency_collator_;
    1066             :   Isolate* isolate_;
    1067             :   Zone* zone_;
    1068             : };
    1069             : 
    1070             : // Scope guard: increments the compiler's recursion depth on construction and decrements it again on destruction.
    1071             : class RecursionCheck {
    1072             :  public:
    1073             :   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
    1074             :     compiler->IncrementRecursionDepth();
    1075             :   }
    1076             :   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
    1077             :  private:
    1078             :   RegExpCompiler* compiler_;  // Compiler whose depth counter is adjusted.
    1079             : };
    1080             : 
    1081             : // Builds the CompilationResult that carries the "RegExp too big" error message.
    1082             : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
    1083             :   return RegExpEngine::CompilationResult(isolate, "RegExp too big");
    1084             : }
    1085             : 
    1086             : 
    1087             : // Initializes per-compilation state. The first free register index is
    1088             : // 2 * (capture_count + 1); the ACCEPT end node is allocated in |zone|.
    1089       92622 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
    1090             :                                JSRegExp::Flags flags, bool one_byte)
    1091       92622 :     : next_register_(2 * (capture_count + 1)),
    1092             :       unicode_lookaround_stack_register_(kNoRegister),
    1093             :       unicode_lookaround_position_register_(kNoRegister),
    1094             :       work_list_(NULL),
    1095             :       recursion_depth_(0),
    1096             :       flags_(flags),
    1097             :       one_byte_(one_byte),
    1098             :       reg_exp_too_big_(false),
    1099             :       limiting_recursion_(false),
    1100             :       optimize_(FLAG_regexp_optimization),
    1101             :       read_backward_(false),
    1102             :       current_expansion_factor_(1),
    1103             :       frequency_collator_(),
    1104             :       isolate_(isolate),
    1105      185244 :       zone_(zone) {  // macro_assembler_ is not initialized here; Assemble() assigns it.
    1106       92622 :   accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
    1107             :   DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
    1108       92622 : }
    1109             : 
    1110             : // Emits code for the node graph rooted at |start| through |macro_assembler| and returns the generated code together with next_register_, or the "RegExp too big" result when reg_exp_too_big_ was set. |capture_count| is not read in this body.
    1111       92181 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
    1112             :     RegExpMacroAssembler* macro_assembler,
    1113             :     RegExpNode* start,
    1114             :     int capture_count,
    1115             :     Handle<String> pattern) {
    1116             :   Isolate* isolate = pattern->GetHeap()->isolate();
    1117             :   // In DEBUG builds with FLAG_trace_regexp_assembler set, wrap the assembler in a tracer.
    1118             : #ifdef DEBUG
    1119             :   if (FLAG_trace_regexp_assembler)
    1120             :     macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
    1121             :   else
    1122             : #endif
    1123       92181 :     macro_assembler_ = macro_assembler;
    1124             :   // work_list_ points at this stack-local list while code is being emitted.
    1125             :   List <RegExpNode*> work_list(0);
    1126       92181 :   work_list_ = &work_list;
    1127             :   Label fail;
    1128       92181 :   macro_assembler_->PushBacktrack(&fail);  // |fail| is bound below, right before Fail().
    1129       92181 :   Trace new_trace;
    1130       92181 :   start->Emit(this, &new_trace);
    1131       92181 :   macro_assembler_->Bind(&fail);
    1132       92181 :   macro_assembler_->Fail();
    1133      417256 :   while (!work_list.is_empty()) {  // Drain nodes queued on the work list during emission.
    1134             :     RegExpNode* node = work_list.RemoveLast();
    1135             :     node->set_on_work_list(false);
    1136      232894 :     if (!node->label()->is_bound()) node->Emit(this, &new_trace);  // Skip nodes whose label is already bound.
    1137             :   }
    1138       92181 :   if (reg_exp_too_big_) {
    1139           0 :     macro_assembler_->AbortedCodeGeneration();
    1140           0 :     return IrregexpRegExpTooBig(isolate_);
    1141             :   }
    1142             :   // NOTE(review): the early return above skips both `work_list_ = NULL` and the DEBUG-only `delete macro_assembler_` below.
    1143       92181 :   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
    1144       92181 :   isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
    1145       92181 :   work_list_ = NULL;  // The local list is about to go out of scope.
    1146             : #ifdef ENABLE_DISASSEMBLER
    1147             :   if (FLAG_print_code) {
    1148             :     CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
    1149             :     OFStream os(trace_scope.file());
    1150             :     Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
    1151             :   }
    1152             : #endif
    1153             : #ifdef DEBUG
    1154             :   if (FLAG_trace_regexp_assembler) {
    1155             :     delete macro_assembler_;  // Frees the tracer allocated at the top of this function.
    1156             :   }
    1157             : #endif
    1158       92181 :   return RegExpEngine::CompilationResult(*code, next_register_);
    1159             : }
    1160             : 
    1161             : // Returns true if this action touches register |that|: CLEAR_CAPTURES actions are tested against their register interval, every other action against its single register.
    1162     7083898 : bool Trace::DeferredAction::Mentions(int that) {
    1163     3580420 :   if (action_type() == ActionNode::CLEAR_CAPTURES) {
    1164             :     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
    1165             :     return range.Contains(that);
    1166             :   } else {
    1167     3503478 :     return reg() == that;
    1168             :   }
    1169             : }
    1170             : 
    1171             : // Returns true if any deferred action in this trace's action list mentions |reg|.
    1172           0 : bool Trace::mentions_reg(int reg) {
    1173           0 :   for (DeferredAction* action = actions_;
    1174             :        action != NULL;
    1175             :        action = action->next()) {
    1176           0 :     if (action->Mentions(reg))
    1177             :       return true;
    1178             :   }
    1179             :   return false;
    1180             : }
    1181             : 
    1182             : // Looks at the first action in the list that mentions |reg|: if it is a STORE_POSITION, writes its cp_offset to |*cp_offset| and returns true; otherwise (or if none mentions |reg|) returns false.
    1183        1147 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
    1184             :   DCHECK_EQ(0, *cp_offset);  // Callers are expected to pass in a zeroed offset.
    1185        2263 :   for (DeferredAction* action = actions_;
    1186             :        action != NULL;
    1187             :        action = action->next()) {
    1188        1116 :     if (action->Mentions(reg)) {
    1189         494 :       if (action->action_type() == ActionNode::STORE_POSITION) {
    1190         494 :         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
    1191         494 :         return true;
    1192             :       } else {
    1193             :         return false;  // First mention is some other kind of action; stop searching.
    1194             :       }
    1195             :     }
    1196             :   }
    1197             :   return false;
    1198             : }
    1199             : 
    1200             : // Adds every register touched by a deferred action to |affected_registers| and returns the highest such register, or RegExpCompiler::kNoRegister when there are no actions.
    1201      595306 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
    1202             :                                  Zone* zone) {
    1203             :   int max_register = RegExpCompiler::kNoRegister;
    1204     2196117 :   for (DeferredAction* action = actions_;
    1205             :        action != NULL;
    1206             :        action = action->next()) {
    1207      537517 :     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
    1208             :       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
    1209       77690 :       for (int i = range.from(); i <= range.to(); i++)  // Interval bounds are inclusive here.
    1210       71820 :         affected_registers->Set(i, zone);
    1211        5870 :       if (range.to() > max_register) max_register = range.to();
    1212             :     } else {
    1213      531647 :       affected_registers->Set(action->reg(), zone);
    1214      531647 :       if (action->reg() > max_register) max_register = action->reg();
    1215             :     }
    1216             :   }
    1217      595306 :   return max_register;
    1218             : }
    1219             : 
    1220             : // Emits the undo code for registers 0..max_register, walking downwards: registers in |registers_to_pop| are popped, registers in |registers_to_clear| are cleared.
    1221      595306 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
    1222             :                                      int max_register,
    1223             :                                      const OutSet& registers_to_pop,
    1224             :                                      const OutSet& registers_to_clear) {
    1225    14804544 :   for (int reg = max_register; reg >= 0; reg--) {  // Descending: the reverse of the ascending order used by PerformDeferredActions.
    1226    14209238 :     if (registers_to_pop.Get(reg)) {
    1227       86082 :       assembler->PopRegister(reg);
    1228    14123156 :     } else if (registers_to_clear.Get(reg)) {
    1229             :       int clear_to = reg;
    1230      272634 :       while (reg > 0 && registers_to_clear.Get(reg - 1)) {  // Extend downwards over a contiguous run of registers to clear.
    1231      158448 :         reg--;
    1232             :       }
    1233      114186 :       assembler->ClearRegisters(reg, clear_to);  // One call covers the whole run [reg, clear_to].
    1234             :     }
    1235             :   }
    1236      595306 : }
    1237             : 
    1238             : // For each register in |affected_registers| (ascending up to |max_register|), folds the trace's deferred actions into a single emitted operation and records in |registers_to_pop| / |registers_to_clear| how that register must be undone on backtrack.
    1239      595306 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
    1240             :                                    int max_register,
    1241             :                                    const OutSet& affected_registers,
    1242             :                                    OutSet* registers_to_pop,
    1243             :                                    OutSet* registers_to_clear,
    1244             :                                    Zone* zone) {
    1245             :   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
    1246      595306 :   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
    1247             : 
    1248             :   // Count pushes performed to force a stack limit check occasionally.
    1249             :   int pushes = 0;
    1250             : 
    1251    14962992 :   for (int reg = 0; reg <= max_register; reg++) {
    1252    14367686 :     if (!affected_registers.Get(reg)) {
    1253             :       continue;
    1254             :     }
    1255             : 
    1256             :     // The chronologically first deferred action in the trace
    1257             :     // is used to infer the action needed to restore a register
    1258             :     // to its previous state (or not, if it's safe to ignore it).
    1259             :     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
    1260             :     DeferredActionUndoType undo_action = IGNORE;
    1261             : 
    1262             :     int value = 0;  // Accumulated increment, or the absolute value once |absolute| is set.
    1263             :     bool absolute = false;  // Set once a SET_REGISTER has been seen.
    1264             :     bool clear = false;  // Set when a CLEAR_CAPTURES is seen before any store position was recorded.
    1265             :     static const int kNoStore = kMinInt;  // Sentinel: no store position recorded yet.
    1266             :     int store_position = kNoStore;
    1267             :     // This is a little tricky because we are scanning the actions in reverse
    1268             :     // historical order (newest first).
    1269     4169636 :     for (DeferredAction* action = actions_;
    1270             :          action != NULL;
    1271             :          action = action->next()) {
    1272     3579304 :       if (action->Mentions(reg)) {
    1273      603467 :         switch (action->action_type()) {
    1274             :           case ActionNode::SET_REGISTER: {
    1275        6226 :             Trace::DeferredSetRegister* psr =
    1276             :                 static_cast<Trace::DeferredSetRegister*>(action);
    1277        6226 :             if (!absolute) {
    1278        6226 :               value += psr->value();
    1279             :               absolute = true;
    1280             :             }
    1281             :             // SET_REGISTER is currently only used for newly introduced loop
    1282             :             // counters. They can have a significant previous value if they
    1283             :             // occur in a loop. TODO(lrn): Propagate this information, so
    1284             :             // we can set undo_action to IGNORE if we know there is no value to
    1285             :             // restore.
    1286             :             undo_action = RESTORE;
    1287             :             DCHECK_EQ(store_position, kNoStore);
    1288             :             DCHECK(!clear);
    1289             :             break;
    1290             :           }
    1291             :           case ActionNode::INCREMENT_REGISTER:
    1292        7305 :             if (!absolute) {
    1293        7305 :               value++;
    1294             :             }
    1295             :             DCHECK_EQ(store_position, kNoStore);
    1296             :             DCHECK(!clear);
    1297             :             undo_action = RESTORE;
    1298             :             break;
    1299             :           case ActionNode::STORE_POSITION: {
    1300      804616 :             Trace::DeferredCapture* pc =
    1301             :                 static_cast<Trace::DeferredCapture*>(action);
    1302      518116 :             if (!clear && store_position == kNoStore) {
    1303             :               store_position = pc->cp_offset();
    1304             :             }
    1305             : 
    1306             :             // For captures we know that stores and clears alternate.
    1307             :             // Other registers are never cleared, and if they occur
    1308             :             // inside a loop, they might be assigned more than once.
    1309      518116 :             if (reg <= 1) {
    1310             :               // Registers zero and one, aka "capture zero", is
    1311             :               // always set correctly if we succeed. There is no
    1312             :               // need to undo a setting on backtrack, because we
    1313             :               // will set it again or fail.
    1314             :               undo_action = IGNORE;
    1315             :             } else {
    1316      286500 :               undo_action = pc->is_capture() ? CLEAR : RESTORE;
    1317             :             }
    1318             :             DCHECK(!absolute);
    1319             :             DCHECK_EQ(value, 0);
    1320             :             break;
    1321             :           }
    1322             :           case ActionNode::CLEAR_CAPTURES: {
    1323             :             // Since we're scanning in reverse order, if we've already
    1324             :             // set the position we have to ignore historically earlier
    1325             :             // clearing operations.
    1326       71820 :             if (store_position == kNoStore) {
    1327             :               clear = true;
    1328             :             }
    1329             :             undo_action = RESTORE;
    1330             :             DCHECK(!absolute);
    1331             :             DCHECK_EQ(value, 0);
    1332             :             break;
    1333             :           }
    1334             :           default:
    1335           0 :             UNREACHABLE();
    1336             :             break;
    1337             :         }
    1338             :       }
    1339             :     }
    1340             :     // Prepare for the undo-action (e.g., push if it's going to be popped).
    1341      590332 :     if (undo_action == RESTORE) {
    1342       86082 :       pushes++;
    1343             :       RegExpMacroAssembler::StackCheckFlag stack_check =
    1344             :           RegExpMacroAssembler::kNoStackLimitCheck;
    1345       86082 :       if (pushes == push_limit) {  // Every push_limit-th push requests a stack limit check.
    1346             :         stack_check = RegExpMacroAssembler::kCheckStackLimit;
    1347             :         pushes = 0;
    1348             :       }
    1349             : 
    1350       86082 :       assembler->PushRegister(reg, stack_check);
    1351       86082 :       registers_to_pop->Set(reg, zone);
    1352      504250 :     } else if (undo_action == CLEAR) {
    1353      272634 :       registers_to_clear->Set(reg, zone);
    1354             :     }
    1355             :     // Perform the chronologically last action (or accumulated increment)
    1356             :     // for the register.
    1357      590332 :     if (store_position != kNoStore) {
    1358      518116 :       assembler->WriteCurrentPositionToRegister(reg, store_position);
    1359       72216 :     } else if (clear) {
    1360       58685 :       assembler->ClearRegisters(reg, reg);
    1361       13531 :     } else if (absolute) {
    1362        6226 :       assembler->SetRegister(reg, value);
    1363        7305 :     } else if (value != 0) {
    1364        7305 :       assembler->AdvanceRegister(reg, value);
    1365             :     }
    1366             :   }
    1367      595306 : }
    1368             : 
    1369             : // Emits this trace's deferred work, then |successor| under a fresh trivial trace, followed by the undo code that runs on backtrack.
    1370             : // This is called as we come into a loop choice node and some other tricky
    1371             : // nodes.  It normalizes the state of the code generator to ensure we can
    1372             : // generate generic code.
    1373     4193998 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
    1374             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    1375             : 
    1376             :   DCHECK(!is_trivial());
    1377             : 
    1378     1338330 :   if (actions_ == NULL && backtrack() == NULL) {
    1379             :     // Here we just have some deferred cp advances to fix and we are back to
    1380             :     // a normal situation.  We may also have to forget some information gained
    1381             :     // through a quick check that was already performed.
    1382      209740 :     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
    1383             :     // Create a new trivial state and generate the node with that.
    1384      209740 :     Trace new_state;
    1385      209740 :     successor->Emit(compiler, &new_state);
    1386      805046 :     return;
    1387             :   }
    1388             : 
    1389             :   // Generate deferred actions here along with code to undo them again.
    1390             :   OutSet affected_registers;
    1391             : 
    1392      595306 :   if (backtrack() != NULL) {
    1393             :     // Here we have a concrete backtrack location.  These are set up by choice
    1394             :     // nodes and so they indicate that we have a deferred save of the current
    1395             :     // position which we may need to emit here.
    1396      474444 :     assembler->PushCurrentPosition();
    1397             :   }
    1398             : 
    1399             :   int max_register = FindAffectedRegisters(&affected_registers,
    1400      595306 :                                            compiler->zone());
    1401             :   OutSet registers_to_pop;    // Filled by PerformDeferredActions; consumed by RestoreAffectedRegisters.
    1402             :   OutSet registers_to_clear;  // Likewise.
    1403             :   PerformDeferredActions(assembler,
    1404             :                          max_register,
    1405             :                          affected_registers,
    1406             :                          &registers_to_pop,
    1407             :                          &registers_to_clear,
    1408      595306 :                          compiler->zone());
    1409      595306 :   if (cp_offset_ != 0) {
    1410      352299 :     assembler->AdvanceCurrentPosition(cp_offset_);
    1411             :   }
    1412             : 
    1413             :   // Create a new trivial state and generate the node with that.
    1414             :   Label undo;
    1415      595306 :   assembler->PushBacktrack(&undo);
    1416      595306 :   if (successor->KeepRecursing(compiler)) {
    1417      165861 :     Trace new_state;
    1418      165861 :     successor->Emit(compiler, &new_state);
    1419             :   } else {
    1420             :     // Otherwise queue the successor on the compiler's work list and jump to its label.
    1421      429445 :     compiler->AddWork(successor);
    1422      429445 :     assembler->GoTo(successor->label());
    1423             :   }
    1424             : 
    1425             :   // On backtrack we need to restore state.
    1426      595306 :   assembler->Bind(&undo);
    1427             :   RestoreAffectedRegisters(assembler,
    1428             :                            max_register,
    1429             :                            registers_to_pop,
    1430      595306 :                            registers_to_clear);
    1431      595306 :   if (backtrack() == NULL) {
    1432      120862 :     assembler->Backtrack();
    1433             :   } else {
    1434             :     // Undo the PushCurrentPosition emitted above before jumping to the backtrack label.
    1435      474444 :     assembler->PopCurrentPosition();
    1436      948888 :     assembler->GoTo(backtrack());
    1437             :   }
    1438             : }
    1437             : 
    1438             : // Emits the exit taken when the body of a negative submatch succeeds: restores position and stack pointer from their registers, clears the recorded captures, then backtracks. |trace| is not read.
    1439        3550 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
    1440             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    1441             : 
    1442             :   // Omit flushing the trace. We discard the entire stack frame anyway.
    1443             : 
    1444        3550 :   if (!label()->is_bound()) {
    1445             :     // We are completely independent of the trace, since we ignore it,
    1446             :     // so this code can be used as the generic version.
    1447        3501 :     assembler->Bind(label());
    1448             :   }
    1449             : 
    1450             :   // Throw away everything on the backtrack stack since the start
    1451             :   // of the negative submatch and restore the character position.
    1452        3550 :   assembler->ReadCurrentPositionFromRegister(current_position_register_);
    1453        3550 :   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
    1454        3550 :   if (clear_capture_count_ > 0) {
    1455             :     // Clear any captures that might have been performed during the success
    1456             :     // of the body of the negative look-ahead.
    1457         136 :     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;  // Inclusive upper bound.
    1458         136 :     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
    1459             :   }
    1460             :   // Now that we have unwound the stack we find at the top of the stack the
    1461             :   // backtrack that the BeginSubmatch node got.
    1462        3550 :   assembler->Backtrack();
    1463        3550 : }
    1464             : 
    1465             : // Emits the end node: a non-trivial trace is flushed first; otherwise binds the label if needed and emits Succeed() for ACCEPT or a jump to the trace's backtrack label for BACKTRACK.
    1466      310962 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    1467      206952 :   if (!trace->is_trivial()) {
    1468      103298 :     trace->Flush(compiler, this);  // Flush emits this node again with a trivial trace (directly or via the work list).
    1469      103298 :     return;
    1470             :   }
    1471             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    1472      103654 :   if (!label()->is_bound()) {
    1473       92168 :     assembler->Bind(label());
    1474             :   }
    1475      103654 :   switch (action_) {
    1476             :     case ACCEPT:
    1477      103298 :       assembler->Succeed();
    1478      103298 :       return;
    1479             :     case BACKTRACK:
    1480         712 :       assembler->GoTo(trace->backtrack());
    1481         356 :       return;
    1482             :     case NEGATIVE_SUBMATCH_SUCCESS:
    1483             :       // This case is handled in a different virtual method.
    1484           0 :       UNREACHABLE();
    1485             :   }
    1486           0 :   UNIMPLEMENTED();  // Reached only if action_ matched none of the cases above.
    1487             : }
    1488             : 
    1489             : // Appends |guard| to this alternative's guard list, creating the zone-allocated list on first use.
    1490     1507750 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
    1491     1507750 :   if (guards_ == NULL)
    1492     1507750 :     guards_ = new(zone) ZoneList<Guard*>(1, zone);
    1493     1507750 :   guards_->Add(guard, zone);
    1494     1507750 : }
    1495             : 
    1496             : // Builds a SET_REGISTER ActionNode in |on_success|'s zone, recording |reg| and |val|.
    1497     1506002 : ActionNode* ActionNode::SetRegister(int reg,
    1498             :                                     int val,
    1499     1506002 :                                     RegExpNode* on_success) {
    1500             :   ActionNode* result =
    1501             :       new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
    1502     1506002 :   result->data_.u_store_register.reg = reg;
    1503     1506002 :   result->data_.u_store_register.value = val;
    1504     1506002 :   return result;
    1505             : }
    1506             : 
    1507             : // Builds an INCREMENT_REGISTER ActionNode in |on_success|'s zone, recording |reg|.
    1508     1506002 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
    1509             :   ActionNode* result =
    1510             :       new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
    1511     1506002 :   result->data_.u_increment_register.reg = reg;
    1512     1506002 :   return result;
    1513             : }
    1514             : 
    1515             : // Builds a STORE_POSITION ActionNode in |on_success|'s zone, recording |reg| and whether it is a capture register.
    1516      271147 : ActionNode* ActionNode::StorePosition(int reg,
    1517             :                                       bool is_capture,
    1518      271147 :                                       RegExpNode* on_success) {
    1519             :   ActionNode* result =
    1520             :       new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
    1521      271147 :   result->data_.u_position_register.reg = reg;
    1522      271147 :   result->data_.u_position_register.is_capture = is_capture;
    1523      271147 :   return result;
    1524             : }
    1525             : 
    1526             : // Builds a CLEAR_CAPTURES ActionNode in |on_success|'s zone, recording the bounds of |range|.
    1527        4107 : ActionNode* ActionNode::ClearCaptures(Interval range,
    1528        4107 :                                       RegExpNode* on_success) {
    1529             :   ActionNode* result =
    1530             :       new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
    1531        4107 :   result->data_.u_clear_captures.range_from = range.from();
    1532        4107 :   result->data_.u_clear_captures.range_to = range.to();
    1533        4107 :   return result;
    1534             : }
    1535             : 
    1536             : // Builds a BEGIN_SUBMATCH ActionNode in |on_success|'s zone, recording the stack-pointer and current-position registers.
    1537        5352 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
    1538             :                                       int position_reg,
    1539        5352 :                                       RegExpNode* on_success) {
    1540             :   ActionNode* result =
    1541             :       new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
    1542        5352 :   result->data_.u_submatch.stack_pointer_register = stack_reg;
    1543        5352 :   result->data_.u_submatch.current_position_register = position_reg;
    1544        5352 :   return result;
    1545             : }
    1546             : 
    1547             : // Builds a POSITIVE_SUBMATCH_SUCCESS ActionNode in |on_success|'s zone, recording the submatch registers and the register range to clear (count and start).
    1548        1837 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
    1549             :                                                 int position_reg,
    1550             :                                                 int clear_register_count,
    1551             :                                                 int clear_register_from,
    1552        1837 :                                                 RegExpNode* on_success) {
    1553             :   ActionNode* result =
    1554             :       new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
    1555        1837 :   result->data_.u_submatch.stack_pointer_register = stack_reg;
    1556        1837 :   result->data_.u_submatch.current_position_register = position_reg;
    1557        1837 :   result->data_.u_submatch.clear_register_count = clear_register_count;
    1558        1837 :   result->data_.u_submatch.clear_register_from = clear_register_from;
    1559        1837 :   return result;
    1560             : }
    1561             : 
    1562             : // Builds an EMPTY_MATCH_CHECK ActionNode in |on_success|'s zone, recording the start register, repetition register and repetition limit.
    1563         675 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
    1564             :                                         int repetition_register,
    1565             :                                         int repetition_limit,
    1566         675 :                                         RegExpNode* on_success) {
    1567             :   ActionNode* result =
    1568             :       new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
    1569         675 :   result->data_.u_empty_match_check.start_register = start_register;
    1570         675 :   result->data_.u_empty_match_check.repetition_register = repetition_register;
    1571         675 :   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
    1572         675 :   return result;
    1573             : }
    1574             : 
    1575             : // Defines Type##Node::Accept for every type listed by FOR_EACH_NODE_TYPE; each forwards to the visitor's matching Visit##Type method.
    1576             : #define DEFINE_ACCEPT(Type)                                          \
    1577             :   void Type##Node::Accept(NodeVisitor* visitor) {                    \
    1578             :     visitor->Visit##Type(this);                                      \
    1579             :   }
    1580      900514 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
    1581             : #undef DEFINE_ACCEPT
    1582             : 
    1583             : // Hand-written Accept for LoopChoiceNode, dispatching to VisitLoopChoice (presumably because the type is not listed in FOR_EACH_NODE_TYPE - confirm).
    1584      216296 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
    1585      216296 :   visitor->VisitLoopChoice(this);
    1586      216296 : }
    1587             : 
    1588             : 
    1589             : // -------------------------------------------------------------------
    1590             : // Emit code.
    1591             : 
    1592             : // Emits the check for |guard|: jumps to trace->backtrack() when the guard fails, i.e. register >= value for an LT guard and register < value for a GEQ guard.
    1593        7750 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
    1594       15500 :                                Guard* guard,
    1595        7750 :                                Trace* trace) {
    1596        7750 :   switch (guard->op()) {
    1597             :     case Guard::LT:
    1598             :       DCHECK(!trace->mentions_reg(guard->reg()));  // The guarded register must have no pending deferred action.
    1599             :       macro_assembler->IfRegisterGE(guard->reg(),
    1600             :                                     guard->value(),
    1601        9166 :                                     trace->backtrack());
    1602        4583 :       break;
    1603             :     case Guard::GEQ:
    1604             :       DCHECK(!trace->mentions_reg(guard->reg()));  // Same precondition as the LT case.
    1605             :       macro_assembler->IfRegisterLT(guard->reg(),
    1606             :                                     guard->value(),
    1607        6334 :                                     trace->backtrack());
    1608        3167 :       break;
    1609             :   }
    1610        7750 : }
    1611             : 
    1612             : // Stores the case-equivalents of |character|, as reported by the isolate's
    1613             : // jsregexp_uncanonicalize() mapping, in |letters| and returns their count,
    1614             : // omitting those that cannot occur in the source string because it is Latin1.
    1615       60974 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
    1616             :                                      bool one_byte_subject,
    1617             :                                      unibrow::uchar* letters) {
    1618             :   int length =
    1619       60974 :       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
    1620             :   // Unibrow returns 0 or 1 for characters where case independence is
    1621             :   // trivial.
    1622       60974 :   if (length == 0) {  // Nothing reported: the class is |character| alone.
    1623        7751 :     letters[0] = character;
    1624             :     length = 1;
    1625             :   }
    1626             : 
    1627       60974 :   if (one_byte_subject) {  // Compact in place, keeping only one-byte code points; may leave 0.
    1628             :     int new_length = 0;
    1629      103107 :     for (int i = 0; i < length; i++) {
    1630      103107 :       if (letters[i] <= String::kMaxOneByteCharCode) {
    1631      102727 :         letters[new_length++] = letters[i];
    1632             :       }
    1633             :     }
    1634             :     length = new_length;
    1635             :   }
    1636             : 
    1637       60974 :   return length;
    1638             : }
    1639             : 
    1640             : // Emits an exact test for |c|, loading the character at |cp_offset| first unless |preloaded|; returns whether a load was emitted.
    1641      620149 : static inline bool EmitSimpleCharacter(Isolate* isolate,
    1642      620149 :                                        RegExpCompiler* compiler,
    1643             :                                        uc16 c,
    1644             :                                        Label* on_failure,
    1645             :                                        int cp_offset,
    1646             :                                        bool check,
    1647             :                                        bool preloaded) {
    1648             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    1649             :   bool bound_checked = false;
    1650      620149 :   if (!preloaded) {
    1651             :     assembler->LoadCurrentCharacter(
    1652             :         cp_offset,
    1653             :         on_failure,
    1654      620149 :         check);
    1655             :     bound_checked = true;  // Reported even when |check| is false.
    1656             :   }
    1657      620149 :   assembler->CheckNotCharacter(c, on_failure);  // Mismatch goes to |on_failure|.
    1658      620149 :   return bound_checked;
    1659             : }
    1660             : 
    1661             : // Emits a test for |c| only when its case-equivalence class has exactly one
    1662             : // member, i.e. for non-letters (things that don't have case).  Only used for
    1663             : // case independent matches.  Returns |check| if it loaded the character.
    1664       14932 : static inline bool EmitAtomNonLetter(Isolate* isolate,
    1665       14932 :                                      RegExpCompiler* compiler,
    1666             :                                      uc16 c,
    1667             :                                      Label* on_failure,
    1668             :                                      int cp_offset,
    1669             :                                      bool check,
    1670             :                                      bool preloaded) {
    1671             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    1672             :   bool one_byte = compiler->one_byte();
    1673             :   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    1674       14932 :   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
    1675       14932 :   if (length < 1) {
    1676             :     // This can't match.  Must be a one-byte subject and a non-one-byte
    1677             :     // character.  We do not need to do anything since the one-byte pass
    1678             :     // already handled this.
    1679             :     return false;  // Bounds not checked.
    1680             :   }
    1681             :   bool checked = false;
    1682             :   // We handle the length > 1 case in a later pass.
    1683       14925 :   if (length == 1) {
    1684        1543 :     if (one_byte && c > String::kMaxOneByteCharCodeU) {
    1685             :       // Can't match - see above.
    1686             :       return false;  // Bounds not checked.
    1687             :     }
    1688        1543 :     if (!preloaded) {
    1689        1543 :       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
    1690             :       checked = check;  // Only counts as bounds-checked if the load was asked to check.
    1691             :     }
    1692        1543 :     macro_assembler->CheckNotCharacter(c, on_failure);
    1693             :   }
    1694       14925 :   return checked;
    1695             : }
    1696             : 
    1697             : // Tries to test for "c1 or c2" with one masked comparison; returns false, emitting nothing, if neither trick below applies.
    1698       12992 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
    1699             :                                       bool one_byte, uc16 c1, uc16 c2,
    1700             :                                       Label* on_failure) {
    1701             :   uc16 char_mask;
    1702       12992 :   if (one_byte) {
    1703             :     char_mask = String::kMaxOneByteCharCode;
    1704             :   } else {
    1705             :     char_mask = String::kMaxUtf16CodeUnit;
    1706             :   }
    1707       12992 :   uc16 exor = c1 ^ c2;
    1708             :   // Check whether exor has only one bit set.
    1709       12992 :   if (((exor - 1) & exor) == 0) {
    1710             :     // c1 and c2 differ only by one bit; that bit is cleared in the mask.
    1711             :     // Ecma262UnCanonicalize always gives the highest number last.
    1712             :     DCHECK(c2 > c1);
    1713       12874 :     uc16 mask = char_mask ^ exor;
    1714       12874 :     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
    1715       12874 :     return true;
    1716             :   }
    1717             :   DCHECK(c2 > c1);
    1718         118 :   uc16 diff = c2 - c1;
    1719         118 :   if (((diff - 1) & diff) == 0 && c1 >= diff) {
    1720             :     // If the characters differ by 2^n but don't differ by one bit then
    1721             :     // subtract the difference from the found character, then do the or
    1722             :     // trick.  We avoid the theoretical case where negative numbers are
    1723             :     // involved in order to simplify code generation.
    1724         104 :     uc16 mask = char_mask ^ diff;
    1725             :     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
    1726             :                                                     diff,
    1727             :                                                     mask,
    1728         104 :                                                     on_failure);
    1729         104 :     return true;
    1730             :   }
    1731             :   return false;  // Caller must test both characters explicitly.
    1732             : }
    1733             : 
    1734             : // Signature shared by EmitSimpleCharacter, EmitAtomNonLetter and EmitAtomLetter.
    1735             : typedef bool EmitCharacterFunction(Isolate* isolate,
    1736             :                                    RegExpCompiler* compiler,
    1737             :                                    uc16 c,
    1738             :                                    Label* on_failure,
    1739             :                                    int cp_offset,
    1740             :                                    bool check,
    1741             :                                    bool preloaded);
    1742             : 
    1743             : // Only emits letters (things that have case), i.e. characters whose case
    1744             : // equivalence class has 2-4 members.  Only used for case independent matches.
    1745       14932 : static inline bool EmitAtomLetter(Isolate* isolate,
    1746       14932 :                                   RegExpCompiler* compiler,
    1747             :                                   uc16 c,
    1748             :                                   Label* on_failure,
    1749             :                                   int cp_offset,
    1750             :                                   bool check,
    1751             :                                   bool preloaded) {
    1752             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    1753             :   bool one_byte = compiler->one_byte();
    1754             :   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    1755       14932 :   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
    1756       14932 :   if (length <= 1) return false;  // Not a letter: nothing is emitted here.
    1757             :   // We may not need to check against the end of the input string
    1758             :   // if this character lies before a character that matched.
    1759       13382 :   if (!preloaded) {
    1760       12989 :     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
    1761             :   }
    1762             :   Label ok;
    1763             :   DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
    1764       13382 :   switch (length) {
    1765             :     case 2: {
    1766       25984 :       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
    1767       25984 :                                     chars[1], on_failure)) {
    1768             :       } else {  // No single-compare shortcut: test both characters explicitly.
    1769          14 :         macro_assembler->CheckCharacter(chars[0], &ok);
    1770          14 :         macro_assembler->CheckNotCharacter(chars[1], on_failure);
    1771          14 :         macro_assembler->Bind(&ok);
    1772             :       }
    1773             :       break;
    1774             :     }
    1775             :     case 4:
    1776          30 :       macro_assembler->CheckCharacter(chars[3], &ok);
    1777             :       // Fall through: the remaining three characters are tested by case 3.
    1778             :     case 3:
    1779         390 :       macro_assembler->CheckCharacter(chars[0], &ok);
    1780         390 :       macro_assembler->CheckCharacter(chars[1], &ok);
    1781         390 :       macro_assembler->CheckNotCharacter(chars[2], on_failure);
    1782         390 :       macro_assembler->Bind(&ok);
    1783         390 :       break;
    1784             :     default:
    1785           0 :       UNREACHABLE();
    1786             :       break;
    1787             :   }
    1788             :   return true;
    1789             : }
    1790             : 
    1791             : // Splits on |border|: characters < border go to |below|, the rest to |above_or_equal|; no jump is emitted to |fall_through|.
    1792        9788 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
    1793             :                              int border,
    1794             :                              Label* fall_through,
    1795             :                              Label* above_or_equal,
    1796             :                              Label* below) {
    1797        9788 :   if (below != fall_through) {
    1798        9377 :     masm->CheckCharacterLT(border, below);
    1799        9377 :     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
    1800             :   } else {  // |below| is the fall-through, so only the upper side needs a branch.
    1801         411 :     masm->CheckCharacterGT(border - 1, above_or_equal);
    1802             :   }
    1803        9788 : }
    1804             : 
    1805             : // Tests the current character against [first, last] (inclusive), choosing the branch form by which label is |fall_through|.
    1806      178894 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
    1807             :                                    int first,
    1808             :                                    int last,
    1809             :                                    Label* fall_through,
    1810             :                                    Label* in_range,
    1811             :                                    Label* out_of_range) {
    1812      178894 :   if (in_range == fall_through) {  // Branch only on the out-of-range side.
    1813      123270 :     if (first == last) {
    1814       15327 :       masm->CheckNotCharacter(first, out_of_range);
    1815             :     } else {
    1816      107943 :       masm->CheckCharacterNotInRange(first, last, out_of_range);
    1817             :     }
    1818             :   } else {  // Branch on the in-range side, then jump out unless that is the fall-through.
    1819       55624 :     if (first == last) {
    1820       28254 :       masm->CheckCharacter(first, in_range);
    1821             :     } else {
    1822       27370 :       masm->CheckCharacterInRange(first, last, in_range);
    1823             :     }
    1824       55624 :     if (out_of_range != fall_through) masm->GoTo(out_of_range);
    1825             :   }
    1826      178894 : }
    1827             : 
    1828             : // Builds a kTableSize-entry 0/1 byte table from ranges[start_index..end_index] and emits a CheckBitInTable on it.
    1829             : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
    1830             : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
    1831        6432 : static void EmitUseLookupTable(
    1832        6432 :     RegExpMacroAssembler* masm,
    1833             :     ZoneList<int>* ranges,
    1834             :     int start_index,
    1835             :     int end_index,
    1836             :     int min_char,
    1837             :     Label* fall_through,
    1838             :     Label* even_label,
    1839             :     Label* odd_label) {
    1840             :   static const int kSize = RegExpMacroAssembler::kTableSize;
    1841             :   static const int kMask = RegExpMacroAssembler::kTableMask;
    1842             : 
    1843             :   int base = (min_char & ~kMask);
    1844             :   USE(base);  // |base| is otherwise only referenced inside DCHECKs.
    1845             : 
    1846             :   // Assert that everything is on one kTableSize page.
    1847             :   for (int i = start_index; i <= end_index; i++) {
    1848             :     DCHECK_EQ(ranges->at(i) & ~kMask, base);
    1849             :   }
    1850             :   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
    1851             : 
    1852             :   char templ[kSize];  // One 0/1 entry per value of (character & kMask).
    1853             :   Label* on_bit_set;
    1854             :   Label* on_bit_clear;
    1855             :   int bit;
    1856        6432 :   if (even_label == fall_through) {  // Pick the polarity so the bit-set target is not the fall-through when possible.
    1857             :     on_bit_set = odd_label;
    1858             :     on_bit_clear = even_label;
    1859             :     bit = 1;
    1860             :   } else {
    1861             :     on_bit_set = even_label;
    1862             :     on_bit_clear = odd_label;
    1863             :     bit = 0;
    1864             :   }
    1865      262224 :   for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {  // Entries below the first boundary.
    1866      127896 :     templ[i] = bit;
    1867             :   }
    1868             :   int j = 0;
    1869        6432 :   bit ^= 1;
    1870      108651 :   for (int i = start_index; i < end_index; i++) {  // One interval per step, alternating the bit.
    1871     1338774 :     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
    1872      567168 :       templ[j] = bit;
    1873             :     }
    1874      102219 :     bit ^= 1;
    1875             :   }
    1876      128232 :   for (int i = j; i < kSize; i++) {  // Remaining entries up to the end of the table.
    1877      128232 :     templ[i] = bit;
    1878             :   }
    1879             :   Factory* factory = masm->isolate()->factory();
    1880             :   // TODO(erikcorry): Cache these.
    1881        6432 :   Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
    1882      823296 :   for (int i = 0; i < kSize; i++) {
    1883      823296 :     ba->set(i, templ[i]);
    1884             :   }
    1885        6432 :   masm->CheckBitInTable(ba, on_bit_set);
    1886        6432 :   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
    1887        6432 : }
    1888             : 
    1889             : // Emits a direct test for the interval starting at ranges[cut_index], then rewrites |ranges| so that interval is gone.
    1890       39249 : static void CutOutRange(RegExpMacroAssembler* masm,
    1891             :                         ZoneList<int>* ranges,
    1892             :                         int start_index,
    1893             :                         int end_index,
    1894             :                         int cut_index,
    1895             :                         Label* even_label,
    1896             :                         Label* odd_label) {
    1897       39249 :   bool odd = (((cut_index - start_index) & 1) == 1);
    1898       39249 :   Label* in_range_label = odd ? odd_label : even_label;
    1899             :   Label dummy;  // Stands in for both the fall-through and the out-of-range label.
    1900             :   EmitDoubleBoundaryTest(masm,
    1901             :                          ranges->at(cut_index),
    1902       39249 :                          ranges->at(cut_index + 1) - 1,
    1903             :                          &dummy,
    1904             :                          in_range_label,
    1905       78498 :                          &dummy);
    1906             :   DCHECK(!dummy.is_linked());
    1907             :   // Cut out the single range by rewriting the array.  This creates a new
    1908             :   // range that is a merger of the two ranges on either side of the one we
    1909             :   // are cutting out.  The oddity of the labels is preserved.
    1910       99371 :   for (int j = cut_index; j > start_index; j--) {  // Shift the lower boundaries up by one slot.
    1911       41746 :     ranges->at(j) = ranges->at(j - 1);
    1912             :   }
    1913      117857 :   for (int j = cut_index + 1; j < end_index; j++) {  // Shift the upper boundaries down by one slot.
    1914      157216 :     ranges->at(j) = ranges->at(j + 1);
    1915             :   }
    1916       39249 : }
    1917             : 
    1918             : // Chooses a |border| for GenerateBranches and reports the edge indices on either side via the three out-parameters.
    1919             : // Unicode case.  Split the search space into kSize spaces that are handled
    1920             : // with recursion.
    1921       21679 : static void SplitSearchSpace(ZoneList<int>* ranges,
    1922             :                              int start_index,
    1923             :                              int end_index,
    1924             :                              int* new_start_index,
    1925             :                              int* new_end_index,
    1926             :                              int* border) {
    1927             :   static const int kSize = RegExpMacroAssembler::kTableSize;
    1928             :   static const int kMask = RegExpMacroAssembler::kTableMask;
    1929             : 
    1930       21679 :   int first = ranges->at(start_index);
    1931       21679 :   int last = ranges->at(end_index) - 1;
    1932             : 
    1933       21679 :   *new_start_index = start_index;
    1934       21679 :   *border = (ranges->at(start_index) & ~kMask) + kSize;  // Start of the next kSize page.
    1935      190400 :   while (*new_start_index < end_index) {
    1936      167441 :     if (ranges->at(*new_start_index) > *border) break;
    1937      147042 :     (*new_start_index)++;
    1938             :   }
    1939             :   // new_start_index is the index of the first edge that is beyond the
    1940             :   // current kSize space.
    1941             : 
    1942             :   // For very large search spaces we do a binary chop search of the non-Latin1
    1943             :   // space instead of just going to the end of the current kSize space.  The
    1944             :   // heuristics are complicated a little by the fact that any 128-character
    1945             :   // encoding space can be quickly tested with a table lookup, so we don't
    1946             :   // wish to do binary chop search at a smaller granularity than that.  A
    1947             :   // 128-character space can take up a lot of space in the ranges array if,
    1948             :   // for example, we only want to match every second character (e.g. the lower
    1949             :   // case characters on some Unicode pages).
    1950       21679 :   int binary_chop_index = (end_index + start_index) / 2;
    1951             :   // The first test ensures that we get to the code that handles the Latin1
    1952             :   // range with a single not-taken branch, speeding up this important
    1953             :   // character range (even non-Latin1 charset-based text has spaces and
    1954             :   // punctuation).
    1955       60175 :   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
    1956       31024 :       end_index - start_index > (*new_start_index - start_index) * 2 &&
    1957       61872 :       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
    1958       25728 :       ranges->at(binary_chop_index) >= first + 2 * kSize) {
    1959             :     int scan_forward_for_section_border = binary_chop_index;;
    1960       10614 :     int new_border = (ranges->at(binary_chop_index) | kMask) + 1;  // Page boundary just above the midpoint edge.
    1961             : 
    1962       82386 :     while (scan_forward_for_section_border < end_index) {
    1963       69843 :       if (ranges->at(scan_forward_for_section_border) > new_border) {
    1964        8685 :         *new_start_index = scan_forward_for_section_border;
    1965        8685 :         *border = new_border;
    1966        8685 :         break;
    1967             :       }
    1968       61158 :       scan_forward_for_section_border++;
    1969             :     }
    1970             :   }
    1971             : 
    1972             :   DCHECK(*new_start_index > start_index);
    1973       21679 :   *new_end_index = *new_start_index - 1;
    1974       21679 :   if (ranges->at(*new_end_index) == *border) {  // An edge exactly on the border belongs to neither half.
    1975        3467 :     (*new_end_index)--;
    1976             :   }
    1977       43358 :   if (*border >= ranges->at(end_index)) {  // No edge lies beyond the border: clamp it to the last edge.
    1978        1278 :     *border = ranges->at(end_index);
    1979        1278 :     *new_start_index = end_index;  // Won't be used.
    1980        1278 :     *new_end_index = end_index - 1;
    1981             :   }
    1982       21679 : }
    1983             : 
    1984             : // Emits the branches that classify the current character against ranges[start_index..end_index].
    1985             : // Gets a series of segment boundaries representing a character class.  If the
    1986             : // character is in the range between an even and an odd boundary (counting from
    1987             : // start_index) then go to even_label, otherwise go to odd_label.  We already
    1988             : // know that the character is in the range of min_char to max_char inclusive.
    1989             : // Either label can be NULL indicating backtracking.  Either label can also be
    1990             : // equal to the fall_through label.
    1991      225968 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
    1992             :                              int start_index, int end_index, uc32 min_char,
    1993             :                              uc32 max_char, Label* fall_through,
    1994             :                              Label* even_label, Label* odd_label) {
    1995             :   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
    1996             :   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
    1997             : 
    1998      225968 :   int first = ranges->at(start_index);
    1999      225968 :   int last = ranges->at(end_index) - 1;
    2000             : 
    2001             :   DCHECK_LT(min_char, first);
    2002             : 
    2003             :   // Just need to test if the character is before or on-or-after
    2004             :   // a particular character.
    2005      225968 :   if (start_index == end_index) {
    2006        9788 :     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
    2007        9788 :     return;
    2008             :   }
    2009             : 
    2010             :   // Another almost trivial case:  There is one interval in the middle that is
    2011             :   // different from the end intervals.
    2012      216180 :   if (start_index + 1 == end_index) {
    2013             :     EmitDoubleBoundaryTest(
    2014      139645 :         masm, first, last, fall_through, even_label, odd_label);
    2015      139645 :     return;
    2016             :   }
    2017             : 
    2018             :   // It's not worth using table lookup if there are very few intervals in the
    2019             :   // character class.
    2020       76535 :   if (end_index - start_index <= 6) {
    2021             :     // It is faster to test for individual characters, so we look for those
    2022             :     // first, then try arbitrary ranges in the second round.
    2023             :     static int kNoCutIndex = -1;  // Sentinel: no single-character interval found.
    2024       39249 :     int cut = kNoCutIndex;
    2025      161546 :     for (int i = start_index; i < end_index; i++) {
    2026      204512 :       if (ranges->at(i) == ranges->at(i + 1) - 1) {
    2027             :         cut = i;
    2028             :         break;
    2029             :       }
    2030             :     }
    2031       39249 :     if (cut == kNoCutIndex) cut = start_index;  // Otherwise cut out the first interval.
    2032             :     CutOutRange(
    2033       39249 :         masm, ranges, start_index, end_index, cut, even_label, odd_label);
    2034             :     DCHECK_GE(end_index - start_index, 2);
    2035             :     GenerateBranches(masm,
    2036             :                      ranges,
    2037             :                      start_index + 1,
    2038             :                      end_index - 1,
    2039             :                      min_char,
    2040             :                      max_char,
    2041             :                      fall_through,
    2042             :                      even_label,
    2043       39249 :                      odd_label);
    2044       39249 :     return;
    2045             :   }
    2046             : 
    2047             :   // If there are a lot of intervals in the regexp, then we will use tables to
    2048             :   // determine whether the character is inside or outside the character class.
    2049             :   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
    2050             : 
    2051       37286 :   if ((max_char >> kBits) == (min_char >> kBits)) {  // All candidates share one table page.
    2052             :     EmitUseLookupTable(masm,
    2053             :                        ranges,
    2054             :                        start_index,
    2055             :                        end_index,
    2056             :                        min_char,
    2057             :                        fall_through,
    2058             :                        even_label,
    2059        6432 :                        odd_label);
    2060        6432 :     return;
    2061             :   }
    2062             : 
    2063       30854 :   if ((min_char >> kBits) != (first >> kBits)) {  // |first| is on a different page: test below-|first| directly, then recurse with the labels swapped.
    2064        9175 :     masm->CheckCharacterLT(first, odd_label);
    2065             :     GenerateBranches(masm,
    2066             :                      ranges,
    2067             :                      start_index + 1,
    2068             :                      end_index,
    2069             :                      first,
    2070             :                      max_char,
    2071             :                      fall_through,
    2072             :                      odd_label,
    2073        9175 :                      even_label);
    2074        9175 :     return;
    2075             :   }
    2076             : 
    2077       21679 :   int new_start_index = 0;
    2078       21679 :   int new_end_index = 0;
    2079       21679 :   int border = 0;
    2080             : 
    2081             :   SplitSearchSpace(ranges,
    2082             :                    start_index,
    2083             :                    end_index,
    2084             :                    &new_start_index,
    2085             :                    &new_end_index,
    2086       21679 :                    &border);
    2087             : 
    2088             :   Label handle_rest;
    2089             :   Label* above = &handle_rest;
    2090       21679 :   if (border == last + 1) {
    2091             :     // We didn't find any section that started after the limit, so everything
    2092             :     // above the border is one of the terminal labels.
    2093        1278 :     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
    2094             :     DCHECK(new_end_index == end_index - 1);
    2095             :   }
    2096             : 
    2097             :   DCHECK_LE(start_index, new_end_index);
    2098             :   DCHECK_LE(new_start_index, end_index);
    2099             :   DCHECK_LT(start_index, new_start_index);
    2100             :   DCHECK_LT(new_end_index, end_index);
    2101             :   DCHECK(new_end_index + 1 == new_start_index ||
    2102             :          (new_end_index + 2 == new_start_index &&
    2103             :           border == ranges->at(new_end_index + 1)));
    2104             :   DCHECK_LT(min_char, border - 1);
    2105             :   DCHECK_LT(border, max_char);
    2106             :   DCHECK_LT(ranges->at(new_end_index), border);
    2107             :   DCHECK(border < ranges->at(new_start_index) ||
    2108             :          (border == ranges->at(new_start_index) &&
    2109             :           new_start_index == end_index &&
    2110             :           new_end_index == end_index - 1 &&
    2111             :           border == last + 1));
    2112             :   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
    2113             : 
    2114       21679 :   masm->CheckCharacterGT(border - 1, above);
    2115             :   Label dummy;  // Passed as fall_through to both recursive calls below.
    2116             :   GenerateBranches(masm,
    2117             :                    ranges,
    2118             :                    start_index,
    2119             :                    new_end_index,
    2120             :                    min_char,
    2121             :                    border - 1,
    2122             :                    &dummy,
    2123             :                    even_label,
    2124       21679 :                    odd_label);
    2125       21679 :   if (handle_rest.is_linked()) {  // Only emit the upper half if the branch above targeted it.
    2126       20401 :     masm->Bind(&handle_rest);
    2127       20401 :     bool flip = (new_start_index & 1) != (start_index & 1);  // Label roles follow the parity of the new start index.
    2128             :     GenerateBranches(masm,
    2129             :                      ranges,
    2130             :                      new_start_index,
    2131             :                      end_index,
    2132             :                      border,
    2133             :                      max_char,
    2134             :                      &dummy,
    2135             :                      flip ? odd_label : even_label,
    2136       20401 :                      flip ? even_label : odd_label);
    2137             :   }
    2138             : }
    2139             : 
    2140             : 
    2141      241451 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
    2142             :                           RegExpCharacterClass* cc, bool one_byte,
    2143             :                           Label* on_failure, int cp_offset, bool check_offset,
    2144             :                           bool preloaded, Zone* zone) {
    2145             :   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
    2146      241451 :   CharacterRange::Canonicalize(ranges);
    2147             : 
    2148             :   int max_char;
    2149      241451 :   if (one_byte) {
    2150             :     max_char = String::kMaxOneByteCharCode;
    2151             :   } else {
    2152             :     max_char = String::kMaxUtf16CodeUnit;
    2153             :   }
    2154             : 
    2155      241451 :   int range_count = ranges->length();
    2156             : 
    2157      241451 :   int last_valid_range = range_count - 1;
    2158      700980 :   while (last_valid_range >= 0) {
    2159      459484 :     CharacterRange& range = ranges->at(last_valid_range);
    2160      459484 :     if (range.from() <= max_char) {
    2161             :       break;
    2162             :     }
    2163      218078 :     last_valid_range--;
    2164             :   }
    2165             : 
    2166      241451 :   if (last_valid_range < 0) {
    2167          45 :     if (!cc->is_negated()) {
    2168          14 :       macro_assembler->GoTo(on_failure);
    2169             :     }
    2170          45 :     if (check_offset) {
    2171          45 :       macro_assembler->CheckPosition(cp_offset, on_failure);
    2172             :     }
    2173      105987 :     return;
    2174             :   }
    2175             : 
    2176      449397 :   if (last_valid_range == 0 &&
    2177      207991 :       ranges->at(0).IsEverything(max_char)) {
    2178       87799 :     if (cc->is_negated()) {
    2179          21 :       macro_assembler->GoTo(on_failure);
    2180             :     } else {
    2181             :       // This is a common case hit by non-anchored expressions.
    2182       87778 :       if (check_offset) {
    2183       56209 :         macro_assembler->CheckPosition(cp_offset, on_failure);
    2184             :       }
    2185             :     }
    2186             :     return;
    2187             :   }
    2188             : 
    2189      153607 :   if (!preloaded) {
    2190      140426 :     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
    2191             :   }
    2192             : 
    2193      172129 :   if (cc->is_standard(zone) &&
    2194             :       macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
    2195       37044 :                                                   on_failure)) {
    2196             :       return;
    2197             :   }
    2198             : 
    2199             : 
    2200             :   // A new list with ascending entries.  Each entry is a code unit
    2201             :   // where there is a boundary between code units that are part of
    2202             :   // the class and code units that are not.  Normally we insert an
    2203             :   // entry at zero which goes to the failure label, but if there
    2204             :   // was already one there we fall through for success on that entry.
    2205             :   // Subsequent entries have alternating meaning (success/failure).
    2206             :   ZoneList<int>* range_boundaries =
    2207      135464 :       new(zone) ZoneList<int>(last_valid_range, zone);
    2208             : 
    2209      135464 :   bool zeroth_entry_is_failure = !cc->is_negated();
    2210             : 
    2211      383249 :   for (int i = 0; i <= last_valid_range; i++) {
    2212      495570 :     CharacterRange& range = ranges->at(i);
    2213      247785 :     if (range.from() == 0) {
    2214             :       DCHECK_EQ(i, 0);
    2215        2398 :       zeroth_entry_is_failure = !zeroth_entry_is_failure;
    2216             :     } else {
    2217      490774 :       range_boundaries->Add(range.from(), zone);
    2218             :     }
    2219      495570 :     range_boundaries->Add(range.to() + 1, zone);
    2220             :   }
    2221      135464 :   int end_index = range_boundaries->length() - 1;
    2222      135464 :   if (range_boundaries->at(end_index) > max_char) {
    2223        3025 :     end_index--;
    2224             :   }
    2225             : 
    2226             :   Label fall_through;
    2227             :   GenerateBranches(macro_assembler,
    2228             :                    range_boundaries,
    2229             :                    0,  // start_index.
    2230             :                    end_index,
    2231             :                    0,  // min_char.
    2232             :                    max_char,
    2233             :                    &fall_through,
    2234             :                    zeroth_entry_is_failure ? &fall_through : on_failure,
    2235      135464 :                    zeroth_entry_is_failure ? on_failure : &fall_through);
    2236      135464 :   macro_assembler->Bind(&fall_through);
    2237             : }
    2238             : 
    2239             : 
    2240           0 : RegExpNode::~RegExpNode() {
    2241           0 : }
    2242             : 
    2243             : 
    2244     5341068 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
    2245     2558996 :                                                   Trace* trace) {
    2246             :   // If we are generating a greedy loop then don't stop and don't reuse code.
    2247     2022827 :   if (trace->stop_node() != NULL) {
    2248             :     return CONTINUE;
    2249             :   }
    2250             : 
    2251             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    2252     2006775 :   if (trace->is_trivial()) {
    2253     1724621 :     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
    2254             :       // If a generic version is already scheduled to be generated or we have
    2255             :       // recursed too deeply then just generate a jump to that code.
    2256      249715 :       macro_assembler->GoTo(&label_);
    2257             :       // This will queue it up for generation of a generic version if it hasn't
    2258             :       // already been queued.
    2259      249715 :       compiler->AddWork(this);
    2260      249715 :       return DONE;
    2261             :     }
    2262             :     // Generate generic version of the node and bind the label for later use.
    2263      438904 :     macro_assembler->Bind(&label_);
    2264      438904 :     return CONTINUE;
    2265             :   }
    2266             : 
    2267             :   // We are being asked to make a non-generic version.  Keep track of how many
    2268             :   // non-generic versions we generate so as not to overdo it.
    2269     1318156 :   trace_count_++;
    2270     2629622 :   if (KeepRecursing(compiler) && compiler->optimize() &&
    2271             :       trace_count_ < kMaxCopiesCodeGenerated) {
    2272             :     return CONTINUE;
    2273             :   }
    2274             : 
    2275             :   // If we get here code has been generated for this node too many times or
    2276             :   // recursion is too deep.  Time to switch to a generic version.  The code for
    2277             :   // generic versions above can handle deep recursion properly.
    2278             :   bool was_limiting = compiler->limiting_recursion();
    2279             :   compiler->set_limiting_recursion(true);
    2280      548827 :   trace->Flush(compiler, this);
    2281             :   compiler->set_limiting_recursion(was_limiting);
    2282      548827 :   return DONE;
    2283             : }
    2284             : 
    2285             : 
    2286     4336278 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
    2287     4336278 :   return !compiler->limiting_recursion() &&
    2288           0 :          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
    2289             : }
    2290             : 
    2291             : 
    2292      779046 : int ActionNode::EatsAtLeast(int still_to_find,
    2293             :                             int budget,
    2294             :                             bool not_at_start) {
    2295      779046 :   if (budget <= 0) return 0;
    2296      760101 :   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
    2297      754400 :   return on_success()->EatsAtLeast(still_to_find,
    2298             :                                    budget - 1,
    2299      754400 :                                    not_at_start);
    2300             : }
    2301             : 
    2302             : 
    2303      101768 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
    2304             :                               BoyerMooreLookahead* bm, bool not_at_start) {
    2305      101768 :   if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
    2306      101768 :     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
    2307             :   }
    2308             :   SaveBMInfo(bm, not_at_start, offset);
    2309      101768 : }
    2310             : 
    2311             : 
    2312       18518 : int AssertionNode::EatsAtLeast(int still_to_find,
    2313             :                                int budget,
    2314       17645 :                                bool not_at_start) {
    2315       18518 :   if (budget <= 0) return 0;
    2316             :   // If we know we are not at the start and we are asked "how many characters
    2317             :   // will you match if you succeed?" then we can answer anything since false
    2318             :   // implies false.  So lets just return the max answer (still_to_find) since
    2319             :   // that won't prevent us from preloading a lot of characters for the other
    2320             :   // branches in the node graph.
    2321       17645 :   if (assertion_type() == AT_START && not_at_start) return still_to_find;
    2322       17363 :   return on_success()->EatsAtLeast(still_to_find,
    2323             :                                    budget - 1,
    2324       17363 :                                    not_at_start);
    2325             : }
    2326             : 
    2327             : 
    2328        1134 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
    2329        1134 :                                  BoyerMooreLookahead* bm, bool not_at_start) {
    2330             :   // Match the behaviour of EatsAtLeast on this node.
    2331        2268 :   if (assertion_type() == AT_START && not_at_start) return;
    2332        1114 :   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
    2333             :   SaveBMInfo(bm, not_at_start, offset);
    2334             : }
    2335             : 
    2336             : 
    2337        3988 : int BackReferenceNode::EatsAtLeast(int still_to_find,
    2338             :                                    int budget,
    2339        3988 :                                    bool not_at_start) {
    2340        3988 :   if (read_backward()) return 0;
    2341        3856 :   if (budget <= 0) return 0;
    2342        3856 :   return on_success()->EatsAtLeast(still_to_find,
    2343             :                                    budget - 1,
    2344        3856 :                                    not_at_start);
    2345             : }
    2346             : 
    2347             : 
    2348     7741859 : int TextNode::EatsAtLeast(int still_to_find,
    2349             :                           int budget,
    2350     7741859 :                           bool not_at_start) {
    2351     7741859 :   if (read_backward()) return 0;
    2352     7740050 :   int answer = Length();
    2353     7740050 :   if (answer >= still_to_find) return answer;
    2354     4665853 :   if (budget <= 0) return answer;
    2355             :   // We are not at start after this node so we set the last argument to 'true'.
    2356     3257497 :   return answer + on_success()->EatsAtLeast(still_to_find - answer,
    2357             :                                             budget - 1,
    2358     3257497 :                                             true);
    2359             : }
    2360             : 
    2361             : 
    2362       11380 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
    2363             :                                               bool not_at_start) {
    2364       11380 :   if (budget <= 0) return 0;
    2365             :   // Alternative 0 is the negative lookahead, alternative 1 is what comes
    2366             :   // afterwards.
    2367       22522 :   RegExpNode* node = alternatives_->at(1).node();
    2368       11261 :   return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
    2369             : }
    2370             : 
    2371             : 
    2372        4508 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
    2373             :     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
    2374             :     bool not_at_start) {
    2375             :   // Alternative 0 is the negative lookahead, alternative 1 is what comes
    2376             :   // afterwards.
    2377        9016 :   RegExpNode* node = alternatives_->at(1).node();
    2378        4508 :   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
    2379             : }
    2380             : 
    2381             : 
    2382     9665326 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
    2383             :                                   int budget,
    2384             :                                   RegExpNode* ignore_this_node,
    2385             :                                   bool not_at_start) {
    2386     9665326 :   if (budget <= 0) return 0;
    2387             :   int min = 100;
    2388     6689076 :   int choice_count = alternatives_->length();
    2389     6689076 :   budget = (budget - 1) / choice_count;
    2390    14608442 :   for (int i = 0; i < choice_count; i++) {
    2391    28458628 :     RegExpNode* node = alternatives_->at(i).node();
    2392    14229314 :     if (node == ignore_this_node) continue;
    2393             :     int node_eats_at_least =
    2394    14041857 :         node->EatsAtLeast(still_to_find, budget, not_at_start);
    2395    14041857 :     if (node_eats_at_least < min) min = node_eats_at_least;
    2396    14041857 :     if (min == 0) return 0;
    2397             :   }
    2398             :   return min;
    2399             : }
    2400             : 
    2401             : 
    2402      198628 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
    2403             :                                 int budget,
    2404             :                                 bool not_at_start) {
    2405             :   return EatsAtLeastHelper(still_to_find,
    2406             :                            budget - 1,
    2407             :                            loop_node_,
    2408      198628 :                            not_at_start);
    2409             : }
    2410             : 
    2411             : 
    2412     9466698 : int ChoiceNode::EatsAtLeast(int still_to_find,
    2413             :                             int budget,
    2414             :                             bool not_at_start) {
    2415             :   return EatsAtLeastHelper(still_to_find,
    2416             :                            budget,
    2417             :                            NULL,
    2418     9466698 :                            not_at_start);
    2419             : }
    2420             : 
    2421             : 
    2422             : // Takes the left-most 1-bit and smears it out, setting all bits to its right.
    2423             : static inline uint32_t SmearBitsRight(uint32_t v) {
    2424      324912 :   v |= v >> 1;
    2425      324912 :   v |= v >> 2;
    2426      324912 :   v |= v >> 4;
    2427      324912 :   v |= v >> 8;
    2428      324912 :   v |= v >> 16;
    2429             :   return v;
    2430             : }
    2431             : 
    2432             : 
    2433      304720 : bool QuickCheckDetails::Rationalize(bool asc) {
    2434             :   bool found_useful_op = false;
    2435             :   uint32_t char_mask;
    2436      304720 :   if (asc) {
    2437             :     char_mask = String::kMaxOneByteCharCode;
    2438             :   } else {
    2439             :     char_mask = String::kMaxUtf16CodeUnit;
    2440             :   }
    2441      304720 :   mask_ = 0;
    2442      304720 :   value_ = 0;
    2443             :   int char_shift = 0;
    2444      849012 :   for (int i = 0; i < characters_; i++) {
    2445      544292 :     Position* pos = &positions_[i];
    2446      544292 :     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
    2447             :       found_useful_op = true;
    2448             :     }
    2449      544292 :     mask_ |= (pos->mask & char_mask) << char_shift;
    2450      544292 :     value_ |= (pos->value & char_mask) << char_shift;
    2451      544292 :     char_shift += asc ? 8 : 16;
    2452             :   }
    2453      304720 :   return found_useful_op;
    2454             : }
    2455             : 
    2456             : 
    2457     1394488 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
    2458       75702 :                                 Trace* bounds_check_trace,
    2459      679039 :                                 Trace* trace,
    2460             :                                 bool preload_has_checked_bounds,
    2461             :                                 Label* on_possible_success,
    2462     1657499 :                                 QuickCheckDetails* details,
    2463             :                                 bool fall_through_on_failure) {
    2464      590803 :   if (details->characters() == 0) return false;
    2465             :   GetQuickCheckDetails(
    2466      609716 :       details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
    2467      304858 :   if (details->cannot_match()) return false;
    2468      304720 :   if (!details->Rationalize(compiler->one_byte())) return false;
    2469             :   DCHECK(details->characters() == 1 ||
    2470             :          compiler->macro_assembler()->CanReadUnaligned());
    2471             :   uint32_t mask = details->mask();
    2472             :   uint32_t value = details->value();
    2473             : 
    2474             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    2475             : 
    2476      253946 :   if (trace->characters_preloaded() != details->characters()) {
    2477             :     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
    2478             :     // We are attempting to preload the minimum number of characters
    2479             :     // any choice would eat, so if the bounds check fails, then none of the
    2480             :     // choices can succeed, so we can just immediately backtrack, rather
    2481             :     // than go to the next choice.
    2482             :     assembler->LoadCurrentCharacter(trace->cp_offset(),
    2483             :                                     bounds_check_trace->backtrack(),
    2484       75702 :                                     !preload_has_checked_bounds,
    2485      227106 :                                     details->characters());
    2486             :   }
    2487             : 
    2488             : 
    2489             :   bool need_mask = true;
    2490             : 
    2491      253946 :   if (details->characters() == 1) {
    2492             :     // If number of characters preloaded is 1 then we used a byte or 16 bit
    2493             :     // load so the value is already masked down.
    2494             :     uint32_t char_mask;
    2495       50248 :     if (compiler->one_byte()) {
    2496             :       char_mask = String::kMaxOneByteCharCode;
    2497             :     } else {
    2498             :       char_mask = String::kMaxUtf16CodeUnit;
    2499             :     }
    2500       50248 :     if ((mask & char_mask) == char_mask) need_mask = false;
    2501             :     mask &= char_mask;
    2502             :   } else {
    2503             :     // For 2-character preloads in one-byte mode or 1-character preloads in
    2504             :     // two-byte mode we also use a 16 bit load with zero extend.
    2505             :     static const uint32_t kTwoByteMask = 0xffff;
    2506             :     static const uint32_t kFourByteMask = 0xffffffff;
    2507      398469 :     if (details->characters() == 2 && compiler->one_byte()) {
    2508      192853 :       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
    2509       10845 :     } else if (details->characters() == 1 && !compiler->one_byte()) {
    2510           0 :       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
    2511             :     } else {
    2512       10845 :       if (mask == kFourByteMask) need_mask = false;
    2513             :     }
    2514             :   }
    2515             : 
    2516      253946 :   if (fall_through_on_failure) {
    2517      209413 :     if (need_mask) {
    2518       61784 :       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
    2519             :     } else {
    2520      147629 :       assembler->CheckCharacter(value, on_possible_success);
    2521             :     }
    2522             :   } else {
    2523       44533 :     if (need_mask) {
    2524       12706 :       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
    2525             :     } else {
    2526       76360 :       assembler->CheckNotCharacter(value, trace->backtrack());
    2527             :     }
    2528             :   }
    2529             :   return true;
    2530             : }
    2531             : 
    2532             : 
    2533             : // Here is the meat of GetQuickCheckDetails (see also the comment on the
    2534             : // super-class in the .h file).
    2535             : //
    2536             : // We iterate along the text object, building up for each character a
    2537             : // mask and value that can be used to test for a quick failure to match.
    2538             : // The masks and values for the positions will be combined into a single
    2539             : // machine word for the current character width in order to be used in
    2540             : // generating a quick check.
    2541     1935795 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
    2542     1147124 :                                     RegExpCompiler* compiler,
    2543             :                                     int characters_filled_in,
    2544     1194150 :                                     bool not_at_start) {
    2545             :   // Do not collect any quick check details if the text node reads backward,
    2546             :   // since it reads in the opposite direction than we use for quick checks.
    2547      560994 :   if (read_backward()) return;
    2548      560994 :   Isolate* isolate = compiler->macro_assembler()->isolate();
    2549             :   DCHECK(characters_filled_in < details->characters());
    2550             :   int characters = details->characters();
    2551             :   int char_mask;
    2552      560994 :   if (compiler->one_byte()) {
    2553             :     char_mask = String::kMaxOneByteCharCode;
    2554             :   } else {
    2555             :     char_mask = String::kMaxUtf16CodeUnit;
    2556             :   }
    2557     1266312 :   for (int k = 0; k < elements()->length(); k++) {
    2558      567367 :     TextElement elm = elements()->at(k);
    2559      567367 :     if (elm.text_type() == TextElement::ATOM) {
    2560             :       Vector<const uc16> quarks = elm.atom()->data();
    2561      641361 :       for (int i = 0; i < characters && i < quarks.length(); i++) {
    2562             :         QuickCheckDetails::Position* pos =
    2563      599759 :             details->positions(characters_filled_in);
    2564      599759 :         uc16 c = quarks[i];
    2565      599759 :         if (compiler->ignore_case()) {
    2566             :           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    2567             :           int length = GetCaseIndependentLetters(isolate, c,
    2568       25136 :                                                  compiler->one_byte(), chars);
    2569       25136 :           if (length == 0) {
    2570             :             // This can happen because all case variants are non-Latin1, but we
    2571             :             // know the input is Latin1.
    2572             :             details->set_cannot_match();
    2573          35 :             pos->determines_perfectly = false;
    2574          35 :             return;
    2575             :           }
    2576       25101 :           if (length == 1) {
    2577             :             // This letter has no case equivalents, so it's nice and simple
    2578             :             // and the mask-compare will determine definitely whether we have
    2579             :             // a match at this character position.
    2580        3533 :             pos->mask = char_mask;
    2581        3533 :             pos->value = c;
    2582        3533 :             pos->determines_perfectly = true;
    2583             :           } else {
    2584       21568 :             uint32_t common_bits = char_mask;
    2585       21568 :             uint32_t bits = chars[0];
    2586       43545 :             for (int j = 1; j < length; j++) {
    2587       21977 :               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
    2588       21977 :               common_bits ^= differing_bits;
    2589       21977 :               bits &= common_bits;
    2590             :             }
    2591             :             // If length is 2 and common bits has only one zero in it then
    2592             :             // our mask and compare instruction will determine definitely
    2593             :             // whether we have a match at this character position.  Otherwise
    2594             :             // it can only be an approximate check.
    2595       21568 :             uint32_t one_zero = (common_bits | ~char_mask);
    2596       21568 :             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
    2597       21089 :               pos->determines_perfectly = true;
    2598             :             }
    2599       21568 :             pos->mask = common_bits;
    2600       21568 :             pos->value = bits;
    2601             :           }
    2602             :         } else {
    2603             :           // Don't ignore case.  Nice simple case where the mask-compare will
    2604             :           // determine definitely whether we have a match at this character
    2605             :           // position.
    2606      574623 :           if (c > char_mask) {
    2607             :             details->set_cannot_match();
    2608          35 :             pos->determines_perfectly = false;
    2609          35 :             return;
    2610             :           }
    2611      574588 :           pos->mask = char_mask;
    2612      574588 :           pos->value = c;
    2613      574588 :           pos->determines_perfectly = true;
    2614             :         }
    2615      599689 :         characters_filled_in++;
    2616             :         DCHECK(characters_filled_in <= details->characters());
    2617      599689 :         if (characters_filled_in == details->characters()) {
    2618             :           return;
    2619             :         }
    2620             :       }
    2621             :     } else {
    2622             :       QuickCheckDetails::Position* pos =
    2623      148399 :           details->positions(characters_filled_in);
    2624             :       RegExpCharacterClass* tree = elm.char_class();
    2625      148399 :       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
    2626      148399 :       if (tree->is_negated()) {
    2627             :         // A quick check uses multi-character mask and compare.  There is no
    2628             :         // useful way to incorporate a negative char class into this scheme
    2629             :         // so we just conservatively create a mask and value that will always
    2630             :         // succeed.
    2631        3848 :         pos->mask = 0;
    2632        3848 :         pos->value = 0;
    2633             :       } else {
    2634             :         int first_range = 0;
    2635      144589 :         while (ranges->at(first_range).from() > char_mask) {
    2636         108 :           first_range++;
    2637      503602 :           if (first_range == ranges->length()) {
    2638             :             details->set_cannot_match();
    2639          70 :             pos->determines_perfectly = false;
    2640             :             return;
    2641             :           }
    2642             :         }
    2643      144481 :         CharacterRange range = ranges->at(first_range);
    2644      144481 :         uc16 from = range.from();
    2645      144481 :         uc16 to = range.to();
    2646      144481 :         if (to > char_mask) {
    2647       22720 :           to = char_mask;
    2648             :         }
    2649      144481 :         uint32_t differing_bits = (from ^ to);
    2650             :         // A mask and compare is only perfect if the differing bits form a
    2651             :         // number like 00011111 with one single block of trailing 1s.
    2652      255417 :         if ((differing_bits & (differing_bits + 1)) == 0 &&
    2653      110936 :              from + differing_bits == to) {
    2654      101979 :           pos->determines_perfectly = true;
    2655             :         }
    2656      144481 :         uint32_t common_bits = ~SmearBitsRight(differing_bits);
    2657      144481 :         uint32_t bits = (from & common_bits);
    2658     1006988 :         for (int i = first_range + 1; i < ranges->length(); i++) {
    2659      359013 :           CharacterRange range = ranges->at(i);
    2660      359013 :           uc16 from = range.from();
    2661      359013 :           uc16 to = range.to();
    2662      359013 :           if (from > char_mask) continue;
    2663      180431 :           if (to > char_mask) to = char_mask;
    2664             :           // Here we are combining more ranges into the mask and compare
    2665             :           // value.  With each new range the mask becomes more sparse and
    2666             :           // so the chances of a false positive rise.  A character class
    2667             :           // with multiple ranges is assumed never to be equivalent to a
    2668             :           // mask and compare operation.
    2669      180431 :           pos->determines_perfectly = false;
    2670      180431 :           uint32_t new_common_bits = (from ^ to);
    2671      180431 :           new_common_bits = ~SmearBitsRight(new_common_bits);
    2672      180431 :           common_bits &= new_common_bits;
    2673      180431 :           bits &= new_common_bits;
    2674      180431 :           uint32_t differing_bits = (from & common_bits) ^ bits;
    2675      180431 :           common_bits ^= differing_bits;
    2676      180431 :           bits &= common_bits;
    2677             :         }
    2678      144481 :         pos->mask = common_bits;
    2679      144481 :         pos->value = bits;
    2680             :       }
    2681      148329 :       characters_filled_in++;
    2682             :       DCHECK(characters_filled_in <= details->characters());
    2683      148329 :       if (characters_filled_in == details->characters()) {
    2684             :         return;
    2685             :       }
    2686             :     }
    2687             :   }
    2688             :   DCHECK(characters_filled_in != details->characters());
    2689       65789 :   if (!details->cannot_match()) {
    2690       65789 :     on_success()-> GetQuickCheckDetails(details,
    2691             :                                         compiler,
    2692             :                                         characters_filled_in,
    2693       65789 :                                         true);
    2694             :   }
    2695             : }
    2696             : 
    2697             : 
    2698           0 : void QuickCheckDetails::Clear() {
    2699      407849 :   for (int i = 0; i < characters_; i++) {
    2700      407849 :     positions_[i].mask = 0;
    2701      407849 :     positions_[i].value = 0;
    2702      407849 :     positions_[i].determines_perfectly = false;
    2703             :   }
    2704     1304652 :   characters_ = 0;
    2705           0 : }
    2706             : 
    2707             : 
    2708      612652 : void QuickCheckDetails::Advance(int by, bool one_byte) {
    2709      612652 :   if (by >= characters_ || by < 0) {
    2710             :     DCHECK_IMPLIES(by < 0, characters_ == 0);
    2711             :     Clear();
    2712      612652 :     return;
    2713             :   }
    2714             :   DCHECK_LE(characters_ - by, 4);
    2715             :   DCHECK_LE(characters_, 4);
    2716       47062 :   for (int i = 0; i < characters_ - by; i++) {
    2717       47062 :     positions_[i] = positions_[by + i];
    2718             :   }
    2719       39020 :   for (int i = characters_ - by; i < characters_; i++) {
    2720       39020 :     positions_[i].mask = 0;
    2721       39020 :     positions_[i].value = 0;
    2722       39020 :     positions_[i].determines_perfectly = false;
    2723             :   }
    2724       38201 :   characters_ -= by;
    2725             :   // We could change mask_ and value_ here but we would never advance unless
    2726             :   // they had already been used in a check and they won't be used again because
    2727             :   // it would gain us nothing.  So there's no point.
    2728             : }
    2729             : 
    2730             : 
    2731      199109 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
    2732             :   DCHECK(characters_ == other->characters_);
    2733      199109 :   if (other->cannot_match_) {
    2734             :     return;
    2735             :   }
    2736      199005 :   if (cannot_match_) {
    2737         336 :     *this = *other;
    2738         336 :     return;
    2739             :   }
    2740      217714 :   for (int i = from_index; i < characters_; i++) {
    2741      217714 :     QuickCheckDetails::Position* pos = positions(i);
    2742      217714 :     QuickCheckDetails::Position* other_pos = other->positions(i);
    2743      258620 :     if (pos->mask != other_pos->mask ||
    2744       47526 :         pos->value != other_pos->value ||
    2745        6620 :         !other_pos->determines_perfectly) {
    2746             :       // Our mask-compare operation will be approximate unless we have the
    2747             :       // exact same operation on both sides of the alternation.
    2748      215416 :       pos->determines_perfectly = false;
    2749             :     }
    2750      217714 :     pos->mask &= other_pos->mask;
    2751      217714 :     pos->value &= pos->mask;
    2752      217714 :     other_pos->value &= pos->mask;
    2753      217714 :     uc16 differing_bits = (pos->value ^ other_pos->value);
    2754      217714 :     pos->mask &= ~differing_bits;
    2755      217714 :     pos->value &= pos->mask;
    2756             :   }
    2757             : }
    2758             : 
    2759             : 
    2760             : class VisitMarker {
    2761             :  public:
    2762             :   explicit VisitMarker(NodeInfo* info) : info_(info) {
    2763             :     DCHECK(!info->visited);
    2764      298699 :     info->visited = true;
    2765             :   }
    2766             :   ~VisitMarker() {
    2767      265237 :     info_->visited = false;
    2768             :   }
    2769             :  private:
    2770             :   NodeInfo* info_;
    2771             : };
    2772             : 
    2773             : 
    2774      150800 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
    2775      150800 :   if (info()->replacement_calculated) return replacement();
    2776      110286 :   if (depth < 0) return this;
    2777             :   DCHECK(!info()->visited);
    2778      110040 :   VisitMarker marker(info());
    2779             :   return FilterSuccessor(depth - 1, ignore_case);
    2780             : }
    2781             : 
    2782             : 
    2783           0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
    2784      204225 :   RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
    2785      204225 :   if (next == NULL) return set_replacement(NULL);
    2786      203615 :   on_success_ = next;
    2787      203615 :   return set_replacement(this);
    2788             : }
    2789             : 
    2790             : 
    2791             : // We need to check for the following characters: 0x39c 0x3bc 0x178.
    2792        8359 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
    2793             :   // TODO(dcarney): this could be a lot more efficient.
    2794        8273 :   return range.Contains(0x39c) ||
    2795       16620 :       range.Contains(0x3bc) || range.Contains(0x178);
    2796             : }
    2797             : 
    2798             : 
    2799          26 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
    2800          64 :   for (int i = 0; i < ranges->length(); i++) {
    2801             :     // TODO(dcarney): this could be a lot more efficient.
    2802          58 :     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
    2803             :   }
    2804             :   return false;
    2805             : }
    2806             : 
    2807             : 
    2808      297565 : RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
    2809      156395 :   if (info()->replacement_calculated) return replacement();
    2810       94780 :   if (depth < 0) return this;
    2811             :   DCHECK(!info()->visited);
    2812       94717 :   VisitMarker marker(info());
    2813       94717 :   int element_count = elements()->length();
    2814      195614 :   for (int i = 0; i < element_count; i++) {
    2815      101429 :     TextElement elm = elements()->at(i);
    2816      101429 :     if (elm.text_type() == TextElement::ATOM) {
    2817             :       Vector<const uc16> quarks = elm.atom()->data();
    2818      154576 :       for (int j = 0; j < quarks.length(); j++) {
    2819      108342 :         uint16_t c = quarks[j];
    2820      108342 :         if (c <= String::kMaxOneByteCharCode) continue;
    2821         393 :         if (!ignore_case) return set_replacement(NULL);
    2822             :         // Here, we need to check for characters whose upper and lower cases
    2823             :         // are outside the Latin-1 range.
    2824             :         uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
    2825             :         // Character is outside Latin-1 completely
    2826         231 :         if (converted == 0) return set_replacement(NULL);
    2827             :         // Convert quark to Latin-1 in place.
    2828             :         uint16_t* copy = const_cast<uint16_t*>(quarks.start());
    2829          47 :         copy[j] = converted;
    2830             :       }
    2831             :     } else {
    2832             :       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
    2833             :       RegExpCharacterClass* cc = elm.char_class();
    2834             :       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    2835       54976 :       CharacterRange::Canonicalize(ranges);
    2836             :       // Now they are in order so we only need to look at the first.
    2837       54976 :       int range_count = ranges->length();
    2838       54976 :       if (cc->is_negated()) {
    2839        9674 :         if (range_count != 0 &&
    2840        9871 :             ranges->at(0).from() == 0 &&
    2841         197 :             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
    2842             :           // This will be handled in a later filter.
    2843          44 :           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
    2844          43 :           return set_replacement(NULL);
    2845             :         }
    2846             :       } else {
    2847      100272 :         if (range_count == 0 ||
    2848       50133 :             ranges->at(0).from() > String::kMaxOneByteCharCode) {
    2849             :           // This will be handled in a later filter.
    2850         289 :           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
    2851         270 :           return set_replacement(NULL);
    2852             :         }
    2853             :       }
    2854             :     }
    2855             :   }
    2856       94185 :   return FilterSuccessor(depth - 1, ignore_case);
    2857             : }
    2858             : 
    2859             : 
    2860       82711 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
    2861       82711 :   if (info()->replacement_calculated) return replacement();
    2862       63751 :   if (depth < 0) return this;
    2863       63623 :   if (info()->visited) return this;
    2864             :   {
    2865       33886 :     VisitMarker marker(info());
    2866             : 
    2867             :     RegExpNode* continue_replacement =
    2868       33886 :         continue_node_->FilterOneByte(depth - 1, ignore_case);
    2869             :     // If we can't continue after the loop then there is no sense in doing the
    2870             :     // loop.
    2871       33886 :     if (continue_replacement == NULL) return set_replacement(NULL);
    2872             :   }
    2873             : 
    2874       33462 :   return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
    2875             : }
    2876             : 
    2877             : 
    2878       42999 : RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
    2879       43185 :   if (info()->replacement_calculated) return replacement();
    2880       40528 :   if (depth < 0) return this;
    2881       40407 :   if (info()->visited) return this;
    2882       40407 :   VisitMarker marker(info());
    2883       40407 :   int choice_count = alternatives_->length();
    2884             : 
    2885      126590 :   for (int i = 0; i < choice_count; i++) {
    2886       90114 :     GuardedAlternative alternative = alternatives_->at(i);
    2887       90114 :     if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
    2888        3931 :       set_replacement(this);
    2889             :       return this;
    2890             :     }
    2891             :   }
    2892             : 
    2893             :   int surviving = 0;
    2894             :   RegExpNode* survivor = NULL;
    2895       85419 :   for (int i = 0; i < choice_count; i++) {
    2896      170838 :     GuardedAlternative alternative = alternatives_->at(i);
    2897             :     RegExpNode* replacement =
    2898       85419 :         alternative.node()->FilterOneByte(depth - 1, ignore_case);
    2899             :     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
    2900       85419 :     if (replacement != NULL) {
    2901       85228 :       alternatives_->at(i).set_node(replacement);
    2902       85228 :       surviving++;
    2903             :       survivor = replacement;
    2904             :     }
    2905             :   }
    2906       36563 :   if (surviving < 2) return set_replacement(survivor);
    2907             : 
    2908       36389 :   set_replacement(this);
    2909       36389 :   if (surviving == choice_count) {
    2910             :     return this;
    2911             :   }
    2912             :   // Only some of the nodes survived the filtering.  We need to rebuild the
    2913             :   // alternatives list.
    2914             :   ZoneList<GuardedAlternative>* new_alternatives =
    2915          26 :       new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
    2916         248 :   for (int i = 0; i < choice_count; i++) {
    2917             :     RegExpNode* replacement =
    2918         444 :         alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
    2919         222 :     if (replacement != NULL) {
    2920         160 :       alternatives_->at(i).set_node(replacement);
    2921         160 :       new_alternatives->Add(alternatives_->at(i), zone());
    2922             :     }
    2923             :   }
    2924          26 :   alternatives_ = new_alternatives;
    2925          26 :   return this;
    2926             : }
    2927             : 
    2928             : 
    2929         387 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
    2930             :                                                         bool ignore_case) {
    2931         387 :   if (info()->replacement_calculated) return replacement();
    2932         387 :   if (depth < 0) return this;
    2933         387 :   if (info()->visited) return this;
    2934         387 :   VisitMarker marker(info());
    2935             :   // Alternative 0 is the negative lookahead, alternative 1 is what comes
    2936             :   // afterwards.
    2937         774 :   RegExpNode* node = alternatives_->at(1).node();
    2938         387 :   RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
    2939         394 :   if (replacement == NULL) return set_replacement(NULL);
    2940         380 :   alternatives_->at(1).set_node(replacement);
    2941             : 
    2942         760 :   RegExpNode* neg_node = alternatives_->at(0).node();
    2943         380 :   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
    2944             :   // If the negative lookahead is always going to fail then
    2945             :   // we don't need to check it.
    2946         387 :   if (neg_replacement == NULL) return set_replacement(replacement);
    2947         373 :   alternatives_->at(0).set_node(neg_replacement);
    2948         746 :   return set_replacement(this);
    2949             : }
    2950             : 
    2951             : 
    2952       26860 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
    2953             :                                           RegExpCompiler* compiler,
    2954             :                                           int characters_filled_in,
    2955             :                                           bool not_at_start) {
    2956       26860 :   if (body_can_be_zero_length_ || info()->visited) return;
    2957       19262 :   VisitMarker marker(info());
    2958             :   return ChoiceNode::GetQuickCheckDetails(details,
    2959             :                                           compiler,
    2960             :                                           characters_filled_in,
    2961       19262 :                                           not_at_start);
    2962             : }
    2963             : 
    2964             : 
    2965       16894 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
    2966             :                                   BoyerMooreLookahead* bm, bool not_at_start) {
    2967       16894 :   if (body_can_be_zero_length_ || budget <= 0) {
    2968             :     bm->SetRest(offset);
    2969             :     SaveBMInfo(bm, not_at_start, offset);
    2970       16894 :     return;
    2971             :   }
    2972       16375 :   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
    2973             :   SaveBMInfo(bm, not_at_start, offset);
    2974             : }
    2975             : 
    2976             : 
    2977      257628 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
    2978             :                                       RegExpCompiler* compiler,
    2979             :                                       int characters_filled_in,
    2980             :                                       bool not_at_start) {
    2981       58519 :   not_at_start = (not_at_start || not_at_start_);
    2982       58519 :   int choice_count = alternatives_->length();
    2983             :   DCHECK(choice_count > 0);
    2984       58519 :   alternatives_->at(0).node()->GetQuickCheckDetails(details,
    2985             :                                                     compiler,
    2986             :                                                     characters_filled_in,
    2987       58519 :                                                     not_at_start);
    2988      257628 :   for (int i = 1; i < choice_count; i++) {
    2989             :     QuickCheckDetails new_details(details->characters());
    2990      398218 :     RegExpNode* node = alternatives_->at(i).node();
    2991             :     node->GetQuickCheckDetails(&new_details, compiler,
    2992             :                                characters_filled_in,
    2993      199109 :                                not_at_start);
    2994             :     // Here we merge the quick match details of the two branches.
    2995      199109 :     details->Merge(&new_details, characters_filled_in);
    2996             :   }
    2997       58519 : }
    2998             : 
    2999             : 
    3000             : // Check for [0-9A-Z_a-z].
    3001         674 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
    3002             :                           Label* word,
    3003             :                           Label* non_word,
    3004             :                           bool fall_through_on_word) {
    3005         674 :   if (assembler->CheckSpecialCharacterClass(
    3006             :           fall_through_on_word ? 'w' : 'W',
    3007         674 :           fall_through_on_word ? non_word : word)) {
    3008             :     // Optimized implementation available.
    3009         674 :     return;
    3010             :   }
    3011           0 :   assembler->CheckCharacterGT('z', non_word);
    3012           0 :   assembler->CheckCharacterLT('0', non_word);
    3013           0 :   assembler->CheckCharacterGT('a' - 1, word);
    3014           0 :   assembler->CheckCharacterLT('9' + 1, word);
    3015           0 :   assembler->CheckCharacterLT('A', non_word);
    3016           0 :   assembler->CheckCharacterLT('Z' + 1, word);
    3017           0 :   if (fall_through_on_word) {
    3018           0 :     assembler->CheckNotCharacter('_', non_word);
    3019             :   } else {
    3020           0 :     assembler->CheckCharacter('_', word);
    3021             :   }
    3022             : }
    3023             : 
    3024             : 
    3025             : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
    3026             : // that matches newline or the start of input).
    3027         797 : static void EmitHat(RegExpCompiler* compiler,
    3028             :                     RegExpNode* on_success,
    3029             :                     Trace* trace) {
    3030             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    3031             :   // We will be loading the previous character into the current character
    3032             :   // register.
    3033         797 :   Trace new_trace(*trace);
    3034             :   new_trace.InvalidateCurrentCharacter();
    3035             : 
    3036             :   Label ok;
    3037         797 :   if (new_trace.cp_offset() == 0) {
    3038             :     // The start of input counts as a newline in this context, so skip to
    3039             :     // ok if we are at the start.
    3040         791 :     assembler->CheckAtStart(&ok);
    3041             :   }
    3042             :   // We already checked that we are not at the start of input so it must be
    3043             :   // OK to load the previous character.
    3044         797 :   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
    3045             :                                   new_trace.backtrack(),
    3046        1594 :                                   false);
    3047         797 :   if (!assembler->CheckSpecialCharacterClass('n',
    3048         797 :                                              new_trace.backtrack())) {
    3049             :     // Newline means \n, \r, 0x2028 or 0x2029.
    3050           0 :     if (!compiler->one_byte()) {
    3051           0 :       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
    3052             :     }
    3053           0 :     assembler->CheckCharacter('\n', &ok);
    3054           0 :     assembler->CheckNotCharacter('\r', new_trace.backtrack());
    3055             :   }
    3056         797 :   assembler->Bind(&ok);
    3057         797 :   on_success->Emit(compiler, &new_trace);
    3058         797 : }
    3059             : 
    3060             : 
    3061             : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
    3062         989 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
    3063         316 :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    3064             :   Isolate* isolate = assembler->isolate();
    3065             :   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
    3066         316 :   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
    3067         193 :   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
    3068         316 :   if (lookahead == NULL) {
    3069             :     int eats_at_least =
    3070             :         Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
    3071             :                                                     kRecursionBudget,
    3072         249 :                                                     not_at_start));
    3073         249 :     if (eats_at_least >= 1) {
    3074         126 :       BoyerMooreLookahead* bm =
    3075         126 :           new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
    3076         126 :       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
    3077         126 :       if (bm->at(0)->is_non_word())
    3078             :         next_is_word_character = Trace::FALSE_VALUE;
    3079         126 :       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
    3080             :     }
    3081             :   } else {
    3082          67 :     if (lookahead->at(0)->is_non_word())
    3083             :       next_is_word_character = Trace::FALSE_VALUE;
    3084          67 :     if (lookahead->at(0)->is_word())
    3085             :       next_is_word_character = Trace::TRUE_VALUE;
    3086             :   }
    3087         316 :   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
    3088         316 :   if (next_is_word_character == Trace::UNKNOWN) {
    3089             :     Label before_non_word;
    3090             :     Label before_word;
    3091         179 :     if (trace->characters_preloaded() != 1) {
    3092         356 :       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
    3093             :     }
    3094             :     // Fall through on non-word.
    3095         179 :     EmitWordCheck(assembler, &before_word, &before_non_word, false);
    3096             :     // Next character is not a word character.
    3097         179 :     assembler->Bind(&before_non_word);
    3098             :     Label ok;
    3099         179 :     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
    3100         179 :     assembler->GoTo(&ok);
    3101             : 
    3102         179 :     assembler->Bind(&before_word);
    3103         179 :     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
    3104         179 :     assembler->Bind(&ok);
    3105         137 :   } else if (next_is_word_character == Trace::TRUE_VALUE) {
    3106         102 :     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
    3107             :   } else {
    3108             :     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
    3109          35 :     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
    3110             :   }
    3111         316 : }
    3112             : 
    3113             : 
    3114         495 : void AssertionNode::BacktrackIfPrevious(
    3115         495 :     RegExpCompiler* compiler,
    3116             :     Trace* trace,
    3117             :     AssertionNode::IfPrevious backtrack_if_previous) {
    3118             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    3119         495 :   Trace new_trace(*trace);
    3120             :   new_trace.InvalidateCurrentCharacter();
    3121             : 
    3122             :   Label fall_through, dummy;
    3123             : 
    3124             :   Label* non_word = backtrack_if_previous == kIsNonWord ?
    3125         235 :                     new_trace.backtrack() :
    3126         495 :                     &fall_through;
    3127             :   Label* word = backtrack_if_previous == kIsNonWord ?
    3128             :                 &fall_through :
    3129         495 :                 new_trace.backtrack();
    3130             : 
    3131         495 :   if (new_trace.cp_offset() == 0) {
    3132             :     // The start of input counts as a non-word character, so the question is
    3133             :     // decided if we are at the start.
    3134         188 :     assembler->CheckAtStart(non_word);
    3135             :   }
    3136             :   // We already checked that we are not at the start of input so it must be
    3137             :   // OK to load the previous character.
    3138         495 :   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
    3139         495 :   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
    3140             : 
    3141         495 :   assembler->Bind(&fall_through);
    3142         495 :   on_success()->Emit(compiler, &new_trace);
    3143         495 : }
    3144             : 
    3145             : 
    3146        3509 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
    3147             :                                          RegExpCompiler* compiler,
    3148             :                                          int filled_in,
    3149             :                                          bool not_at_start) {
    3150        3509 :   if (assertion_type_ == AT_START && not_at_start) {
    3151             :     details->set_cannot_match();
    3152             :     return;
    3153             :   }
    3154        3071 :   return on_success()->GetQuickCheckDetails(details,
    3155             :                                             compiler,
    3156             :                                             filled_in,
    3157        3071 :                                             not_at_start);
    3158             : }
    3159             : 
    3160             : 
    3161       36963 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    3162             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    3163       13063 :   switch (assertion_type_) {
    3164             :     case AT_END: {
    3165             :       Label ok;
    3166       14540 :       assembler->CheckPosition(trace->cp_offset(), &ok);
    3167       14540 :       assembler->GoTo(trace->backtrack());
    3168        7270 :       assembler->Bind(&ok);
    3169             :       break;
    3170             :     }
    3171             :     case AT_START: {
    3172        4680 :       if (trace->at_start() == Trace::FALSE_VALUE) {
    3173          42 :         assembler->GoTo(trace->backtrack());
    3174          21 :         return;
    3175             :       }
    3176        4659 :       if (trace->at_start() == Trace::UNKNOWN) {
    3177        9318 :         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
    3178        4659 :         Trace at_start_trace = *trace;
    3179             :         at_start_trace.set_at_start(Trace::TRUE_VALUE);
    3180       11929 :         on_success()->Emit(compiler, &at_start_trace);
    3181             :         return;
    3182             :       }
    3183             :     }
    3184             :     break;
    3185             :     case AFTER_NEWLINE:
    3186         797 :       EmitHat(compiler, on_success(), trace);
    3187         797 :       return;
    3188             :     case AT_BOUNDARY:
    3189             :     case AT_NON_BOUNDARY: {
    3190         316 :       EmitBoundaryCheck(compiler, trace);
    3191         316 :       return;
    3192             :     }
    3193             :   }
    3194        7270 :   on_success()->Emit(compiler, trace);
    3195             : }
    3196             : 
    3197             : 
    3198     3232696 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
    3199     3232696 :   if (quick_check == NULL) return false;
    3200     3232696 :   if (offset >= quick_check->characters()) return false;
    3201     1100849 :   return quick_check->positions(offset)->determines_perfectly;
    3202             : }
    3203             : 
    3204             : 
    3205             : static void UpdateBoundsCheck(int index, int* checked_up_to) {
    3206      876493 :   if (index > *checked_up_to) {
    3207      488669 :     *checked_up_to = index;
    3208             :   }
    3209             : }
    3210             : 
    3211             : 
    3212             : // We call this repeatedly to generate code for each pass over the text node.
    3213             : // The passes are in increasing order of difficulty because we hope one
    3214             : // of the first passes will fail in which case we are saved the work of the
    3215             : // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
    3216             : // we will check the '%' in the first pass, the case independent 'a' in the
    3217             : // second pass and the character class in the last pass.
    3218             : //
    3219             : // The passes are done from right to left, so for example to test for /bar/
    3220             : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
    3221             : // and then a 'b' with offset 0.  This means we can avoid the end-of-input
    3222             : // bounds check most of the time.  In the example we only need to check for
    3223             : // end-of-input when loading the putative 'r'.
    3224             : //
    3225             : // A slight complication involves the fact that the first character may already
    3226             : // be fetched into a register by the previous node.  In this case we want to
    3227             : // do the test for that character first.  We do this in separate passes.  The
    3228             : // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
    3229             : // pass has been performed then subsequent passes will have true in
    3230             : // first_element_checked to indicate that that character does not need to be
    3231             : // checked again.
    3232             : //
    3233             : // In addition to all this we are passed a Trace, which can
    3234             : // contain an AlternativeGeneration object.  In this AlternativeGeneration
    3235             : // object we can see details of any quick check that was already passed in
    3236             : // order to get to the code we are now generating.  The quick check can involve
    3237             : // loading characters, which means we do not need to recheck the bounds
    3238             : // up to the limit the quick check already checked.  In addition the quick
    3239             : // check can have involved a mask and compare operation which may simplify
    3240             : // or obviate the need for further checks at some character positions.
    3241     3865778 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
    3242             :                             TextEmitPassType pass,
    3243             :                             bool preloaded,
    3244     3948837 :                             Trace* trace,
    3245             :                             bool first_element_checked,
    3246     6271590 :                             int* checked_up_to) {
    3247     1932889 :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    3248             :   Isolate* isolate = assembler->isolate();
    3249             :   bool one_byte = compiler->one_byte();
    3250             :   Label* backtrack = trace->backtrack();
    3251     1932889 :   QuickCheckDetails* quick_check = trace->quick_check_performed();
    3252     1932889 :   int element_count = elements()->length();
    3253     1932889 :   int backward_offset = read_backward() ? -Length() : 0;
    3254     3948802 :   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
    3255     2015948 :     TextElement elm = elements()->at(i);
    3256     2015948 :     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
    3257     2015948 :     if (elm.text_type() == TextElement::ATOM) {
    3258             :       Vector<const uc16> quarks = elm.atom()->data();
    3259     4163716 :       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
    3260     2989043 :         if (first_element_checked && i == 0 && j == 0) continue;
    3261     5929752 :         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
    3262             :         EmitCharacterFunction* emit_function = NULL;
    3263     1917021 :         switch (pass) {
    3264             :           case NON_LATIN1_MATCH:
    3265             :             DCHECK(one_byte);
    3266      631927 :             if (quarks[j] > String::kMaxOneByteCharCode) {
    3267          35 :               assembler->GoTo(backtrack);
    3268     1932889 :               return;
    3269             :             }
    3270             :             break;
    3271             :           case NON_LETTER_CHARACTER_MATCH:
    3272             :             emit_function = &EmitAtomNonLetter;
    3273       14932 :             break;
    3274             :           case SIMPLE_CHARACTER_MATCH:
    3275             :             emit_function = &EmitSimpleCharacter;
    3276      620149 :             break;
    3277             :           case CASE_CHARACTER_MATCH:
    3278             :             emit_function = &EmitAtomLetter;
    3279       14932 :             break;
    3280             :           default:
    3281             :             break;
    3282             :         }
    3283     1916986 :         if (emit_function != NULL) {
    3284      985226 :           bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
    3285             :           bool bound_checked =
    3286             :               emit_function(isolate, compiler, quarks[j], backtrack,
    3287     1300026 :                             cp_offset + j, bounds_check, preloaded);
    3288      650013 :           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
    3289             :         }
    3290             :       }
    3291             :     } else {
    3292             :       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
    3293      841240 :       if (pass == CHARACTER_CLASS_MATCH) {
    3294      306067 :         if (first_element_checked && i == 0) continue;
    3295      267820 :         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
    3296             :         RegExpCharacterClass* cc = elm.char_class();
    3297      296102 :         bool bounds_check = *checked_up_to < cp_offset || read_backward();
    3298             :         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
    3299      241451 :                       bounds_check, preloaded, zone());
    3300             :         UpdateBoundsCheck(cp_offset, checked_up_to);
    3301             :       }
    3302             :     }
    3303             :   }
    3304             : }
    3305             : 
    3306             : 
    3307     9158607 : int TextNode::Length() {
    3308     9158607 :   TextElement elm = elements()->last();
    3309             :   DCHECK(elm.cp_offset() >= 0);
    3310     9158607 :   return elm.cp_offset() + elm.length();
    3311             : }
    3312             : 
    3313             : 
    3314           0 : bool TextNode::SkipPass(int int_pass, bool ignore_case) {
    3315     2645892 :   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
    3316     2645892 :   if (ignore_case) {
    3317      718740 :     return pass == SIMPLE_CHARACTER_MATCH;
    3318             :   } else {
    3319     1927152 :     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
    3320             :   }
    3321             : }
    3322             : 
    3323             : 
    3324       11183 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
    3325             :                                              ZoneList<CharacterRange>* ranges,
    3326             :                                              bool read_backward,
    3327             :                                              RegExpNode* on_success) {
    3328             :   DCHECK_NOT_NULL(ranges);
    3329       11183 :   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
    3330             :   elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
    3331       22366 :             zone);
    3332       11183 :   return new (zone) TextNode(elms, read_backward, on_success);
    3333             : }
    3334             : 
    3335             : 
    3336       27313 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
    3337             :                                            CharacterRange trail,
    3338             :                                            bool read_backward,
    3339             :                                            RegExpNode* on_success) {
    3340       27313 :   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
    3341       27313 :   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
    3342       27313 :   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
    3343             :   elms->Add(
    3344             :       TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
    3345       54626 :       zone);
    3346             :   elms->Add(
    3347             :       TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
    3348       54626 :       zone);
    3349       27313 :   return new (zone) TextNode(elms, read_backward, on_success);
    3350             : }
    3351             : 
    3352             : 
    3353             : // This generates the code to match a text node.  A text node can contain
    3354             : // straight character sequences (possibly to be matched in a case-independent
    3355             : // way) and character classes.  For efficiency we do not do this in a single
    3356             : // pass from left to right.  Instead we pass over the text node several times,
    3357             : // emitting code for some character positions every time.  See the comment on
    3358             : // TextEmitPass for details.
    3359     5016893 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    3360      728329 :   LimitResult limit_result = LimitVersions(compiler, trace);
    3361      844006 :   if (limit_result == DONE) return;
    3362             :   DCHECK(limit_result == CONTINUE);
    3363             : 
    3364      612652 :   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
    3365             :     compiler->SetRegExpTooBig();
    3366             :     return;
    3367             :   }
    3368             : 
    3369      612652 :   if (compiler->one_byte()) {
    3370      430258 :     int dummy = 0;
    3371      430258 :     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
    3372             :   }
    3373             : 
    3374             :   bool first_elt_done = false;
    3375      612652 :   int bound_checked_to = trace->cp_offset() - 1;
    3376      612652 :   bound_checked_to += trace->bound_checked_up_to();
    3377             : 
    3378             :   // If a character is preloaded into the current character register then
    3379             :   // check that now.
    3380      612652 :   if (trace->characters_preloaded() == 1) {
    3381      195284 :     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
    3382      195284 :       if (!SkipPass(pass, compiler->ignore_case())) {
    3383             :         TextEmitPass(compiler,
    3384             :                      static_cast<TextEmitPassType>(pass),
    3385             :                      true,
    3386             :                      trace,
    3387             :                      false,
    3388      121021 :                      &bound_checked_to);
    3389             :       }
    3390             :     }
    3391             :     first_elt_done = true;
    3392             :   }
    3393             : 
    3394     3063260 :   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
    3395     2450608 :     if (!SkipPass(pass, compiler->ignore_case())) {
    3396             :       TextEmitPass(compiler,
    3397             :                    static_cast<TextEmitPassType>(pass),
    3398             :                    false,
    3399             :                    trace,
    3400             :                    first_elt_done,
    3401     1381610 :                    &bound_checked_to);
    3402             :     }
    3403             :   }
    3404             : 
    3405      612652 :   Trace successor_trace(*trace);
    3406             :   // If we advance backward, we may end up at the start.
    3407             :   successor_trace.AdvanceCurrentPositionInTrace(
    3408      612652 :       read_backward() ? -Length() : Length(), compiler);
    3409             :   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
    3410      612652 :                                                : Trace::FALSE_VALUE);
    3411             :   RecursionCheck rc(compiler);
    3412      612652 :   on_success()->Emit(compiler, &successor_trace);
    3413             : }
    3414             : 
    3415             : 
    3416           0 : void Trace::InvalidateCurrentCharacter() {
    3417      283868 :   characters_preloaded_ = 0;
    3418           0 : }
    3419             : 
    3420             : 
    3421     1225304 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
    3422             :   // We don't have an instruction for shifting the current character register
    3423             :   // down or for using a shifted value for anything so lets just forget that
    3424             :   // we preloaded any characters into it.
    3425      612652 :   characters_preloaded_ = 0;
    3426             :   // Adjust the offsets of the quick check performed information.  This
    3427             :   // information is used to find out what we already determined about the
    3428             :   // characters by means of mask and compare.
    3429      612652 :   quick_check_performed_.Advance(by, compiler->one_byte());
    3430      612652 :   cp_offset_ += by;
    3431      612652 :   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
    3432             :     compiler->SetRegExpTooBig();
    3433           0 :     cp_offset_ = 0;
    3434             :   }
    3435     1225304 :   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
    3436      612652 : }
    3437             : 
    3438             : 
    3439      316425 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
    3440      155772 :   int element_count = elements()->length();
    3441      316425 :   for (int i = 0; i < element_count; i++) {
    3442      160653 :     TextElement elm = elements()->at(i);
    3443      160653 :     if (elm.text_type() == TextElement::CHAR_CLASS) {
    3444             :       RegExpCharacterClass* cc = elm.char_class();
    3445             :       // None of the standard character classes is different in the case
    3446             :       // independent case and it slows us down if we don't know that.
    3447      221619 :       if (cc->is_standard(zone())) continue;
    3448             :       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
    3449      148974 :       CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
    3450             :     }
    3451             :   }
    3452      155772 : }
    3453             : 
    3454             : 
    3455      185764 : int TextNode::GreedyLoopTextLength() { return Length(); }
    3456             : 
    3457             : 
    3458       93110 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
    3459      276056 :     RegExpCompiler* compiler) {
    3460       93110 :   if (read_backward()) return NULL;
    3461       92966 :   if (elements()->length() != 1) return NULL;
    3462       92526 :   TextElement elm = elements()->at(0);
    3463       92526 :   if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
    3464             :   RegExpCharacterClass* node = elm.char_class();
    3465       90929 :   ZoneList<CharacterRange>* ranges = node->ranges(zone());
    3466       90929 :   CharacterRange::Canonicalize(ranges);
    3467       90929 :   if (node->is_negated()) {
    3468       87833 :     return ranges->length() == 0 ? on_success() : NULL;
    3469             :   }
    3470       90752 :   if (ranges->length() != 1) return NULL;
    3471             :   uint32_t max_char;
    3472       89980 :   if (compiler->one_byte()) {
    3473             :     max_char = String::kMaxOneByteCharCode;
    3474             :   } else {
    3475             :     max_char = String::kMaxUtf16CodeUnit;
    3476             :   }
    3477      179960 :   return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
    3478             : }
    3479             : 
    3480             : 
    3481             : // Finds the fixed match length of a sequence of nodes that goes from
    3482             : // this alternative and back to this choice node.  If there are variable
    3483             : // length nodes or other complications in the way then return a sentinel
    3484             : // value indicating that a greedy loop cannot be constructed.
    3485      284881 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
    3486      284881 :     GuardedAlternative* alternative) {
    3487             :   int length = 0;
    3488             :   RegExpNode* node = alternative->node();
    3489             :   // Later we will generate code for all these text nodes using recursion
    3490             :   // so we have to limit the max number.
    3491             :   int recursion_depth = 0;
    3492      755526 :   while (node != this) {
    3493      438565 :     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
    3494             :       return kNodeIsTooComplexForGreedyLoops;
    3495             :     }
    3496      438565 :     int node_length = node->GreedyLoopTextLength();
    3497      438565 :     if (node_length == kNodeIsTooComplexForGreedyLoops) {
    3498             :       return kNodeIsTooComplexForGreedyLoops;
    3499             :     }
    3500      185764 :     length += node_length;
    3501      185764 :     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
    3502             :     node = seq_node->on_success();
    3503             :   }
    3504       32080 :   return read_backward() ? -length : length;
    3505             : }
    3506             : 
    3507             : 
    3508           0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
    3509             :   DCHECK_NULL(loop_node_);
    3510     1612556 :   AddAlternative(alt);
    3511     1612556 :   loop_node_ = alt.node();
    3512           0 : }
    3513             : 
    3514             : 
    3515           0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
    3516             :   DCHECK_NULL(continue_node_);
    3517     1612556 :   AddAlternative(alt);
    3518     1612556 :   continue_node_ = alt.node();
    3519           0 : }
    3520             : 
    3521             : 
    3522      396509 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    3523             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    3524      380469 :   if (trace->stop_node() == this) {
    3525             :     // Back edge of greedy optimized loop node graph.
    3526             :     int text_length =
    3527       32080 :         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
    3528             :     DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
    3529             :     // Update the counter-based backtracking info on the stack.  This is an
    3530             :     // optimization for greedy loops (see below).
    3531             :     DCHECK(trace->cp_offset() == text_length);
    3532       16040 :     macro_assembler->AdvanceCurrentPosition(text_length);
    3533       32080 :     macro_assembler->GoTo(trace->loop_label());
    3534       16040 :     return;
    3535             :   }
    3536             :   DCHECK_NULL(trace->stop_node());
    3537      364429 :   if (!trace->is_trivial()) {
    3538      138345 :     trace->Flush(compiler, this);
    3539      138345 :     return;
    3540             :   }
    3541      226084 :   ChoiceNode::Emit(compiler, trace);
    3542             : }
    3543             : 
    3544             : 
    3545      751971 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
    3546             :                                            int eats_at_least) {
    3547             :   int preload_characters = Min(4, eats_at_least);
    3548      268841 :   if (compiler->macro_assembler()->CanReadUnaligned()) {
    3549             :     bool one_byte = compiler->one_byte();
    3550      214289 :     if (one_byte) {
    3551      182578 :       if (preload_characters > 4) preload_characters = 4;
    3552             :       // We can't preload 3 characters because there is no machine instruction
    3553             :       // to do that.  We can't just load 4 because we could be reading
    3554             :       // beyond the end of the string, which could cause a memory fault.
    3555      182578 :       if (preload_characters == 3) preload_characters = 2;
    3556             :     } else {
    3557       31711 :       if (preload_characters > 2) preload_characters = 2;
    3558             :     }
    3559             :   } else {
    3560       54552 :     if (preload_characters > 1) preload_characters = 1;
    3561             :   }
    3562      268841 :   return preload_characters;
    3563             : }
    3564             : 
    3565             : 
    3566             : // This class is used when generating the alternatives in a choice node.  It
    3567             : // records the way the alternative is being code generated.
    3568             : class AlternativeGeneration: public Malloced {
    3569             :  public:
    3570             :   AlternativeGeneration()
    3571             :       : possible_success(),
    3572             :         expects_preload(false),
    3573             :         after(),
    3574     2716265 :         quick_check_details() { }
    3575             :   Label possible_success;
    3576             :   bool expects_preload;
    3577             :   Label after;
    3578             :   QuickCheckDetails quick_check_details;
    3579             : };
    3580             : 
    3581             : 
    3582             : // Creates a list of AlternativeGenerations.  If the list has a reasonable
    3583             : // size then it is on the stack, otherwise the excess is on the heap.
    3584             : class AlternativeGenerationList {
    3585             :  public:
    3586      268841 :   AlternativeGenerationList(int count, Zone* zone)
    3587     2957251 :       : alt_gens_(count, zone) {
    3588      718386 :     for (int i = 0; i < count && i < kAFew; i++) {
    3589      718386 :       alt_gens_.Add(a_few_alt_gens_ + i, zone);
    3590             :     }
    3591       27855 :     for (int i = kAFew; i < count; i++) {
    3592             :       alt_gens_.Add(new AlternativeGeneration(), zone);
    3593             :     }
    3594      268841 :   }
    3595      268841 :   ~AlternativeGenerationList() {
    3596      593392 :     for (int i = kAFew; i < alt_gens_.length(); i++) {
    3597      380261 :       delete alt_gens_[i];
    3598       27855 :       alt_gens_[i] = NULL;
    3599             :     }
    3600      268841 :   }
    3601             : 
    3602             :   AlternativeGeneration* at(int i) {
    3603     3430284 :     return alt_gens_[i];
    3604             :   }
    3605             : 
    3606             :  private:
    3607             :   static const int kAFew = 10;
    3608             :   ZoneList<AlternativeGeneration*> alt_gens_;
    3609             :   AlternativeGeneration a_few_alt_gens_[kAFew];
    3610             : };
    3611             : 
    3612             : 
    3613             : static const uc32 kRangeEndMarker = 0x110000;
    3614             : 
    3615             : // The '2' variant is has inclusive from and exclusive to.
    3616             : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
    3617             : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
    3618             : static const int kSpaceRanges[] = {
    3619             :     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
    3620             :     0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
    3621             :     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
    3622             : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
    3623             : 
    3624             : static const int kWordRanges[] = {
    3625             :     '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
    3626             : static const int kWordRangeCount = arraysize(kWordRanges);
    3627             : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
    3628             : static const int kDigitRangeCount = arraysize(kDigitRanges);
    3629             : static const int kSurrogateRanges[] = {
    3630             :     kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
    3631             : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
    3632             : static const int kLineTerminatorRanges[] = {
    3633             :     0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
    3634             : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
    3635             : 
    3636           0 : void BoyerMoorePositionInfo::Set(int character) {
    3637       50775 :   SetInterval(Interval(character, character));
    3638           0 : }
    3639             : 
    3640             : 
    3641      544258 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
    3642      272129 :   s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
    3643      272129 :   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
    3644      272129 :   d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
    3645             :   surrogate_ =
    3646      272129 :       AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
    3647      272129 :   if (interval.to() - interval.from() >= kMapSize - 1) {
    3648       16786 :     if (map_count_ != kMapSize) {
    3649        7757 :       map_count_ = kMapSize;
    3650     1000653 :       for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
    3651             :     }
    3652             :     return;
    3653             :   }
    3654      783207 :   for (int i = interval.from(); i <= interval.to(); i++) {
    3655      856946 :     int mod_character = (i & kMask);
    3656     1713892 :     if (!map_->at(mod_character)) {
    3657      617231 :       map_count_++;
    3658      617231 :       map_->at(mod_character) = true;
    3659             :     }
    3660      856946 :     if (map_count_ == kMapSize) return;
    3661             :   }
    3662             : }
    3663             : 
    3664             : 
    3665           0 : void BoyerMoorePositionInfo::SetAll() {
    3666        6750 :   s_ = w_ = d_ = kLatticeUnknown;
    3667        6750 :   if (map_count_ != kMapSize) {
    3668        6255 :     map_count_ = kMapSize;
    3669     1601280 :     for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
    3670             :   }
    3671           0 : }
    3672             : 
    3673             : 
    3674       85830 : BoyerMooreLookahead::BoyerMooreLookahead(
    3675       85830 :     int length, RegExpCompiler* compiler, Zone* zone)
    3676             :     : length_(length),
    3677       85830 :       compiler_(compiler) {
    3678       85830 :   if (compiler->one_byte()) {
    3679       14257 :     max_char_ = String::kMaxOneByteCharCode;
    3680             :   } else {
    3681       71573 :     max_char_ = String::kMaxUtf16CodeUnit;
    3682             :   }
    3683       85830 :   bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
    3684      199264 :   for (int i = 0; i < length; i++) {
    3685      113434 :     bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
    3686             :   }
    3687       85830 : }
    3688             : 
    3689             : 
    3690             : // Find the longest range of lookahead that has the fewest number of different
    3691             : // characters that can occur at a given position.  Since we are optimizing two
    3692             : // different parameters at once this is a tradeoff.
    3693       85704 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
    3694             :   int biggest_points = 0;
    3695             :   // If more than 32 characters out of 128 can occur it is unlikely that we can
    3696             :   // be lucky enough to step forwards much of the time.
    3697             :   const int kMaxMax = 32;
    3698      342816 :   for (int max_number_of_chars = 4;
    3699             :        max_number_of_chars < kMaxMax;
    3700             :        max_number_of_chars *= 2) {
    3701             :     biggest_points =
    3702      257112 :         FindBestInterval(max_number_of_chars, biggest_points, from, to);
    3703             :   }
    3704       85704 :   if (biggest_points == 0) return false;
    3705        8516 :   return true;
    3706             : }
    3707             : 
    3708             : 
    3709             : // Find the highest-points range between 0 and length_ where the character
    3710             : // information is not too vague.  'Too vague' means that there are more than
    3711             : // max_number_of_chars that can occur at this position.  Calculates the number
    3712             : // of points as the product of width-of-the-range and
    3713             : // probability-of-finding-one-of-the-characters, where the probability is
    3714             : // calculated using the frequency distribution of the sample subject string.
    3715      257112 : int BoyerMooreLookahead::FindBestInterval(
    3716      599221 :     int max_number_of_chars, int old_biggest_points, int* from, int* to) {
    3717             :   int biggest_points = old_biggest_points;
    3718             :   static const int kSize = RegExpMacroAssembler::kTableSize;
    3719      755070 :   for (int i = 0; i < length_; ) {
    3720      369422 :     while (i < length_ && Count(i) > max_number_of_chars) i++;
    3721      275893 :     if (i == length_) break;
    3722             :     int remembered_from = i;
    3723             :     bool union_map[kSize];
    3724    30828288 :     for (int j = 0; j < kSize; j++) union_map[j] = false;
    3725      810239 :     while (i < length_ && Count(i) <= max_number_of_chars) {
    3726    35789780 :       BoyerMoorePositionInfo* map = bitmaps_->at(i);
    3727    35514474 :       for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
    3728      275306 :       i++;
    3729             :     }
    3730             :     int frequency = 0;
    3731    30828288 :     for (int j = 0; j < kSize; j++) {
    3732    30828288 :       if (union_map[j]) {
    3733             :         // Add 1 to the frequency to give a small per-character boost for
    3734             :         // the cases where our sampling is not good enough and many
    3735             :         // characters have a frequency of zero.  This means the frequency
    3736             :         // can theoretically be up to 2*kSize though we treat it mostly as
    3737             :         // a fraction of kSize.
    3738     1033927 :         frequency += compiler_->frequency_collator()->Frequency(j) + 1;
    3739             :       }
    3740             :     }
    3741             :     // We use the probability of skipping times the distance we are skipping to
    3742             :     // judge the effectiveness of this.  Actually we have a cut-off:  By
    3743             :     // dividing by 2 we switch off the skipping if the probability of skipping
    3744             :     // is less than 50%.  This is because the multibyte mask-and-compare
    3745             :     // skipping in quickcheck is more likely to do well on this case.
    3746             :     bool in_quickcheck_range =
    3747      244885 :         ((i - remembered_from < 4) ||
    3748        4039 :          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
    3749             :     // Called 'probability' but it is only a rough estimate and can actually
    3750             :     // be outside the 0-kSize range.
    3751      240846 :     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
    3752      240846 :     int points = (i - remembered_from) * probability;
    3753      240846 :     if (points > biggest_points) {
    3754        9049 :       *from = remembered_from;
    3755        9049 :       *to = i - 1;
    3756             :       biggest_points = points;
    3757             :     }
    3758             :   }
    3759      257112 :   return biggest_points;
    3760             : }
    3761             : 
    3762             : 
    3763             : // Take all the characters that will not prevent a successful match if they
    3764             : // occur in the subject string in the range between min_lookahead and
    3765             : // max_lookahead (inclusive) measured from the current position.  If the
    3766             : // character at max_lookahead offset is not one of these characters, then we
    3767             : // can safely skip forwards by the number of characters in the range.
                       //
                       // Fills the first kSize entries of boolean_skip_table: entry j ends up as
                       // kDontSkipArrayEntry (1) if the bitmap of any position in
                       // [min_lookahead, max_lookahead] has j set, and as kSkipArrayEntry (0)
                       // otherwise.  Returns the width of the interval, i.e.
                       // max_lookahead + 1 - min_lookahead.
    3768        6394 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
    3769             :                                       int max_lookahead,
    3770             :                                       Handle<ByteArray> boolean_skip_table) {
    3771             :   const int kSize = RegExpMacroAssembler::kTableSize;
    3772             : 
    3773             :   const int kSkipArrayEntry = 0;
    3774             :   const int kDontSkipArrayEntry = 1;
    3775             : 
                         // Start from "every character allows a skip".
    3776      824826 :   for (int i = 0; i < kSize; i++) {
    3777             :     boolean_skip_table->set(i, kSkipArrayEntry);
    3778             :   }
    3779        6394 :   int skip = max_lookahead + 1 - min_lookahead;
    3780             : 
                         // Union the per-position bitmaps of the interval into the table.
    3781       22253 :   for (int i = max_lookahead; i >= min_lookahead; i--) {
    3782     2061670 :     BoyerMoorePositionInfo* map = bitmaps_->at(i);
    3783     2045811 :     for (int j = 0; j < kSize; j++) {
    3784     2029952 :       if (map->at(j)) {
    3785             :         boolean_skip_table->set(j, kDontSkipArrayEntry);
    3786             :       }
    3787             :     }
    3788             :   }
    3789             : 
    3790        6394 :   return skip;
    3791             : }
    3792             : 
    3793             : 
    3794             : // See comment above on the implementation of GetSkipTable.
                       //
                       // Emits a skip loop at the current assembler position.  Nothing is emitted
                       // if FindWorthwhileInterval reports no worthwhile interval, or if the
                       // interval is a single position (below offset 3) that admits only one
                       // character.
    3795       92098 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
    3796             :   const int kSize = RegExpMacroAssembler::kTableSize;
    3797             : 
    3798       85704 :   int min_lookahead = 0;
    3799       85704 :   int max_lookahead = 0;
    3800             : 
    3801      165014 :   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
    3802             : 
                         // Determine whether the whole interval admits exactly one table entry:
                         // give up as soon as a position has more than one entry set, or a second
                         // non-empty position is seen after a character was already recorded.
    3803             :   bool found_single_character = false;
    3804             :   int single_character = 0;
    3805       14776 :   for (int i = max_lookahead; i >= min_lookahead; i--) {
    3806      597277 :     BoyerMoorePositionInfo* map = bitmaps_->at(i);
    3807       25308 :     if (map->map_count() > 1 ||
    3808        3995 :         (found_single_character && map->map_count() != 0)) {
    3809             :       found_single_character = false;
    3810             :       break;
    3811             :     }
                           // Record the first (lowest) entry set for this position, if any.
    3812      565799 :     for (int j = 0; j < kSize; j++) {
    3813      571969 :       if (map->at(j)) {
    3814             :         found_single_character = true;
    3815             :         single_character = j;
    3816             :         break;
    3817             :       }
    3818             :     }
    3819             :   }
    3820             : 
    3821        8516 :   int lookahead_width = max_lookahead + 1 - min_lookahead;
    3822             : 
    3823        8516 :   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
    3824             :     // The mask-compare can probably handle this better.
    3825             :     return;
    3826             :   }
    3827             : 
    3828        6510 :   if (found_single_character) {
                           // Single-character loop: load the character at max_lookahead, leave the
                           // loop through 'cont' when the character check branches there, otherwise
                           // advance by the interval width and retry.
                           // NOTE(review): presumably the check branches to 'cont' on a match and
                           // the load branches to 'cont' at end of input - confirm against
                           // RegExpMacroAssembler.
    3829             :     Label cont, again;
    3830         116 :     masm->Bind(&again);
    3831         116 :     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
                           // When characters can exceed the table size, compare under kTableMask.
    3832         116 :     if (max_char_ > kSize) {
    3833             :       masm->CheckCharacterAfterAnd(single_character,
    3834             :                                    RegExpMacroAssembler::kTableMask,
    3835         116 :                                    &cont);
    3836             :     } else {
    3837           0 :       masm->CheckCharacter(single_character, &cont);
    3838             :     }
    3839         116 :     masm->AdvanceCurrentPosition(lookahead_width);
    3840         116 :     masm->GoTo(&again);
    3841         116 :     masm->Bind(&cont);
    3842             :     return;
    3843             :   }
    3844             : 
                         // General case: build a kSize-entry skip table and emit a loop that tests
                         // the character at max_lookahead against it.
    3845             :   Factory* factory = masm->isolate()->factory();
    3846        6394 :   Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
    3847             :   int skip_distance = GetSkipTable(
    3848        6394 :       min_lookahead, max_lookahead, boolean_skip_table);
    3849             :   DCHECK(skip_distance != 0);
    3850             : 
    3851             :   Label cont, again;
    3852        6394 :   masm->Bind(&again);
    3853        6394 :   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
    3854        6394 :   masm->CheckBitInTable(boolean_skip_table, &cont);
    3855        6394 :   masm->AdvanceCurrentPosition(skip_distance);
    3856        6394 :   masm->GoTo(&again);
    3857        6394 :   masm->Bind(&cont);
    3858             : }
    3859             : 
    3860             : 
    3861             : /* Code generation for choice nodes.
    3862             :  *
    3863             :  * We generate quick checks that do a mask and compare to eliminate a
    3864             :  * choice.  If the quick check succeeds then it jumps to the continuation to
    3865             :  * do slow checks and check subsequent nodes.  If it fails (the common case)
    3866             :  * it falls through to the next choice.
    3867             :  *
    3868             :  * Here is the desired flow graph.  Nodes directly below each other imply
    3869             :  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
    3870             :  * 3 doesn't have a quick check so we have to call the slow check.
    3871             :  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
    3872             :  * regexp continuation is generated directly after the Sn node, up to the
    3873             :  * next GoTo if we decide to reuse some already generated code.  Some
    3874             :  * nodes expect preload_characters to be preloaded into the current
    3875             :  * character register.  R nodes do this preloading.  Vertices are marked
    3876             :  * F for failures and S for success (possible success in the case of quick
    3877             :  * nodes).  L, V, < and > are used as arrow heads.
    3878             :  *
    3879             :  * ----------> R
    3880             :  *             |
    3881             :  *             V
    3882             :  *            Q1 -----> S1
    3883             :  *             |   S   /
    3884             :  *            F|      /
    3885             :  *             |    F/
    3886             :  *             |    /
    3887             :  *             |   R
    3888             :  *             |  /
    3889             :  *             V L
    3890             :  *            Q2 -----> S2
    3891             :  *             |   S   /
    3892             :  *            F|      /
    3893             :  *             |    F/
    3894             :  *             |    /
    3895             :  *             |   R
    3896             :  *             |  /
    3897             :  *             V L
    3898             :  *            S3
    3899             :  *             |
    3900             :  *            F|
    3901             :  *             |
    3902             :  *             R
    3903             :  *             |
    3904             :  * backtrack   V
    3905             :  * <----------Q4
    3906             :  *   \    F    |
    3907             :  *    \        |S
    3908             :  *     \   F   V
    3909             :  *      \-----S4
    3910             :  *
    3911             :  * For greedy loops we push the current position, then generate the code that
    3912             :  * eats the input specially in EmitGreedyLoop.  The other choice (the
    3913             :  * continuation) is generated by the normal code in EmitChoices, and steps back
    3914             :  * in the input to the starting position when it fails to match.  The loop code
    3915             :  * looks like this (U is the unwind code that steps back in the greedy loop).
    3916             :  *
    3917             :  *              _____
    3918             :  *             /     \
    3919             :  *             V     |
    3920             :  * ----------> S1    |
    3921             :  *            /|     |
    3922             :  *           / |S    |
    3923             :  *         F/  \_____/
    3924             :  *         /
    3925             :  *        |<-----
    3926             :  *        |      \
    3927             :  *        V       |S
    3928             :  *        Q2 ---> U----->backtrack
    3929             :  *        |  F   /
    3930             :  *       S|     /
    3931             :  *        V  F /
    3932             :  *        S2--/
    3933             :  */
    3934             : 
    3935      268841 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
                         // The counter backtrack trace backtracks to this state's own label_.
    3936           0 :   counter_backtrack_trace_.set_backtrack(&label_);
                         // Record in the trace that we are known not to be at the start.
    3937      268841 :   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
    3938           0 : }
    3939             : 
    3940             : 
    3941           0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
                         // Debug-only consistency check; compiles to an empty body without DEBUG.
                         // Despite the name, the DCHECK below asserts that the trace does NOT
                         // mention the register of any guard.  Only the alternatives before the
                         // last one are examined.
    3942             : #ifdef DEBUG
    3943             :   int choice_count = alternatives_->length();
    3944             :   for (int i = 0; i < choice_count - 1; i++) {
    3945             :     GuardedAlternative alternative = alternatives_->at(i);
    3946             :     ZoneList<Guard*>* guards = alternative.guards();
                           // A NULL guard list is treated as having no guards.
    3947             :     int guard_count = (guards == NULL) ? 0 : guards->length();
    3948             :     for (int j = 0; j < guard_count; j++) {
    3949             :       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
    3950             :     }
    3951             :   }
    3952             : #endif
    3953           0 : }
    3954             : 
    3955             : 
    3956      450228 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
    3957      450228 :                               Trace* current_trace,
    3958             :                               PreloadState* state) {
                           // Fills in the preload fields of |state| for this choice node:
                           // eats_at_least_ (only if not yet initialized), preload_characters_,
                           // preload_is_current_ and preload_has_checked_bounds_.
    3959      268841 :     if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
    3960             :       // Save some time by looking at most one machine word ahead.
    3961             :       state->eats_at_least_ =
    3962             :           EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
    3963      544161 :                       current_trace->at_start() == Trace::FALSE_VALUE);
    3964             :     }
    3965             :     state->preload_characters_ =
    3966      268841 :         CalculatePreloadCharacters(compiler, state->eats_at_least_);
    3967             : 
                           // The preload is considered current only if the incoming trace already
                           // has exactly that many characters preloaded; in that case the bounds
                           // are treated as checked too.
    3968             :     state->preload_is_current_ =
    3969      268841 :         (current_trace->characters_preloaded() == state->preload_characters_);
    3970      268841 :     state->preload_has_checked_bounds_ = state->preload_is_current_;
    3971      268841 : }
    3972             : 
    3973             : 
    3974     1779184 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
                         // Generates code for this choice node: first the inline code for the
                         // alternatives (via EmitGreedyLoop or EmitChoices), then the out-of-line
                         // continuations of alternatives whose quick check was inlined.
    3975      715588 :   int choice_count = alternatives_->length();
    3976             : 
                         // A single unguarded alternative needs no choice logic; emit it directly.
    3977      716662 :   if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
    3978        1074 :     alternatives_->at(0).node()->Emit(compiler, trace);
    3979        1074 :     return;
    3980             :   }
    3981             : 
    3982             :   AssertGuardsMentionRegisters(trace);
    3983             : 
                         // LimitVersions returns DONE when nothing more has to be emitted here.
    3984      983355 :   LimitResult limit_result = LimitVersions(compiler, trace);
    3985      714514 :   if (limit_result == DONE) return;
    3986             :   DCHECK(limit_result == CONTINUE);
    3987             : 
    3988             :   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
    3989             :   // other choice nodes we only flush if we are out of code size budget.
    3990      273113 :   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
    3991        2136 :     trace->Flush(compiler, this);
    3992        2136 :     return;
    3993             :   }
    3994             : 
    3995             :   RecursionCheck rc(compiler);
    3996             : 
    3997             :   PreloadState preload;
    3998             :   preload.init();
    3999             :   GreedyLoopState greedy_loop_state(not_at_start());
    4000             : 
    4001      537682 :   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
    4002      537682 :   AlternativeGenerationList alt_gens(choice_count, zone());
    4003             : 
                         // Use the greedy-loop code path when there is more than one alternative
                         // and the first one is not too complex for it.  EmitGreedyLoop returns
                         // the trace that the remaining code in this function must use.
    4004      268841 :   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
    4005             :     trace = EmitGreedyLoop(compiler,
    4006             :                            trace,
    4007             :                            &alt_gens,
    4008             :                            &preload,
    4009             :                            &greedy_loop_state,
    4010       16040 :                            text_length);
    4011             :   } else {
    4012             :     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
    4013             :     // match the traces produced pre-cleanup.
    4014             :     Label second_choice;
    4015      252801 :     compiler->macro_assembler()->Bind(&second_choice);
    4016             : 
    4017      252801 :     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
    4018             : 
    4019             :     EmitChoices(compiler,
    4020             :                 &alt_gens,
    4021             :                 0,
    4022             :                 trace,
    4023      252801 :                 &preload);
    4024             :   }
    4025             : 
    4026             :   // At this point we need to generate slow checks for the alternatives where
    4027             :   // the quick check was inlined.  We can recognize these because the associated
    4028             :   // label was bound.
    4029      268841 :   int new_flush_budget = trace->flush_budget() / choice_count;
    4030     1015082 :   for (int i = 0; i < choice_count; i++) {
    4031             :     AlternativeGeneration* alt_gen = alt_gens.at(i);
    4032      746241 :     Trace new_trace(*trace);
    4033             :     // If there are actions to be flushed we have to limit how many times
    4034             :     // they are flushed.  Take the budget of the parent trace and distribute
    4035             :     // it fairly amongst the children.
    4036      746241 :     if (new_trace.actions() != NULL) {
    4037             :       new_trace.set_flush_budget(new_flush_budget);
    4038             :     }
                           // The last alternative has no successor, so nothing expects a preload.
    4039             :     bool next_expects_preload =
    4040     1223641 :         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
    4041             :     EmitOutOfLineContinuation(compiler,
    4042             :                               &new_trace,
    4043      746241 :                               alternatives_->at(i),
    4044             :                               alt_gen,
    4045             :                               preload.preload_characters_,
    4046     1492482 :                               next_expects_preload);
    4047             :   }
    4048             : }
    4049             : 
    4050             : 
    4051       16040 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
    4052       16040 :                                   Trace* trace,
    4053             :                                   AlternativeGenerationList* alt_gens,
    4054             :                                   PreloadState* preload,
    4055             :                                   GreedyLoopState* greedy_loop_state,
    4056       16040 :                                   int text_length) {
                         // Emits the loop body (alternative 0), the remaining alternatives
                         // (starting at index 1) and the unwind code.  Returns the counter
                         // backtrack trace owned by |greedy_loop_state|.
    4057             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    4058             :   // Here we have special handling for greedy loops containing only text nodes
    4059             :   // and other simple nodes.  These are handled by pushing the current
    4060             :   // position on the stack and then incrementing the current position each
    4061             :   // time around the loop.  On backtrack we decrement the current position
    4062             :   // and check it against the pushed value.  This avoids pushing backtrack
    4063             :   // information for each iteration of the loop, which could take up a lot of
    4064             :   // space.
    4065             :   DCHECK(trace->stop_node() == NULL);
    4066       16040 :   macro_assembler->PushCurrentPosition();
                         // Loop body: alternative 0 is emitted with a fresh trace whose stop node
                         // is this node, whose loop label is loop_label and which backtracks to
                         // greedy_match_failed.
    4067             :   Label greedy_match_failed;
    4068       16040 :   Trace greedy_match_trace;
    4069       16040 :   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
    4070             :   greedy_match_trace.set_backtrack(&greedy_match_failed);
    4071             :   Label loop_label;
    4072       16040 :   macro_assembler->Bind(&loop_label);
    4073       16040 :   greedy_match_trace.set_stop_node(this);
    4074             :   greedy_match_trace.set_loop_label(&loop_label);
    4075       32080 :   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
    4076       16040 :   macro_assembler->Bind(&greedy_match_failed);
    4077             : 
    4078             :   Label second_choice;  // For use in greedy matches.
    4079       16040 :   macro_assembler->Bind(&second_choice);
    4080             : 
    4081       16040 :   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
    4082             : 
                         // Emit the remaining alternatives; alternative 0 was handled above.
    4083             :   EmitChoices(compiler,
    4084             :               alt_gens,
    4085             :               1,
    4086             :               new_trace,
    4087       16040 :               preload);
    4088             : 
                         // Unwind code, reached through the greedy loop state's label.
    4089       16040 :   macro_assembler->Bind(greedy_loop_state->label());
    4090             :   // If we have unwound to the bottom then backtrack.
    4091       32080 :   macro_assembler->CheckGreedyLoop(trace->backtrack());
    4092             :   // Otherwise try the second priority at an earlier position.
    4093       16040 :   macro_assembler->AdvanceCurrentPosition(-text_length);
    4094       16040 :   macro_assembler->GoTo(&second_choice);
    4095       16040 :   return new_trace;
    4096             : }
    4097             : 
    4098      340255 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
    4099             :                                               Trace* trace) {
                         // Emits Boyer-Moore style skip instructions when this node has exactly two
                         // alternatives and the second one is an unguarded node whose
                         // GetSuccessorOfOmnivorousTextNode() is this node.  Returns the
                         // eats-at-least value computed on the way, or
                         // PreloadState::kEatsAtLeastNotYetInitialized if none was computed.
    4100             :   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
    4101      252801 :   if (alternatives_->length() != 2) return eats_at_least;
    4102             : 
    4103      207522 :   GuardedAlternative alt1 = alternatives_->at(1);
    4104      207522 :   if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
    4105             :     return eats_at_least;
    4106             :   }
    4107             :   RegExpNode* eats_anything_node = alt1.node();
    4108      289943 :   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
    4109             :     return eats_at_least;
    4110             :   }
    4111             : 
    4112             :   // Really we should be creating a new trace when we execute this function,
    4113             :   // but there is no need, because the code it generates cannot backtrack, and
    4114             :   // we always arrive here with a trivial trace (since it's the entry to a
    4115             :   // loop).  That also implies that there are no preloaded characters, which is
    4116             :   // good, because it means we won't be violating any assumptions by
    4117             :   // overwriting those characters with new load instructions.
    4118             :   DCHECK(trace->is_trivial());
    4119             : 
    4120       87454 :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    4121             :   Isolate* isolate = macro_assembler->isolate();
    4122             :   // At this point we know that we are at a non-greedy loop that will eat
    4123             :   // any character one at a time.  Any non-anchored regexp has such a
    4124             :   // loop prepended to it in order to find where it starts.  We look for
    4125             :   // a pattern of the form ...abc... where we can look 6 characters ahead
    4126             :   // and step forwards 3 if the character is not one of abc.  Abc need
    4127             :   // not be atoms, they can be any reasonably limited character class or
    4128             :   // small alternation.
    4129             :   BoyerMooreLookahead* bm = bm_info(false);
                         // No lookahead info is available from bm_info(false): build it from
                         // alternative 0, provided at least one character is guaranteed to be
                         // consumed.
    4130       87454 :   if (bm == NULL) {
    4131             :     eats_at_least = Min(kMaxLookaheadForBoyerMoore,
    4132             :                         EatsAtLeast(kMaxLookaheadForBoyerMoore,
    4133             :                                     kRecursionBudget,
    4134       87454 :                                     false));
    4135       87454 :     if (eats_at_least >= 1) {
    4136             :       bm = new(zone()) BoyerMooreLookahead(eats_at_least,
    4137             :                                            compiler,
    4138       85704 :                                            zone());
    4139      171408 :       GuardedAlternative alt0 = alternatives_->at(0);
    4140       85704 :       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
    4141             :     }
    4142             :   }
    4143       87454 :   if (bm != NULL) {
    4144       85704 :     bm->EmitSkipInstructions(macro_assembler);
    4145             :   }
    4146       87454 :   return eats_at_least;
    4147             : }
    4148             : 
    4149             : 
    4150      999042 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
    4151             :                              AlternativeGenerationList* alt_gens,
    4152             :                              int first_choice,
    4153      268867 :                              Trace* trace,
    4154             :                              PreloadState* preload) {
                         // Emits the inline code for alternatives first_choice..last.  For each
                         // alternative either a quick check is emitted (its full check then follows
                         // inline only for the last alternative), or the full check is emitted
                         // inline, or nothing is emitted when the quick check details say the
                         // alternative cannot match.  Per-alternative results are recorded in
                         // |alt_gens| and the preload bookkeeping in |preload| is updated.
    4155             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    4156      268841 :   SetUpPreLoad(compiler, trace, preload);
    4157             : 
    4158             :   // For now we just call all choices one after the other.  The idea ultimately
    4159             :   // is to use the Dispatch table to try only the relevant ones.
    4160      268841 :   int choice_count = alternatives_->length();
    4161             : 
    4162      268841 :   int new_flush_budget = trace->flush_budget() / choice_count;
    4163             : 
    4164      999042 :   for (int i = first_choice; i < choice_count; i++) {
    4165      730201 :     bool is_last = i == choice_count - 1;
    4166      730201 :     bool fall_through_on_failure = !is_last;
    4167     1460402 :     GuardedAlternative alternative = alternatives_->at(i);
    4168             :     AlternativeGeneration* alt_gen = alt_gens->at(i);
    4169     1206456 :     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
    4170             :     ZoneList<Guard*>* guards = alternative.guards();
    4171      730201 :     int guard_count = (guards == NULL) ? 0 : guards->length();
                           // Each alternative gets its own copy of the trace, adjusted to the
                           // current preload state.
    4172      730201 :     Trace new_trace(*trace);
    4173             :     new_trace.set_characters_preloaded(preload->preload_is_current_ ?
    4174             :                                          preload->preload_characters_ :
    4175      730201 :                                          0);
    4176      730201 :     if (preload->preload_has_checked_bounds_) {
    4177      499468 :       new_trace.set_bound_checked_up_to(preload->preload_characters_);
    4178             :     }
    4179             :     new_trace.quick_check_performed()->Clear();
    4180      730201 :     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
                           // Every alternative but the last backtracks to the label bound right
                           // after it (alt_gen->after); the last keeps the inherited backtrack.
    4181      730201 :     if (!is_last) {
    4182      461360 :       new_trace.set_backtrack(&alt_gen->after);
    4183             :     }
    4184      730201 :     alt_gen->expects_preload = preload->preload_is_current_;
    4185             :     bool generate_full_check_inline = false;
    4186     1324505 :     if (compiler->optimize() &&
    4187     1321004 :         try_to_emit_quick_check_for_alternative(i == 0) &&
    4188             :         alternative.node()->EmitQuickCheck(
    4189             :             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
    4190             :             &alt_gen->possible_success, &alt_gen->quick_check_details,
    4191      590803 :             fall_through_on_failure)) {
    4192             :       // Quick check was generated for this choice.
    4193      253946 :       preload->preload_is_current_ = true;
    4194      253946 :       preload->preload_has_checked_bounds_ = true;
    4195             :       // If we generated the quick check to fall through on possible success,
    4196             :       // we now need to generate the full check inline.
    4197      253946 :       if (!fall_through_on_failure) {
    4198       44533 :         macro_assembler->Bind(&alt_gen->possible_success);
    4199             :         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
    4200       44533 :         new_trace.set_characters_preloaded(preload->preload_characters_);
    4201             :         new_trace.set_bound_checked_up_to(preload->preload_characters_);
    4202             :         generate_full_check_inline = true;
    4203             :       }
    4204      476255 :     } else if (alt_gen->quick_check_details.cannot_match()) {
                             // This alternative can never match: emit no code for it, except that
                             // the last alternative must still jump to the backtrack label.
    4205         138 :       if (!fall_through_on_failure) {
    4206          52 :         macro_assembler->GoTo(trace->backtrack());
    4207             :       }
    4208         138 :       continue;
    4209             :     } else {
    4210             :       // No quick check was generated.  Put the full code here.
    4211             :       // If this is not the first choice then there could be slow checks from
    4212             :       // previous cases that go here when they fail.  There's no reason to
    4213             :       // insist that they preload characters since the slow check we are about
    4214             :       // to generate probably can't use it.
    4215      476117 :       if (i != first_choice) {
    4216      282576 :         alt_gen->expects_preload = false;
    4217             :         new_trace.InvalidateCurrentCharacter();
    4218             :       }
    4219             :       generate_full_check_inline = true;
    4220             :     }
    4221      730063 :     if (generate_full_check_inline) {
                             // Limit flushing of pending actions, emit the guards, then the
                             // alternative itself.  Afterwards the preload is no longer current.
    4222      520650 :       if (new_trace.actions() != NULL) {
    4223             :         new_trace.set_flush_budget(new_flush_budget);
    4224             :       }
    4225        6615 :       for (int j = 0; j < guard_count; j++) {
    4226        6615 :         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
    4227             :       }
    4228      520650 :       alternative.node()->Emit(compiler, &new_trace);
    4229      520650 :       preload->preload_is_current_ = false;
    4230             :     }
    4231      730063 :     macro_assembler->Bind(&alt_gen->after);
    4232             :   }
    4233      268841 : }
    4234             : 
    4235             : 
    4236      955654 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
    4237      176806 :                                            Trace* trace,
    4238             :                                            GuardedAlternative alternative,
    4239             :                                            AlternativeGeneration* alt_gen,
    4240             :                                            int preload_characters,
    4241             :                                            bool next_expects_preload) {
                         // Emits the full check for an alternative whose quick check left its
                         // possible_success label linked; does nothing otherwise.  On failure the
                         // emitted code continues at alt_gen->after, reloading the current
                         // character first when |next_expects_preload| is set.
    4242     1283069 :   if (!alt_gen->possible_success.is_linked()) return;
    4243             : 
    4244             :   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
    4245      209413 :   macro_assembler->Bind(&alt_gen->possible_success);
                         // The continuation starts from a copy of |trace| that records the preloaded
                         // characters and the quick check that was already performed.
    4246      209413 :   Trace out_of_line_trace(*trace);
    4247             :   out_of_line_trace.set_characters_preloaded(preload_characters);
    4248             :   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
    4249      209413 :   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
    4250      209413 :   ZoneList<Guard*>* guards = alternative.guards();
    4251      209413 :   int guard_count = (guards == NULL) ? 0 : guards->length();
    4252      209413 :   if (next_expects_preload) {
                           // Failure goes through a local label so the character can be reloaded
                           // before jumping to the code after this alternative.
    4253             :     Label reload_current_char;
    4254             :     out_of_line_trace.set_backtrack(&reload_current_char);
    4255      177845 :     for (int j = 0; j < guard_count; j++) {
    4256        1039 :       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
    4257             :     }
    4258      176806 :     alternative.node()->Emit(compiler, &out_of_line_trace);
    4259      176806 :     macro_assembler->Bind(&reload_current_char);
    4260             :     // Reload the current character, since the next quick check expects that.
    4261             :     // We don't need to check bounds here because we only get into this
    4262             :     // code through a quick check which already did the checked load.
    4263             :     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
    4264             :                                           NULL,
    4265             :                                           false,
    4266      353612 :                                           preload_characters);
    4267      176806 :     macro_assembler->GoTo(&(alt_gen->after));
    4268             :   } else {
                           // No reload needed: failure backtracks straight to alt_gen->after.
    4269       32607 :     out_of_line_trace.set_backtrack(&(alt_gen->after));
    4270       32703 :     for (int j = 0; j < guard_count; j++) {
    4271          96 :       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
    4272             :     }
    4273       32607 :     alternative.node()->Emit(compiler, &out_of_line_trace);
    4274             :   }
    4275             : }
    4276             : 
    4277             : 
    4278      577834 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    4279             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    4280      576752 :   LimitResult limit_result = LimitVersions(compiler, trace);
    4281      576752 :   if (limit_result == DONE) return;
    4282             :   DCHECK(limit_result == CONTINUE);
    4283             : 
    4284             :   RecursionCheck rc(compiler);
    4285             : 
    4286      337669 :   switch (action_type_) {
    4287             :     case STORE_POSITION: {
    4288             :       Trace::DeferredCapture
    4289             :           new_capture(data_.u_position_register.reg,
    4290             :                       data_.u_position_register.is_capture,
    4291      302875 :                       trace);
    4292      302875 :       Trace new_trace = *trace;
    4293             :       new_trace.add_action(&new_capture);
    4294      328091 :       on_success()->Emit(compiler, &new_trace);
    4295             :       break;
    4296             :     }
    4297             :     case INCREMENT_REGISTER: {
    4298             :       Trace::DeferredIncrementRegister
    4299        7305 :           new_increment(data_.u_increment_register.reg);
    4300        7305 :       Trace new_trace = *trace;
    4301             :       new_trace.add_action(&new_increment);
    4302        7305 :       on_success()->Emit(compiler, &new_trace);
    4303             :       break;
    4304             :     }
    4305             :     case SET_REGISTER: {
    4306             :       Trace::DeferredSetRegister
    4307        6226 :           new_set(data_.u_store_register.reg, data_.u_store_register.value);
    4308        6226 :       Trace new_trace = *trace;
    4309             :       new_trace.add_action(&new_set);
    4310        6226 :       on_success()->Emit(compiler, &new_trace);
    4311             :       break;
    4312             :     }
    4313             :     case CLEAR_CAPTURES: {
    4314             :       Trace::DeferredClearCaptures
    4315             :         new_capture(Interval(data_.u_clear_captures.range_from,
    4316        3897 :                              data_.u_clear_captures.range_to));
    4317        3897 :       Trace new_trace = *trace;
    4318             :       new_trace.add_action(&new_capture);
    4319        3897 :       on_success()->Emit(compiler, &new_trace);
    4320             :       break;
    4321             :     }
    4322             :     case BEGIN_SUBMATCH:
    4323       11291 :       if (!trace->is_trivial()) {
    4324        5979 :         trace->Flush(compiler, this);
    4325             :       } else {
    4326             :         assembler->WriteCurrentPositionToRegister(
    4327        5312 :             data_.u_submatch.current_position_register, 0);
    4328             :         assembler->WriteStackPointerToRegister(
    4329        5312 :             data_.u_submatch.stack_pointer_register);
    4330        5312 :         on_success()->Emit(compiler, trace);
    4331             :       }
    4332             :       break;
    4333             :     case EMPTY_MATCH_CHECK: {
    4334        1147 :       int start_pos_reg = data_.u_empty_match_check.start_register;
    4335        1147 :       int stored_pos = 0;
    4336        1147 :       int rep_reg = data_.u_empty_match_check.repetition_register;
    4337        1147 :       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
    4338        1147 :       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
    4339        1361 :       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
    4340             :         // If we know we haven't advanced and there is no minimum we
    4341             :         // can just backtrack immediately.
    4342         182 :         assembler->GoTo(trace->backtrack());
    4343        1459 :       } else if (know_dist && stored_pos < trace->cp_offset()) {
    4344             :         // If we know we've advanced we can generate the continuation
    4345             :         // immediately.
    4346         298 :         on_success()->Emit(compiler, trace);
    4347         758 :       } else if (!trace->is_trivial()) {
    4348         384 :         trace->Flush(compiler, this);
    4349             :       } else {
    4350             :         Label skip_empty_check;
    4351             :         // If we have a minimum number of repetitions we check the current
    4352             :         // number first and skip the empty check if it's not enough.
    4353         374 :         if (has_minimum) {
    4354         251 :           int limit = data_.u_empty_match_check.repetition_limit;
    4355         251 :           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
    4356             :         }
    4357             :         // If the match is empty we bail out, otherwise we fall through
    4358             :         // to the on-success continuation.
    4359             :         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
    4360         748 :                                    trace->backtrack());
    4361         374 :         assembler->Bind(&skip_empty_check);
    4362         374 :         on_success()->Emit(compiler, trace);
    4363             :       }
    4364             :       break;
    4365             :     }
    4366             :     case POSITIVE_SUBMATCH_SUCCESS: {
    4367        4928 :       if (!trace->is_trivial()) {
    4368        3124 :         trace->Flush(compiler, this);
    4369        3124 :         return;
    4370             :       }
    4371             :       assembler->ReadCurrentPositionFromRegister(
    4372        1804 :           data_.u_submatch.current_position_register);
    4373             :       assembler->ReadStackPointerFromRegister(
    4374        1804 :           data_.u_submatch.stack_pointer_register);
    4375        1804 :       int clear_register_count = data_.u_submatch.clear_register_count;
    4376        1804 :       if (clear_register_count == 0) {
    4377        1206 :         on_success()->Emit(compiler, trace);
    4378        1206 :         return;
    4379             :       }
    4380         598 :       int clear_registers_from = data_.u_submatch.clear_register_from;
    4381             :       Label clear_registers_backtrack;
    4382         598 :       Trace new_trace = *trace;
    4383             :       new_trace.set_backtrack(&clear_registers_backtrack);
    4384         598 :       on_success()->Emit(compiler, &new_trace);
    4385             : 
    4386         598 :       assembler->Bind(&clear_registers_backtrack);
    4387         598 :       int clear_registers_to = clear_registers_from + clear_register_count - 1;
    4388         598 :       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
    4389             : 
    4390             :       DCHECK(trace->backtrack() == NULL);
    4391         598 :       assembler->Backtrack();
    4392             :       return;
    4393             :     }
    4394             :     default:
    4395           0 :       UNREACHABLE();
    4396             :   }
    4397             : }
    4398             : 
    4399             : 
    4400       14675 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
    4401             :   RegExpMacroAssembler* assembler = compiler->macro_assembler();
    4402        6185 :   if (!trace->is_trivial()) {
    4403        2953 :     trace->Flush(compiler, this);
    4404        2953 :     return;
    4405             :   }
    4406             : 
    4407        3232 :   LimitResult limit_result = LimitVersions(compiler, trace);
    4408        3232 :   if (limit_result == DONE) return;
    4409             :   DCHECK(limit_result == CONTINUE);
    4410             : 
    4411             :   RecursionCheck rc(compiler);
    4412             : 
    4413             :   DCHECK_EQ(start_reg_ + 1, end_reg_);
    4414        2987 :   if (compiler->ignore_case()) {
    4415             :     assembler->CheckNotBackReferenceIgnoreCase(
    4416        6738 :         start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
    4417             :   } else {
    4418             :     assembler->CheckNotBackReference(start_reg_, read_backward(),
    4419        1482 :                                      trace->backtrack());
    4420             :   }
    4421             :   // We are going to advance backward, so we may end up at the start.
    4422        2987 :   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
    4423             : 
    4424             :   // Check that the back reference does not end inside a surrogate pair.
    4425        3161 :   if (compiler->unicode() && !compiler->one_byte()) {
    4426          96 :     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
    4427             :   }
    4428        2987 :   on_success()->Emit(compiler, trace);
    4429             : }
    4430             : 
    4431             : 
    4432             : // -------------------------------------------------------------------
    4433             : // Dot/dotty output
    4434             : 
    4435             : 
    4436             : #ifdef DEBUG
    4437             : 
    4438             : 
    4439             : class DotPrinter: public NodeVisitor {
    4440             :  public:
    4441             :   DotPrinter(std::ostream& os, bool ignore_case)  // NOLINT
    4442             :       : os_(os),
    4443             :         ignore_case_(ignore_case) {}
    4444             :   void PrintNode(const char* label, RegExpNode* node);
    4445             :   void Visit(RegExpNode* node);
    4446             :   void PrintAttributes(RegExpNode* from);
    4447             :   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
    4448             : #define DECLARE_VISIT(Type)                                          \
    4449             :   virtual void Visit##Type(Type##Node* that);
    4450             : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
    4451             : #undef DECLARE_VISIT
    4452             :  private:
    4453             :   std::ostream& os_;
    4454             :   bool ignore_case_;
    4455             : };
    4456             : 
    4457             : 
    4458             : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
    4459             :   os_ << "digraph G {\n  graph [label=\"";
    4460             :   for (int i = 0; label[i]; i++) {
    4461             :     switch (label[i]) {
    4462             :       case '\\':
    4463             :         os_ << "\\\\";
    4464             :         break;
    4465             :       case '"':
    4466             :         os_ << "\"";
    4467             :         break;
    4468             :       default:
    4469             :         os_ << label[i];
    4470             :         break;
    4471             :     }
    4472             :   }
    4473             :   os_ << "\"];\n";
    4474             :   Visit(node);
    4475             :   os_ << "}" << std::endl;
    4476             : }
    4477             : 
    4478             : 
    4479             : void DotPrinter::Visit(RegExpNode* node) {
    4480             :   if (node->info()->visited) return;
    4481             :   node->info()->visited = true;
    4482             :   node->Accept(this);
    4483             : }
    4484             : 
    4485             : 
    4486             : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
    4487             :   os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
    4488             :   Visit(on_failure);
    4489             : }
    4490             : 
    4491             : 
    4492             : class TableEntryBodyPrinter {
    4493             :  public:
    4494             :   TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice)  // NOLINT
    4495             :       : os_(os),
    4496             :         choice_(choice) {}
    4497             :   void Call(uc16 from, DispatchTable::Entry entry) {
    4498             :     OutSet* out_set = entry.out_set();
    4499             :     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
    4500             :       if (out_set->Get(i)) {
    4501             :         os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
    4502             :             << choice()->alternatives()->at(i).node() << ";\n";
    4503             :       }
    4504             :     }
    4505             :   }
    4506             :  private:
    4507             :   ChoiceNode* choice() { return choice_; }
    4508             :   std::ostream& os_;
    4509             :   ChoiceNode* choice_;
    4510             : };
    4511             : 
    4512             : 
    4513             : class TableEntryHeaderPrinter {
    4514             :  public:
    4515             :   explicit TableEntryHeaderPrinter(std::ostream& os)  // NOLINT
    4516             :       : first_(true),
    4517             :         os_(os) {}
    4518             :   void Call(uc16 from, DispatchTable::Entry entry) {
    4519             :     if (first_) {
    4520             :       first_ = false;
    4521             :     } else {
    4522             :       os_ << "|";
    4523             :     }
    4524             :     os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
    4525             :     OutSet* out_set = entry.out_set();
    4526             :     int priority = 0;
    4527             :     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
    4528             :       if (out_set->Get(i)) {
    4529             :         if (priority > 0) os_ << "|";
    4530             :         os_ << "<s" << from << "o" << i << "> " << priority;
    4531             :         priority++;
    4532             :       }
    4533             :     }
    4534             :     os_ << "}}";
    4535             :   }
    4536             : 
    4537             :  private:
    4538             :   bool first_;
    4539             :   std::ostream& os_;
    4540             : };
    4541             : 
    4542             : 
    4543             : class AttributePrinter {
    4544             :  public:
    4545             :   explicit AttributePrinter(std::ostream& os)  // NOLINT
    4546             :       : os_(os),
    4547             :         first_(true) {}
    4548             :   void PrintSeparator() {
    4549             :     if (first_) {
    4550             :       first_ = false;
    4551             :     } else {
    4552             :       os_ << "|";
    4553             :     }
    4554             :   }
    4555             :   void PrintBit(const char* name, bool value) {
    4556             :     if (!value) return;
    4557             :     PrintSeparator();
    4558             :     os_ << "{" << name << "}";
    4559             :   }
    4560             :   void PrintPositive(const char* name, int value) {
    4561             :     if (value < 0) return;
    4562             :     PrintSeparator();
    4563             :     os_ << "{" << name << "|" << value << "}";
    4564             :   }
    4565             : 
    4566             :  private:
    4567             :   std::ostream& os_;
    4568             :   bool first_;
    4569             : };
    4570             : 
    4571             : 
    4572             : void DotPrinter::PrintAttributes(RegExpNode* that) {
    4573             :   os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
    4574             :       << "margin=0.1, fontsize=10, label=\"{";
    4575             :   AttributePrinter printer(os_);
    4576             :   NodeInfo* info = that->info();
    4577             :   printer.PrintBit("NI", info->follows_newline_interest);
    4578             :   printer.PrintBit("WI", info->follows_word_interest);
    4579             :   printer.PrintBit("SI", info->follows_start_interest);
    4580             :   Label* label = that->label();
    4581             :   if (label->is_bound())
    4582             :     printer.PrintPositive("@", label->pos());
    4583             :   os_ << "}\"];\n"
    4584             :       << "  a" << that << " -> n" << that
    4585             :       << " [style=dashed, color=grey, arrowhead=none];\n";
    4586             : }
    4587             : 
    4588             : 
    4589             : static const bool kPrintDispatchTable = false;
    4590             : void DotPrinter::VisitChoice(ChoiceNode* that) {
    4591             :   if (kPrintDispatchTable) {
    4592             :     os_ << "  n" << that << " [shape=Mrecord, label=\"";
    4593             :     TableEntryHeaderPrinter header_printer(os_);
    4594             :     that->GetTable(ignore_case_)->ForEach(&header_printer);
    4595             :     os_ << "\"]\n";
    4596             :     PrintAttributes(that);
    4597             :     TableEntryBodyPrinter body_printer(os_, that);
    4598             :     that->GetTable(ignore_case_)->ForEach(&body_printer);
    4599             :   } else {
    4600             :     os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
    4601             :     for (int i = 0; i < that->alternatives()->length(); i++) {
    4602             :       GuardedAlternative alt = that->alternatives()->at(i);
    4603             :       os_ << "  n" << that << " -> n" << alt.node();
    4604             :     }
    4605             :   }
    4606             :   for (int i = 0; i < that->alternatives()->length(); i++) {
    4607             :     GuardedAlternative alt = that->alternatives()->at(i);
    4608             :     alt.node()->Accept(this);
    4609             :   }
    4610             : }
    4611             : 
    4612             : 
    4613             : void DotPrinter::VisitText(TextNode* that) {
    4614             :   Zone* zone = that->zone();
    4615             :   os_ << "  n" << that << " [label=\"";
    4616             :   for (int i = 0; i < that->elements()->length(); i++) {
    4617             :     if (i > 0) os_ << " ";
    4618             :     TextElement elm = that->elements()->at(i);
    4619             :     switch (elm.text_type()) {
    4620             :       case TextElement::ATOM: {
    4621             :         Vector<const uc16> data = elm.atom()->data();
    4622             :         for (int i = 0; i < data.length(); i++) {
    4623             :           os_ << static_cast<char>(data[i]);
    4624             :         }
    4625             :         break;
    4626             :       }
    4627             :       case TextElement::CHAR_CLASS: {
    4628             :         RegExpCharacterClass* node = elm.char_class();
    4629             :         os_ << "[";
    4630             :         if (node->is_negated()) os_ << "^";
    4631             :         for (int j = 0; j < node->ranges(zone)->length(); j++) {
    4632             :           CharacterRange range = node->ranges(zone)->at(j);
    4633             :           os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
    4634             :         }
    4635             :         os_ << "]";
    4636             :         break;
    4637             :       }
    4638             :       default:
    4639             :         UNREACHABLE();
    4640             :     }
    4641             :   }
    4642             :   os_ << "\", shape=box, peripheries=2];\n";
    4643             :   PrintAttributes(that);
    4644             :   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
    4645             :   Visit(that->on_success());
    4646             : }
    4647             : 
    4648             : 
    4649             : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
    4650             :   os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
    4651             :       << that->end_register() << "\", shape=doubleoctagon];\n";
    4652             :   PrintAttributes(that);
    4653             :   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
    4654             :   Visit(that->on_success());
    4655             : }
    4656             : 
    4657             : 
    4658             : void DotPrinter::VisitEnd(EndNode* that) {
    4659             :   os_ << "  n" << that << " [style=bold, shape=point];\n";
    4660             :   PrintAttributes(that);
    4661             : }
    4662             : 
    4663             : 
    4664             : void DotPrinter::VisitAssertion(AssertionNode* that) {
    4665             :   os_ << "  n" << that << " [";
    4666             :   switch (that->assertion_type()) {
    4667             :     case AssertionNode::AT_END:
    4668             :       os_ << "label=\"$\", shape=septagon";
    4669             :       break;
    4670             :     case AssertionNode::AT_START:
    4671             :       os_ << "label=\"^\", shape=septagon";
    4672             :       break;
    4673             :     case AssertionNode::AT_BOUNDARY:
    4674             :       os_ << "label=\"\\b\", shape=septagon";
    4675             :       break;
    4676             :     case AssertionNode::AT_NON_BOUNDARY:
    4677             :       os_ << "label=\"\\B\", shape=septagon";
    4678             :       break;
    4679             :     case AssertionNode::AFTER_NEWLINE:
    4680             :       os_ << "label=\"(?<=\\n)\", shape=septagon";
    4681             :       break;
    4682             :   }
    4683             :   os_ << "];\n";
    4684             :   PrintAttributes(that);
    4685             :   RegExpNode* successor = that->on_success();
    4686             :   os_ << "  n" << that << " -> n" << successor << ";\n";
    4687             :   Visit(successor);
    4688             : }
    4689             : 
    4690             : 
    4691             : void DotPrinter::VisitAction(ActionNode* that) {
    4692             :   os_ << "  n" << that << " [";
    4693             :   switch (that->action_type_) {
    4694             :     case ActionNode::SET_REGISTER:
    4695             :       os_ << "label=\"$" << that->data_.u_store_register.reg
    4696             :           << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
    4697             :       break;
    4698             :     case ActionNode::INCREMENT_REGISTER:
    4699             :       os_ << "label=\"$" << that->data_.u_increment_register.reg
    4700             :           << "++\", shape=octagon";
    4701             :       break;
    4702             :     case ActionNode::STORE_POSITION:
    4703             :       os_ << "label=\"$" << that->data_.u_position_register.reg
    4704             :           << ":=$pos\", shape=octagon";
    4705             :       break;
    4706             :     case ActionNode::BEGIN_SUBMATCH:
    4707             :       os_ << "label=\"$" << that->data_.u_submatch.current_position_register
    4708             :           << ":=$pos,begin\", shape=septagon";
    4709             :       break;
    4710             :     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
    4711             :       os_ << "label=\"escape\", shape=septagon";
    4712             :       break;
    4713             :     case ActionNode::EMPTY_MATCH_CHECK:
    4714             :       os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
    4715             :           << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
    4716             :           << "<" << that->data_.u_empty_match_check.repetition_limit
    4717             :           << "?\", shape=septagon";
    4718             :       break;
    4719             :     case ActionNode::CLEAR_CAPTURES: {
    4720             :       os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
    4721             :           << " to $" << that->data_.u_clear_captures.range_to
    4722             :           << "\", shape=septagon";
    4723             :       break;
    4724             :     }
    4725             :   }
    4726             :   os_ << "];\n";
    4727             :   PrintAttributes(that);
    4728             :   RegExpNode* successor = that->on_success();
    4729             :   os_ << "  n" << that << " -> n" << successor << ";\n";
    4730             :   Visit(successor);
    4731             : }
    4732             : 
    4733             : 
    4734             : class DispatchTableDumper {
    4735             :  public:
    4736             :   explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
    4737             :   void Call(uc16 key, DispatchTable::Entry entry);
    4738             :  private:
    4739             :   std::ostream& os_;
    4740             : };
    4741             : 
    4742             : 
    4743             : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
    4744             :   os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
    4745             :   OutSet* set = entry.out_set();
    4746             :   bool first = true;
    4747             :   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
    4748             :     if (set->Get(i)) {
    4749             :       if (first) {
    4750             :         first = false;
    4751             :       } else {
    4752             :         os_ << ", ";
    4753             :       }
    4754             :       os_ << i;
    4755             :     }
    4756             :   }
    4757             :   os_ << "}\n";
    4758             : }
    4759             : 
    4760             : 
    4761             : void DispatchTable::Dump() {
    4762             :   OFStream os(stderr);
    4763             :   DispatchTableDumper dumper(os);
    4764             :   tree()->ForEach(&dumper);
    4765             : }
    4766             : 
    4767             : 
    4768             : void RegExpEngine::DotPrint(const char* label,
    4769             :                             RegExpNode* node,
    4770             :                             bool ignore_case) {
    4771             :   OFStream os(stdout);
    4772             :   DotPrinter printer(os, ignore_case);
    4773             :   printer.PrintNode(label, node);
    4774             : }
    4775             : 
    4776             : 
    4777             : #endif  // DEBUG
    4778             : 
    4779             : 
    4780             : // -------------------------------------------------------------------
    4781             : // Tree to graph conversion
    4782             : 
    4783     4873791 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
    4784             :                                RegExpNode* on_success) {
    4785             :   ZoneList<TextElement>* elms =
    4786     1624597 :       new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
    4787     3249194 :   elms->Add(TextElement::Atom(this), compiler->zone());
    4788             :   return new (compiler->zone())
    4789     1624597 :       TextNode(elms, compiler->read_backward(), on_success);
    4790             : }
    4791             : 
    4792             : 
    4793        5678 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
    4794             :                                RegExpNode* on_success) {
    4795             :   return new (compiler->zone())
    4796        5678 :       TextNode(elements(), compiler->read_backward(), on_success);
    4797             : }
    4798             : 
    4799             : 
    4800      637490 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
    4801             :                                  const int* special_class,
    4802             :                                  int length) {
    4803      637490 :   length--;  // Remove final marker.
    4804             :   DCHECK(special_class[length] == kRangeEndMarker);
    4805             :   DCHECK(ranges->length() != 0);
    4806             :   DCHECK(length != 0);
    4807             :   DCHECK(special_class[0] != 0);
    4808      637490 :   if (ranges->length() != (length >> 1) + 1) {
    4809             :     return false;
    4810             :   }
    4811       17224 :   CharacterRange range = ranges->at(0);
    4812       17224 :   if (range.from() != 0) {
    4813             :     return false;
    4814             :   }
    4815       32390 :   for (int i = 0; i < length; i += 2) {
    4816       33087 :     if (special_class[i] != (range.to() + 1)) {
    4817             :       return false;
    4818             :     }
    4819       64780 :     range = ranges->at((i >> 1) + 1);
    4820       32390 :     if (special_class[i+1] != range.from()) {
    4821             :       return false;
    4822             :     }
    4823             :   }
    4824       10289 :   if (range.to() != String::kMaxCodePoint) {
    4825             :     return false;
    4826             :   }
    4827       10289 :   return true;
    4828             : }
    4829             : 
    4830             : 
    4831      633818 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
    4832             :                           const int* special_class,
    4833             :                           int length) {
    4834      633818 :   length--;  // Remove final marker.
    4835             :   DCHECK(special_class[length] == kRangeEndMarker);
    4836      633818 :   if (ranges->length() * 2 != length) {
    4837             :     return false;
    4838             :   }
    4839       37008 :   for (int i = 0; i < length; i += 2) {
    4840       82576 :     CharacterRange range = ranges->at(i >> 1);
    4841       78309 :     if (range.from() != special_class[i] ||
    4842       37021 :         range.to() != special_class[i + 1] - 1) {
    4843             :       return false;
    4844             :     }
    4845             :   }
    4846             :   return true;
    4847             : }
    4848             : 
    4849             : 
    4850      300739 : bool RegExpCharacterClass::is_standard(Zone* zone) {
    4851             :   // TODO(lrn): Remove need for this function, by not throwing away information
    4852             :   // along the way.
    4853      300739 :   if (is_negated()) {
    4854             :     return false;
    4855             :   }
    4856      293942 :   if (set_.is_standard()) {
    4857             :     return true;
    4858             :   }
    4859      219358 :   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
    4860             :     set_.set_standard_set_type('s');
    4861        1970 :     return true;
    4862             :   }
    4863      217388 :   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
    4864             :     set_.set_standard_set_type('S');
    4865         198 :     return true;
    4866             :   }
    4867      217190 :   if (CompareInverseRanges(set_.ranges(zone),
    4868             :                            kLineTerminatorRanges,
    4869      217190 :                            kLineTerminatorRangeCount)) {
    4870             :     set_.set_standard_set_type('.');
    4871        9954 :     return true;
    4872             :   }
    4873      207236 :   if (CompareRanges(set_.ranges(zone),
    4874             :                     kLineTerminatorRanges,
    4875      207236 :                     kLineTerminatorRangeCount)) {
    4876             :     set_.set_standard_set_type('n');
    4877          12 :     return true;
    4878             :   }
    4879      207224 :   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
    4880             :     set_.set_standard_set_type('w');
    4881        4312 :     return true;
    4882             :   }
    4883      202912 :   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
    4884             :     set_.set_standard_set_type('W');
    4885         137 :     return true;
    4886             :   }
    4887             :   return false;
    4888             : }
    4889             : 
    4890             : 
    4891        3163 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
    4892             :                                            ZoneList<CharacterRange>* base)
    4893             :     : zone_(zone),
    4894             :       table_(zone),
    4895             :       bmp_(nullptr),
    4896             :       lead_surrogates_(nullptr),
    4897             :       trail_surrogates_(nullptr),
    4898        6326 :       non_bmp_(nullptr) {
    4899             :   // The unicode range splitter categorizes given character ranges into:
    4900             :   // - Code points from the BMP representable by one code unit.
    4901             :   // - Code points outside the BMP that need to be split into surrogate pairs.
    4902             :   // - Lone lead surrogates.
    4903             :   // - Lone trail surrogates.
    4904             :   // Lone surrogates are valid code points, even though no actual characters.
    4905             :   // They require special matching to make sure we do not split surrogate pairs.
    4906             :   // We use the dispatch table to accomplish this. The base range is split up
    4907             :   // by the table by the overlay ranges, and the Call callback is used to
    4908             :   // filter and collect ranges for each category.
    4909      176024 :   for (int i = 0; i < base->length(); i++) {
    4910      257710 :     table_.AddRange(base->at(i), kBase, zone_);
    4911             :   }
    4912             :   // Add overlay ranges.
    4913             :   table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
    4914        3163 :                   kBmpCodePoints, zone_);
    4915             :   table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
    4916        3163 :                   kLeadSurrogates, zone_);
    4917             :   table_.AddRange(
    4918             :       CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
    4919        3163 :       kTrailSurrogates, zone_);
    4920             :   table_.AddRange(
    4921             :       CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
    4922        3163 :       kBmpCodePoints, zone_);
    4923             :   table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
    4924        3163 :                   kNonBmpCodePoints, zone_);
    4925             :   table_.ForEach(this);
    4926        3163 : }
    4927             : 
    4928             : 
    4929      182377 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
    4930      182377 :   OutSet* outset = entry.out_set();
    4931      364754 :   if (!outset->Get(kBase)) return;
    4932             :   ZoneList<CharacterRange>** target = NULL;
    4933       90161 :   if (outset->Get(kBmpCodePoints)) {
    4934       62051 :     target = &bmp_;
    4935       28110 :   } else if (outset->Get(kLeadSurrogates)) {
    4936        1538 :     target = &lead_surrogates_;
    4937       26572 :   } else if (outset->Get(kTrailSurrogates)) {
    4938        1538 :     target = &trail_surrogates_;
    4939             :   } else {
    4940             :     DCHECK(outset->Get(kNonBmpCodePoints));
    4941       25034 :     target = &non_bmp_;
    4942             :   }
    4943       97980 :   if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
    4944      180322 :   (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
    4945             : }
    4946             : 
    4947             : 
    4948        8424 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
    4949        3156 :                       RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
    4950             :   ZoneList<CharacterRange>* bmp = splitter->bmp();
    4951        6312 :   if (bmp == nullptr) return;
    4952             :   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
    4953        5268 :       compiler->zone(), bmp, compiler->read_backward(), on_success)));
    4954             : }
    4955             : 
    4956             : 
    4957       32564 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
    4958             :                              RegExpNode* on_success,
    4959        3156 :                              UnicodeRangeSplitter* splitter) {
    4960             :   ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
    4961        6312 :   if (non_bmp == nullptr) return;
    4962             :   DCHECK(compiler->unicode());
    4963             :   DCHECK(!compiler->one_byte());
    4964             :   Zone* zone = compiler->zone();
    4965        2095 :   CharacterRange::Canonicalize(non_bmp);
    4966       54244 :   for (int i = 0; i < non_bmp->length(); i++) {
    4967             :     // Match surrogate pair.
    4968             :     // E.g. [\u10005-\u11005] becomes
    4969             :     //      \ud800[\udc05-\udfff]|
    4970             :     //      [\ud801-\ud803][\udc00-\udfff]|
    4971             :     //      \ud804[\udc00-\udc05]
    4972       52149 :     uc32 from = non_bmp->at(i).from();
    4973       25027 :     uc32 to = non_bmp->at(i).to();
    4974       25027 :     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
    4975             :     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
    4976       25027 :     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
    4977             :     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
    4978       25027 :     if (from_l == to_l) {
    4979             :       // The lead surrogate is the same.
    4980             :       result->AddAlternative(
    4981             :           GuardedAlternative(TextNode::CreateForSurrogatePair(
    4982             :               zone, CharacterRange::Singleton(from_l),
    4983             :               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
    4984       22176 :               on_success)));
    4985             :     } else {
    4986        2851 :       if (from_t != kTrailSurrogateStart) {
    4987             :         // Add [from_l][from_t-\udfff]
    4988             :         result->AddAlternative(
    4989             :             GuardedAlternative(TextNode::CreateForSurrogatePair(
    4990             :                 zone, CharacterRange::Singleton(from_l),
    4991             :                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
    4992        1410 :                 compiler->read_backward(), on_success)));
    4993        1410 :         from_l++;
    4994             :       }
    4995        2851 :       if (to_t != kTrailSurrogateEnd) {
    4996             :         // Add [to_l][\udc00-to_t]
    4997             :         result->AddAlternative(
    4998             :             GuardedAlternative(TextNode::CreateForSurrogatePair(
    4999             :                 zone, CharacterRange::Singleton(to_l),
    5000             :                 CharacterRange::Range(kTrailSurrogateStart, to_t),
    5001        1146 :                 compiler->read_backward(), on_success)));
    5002        1146 :         to_l--;
    5003             :       }
    5004        2851 :       if (from_l <= to_l) {
    5005             :         // Add [from_l-to_l][\udc00-\udfff]
    5006             :         result->AddAlternative(
    5007             :             GuardedAlternative(TextNode::CreateForSurrogatePair(
    5008             :                 zone, CharacterRange::Range(from_l, to_l),
    5009             :                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
    5010        2581 :                 compiler->read_backward(), on_success)));
    5011             :       }
    5012             :     }
    5013             :   }
    5014             : }
    5015             : 
    5016             : 
    5017        1537 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
    5018        1537 :     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
    5019             :     ZoneList<CharacterRange>* match, RegExpNode* on_success,
    5020             :     bool read_backward) {
    5021             :   Zone* zone = compiler->zone();
    5022             :   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
    5023        1537 :       zone, match, read_backward, on_success);
    5024             :   int stack_register = compiler->UnicodeLookaroundStackRegister();
    5025             :   int position_register = compiler->UnicodeLookaroundPositionRegister();
    5026             :   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
    5027        1537 :                                        position_register);
    5028             :   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
    5029        1537 :       zone, lookbehind, !read_backward, lookaround.on_match_success());
    5030        1537 :   return lookaround.ForMatch(negative_match);
    5031             : }
    5032             : 
    5033             : 
    5034        1525 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
    5035        1525 :     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
    5036             :     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
    5037             :     bool read_backward) {
    5038             :   Zone* zone = compiler->zone();
    5039             :   int stack_register = compiler->UnicodeLookaroundStackRegister();
    5040             :   int position_register = compiler->UnicodeLookaroundPositionRegister();
    5041             :   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
    5042        1525 :                                        position_register);
    5043             :   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
    5044        1525 :       zone, lookahead, read_backward, lookaround.on_match_success());
    5045             :   return TextNode::CreateForCharacterRanges(
    5046        1525 :       zone, match, read_backward, lookaround.ForMatch(negative_match));
    5047             : }
    5048             : 
    5049             : 
    5050        6218 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
    5051             :                            RegExpNode* on_success,
    5052        3156 :                            UnicodeRangeSplitter* splitter) {
    5053             :   ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
    5054        6312 :   if (lead_surrogates == nullptr) return;
    5055             :   Zone* zone = compiler->zone();
    5056             :   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
    5057             :   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
    5058        1531 :       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
    5059             : 
    5060             :   RegExpNode* match;
    5061        1531 :   if (compiler->read_backward()) {
    5062             :     // Reading backward. Assert that reading forward, there is no trail
    5063             :     // surrogate, and then backward match the lead surrogate.
    5064             :     match = NegativeLookaroundAgainstReadDirectionAndMatch(
    5065         114 :         compiler, trail_surrogates, lead_surrogates, on_success, true);
    5066             :   } else {
    5067             :     // Reading forward. Forward match the lead surrogate and assert that
    5068             :     // no trail surrogate follows.
    5069             :     match = MatchAndNegativeLookaroundInReadDirection(
    5070        1417 :         compiler, lead_surrogates, trail_surrogates, on_success, false);
    5071             :   }
    5072             :   result->AddAlternative(GuardedAlternative(match));
    5073             : }
    5074             : 
    5075             : 
    5076        6218 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
    5077             :                             RegExpNode* on_success,
    5078        3156 :                             UnicodeRangeSplitter* splitter) {
    5079             :   ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
    5080        6312 :   if (trail_surrogates == nullptr) return;
    5081             :   Zone* zone = compiler->zone();
    5082             :   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
    5083             :   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
    5084        1531 :       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
    5085             : 
    5086             :   RegExpNode* match;
    5087        1531 :   if (compiler->read_backward()) {
    5088             :     // Reading backward. Backward match the trail surrogate and assert that no
    5089             :     // lead surrogate precedes it.
    5090             :     match = MatchAndNegativeLookaroundInReadDirection(
    5091         108 :         compiler, trail_surrogates, lead_surrogates, on_success, true);
    5092             :   } else {
    5093             :     // Reading forward. Assert that reading backward, there is no lead
    5094             :     // surrogate, and then forward match the trail surrogate.
    5095             :     match = NegativeLookaroundAgainstReadDirectionAndMatch(
    5096        1423 :         compiler, lead_surrogates, trail_surrogates, on_success, false);
    5097             :   }
    5098             :   result->AddAlternative(GuardedAlternative(match));
    5099             : }
    5100             : 
    5101        2181 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
    5102             :                               RegExpNode* on_success) {
    5103             :   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
    5104             :   DCHECK(!compiler->read_backward());
    5105             :   Zone* zone = compiler->zone();
    5106             :   // Advance any character. If the character happens to be a lead surrogate and
    5107             :   // we advanced into the middle of a surrogate pair, it will work out, as
    5108             :   // nothing will match from there. We will have to advance again, consuming
    5109             :   // the associated trail surrogate.
    5110             :   ZoneList<CharacterRange>* range = CharacterRange::List(
    5111        2181 :       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
    5112        2181 :   return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
    5113             : }
    5114             : 
    5115         741 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
    5116             : #ifdef V8_INTL_SUPPORT
    5117             :   // Use ICU to compute the case fold closure over the ranges.
    5118         741 :   icu::UnicodeSet set;
    5119       95062 :   for (int i = 0; i < ranges->length(); i++) {
    5120       94321 :     set.add(ranges->at(i).from(), ranges->at(i).to());
    5121             :   }
    5122             :   ranges->Clear();
    5123         741 :   set.closeOver(USET_CASE_INSENSITIVE);
    5124             :   // Full case mapping map single characters to multiple characters.
    5125             :   // Those are represented as strings in the set. Remove them so that
    5126             :   // we end up with only simple and common case mappings.
    5127         741 :   set.removeAllStrings();
    5128        9233 :   for (int i = 0; i < set.getRangeCount(); i++) {
    5129        8492 :     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
    5130       16984 :                 zone);
    5131             :   }
    5132             :   // No errors and everything we collected have been ranges.
    5133         741 :   CharacterRange::Canonicalize(ranges);
    5134             : #endif  // V8_INTL_SUPPORT
    5135         741 : }
    5136             : 
    5137             : 
    5138      606304 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
    5139             :                                          RegExpNode* on_success) {
    5140             :   set_.Canonicalize();
    5141             :   Zone* zone = compiler->zone();
    5142             :   ZoneList<CharacterRange>* ranges = this->ranges(zone);
    5143      201224 :   if (compiler->needs_unicode_case_equivalents()) {
    5144         631 :     AddUnicodeCaseEquivalents(ranges, zone);
    5145             :   }
    5146      214564 :   if (compiler->unicode() && !compiler->one_byte() &&
    5147             :       !contains_split_surrogate()) {
    5148        5357 :     if (is_negated()) {
    5149             :       ZoneList<CharacterRange>* negated =
    5150         153 :           new (zone) ZoneList<CharacterRange>(2, zone);
    5151         153 :       CharacterRange::Negate(ranges, negated, zone);
    5152             :       ranges = negated;
    5153             :     }
    5154        5357 :     if (ranges->length() == 0) {
    5155          40 :       ranges->Add(CharacterRange::Everything(), zone);
    5156             :       RegExpCharacterClass* fail =
    5157             :           new (zone) RegExpCharacterClass(ranges, NEGATED);
    5158          40 :       return new (zone) TextNode(fail, compiler->read_backward(), on_success);
    5159             :     }
    5160        5337 :     if (standard_type() == '*') {
    5161        2181 :       return UnanchoredAdvance(compiler, on_success);
    5162             :     } else {
    5163        3156 :       ChoiceNode* result = new (zone) ChoiceNode(2, zone);
    5164        3156 :       UnicodeRangeSplitter splitter(zone, ranges);
    5165        3156 :       AddBmpCharacters(compiler, result, on_success, &splitter);
    5166        3156 :       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
    5167        3156 :       AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
    5168        3156 :       AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
    5169             :       return result;
    5170             :     }
    5171             :   } else {
    5172      391734 :     return new (zone) TextNode(this, compiler->read_backward(), on_success);
    5173             :   }
    5174             : }
    5175             : 
    5176             : 
    5177      166326 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
    5178      166326 :   RegExpAtom* atom1 = (*a)->AsAtom();
    5179      166326 :   RegExpAtom* atom2 = (*b)->AsAtom();
    5180      166326 :   uc16 character1 = atom1->data().at(0);
    5181      166326 :   uc16 character2 = atom2->data().at(0);
    5182      166326 :   if (character1 < character2) return -1;
    5183      148127 :   if (character1 > character2) return 1;
    5184       17321 :   return 0;
    5185             : }
    5186             : 
    5187             : 
    5188             : static unibrow::uchar Canonical(
    5189             :     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    5190             :     unibrow::uchar c) {
    5191             :   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
    5192      118700 :   int length = canonicalize->get(c, '\0', chars);
    5193             :   DCHECK_LE(length, 1);
    5194             :   unibrow::uchar canonical = c;
    5195      118700 :   if (length == 1) canonical = chars[0];
    5196             :   return canonical;
    5197             : }
    5198             : 
    5199             : 
    5200       75482 : int CompareFirstCharCaseIndependent(
    5201             :     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
    5202             :     RegExpTree* const* a, RegExpTree* const* b) {
    5203       75482 :   RegExpAtom* atom1 = (*a)->AsAtom();
    5204       75482 :   RegExpAtom* atom2 = (*b)->AsAtom();
    5205       75482 :   unibrow::uchar character1 = atom1->data().at(0);
    5206       75482 :   unibrow::uchar character2 = atom2->data().at(0);
    5207       75482 :   if (character1 == character2) return 0;
    5208       52611 :   if (character1 >= 'a' || character2 >= 'a') {
    5209             :     character1 = Canonical(canonicalize, character1);
    5210             :     character2 = Canonical(canonicalize, character2);
    5211             :   }
    5212       52611 :   return static_cast<int>(character1) - static_cast<int>(character2);
    5213             : }
    5214             : 
    5215             : 
    5216             : // We can stable sort runs of atoms, since the order does not matter if they
    5217             : // start with different characters.
    5218             : // Returns true if any consecutive atoms were found.
    5219       13742 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
    5220             :   ZoneList<RegExpTree*>* alternatives = this->alternatives();
    5221       12507 :   int length = alternatives->length();
    5222             :   bool found_consecutive_atoms = false;
    5223       23914 :   for (int i = 0; i < length; i++) {
    5224       26893 :     while (i < length) {
    5225       25618 :       RegExpTree* alternative = alternatives->at(i);
    5226       25618 :       if (alternative->IsAtom()) break;
    5227       14211 :       i++;
    5228             :     }
    5229             :     // i is length or it is the index of an atom.
    5230       12682 :     if (i == length) break;
    5231             :     int first_atom = i;
    5232       11407 :     i++;
    5233       92628 :     while (i < length) {
    5234       70106 :       RegExpTree* alternative = alternatives->at(i);
    5235       70106 :       if (!alternative->IsAtom()) break;
    5236       69814 :       i++;
    5237             :     }
    5238             :     // Sort atoms to get ones with common prefixes together.
    5239             :     // This step is more tricky if we are in a case-independent regexp,
    5240             :     // because it would change /is|I/ to /I|is/, and order matters when
    5241             :     // the regexp parts don't match only disjoint starting points. To fix
    5242             :     // this we have a version of CompareFirstChar that uses case-
    5243             :     // independent character classes for comparison.
    5244             :     DCHECK_LT(first_atom, alternatives->length());
    5245             :     DCHECK_LE(i, alternatives->length());
    5246             :     DCHECK_LE(first_atom, i);
    5247       11407 :     if (compiler->ignore_case()) {
    5248             :       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
    5249        1235 :           compiler->isolate()->regexp_macro_assembler_canonicalize();
    5250             :       auto compare_closure =
    5251             :           [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
    5252       75482 :             return CompareFirstCharCaseIndependent(canonicalize, a, b);
    5253       75482 :           };
    5254        1235 :       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
    5255             :     } else {
    5256       10172 :       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
    5257             :     }
    5258       11407 :     if (i - first_atom > 1) found_consecutive_atoms = true;
    5259             :   }
    5260       12507 :   return found_consecutive_atoms;
    5261             : }
    5262             : 
    5263             : 
    5264             : // Optimizes ab|ac|az to a(?:b|c|d).
    5265       17915 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
    5266             :   Zone* zone = compiler->zone();
    5267             :   ZoneList<RegExpTree*>* alternatives = this->alternatives();
    5268       11075 :   int length = alternatives->length();
    5269             : 
    5270             :   int write_posn = 0;
    5271             :   int i = 0;
    5272       92226 :   while (i < length) {
    5273       70076 :     RegExpTree* alternative = alternatives->at(i);
    5274       70076 :     if (!alternative->IsAtom()) {
    5275         710 :       alternatives->at(write_posn++) = alternatives->at(i);
    5276         355 :       i++;
    5277         355 :       continue;
    5278             :     }
    5279       69721 :     RegExpAtom* atom = alternative->AsAtom();
    5280       69721 :     unibrow::uchar common_prefix = atom->data().at(0);
    5281             :     int first_with_prefix = i;
    5282             :     int prefix_length = atom->length();
    5283       69721 :     i++;
    5284      150610 :     while (i < length) {
    5285       69925 :       alternative = alternatives->at(i);
    5286       69925 :       if (!alternative->IsAtom()) break;
    5287       69814 :       atom = alternative->AsAtom();
    5288       69814 :       unibrow::uchar new_prefix = atom->data().at(0);
    5289       69814 :       if (new_prefix != common_prefix) {
    5290       58930 :         if (!compiler->ignore_case()) break;
    5291             :         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
    5292        6840 :             compiler->isolate()->regexp_macro_assembler_canonicalize();
    5293             :         new_prefix = Canonical(canonicalize, new_prefix);
    5294             :         common_prefix = Canonical(canonicalize, common_prefix);
    5295        6840 :         if (new_prefix != common_prefix) break;
    5296             :       }
    5297             :       prefix_length = Min(prefix_length, atom->length());
    5298       11168 :       i++;
    5299             :     }
    5300       69721 :     if (i > first_with_prefix + 2) {
    5301             :       // Found worthwhile run of alternatives with common prefix of at least one
    5302             :       // character.  The sorting function above did not sort on more than one
    5303             :       // character for reasons of correctness, but there may still be a longer
    5304             :       // common prefix if the terms were similar or presorted in the input.
    5305             :       // Find out how long the common prefix is.
    5306         727 :       int run_length = i - first_with_prefix;
    5307         727 :       atom = alternatives->at(first_with_prefix)->AsAtom();
    5308        3011 :       for (int j = 1; j < run_length && prefix_length > 1; j++) {
    5309             :         RegExpAtom* old_atom =
    5310        4568 :             alternatives->at(j + first_with_prefix)->AsAtom();
    5311        5458 :         for (int k = 1; k < prefix_length; k++) {
    5312        7800 :           if (atom->data().at(k) != old_atom->data().at(k)) {
    5313             :             prefix_length = k;
    5314             :             break;
    5315             :           }
    5316             :         }
    5317             :       }
    5318             :       RegExpAtom* prefix =
    5319             :           new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
    5320         727 :       ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
    5321             :       pair->Add(prefix, zone);
    5322             :       ZoneList<RegExpTree*>* suffixes =
    5323         727 :           new (zone) ZoneList<RegExpTree*>(run_length, zone);
    5324       11998 :       for (int j = 0; j < run_length; j++) {
    5325             :         RegExpAtom* old_atom =
    5326       22542 :             alternatives->at(j + first_with_prefix)->AsAtom();
    5327             :         int len = old_atom->length();
    5328       11271 :         if (len == prefix_length) {
    5329             :           suffixes->Add(new (zone) RegExpEmpty(), zone);
    5330             :         } else {
    5331             :           RegExpTree* suffix = new (zone) RegExpAtom(
    5332             :               old_atom->data().SubVector(prefix_length, old_atom->length()));
    5333             :           suffixes->Add(suffix, zone);
    5334             :         }
    5335             :       }
    5336         727 :       pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
    5337        1454 :       alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
    5338             :     } else {
    5339             :       // Just copy any non-worthwhile alternatives.
    5340       69618 :       for (int j = first_with_prefix; j < i; j++) {
    5341      139236 :         alternatives->at(write_posn++) = alternatives->at(j);
    5342             :       }
    5343             :     }
    5344             :   }
    5345             :   alternatives->Rewind(write_posn);  // Trim end of array.
    5346       11075 : }
    5347             : 
    5348             : 
    5349             : // Optimizes b|c|z to [bcz].
                       // Walks the alternatives left to right and replaces every run of two or
                       // more adjacent single-character atoms with one RegExpCharacterClass that
                       // holds a singleton range per character.  All other alternatives are kept
                       // unchanged.  The list is rewritten in place and trimmed at the end.
    5350       12507 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
    5351       12507 :     RegExpCompiler* compiler) {
    5352             :   Zone* zone = compiler->zone();
    5353             :   ZoneList<RegExpTree*>* alternatives = this->alternatives();
    5354       12507 :   int length = alternatives->length();
    5355             :   const bool unicode = compiler->unicode();
    5356             : 
                         // write_posn never runs ahead of i: surviving entries are compacted
                         // towards the front of the same list that is being read.
    5357             :   int write_posn = 0;
    5358             :   int i = 0;
    5359      101592 :   while (i < length) {
    5360       76578 :     RegExpTree* alternative = alternatives->at(i);
                           // Non-atoms are copied through unchanged.
    5361       76578 :     if (!alternative->IsAtom()) {
    5362       30460 :       alternatives->at(write_posn++) = alternatives->at(i);
    5363       15230 :       i++;
    5364       15230 :       continue;
    5365             :     }
    5366       61348 :     RegExpAtom* atom = alternative->AsAtom();
                           // Atoms longer than one character are copied through unchanged as well.
    5367       61348 :     if (atom->length() != 1) {
    5368      102590 :       alternatives->at(write_posn++) = alternatives->at(i);
    5369       51295 :       i++;
    5370       51295 :       continue;
    5371             :     }
    5372             :     DCHECK_IMPLIES(unicode,
    5373             :                    !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
    5374             :     bool contains_trail_surrogate =
    5375       10053 :         unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
                           // alternatives[i] starts a run; extend it while the following entries
                           // are single-character atoms too.
    5376             :     int first_in_run = i;
    5377       10053 :     i++;
    5378       28708 :     while (i < length) {
    5379       18251 :       alternative = alternatives->at(i);
    5380       18251 :       if (!alternative->IsAtom()) break;
    5381       17958 :       atom = alternative->AsAtom();
    5382       17958 :       if (atom->length() != 1) break;
    5383             :       DCHECK_IMPLIES(unicode,
    5384             :                      !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
    5385             :       contains_trail_surrogate |=
    5386       17204 :           unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
    5387        8602 :       i++;
    5388             :     }
                           // The run is [first_in_run, i).
    5389       10053 :     if (i > first_in_run + 1) {
    5390             :       // Found non-trivial run of single-character alternatives.
    5391         322 :       int run_length = i - first_in_run;
    5392             :       ZoneList<CharacterRange>* ranges =
    5393         322 :           new (zone) ZoneList<CharacterRange>(2, zone);
    5394        9246 :       for (int j = 0; j < run_length; j++) {
    5395       17848 :         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
    5396             :         DCHECK_EQ(old_atom->length(), 1);
    5397       17848 :         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
    5398             :       }
                             // In unicode mode a run that contained a trail surrogate is tagged so
                             // that the resulting class is known to hold a split surrogate.
    5399             :       RegExpCharacterClass::Flags flags;
    5400         322 :       if (unicode && contains_trail_surrogate) {
    5401             :         flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
    5402             :       }
                             // The whole run collapses into a single list entry.
    5403         322 :       alternatives->at(write_posn++) =
    5404         322 :           new (zone) RegExpCharacterClass(ranges, flags);
    5405             :     } else {
    5406             :       // Just copy any trivial alternatives.
    5407        9731 :       for (int j = first_in_run; j < i; j++) {
    5408       19462 :         alternatives->at(write_posn++) = alternatives->at(j);
    5409             :       }
    5410             :     }
    5411             :   }
    5412             :   alternatives->Rewind(write_posn);  // Trim end of array.
    5413       12507 : }
    5414             : 
    5415             : 
    5416       31412 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
    5417       15858 :                                       RegExpNode* on_success) {
                         // Builds the node graph for this disjunction.  Each alternative is
                         // compiled with on_success as its continuation and the results are
                         // gathered in one ChoiceNode, in list order.
    5418             :   ZoneList<RegExpTree*>* alternatives = this->alternatives();
    5419             : 
                         // The rewriting passes are only attempted for more than two alternatives.
                         // RationalizeConsecutiveAtoms runs only when the sorting pass reports that
                         // it found consecutive atoms.
    5420       43919 :   if (alternatives->length() > 2) {
    5421       12507 :     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
    5422       12507 :     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
    5423       12507 :     FixSingleCharacterDisjunctions(compiler);
                           // The passes may have collapsed the list to one entry, in which case no
                           // ChoiceNode is needed.
    5424       12507 :     if (alternatives->length() == 1) {
    5425         304 :       return alternatives->at(0)->ToNode(compiler, on_success);
    5426             :     }
    5427             :   }
    5428             : 
                         // Re-read the length: the passes above may have shortened the list.
    5429             :   int length = alternatives->length();
    5430             : 
    5431             :   ChoiceNode* result =
    5432       15554 :       new(compiler->zone()) ChoiceNode(length, compiler->zone());
    5433       98527 :   for (int i = 0; i < length; i++) {
    5434             :     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
    5435       82973 :                                                                on_success));
    5436             :     result->AddAlternative(alternative);
    5437             :   }
    5438             :   return result;
    5439             : }
    5440             : 
    5441             : 
    5442     1540393 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
    5443     3080786 :                                      RegExpNode* on_success) {
                         // Forwards this quantifier's own bounds, greediness and body to the
                         // overload that takes them explicitly.  No not_at_start argument is passed
                         // here; presumably the declaration (not shown in this chunk) defaults it.
    5444             :   return ToNode(min(),
    5445             :                 max(),
    5446             :                 is_greedy(),
    5447             :                 body(),
    5448             :                 compiler,
    5449     3080786 :                 on_success);
    5450             : }
    5451             : 
    5452             : 
    5453             : // Scoped object to keep track of how much we unroll quantifier loops in the
    5454             : // regexp graph generator.
                       // On construction the compiler's current expansion factor is multiplied by
                       // |factor|; on destruction the previous value is put back.  ok_to_expand()
                       // reports whether the product stayed within kMaxExpansionFactor.
    5455             : class RegExpExpansionLimiter {
    5456             :  public:
    5457             :   static const int kMaxExpansionFactor = 6;
    5458       82020 :   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
    5459             :       : compiler_(compiler),
    5460             :         saved_expansion_factor_(compiler->current_expansion_factor()),
    5461       82020 :         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
    5462             :     DCHECK(factor > 0);
                           // If the limit was already exceeded on entry, the compiler's factor is
                           // left untouched and expansion stays disallowed.
    5463       96603 :     if (ok_to_expand_) {
    5464       96603 :       if (factor > kMaxExpansionFactor) {
    5465             :         // Avoid integer overflow of the current expansion factor.
    5466             :         ok_to_expand_ = false;
    5467             :         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
    5468             :       } else {
                               // Both operands are at most kMaxExpansionFactor here, so the product
                               // is at most kMaxExpansionFactor squared.
    5469       96447 :         int new_factor = saved_expansion_factor_ * factor;
    5470       96447 :         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
    5471             :         compiler->set_current_expansion_factor(new_factor);
    5472             :       }
    5473             :     }
    5474             :   }
    5475             : 
                         // Restores the expansion factor that was current at construction time.
    5476             :   ~RegExpExpansionLimiter() {
    5477             :     compiler_->set_current_expansion_factor(saved_expansion_factor_);
    5478             :   }
    5479             : 
    5480             :   bool ok_to_expand() { return ok_to_expand_; }
    5481             : 
    5482             :  private:
    5483             :   RegExpCompiler* compiler_;
    5484             :   int saved_expansion_factor_;
    5485             :   bool ok_to_expand_;
    5486             : 
    5487             :   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
    5488             : };
    5489             : 
    5490             : 
    5491     1637877 : RegExpNode* RegExpQuantifier::ToNode(int min,
    5492             :                                      int max,
    5493             :                                      bool is_greedy,
    5494             :                                      RegExpTree* body,
    5495     4894874 :                                      RegExpCompiler* compiler,
    5496             :                                      RegExpNode* on_success,
    5497             :                                      bool not_at_start) {
                         // Builds the node graph for |body| repeated between |min| and |max| times
                         // and continuing to |on_success|.  Small repetition counts are unrolled
                         // when that is allowed; otherwise a LoopChoiceNode with an optional
                         // counter register is generated, as sketched below.
    5498             :   // x{f, t} becomes this:
    5499             :   //
    5500             :   //             (r++)<-.
    5501             :   //               |     `
    5502             :   //               |     (x)
    5503             :   //               v     ^
    5504             :   //      (r=0)-->(?)---/ [if r < t]
    5505             :   //               |
    5506             :   //   [if r >= f] \----> ...
    5507             :   //
    5508             : 
    5509             :   // 15.10.2.5 RepeatMatcher algorithm.
    5510             :   // The parser has already eliminated the case where max is 0.  In the case
    5511             :   // where max_match is zero the parser has removed the quantifier if min was
    5512             :   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
    5513             : 
    5514             :   // If we know that we cannot match zero length then things are a little
    5515             :   // simpler since we don't need to make the special zero length match check
    5516             :   // from step 2.1.  If the min and max are small we can unroll a little in
    5517             :   // this case.
    5518             :   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
    5519             :   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
    5520     1637877 :   if (max == 0) return on_success;  // This can happen due to recursion.
    5521     1636014 :   bool body_can_be_empty = (body->min_match() == 0);
    5522             :   int body_start_reg = RegExpCompiler::kNoRegister;
    5523     1636014 :   Interval capture_registers = body->CaptureRegisters();
    5524     1636014 :   bool needs_capture_clearing = !capture_registers.is_empty();
    5525             :   Zone* zone = compiler->zone();
    5526             : 
    5527     1636014 :   if (body_can_be_empty) {
                           // Register that will hold the position at which the body started, used
                           // by the empty-match check further down.
    5528             :     body_start_reg = compiler->AllocateRegister();
    5529     1635339 :   } else if (compiler->optimize() && !needs_capture_clearing) {
    5530             :     // Only unroll if there are no captures and the body can't be
    5531             :     // empty.
                           // The inner braces end the first limiter's lifetime, so the compiler's
                           // expansion factor is restored before the second limiter is created.
    5532             :     {
    5533             :       RegExpExpansionLimiter limiter(
    5534       82020 :           compiler, min + ((max != min) ? 1 : 0));
    5535       82020 :       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
                               // kInfinity stays kInfinity; otherwise only max - min repetitions
                               // remain after the forced ones.
    5536        9369 :         int new_max = (max == kInfinity) ? max : max - min;
    5537             :         // Recurse once to get the loop or optional matches after the fixed
    5538             :         // ones.
    5539             :         RegExpNode* answer = ToNode(
    5540        9369 :             0, new_max, is_greedy, body, compiler, on_success, true);
    5541             :         // Unroll the forced matches from 0 to min.  This can cause chains of
    5542             :         // TextNodes (which the parser does not generate).  These should be
    5543             :         // combined if it turns out they hinder good code generation.
    5544       22743 :         for (int i = 0; i < min; i++) {
    5545       13374 :           answer = body->ToNode(compiler, answer);
    5546             :         }
    5547             :         return answer;
    5548             :       }
    5549             :     }
    5550       72651 :     if (max <= kMaxUnrolledMaxMatches && min == 0) {
    5551             :       DCHECK(max > 0);  // Due to the 'if' above.
    5552             :       RegExpExpansionLimiter limiter(compiler, max);
    5553       14583 :       if (limiter.ok_to_expand()) {
    5554             :         // Unroll the optional matches up to max.
                               // Each round wraps the graph built so far in a two-way choice
                               // between "match the body, then continue with the previous graph"
                               // and "go straight to on_success".  Greedy quantifiers list the
                               // body alternative first, non-greedy ones list it second.
    5555             :         RegExpNode* answer = on_success;
    5556       14205 :         for (int i = 0; i < max; i++) {
    5557       14205 :           ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
    5558       14205 :           if (is_greedy) {
    5559             :             alternation->AddAlternative(
    5560       14030 :                 GuardedAlternative(body->ToNode(compiler, answer)));
    5561             :             alternation->AddAlternative(GuardedAlternative(on_success));
    5562             :           } else {
    5563             :             alternation->AddAlternative(GuardedAlternative(on_success));
    5564             :             alternation->AddAlternative(
    5565         175 :                 GuardedAlternative(body->ToNode(compiler, answer)));
    5566             :           }
    5567             :           answer = alternation;
    5568       15773 :           if (not_at_start && !compiler->read_backward()) {
    5569             :             alternation->set_not_at_start();
    5570             :           }
    5571             :         }
    5572             :         return answer;
    5573             :       }
    5574             :     }
    5575             :   }
                         // General case: build the loop from the diagram at the top.  A counter
                         // register is only allocated when a lower or an upper bound has to be
                         // enforced.
    5576     1612556 :   bool has_min = min > 0;
    5577     1612556 :   bool has_max = max < RegExpTree::kInfinity;
    5578     1612556 :   bool needs_counter = has_min || has_max;
    5579             :   int reg_ctr = needs_counter
    5580             :       ? compiler->AllocateRegister()
    5581     1612556 :       : RegExpCompiler::kNoRegister;
                         // NOTE(review): the first constructor argument repeats the expression
                         // already stored in body_can_be_empty.
    5582             :   LoopChoiceNode* center = new (zone)
    5583     1612556 :       LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
    5584     1618786 :   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
                         // loop_return is where the body continues after one iteration: back to
                         // the loop node, via a counter increment when a counter is in use.
    5585             :   RegExpNode* loop_return = needs_counter
    5586             :       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
    5587     1612556 :       : static_cast<RegExpNode*>(center);
    5588     1612556 :   if (body_can_be_empty) {
    5589             :     // If the body can be empty we need to check if it was and then
    5590             :     // backtrack.
    5591             :     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
    5592             :                                               reg_ctr,
    5593             :                                               min,
    5594         675 :                                               loop_return);
    5595             :   }
    5596     1612556 :   RegExpNode* body_node = body->ToNode(compiler, loop_return);
    5597     1612556 :   if (body_can_be_empty) {
    5598             :     // If the body can be empty we need to store the start position
    5599             :     // so we can bail out if it was empty.
    5600         675 :     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
    5601             :   }
    5602     1612556 :   if (needs_capture_clearing) {
    5603             :     // Before entering the body of this loop we need to clear captures.
    5604        4107 :     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
    5605             :   }
                         // Entering the body again is guarded by "counter < max" when there is an
                         // upper bound.
    5606             :   GuardedAlternative body_alt(body_node);
    5607     1612556 :   if (has_max) {
    5608             :     Guard* body_guard =
    5609             :         new(zone) Guard(reg_ctr, Guard::LT, max);
    5610     1504583 :     body_alt.AddGuard(body_guard, zone);
    5611             :   }
                         // Leaving the loop is guarded by "counter >= min" when there is a lower
                         // bound.
    5612             :   GuardedAlternative rest_alt(on_success);
    5613     1612556 :   if (has_min) {
                           // NOTE(review): compiler->zone() is what the local |zone| was
                           // initialized from above; the local could be used here as well.
    5614             :     Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
    5615        3167 :     rest_alt.AddGuard(rest_guard, zone);
    5616             :   }
                         // Greedy loops add the loop alternative before the continue alternative;
                         // non-greedy loops add them the other way round.
    5617     1612556 :   if (is_greedy) {
    5618             :     center->AddLoopAlternative(body_alt);
    5619             :     center->AddContinueAlternative(rest_alt);
    5620             :   } else {
    5621             :     center->AddContinueAlternative(rest_alt);
    5622             :     center->AddLoopAlternative(body_alt);
    5623             :   }
                         // With a counter, the loop is entered through a node that resets it to 0.
    5624     1612556 :   if (needs_counter) {
    5625     1506002 :     return ActionNode::SetRegister(reg_ctr, 0, center);
    5626             :   } else {
    5627             :     return center;
    5628             :   }
    5629             : }
    5630             : 
    5631             : namespace {
    5632             : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
    5633             : //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
                       // Returns a two-way ChoiceNode whose alternatives are the two lookaround
                       // pairs above, each continuing to |on_success|.  Only expected to be called
                       // when the compiler needs unicode case equivalents (see the DCHECK).
    5634          36 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
    5635             :                                           RegExpNode* on_success,
    5636             :                                           RegExpAssertion::AssertionType type) {
    5637             :   DCHECK(compiler->needs_unicode_case_equivalents());
    5638             :   Zone* zone = compiler->zone();
                         // \w ranges, requested with add_unicode_case_equivalents set to true.
    5639             :   ZoneList<CharacterRange>* word_range =
    5640          36 :       new (zone) ZoneList<CharacterRange>(2, zone);
    5641          36 :   CharacterRange::AddClassEscape('w', word_range, true, zone);
                         // The same stack and position registers are used by all four lookarounds.
    5642             :   int stack_register = compiler->UnicodeLookaroundStackRegister();
    5643             :   int position_register = compiler->UnicodeLookaroundPositionRegister();
    5644          36 :   ChoiceNode* result = new (zone) ChoiceNode(2, zone);
    5645             :   // Add two choices. The (non-)boundary could start with a word or
    5646             :   // a non-word-character.
    5647         108 :   for (int i = 0; i < 2; i++) {
                           // First round: positive lookbehind for \w.  Second round: negative one.
                           // For BOUNDARY the lookahead has the opposite polarity of the
                           // lookbehind, for NON_BOUNDARY the same polarity.
    5648          72 :     bool lookbehind_for_word = i == 0;
    5649             :     bool lookahead_for_word =
    5650          72 :         (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
    5651             :     // Look to the left.
    5652             :     RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
    5653          72 :                                          stack_register, position_register);
                           // NOTE(review): the third argument is presumably the read-backward
                           // flag (true for the lookbehind, false for the lookahead) - confirm
                           // against TextNode::CreateForCharacterRanges.
    5654             :     RegExpNode* backward = TextNode::CreateForCharacterRanges(
    5655          72 :         zone, word_range, true, lookbehind.on_match_success());
    5656             :     // Look to the right.
                           // The finished lookbehind is handed to the lookahead builder as its
                           // on_success continuation.
    5657             :     RegExpLookaround::Builder lookahead(lookahead_for_word,
    5658             :                                         lookbehind.ForMatch(backward),
    5659          72 :                                         stack_register, position_register);
    5660             :     RegExpNode* forward = TextNode::CreateForCharacterRanges(
    5661          72 :         zone, word_range, false, lookahead.on_match_success());
    5662          72 :     result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
    5663             :   }
    5664          36 :   return result;
    5665             : }
    5666             : }  // anonymous namespace
    5667             : 
    5668        9625 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
    5669        9625 :                                     RegExpNode* on_success) {
                         // Maps each assertion type to the node (or small node graph) that
                         // implements it, continuing to |on_success|.
                         // NOTE(review): |info| is not referenced anywhere else in this function.
    5670             :   NodeInfo info;
    5671             :   Zone* zone = compiler->zone();
    5672             : 
    5673        9625 :   switch (assertion_type()) {
    5674             :     case START_OF_LINE:
    5675         797 :       return AssertionNode::AfterNewline(on_success);
    5676             :     case START_OF_INPUT:
    5677        4673 :       return AssertionNode::AtStart(on_success);
                           // \b and \B are desugared into lookarounds when the compiler needs
                           // unicode case equivalents; otherwise dedicated assertion nodes are used.
    5678             :     case BOUNDARY:
    5679             :       return compiler->needs_unicode_case_equivalents()
    5680             :                  ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
    5681         185 :                  : AssertionNode::AtBoundary(on_success);
    5682             :     case NON_BOUNDARY:
    5683             :       return compiler->needs_unicode_case_equivalents()
    5684             :                  ? BoundaryAssertionAsLookaround(compiler, on_success,
    5685             :                                                  NON_BOUNDARY)
    5686         161 :                  : AssertionNode::AtNonBoundary(on_success);
    5687             :     case END_OF_INPUT:
    5688        3725 :       return AssertionNode::AtEnd(on_success);
    5689             :     case END_OF_LINE: {
    5690             :       // Compile $ in multiline regexps as an alternation with a positive
    5691             :       // lookahead in one side and an end-of-input on the other side.
    5692             :       // We need two registers for the lookahead.
    5693             :       int stack_pointer_register = compiler->AllocateRegister();
    5694             :       int position_register = compiler->AllocateRegister();
    5695             :       // The ChoiceNode to distinguish between a newline and end-of-input.
    5696          84 :       ChoiceNode* result = new(zone) ChoiceNode(2, zone);
    5697             :       // Create a newline atom.
                             // NOTE(review): newline_ranges is filled here but not read again in
                             // this function; newline_atom below is built from the class type 'n'.
    5698             :       ZoneList<CharacterRange>* newline_ranges =
    5699          84 :           new(zone) ZoneList<CharacterRange>(3, zone);
    5700          84 :       CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
    5701             :       RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
    5702             :       TextNode* newline_matcher = new (zone) TextNode(
    5703             :           newline_atom, false, ActionNode::PositiveSubmatchSuccess(
    5704             :                                    stack_pointer_register, position_register,
    5705             :                                    0,   // No captures inside.
    5706             :                                    -1,  // Ignored if no captures.
    5707         168 :                                    on_success));
    5708             :       // Create an end-of-input matcher.
                             // end_of_line wraps the newline matcher in a submatch, forming the
                             // lookahead alternative; the end-of-input node is created further down.
    5709             :       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
    5710             :           stack_pointer_register,
    5711             :           position_register,
    5712          84 :           newline_matcher);
    5713             :       // Add the two alternatives to the ChoiceNode.
    5714             :       GuardedAlternative eol_alternative(end_of_line);
    5715             :       result->AddAlternative(eol_alternative);
    5716          84 :       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
    5717             :       result->AddAlternative(end_alternative);
    5718             :       return result;
    5719             :     }
    5720             :     default:
    5721           0 :       UNREACHABLE();
    5722             :   }
    5723             :   return on_success;
    5724             : }
    5725             : 
    5726             : 
    5727        6180 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
    5728        3090 :                                         RegExpNode* on_success) {
                         // Creates a BackReferenceNode over the start and end registers of the
                         // capture with this back reference's index, using the compiler's current
                         // read direction and continuing to |on_success|.
    5729             :   return new (compiler->zone())
    5730             :       BackReferenceNode(RegExpCapture::StartRegister(index()),
    5731             :                         RegExpCapture::EndRegister(index()),
    5732        3090 :                         compiler->read_backward(), on_success);
    5733             : }
    5734             : 
    5735             : 
    5736        1364 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
    5737             :                                 RegExpNode* on_success) {
                         // An empty pattern adds no node: the continuation is returned unchanged.
    5738        1364 :   return on_success;
    5739             : }
    5740             : 
    5741             : 
    5742        5268 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
    5743             :                                    int stack_pointer_register,
    5744             :                                    int position_register,
    5745             :                                    int capture_register_count,
    5746             :                                    int capture_register_start)
    5747             :     : is_positive_(is_positive),
    5748             :       on_success_(on_success),
    5749             :       stack_pointer_register_(stack_pointer_register),
    5750        5268 :       position_register_(position_register) {
                         // Prepares on_match_success_, the node the lookaround body must continue
                         // to once it has matched.  ForMatch() later wraps the compiled body.
                         // NOTE(review): some callers in this file pass only four arguments, so
                         // the two capture parameters presumably have defaults in the declaration.
    5751        5268 :   if (is_positive_) {
                           // Positive lookaround: a successful body match continues to on_success_.
    5752             :     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
    5753             :         stack_pointer_register, position_register, capture_register_count,
    5754        1753 :         capture_register_start, on_success_);
    5755             :   } else {
                           // Negative lookaround: the success node is not linked to on_success_;
                           // on_success_ is only used here to obtain the zone.
    5756             :     Zone* zone = on_success_->zone();
    5757             :     on_match_success_ = new (zone) NegativeSubmatchSuccess(
    5758             :         stack_pointer_register, position_register, capture_register_count,
    5759        3515 :         capture_register_start, zone);
    5760             :   }
    5761        5268 : }
    5762             : 
    5763             : 
    5764        5268 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
                         // Wraps |match|, the compiled lookaround body, in a BeginSubmatch action
                         // that uses this builder's stack pointer and position registers.
    5765        5268 :   if (is_positive_) {
    5766             :     return ActionNode::BeginSubmatch(stack_pointer_register_,
    5767        1753 :                                      position_register_, match);
    5768             :   } else {
    5769        3515 :     Zone* zone = on_success_->zone();
    5770             :     // We use a ChoiceNode to represent the negative lookaround. The first
    5771             :     // alternative is the negative match. On success, the end node backtracks.
    5772             :     // On failure, the second alternative is tried and leads to success.
    5773             :     // NegativeLookaroundChoiceNode is a special ChoiceNode that ignores the
    5774             :     // first exit when calculating quick checks.
    5775             :     ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
    5776        3515 :         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
    5777             :     return ActionNode::BeginSubmatch(stack_pointer_register_,
    5778        3515 :                                      position_register_, choice_node);
    5779             :   }
    5780             : }
    5781             : 
    5782             : 
    5783        4024 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
    5784        4024 :                                      RegExpNode* on_success) {
                         // Compiles the lookaround body in the direction given by type() and wraps
                         // it with a Builder, using two freshly allocated registers.
    5785             :   int stack_pointer_register = compiler->AllocateRegister();
    5786             :   int position_register = compiler->AllocateRegister();
    5787             : 
                         // Register range handed to the Builder: two registers per capture,
                         // capture_count_ captures starting at capture_from_, offset by the index
                         // of the first capture register (2).
    5788             :   const int registers_per_capture = 2;
    5789             :   const int register_of_first_capture = 2;
    5790        2012 :   int register_count = capture_count_ * registers_per_capture;
    5791             :   int register_start =
    5792        2012 :     register_of_first_capture + capture_from_ * registers_per_capture;
    5793             : 
    5794             :   RegExpNode* result;
                         // The body is compiled reading backward for a lookbehind and forward
                         // otherwise; the compiler's previous direction is restored afterwards.
    5795             :   bool was_reading_backward = compiler->read_backward();
    5796        2012 :   compiler->set_read_backward(type() == LOOKBEHIND);
    5797             :   Builder builder(is_positive(), on_success, stack_pointer_register,
    5798        2012 :                   position_register, register_count, register_start);
    5799        2012 :   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
    5800        2012 :   result = builder.ForMatch(match);
    5801             :   compiler->set_read_backward(was_reading_backward);
    5802        2012 :   return result;
    5803             : }
    5804             : 
    5805             : 
    5806       42614 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
    5807       42614 :                                   RegExpNode* on_success) {
                         // Forwards this capture's own body and index to the overload below.
    5808       42614 :   return ToNode(body(), index(), compiler, on_success);
    5809             : }
    5810             : 
    5811             : 
    5812      135236 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
    5813             :                                   int index,
    5814      135236 :                                   RegExpCompiler* compiler,
    5815             :                                   RegExpNode* on_success) {
                         // Surrounds the compiled |body| with two StorePosition actions: one that
                         // runs before the body and one that runs after it, before |on_success|.
    5816             :   DCHECK_NOT_NULL(body);
    5817             :   int start_reg = RegExpCapture::StartRegister(index);
    5818             :   int end_reg = RegExpCapture::EndRegister(index);
                         // When reading backward the two registers trade places, so the position
                         // stored before the body goes into the capture's end register.
    5819      135236 :   if (compiler->read_backward()) std::swap(start_reg, end_reg);
                         // The graph is built back to front: the store after the body first, then
                         // the body, then the store in front of it.
    5820      135236 :   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
    5821      135236 :   RegExpNode* body_node = body->ToNode(compiler, store_end);
    5822      135236 :   return ActionNode::StorePosition(start_reg, true, body_node);
    5823             : }
    5824             : 
    5825             : 
    5826       63292 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
    5827       31646 :                                       RegExpNode* on_success) {
                         // Chains the child nodes into a sequence ending in |on_success|.  Each
                         // child is compiled with the graph built so far as its continuation, so
                         // the child compiled last ends up at the head of the returned graph.
    5828             :   ZoneList<RegExpTree*>* children = nodes();
    5829             :   RegExpNode* current = on_success;
    5830       31646 :   if (compiler->read_backward()) {
                           // Reading backward: compile first to last, so the last child is the
                           // entry point.
    5831        2286 :     for (int i = 0; i < children->length(); i++) {
    5832       33542 :       current = children->at(i)->ToNode(compiler, current);
    5833             :     }
    5834             :   } else {
                           // Reading forward: compile last to first, so the first child is the
                           // entry point.
    5835     1644910 :     for (int i = children->length() - 1; i >= 0; i--) {
    5836     1613654 :       current = children->at(i)->ToNode(compiler, current);
    5837             :     }
    5838             :   }
    5839       31646 :   return current;
    5840             : }
    5841             : 
    5842             : 
    5843       23640 : static void AddClass(const int* elmv,
    5844             :                      int elmc,
    5845             :                      ZoneList<CharacterRange>* ranges,
    5846             :                      Zone* zone) {
                         // Appends the ranges described by the table |elmv| to |ranges|.  The
                         // table holds |elmc| ints: pairs of (first, one-past-last) followed by a
                         // kRangeEndMarker entry.
                         // Drop the end marker from the count.
    5847       23640 :   elmc--;
    5848             :   DCHECK(elmv[elmc] == kRangeEndMarker);
    5849      164283 :   for (int i = 0; i < elmc; i += 2) {
    5850             :     DCHECK(elmv[i] < elmv[i + 1]);
                           // The table's upper bound is exclusive, so 1 is subtracted from it.
    5851      281286 :     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
    5852             :   }
    5853       23640 : }
    5854             : 
    5855             : 
    5856       32490 : static void AddClassNegated(const int *elmv,
    5857             :                             int elmc,
    5858             :                             ZoneList<CharacterRange>* ranges,
    5859             :                             Zone* zone) {
                         // Appends the complement of the table |elmv| to |ranges|: the gap before
                         // each table range, plus a final range up to String::kMaxCodePoint.  The
                         // table layout is the same as for AddClass.  The DCHECKs require that the
                         // table does not start at 0 and does not end at String::kMaxCodePoint.
    5860       32490 :   elmc--;
    5861             :   DCHECK(elmv[elmc] == kRangeEndMarker);
    5862             :   DCHECK(elmv[0] != 0x0000);
    5863             :   DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
                         // |last| is the first code unit not yet covered by an emitted range.
                         // NOTE(review): |last| is a uc16 while the table entries are ints; this
                         // assumes every non-marker entry fits in 16 bits - confirm against the
                         // range tables.
    5864             :   uc16 last = 0x0000;
    5865      136230 :   for (int i = 0; i < elmc; i += 2) {
    5866             :     DCHECK(last <= elmv[i] - 1);
    5867             :     DCHECK(elmv[i] < elmv[i + 1]);
    5868      207480 :     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
    5869      103740 :     last = elmv[i + 1];
    5870             :   }
                         // Everything after the last table range.
    5871       64980 :   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
    5872       32490 : }
    5873             : 
    5874      144440 : void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
    5875             :                                     bool add_unicode_case_equivalents,
    5876             :                                     Zone* zone) {
                         // Appends the ranges for the class escape |type| to |ranges|.  Only 'w'
                         // and 'W' with |add_unicode_case_equivalents| set are handled here; every
                         // other combination is forwarded to the three-argument overload.
    5877      144440 :   if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
    5878             :     // See #sec-runtime-semantics-wordcharacters-abstract-operation
    5879             :     // In case of unicode and ignore_case, we need to create the closure over
    5880             :     // case equivalent characters before negating.
                           // The ranges are built in a scratch list first and only appended to
                           // |ranges| at the end.
    5881             :     ZoneList<CharacterRange>* new_ranges =
    5882         110 :         new (zone) ZoneList<CharacterRange>(2, zone);
    5883         110 :     AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
    5884         110 :     AddUnicodeCaseEquivalents(new_ranges, zone);
    5885         110 :     if (type == 'W') {
                             // Negation needs a separate output list, which then replaces the
                             // scratch list.
    5886             :       ZoneList<CharacterRange>* negated =
    5887          36 :           new (zone) ZoneList<CharacterRange>(2, zone);
    5888          36 :       CharacterRange::Negate(new_ranges, negated, zone);
    5889             :       new_ranges = negated;
    5890             :     }
    5891             :     ranges->AddAll(*new_ranges, zone);
    5892      144440 :     return;
    5893             :   }
    5894      144330 :   AddClassEscape(type, ranges, zone);
    5895             : }
    5896             : 
    5897      144379 : void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
    5898             :                                     Zone* zone) {
    5899      144379 :   switch (type) {
    5900             :     case 's':
    5901        9997 :       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
    5902        9997 :       break;
    5903             :     case 'S':
    5904         934 :       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
    5905         934 :       break;
    5906             :     case 'w':
    5907        8788 :       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
    5908        8788 :       break;
    5909             :     case 'W':
    5910         378 :       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
    5911         378 :       break;
    5912             :     case 'd':
    5913        4577 :       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
    5914        4577 :       break;
    5915             :     case 'D':
    5916         323 :       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
    5917         323 :       break;
    5918             :     case '.':
    5919             :       AddClassNegated(kLineTerminatorRanges,
    5920             :                       kLineTerminatorRangeCount,
    5921             :                       ranges,
    5922       30855 :                       zone);
    5923       30855 :       break;
    5924             :     // This is not a character range as defined by the spec but a
    5925             :     // convenient shorthand for a character class that matches any
    5926             :     // character.
    5927             :     case '*':
    5928      176718 :       ranges->Add(CharacterRange::Everything(), zone);
    5929       88359 :       break;
    5930             :     // This is the set of characters matched by the $ and ^ symbols
    5931             :     // in multiline mode.
    5932             :     case 'n':
    5933             :       AddClass(kLineTerminatorRanges,
    5934             :                kLineTerminatorRangeCount,
    5935             :                ranges,
    5936         168 :                zone);
    5937         168 :       break;
    5938             :     default:
    5939           0 :       UNREACHABLE();
    5940             :   }
    5941      144379 : }
    5942             : 
    5943             : 
    5944           0 : Vector<const int> CharacterRange::GetWordBounds() {
    5945           0 :   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
    5946             : }
    5947             : 
    5948             : 
    5949       74564 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
    5950             :                                         ZoneList<CharacterRange>* ranges,
    5951             :                                         bool is_one_byte) {
    5952       74564 :   CharacterRange::Canonicalize(ranges);
    5953       74564 :   int range_count = ranges->length();
    5954      310576 :   for (int i = 0; i < range_count; i++) {
    5955       86069 :     CharacterRange range = ranges->at(i);
    5956             :     uc32 bottom = range.from();
    5957       91414 :     if (bottom > String::kMaxUtf16CodeUnit) return;
    5958             :     uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
    5959             :     // Nothing to be done for surrogates.
    5960       86069 :     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
    5961       80884 :     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
    5962        8237 :       if (bottom > String::kMaxOneByteCharCode) return;
    5963        8077 :       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
    5964             :     }
    5965             :     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    5966       80724 :     if (top == bottom) {
    5967             :       // If this is a singleton we just expand the one character.
    5968        5976 :       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
    5969       10138 :       for (int i = 0; i < length; i++) {
    5970        4162 :         uc32 chr = chars[i];
    5971        4162 :         if (chr != bottom) {
    5972        4312 :           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
    5973             :         }
    5974             :       }
    5975             :     } else {
    5976             :       // If this is a range we expand the characters block by block, expanding
    5977             :       // contiguous subranges (blocks) one at a time.  The approach is as
    5978             :       // follows.  For a given start character we look up the remainder of the
    5979             :       // block that contains it (represented by the end point), for instance we
    5980             :       // find 'z' if the character is 'c'.  A block is characterized by the
    5981             :       // property that all characters uncanonicalize in the same way, except
    5982             :       // that each entry in the result is incremented by the distance from the
    5983             :       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
    5984             :       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
    5985             :       // we've found the end point we look up its uncanonicalization and
    5986             :       // produce a range for each element.  For instance for [c-f] we look up
    5987             :       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
    5988             :       // it is not already contained in the input, so [c-f] will be skipped but
    5989             :       // [C-F] will be added.  If this range is not completely contained in a
    5990             :       // block we do this for all the blocks covered by the range (handling
    5991             :       // characters that is not in a block as a "singleton block").
    5992             :       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    5993             :       int pos = bottom;
    5994    27635281 :       while (pos <= top) {
    5995             :         int length =
    5996    27560533 :             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
    5997             :         uc32 block_end;
    5998    27560533 :         if (length == 0) {
    5999             :           block_end = pos;
    6000             :         } else {
    6001             :           DCHECK_EQ(1, length);
    6002       26159 :           block_end = equivalents[0];
    6003             :         }
    6004    27560533 :         int end = (block_end > top) ? top : block_end;
    6005             :         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
    6006    27560533 :                                                          equivalents);
    6007    28780360 :         for (int i = 0; i < length; i++) {
    6008     1219827 :           uc32 c = equivalents[i];
    6009     1219827 :           uc32 range_from = c - (block_end - pos);
    6010     1219827 :           uc32 range_to = c - (block_end - end);
    6011     1219827 :           if (!(bottom <= range_from && range_to <= top)) {
    6012       44648 :             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
    6013             :           }
    6014             :         }
    6015    27560533 :         pos = end + 1;
    6016             :       }
    6017             :     }
    6018             :   }
    6019             : }
    6020             : 
    6021             : 
    6022          14 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
    6023             :   DCHECK_NOT_NULL(ranges);
    6024          14 :   int n = ranges->length();
    6025          14 :   if (n <= 1) return true;
    6026          14 :   int max = ranges->at(0).to();
    6027         420 :   for (int i = 1; i < n; i++) {
    6028         406 :     CharacterRange next_range = ranges->at(i);
    6029         406 :     if (next_range.from() <= max + 1) return false;
    6030             :     max = next_range.to();
    6031             :   }
    6032             :   return true;
    6033             : }
    6034             : 
    6035             : 
    6036     2212529 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
    6037     2212529 :   if (ranges_ == NULL) {
    6038       88395 :     ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
    6039       88395 :     CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
    6040             :   }
    6041     2212529 :   return ranges_;
    6042             : }
    6043             : 
    6044             : 
    6045             : // Move a number of elements in a zonelist to another position
    6046             : // in the same list. Handles overlapping source and target areas.
    6047      113987 : static void MoveRanges(ZoneList<CharacterRange>* list,
    6048             :                        int from,
    6049             :                        int to,
    6050             :                        int count) {
    6051             :   // Ranges are potentially overlapping.
    6052      113987 :   if (from < to) {
    6053    11651852 :     for (int i = count - 1; i >= 0; i--) {
    6054    34676229 :       list->at(to + i) = list->at(from + i);
    6055             :     }
    6056             :   } else {
    6057     6394988 :     for (int i = 0; i < count; i++) {
    6058    19184964 :       list->at(to + i) = list->at(from + i);
    6059             :     }
    6060             :   }
    6061      113987 : }
    6062             : 
    6063             : 
    6064      188829 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
    6065             :                                       int count,
    6066             :                                       CharacterRange insert) {
    6067             :   // Inserts a range into list[0..count[, which must be sorted
    6068             :   // by from value and non-overlapping and non-adjacent, using at most
    6069             :   // list[0..count] for the result. Returns the number of resulting
    6070             :   // canonicalized ranges. Inserting a range may collapse existing ranges into
    6071             :   // fewer ranges, so the return value can be anything in the range 1..count+1.
    6072      188829 :   uc32 from = insert.from();
    6073      188829 :   uc32 to = insert.to();
    6074             :   int start_pos = 0;
    6075             :   int end_pos = count;
    6076    23727636 :   for (int i = count - 1; i >= 0; i--) {
    6077    23628799 :     CharacterRange current = list->at(i);
    6078    23628799 :     if (current.from() > to + 1) {
    6079             :       end_pos = i;
    6080      179830 :     } else if (current.to() + 1 < from) {
    6081       89992 :       start_pos = i + 1;
    6082             :       break;
    6083             :     }
    6084             :   }
    6085             : 
    6086             :   // Inserted range overlaps, or is adjacent to, ranges at positions
    6087             :   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
    6088             :   // not affected by the insertion.
    6089             :   // If start_pos == end_pos, the range must be inserted before start_pos.
    6090             :   // if start_pos < end_pos, the entire range from start_pos to end_pos
    6091             :   // must be merged with the insert range.
    6092             : 
    6093      188829 :   if (start_pos == end_pos) {
    6094             :     // Insert between existing ranges at position start_pos.
    6095      120391 :     if (start_pos < count) {
    6096       93109 :       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
    6097             :     }
    6098      120391 :     list->at(start_pos) = insert;
    6099      120391 :     return count + 1;
    6100             :   }
    6101       68438 :   if (start_pos + 1 == end_pos) {
    6102             :     // Replace single existing range at position start_pos.
    6103       47344 :     CharacterRange to_replace = list->at(start_pos);
    6104             :     int new_from = Min(to_replace.from(), from);
    6105             :     int new_to = Max(to_replace.to(), to);
    6106       47344 :     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
    6107             :     return count;
    6108             :   }
    6109             :   // Replace a number of existing ranges from start_pos to end_pos - 1.
    6110             :   // Move the remaining ranges down.
    6111             : 
    6112       21094 :   int new_from = Min(list->at(start_pos).from(), from);
    6113       42188 :   int new_to = Max(list->at(end_pos - 1).to(), to);
    6114       21094 :   if (end_pos < count) {
    6115       20878 :     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
    6116             :   }
    6117       21094 :   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
    6118       21094 :   return count - (end_pos - start_pos) + 1;
    6119             : }
    6120             : 
    6121             : 
    6122          28 : void CharacterSet::Canonicalize() {
    6123             :   // Special/default classes are always considered canonical. The result
    6124             :   // of calling ranges() will be sorted.
    6125      201280 :   if (ranges_ == NULL) return;
    6126      113137 :   CharacterRange::Canonicalize(ranges_);
    6127             : }
    6128             : 
    6129             : 
    6130      585106 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
    6131      585106 :   if (character_ranges->length() <= 1) return;
    6132             :   // Check whether ranges are already canonical (increasing, non-overlapping,
    6133             :   // non-adjacent).
    6134             :   int n = character_ranges->length();
    6135      100285 :   int max = character_ranges->at(0).to();
    6136             :   int i = 1;
    6137     1704636 :   while (i < n) {
    6138     1518326 :     CharacterRange current = character_ranges->at(i);
    6139     1518326 :     if (current.from() <= max + 1) {
    6140             :       break;
    6141             :     }
    6142             :     max = current.to();
    6143     1504066 :     i++;
    6144             :   }
    6145             :   // Canonical until the i'th range. If that's all of them, we are done.
    6146      100285 :   if (i == n) return;
    6147             : 
    6148             :   // The ranges at index i and forward are not canonicalized. Make them so by
    6149             :   // doing the equivalent of insertion sort (inserting each into the previous
    6150             :   // list, in order).
    6151             :   // Notice that inserting a range can reduce the number of ranges in the
    6152             :   // result due to combining of adjacent and overlapping ranges.
    6153             :   int read = i;  // Range to insert.
    6154             :   int num_canonical = i;  // Length of canonicalized part of list.
    6155      188829 :   do {
    6156             :     num_canonical = InsertRangeInCanonicalList(character_ranges,
    6157             :                                                num_canonical,
    6158      188829 :                                                character_ranges->at(read));
    6159      188829 :     read++;
    6160             :   } while (read < n);
    6161             :   character_ranges->Rewind(num_canonical);
    6162             : 
    6163             :   DCHECK(CharacterRange::IsCanonical(character_ranges));
    6164             : }
    6165             : 
    6166             : 
    6167         189 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
    6168             :                             ZoneList<CharacterRange>* negated_ranges,
    6169             :                             Zone* zone) {
    6170             :   DCHECK(CharacterRange::IsCanonical(ranges));
    6171             :   DCHECK_EQ(0, negated_ranges->length());
    6172         189 :   int range_count = ranges->length();
    6173             :   uc32 from = 0;
    6174             :   int i = 0;
    6175         378 :   if (range_count > 0 && ranges->at(0).from() == 0) {
    6176          32 :     from = ranges->at(0).to() + 1;
    6177             :     i = 1;
    6178             :   }
    6179        8219 :   while (i < range_count) {
    6180        8030 :     CharacterRange range = ranges->at(i);
    6181       16060 :     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
    6182        8030 :     from = range.to() + 1;
    6183        8030 :     i++;
    6184             :   }
    6185         189 :   if (from < String::kMaxCodePoint) {
    6186             :     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
    6187         290 :                         zone);
    6188             :   }
    6189         189 : }
    6190             : 
    6191             : 
    6192             : // -------------------------------------------------------------------
    6193             : // Splay tree
    6194             : 
    6195             : 
    6196      565258 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
    6197      272098 :   if (Get(value))
    6198             :     return this;
    6199      272091 :   if (successors(zone) != NULL) {
    6200      207466 :     for (int i = 0; i < successors(zone)->length(); i++) {
    6201      458488 :       OutSet* successor = successors(zone)->at(i);
    6202      458488 :       if (successor->Get(value))
    6203             :         return successor;
    6204             :     }
    6205             :   } else {
    6206        7054 :     successors_ = new(zone) ZoneList<OutSet*>(2, zone);
    6207             :   }
    6208       21069 :   OutSet* result = new(zone) OutSet(first_, remaining_);
    6209       21069 :   result->Set(value, zone);
    6210             :   successors(zone)->Add(result, zone);
    6211       21069 :   return result;
    6212             : }
    6213             : 
    6214             : 
    6215      983252 : void OutSet::Set(unsigned value, Zone *zone) {
    6216      983252 :   if (value < kFirstLimit) {
    6217      493969 :     first_ |= (1 << value);
    6218             :   } else {
    6219      489283 :     if (remaining_ == NULL)
    6220      132833 :       remaining_ = new(zone) ZoneList<unsigned>(1, zone);
    6221      845733 :     if (remaining_->is_empty() || !remaining_->Contains(value))
    6222             :       remaining_->Add(value, zone);
    6223             :   }
    6224      983252 : }
    6225             : 
    6226             : 
    6227    44156534 : bool OutSet::Get(unsigned value) const {
    6228    44156534 :   if (value < kFirstLimit) {
    6229     9352136 :     return (first_ & (1 << value)) != 0;
    6230    34804398 :   } else if (remaining_ == NULL) {
    6231             :     return false;
    6232             :   } else {
    6233    46719360 :     return remaining_->Contains(value);
    6234             :   }
    6235             : }
    6236             : 
    6237             : 
    // Out-of-class definition of Config::kNoKey, initialized from
    // unibrow::Utf8::kBadChar.
    // NOTE(review): presumably the "no key" sentinel required by the splay tree
    // configuration -- confirm against the Config declaration.
    6238             : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
    6239             : 
    6240             : 
    6241      102372 : void DispatchTable::AddRange(CharacterRange full_range, int value,
    6242             :                              Zone* zone) {
    6243      102372 :   CharacterRange current = full_range;
    6244      102372 :   if (tree()->is_empty()) {
    6245             :     // If this is the first range we just insert into the table.
    6246             :     ZoneSplayTree<Config>::Locator loc;
    6247        3247 :     bool inserted = tree()->Insert(current.from(), &loc);
    6248             :     DCHECK(inserted);
    6249             :     USE(inserted);
    6250             :     loc.set_value(Entry(current.from(), current.to(),
    6251        3247 :                         empty()->Extend(value, zone)));
    6252      102372 :     return;
    6253             :   }
    6254             :   // First see if there is a range to the left of this one that
    6255             :   // overlaps.
    6256             :   ZoneSplayTree<Config>::Locator loc;
    6257       99125 :   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
    6258      188508 :     Entry* entry = &loc.value();
    6259             :     // If we've found a range that overlaps with this one, and it
    6260             :     // starts strictly to the left of this one, we have to fix it
    6261             :     // because the following code only handles ranges that start on
    6262             :     // or after the start point of the range we're adding.
    6263      187388 :     if (entry->from() < current.from() && entry->to() >= current.from()) {
    6264             :       // Snap the overlapping range in half around the start point of
    6265             :       // the range we're adding.
    6266             :       CharacterRange left =
    6267         560 :           CharacterRange::Range(entry->from(), current.from() - 1);
    6268             :       CharacterRange right = CharacterRange::Range(current.from(), entry->to());
    6269             :       // The left part of the overlapping range doesn't overlap.
    6270             :       // Truncate the whole entry to be just the left part.
    6271             :       entry->set_to(left.to());
    6272             :       // The right part is the one that overlaps.  We add this part
    6273             :       // to the map and let the next step deal with merging it with
    6274             :       // the range we're adding.
    6275             :       ZoneSplayTree<Config>::Locator loc;
    6276         560 :       bool inserted = tree()->Insert(right.from(), &loc);
    6277             :       DCHECK(inserted);
    6278             :       USE(inserted);
    6279             :       loc.set_value(Entry(right.from(),
    6280             :                           right.to(),
    6281             :                           entry->out_set()));
    6282             :     }
    6283             :   }
    6284      192933 :   while (current.is_valid()) {
    6285      469090 :     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
    6286      377375 :         (loc.value().from() <= current.to()) &&
    6287       93808 :         (loc.value().to() >= current.from())) {
    6288      370659 :       Entry* entry = &loc.value();
    6289             :       // We have overlap.  If there is space between the start point of
    6290             :       // the range we're adding and where the overlapping range starts
    6291             :       // then we have to add a range covering just that space.
    6292       93808 :       if (current.from() < entry->from()) {
    6293             :         ZoneSplayTree<Config>::Locator ins;
    6294       83328 :         bool inserted = tree()->Insert(current.from(), &ins);
    6295             :         DCHECK(inserted);
    6296             :         USE(inserted);
    6297             :         ins.set_value(Entry(current.from(),
    6298             :                             entry->from() - 1,
    6299      166656 :                             empty()->Extend(value, zone)));
    6300             :         current.set_from(entry->from());
    6301             :       }
    6302             :       DCHECK_EQ(current.from(), entry->from());
    6303             :       // If the overlapping range extends beyond the one we want to add
    6304             :       // we have to snap the right part off and add it separately.
    6305       93808 :       if (entry->to() > current.to()) {
    6306             :         ZoneSplayTree<Config>::Locator ins;
    6307        5907 :         bool inserted = tree()->Insert(current.to() + 1, &ins);
    6308             :         DCHECK(inserted);
    6309             :         USE(inserted);
    6310             :         ins.set_value(Entry(current.to() + 1,
    6311             :                             entry->to(),
    6312             :                             entry->out_set()));
    6313             :         entry->set_to(current.to());
    6314             :       }
    6315             :       DCHECK(entry->to() <= current.to());
    6316             :       // The overlapping range is now completely contained by the range
    6317             :       // we're adding so we can just update it and move the start point
    6318             :       // of the range we're adding just past it.
    6319             :       entry->AddValue(value, zone);
    6320             :       DCHECK(entry->to() + 1 > current.from());
    6321       93808 :       current.set_from(entry->to() + 1);
    6322             :     } else {
    6323             :       // There is no overlap so we can just add the range
    6324             :       ZoneSplayTree<Config>::Locator ins;
    6325       91715 :       bool inserted = tree()->Insert(current.from(), &ins);
    6326             :       DCHECK(inserted);
    6327             :       USE(inserted);
    6328             :       ins.set_value(Entry(current.from(),
    6329             :                           current.to(),
    6330       91715 :                           empty()->Extend(value, zone)));
    6331             :       break;
    6332             :     }
    6333             :   }
    6334             : }
    6335             : 
    6336             : 
    6337       77014 : OutSet* DispatchTable::Get(uc32 value) {
    6338             :   ZoneSplayTree<Config>::Locator loc;
    6339       77014 :   if (!tree()->FindGreatestLessThan(value, &loc))
    6340           0 :     return empty();
    6341      131453 :   Entry* entry = &loc.value();
    6342       77014 :   if (value <= entry->to())
    6343       54439 :     return entry->out_set();
    6344             :   else
    6345       22575 :     return empty();
    6346             : }
    6347             : 
    6348             : 
    6349             : // -------------------------------------------------------------------
    6350             : // Analysis
    6351             : 
    6352             : 
    6353     1340339 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
    6354             :   StackLimitCheck check(isolate());
    6355     1340339 :   if (check.HasOverflowed()) {
    6356             :     fail("Stack overflow");
    6357             :     return;
    6358             :   }
    6359     1339898 :   if (that->info()->been_analyzed || that->info()->being_analyzed)
    6360             :     return;
    6361     1116810 :   that->info()->being_analyzed = true;
    6362     1116810 :   that->Accept(this);
    6363     1116810 :   that->info()->being_analyzed = false;
    6364     1116810 :   that->info()->been_analyzed = true;
    6365             : }
    6366             : 
    6367             : 
    6368       95682 : void Analysis::VisitEnd(EndNode* that) {
    6369             :   // nothing to do
    6370       95682 : }
    6371             : 
    6372             : 
    6373      747987 : void TextNode::CalculateOffsets() {
    6374      356864 :   int element_count = elements()->length();
    6375             :   // Set up the offsets of the elements relative to the start.  This is a fixed
    6376             :   // quantity since a TextNode can only contain fixed-width things.
    6377             :   int cp_offset = 0;
    6378      747987 :   for (int i = 0; i < element_count; i++) {
    6379             :     TextElement& elm = elements()->at(i);
    6380             :     elm.set_cp_offset(cp_offset);
    6381      391123 :     cp_offset += elm.length();
    6382             :   }
    6383      356864 : }
    6384             : 
    6385             : 
    6386      875050 : void Analysis::VisitText(TextNode* that) {
    6387      359639 :   if (ignore_case()) {
    6388      311544 :     that->MakeCaseIndependent(isolate(), is_one_byte_);
    6389             :   }
    6390      359639 :   EnsureAnalyzed(that->on_success());
    6391      359639 :   if (!has_failed()) {
    6392      356864 :     that->CalculateOffsets();
    6393             :   }
    6394      359639 : }
    6395             : 
    6396             : 
    6397      791766 : void Analysis::VisitAction(ActionNode* that) {
    6398      395883 :   RegExpNode* target = that->on_success();
    6399      395883 :   EnsureAnalyzed(target);
    6400      395883 :   if (!has_failed()) {
    6401             :     // If the next node is interested in what it follows then this node
    6402             :     // has to be interested too so it can pass the information on.
    6403             :     that->info()->AddFromFollowing(target->info());
    6404             :   }
    6405      395883 : }
    6406             : 
    6407             : 
    6408      377230 : void Analysis::VisitChoice(ChoiceNode* that) {
    6409             :   NodeInfo* info = that->info();
    6410      377230 :   for (int i = 0; i < that->alternatives()->length(); i++) {
    6411      151901 :     RegExpNode* node = that->alternatives()->at(i).node();
    6412      151901 :     EnsureAnalyzed(node);
    6413      188615 :     if (has_failed()) return;
    6414             :     // Anything the following nodes need to know has to be known by
    6415             :     // this node also, so it can pass it on.
    6416             :     info->AddFromFollowing(node->info());
    6417             :   }
    6418             : }
    6419             : 
    6420             : 
    6421     1199181 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
    6422             :   NodeInfo* info = that->info();
    6423     1087570 :   for (int i = 0; i < that->alternatives()->length(); i++) {
    6424      976168 :     RegExpNode* node = that->alternatives()->at(i).node();
    6425      432383 :     if (node != that->loop_node()) {
    6426      216296 :       EnsureAnalyzed(node);
    6427      432592 :       if (has_failed()) return;
    6428             :       info->AddFromFollowing(node->info());
    6429             :     }
    6430             :   }
    6431             :   // Check the loop last since it may need the value of this node
    6432             :   // to get a correct result.
    6433      111402 :   EnsureAnalyzed(that->loop_node());
    6434      111402 :   if (!has_failed()) {
    6435             :     info->AddFromFollowing(that->loop_node()->info());
    6436             :   }
    6437             : }
    6438             : 
    6439             : 
    6440        3008 : void Analysis::VisitBackReference(BackReferenceNode* that) {
    6441        3008 :   EnsureAnalyzed(that->on_success());
    6442        3008 : }
    6443             : 
    6444             : 
    6445        9588 : void Analysis::VisitAssertion(AssertionNode* that) {
    6446        9588 :   EnsureAnalyzed(that->on_success());
    6447        9588 : }
    6448             : 
    6449             : 
    6450         207 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
    6451             :                                      BoyerMooreLookahead* bm,
    6452             :                                      bool not_at_start) {
    6453             :   // Working out the set of characters that a backreference can match is too
    6454             :   // hard, so we just say that any character can match.
    6455             :   bm->SetRest(offset);
    6456             :   SaveBMInfo(bm, not_at_start, offset);
    6457         207 : }
    6458             : 
    6459             : 
    6460             : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
    6461             :               RegExpMacroAssembler::kTableSize);
    6462             : 
    6463             : 
    6464       19651 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
    6465       19651 :                               BoyerMooreLookahead* bm, bool not_at_start) {
    6466             :   ZoneList<GuardedAlternative>* alts = alternatives();
    6467       92671 :   budget = (budget - 1) / alts->length();
    6468      146040 :   for (int i = 0; i < alts->length(); i++) {
    6469      107019 :     GuardedAlternative& alt = alts->at(i);
    6470       53650 :     if (alt.guards() != NULL && alt.guards()->length() != 0) {
    6471             :       bm->SetRest(offset);  // Give up trying to fill in info.
    6472             :       SaveBMInfo(bm, not_at_start, offset);
    6473       19651 :       return;
    6474             :     }
    6475       53369 :     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
    6476             :   }
    6477             :   SaveBMInfo(bm, not_at_start, offset);
    6478             : }
    6479             : 
    6480             : 
    6481      146765 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
    6482     1261095 :                             BoyerMooreLookahead* bm, bool not_at_start) {
    6483      146765 :   if (initial_offset >= bm->length()) return;
    6484             :   int offset = initial_offset;
    6485             :   int max_char = bm->max_char();
    6486      587778 :   for (int i = 0; i < elements()->length(); i++) {
    6487      166700 :     if (offset >= bm->length()) {
    6488      134180 :       if (initial_offset == 0) set_bm_info(not_at_start, bm);
    6489             :       return;
    6490             :     }
    6491      152304 :     TextElement text = elements()->at(i);
    6492      152304 :     if (text.text_type() == TextElement::ATOM) {
    6493             :       RegExpAtom* atom = text.atom();
    6494      123870 :       for (int j = 0; j < atom->length(); j++, offset++) {
    6495       50855 :         if (offset >= bm->length()) {
    6496        5180 :           if (initial_offset == 0) set_bm_info(not_at_start, bm);
    6497             :           return;
    6498             :         }
    6499       45675 :         uc16 character = atom->data()[j];
    6500       45675 :         if (bm->compiler()->ignore_case()) {
    6501             :           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    6502             :           int length = GetCaseIndependentLetters(
    6503             :               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
    6504        5974 :               chars);
    6505       17048 :           for (int j = 0; j < length; j++) {
    6506       22148 :             bm->Set(offset, chars[j]);
    6507             :           }
    6508             :         } else {
    6509       79402 :           if (character <= max_char) bm->Set(offset, character);
    6510             :         }
    6511             :       }
    6512             :     } else {
    6513             :       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
    6514             :       RegExpCharacterClass* char_class = text.char_class();
    6515             :       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
    6516      119784 :       if (char_class->is_negated()) {
    6517        5154 :         bm->SetAll(offset);
    6518             :       } else {
    6519      899268 :         for (int k = 0; k < ranges->length(); k++) {
    6520     1120622 :           CharacterRange& range = ranges->at(k);
    6521      392319 :           if (range.from() > max_char) continue;
    6522             :           int to = Min(max_char, static_cast<int>(range.to()));
    6523      221354 :           bm->SetInterval(offset, Interval(range.from(), to));
    6524             :         }
    6525             :       }
    6526      119784 :       offset++;
    6527             :     }
    6528             :   }
    6529      127189 :   if (offset >= bm->length()) {
    6530       99226 :     if (initial_offset == 0) set_bm_info(not_at_start, bm);
    6531             :     return;
    6532             :   }
    6533       27963 :   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
    6534       27963 :                              true);  // Not at start after a text node.
    6535       27963 :   if (initial_offset == 0) set_bm_info(not_at_start, bm);
    6536             : }
    6537             : 
    6538             : 
    6539             : // -------------------------------------------------------------------
    6540             : // Dispatch table construction
    6541             : 
    6542             : 
    6543           0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
    6544             :   AddRange(CharacterRange::Everything());
    6545           0 : }
    6546             : 
    6547             : 
    6548           0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
    6549             :   node->set_being_calculated(true);
    6550             :   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
    6551           0 :   for (int i = 0; i < alternatives->length(); i++) {
    6552             :     set_choice_index(i);
    6553           0 :     alternatives->at(i).node()->Accept(this);
    6554             :   }
    6555             :   node->set_being_calculated(false);
    6556           0 : }
    6557             : 
    6558             : 
    6559             : class AddDispatchRange {
    6560             :  public:
    6561             :   explicit AddDispatchRange(DispatchTableConstructor* constructor)
    6562           0 :     : constructor_(constructor) { }
    6563             :   void Call(uc32 from, DispatchTable::Entry entry);
    6564             :  private:
    6565             :   DispatchTableConstructor* constructor_;
    6566             : };
    6567             : 
    6568             : 
    6569           0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
    6570           0 :   constructor_->AddRange(CharacterRange::Range(from, entry.to()));
    6571           0 : }
    6572             : 
    6573             : 
    6574           0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
    6575           0 :   if (node->being_calculated())
    6576           0 :     return;
    6577           0 :   DispatchTable* table = node->GetTable(ignore_case_);
    6578             :   AddDispatchRange adder(this);
    6579             :   table->ForEach(&adder);
    6580             : }
    6581             : 
    6582             : 
    6583           0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
    6584             :   // TODO(160): Find the node that we refer back to and propagate its start
    6585             :   // set back to here.  For now we just accept anything.
    6586             :   AddRange(CharacterRange::Everything());
    6587           0 : }
    6588             : 
    6589             : 
    6590           0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
    6591           0 :   RegExpNode* target = that->on_success();
    6592           0 :   target->Accept(this);
    6593           0 : }
    6594             : 
    6595             : 
    6596       11018 : static int CompareRangeByFrom(const CharacterRange* a,
    6597        5509 :                               const CharacterRange* b) {
    6598       16527 :   return Compare<uc16>(a->from(), b->from());
    6599             : }
    6600             : 
    6601             : 
    6602          77 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
    6603        1204 :   ranges->Sort(CompareRangeByFrom);
    6604             :   uc16 last = 0;
    6605        2408 :   for (int i = 0; i < ranges->length(); i++) {
    6606        1127 :     CharacterRange range = ranges->at(i);
    6607        1127 :     if (last < range.from())
    6608         735 :       AddRange(CharacterRange::Range(last, range.from() - 1));
    6609        1127 :     if (range.to() >= last) {
    6610        1001 :       if (range.to() == String::kMaxCodePoint) {
    6611          77 :         return;
    6612             :       } else {
    6613        1001 :         last = range.to() + 1;
    6614             :       }
    6615             :     }
    6616             :   }
    6617          77 :   AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
    6618             : }
    6619             : 
    6620             : 
    6621           0 : void DispatchTableConstructor::VisitText(TextNode* that) {
    6622           0 :   TextElement elm = that->elements()->at(0);
    6623           0 :   switch (elm.text_type()) {
    6624             :     case TextElement::ATOM: {
    6625           0 :       uc16 c = elm.atom()->data()[0];
    6626           0 :       AddRange(CharacterRange::Range(c, c));
    6627             :       break;
    6628             :     }
    6629             :     case TextElement::CHAR_CLASS: {
    6630             :       RegExpCharacterClass* tree = elm.char_class();
    6631           0 :       ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
    6632           0 :       if (tree->is_negated()) {
    6633           0 :         AddInverse(ranges);
    6634             :       } else {
    6635           0 :         for (int i = 0; i < ranges->length(); i++)
    6636           0 :           AddRange(ranges->at(i));
    6637             :       }
    6638             :       break;
    6639             :     }
    6640             :     default: {
    6641           0 :       UNIMPLEMENTED();
    6642             :     }
    6643             :   }
    6644           0 : }
    6645             : 
    6646             : 
    6647           0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
    6648           0 :   RegExpNode* target = that->on_success();
    6649           0 :   target->Accept(this);
    6650           0 : }
    6651             : 
    6652             : 
    6653          50 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
    6654             :                                               RegExpNode* on_success) {
    6655             :   // If the regexp matching starts within a surrogate pair, step back
    6656             :   // to the lead surrogate and start matching from there.
    6657             :   DCHECK(!compiler->read_backward());
    6658             :   Zone* zone = compiler->zone();
    6659             :   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
    6660          50 :       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
    6661             :   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
    6662          50 :       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
    6663             : 
    6664          50 :   ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
    6665             : 
    6666             :   int stack_register = compiler->UnicodeLookaroundStackRegister();
    6667             :   int position_register = compiler->UnicodeLookaroundPositionRegister();
    6668             :   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
    6669          50 :       zone, lead_surrogates, true, on_success);
    6670             :   RegExpLookaround::Builder builder(true, step_back, stack_register,
    6671          50 :                                     position_register);
    6672             :   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
    6673          50 :       zone, trail_surrogates, false, builder.on_match_success());
    6674             : 
    6675             :   optional_step_back->AddAlternative(
    6676          50 :       GuardedAlternative(builder.ForMatch(match_trail)));
    6677             :   optional_step_back->AddAlternative(GuardedAlternative(on_success));
    6678             : 
    6679          50 :   return optional_step_back;
    6680             : }
    6681             : 
    6682             : 
    6683       92637 : RegExpEngine::CompilationResult RegExpEngine::Compile(
    6684             :     Isolate* isolate, Zone* zone, RegExpCompileData* data,
    6685             :     JSRegExp::Flags flags, Handle<String> pattern,
    6686             :     Handle<String> sample_subject, bool is_one_byte) {
    6687       92637 :   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
    6688             :     return IrregexpRegExpTooBig(isolate);
    6689             :   }
    6690       92622 :   bool ignore_case = flags & JSRegExp::kIgnoreCase;
    6691       92622 :   bool is_sticky = flags & JSRegExp::kSticky;
    6692       92622 :   bool is_global = flags & JSRegExp::kGlobal;
    6693             :   bool is_unicode = flags & JSRegExp::kUnicode;
    6694             :   RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
    6695       92622 :                           is_one_byte);
    6696             : 
    6697       92622 :   if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
    6698             : 
    6699             :   // Sample some characters from the middle of the string.
    6700             :   static const int kSampleSize = 128;
    6701             : 
    6702       92622 :   sample_subject = String::Flatten(sample_subject);
    6703             :   int chars_sampled = 0;
    6704       92622 :   int half_way = (sample_subject->length() - kSampleSize) / 2;
    6705     1466240 :   for (int i = Max(0, half_way);
    6706      733120 :        i < sample_subject->length() && chars_sampled < kSampleSize;
    6707             :        i++, chars_sampled++) {
    6708             :     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
    6709             :   }
    6710             : 
    6711             :   // Wrap the body of the regexp in capture #0.
    6712             :   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
    6713             :                                                     0,
    6714             :                                                     &compiler,
    6715       92622 :                                                     compiler.accept());
    6716             :   RegExpNode* node = captured_body;
    6717       92622 :   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
    6718       92622 :   bool is_start_anchored = data->tree->IsAnchoredAtStart();
    6719       92622 :   int max_length = data->tree->max_match();
    6720       92622 :   if (!is_start_anchored && !is_sticky) {
    6721             :     // Add a .*? at the beginning, outside the body capture, unless
    6722             :     // this expression is anchored at the beginning or sticky.
    6723             :     RegExpNode* loop_node = RegExpQuantifier::ToNode(
    6724             :         0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
    6725      176230 :         &compiler, captured_body, data->contains_anchor);
    6726             : 
    6727       88115 :     if (data->contains_anchor) {
    6728             :       // Unroll loop once, to take care of the case that might start
    6729             :       // at the start of input.
    6730         196 :       ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
    6731             :       first_step_node->AddAlternative(GuardedAlternative(captured_body));
    6732             :       first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
    6733         196 :           new (zone) RegExpCharacterClass('*'), false, loop_node)));
    6734             :       node = first_step_node;
    6735             :     } else {
    6736             :       node = loop_node;
    6737             :     }
    6738             :   }
    6739       92622 :   if (is_one_byte) {
    6740       20427 :     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
    6741             :     // Do it again to propagate the new nodes to places where they were not
    6742             :     // put because they had not been calculated yet.
    6743       20427 :     if (node != NULL) {
    6744       20071 :       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
    6745             :     }
    6746       72195 :   } else if (compiler.unicode() && (is_global || is_sticky)) {
    6747          50 :     node = OptionallyStepBackToLeadSurrogate(&compiler, node);
    6748             :   }
    6749             : 
    6750       92622 :   if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
    6751       92622 :   data->node = node;
    6752             :   Analysis analysis(isolate, flags, is_one_byte);
    6753       92622 :   analysis.EnsureAnalyzed(node);
    6754       92622 :   if (analysis.has_failed()) {
    6755             :     const char* error_message = analysis.error_message();
    6756             :     return CompilationResult(isolate, error_message);
    6757             :   }
    6758             : 
    6759             :   // Create the correct assembler for the architecture.
    6760             : #ifndef V8_INTERPRETED_REGEXP
    6761             :   // Native regexp implementation.
    6762             : 
    6763             :   NativeRegExpMacroAssembler::Mode mode =
    6764             :       is_one_byte ? NativeRegExpMacroAssembler::LATIN1
    6765       92181 :                   : NativeRegExpMacroAssembler::UC16;
    6766             : 
    6767             : #if V8_TARGET_ARCH_IA32
    6768             :   RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
    6769             :                                            (data->capture_count + 1) * 2);
    6770             : #elif V8_TARGET_ARCH_X64
    6771             :   RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
    6772      184362 :                                           (data->capture_count + 1) * 2);
    6773             : #elif V8_TARGET_ARCH_ARM
    6774             :   RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
    6775             :                                           (data->capture_count + 1) * 2);
    6776             : #elif V8_TARGET_ARCH_ARM64
    6777             :   RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
    6778             :                                             (data->capture_count + 1) * 2);
    6779             : #elif V8_TARGET_ARCH_S390
    6780             :   RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
    6781             :                                            (data->capture_count + 1) * 2);
    6782             : #elif V8_TARGET_ARCH_PPC
    6783             :   RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
    6784             :                                           (data->capture_count + 1) * 2);
    6785             : #elif V8_TARGET_ARCH_MIPS
    6786             :   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
    6787             :                                            (data->capture_count + 1) * 2);
    6788             : #elif V8_TARGET_ARCH_MIPS64
    6789             :   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
    6790             :                                            (data->capture_count + 1) * 2);
    6791             : #elif V8_TARGET_ARCH_X87
    6792             :   RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
    6793             :                                           (data->capture_count + 1) * 2);
    6794             : #else
    6795             : #error "Unsupported architecture"
    6796             : #endif
    6797             : 
    6798             : #else  // V8_INTERPRETED_REGEXP
    6799             :   // Interpreted regexp implementation.
    6800             :   EmbeddedVector<byte, 1024> codes;
    6801             :   RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
    6802             : #endif  // V8_INTERPRETED_REGEXP
    6803             : 
    6804       92181 :   macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
    6805             : 
    6806             :   // Inserted here, instead of in Assembler, because it depends on information
    6807             :   // in the AST that isn't replicated in the Node structure.
    6808             :   static const int kMaxBacksearchLimit = 1024;
    6809       92701 :   if (is_end_anchored && !is_start_anchored && !is_sticky &&
    6810         520 :       max_length < kMaxBacksearchLimit) {
    6811         243 :     macro_assembler.SetCurrentPositionFromEnd(max_length);
    6812             :   }
    6813             : 
    6814       92181 :   if (is_global) {
    6815             :     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
    6816        5093 :     if (data->tree->min_match() > 0) {
    6817             :       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
    6818         165 :     } else if (is_unicode) {
    6819             :       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
    6820             :     }
    6821             :     macro_assembler.set_global_mode(mode);
    6822             :   }
    6823             : 
    6824             :   return compiler.Assemble(¯o_assembler,
    6825             :                            node,
    6826             :                            data->capture_count,
    6827       92181 :                            pattern);
    6828             : }
    6829             : 
    6830             : 
    6831      183560 : bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
    6832             :   Heap* heap = pattern->GetHeap();
    6833      183560 :   bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
    6834      367120 :   if (heap->isolate()->total_regexp_code_generated() >
    6835      311799 :           RegExpImpl::kRegExpCompiledLimit &&
    6836      128239 :       heap->CommittedMemoryExecutable() >
    6837             :           RegExpImpl::kRegExpExecutableMemoryLimit) {
    6838             :     too_much = true;
    6839             :   }
    6840      183560 :   return too_much;
    6841             : }
    6842             : 
    6843             : 
    6844       36699 : Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
    6845             :                                    Object* key_pattern,
    6846             :                                    FixedArray** last_match_cache,
    6847             :                                    ResultsCacheType type) {
    6848             :   FixedArray* cache;
    6849       19287 :   if (!key_string->IsInternalizedString()) return Smi::kZero;
    6850       17412 :   if (type == STRING_SPLIT_SUBSTRINGS) {
    6851             :     DCHECK(key_pattern->IsString());
    6852       17412 :     if (!key_pattern->IsInternalizedString()) return Smi::kZero;
    6853             :     cache = heap->string_split_cache();
    6854             :   } else {
    6855             :     DCHECK(type == REGEXP_MULTIPLE_INDICES);
    6856             :     DCHECK(key_pattern->IsFixedArray());
    6857             :     cache = heap->regexp_multiple_cache();
    6858             :   }
    6859             : 
    6860             :   uint32_t hash = key_string->Hash();
    6861             :   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
    6862       17412 :                     ~(kArrayEntriesPerCacheEntry - 1));
    6863       51749 :   if (cache->get(index + kStringOffset) != key_string ||
    6864       16925 :       cache->get(index + kPatternOffset) != key_pattern) {
    6865             :     index =
    6866         753 :         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
    6867        1710 :     if (cache->get(index + kStringOffset) != key_string ||
    6868         204 :         cache->get(index + kPatternOffset) != key_pattern) {
    6869             :       return Smi::kZero;
    6870             :     }
    6871             :   }
    6872             : 
    6873       33698 :   *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
    6874       33698 :   return cache->get(index + kArrayOffset);
    6875             : }
    6876             : 
    6877             : 
    6878        2438 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
    6879             :                                Handle<Object> key_pattern,
    6880             :                                Handle<FixedArray> value_array,
    6881             :                                Handle<FixedArray> last_match_cache,
    6882             :                                ResultsCacheType type) {
    6883             :   Factory* factory = isolate->factory();
    6884             :   Handle<FixedArray> cache;
    6885        2438 :   if (!key_string->IsInternalizedString()) return;
    6886         563 :   if (type == STRING_SPLIT_SUBSTRINGS) {
    6887             :     DCHECK(key_pattern->IsString());
    6888         563 :     if (!key_pattern->IsInternalizedString()) return;
    6889             :     cache = factory->string_split_cache();
    6890             :   } else {
    6891             :     DCHECK(type == REGEXP_MULTIPLE_INDICES);
    6892             :     DCHECK(key_pattern->IsFixedArray());
    6893             :     cache = factory->regexp_multiple_cache();
    6894             :   }
    6895             : 
    6896             :   uint32_t hash = key_string->Hash();
    6897             :   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
    6898         563 :                     ~(kArrayEntriesPerCacheEntry - 1));
    6899        1126 :   if (cache->get(index + kStringOffset) == Smi::kZero) {
    6900         437 :     cache->set(index + kStringOffset, *key_string);
    6901         874 :     cache->set(index + kPatternOffset, *key_pattern);
    6902         874 :     cache->set(index + kArrayOffset, *value_array);
    6903         874 :     cache->set(index + kLastMatchOffset, *last_match_cache);
    6904             :   } else {
    6905             :     uint32_t index2 =
    6906         126 :         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
    6907         252 :     if (cache->get(index2 + kStringOffset) == Smi::kZero) {
    6908          90 :       cache->set(index2 + kStringOffset, *key_string);
    6909         180 :       cache->set(index2 + kPatternOffset, *key_pattern);
    6910         180 :       cache->set(index2 + kArrayOffset, *value_array);
    6911         180 :       cache->set(index2 + kLastMatchOffset, *last_match_cache);
    6912             :     } else {
    6913             :       cache->set(index2 + kStringOffset, Smi::kZero);
    6914          36 :       cache->set(index2 + kPatternOffset, Smi::kZero);
    6915          36 :       cache->set(index2 + kArrayOffset, Smi::kZero);
    6916          36 :       cache->set(index2 + kLastMatchOffset, Smi::kZero);
    6917          36 :       cache->set(index + kStringOffset, *key_string);
    6918          72 :       cache->set(index + kPatternOffset, *key_pattern);
    6919          72 :       cache->set(index + kArrayOffset, *value_array);
    6920          72 :       cache->set(index + kLastMatchOffset, *last_match_cache);
    6921             :     }
    6922             :   }
    6923             :   // If the array is a reasonably short list of substrings, convert it into a
    6924             :   // list of internalized strings.
    6925        1126 :   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
    6926        2427 :     for (int i = 0; i < value_array->length(); i++) {
    6927             :       Handle<String> str(String::cast(value_array->get(i)), isolate);
    6928         932 :       Handle<String> internalized_str = factory->InternalizeString(str);
    6929         932 :       value_array->set(i, *internalized_str);
    6930             :     }
    6931             :   }
    6932             :   // Convert backing store to a copy-on-write array.
    6933         563 :   value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map());
    6934             : }
    6935             : 
    6936             : 
    6937      106692 : void RegExpResultsCache::Clear(FixedArray* cache) {
    6938    27419844 :   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
    6939             :     cache->set(i, Smi::kZero);
    6940             :   }
    6941      106692 : }
    6942             : 
    6943             : }  // namespace internal
    6944             : }  // namespace v8
 |