Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/jsregexp.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "src/base/platform/platform.h"
11 : #include "src/code-tracer.h"
12 : #include "src/compilation-cache.h"
13 : #include "src/elements.h"
14 : #include "src/execution.h"
15 : #include "src/heap/factory.h"
16 : #include "src/heap/heap-inl.h"
17 : #include "src/isolate-inl.h"
18 : #include "src/message-template.h"
19 : #include "src/ostreams.h"
20 : #include "src/regexp/interpreter-irregexp.h"
21 : #include "src/regexp/jsregexp-inl.h"
22 : #include "src/regexp/regexp-macro-assembler-irregexp.h"
23 : #include "src/regexp/regexp-macro-assembler-tracer.h"
24 : #include "src/regexp/regexp-macro-assembler.h"
25 : #include "src/regexp/regexp-parser.h"
26 : #include "src/regexp/regexp-stack.h"
27 : #include "src/runtime/runtime.h"
28 : #include "src/splay-tree-inl.h"
29 : #include "src/string-search.h"
30 : #include "src/unicode-decoder.h"
31 : #include "src/unicode-inl.h"
32 : #include "src/zone/zone-list-inl.h"
33 :
34 : #ifdef V8_INTL_SUPPORT
35 : #include "unicode/uniset.h"
36 : #include "unicode/utypes.h"
37 : #endif // V8_INTL_SUPPORT
38 :
39 : #if V8_TARGET_ARCH_IA32
40 : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
41 : #elif V8_TARGET_ARCH_X64
42 : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
43 : #elif V8_TARGET_ARCH_ARM64
44 : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
45 : #elif V8_TARGET_ARCH_ARM
46 : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
47 : #elif V8_TARGET_ARCH_PPC
48 : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
49 : #elif V8_TARGET_ARCH_S390
50 : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
51 : #elif V8_TARGET_ARCH_MIPS
52 : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
53 : #elif V8_TARGET_ARCH_MIPS64
54 : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
55 : #else
56 : #error Unsupported target architecture.
57 : #endif
58 :
59 : namespace v8 {
60 : namespace internal {
61 :
62 : V8_WARN_UNUSED_RESULT
63 3205 : static inline MaybeHandle<Object> ThrowRegExpException(
64 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
65 : Handle<String> error_text) {
66 6410 : THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 : pattern, error_text),
68 : Object);
69 : }
70 :
71 349 : inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
72 : Handle<String> error_text) {
73 349 : USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
74 : error_text));
75 349 : }
76 :
77 :
78 0 : ContainedInLattice AddRange(ContainedInLattice containment,
79 : const int* ranges,
80 : int ranges_length,
81 : Interval new_range) {
82 : DCHECK_EQ(1, ranges_length & 1);
83 : DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
84 992692 : if (containment == kLatticeUnknown) return containment;
85 : bool inside = false;
86 : int last = 0;
87 10421341 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 : // Consider the range from last to ranges[i].
89 : // We haven't got to the new range yet.
90 5653127 : if (ranges[i] <= new_range.from()) continue;
91 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 : // inclusive, but the values in ranges are not.
93 884913 : if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 867386 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 : }
96 : return kLatticeUnknown;
97 : }
98 : return containment;
99 : }
100 :
// Upper bound on the Boyer-Moore lookahead. Raising it slows down code
// generation; lowering it hurts the V8 benchmark score.
constexpr int kMaxLookaheadForBoyerMoore = 8;
// Patterns of at most this many characters are considered too short: in a
// 3-character pattern you can step forwards at most 3 characters at a time,
// which is not always enough to pay for the extra logic.
constexpr int kPatternTooShortForBoyerMoore = 2;
106 :
107 : // Identifies the sort of regexps where the regexp engine is faster
108 : // than the code used for atom matches.
109 204870 : static bool HasFewDifferentCharacters(Handle<String> pattern) {
110 : int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
111 204870 : if (length <= kPatternTooShortForBoyerMoore) return false;
112 : const int kMod = 128;
113 : bool character_found[kMod];
114 : int different = 0;
115 : memset(&character_found[0], 0, sizeof(character_found));
116 995941 : for (int i = 0; i < length; i++) {
117 598069 : int ch = (pattern->Get(i) & (kMod - 1));
118 598069 : if (!character_found[ch]) {
119 597649 : character_found[ch] = true;
120 597649 : different++;
121 : // We declare a regexp low-alphabet if it has at least 3 times as many
122 : // characters as it has different characters.
123 597649 : if (different * 3 > length) return false;
124 : }
125 : }
126 : return true;
127 : }
128 :
129 : // Generic RegExp methods. Dispatches to implementation specific methods.
130 :
131 460444 : MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
132 : Handle<String> pattern,
133 : JSRegExp::Flags flags) {
134 : DCHECK(pattern->IsFlat());
135 :
136 920888 : Zone zone(isolate->allocator(), ZONE_NAME);
137 : CompilationCache* compilation_cache = isolate->compilation_cache();
138 : MaybeHandle<FixedArray> maybe_cached =
139 460444 : compilation_cache->LookupRegExp(pattern, flags);
140 : Handle<FixedArray> cached;
141 460444 : if (maybe_cached.ToHandle(&cached)) {
142 334510 : re->set_data(*cached);
143 167255 : return re;
144 : }
145 :
146 : PostponeInterruptsScope postpone(isolate);
147 : RegExpCompileData parse_result;
148 293189 : FlatStringReader reader(isolate, pattern);
149 : DCHECK(!isolate->has_pending_exception());
150 293189 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
151 : &parse_result)) {
152 : // Throw an exception if we fail to parse the pattern.
153 2831 : return ThrowRegExpException(isolate, re, pattern, parse_result.error);
154 : }
155 :
156 : bool has_been_compiled = false;
157 :
158 885492 : if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
159 198173 : !HasFewDifferentCharacters(pattern)) {
160 : // Parse-tree is a single atom that is equal to the pattern.
161 : AtomCompile(isolate, re, pattern, flags, pattern);
162 : has_been_compiled = true;
163 106810 : } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
164 7261 : parse_result.capture_count == 0) {
165 7251 : RegExpAtom* atom = parse_result.tree->AsAtom();
166 7251 : Vector<const uc16> atom_pattern = atom->data();
167 : Handle<String> atom_string;
168 14502 : ASSIGN_RETURN_ON_EXCEPTION(
169 : isolate, atom_string,
170 : isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
171 7251 : if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
172 : AtomCompile(isolate, re, pattern, flags, atom_string);
173 : has_been_compiled = true;
174 : }
175 : }
176 290358 : if (!has_been_compiled) {
177 85576 : IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
178 : }
179 : DCHECK(re->data()->IsFixedArray());
180 : // Compilation succeeded so the data is set on the regexp
181 : // and we can store it in the cache.
182 : Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
183 290358 : compilation_cache->PutRegExp(pattern, flags, data);
184 :
185 290358 : return re;
186 : }
187 :
188 4350688 : MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
189 : Handle<String> subject, int index,
190 : Handle<RegExpMatchInfo> last_match_info) {
191 4350688 : switch (regexp->TypeTag()) {
192 : case JSRegExp::ATOM:
193 286 : return AtomExec(isolate, regexp, subject, index, last_match_info);
194 : case JSRegExp::IRREGEXP: {
195 4350402 : return IrregexpExec(isolate, regexp, subject, index, last_match_info);
196 : }
197 : default:
198 0 : UNREACHABLE();
199 : }
200 : }
201 :
202 :
203 : // RegExp Atom implementation: Simple string search using indexOf.
204 :
205 0 : void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
206 : Handle<String> pattern, JSRegExp::Flags flags,
207 : Handle<String> match_pattern) {
208 204782 : isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
209 204782 : match_pattern);
210 0 : }
211 :
212 273 : static void SetAtomLastCapture(Isolate* isolate,
213 : Handle<RegExpMatchInfo> last_match_info,
214 : String subject, int from, int to) {
215 : SealHandleScope shs(isolate);
216 : last_match_info->SetNumberOfCaptureRegisters(2);
217 546 : last_match_info->SetLastSubject(subject);
218 546 : last_match_info->SetLastInput(subject);
219 : last_match_info->SetCapture(0, from);
220 : last_match_info->SetCapture(1, to);
221 273 : }
222 :
223 90541 : int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
224 : Handle<String> subject, int index, int32_t* output,
225 : int output_size) {
226 : DCHECK_LE(0, index);
227 : DCHECK_LE(index, subject->length());
228 :
229 90541 : subject = String::Flatten(isolate, subject);
230 : DisallowHeapAllocation no_gc; // ensure vectors stay valid
231 :
232 90541 : String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
233 : int needle_len = needle->length();
234 : DCHECK(needle->IsFlat());
235 : DCHECK_LT(0, needle_len);
236 :
237 181082 : if (index + needle_len > subject->length()) {
238 : return RegExpImpl::RE_FAILURE;
239 : }
240 :
241 273479 : for (int i = 0; i < output_size; i += 2) {
242 181736 : String::FlatContent needle_content = needle->GetFlatContent(no_gc);
243 181736 : String::FlatContent subject_content = subject->GetFlatContent(no_gc);
244 : DCHECK(needle_content.IsFlat());
245 : DCHECK(subject_content.IsFlat());
246 : // dispatch on type of strings
247 : index =
248 : (needle_content.IsOneByte()
249 : ? (subject_content.IsOneByte()
250 : ? SearchString(isolate, subject_content.ToOneByteVector(),
251 : needle_content.ToOneByteVector(), index)
252 : : SearchString(isolate, subject_content.ToUC16Vector(),
253 : needle_content.ToOneByteVector(), index))
254 : : (subject_content.IsOneByte()
255 : ? SearchString(isolate, subject_content.ToOneByteVector(),
256 : needle_content.ToUC16Vector(), index)
257 : : SearchString(isolate, subject_content.ToUC16Vector(),
258 363472 : needle_content.ToUC16Vector(), index)));
259 181736 : if (index == -1) {
260 90266 : return i / 2; // Return number of matches.
261 : } else {
262 91470 : output[i] = index;
263 91470 : output[i+1] = index + needle_len;
264 : index += needle_len;
265 : }
266 : }
267 273 : return output_size / 2;
268 : }
269 :
270 286 : Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
271 : Handle<String> subject, int index,
272 : Handle<RegExpMatchInfo> last_match_info) {
273 : static const int kNumRegisters = 2;
274 : STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
275 : int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
276 :
277 : int res =
278 286 : AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
279 :
280 299 : if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
281 :
282 : DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
283 : SealHandleScope shs(isolate);
284 273 : SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
285 273 : output_registers[1]);
286 273 : return last_match_info;
287 : }
288 :
289 :
290 : // Irregexp implementation.
291 :
292 : // Ensures that the regexp object contains a compiled version of the
293 : // source for either one-byte or two-byte subject strings.
294 : // If the compiled version doesn't already exist, it is compiled
295 : // from the source pattern.
296 : // If compilation fails, an exception is thrown and this function
297 : // returns false.
298 4469232 : bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
299 : Handle<String> sample_subject,
300 : bool is_one_byte) {
301 : Object compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
302 4469232 : if (compiled_code != Smi::FromInt(JSRegExp::kUninitializedValue)) {
303 : DCHECK(FLAG_regexp_interpret_all ? compiled_code->IsByteArray()
304 : : compiled_code->IsCode());
305 : return true;
306 : }
307 85768 : return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
308 : }
309 :
310 85768 : bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
311 : Handle<String> sample_subject,
312 : bool is_one_byte) {
313 : // Compile the RegExp.
314 171536 : Zone zone(isolate->allocator(), ZONE_NAME);
315 : PostponeInterruptsScope postpone(isolate);
316 : #ifdef DEBUG
317 : Object entry = re->DataAt(JSRegExp::code_index(is_one_byte));
318 : // When arriving here entry can only be a smi representing an uncompiled
319 : // regexp.
320 : DCHECK(entry->IsSmi());
321 : int entry_value = Smi::ToInt(entry);
322 : DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
323 : #endif
324 :
325 85768 : JSRegExp::Flags flags = re->GetFlags();
326 :
327 : Handle<String> pattern(re->Pattern(), isolate);
328 85768 : pattern = String::Flatten(isolate, pattern);
329 : RegExpCompileData compile_data;
330 85768 : FlatStringReader reader(isolate, pattern);
331 85768 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
332 : &compile_data)) {
333 : // Throw an exception if we fail to parse the pattern.
334 : // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
335 25 : USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
336 25 : return false;
337 : }
338 : RegExpEngine::CompilationResult result =
339 : RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
340 85743 : sample_subject, is_one_byte);
341 85743 : if (result.error_message != nullptr) {
342 : // Unable to compile regexp.
343 349 : if (FLAG_abort_on_stack_or_string_length_overflow &&
344 0 : strncmp(result.error_message, "Stack overflow", 15) == 0) {
345 0 : FATAL("Aborting on stack overflow");
346 : }
347 698 : Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
348 349 : CStrVector(result.error_message)).ToHandleChecked();
349 349 : ThrowRegExpException(isolate, re, error_message);
350 : return false;
351 : }
352 :
353 : Handle<FixedArray> data =
354 : Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
355 85394 : data->set(JSRegExp::code_index(is_one_byte), result.code);
356 85394 : SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
357 : int register_max = IrregexpMaxRegisterCount(*data);
358 85394 : if (result.num_registers > register_max) {
359 : SetIrregexpMaxRegisterCount(*data, result.num_registers);
360 : }
361 :
362 : return true;
363 : }
364 :
365 0 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
366 : return Smi::cast(
367 0 : re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
368 : }
369 :
370 0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
371 : re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
372 0 : }
373 :
374 85394 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re,
375 : Handle<FixedArray> value) {
376 85394 : if (value.is_null()) {
377 85034 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
378 : } else {
379 360 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
380 : }
381 85394 : }
382 :
383 0 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
384 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
385 : }
386 :
387 0 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray re) {
388 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
389 : }
390 :
391 0 : ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
392 0 : return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
393 : }
394 :
395 0 : Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
396 0 : return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
397 : }
398 :
399 0 : void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
400 : Handle<String> pattern,
401 : JSRegExp::Flags flags, int capture_count) {
402 : // Initialize compiled code entries to null.
403 : isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
404 85576 : flags, capture_count);
405 0 : }
406 :
407 4357818 : int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
408 : Handle<String> subject) {
409 : DCHECK(subject->IsFlat());
410 :
411 : // Check representation of the underlying storage.
412 4357818 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
413 4357818 : if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
414 :
415 4357444 : if (FLAG_regexp_interpret_all) {
416 : // Byte-code regexp needs space allocated for all its registers.
417 : // The result captures are copied to the start of the registers array
418 : // if the match succeeds. This way those registers are not clobbered
419 : // when we set the last match info from last successful match.
420 : return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
421 4251888 : (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
422 : } else {
423 : // Native regexp only needs room to output captures. Registers are handled
424 : // internally.
425 105556 : return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
426 : }
427 : }
428 :
429 4393453 : int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
430 : Handle<String> subject, int index,
431 : int32_t* output, int output_size) {
432 : Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
433 :
434 : DCHECK_LE(0, index);
435 : DCHECK_LE(index, subject->length());
436 : DCHECK(subject->IsFlat());
437 :
438 4393453 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
439 :
440 4393453 : if (!FLAG_regexp_interpret_all) {
441 : DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
442 4 : do {
443 111413 : EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
444 : Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
445 : // The stack is used to allocate registers for the compiled regexp code.
446 : // This means that in case of failure, the output registers array is left
447 : // untouched and contains the capture results from the previous successful
448 : // match. We can use that to set the last match info lazily.
449 : int res = NativeRegExpMacroAssembler::Match(code, subject, output,
450 111413 : output_size, index, isolate);
451 111413 : if (res != NativeRegExpMacroAssembler::RETRY) {
452 : DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
453 : isolate->has_pending_exception());
454 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
455 : RE_SUCCESS);
456 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
457 : RE_FAILURE);
458 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
459 : RE_EXCEPTION);
460 111409 : return res;
461 : }
462 : // If result is RETRY, the string has changed representation, and we
463 : // must restart from scratch.
464 : // In this case, it means we must make sure we are prepared to handle
465 : // the, potentially, different subject (the string can switch between
466 : // being internal and external, and even between being Latin1 and UC16,
467 : // but the characters are always the same).
468 4 : IrregexpPrepare(isolate, regexp, subject);
469 4 : is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
470 : } while (true);
471 : UNREACHABLE();
472 : } else {
473 : DCHECK(FLAG_regexp_interpret_all);
474 : DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
475 : // We must have done EnsureCompiledIrregexp, so we can get the number of
476 : // registers.
477 : int number_of_capture_registers =
478 4282044 : (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
479 4282044 : int32_t* raw_output = &output[number_of_capture_registers];
480 :
481 1 : do {
482 : // We do not touch the actual capture result registers until we know there
483 : // has been a match so that we can use those capture results to set the
484 : // last match info.
485 13135271 : for (int i = number_of_capture_registers - 1; i >= 0; i--) {
486 8853226 : raw_output[i] = -1;
487 : }
488 : Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
489 : isolate);
490 :
491 : IrregexpInterpreter::Result result = IrregexpInterpreter::Match(
492 4282045 : isolate, byte_codes, subject, raw_output, index);
493 : DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
494 : isolate->has_pending_exception());
495 :
496 4282045 : switch (result) {
497 : case IrregexpInterpreter::SUCCESS:
498 : // Copy capture results to the start of the registers array.
499 : MemCopy(output, raw_output,
500 : number_of_capture_registers * sizeof(int32_t));
501 8389394 : return result;
502 : case IrregexpInterpreter::EXCEPTION:
503 : case IrregexpInterpreter::FAILURE:
504 : return result;
505 : case IrregexpInterpreter::RETRY:
506 : // The string has changed representation, and we must restart the
507 : // match.
508 1 : is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
509 1 : EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
510 1 : break;
511 : }
512 : } while (true);
513 : UNREACHABLE();
514 : }
515 : }
516 :
517 4350402 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
518 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
519 : int previous_index, Handle<RegExpMatchInfo> last_match_info) {
520 : DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
521 :
522 4350402 : subject = String::Flatten(isolate, subject);
523 :
524 : // Prepare space for the return values.
525 : #ifdef DEBUG
526 : if (FLAG_regexp_interpret_all && FLAG_trace_regexp_bytecodes) {
527 : String pattern = regexp->Pattern();
528 : PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
529 : PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
530 : }
531 : #endif
532 : int required_registers =
533 4350402 : RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
534 4350402 : if (required_registers < 0) {
535 : // Compiling failed with an exception.
536 : DCHECK(isolate->has_pending_exception());
537 279 : return MaybeHandle<Object>();
538 : }
539 :
540 : int32_t* output_registers = nullptr;
541 4350123 : if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
542 2844 : output_registers = NewArray<int32_t>(required_registers);
543 : }
544 : std::unique_ptr<int32_t[]> auto_release(output_registers);
545 4350123 : if (output_registers == nullptr) {
546 : output_registers = isolate->jsregexp_static_offsets_vector();
547 : }
548 :
549 : int res =
550 : RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
551 4350123 : output_registers, required_registers);
552 4350123 : if (res == RE_SUCCESS) {
553 : int capture_count =
554 : IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
555 : return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
556 4161191 : output_registers);
557 : }
558 188932 : if (res == RE_EXCEPTION) {
559 : DCHECK(isolate->has_pending_exception());
560 59 : return MaybeHandle<Object>();
561 : }
562 : DCHECK(res == RE_FAILURE);
563 188873 : return isolate->factory()->null_value();
564 : }
565 :
566 4255951 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
567 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
568 : Handle<String> subject, int capture_count, int32_t* match) {
569 : // This is the only place where match infos can grow. If, after executing the
570 : // regexp, RegExpExecStub finds that the match info is too small, it restarts
571 : // execution in RegExpImpl::Exec, which finally grows the match info right
572 : // here.
573 :
574 4255951 : int capture_register_count = (capture_count + 1) * 2;
575 : Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
576 4255951 : isolate, last_match_info, capture_register_count);
577 : result->SetNumberOfCaptureRegisters(capture_register_count);
578 :
579 4255951 : if (*result != *last_match_info) {
580 4256 : if (*last_match_info == *isolate->regexp_last_match_info()) {
581 : // This inner condition is only needed for special situations like the
582 : // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
583 : // RegExpImpl::Exec; there actually want to bypass the Isolate's match
584 : // info and execute the regexp without side effects.
585 4256 : isolate->native_context()->set_regexp_last_match_info(*result);
586 : }
587 : }
588 :
589 : DisallowHeapAllocation no_allocation;
590 4255951 : if (match != nullptr) {
591 15173507 : for (int i = 0; i < capture_register_count; i += 2) {
592 5458778 : result->SetCapture(i, match[i]);
593 5458778 : result->SetCapture(i + 1, match[i + 1]);
594 : }
595 : }
596 8511902 : result->SetLastSubject(*subject);
597 8511902 : result->SetLastInput(*subject);
598 4255951 : return result;
599 : }
600 :
601 95498 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
602 : Handle<String> subject, Isolate* isolate)
603 : : register_array_(nullptr),
604 : register_array_size_(0),
605 : regexp_(regexp),
606 : subject_(subject),
607 95498 : isolate_(isolate) {
608 95498 : bool interpreted = FLAG_regexp_interpret_all;
609 :
610 95498 : if (regexp_->TypeTag() == JSRegExp::ATOM) {
611 : static const int kAtomRegistersPerMatch = 2;
612 90255 : registers_per_match_ = kAtomRegistersPerMatch;
613 : // There is no distinction between interpreted and native for atom regexps.
614 : interpreted = false;
615 : } else {
616 : registers_per_match_ =
617 5243 : RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
618 5243 : if (registers_per_match_ < 0) {
619 95 : num_matches_ = -1; // Signal exception.
620 95 : return;
621 : }
622 : }
623 :
624 : DCHECK(IsGlobal(regexp->GetFlags()));
625 95403 : if (!interpreted) {
626 : register_array_size_ =
627 189494 : Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
628 94747 : max_matches_ = register_array_size_ / registers_per_match_;
629 : } else {
630 : // Global loop in interpreted regexp is not implemented. We choose
631 : // the size of the offsets vector so that it can only store one match.
632 656 : register_array_size_ = registers_per_match_;
633 656 : max_matches_ = 1;
634 : }
635 :
636 95403 : if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
637 1072 : register_array_ = NewArray<int32_t>(register_array_size_);
638 : } else {
639 94331 : register_array_ = isolate->jsregexp_static_offsets_vector();
640 : }
641 :
642 : // Set state so that fetching the results the first time triggers a call
643 : // to the compiled regexp.
644 95403 : current_match_index_ = max_matches_ - 1;
645 95403 : num_matches_ = max_matches_;
646 : DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
647 : DCHECK_GE(register_array_size_, registers_per_match_);
648 : int32_t* last_match =
649 95403 : ®ister_array_[current_match_index_ * registers_per_match_];
650 95403 : last_match[0] = -1;
651 95403 : last_match[1] = 0;
652 : }
653 :
654 7 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
655 7 : if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
656 7 : unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
657 : unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
658 : // Advance over the surrogate pair.
659 0 : return last_index + 2;
660 : }
661 7 : return last_index + 1;
662 : }
663 :
664 : // -------------------------------------------------------------------
665 : // Implementation of the Irregexp regular expression engine.
666 : //
667 : // The Irregexp regular expression engine is intended to be a complete
668 : // implementation of ECMAScript regular expressions. It generates either
669 : // bytecodes or native code.
670 :
671 : // The Irregexp regexp engine is structured in three steps.
672 : // 1) The parser generates an abstract syntax tree. See ast.cc.
673 : // 2) From the AST a node network is created. The nodes are all
674 : // subclasses of RegExpNode. The nodes represent states when
675 : // executing a regular expression. Several optimizations are
676 : // performed on the node network.
677 : // 3) From the nodes we generate either byte codes or native code
678 : // that can actually execute the regular expression (perform
679 : // the search). The code generation step is described in more
680 : // detail below.
681 :
682 : // Code generation.
683 : //
684 : // The nodes are divided into four main categories.
685 : // * Choice nodes
686 : // These represent places where the regular expression can
687 : // match in more than one way. For example on entry to an
688 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
689 : // * Action nodes
690 : // These represent places where some action should be
691 : // performed. Examples include recording the current position
692 : // in the input string to a register (in order to implement
693 : // captures) or other actions on register for example in order
694 : // to implement the counters needed for {} repetitions.
695 : // * Matching nodes
696 : // These attempt to match some element part of the input string.
697 : // Examples of elements include character classes, plain strings
698 : // or back references.
699 : // * End nodes
700 : // These are used to implement the actions required on finding
701 : // a successful match or failing to find a match.
702 : //
703 : // The code generated (whether as byte codes or native code) maintains
704 : // some state as it runs. This consists of the following elements:
705 : //
706 : // * The capture registers. Used for string captures.
707 : // * Other registers. Used for counters etc.
708 : // * The current position.
709 : // * The stack of backtracking information. Used when a matching node
710 : // fails to find a match and needs to try an alternative.
711 : //
712 : // Conceptual regular expression execution model:
713 : //
714 : // There is a simple conceptual model of regular expression execution
715 : // which will be presented first. The actual code generated is a more
716 : // efficient simulation of the simple conceptual model:
717 : //
718 : // * Choice nodes are implemented as follows:
719 : // For each choice except the last {
720 : // push current position
721 : // push backtrack code location
722 : // <generate code to test for choice>
723 : // backtrack code location:
724 : // pop current position
725 : // }
726 : // <generate code to test for last choice>
727 : //
728 : // * Action nodes are generated as follows:
729 : // <push affected registers on backtrack stack>
730 : // <generate code to perform action>
731 : // push backtrack code location
732 : // <generate code to test for following nodes>
733 : // backtrack code location:
734 : // <pop affected registers to restore their state>
735 : // <pop backtrack location from stack and go to it>
736 : //
737 : // * Matching nodes are generated as follows:
738 : // if input string matches at current position
739 : // update current position
740 : // <generate code to test for following nodes>
741 : // else
742 : // <pop backtrack location from stack and go to it>
743 : //
744 : // Thus it can be seen that the current position is saved and restored
745 : // by the choice nodes, whereas the registers are saved and restored by
746 : // the action nodes that manipulate them.
747 : //
748 : // The other interesting aspect of this model is that nodes are generated
749 : // at the point where they are needed by a recursive call to Emit(). If
750 : // the node has already been code generated then the Emit() call will
751 : // generate a jump to the previously generated code instead. In order to
752 : // limit recursion it is possible for the Emit() function to put the node
753 : // on a work list for later generation and instead generate a jump. The
754 : // destination of the jump is resolved later when the code is generated.
755 : //
756 : // Actual regular expression code generation.
757 : //
758 : // Code generation is actually more complicated than the above. In order
759 : // to improve the efficiency of the generated code some optimizations are
760 : // performed:
761 : //
762 : // * Choice nodes have 1-character lookahead.
763 : // A choice node looks at the following character and eliminates some of
764 : // the choices immediately based on that character. This is not yet
765 : // implemented.
766 : // * Simple greedy loops store reduced backtracking information.
767 : // A quantifier like /.*foo/m will greedily match the whole input. It will
768 : // then need to backtrack to a point where it can match "foo". The naive
769 : // implementation of this would push each character position onto the
770 : // backtracking stack, then pop them off one by one. This would use space
771 : // proportional to the length of the input string. However since the "."
772 : // can only match in one way and always has a constant length (in this case
773 : // of 1) it suffices to store the current position on the top of the stack
774 : // once. Matching now becomes merely incrementing the current position and
775 : // backtracking becomes decrementing the current position and checking the
776 : // result against the stored current position. This is faster and saves
777 : // space.
778 : // * The current state is virtualized.
779 : // This is used to defer expensive operations until it is clear that they
780 : // are needed and to generate code for a node more than once, allowing
781 : // specialized and efficient versions of the code to be created. This is
782 : // explained in the section below.
783 : //
784 : // Execution state virtualization.
785 : //
786 : // Instead of emitting code, nodes that manipulate the state can record their
787 : // manipulation in an object called the Trace. The Trace object can record a
788 : // current position offset, an optional backtrack code location on the top of
789 : // the virtualized backtrack stack and some register changes. When a node is
790 : // to be emitted it can flush the Trace or update it. Flushing the Trace
791 : // will emit code to bring the actual state into line with the virtual state.
792 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
793 : // registers). Postponing work can save time when executing the regular
794 : // expression since it may be found that the work never has to be done as a
795 : // failure to match can occur. In addition it is much faster to jump to a
796 : // known backtrack code location than it is to pop an unknown backtrack
797 : // location from the stack and jump there.
798 : //
799 : // The virtual state found in the Trace affects code generation. For example
800 : // the virtual state contains the difference between the actual current
801 : // position and the virtual current position, and matching code needs to use
802 : // this offset to attempt a match in the correct location of the input
803 : // string. Therefore code generated for a non-trivial trace is specialized
804 : // to that trace. The code generator therefore has the ability to generate
805 : // code for each node several times. In order to limit the size of the
806 : // generated code there is an arbitrary limit on how many specialized sets of
807 : // code may be generated for a given node. If the limit is reached, the
808 : // trace is flushed and a generic version of the code for a node is emitted.
809 : // This is subsequently used for that node. The code emitted for a non-generic
810 : // trace is not recorded in the node and so it cannot currently be reused in
811 : // the event that code generation is requested for an identical trace.
812 :
813 :
814 0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
815 0 : UNREACHABLE();
816 : }
817 :
818 :
819 99371 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
820 : text->AddElement(TextElement::Atom(this), zone);
821 99371 : }
822 :
823 :
824 7635 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
825 : text->AddElement(TextElement::CharClass(this), zone);
826 7635 : }
827 :
828 :
829 0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
830 0 : for (int i = 0; i < elements()->length(); i++)
831 : text->AddElement(elements()->at(i), zone);
832 0 : }
833 :
834 :
835 0 : TextElement TextElement::Atom(RegExpAtom* atom) {
836 0 : return TextElement(ATOM, atom);
837 : }
838 :
839 :
840 0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
841 0 : return TextElement(CHAR_CLASS, char_class);
842 : }
843 :
844 :
845 7445146 : int TextElement::length() const {
846 7445146 : switch (text_type()) {
847 : case ATOM:
848 6603781 : return atom()->length();
849 :
850 : case CHAR_CLASS:
851 : return 1;
852 : }
853 0 : UNREACHABLE();
854 : }
855 :
856 :
857 0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
858 0 : if (table_ == nullptr) {
859 0 : table_ = new(zone()) DispatchTable(zone());
860 : DispatchTableConstructor cons(table_, ignore_case, zone());
861 : cons.BuildTable(this);
862 : }
863 0 : return table_;
864 : }
865 :
866 :
867 : class FrequencyCollator {
868 : public:
869 11060331 : FrequencyCollator() : total_samples_(0) {
870 22034923 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
871 10974592 : frequencies_[i] = CharacterFrequency(i);
872 : }
873 : }
874 :
875 : void CountCharacter(int character) {
876 456959 : int index = (character & RegExpMacroAssembler::kTableMask);
877 : frequencies_[index].Increment();
878 456959 : total_samples_++;
879 : }
880 :
881 : // Does not measure in percent, but rather per-128 (the table size from the
882 : // regexp macro assembler).
883 : int Frequency(int in_character) {
884 : DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
885 490082 : if (total_samples_ < 1) return 1; // Division by zero.
886 : int freq_in_per128 =
887 489817 : (frequencies_[in_character].counter() * 128) / total_samples_;
888 : return freq_in_per128;
889 : }
890 :
891 : private:
892 : class CharacterFrequency {
893 : public:
894 10974592 : CharacterFrequency() : counter_(0), character_(-1) { }
895 : explicit CharacterFrequency(int character)
896 : : counter_(0), character_(character) { }
897 :
898 456959 : void Increment() { counter_++; }
899 : int counter() { return counter_; }
900 : int character() { return character_; }
901 :
902 : private:
903 : int counter_;
904 : int character_;
905 : };
906 :
907 :
908 : private:
909 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
910 : int total_samples_;
911 : };
912 :
913 :
914 : class RegExpCompiler {
915 : public:
916 : RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
917 : bool is_one_byte);
918 :
919 : int AllocateRegister() {
920 909516 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
921 310203 : reg_exp_too_big_ = true;
922 : return next_register_;
923 : }
924 599313 : return next_register_++;
925 : }
926 :
927 : // Lookarounds to match lone surrogates for unicode character class matches
928 : // are never nested. We can therefore reuse registers.
929 : int UnicodeLookaroundStackRegister() {
930 2460 : if (unicode_lookaround_stack_register_ == kNoRegister) {
931 1040 : unicode_lookaround_stack_register_ = AllocateRegister();
932 : }
933 2460 : return unicode_lookaround_stack_register_;
934 : }
935 :
936 : int UnicodeLookaroundPositionRegister() {
937 2460 : if (unicode_lookaround_position_register_ == kNoRegister) {
938 1040 : unicode_lookaround_position_register_ = AllocateRegister();
939 : }
940 2460 : return unicode_lookaround_position_register_;
941 : }
942 :
943 : RegExpEngine::CompilationResult Assemble(Isolate* isolate,
944 : RegExpMacroAssembler* assembler,
945 : RegExpNode* start, int capture_count,
946 : Handle<String> pattern);
947 :
948 594380 : inline void AddWork(RegExpNode* node) {
949 949519 : if (!node->on_work_list() && !node->label()->is_bound()) {
950 : node->set_on_work_list(true);
951 210984 : work_list_->push_back(node);
952 : }
953 594380 : }
954 :
955 : static const int kImplementationOffset = 0;
956 : static const int kNumberOfRegistersOffset = 0;
957 : static const int kCodeOffset = 1;
958 :
959 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
960 : EndNode* accept() { return accept_; }
961 :
962 : static const int kMaxRecursion = 100;
963 : inline int recursion_depth() { return recursion_depth_; }
964 1000487 : inline void IncrementRecursionDepth() { recursion_depth_++; }
965 1000487 : inline void DecrementRecursionDepth() { recursion_depth_--; }
966 :
967 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
968 :
969 : inline bool one_byte() { return one_byte_; }
970 : inline bool optimize() { return optimize_; }
971 84574 : inline void set_optimize(bool value) { optimize_ = value; }
972 : inline bool limiting_recursion() { return limiting_recursion_; }
973 : inline void set_limiting_recursion(bool value) {
974 957024 : limiting_recursion_ = value;
975 : }
976 : bool read_backward() { return read_backward_; }
977 3336 : void set_read_backward(bool value) { read_backward_ = value; }
978 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
979 :
980 : int current_expansion_factor() { return current_expansion_factor_; }
981 : void set_current_expansion_factor(int value) {
982 85695 : current_expansion_factor_ = value;
983 : }
984 :
985 : Isolate* isolate() const { return isolate_; }
986 : Zone* zone() const { return zone_; }
987 :
988 : static const int kNoRegister = -1;
989 :
990 : private:
991 : EndNode* accept_;
992 : int next_register_;
993 : int unicode_lookaround_stack_register_;
994 : int unicode_lookaround_position_register_;
995 : std::vector<RegExpNode*>* work_list_;
996 : int recursion_depth_;
997 : RegExpMacroAssembler* macro_assembler_;
998 : bool one_byte_;
999 : bool reg_exp_too_big_;
1000 : bool limiting_recursion_;
1001 : bool optimize_;
1002 : bool read_backward_;
1003 : int current_expansion_factor_;
1004 : FrequencyCollator frequency_collator_;
1005 : Isolate* isolate_;
1006 : Zone* zone_;
1007 : };
1008 :
1009 :
1010 : class RecursionCheck {
1011 : public:
1012 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1013 : compiler->IncrementRecursionDepth();
1014 : }
1015 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1016 : private:
1017 : RegExpCompiler* compiler_;
1018 : };
1019 :
1020 :
1021 : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1022 : return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1023 : }
1024 :
1025 :
1026 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1027 : // a fixed array or a null handle depending on whether it succeeded.
1028 85739 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1029 : bool one_byte)
1030 85739 : : next_register_(2 * (capture_count + 1)),
1031 : unicode_lookaround_stack_register_(kNoRegister),
1032 : unicode_lookaround_position_register_(kNoRegister),
1033 : work_list_(nullptr),
1034 : recursion_depth_(0),
1035 : one_byte_(one_byte),
1036 : reg_exp_too_big_(false),
1037 : limiting_recursion_(false),
1038 : optimize_(FLAG_regexp_optimization),
1039 : read_backward_(false),
1040 : current_expansion_factor_(1),
1041 : frequency_collator_(),
1042 : isolate_(isolate),
1043 171478 : zone_(zone) {
1044 85739 : accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1045 : DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1046 85739 : }
1047 :
1048 85399 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1049 : Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1050 : int capture_count, Handle<String> pattern) {
1051 : #ifdef DEBUG
1052 : if (FLAG_trace_regexp_assembler)
1053 : macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1054 : else
1055 : #endif
1056 85399 : macro_assembler_ = macro_assembler;
1057 :
1058 : std::vector<RegExpNode*> work_list;
1059 85399 : work_list_ = &work_list;
1060 85399 : Label fail;
1061 85399 : macro_assembler_->PushBacktrack(&fail);
1062 : Trace new_trace;
1063 85399 : start->Emit(this, &new_trace);
1064 85399 : macro_assembler_->Bind(&fail);
1065 85399 : macro_assembler_->Fail();
1066 296383 : while (!work_list.empty()) {
1067 210984 : RegExpNode* node = work_list.back();
1068 : work_list.pop_back();
1069 : node->set_on_work_list(false);
1070 210984 : if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1071 : }
1072 85399 : if (reg_exp_too_big_) {
1073 0 : macro_assembler_->AbortedCodeGeneration();
1074 0 : return IrregexpRegExpTooBig(isolate_);
1075 : }
1076 :
1077 85399 : Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1078 170798 : isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1079 85399 : work_list_ = nullptr;
1080 : #ifdef ENABLE_DISASSEMBLER
1081 : if (FLAG_print_code && !FLAG_regexp_interpret_all) {
1082 : CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1083 : OFStream os(trace_scope.file());
1084 : Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1085 : }
1086 : #endif
1087 : #ifdef DEBUG
1088 : if (FLAG_trace_regexp_assembler) {
1089 : delete macro_assembler_;
1090 : }
1091 : #endif
1092 85399 : return RegExpEngine::CompilationResult(*code, next_register_);
1093 : }
1094 :
1095 :
1096 0 : bool Trace::DeferredAction::Mentions(int that) {
1097 2465627 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
1098 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1099 : return range.Contains(that);
1100 : } else {
1101 2418147 : return reg() == that;
1102 : }
1103 : }
1104 :
1105 :
1106 0 : bool Trace::mentions_reg(int reg) {
1107 0 : for (DeferredAction* action = actions_; action != nullptr;
1108 : action = action->next()) {
1109 0 : if (action->Mentions(reg))
1110 : return true;
1111 : }
1112 : return false;
1113 : }
1114 :
1115 :
1116 973 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1117 : DCHECK_EQ(0, *cp_offset);
1118 1516 : for (DeferredAction* action = actions_; action != nullptr;
1119 : action = action->next()) {
1120 953 : if (action->Mentions(reg)) {
1121 410 : if (action->action_type() == ActionNode::STORE_POSITION) {
1122 410 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1123 410 : return true;
1124 : } else {
1125 : return false;
1126 : }
1127 : }
1128 : }
1129 : return false;
1130 : }
1131 :
1132 :
1133 512388 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
1134 : Zone* zone) {
1135 : int max_register = RegExpCompiler::kNoRegister;
1136 933064 : for (DeferredAction* action = actions_; action != nullptr;
1137 : action = action->next()) {
1138 420676 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1139 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1140 93130 : for (int i = range.from(); i <= range.to(); i++)
1141 45084 : affected_registers->Set(i, zone);
1142 2962 : if (range.to() > max_register) max_register = range.to();
1143 : } else {
1144 417714 : affected_registers->Set(action->reg(), zone);
1145 417714 : if (action->reg() > max_register) max_register = action->reg();
1146 : }
1147 : }
1148 512388 : return max_register;
1149 : }
1150 :
1151 :
1152 512388 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1153 : int max_register,
1154 : const OutSet& registers_to_pop,
1155 : const OutSet& registers_to_clear) {
1156 20425968 : for (int reg = max_register; reg >= 0; reg--) {
1157 19913580 : if (registers_to_pop.Get(reg)) {
1158 52908 : assembler->PopRegister(reg);
1159 9903882 : } else if (registers_to_clear.Get(reg)) {
1160 : int clear_to = reg;
1161 472005 : while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1162 105531 : reg--;
1163 : }
1164 77706 : assembler->ClearRegisters(reg, clear_to);
1165 : }
1166 : }
1167 512388 : }
1168 :
1169 :
// Emits code that applies the deferred actions of this trace to the real
// registers. For every register in |affected_registers| (0..max_register)
// the action list is scanned once to work out
//   - the net effect to emit now (store position, clear, set, or advance), and
//   - how the register is undone on backtrack: it is either pushed here and
//     recorded in |registers_to_pop|, recorded in |registers_to_clear|, or
//     ignored.
// |zone| is used when adding entries to the two output sets.
1170 512388 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1171 : int max_register,
1172 : const OutSet& affected_registers,
1173 : OutSet* registers_to_pop,
1174 : OutSet* registers_to_clear,
1175 : Zone* zone) {
1176 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1177 512388 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1178 :
1179 : // Count pushes performed to force a stack limit check occasionally.
1180 : int pushes = 0;
1181 :
1182 20637030 : for (int reg = 0; reg <= max_register; reg++) {
1183 20124642 : if (!affected_registers.Get(reg)) {
1184 : continue;
1185 : }
1186 :
1187 : // The chronologically first deferred action in the trace
1188 : // is used to infer the action needed to restore a register
1189 : // to its previous state (or not, if it's safe to ignore it).
1190 : enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1191 : DeferredActionUndoType undo_action = IGNORE;
1192 :
// Net effect accumulated for this register while scanning the actions:
// |value| is the amount to set or add, |absolute| says that |value| is a
// final value rather than an increment, |clear| that the register ends up
// cleared, and |store_position| the cp offset to store (kNoStore if none).
1193 : int value = 0;
1194 : bool absolute = false;
1195 : bool clear = false;
1196 : static const int kNoStore = kMinInt;
1197 : int store_position = kNoStore;
1198 : // This is a little tricky because we are scanning the actions in reverse
1199 : // historical order (newest first).
1200 2920901 : for (DeferredAction* action = actions_; action != nullptr;
1201 : action = action->next()) {
1202 2464674 : if (action->Mentions(reg)) {
1203 462798 : switch (action->action_type()) {
1204 : case ActionNode::SET_REGISTER: {
1205 : Trace::DeferredSetRegister* psr =
1206 : static_cast<Trace::DeferredSetRegister*>(action);
1207 3471 : if (!absolute) {
1208 3471 : value += psr->value();
1209 : absolute = true;
1210 : }
1211 : // SET_REGISTER is currently only used for newly introduced loop
1212 : // counters. They can have a significant previous value if they
1213 : // occur in a loop. TODO(lrn): Propagate this information, so
1214 : // we can set undo_action to IGNORE if we know there is no value to
1215 : // restore.
1216 : undo_action = RESTORE;
1217 : DCHECK_EQ(store_position, kNoStore);
1218 : DCHECK(!clear);
1219 : break;
1220 : }
1221 : case ActionNode::INCREMENT_REGISTER:
1222 3750 : if (!absolute) {
1223 3750 : value++;
1224 : }
1225 : DCHECK_EQ(store_position, kNoStore);
1226 : DCHECK(!clear);
1227 : undo_action = RESTORE;
1228 : break;
1229 : case ActionNode::STORE_POSITION: {
1230 : Trace::DeferredCapture* pc =
1231 : static_cast<Trace::DeferredCapture*>(action);
1232 410493 : if (!clear && store_position == kNoStore) {
1233 : store_position = pc->cp_offset();
1234 : }
1235 :
1236 : // For captures we know that stores and clears alternate.
1237 : // Other registers are never cleared, and if they occur
1238 : // inside a loop, they might be assigned more than once.
1239 410493 : if (reg <= 1) {
1240 : // Registers zero and one, aka "capture zero", are
1241 : // always set correctly if we succeed. There is no
1242 : // need to undo a setting on backtrack, because we
1243 : // will set it again or fail.
1244 : undo_action = IGNORE;
1245 : } else {
1246 190411 : undo_action = pc->is_capture() ? CLEAR : RESTORE;
1247 : }
1248 : DCHECK(!absolute);
1249 : DCHECK_EQ(value, 0);
1250 : break;
1251 : }
1252 : case ActionNode::CLEAR_CAPTURES: {
1253 : // Since we're scanning in reverse order, if we've already
1254 : // set the position we have to ignore historically earlier
1255 : // clearing operations.
1256 45084 : if (store_position == kNoStore) {
1257 : clear = true;
1258 : }
1259 : undo_action = RESTORE;
1260 : DCHECK(!absolute);
1261 : DCHECK_EQ(value, 0);
1262 : break;
1263 : }
1264 : default:
1265 0 : UNREACHABLE();
1266 : break;
1267 : }
1268 : }
1269 : }
1270 : // Prepare for the undo-action (e.g., push if it's going to be popped).
1271 456227 : if (undo_action == RESTORE) {
1272 52908 : pushes++;
1273 : RegExpMacroAssembler::StackCheckFlag stack_check =
1274 : RegExpMacroAssembler::kNoStackLimitCheck;
// Every push_limit-th push requests a stack limit check and restarts the
// count.
1275 52908 : if (pushes == push_limit) {
1276 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
1277 : pushes = 0;
1278 : }
1279 :
1280 52908 : assembler->PushRegister(reg, stack_check);
1281 52908 : registers_to_pop->Set(reg, zone);
1282 403319 : } else if (undo_action == CLEAR) {
1283 183237 : registers_to_clear->Set(reg, zone);
1284 : }
1285 : // Perform the chronologically last action (or accumulated increment)
1286 : // for the register.
1287 456227 : if (store_position != kNoStore) {
1288 410493 : assembler->WriteCurrentPositionToRegister(reg, store_position);
1289 45734 : } else if (clear) {
1290 38513 : assembler->ClearRegisters(reg, reg);
1291 7221 : } else if (absolute) {
1292 3471 : assembler->SetRegister(reg, value);
1293 3750 : } else if (value != 0) {
1294 3750 : assembler->AdvanceRegister(reg, value);
1295 : }
1296 : }
1297 512388 : }
1298 :
1299 :
1300 : // This is called as we come into a loop choice node and some other tricky
1301 : // nodes. It normalizes the state of the code generator to ensure we can
1302 : // generate generic code.
1303 702001 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1304 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1305 :
1306 : DCHECK(!is_trivial());
1307 :
1308 702001 : if (actions_ == nullptr && backtrack() == nullptr) {
1309 : // Here we just have some deferred cp advances to fix and we are back to
1310 : // a normal situation. We may also have to forget some information gained
1311 : // through a quick check that was already performed.
1312 189613 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1313 : // Create a new trivial state and generate the node with that.
1314 : Trace new_state;
1315 189613 : successor->Emit(compiler, &new_state);
1316 : return;
1317 : }
1318 :
1319 : // Generate deferred actions here along with code to undo them again.
1320 : OutSet affected_registers;
1321 :
1322 512388 : if (backtrack() != nullptr) {
1323 : // Here we have a concrete backtrack location. These are set up by choice
1324 : // nodes and so they indicate that we have a deferred save of the current
1325 : // position which we may need to emit here.
1326 401805 : assembler->PushCurrentPosition();
1327 : }
1328 :
1329 : int max_register = FindAffectedRegisters(&affected_registers,
1330 512388 : compiler->zone());
1331 : OutSet registers_to_pop;
1332 : OutSet registers_to_clear;
1333 : PerformDeferredActions(assembler,
1334 : max_register,
1335 : affected_registers,
1336 : ®isters_to_pop,
1337 : ®isters_to_clear,
1338 512388 : compiler->zone());
1339 512388 : if (cp_offset_ != 0) {
1340 296033 : assembler->AdvanceCurrentPosition(cp_offset_);
1341 : }
1342 :
1343 : // Create a new trivial state and generate the node with that.
1344 512388 : Label undo;
1345 512388 : assembler->PushBacktrack(&undo);
1346 512388 : if (successor->KeepRecursing(compiler)) {
1347 : Trace new_state;
1348 138699 : successor->Emit(compiler, &new_state);
1349 : } else {
1350 373689 : compiler->AddWork(successor);
1351 747378 : assembler->GoTo(successor->label());
1352 : }
1353 :
1354 : // On backtrack we need to restore state.
1355 512388 : assembler->Bind(&undo);
1356 : RestoreAffectedRegisters(assembler,
1357 : max_register,
1358 : registers_to_pop,
1359 512388 : registers_to_clear);
1360 512388 : if (backtrack() == nullptr) {
1361 110583 : assembler->Backtrack();
1362 : } else {
1363 401805 : assembler->PopCurrentPosition();
1364 401805 : assembler->GoTo(backtrack());
1365 : }
1366 : }
1367 :
1368 :
1369 2843 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1370 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1371 :
1372 : // Omit flushing the trace. We discard the entire stack frame anyway.
1373 :
1374 2843 : if (!label()->is_bound()) {
1375 : // We are completely independent of the trace, since we ignore it,
1376 : // so this code can be used as the generic version.
1377 5604 : assembler->Bind(label());
1378 : }
1379 :
1380 : // Throw away everything on the backtrack stack since the start
1381 : // of the negative submatch and restore the character position.
1382 2843 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
1383 2843 : assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1384 2843 : if (clear_capture_count_ > 0) {
1385 : // Clear any captures that might have been performed during the success
1386 : // of the body of the negative look-ahead.
1387 107 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1388 107 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1389 : }
1390 : // Now that we have unwound the stack we find at the top of the stack the
1391 : // backtrack that the BeginSubmatch node got.
1392 2843 : assembler->Backtrack();
1393 2843 : }
1394 :
1395 :
1396 182854 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1397 182854 : if (!trace->is_trivial()) {
1398 91277 : trace->Flush(compiler, this);
1399 91277 : return;
1400 : }
1401 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1402 91577 : if (!label()->is_bound()) {
1403 170776 : assembler->Bind(label());
1404 : }
1405 91577 : switch (action_) {
1406 : case ACCEPT:
1407 91277 : assembler->Succeed();
1408 91277 : return;
1409 : case BACKTRACK:
1410 300 : assembler->GoTo(trace->backtrack());
1411 300 : return;
1412 : case NEGATIVE_SUBMATCH_SUCCESS:
1413 : // This case is handled in a different virtual method.
1414 0 : UNREACHABLE();
1415 : }
1416 0 : UNIMPLEMENTED();
1417 : }
1418 :
1419 :
1420 903942 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1421 1807884 : if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1422 903942 : guards_->Add(guard, zone);
1423 903942 : }
1424 :
1425 :
1426 903365 : ActionNode* ActionNode::SetRegister(int reg,
1427 : int val,
1428 : RegExpNode* on_success) {
1429 : ActionNode* result =
1430 : new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1431 903365 : result->data_.u_store_register.reg = reg;
1432 903365 : result->data_.u_store_register.value = val;
1433 903365 : return result;
1434 : }
1435 :
1436 :
1437 903365 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1438 : ActionNode* result =
1439 : new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1440 903365 : result->data_.u_increment_register.reg = reg;
1441 903365 : return result;
1442 : }
1443 :
1444 :
1445 226407 : ActionNode* ActionNode::StorePosition(int reg,
1446 : bool is_capture,
1447 : RegExpNode* on_success) {
1448 : ActionNode* result =
1449 : new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1450 226407 : result->data_.u_position_register.reg = reg;
1451 226407 : result->data_.u_position_register.is_capture = is_capture;
1452 226407 : return result;
1453 : }
1454 :
1455 :
1456 2382 : ActionNode* ActionNode::ClearCaptures(Interval range,
1457 : RegExpNode* on_success) {
1458 : ActionNode* result =
1459 : new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1460 2382 : result->data_.u_clear_captures.range_from = range.from();
1461 2382 : result->data_.u_clear_captures.range_to = range.to();
1462 2382 : return result;
1463 : }
1464 :
1465 :
1466 4467 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1467 : int position_reg,
1468 : RegExpNode* on_success) {
1469 : ActionNode* result =
1470 : new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1471 4467 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1472 4467 : result->data_.u_submatch.current_position_register = position_reg;
1473 4467 : return result;
1474 : }
1475 :
1476 :
1477 1655 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1478 : int position_reg,
1479 : int clear_register_count,
1480 : int clear_register_from,
1481 : RegExpNode* on_success) {
1482 : ActionNode* result =
1483 : new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1484 1655 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1485 1655 : result->data_.u_submatch.current_position_register = position_reg;
1486 1655 : result->data_.u_submatch.clear_register_count = clear_register_count;
1487 1655 : result->data_.u_submatch.clear_register_from = clear_register_from;
1488 1655 : return result;
1489 : }
1490 :
1491 :
1492 537 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1493 : int repetition_register,
1494 : int repetition_limit,
1495 : RegExpNode* on_success) {
1496 : ActionNode* result =
1497 : new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1498 537 : result->data_.u_empty_match_check.start_register = start_register;
1499 537 : result->data_.u_empty_match_check.repetition_register = repetition_register;
1500 537 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1501 537 : return result;
1502 : }
1503 :
1504 :
1505 : #define DEFINE_ACCEPT(Type) \
1506 : void Type##Node::Accept(NodeVisitor* visitor) { \
1507 : visitor->Visit##Type(this); \
1508 : }
1509 727470 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1510 : #undef DEFINE_ACCEPT
1511 :
1512 :
1513 145900 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1514 145900 : visitor->VisitLoopChoice(this);
1515 145900 : }
1516 :
1517 :
1518 : // -------------------------------------------------------------------
1519 : // Emit code.
1520 :
1521 :
1522 3942 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1523 : Guard* guard,
1524 : Trace* trace) {
1525 3942 : switch (guard->op()) {
1526 : case Guard::LT:
1527 : DCHECK(!trace->mentions_reg(guard->reg()));
1528 : macro_assembler->IfRegisterGE(guard->reg(),
1529 : guard->value(),
1530 2620 : trace->backtrack());
1531 2620 : break;
1532 : case Guard::GEQ:
1533 : DCHECK(!trace->mentions_reg(guard->reg()));
1534 : macro_assembler->IfRegisterLT(guard->reg(),
1535 : guard->value(),
1536 1322 : trace->backtrack());
1537 1322 : break;
1538 : }
1539 3942 : }
1540 :
1541 :
1542 : // Returns the number of characters in the equivalence class, omitting those
1543 : // that cannot occur in the source string because it is Latin1.
1544 22320 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1545 : bool one_byte_subject,
1546 : unibrow::uchar* letters,
1547 : int letter_length) {
1548 : #ifdef V8_INTL_SUPPORT
1549 44640 : icu::UnicodeSet set;
1550 22320 : set.add(character);
1551 22320 : set = set.closeOver(USET_CASE_INSENSITIVE);
1552 22320 : int32_t range_count = set.getRangeCount();
1553 : int items = 0;
1554 109048 : for (int32_t i = 0; i < range_count; i++) {
1555 43364 : UChar32 start = set.getRangeStart(i);
1556 43364 : UChar32 end = set.getRangeEnd(i);
1557 43364 : CHECK(end - start + items <= letter_length);
1558 130388 : while (start <= end) {
1559 44304 : if (one_byte_subject && start > String::kMaxOneByteCharCode) break;
1560 43512 : letters[items++] = (unibrow::uchar)(start);
1561 43512 : start++;
1562 : }
1563 : }
1564 22320 : return items;
1565 : #else
1566 : int length =
1567 : isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1568 : // Unibrow returns 0 or 1 for characters where case independence is
1569 : // trivial.
1570 : if (length == 0) {
1571 : letters[0] = character;
1572 : length = 1;
1573 : }
1574 :
1575 : if (one_byte_subject) {
1576 : int new_length = 0;
1577 : for (int i = 0; i < length; i++) {
1578 : if (letters[i] <= String::kMaxOneByteCharCode) {
1579 : letters[new_length++] = letters[i];
1580 : }
1581 : }
1582 : length = new_length;
1583 : }
1584 :
1585 : return length;
1586 : #endif // V8_INTL_SUPPORT
1587 : }
1588 :
1589 584865 : static inline bool EmitSimpleCharacter(Isolate* isolate,
1590 : RegExpCompiler* compiler,
1591 : uc16 c,
1592 : Label* on_failure,
1593 : int cp_offset,
1594 : bool check,
1595 : bool preloaded) {
1596 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1597 : bool bound_checked = false;
1598 584865 : if (!preloaded) {
1599 584865 : assembler->LoadCurrentCharacter(
1600 : cp_offset,
1601 : on_failure,
1602 1169730 : check);
1603 : bound_checked = true;
1604 : }
1605 584865 : assembler->CheckNotCharacter(c, on_failure);
1606 584865 : return bound_checked;
1607 : }
1608 :
1609 :
1610 : // Only emits non-letters (things that don't have case). Only used for case
1611 : // independent matches.
1612 5619 : static inline bool EmitAtomNonLetter(Isolate* isolate,
1613 : RegExpCompiler* compiler,
1614 : uc16 c,
1615 : Label* on_failure,
1616 : int cp_offset,
1617 : bool check,
1618 : bool preloaded) {
1619 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1620 : bool one_byte = compiler->one_byte();
1621 : unibrow::uchar chars[4];
1622 5619 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
1623 5619 : if (length < 1) {
1624 : // This can't match. Must be an one-byte subject and a non-one-byte
1625 : // character. We do not need to do anything since the one-byte pass
1626 : // already handled this.
1627 : return false; // Bounds not checked.
1628 : }
1629 : bool checked = false;
1630 : // We handle the length > 1 case in a later pass.
1631 5614 : if (length == 1) {
1632 371 : if (one_byte && c > String::kMaxOneByteCharCodeU) {
1633 : // Can't match - see above.
1634 : return false; // Bounds not checked.
1635 : }
1636 371 : if (!preloaded) {
1637 371 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1638 : checked = check;
1639 : }
1640 371 : macro_assembler->CheckNotCharacter(c, on_failure);
1641 : }
1642 : return checked;
1643 : }
1644 :
1645 :
1646 4789 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1647 : bool one_byte, uc16 c1, uc16 c2,
1648 : Label* on_failure) {
1649 : uc16 char_mask;
1650 4789 : if (one_byte) {
1651 : char_mask = String::kMaxOneByteCharCode;
1652 : } else {
1653 : char_mask = String::kMaxUtf16CodeUnit;
1654 : }
1655 4789 : uc16 exor = c1 ^ c2;
1656 : // Check whether exor has only one bit set.
1657 4789 : if (((exor - 1) & exor) == 0) {
1658 : // If c1 and c2 differ only by one bit.
1659 : // Ecma262UnCanonicalize always gives the highest number last.
1660 : DCHECK(c2 > c1);
1661 4699 : uc16 mask = char_mask ^ exor;
1662 4699 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1663 4699 : return true;
1664 : }
1665 : DCHECK(c2 > c1);
1666 90 : uc16 diff = c2 - c1;
1667 90 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
1668 : // If the characters differ by 2^n but don't differ by one bit then
1669 : // subtract the difference from the found character, then do the or
1670 : // trick. We avoid the theoretical case where negative numbers are
1671 : // involved in order to simplify code generation.
1672 60 : uc16 mask = char_mask ^ diff;
1673 60 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1674 : diff,
1675 : mask,
1676 120 : on_failure);
1677 60 : return true;
1678 : }
1679 : return false;
1680 : }
1681 :
// Function type shared by the per-character emitters in this file
// (EmitSimpleCharacter, EmitAtomNonLetter and EmitAtomLetter all have this
// signature).
using EmitCharacterFunction = bool(Isolate* isolate, RegExpCompiler* compiler,
                                   uc16 c, Label* on_failure, int cp_offset,
                                   bool check, bool preloaded);
1685 :
1686 : // Only emits letters (things that have case). Only used for case independent
1687 : // matches.
1688 5619 : static inline bool EmitAtomLetter(Isolate* isolate,
1689 : RegExpCompiler* compiler,
1690 : uc16 c,
1691 : Label* on_failure,
1692 : int cp_offset,
1693 : bool check,
1694 : bool preloaded) {
1695 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1696 : bool one_byte = compiler->one_byte();
1697 : unibrow::uchar chars[4];
1698 5619 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars, 4);
1699 5619 : if (length <= 1) return false;
1700 : // We may not need to check against the end of the input string
1701 : // if this character lies before a character that matched.
1702 5243 : if (!preloaded) {
1703 4882 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1704 : }
1705 5243 : Label ok;
1706 5243 : switch (length) {
1707 : case 2: {
1708 4789 : if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1709 4789 : chars[1], on_failure)) {
1710 : } else {
1711 30 : macro_assembler->CheckCharacter(chars[0], &ok);
1712 30 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
1713 30 : macro_assembler->Bind(&ok);
1714 : }
1715 : break;
1716 : }
1717 : case 4:
1718 60 : macro_assembler->CheckCharacter(chars[3], &ok);
1719 : V8_FALLTHROUGH;
1720 : case 3:
1721 454 : macro_assembler->CheckCharacter(chars[0], &ok);
1722 454 : macro_assembler->CheckCharacter(chars[1], &ok);
1723 454 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
1724 454 : macro_assembler->Bind(&ok);
1725 454 : break;
1726 : default:
1727 0 : UNREACHABLE();
1728 : break;
1729 : }
1730 : return true;
1731 : }
1732 :
1733 :
1734 8607 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1735 : int border,
1736 : Label* fall_through,
1737 : Label* above_or_equal,
1738 : Label* below) {
1739 8607 : if (below != fall_through) {
1740 8242 : masm->CheckCharacterLT(border, below);
1741 8242 : if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1742 : } else {
1743 365 : masm->CheckCharacterGT(border - 1, above_or_equal);
1744 : }
1745 8607 : }
1746 :
1747 :
1748 161283 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1749 : int first,
1750 : int last,
1751 : Label* fall_through,
1752 : Label* in_range,
1753 : Label* out_of_range) {
1754 161283 : if (in_range == fall_through) {
1755 109069 : if (first == last) {
1756 15089 : masm->CheckNotCharacter(first, out_of_range);
1757 : } else {
1758 93980 : masm->CheckCharacterNotInRange(first, last, out_of_range);
1759 : }
1760 : } else {
1761 52214 : if (first == last) {
1762 28524 : masm->CheckCharacter(first, in_range);
1763 : } else {
1764 23690 : masm->CheckCharacterInRange(first, last, in_range);
1765 : }
1766 52214 : if (out_of_range != fall_through) masm->GoTo(out_of_range);
1767 : }
1768 161283 : }
1769 :
1770 :
1771 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1772 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1773 5887 : static void EmitUseLookupTable(
1774 : RegExpMacroAssembler* masm,
1775 : ZoneList<int>* ranges,
1776 : int start_index,
1777 : int end_index,
1778 : int min_char,
1779 : Label* fall_through,
1780 : Label* even_label,
1781 : Label* odd_label) {
1782 : static const int kSize = RegExpMacroAssembler::kTableSize;
1783 : static const int kMask = RegExpMacroAssembler::kTableMask;
1784 :
1785 : int base = (min_char & ~kMask);
1786 : USE(base);
1787 :
1788 : // Assert that everything is on one kTableSize page.
1789 : for (int i = start_index; i <= end_index; i++) {
1790 : DCHECK_EQ(ranges->at(i) & ~kMask, base);
1791 : }
1792 : DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1793 :
1794 : char templ[kSize];
1795 : Label* on_bit_set;
1796 : Label* on_bit_clear;
1797 : int bit;
1798 5887 : if (even_label == fall_through) {
1799 : on_bit_set = odd_label;
1800 : on_bit_clear = even_label;
1801 : bit = 1;
1802 : } else {
1803 : on_bit_set = even_label;
1804 : on_bit_clear = odd_label;
1805 : bit = 0;
1806 : }
1807 254475 : for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1808 124294 : templ[i] = bit;
1809 : }
1810 : int j = 0;
1811 5887 : bit ^= 1;
1812 95590 : for (int i = start_index; i < end_index; i++) {
1813 1209206 : for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1814 514900 : templ[j] = bit;
1815 : }
1816 89703 : bit ^= 1;
1817 : }
1818 234571 : for (int i = j; i < kSize; i++) {
1819 114342 : templ[i] = bit;
1820 : }
1821 : Factory* factory = masm->isolate()->factory();
1822 : // TODO(erikcorry): Cache these.
1823 5887 : Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1824 1512959 : for (int i = 0; i < kSize; i++) {
1825 753536 : ba->set(i, templ[i]);
1826 : }
1827 5887 : masm->CheckBitInTable(ba, on_bit_set);
1828 5887 : if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1829 5887 : }
1830 :
1831 :
1832 36263 : static void CutOutRange(RegExpMacroAssembler* masm,
1833 : ZoneList<int>* ranges,
1834 : int start_index,
1835 : int end_index,
1836 : int cut_index,
1837 : Label* even_label,
1838 : Label* odd_label) {
1839 36263 : bool odd = (((cut_index - start_index) & 1) == 1);
1840 36263 : Label* in_range_label = odd ? odd_label : even_label;
1841 36263 : Label dummy;
1842 72526 : EmitDoubleBoundaryTest(masm,
1843 : ranges->at(cut_index),
1844 72526 : ranges->at(cut_index + 1) - 1,
1845 : &dummy,
1846 : in_range_label,
1847 36263 : &dummy);
1848 : DCHECK(!dummy.is_linked());
1849 : // Cut out the single range by rewriting the array. This creates a new
1850 : // range that is a merger of the two ranges on either side of the one we
1851 : // are cutting out. The oddity of the labels is preserved.
1852 75633 : for (int j = cut_index; j > start_index; j--) {
1853 39370 : ranges->at(j) = ranges->at(j - 1);
1854 : }
1855 184239 : for (int j = cut_index + 1; j < end_index; j++) {
1856 147976 : ranges->at(j) = ranges->at(j + 1);
1857 : }
1858 36263 : }
1859 :
1860 :
1861 : // Unicode case. Split the search space into kSize spaces that are handled
1862 : // with recursion.
1863 19808 : static void SplitSearchSpace(ZoneList<int>* ranges,
1864 : int start_index,
1865 : int end_index,
1866 : int* new_start_index,
1867 : int* new_end_index,
1868 : int* border) {
1869 : static const int kSize = RegExpMacroAssembler::kTableSize;
1870 : static const int kMask = RegExpMacroAssembler::kTableMask;
1871 :
1872 19808 : int first = ranges->at(start_index);
1873 19808 : int last = ranges->at(end_index) - 1;
1874 :
1875 19808 : *new_start_index = start_index;
1876 19808 : *border = (ranges->at(start_index) & ~kMask) + kSize;
1877 274872 : while (*new_start_index < end_index) {
1878 146124 : if (ranges->at(*new_start_index) > *border) break;
1879 127532 : (*new_start_index)++;
1880 : }
1881 : // new_start_index is the index of the first edge that is beyond the
1882 : // current kSize space.
1883 :
1884 : // For very large search spaces we do a binary chop search of the non-Latin1
1885 : // space instead of just going to the end of the current kSize space. The
1886 : // heuristics are complicated a little by the fact that any 128-character
1887 : // encoding space can be quickly tested with a table lookup, so we don't
1888 : // wish to do binary chop search at a smaller granularity than that. A
1889 : // 128-character space can take up a lot of space in the ranges array if,
1890 : // for example, we only want to match every second character (eg. the lower
1891 : // case characters on some Unicode pages).
1892 19808 : int binary_chop_index = (end_index + start_index) / 2;
1893 : // The first test ensures that we get to the code that handles the Latin1
1894 : // range with a single not-taken branch, speeding up this important
1895 : // character range (even non-Latin1 charset-based text has spaces and
1896 : // punctuation).
1897 54537 : if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1898 27743 : end_index - start_index > (*new_start_index - start_index) * 2 &&
1899 56223 : last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1900 23386 : ranges->at(binary_chop_index) >= first + 2 * kSize) {
1901 : int scan_forward_for_section_border = binary_chop_index;;
1902 9755 : int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1903 :
1904 126661 : while (scan_forward_for_section_border < end_index) {
1905 66206 : if (ranges->at(scan_forward_for_section_border) > new_border) {
1906 7753 : *new_start_index = scan_forward_for_section_border;
1907 7753 : *border = new_border;
1908 7753 : break;
1909 : }
1910 58453 : scan_forward_for_section_border++;
1911 : }
1912 : }
1913 :
1914 : DCHECK(*new_start_index > start_index);
1915 19808 : *new_end_index = *new_start_index - 1;
1916 19808 : if (ranges->at(*new_end_index) == *border) {
1917 2843 : (*new_end_index)--;
1918 : }
1919 39616 : if (*border >= ranges->at(end_index)) {
1920 1214 : *border = ranges->at(end_index);
1921 1214 : *new_start_index = end_index; // Won't be used.
1922 1214 : *new_end_index = end_index - 1;
1923 : }
1924 19808 : }
1925 :
1926 : // Gets a series of segment boundaries representing a character class. If the
1927 : // character is in the range between an even and an odd boundary (counting from
1928 : // start_index) then go to even_label, otherwise go to odd_label. We already
1929 : // know that the character is in the range of min_char to max_char inclusive.
1930 : // Either label can be nullptr indicating backtracking. Either label can also
1931 : // be equal to the fall_through label.
1932 203831 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1933 : int start_index, int end_index, uc32 min_char,
1934 : uc32 max_char, Label* fall_through,
1935 : Label* even_label, Label* odd_label) {
1936 : DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1937 : DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1938 :
1939 203831 : int first = ranges->at(start_index);
1940 203831 : int last = ranges->at(end_index) - 1;
1941 :
1942 : DCHECK_LT(min_char, first);
1943 :
1944 : // Just need to test if the character is before or on-or-after
1945 : // a particular character.
1946 203831 : if (start_index == end_index) {
1947 8607 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1948 8607 : return;
1949 : }
1950 :
1951 : // Another almost trivial case: There is one interval in the middle that is
1952 : // different from the end intervals.
1953 195224 : if (start_index + 1 == end_index) {
1954 : EmitDoubleBoundaryTest(
1955 125020 : masm, first, last, fall_through, even_label, odd_label);
1956 125020 : return;
1957 : }
1958 :
1959 : // It's not worth using table lookup if there are very few intervals in the
1960 : // character class.
1961 70204 : if (end_index - start_index <= 6) {
1962 : // It is faster to test for individual characters, so we look for those
1963 : // first, then try arbitrary ranges in the second round.
1964 : static int kNoCutIndex = -1;
1965 36263 : int cut = kNoCutIndex;
1966 111658 : for (int i = start_index; i < end_index; i++) {
1967 189112 : if (ranges->at(i) == ranges->at(i + 1) - 1) {
1968 : cut = i;
1969 : break;
1970 : }
1971 : }
1972 36263 : if (cut == kNoCutIndex) cut = start_index;
1973 : CutOutRange(
1974 36263 : masm, ranges, start_index, end_index, cut, even_label, odd_label);
1975 : DCHECK_GE(end_index - start_index, 2);
1976 36263 : GenerateBranches(masm,
1977 : ranges,
1978 : start_index + 1,
1979 : end_index - 1,
1980 : min_char,
1981 : max_char,
1982 : fall_through,
1983 : even_label,
1984 36263 : odd_label);
1985 36263 : return;
1986 : }
1987 :
1988 : // If there are a lot of intervals in the regexp, then we will use tables to
1989 : // determine whether the character is inside or outside the character class.
1990 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1991 :
1992 33941 : if ((max_char >> kBits) == (min_char >> kBits)) {
1993 : EmitUseLookupTable(masm,
1994 : ranges,
1995 : start_index,
1996 : end_index,
1997 : min_char,
1998 : fall_through,
1999 : even_label,
2000 5887 : odd_label);
2001 5887 : return;
2002 : }
2003 :
2004 28054 : if ((min_char >> kBits) != (first >> kBits)) {
2005 8246 : masm->CheckCharacterLT(first, odd_label);
2006 : GenerateBranches(masm,
2007 : ranges,
2008 : start_index + 1,
2009 : end_index,
2010 : first,
2011 : max_char,
2012 : fall_through,
2013 : odd_label,
2014 8246 : even_label);
2015 8246 : return;
2016 : }
2017 :
2018 19808 : int new_start_index = 0;
2019 19808 : int new_end_index = 0;
2020 19808 : int border = 0;
2021 :
2022 : SplitSearchSpace(ranges,
2023 : start_index,
2024 : end_index,
2025 : &new_start_index,
2026 : &new_end_index,
2027 19808 : &border);
2028 :
2029 19808 : Label handle_rest;
2030 : Label* above = &handle_rest;
2031 19808 : if (border == last + 1) {
2032 : // We didn't find any section that started after the limit, so everything
2033 : // above the border is one of the terminal labels.
2034 1214 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2035 : DCHECK(new_end_index == end_index - 1);
2036 : }
2037 :
2038 : DCHECK_LE(start_index, new_end_index);
2039 : DCHECK_LE(new_start_index, end_index);
2040 : DCHECK_LT(start_index, new_start_index);
2041 : DCHECK_LT(new_end_index, end_index);
2042 : DCHECK(new_end_index + 1 == new_start_index ||
2043 : (new_end_index + 2 == new_start_index &&
2044 : border == ranges->at(new_end_index + 1)));
2045 : DCHECK_LT(min_char, border - 1);
2046 : DCHECK_LT(border, max_char);
2047 : DCHECK_LT(ranges->at(new_end_index), border);
2048 : DCHECK(border < ranges->at(new_start_index) ||
2049 : (border == ranges->at(new_start_index) &&
2050 : new_start_index == end_index &&
2051 : new_end_index == end_index - 1 &&
2052 : border == last + 1));
2053 : DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2054 :
2055 19808 : masm->CheckCharacterGT(border - 1, above);
2056 19808 : Label dummy;
2057 19808 : GenerateBranches(masm,
2058 : ranges,
2059 : start_index,
2060 : new_end_index,
2061 : min_char,
2062 : border - 1,
2063 : &dummy,
2064 : even_label,
2065 19808 : odd_label);
2066 19808 : if (handle_rest.is_linked()) {
2067 18594 : masm->Bind(&handle_rest);
2068 18594 : bool flip = (new_start_index & 1) != (start_index & 1);
2069 18594 : GenerateBranches(masm,
2070 : ranges,
2071 : new_start_index,
2072 : end_index,
2073 : border,
2074 : max_char,
2075 : &dummy,
2076 : flip ? odd_label : even_label,
2077 18594 : flip ? even_label : odd_label);
2078 : }
2079 : }
2080 :
2081 :
2082 212367 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2083 : RegExpCharacterClass* cc, bool one_byte,
2084 : Label* on_failure, int cp_offset, bool check_offset,
2085 : bool preloaded, Zone* zone) {
2086 : ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2087 212367 : CharacterRange::Canonicalize(ranges);
2088 :
2089 : int max_char;
2090 212367 : if (one_byte) {
2091 : max_char = String::kMaxOneByteCharCode;
2092 : } else {
2093 : max_char = String::kMaxUtf16CodeUnit;
2094 : }
2095 :
2096 : int range_count = ranges->length();
2097 :
2098 212367 : int last_valid_range = range_count - 1;
2099 577443 : while (last_valid_range >= 0) {
2100 : CharacterRange& range = ranges->at(last_valid_range);
2101 394870 : if (range.from() <= max_char) {
2102 : break;
2103 : }
2104 182538 : last_valid_range--;
2105 : }
2106 :
2107 212367 : if (last_valid_range < 0) {
2108 35 : if (!cc->is_negated()) {
2109 10 : macro_assembler->GoTo(on_failure);
2110 : }
2111 35 : if (check_offset) {
2112 33 : macro_assembler->CheckPosition(cp_offset, on_failure);
2113 : }
2114 91447 : return;
2115 : }
2116 :
2117 402016 : if (last_valid_range == 0 &&
2118 : ranges->at(0).IsEverything(max_char)) {
2119 82625 : if (cc->is_negated()) {
2120 31 : macro_assembler->GoTo(on_failure);
2121 : } else {
2122 : // This is a common case hit by non-anchored expressions.
2123 82594 : if (check_offset) {
2124 53793 : macro_assembler->CheckPosition(cp_offset, on_failure);
2125 : }
2126 : }
2127 : return;
2128 : }
2129 :
2130 129707 : if (!preloaded) {
2131 117457 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2132 : }
2133 :
2134 140616 : if (cc->is_standard(zone) &&
2135 10909 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2136 10909 : on_failure)) {
2137 : return;
2138 : }
2139 :
2140 :
2141 : // A new list with ascending entries. Each entry is a code unit
2142 : // where there is a boundary between code units that are part of
2143 : // the class and code units that are not. Normally we insert an
2144 : // entry at zero which goes to the failure label, but if there
2145 : // was already one there we fall through for success on that entry.
2146 : // Subsequent entries have alternating meaning (success/failure).
2147 : ZoneList<int>* range_boundaries =
2148 : new(zone) ZoneList<int>(last_valid_range, zone);
2149 :
2150 120920 : bool zeroth_entry_is_failure = !cc->is_negated();
2151 :
2152 567360 : for (int i = 0; i <= last_valid_range; i++) {
2153 : CharacterRange& range = ranges->at(i);
2154 223220 : if (range.from() == 0) {
2155 : DCHECK_EQ(i, 0);
2156 3388 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
2157 : } else {
2158 219832 : range_boundaries->Add(range.from(), zone);
2159 : }
2160 223220 : range_boundaries->Add(range.to() + 1, zone);
2161 : }
2162 120920 : int end_index = range_boundaries->length() - 1;
2163 120920 : if (range_boundaries->at(end_index) > max_char) {
2164 3986 : end_index--;
2165 : }
2166 :
2167 120920 : Label fall_through;
2168 120920 : GenerateBranches(macro_assembler,
2169 : range_boundaries,
2170 : 0, // start_index.
2171 : end_index,
2172 : 0, // min_char.
2173 : max_char,
2174 : &fall_through,
2175 : zeroth_entry_is_failure ? &fall_through : on_failure,
2176 120920 : zeroth_entry_is_failure ? on_failure : &fall_through);
2177 120920 : macro_assembler->Bind(&fall_through);
2178 : }
2179 :
// Out-of-line definition of the defaulted RegExpNode destructor.
RegExpNode::~RegExpNode() = default;
2181 :
2182 1701150 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2183 : Trace* trace) {
2184 : // If we are generating a greedy loop then don't stop and don't reuse code.
2185 1701150 : if (trace->stop_node() != nullptr) {
2186 : return CONTINUE;
2187 : }
2188 :
2189 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2190 1689447 : if (trace->is_trivial()) {
2191 1062203 : if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2192 : // If a generic version is already scheduled to be generated or we have
2193 : // recursed too deeply then just generate a jump to that code.
2194 220691 : macro_assembler->GoTo(&label_);
2195 : // This will queue it up for generation of a generic version if it hasn't
2196 : // already been queued.
2197 220691 : compiler->AddWork(this);
2198 220691 : return DONE;
2199 : }
2200 : // Generate generic version of the node and bind the label for later use.
2201 392113 : macro_assembler->Bind(&label_);
2202 392113 : return CONTINUE;
2203 : }
2204 :
2205 : // We are being asked to make a non-generic version. Keep track of how many
2206 : // non-generic versions we generate so as not to overdo it.
2207 1076643 : trace_count_++;
2208 1076643 : if (KeepRecursing(compiler) && compiler->optimize() &&
2209 : trace_count_ < kMaxCopiesCodeGenerated) {
2210 : return CONTINUE;
2211 : }
2212 :
2213 : // If we get here code has been generated for this node too many times or
2214 : // recursion is too deep. Time to switch to a generic version. The code for
2215 : // generic versions above can handle deep recursion properly.
2216 : bool was_limiting = compiler->limiting_recursion();
2217 : compiler->set_limiting_recursion(true);
2218 478512 : trace->Flush(compiler, this);
2219 : compiler->set_limiting_recursion(was_limiting);
2220 478512 : return DONE;
2221 : }
2222 :
2223 :
2224 0 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2225 2038430 : return !compiler->limiting_recursion() &&
2226 0 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2227 : }
2228 :
2229 :
2230 584331 : int ActionNode::EatsAtLeast(int still_to_find,
2231 : int budget,
2232 : bool not_at_start) {
2233 584331 : if (budget <= 0) return 0;
2234 571265 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2235 566102 : return on_success()->EatsAtLeast(still_to_find,
2236 : budget - 1,
2237 1132204 : not_at_start);
2238 : }
2239 :
2240 :
2241 90669 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2242 : BoyerMooreLookahead* bm, bool not_at_start) {
2243 90669 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2244 90669 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2245 : }
2246 : SaveBMInfo(bm, not_at_start, offset);
2247 90669 : }
2248 :
2249 :
2250 10347 : int AssertionNode::EatsAtLeast(int still_to_find,
2251 : int budget,
2252 : bool not_at_start) {
2253 10347 : if (budget <= 0) return 0;
2254 : // If we know we are not at the start and we are asked "how many characters
2255 : // will you match if you succeed?" then we can answer anything since false
2256 : // implies false. So lets just return the max answer (still_to_find) since
2257 : // that won't prevent us from preloading a lot of characters for the other
2258 : // branches in the node graph.
2259 9326 : if (assertion_type() == AT_START && not_at_start) return still_to_find;
2260 9104 : return on_success()->EatsAtLeast(still_to_find,
2261 : budget - 1,
2262 18208 : not_at_start);
2263 : }
2264 :
2265 :
2266 379 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2267 : BoyerMooreLookahead* bm, bool not_at_start) {
2268 : // Match the behaviour of EatsAtLeast on this node.
2269 379 : if (assertion_type() == AT_START && not_at_start) return;
2270 363 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2271 : SaveBMInfo(bm, not_at_start, offset);
2272 : }
2273 :
2274 :
2275 3112 : int BackReferenceNode::EatsAtLeast(int still_to_find,
2276 : int budget,
2277 : bool not_at_start) {
2278 3112 : if (read_backward()) return 0;
2279 3002 : if (budget <= 0) return 0;
2280 3002 : return on_success()->EatsAtLeast(still_to_find,
2281 : budget - 1,
2282 6004 : not_at_start);
2283 : }
2284 :
2285 :
2286 5766745 : int TextNode::EatsAtLeast(int still_to_find,
2287 : int budget,
2288 : bool not_at_start) {
2289 5766745 : if (read_backward()) return 0;
2290 5764893 : int answer = Length();
2291 5764893 : if (answer >= still_to_find) return answer;
2292 3399790 : if (budget <= 0) return answer;
2293 : // We are not at start after this node so we set the last argument to 'true'.
2294 2374227 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
2295 : budget - 1,
2296 4748454 : true);
2297 : }
2298 :
2299 :
2300 9503 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2301 : bool not_at_start) {
2302 9503 : if (budget <= 0) return 0;
2303 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2304 : // afterwards.
2305 9291 : RegExpNode* node = alternatives_->at(1).node();
2306 9291 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2307 : }
2308 :
2309 :
2310 3556 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2311 : QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2312 : bool not_at_start) {
2313 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2314 : // afterwards.
2315 3556 : RegExpNode* node = alternatives_->at(1).node();
2316 3556 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2317 : }
2318 :
2319 :
2320 6946032 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2321 : int budget,
2322 : RegExpNode* ignore_this_node,
2323 : bool not_at_start) {
2324 6946032 : if (budget <= 0) return 0;
2325 : int min = 100;
2326 4819681 : int choice_count = alternatives_->length();
2327 4819681 : budget = (budget - 1) / choice_count;
2328 16633195 : for (int i = 0; i < choice_count; i++) {
2329 10404836 : RegExpNode* node = alternatives_->at(i).node();
2330 10404836 : if (node == ignore_this_node) continue;
2331 : int node_eats_at_least =
2332 10259632 : node->EatsAtLeast(still_to_find, budget, not_at_start);
2333 10259632 : if (node_eats_at_least < min) min = node_eats_at_least;
2334 10259632 : if (min == 0) return 0;
2335 : }
2336 : return min;
2337 : }
2338 :
2339 :
2340 153793 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
2341 : int budget,
2342 : bool not_at_start) {
2343 153793 : return EatsAtLeastHelper(still_to_find,
2344 : budget - 1,
2345 : loop_node_,
2346 153793 : not_at_start);
2347 : }
2348 :
2349 :
2350 6792239 : int ChoiceNode::EatsAtLeast(int still_to_find,
2351 : int budget,
2352 : bool not_at_start) {
2353 6792239 : return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2354 : }
2355 :
2356 :
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
// Zero stays zero.
static inline uint32_t SmearBitsRight(uint32_t v) {
  // Doubling the shift each round (1, 2, 4, 8, 16) covers all 32 bits.
  for (int shift = 1; shift <= 16; shift *= 2) {
    v |= v >> shift;
  }
  return v;
}
2366 :
2367 :
2368 271619 : bool QuickCheckDetails::Rationalize(bool asc) {
2369 : bool found_useful_op = false;
2370 : uint32_t char_mask;
2371 271619 : if (asc) {
2372 : char_mask = String::kMaxOneByteCharCode;
2373 : } else {
2374 : char_mask = String::kMaxUtf16CodeUnit;
2375 : }
2376 271619 : mask_ = 0;
2377 271619 : value_ = 0;
2378 : int char_shift = 0;
2379 1136199 : for (int i = 0; i < characters_; i++) {
2380 : Position* pos = &positions_[i];
2381 432290 : if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2382 : found_useful_op = true;
2383 : }
2384 432290 : mask_ |= (pos->mask & char_mask) << char_shift;
2385 432290 : value_ |= (pos->value & char_mask) << char_shift;
2386 432290 : char_shift += asc ? 8 : 16;
2387 : }
2388 271619 : return found_useful_op;
2389 : }
2390 :
2391 :
2392 476778 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2393 : Trace* bounds_check_trace,
2394 : Trace* trace,
2395 : bool preload_has_checked_bounds,
2396 : Label* on_possible_success,
2397 : QuickCheckDetails* details,
2398 : bool fall_through_on_failure) {
2399 476778 : if (details->characters() == 0) return false;
2400 271729 : GetQuickCheckDetails(
2401 543458 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2402 271729 : if (details->cannot_match()) return false;
2403 271619 : if (!details->Rationalize(compiler->one_byte())) return false;
2404 : DCHECK(details->characters() == 1 ||
2405 : compiler->macro_assembler()->CanReadUnaligned());
2406 : uint32_t mask = details->mask();
2407 : uint32_t value = details->value();
2408 :
2409 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2410 :
2411 226926 : if (trace->characters_preloaded() != details->characters()) {
2412 : DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2413 : // We are attempting to preload the minimum number of characters
2414 : // any choice would eat, so if the bounds check fails, then none of the
2415 : // choices can succeed, so we can just immediately backtrack, rather
2416 : // than go to the next choice.
2417 63051 : assembler->LoadCurrentCharacter(trace->cp_offset(),
2418 : bounds_check_trace->backtrack(),
2419 63051 : !preload_has_checked_bounds,
2420 126102 : details->characters());
2421 : }
2422 :
2423 :
2424 : bool need_mask = true;
2425 :
2426 226926 : if (details->characters() == 1) {
2427 : // If number of characters preloaded is 1 then we used a byte or 16 bit
2428 : // load so the value is already masked down.
2429 : uint32_t char_mask;
2430 82864 : if (compiler->one_byte()) {
2431 : char_mask = String::kMaxOneByteCharCode;
2432 : } else {
2433 : char_mask = String::kMaxUtf16CodeUnit;
2434 : }
2435 82864 : if ((mask & char_mask) == char_mask) need_mask = false;
2436 : mask &= char_mask;
2437 : } else {
2438 : // For 2-character preloads in one-byte mode or 1-character preloads in
2439 : // two-byte mode we also use a 16 bit load with zero extend.
2440 : static const uint32_t kTwoByteMask = 0xFFFF;
2441 : static const uint32_t kFourByteMask = 0xFFFFFFFF;
2442 144062 : if (details->characters() == 2 && compiler->one_byte()) {
2443 127909 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2444 16153 : } else if (details->characters() == 1 && !compiler->one_byte()) {
2445 0 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2446 : } else {
2447 16153 : if (mask == kFourByteMask) need_mask = false;
2448 : }
2449 : }
2450 :
2451 226926 : if (fall_through_on_failure) {
2452 192498 : if (need_mask) {
2453 46384 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2454 : } else {
2455 146114 : assembler->CheckCharacter(value, on_possible_success);
2456 : }
2457 : } else {
2458 34428 : if (need_mask) {
2459 3745 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2460 : } else {
2461 30683 : assembler->CheckNotCharacter(value, trace->backtrack());
2462 : }
2463 : }
2464 : return true;
2465 : }
2466 :
2467 :
2468 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
2469 : // super-class in the .h file).
2470 : //
2471 : // We iterate along the text object, building up for each character a
2472 : // mask and value that can be used to test for a quick failure to match.
2473 : // The masks and values for the positions will be combined into a single
2474 : // machine word for the current character width in order to be used in
2475 : // generating a quick check.
// Fills in one QuickCheckDetails::Position (mask, value, determines_perfectly)
// per character of this text node, starting at index characters_filled_in,
// until details->characters() positions are filled or the elements run out.
// In the latter case the remaining positions are delegated to on_success().
// Calls details->set_cannot_match() and returns early when an element cannot
// match within char_mask. The not_at_start argument is not read in this body;
// the successor is always called with true.
2476 463185 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2477 : RegExpCompiler* compiler,
2478 : int characters_filled_in,
2479 : bool not_at_start) {
2480 : // Do not collect any quick check details if the text node reads backward,
2481 : // since it reads in the opposite direction than we use for quick checks.
2482 463185 : if (read_backward()) return;
2483 : Isolate* isolate = compiler->macro_assembler()->isolate();
2484 : DCHECK(characters_filled_in < details->characters());
2485 : int characters = details->characters();
2486 : int char_mask;
2487 463185 : if (compiler->one_byte()) {
2488 : char_mask = String::kMaxOneByteCharCode;
2489 : } else {
2490 : char_mask = String::kMaxUtf16CodeUnit;
2491 : }
2492 548289 : for (int k = 0; k < elements()->length(); k++) {
2493 467747 : TextElement elm = elements()->at(k);
2494 467747 : if (elm.text_type() == TextElement::ATOM) {
2495 : Vector<const uc16> quarks = elm.atom()->data();
2496 1163807 : for (int i = 0; i < characters && i < quarks.length(); i++) {
2497 : QuickCheckDetails::Position* pos =
2498 : details->positions(characters_filled_in);
2499 949528 : uc16 c = quarks[i];
2500 474764 : if (elm.atom()->ignore_case()) {
2501 : unibrow::uchar chars[4];
2502 6347 : int length = GetCaseIndependentLetters(
2503 6347 : isolate, c, compiler->one_byte(), chars, 4);
2504 6347 : if (length == 0) {
2505 : // This can happen because all case variants are non-Latin1, but we
2506 : // know the input is Latin1.
2507 : details->set_cannot_match();
2508 25 : pos->determines_perfectly = false;
2509 25 : return;
2510 : }
2511 6322 : if (length == 1) {
2512 : // This letter has no case equivalents, so it's nice and simple
2513 : // and the mask-compare will determine definitely whether we have
2514 : // a match at this character position.
2515 1227 : pos->mask = char_mask;
2516 1227 : pos->value = c;
2517 1227 : pos->determines_perfectly = true;
2518 : } else {
// Narrow common_bits to the bit positions on which the case variants
// agree; bits tracks their value at those positions.
2519 5095 : uint32_t common_bits = char_mask;
2520 5095 : uint32_t bits = chars[0];
2521 16241 : for (int j = 1; j < length; j++) {
2522 5573 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2523 5573 : common_bits ^= differing_bits;
2524 5573 : bits &= common_bits;
2525 : }
2526 : // If length is 2 and common bits has only one zero in it then
2527 : // our mask and compare instruction will determine definitely
2528 : // whether we have a match at this character position. Otherwise
2529 : // it can only be an approximate check.
// ~one_zero holds the bits inside char_mask that are not common;
// x & (x - 1) == 0 is true when at most one bit of x is set.
2530 5095 : uint32_t one_zero = (common_bits | ~char_mask);
2531 5095 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2532 4596 : pos->determines_perfectly = true;
2533 : }
2534 5095 : pos->mask = common_bits;
2535 5095 : pos->value = bits;
2536 : }
2537 : } else {
2538 : // Don't ignore case. Nice simple case where the mask-compare will
2539 : // determine definitely whether we have a match at this character
2540 : // position.
2541 468417 : if (c > char_mask) {
2542 : details->set_cannot_match();
2543 25 : pos->determines_perfectly = false;
2544 25 : return;
2545 : }
2546 468392 : pos->mask = char_mask;
2547 468392 : pos->value = c;
2548 468392 : pos->determines_perfectly = true;
2549 : }
2550 474714 : characters_filled_in++;
2551 : DCHECK(characters_filled_in <= details->characters());
2552 474714 : if (characters_filled_in == details->characters()) {
2553 : return;
2554 : }
2555 : }
2556 : } else {
2557 : QuickCheckDetails::Position* pos =
2558 : details->positions(characters_filled_in);
2559 : RegExpCharacterClass* tree = elm.char_class();
2560 : ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2561 : DCHECK(!ranges->is_empty());
2562 125716 : if (tree->is_negated()) {
2563 : // A quick check uses multi-character mask and compare. There is no
2564 : // useful way to incorporate a negative char class into this scheme
2565 : // so we just conservatively create a mask and value that will always
2566 : // succeed.
2567 3506 : pos->mask = 0;
2568 3506 : pos->value = 0;
2569 : } else {
// Skip leading ranges that start above char_mask; if every range does,
// this position cannot match.
2570 : int first_range = 0;
2571 122240 : while (ranges->at(first_range).from() > char_mask) {
2572 80 : first_range++;
2573 80 : if (first_range == ranges->length()) {
2574 : details->set_cannot_match();
2575 50 : pos->determines_perfectly = false;
2576 : return;
2577 : }
2578 : }
2579 122160 : CharacterRange range = ranges->at(first_range);
2580 122160 : uc16 from = range.from();
2581 122160 : uc16 to = range.to();
// Clamp the upper end of the range to char_mask.
2582 122160 : if (to > char_mask) {
2583 15074 : to = char_mask;
2584 : }
2585 122160 : uint32_t differing_bits = (from ^ to);
2586 : // A mask and compare is only perfect if the differing bits form a
2587 : // number like 00011111 with one single block of trailing 1s.
2588 227109 : if ((differing_bits & (differing_bits + 1)) == 0 &&
2589 104949 : from + differing_bits == to) {
2590 95305 : pos->determines_perfectly = true;
2591 : }
2592 122160 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
2593 122160 : uint32_t bits = (from & common_bits);
2594 750722 : for (int i = first_range + 1; i < ranges->length(); i++) {
2595 253201 : CharacterRange range = ranges->at(i);
2596 253201 : uc16 from = range.from();
2597 253201 : uc16 to = range.to();
2598 253201 : if (from > char_mask) continue;
2599 117715 : if (to > char_mask) to = char_mask;
2600 : // Here we are combining more ranges into the mask and compare
2601 : // value. With each new range the mask becomes more sparse and
2602 : // so the chances of a false positive rise. A character class
2603 : // with multiple ranges is assumed never to be equivalent to a
2604 : // mask and compare operation.
2605 117715 : pos->determines_perfectly = false;
2606 117715 : uint32_t new_common_bits = (from ^ to);
2607 117715 : new_common_bits = ~SmearBitsRight(new_common_bits);
2608 117715 : common_bits &= new_common_bits;
2609 117715 : bits &= new_common_bits;
2610 117715 : uint32_t differing_bits = (from & common_bits) ^ bits;
2611 117715 : common_bits ^= differing_bits;
2612 117715 : bits &= common_bits;
2613 : }
2614 122160 : pos->mask = common_bits;
2615 122160 : pos->value = bits;
2616 : }
2617 125666 : characters_filled_in++;
2618 : DCHECK(characters_filled_in <= details->characters());
2619 125666 : if (characters_filled_in == details->characters()) {
2620 : return;
2621 : }
2622 : }
2623 : }
// The elements ran out before all positions were filled: let the successor
// node fill in the rest.
2624 : DCHECK(characters_filled_in != details->characters());
2625 37990 : if (!details->cannot_match()) {
2626 : on_success()-> GetQuickCheckDetails(details,
2627 : compiler,
2628 : characters_filled_in,
2629 37990 : true);
2630 : }
2631 : }
2632 :
2633 :
// Resets mask, value and determines_perfectly of the first characters_
// positions and then sets characters_ to 0.
2634 0 : void QuickCheckDetails::Clear() {
2635 1801552 : for (int i = 0; i < characters_; i++) {
2636 350165 : positions_[i].mask = 0;
2637 350165 : positions_[i].value = 0;
2638 350165 : positions_[i].determines_perfectly = false;
2639 : }
2640 1101222 : characters_ = 0;
2641 0 : }
2642 :
2643 :
// Drops the first `by` positions: the remaining positions are shifted down to
// index 0, the vacated tail positions are reset, and characters_ is reduced by
// `by`. When `by` is negative or covers every position, the details are
// cleared instead. The one_byte argument is not read in this body.
2644 518222 : void QuickCheckDetails::Advance(int by, bool one_byte) {
2645 518222 : if (by >= characters_ || by < 0) {
2646 : DCHECK_IMPLIES(by < 0, characters_ == 0);
2647 : Clear();
2648 : return;
2649 : }
2650 : DCHECK_LE(characters_ - by, 4);
2651 : DCHECK_LE(characters_, 4);
// Shift the surviving positions to the front.
2652 72975 : for (int i = 0; i < characters_ - by; i++) {
2653 24892 : positions_[i] = positions_[by + i];
2654 : }
// Reset the positions that are no longer covered.
2655 72967 : for (int i = characters_ - by; i < characters_; i++) {
2656 24888 : positions_[i].mask = 0;
2657 24888 : positions_[i].value = 0;
2658 24888 : positions_[i].determines_perfectly = false;
2659 : }
2660 23191 : characters_ -= by;
2661 : // We could change mask_ and value_ here but we would never advance unless
2662 : // they had already been used in a check and they won't be used again because
2663 : // it would gain us nothing. So there's no point.
2664 : }
2665 :
2666 :
// Merges `other` into this object for positions from_index up to characters_.
// If `other` cannot match, this object is left untouched; if this object
// cannot match, it becomes a copy of `other`. Otherwise, per position, only
// the mask bits that are set on both sides and on which both values agree are
// kept. Note that other's position values are masked in place as a side
// effect (other_pos->value &= pos->mask).
2667 157490 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2668 : DCHECK(characters_ == other->characters_);
2669 157490 : if (other->cannot_match_) {
2670 : return;
2671 : }
2672 157416 : if (cannot_match_) {
2673 247 : *this = *other;
2674 247 : return;
2675 : }
2676 505865 : for (int i = from_index; i < characters_; i++) {
2677 : QuickCheckDetails::Position* pos = positions(i);
2678 : QuickCheckDetails::Position* other_pos = other->positions(i);
2679 206957 : if (pos->mask != other_pos->mask ||
2680 42719 : pos->value != other_pos->value ||
2681 10110 : !other_pos->determines_perfectly) {
2682 : // Our mask-compare operation will be approximate unless we have the
2683 : // exact same operation on both sides of the alternation.
2684 167044 : pos->determines_perfectly = false;
2685 : }
// Intersect the masks, then drop every mask bit on which the two masked
// values still differ.
2686 174348 : pos->mask &= other_pos->mask;
2687 174348 : pos->value &= pos->mask;
2688 174348 : other_pos->value &= pos->mask;
2689 174348 : uc16 differing_bits = (pos->value ^ other_pos->value);
2690 174348 : pos->mask &= ~differing_bits;
2691 174348 : pos->value &= pos->mask;
2692 : }
2693 : }
2694 :
2695 :
// Scoped guard for NodeInfo::visited: the constructor sets info->visited to
// true (it must be false on entry) and the destructor sets it back to false.
2696 : class VisitMarker {
2697 : public:
2698 : explicit VisitMarker(NodeInfo* info) : info_(info) {
2699 : DCHECK(!info->visited);
2700 197217 : info->visited = true;
2701 : }
2702 : ~VisitMarker() {
2703 173221 : info_->visited = false;
2704 : }
2705 : private:
// The NodeInfo whose visited flag is cleared on destruction.
2706 : NodeInfo* info_;
2707 : };
2708 :
// Returns the previously calculated replacement if there is one, `this`
// unchanged when depth is negative, and otherwise the result of
// FilterSuccessor. The node is marked visited while the successor is filtered.
2709 99131 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2710 99131 : if (info()->replacement_calculated) return replacement();
2711 72400 : if (depth < 0) return this;
2712 : DCHECK(!info()->visited);
2713 : VisitMarker marker(info());
2714 : return FilterSuccessor(depth - 1);
2715 : }
2716 :
// Filters on_success_. If the successor filters to nullptr, this node's
// replacement is set to nullptr too; otherwise on_success_ is updated to the
// filtered successor and this node becomes its own replacement.
// NOTE(review): the visible callers already pass depth - 1 and this function
// subtracts 1 again, so the budget shrinks by two per node — confirm that
// this is intended.
2717 0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2718 132641 : RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2719 132641 : if (next == nullptr) return set_replacement(nullptr);
2720 132159 : on_success_ = next;
2721 132159 : return set_replacement(this);
2722 : }
2723 :
2724 : // We need to check for the following characters: 0x39C 0x3BC 0x178.
// Returns true if `range` contains at least one of those three code points.
2725 1462 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2726 : // TODO(dcarney): this could be a lot more efficient.
2727 4260 : return range.Contains(0x039C) || range.Contains(0x03BC) ||
2728 1462 : range.Contains(0x0178);
2729 : }
2730 :
2731 :
// Returns true if any range in `ranges` satisfies
// RangeContainsLatin1Equivalents, false otherwise (including for an empty
// list).
2732 41 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2733 81 : for (int i = 0; i < ranges->length(); i++) {
2734 : // TODO(dcarney): this could be a lot more efficient.
2735 46 : if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2736 : }
2737 : return false;
2738 : }
2739 :
// Sets the replacement to nullptr when some element of this text node is found
// unable to match within the one-byte range, and otherwise filters the
// successor. Returns the cached replacement if already calculated, and `this`
// unchanged when depth is negative.
//  - Atoms: each character (after TryConvertToLatin1 when the atom ignores
//    case) must not exceed unibrow::Latin1::kMaxChar. The possibly converted
//    character is written back into the atom's data.
//  - Character classes: after canonicalization only the first range is
//    inspected; see the inline comments.
2740 65125 : RegExpNode* TextNode::FilterOneByte(int depth) {
2741 65125 : if (info()->replacement_calculated) return replacement();
2742 60886 : if (depth < 0) return this;
2743 : DCHECK(!info()->visited);
2744 : VisitMarker marker(info());
2745 : int element_count = elements()->length();
2746 190665 : for (int i = 0; i < element_count; i++) {
2747 65347 : TextElement elm = elements()->at(i);
2748 65347 : if (elm.text_type() == TextElement::ATOM) {
2749 : Vector<const uc16> quarks = elm.atom()->data();
2750 145454 : for (int j = 0; j < quarks.length(); j++) {
2751 115234 : uint16_t c = quarks[j];
2752 57617 : if (elm.atom()->ignore_case()) {
2753 : c = unibrow::Latin1::TryConvertToLatin1(c);
2754 : }
2755 57617 : if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2756 : // Replace quark in case we converted to Latin-1.
// The atom's data is modified in place through a const_cast.
2757 : uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2758 57451 : writable_quarks[j] = c;
2759 : }
2760 : } else {
2761 : DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2762 : RegExpCharacterClass* cc = elm.char_class();
2763 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2764 34795 : CharacterRange::Canonicalize(ranges);
2765 : // Now they are in order so we only need to look at the first.
2766 : int range_count = ranges->length();
2767 34795 : if (cc->is_negated()) {
// A negated class whose first range starts at 0 and reaches at least
// String::kMaxOneByteCharCode excludes every one-byte character.
2768 8434 : if (range_count != 0 &&
2769 4395 : ranges->at(0).from() == 0 &&
2770 : ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2771 : // This will be handled in a later filter.
2772 40 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2773 : continue;
2774 39 : return set_replacement(nullptr);
2775 : }
2776 : } else {
// A non-negated class that is empty or whose first (lowest) range starts
// above String::kMaxOneByteCharCode contains no one-byte character.
2777 30578 : if (range_count == 0 ||
2778 : ranges->at(0).from() > String::kMaxOneByteCharCode) {
2779 : // This will be handled in a later filter.
2780 255 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2781 : continue;
2782 230 : return set_replacement(nullptr);
2783 : }
2784 : }
2785 : }
2786 : }
2787 60406 : return FilterSuccessor(depth - 1);
2788 : }
2789 :
// Filters continue_node_ first and sets the replacement to nullptr if it does
// not survive; otherwise defers to ChoiceNode::FilterOneByte. Returns the
// cached replacement if already calculated, and `this` unchanged when depth is
// negative or the node is currently marked visited.
2790 59856 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2791 59856 : if (info()->replacement_calculated) return replacement();
2792 46200 : if (depth < 0) return this;
2793 46110 : if (info()->visited) return this;
// The braces limit the lifetime of the marker so that visited is false again
// before ChoiceNode::FilterOneByte (which installs its own marker) runs.
2794 : {
2795 : VisitMarker marker(info());
2796 :
2797 24344 : RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2798 : // If we can't continue after the loop then there is no sense in doing the
2799 : // loop.
2800 24344 : if (continue_replacement == nullptr) return set_replacement(nullptr);
2801 : }
2802 :
2803 23996 : return ChoiceNode::FilterOneByte(depth - 1);
2804 : }
2805 :
// Filters every alternative of this choice node.
//  - If any alternative has a non-empty guard list, the node is kept as is.
//  - With fewer than two surviving alternatives, the replacement is the sole
//    survivor, or nullptr when none survived.
//  - With all alternatives surviving, the node is kept.
//  - Otherwise alternatives_ is rebuilt from the survivors only.
// Returns the cached replacement if already calculated, and `this` unchanged
// when depth is negative or the node is currently marked visited.
2806 29821 : RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2807 29821 : if (info()->replacement_calculated) return replacement();
2808 27778 : if (depth < 0) return this;
2809 27683 : if (info()->visited) return this;
2810 : VisitMarker marker(info());
2811 27683 : int choice_count = alternatives_->length();
2812 :
2813 144231 : for (int i = 0; i < choice_count; i++) {
2814 60652 : GuardedAlternative alternative = alternatives_->at(i);
2815 63030 : if (alternative.guards() != nullptr &&
2816 : alternative.guards()->length() != 0) {
2817 2378 : set_replacement(this);
2818 : return this;
2819 : }
2820 : }
2821 :
// First pass: filter each alternative, store the filtered node back and
// count the survivors.
2822 : int surviving = 0;
2823 : RegExpNode* survivor = nullptr;
2824 141149 : for (int i = 0; i < choice_count; i++) {
2825 115844 : GuardedAlternative alternative = alternatives_->at(i);
2826 57922 : RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2827 : DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2828 57922 : if (replacement != nullptr) {
2829 57776 : alternatives_->at(i).set_node(replacement);
2830 57776 : surviving++;
2831 : survivor = replacement;
2832 : }
2833 : }
2834 25371 : if (surviving < 2) return set_replacement(survivor);
2835 :
2836 25239 : set_replacement(this);
2837 25239 : if (surviving == choice_count) {
2838 : return this;
2839 : }
2840 : // Only some of the nodes survived the filtering. We need to rebuild the
2841 : // alternatives list.
// NOTE(review): this second pass calls FilterOneByte on every alternative
// again; presumably it is answered from the replacement computed in the first
// pass — confirm.
2842 : ZoneList<GuardedAlternative>* new_alternatives =
2843 : new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2844 380 : for (int i = 0; i < choice_count; i++) {
2845 : RegExpNode* replacement =
2846 360 : alternatives_->at(i).node()->FilterOneByte(depth - 1);
2847 180 : if (replacement != nullptr) {
2848 130 : alternatives_->at(i).set_node(replacement);
2849 260 : new_alternatives->Add(alternatives_->at(i), zone());
2850 : }
2851 : }
2852 20 : alternatives_ = new_alternatives;
2853 20 : return this;
2854 : }
2855 :
// Filters alternative 1 (the continuation) first: if it does not survive, the
// replacement is nullptr. Then filters alternative 0 (the negative
// lookaround): if that does not survive, the replacement is the filtered
// continuation alone; otherwise both alternatives are updated and the node is
// kept. Returns the cached replacement if already calculated, and `this`
// unchanged when depth is negative or the node is currently marked visited.
2856 357 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2857 357 : if (info()->replacement_calculated) return replacement();
2858 357 : if (depth < 0) return this;
2859 357 : if (info()->visited) return this;
2860 : VisitMarker marker(info());
2861 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2862 : // afterwards.
2863 357 : RegExpNode* node = alternatives_->at(1).node();
2864 357 : RegExpNode* replacement = node->FilterOneByte(depth - 1);
2865 362 : if (replacement == nullptr) return set_replacement(nullptr);
2866 352 : alternatives_->at(1).set_node(replacement);
2867 :
2868 352 : RegExpNode* neg_node = alternatives_->at(0).node();
2869 352 : RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2870 : // If the negative lookahead is always going to fail then
2871 : // we don't need to check it.
2872 357 : if (neg_replacement == nullptr) return set_replacement(replacement);
2873 347 : alternatives_->at(0).set_node(neg_replacement);
2874 694 : return set_replacement(this);
2875 : }
2876 :
2877 :
// Leaves `details` untouched when the loop body can be zero length or when
// this node is currently marked visited; otherwise forwards all arguments to
// ChoiceNode::GetQuickCheckDetails with the node marked visited for the
// duration of the call.
2878 14928 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2879 : RegExpCompiler* compiler,
2880 : int characters_filled_in,
2881 : bool not_at_start) {
2882 14928 : if (body_can_be_zero_length_ || info()->visited) return;
2883 : VisitMarker marker(info());
2884 11757 : return ChoiceNode::GetQuickCheckDetails(details,
2885 : compiler,
2886 : characters_filled_in,
2887 11757 : not_at_start);
2888 : }
2889 :
2890 :
// When the loop body can be zero length or the budget is used up, calls
// bm->SetRest(offset); otherwise forwards to ChoiceNode::FillInBMInfo with the
// budget reduced by one. SaveBMInfo is called on both paths.
2891 5136 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2892 : BoyerMooreLookahead* bm, bool not_at_start) {
2893 5136 : if (body_can_be_zero_length_ || budget <= 0) {
2894 : bm->SetRest(offset);
2895 : SaveBMInfo(bm, not_at_start, offset);
2896 : return;
2897 : }
2898 4919 : ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2899 : SaveBMInfo(bm, not_at_start, offset);
2900 : }
2901 :
2902 :
// Computes `details` from the first alternative, then computes a separate
// QuickCheckDetails for each further alternative and merges it into `details`
// from position characters_filled_in onwards. The not_at_start argument is
// OR-ed with the member not_at_start_ before being passed on.
2903 38784 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2904 : RegExpCompiler* compiler,
2905 : int characters_filled_in,
2906 : bool not_at_start) {
2907 38784 : not_at_start = (not_at_start || not_at_start_);
2908 38784 : int choice_count = alternatives_->length();
2909 : DCHECK_LT(0, choice_count);
2910 38784 : alternatives_->at(0).node()->GetQuickCheckDetails(details,
2911 : compiler,
2912 : characters_filled_in,
2913 77568 : not_at_start);
2914 353764 : for (int i = 1; i < choice_count; i++) {
2915 : QuickCheckDetails new_details(details->characters());
2916 157490 : RegExpNode* node = alternatives_->at(i).node();
2917 : node->GetQuickCheckDetails(&new_details, compiler,
2918 : characters_filled_in,
2919 157490 : not_at_start);
2920 : // Here we merge the quick match details of the two branches.
2921 157490 : details->Merge(&new_details, characters_filled_in);
2922 : }
2923 38784 : }
2924 :
2925 :
2926 : // Check for [0-9A-Z_a-z].
// Emits code that dispatches on the current character to the `word` or
// `non_word` label. fall_through_on_word selects which class ('w' or 'W') and
// which label are handed to CheckSpecialCharacterClass, and whether the final
// '_' test jumps to `non_word` (true) or to `word` (false). If
// CheckSpecialCharacterClass returns false, explicit comparisons are emitted
// instead.
2927 557 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
2928 : Label* word,
2929 : Label* non_word,
2930 : bool fall_through_on_word) {
2931 557 : if (assembler->CheckSpecialCharacterClass(
2932 : fall_through_on_word ? 'w' : 'W',
2933 557 : fall_through_on_word ? non_word : word)) {
2934 : // Optimized implementation available.
2935 : return;
2936 : }
// Fallback: compare against the boundaries of '0'-'9', 'A'-'Z' and 'a'-'z',
// then test '_' separately.
2937 99 : assembler->CheckCharacterGT('z', non_word);
2938 99 : assembler->CheckCharacterLT('0', non_word);
2939 99 : assembler->CheckCharacterGT('a' - 1, word);
2940 99 : assembler->CheckCharacterLT('9' + 1, word);
2941 99 : assembler->CheckCharacterLT('A', non_word);
2942 99 : assembler->CheckCharacterLT('Z' + 1, word);
2943 99 : if (fall_through_on_word) {
2944 34 : assembler->CheckNotCharacter('_', non_word);
2945 : } else {
2946 65 : assembler->CheckCharacter('_', word);
2947 : }
2948 : }
2949 :
2950 :
2951 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2952 : // that matches newline or the start of input).
// On failure the emitted code goes to the trace's backtrack label; on success
// on_success is emitted with a copy of `trace` whose current character has
// been invalidated.
2953 129 : static void EmitHat(RegExpCompiler* compiler,
2954 : RegExpNode* on_success,
2955 : Trace* trace) {
2956 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2957 : // We will be loading the previous character into the current character
2958 : // register.
2959 129 : Trace new_trace(*trace);
2960 : new_trace.InvalidateCurrentCharacter();
2961 :
2962 129 : Label ok;
2963 129 : if (new_trace.cp_offset() == 0) {
2964 : // The start of input counts as a newline in this context, so skip to
2965 : // ok if we are at the start.
2966 119 : assembler->CheckAtStart(&ok);
2967 : }
2968 : // We already checked that we are not at the start of input so it must be
2969 : // OK to load the previous character.
2970 129 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2971 : new_trace.backtrack(),
2972 258 : false);
2973 129 : if (!assembler->CheckSpecialCharacterClass('n',
2974 129 : new_trace.backtrack())) {
2975 : // Newline means \n, \r, 0x2028 or 0x2029.
// Presumably the 0xFFFE mask lets one comparison cover both 0x2028 and
// 0x2029 (0x2029 & 0xFFFE == 0x2028); it is skipped in one-byte mode.
2976 24 : if (!compiler->one_byte()) {
2977 2 : assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2978 : }
2979 24 : assembler->CheckCharacter('\n', &ok);
2980 24 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2981 : }
2982 129 : assembler->Bind(&ok);
2983 129 : on_success->Emit(compiler, &new_trace);
2984 129 : }
2985 :
2986 :
2987 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
// First tries to decide at compile time whether the next character is a word
// character, using the saved lookahead from bm_info(not_at_start) or, if there
// is none and the node eats at least one character, a freshly filled
// BoyerMooreLookahead. If that stays UNKNOWN, a runtime word check on the next
// character is emitted. In every case BacktrackIfPrevious is then used to emit
// the test on the previous character.
2988 255 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2989 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2990 : Isolate* isolate = assembler->isolate();
2991 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2992 255 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2993 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2994 255 : if (lookahead == nullptr) {
2995 : int eats_at_least =
2996 202 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2997 : kRecursionBudget,
2998 202 : not_at_start));
2999 202 : if (eats_at_least >= 1) {
3000 : BoyerMooreLookahead* bm =
3001 97 : new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3002 97 : FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3003 97 : if (bm->at(0)->is_non_word())
3004 : next_is_word_character = Trace::FALSE_VALUE;
3005 97 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3006 : }
3007 : } else {
3008 53 : if (lookahead->at(0)->is_non_word())
3009 : next_is_word_character = Trace::FALSE_VALUE;
3010 53 : if (lookahead->at(0)->is_word())
3011 : next_is_word_character = Trace::TRUE_VALUE;
3012 : }
3013 255 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3014 255 : if (next_is_word_character == Trace::UNKNOWN) {
3015 151 : Label before_non_word;
3016 151 : Label before_word;
// before_non_word doubles as the failure label of the character load.
3017 151 : if (trace->characters_preloaded() != 1) {
3018 150 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3019 : }
3020 : // Fall through on non-word.
3021 151 : EmitWordCheck(assembler, &before_word, &before_non_word, false);
3022 : // Next character is not a word character.
3023 151 : assembler->Bind(&before_non_word);
3024 151 : Label ok;
3025 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3026 151 : assembler->GoTo(&ok);
3027 :
// Next character is a word character.
3028 151 : assembler->Bind(&before_word);
3029 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3030 151 : assembler->Bind(&ok);
3031 104 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3032 79 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3033 : } else {
3034 : DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3035 25 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3036 : }
3037 255 : }
3038 :
3039 :
// Emits code that loads the character before the current position and
// backtracks when it belongs to the class selected by backtrack_if_previous
// (kIsNonWord or kIsWord). In the other case control reaches fall_through,
// after which on_success() is emitted with a copy of `trace` whose current
// character has been invalidated.
3040 406 : void AssertionNode::BacktrackIfPrevious(
3041 : RegExpCompiler* compiler,
3042 : Trace* trace,
3043 : AssertionNode::IfPrevious backtrack_if_previous) {
3044 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3045 406 : Trace new_trace(*trace);
3046 : new_trace.InvalidateCurrentCharacter();
3047 :
3048 406 : Label fall_through, dummy;
3049 :
// Exactly one of the two labels is the backtrack label; the other is
// fall_through.
3050 : Label* non_word = backtrack_if_previous == kIsNonWord ?
3051 : new_trace.backtrack() :
3052 406 : &fall_through;
3053 : Label* word = backtrack_if_previous == kIsNonWord ?
3054 : &fall_through :
3055 406 : new_trace.backtrack();
3056 :
3057 406 : if (new_trace.cp_offset() == 0) {
3058 : // The start of input counts as a non-word character, so the question is
3059 : // decided if we are at the start.
3060 169 : assembler->CheckAtStart(non_word);
3061 : }
3062 : // We already checked that we are not at the start of input so it must be
3063 : // OK to load the previous character.
// dummy is never bound in this function; presumably the `false` argument
// disables the bounds check so the label is never jumped to — confirm.
3064 406 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3065 406 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3066 :
3067 406 : assembler->Bind(&fall_through);
3068 406 : on_success()->Emit(compiler, &new_trace);
3069 406 : }
3070 :
3071 :
// An AT_START assertion combined with not_at_start marks `details` as
// cannot_match. In every other case the arguments are forwarded unchanged to
// on_success().
3072 1935 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3073 : RegExpCompiler* compiler,
3074 : int filled_in,
3075 : bool not_at_start) {
3076 1935 : if (assertion_type_ == AT_START && not_at_start) {
3077 : details->set_cannot_match();
3078 : return;
3079 : }
3080 1604 : return on_success()->GetQuickCheckDetails(details,
3081 : compiler,
3082 : filled_in,
3083 3208 : not_at_start);
3084 : }
3085 :
3086 :
// Emits code for this assertion, dispatching on assertion_type_. The
// AFTER_NEWLINE, AT_BOUNDARY and AT_NON_BOUNDARY cases and two of the AT_START
// sub-cases emit the successor themselves (or backtrack) and return; the
// remaining cases fall out of the switch to the on_success()->Emit call at the
// end.
3087 5769 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3088 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3089 5769 : switch (assertion_type_) {
3090 : case AT_END: {
// Jump to ok when CheckPosition succeeds for the current offset; otherwise
// backtrack.
3091 2333 : Label ok;
3092 2333 : assembler->CheckPosition(trace->cp_offset(), &ok);
3093 2333 : assembler->GoTo(trace->backtrack());
3094 2333 : assembler->Bind(&ok);
3095 : break;
3096 : }
3097 : case AT_START: {
// Known not to be at the start: the assertion always fails.
3098 3052 : if (trace->at_start() == Trace::FALSE_VALUE) {
3099 9 : assembler->GoTo(trace->backtrack());
3100 9 : return;
3101 : }
// Unknown: emit a runtime check and continue with a trace that records
// at_start as TRUE_VALUE.
3102 3043 : if (trace->at_start() == Trace::UNKNOWN) {
3103 3043 : assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3104 3043 : Trace at_start_trace = *trace;
3105 : at_start_trace.set_at_start(Trace::TRUE_VALUE);
3106 3043 : on_success()->Emit(compiler, &at_start_trace);
3107 : return;
3108 : }
3109 : }
// Reached when at_start() is neither FALSE_VALUE nor UNKNOWN: no check is
// emitted.
3110 : break;
3111 : case AFTER_NEWLINE:
3112 129 : EmitHat(compiler, on_success(), trace);
3113 129 : return;
3114 : case AT_BOUNDARY:
3115 : case AT_NON_BOUNDARY: {
3116 255 : EmitBoundaryCheck(compiler, trace);
3117 255 : return;
3118 : }
3119 : }
3120 2333 : on_success()->Emit(compiler, trace);
3121 : }
3122 :
3123 :
// Returns true only when quick_check is non-null, `offset` is below
// quick_check->characters(), and the position at `offset` has
// determines_perfectly set.
3124 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3125 2753026 : if (quick_check == nullptr) return false;
3126 2753026 : if (offset >= quick_check->characters()) return false;
3127 854561 : return quick_check->positions(offset)->determines_perfectly;
3128 : }
3129 :
3130 :
// Raises *checked_up_to to `index` when `index` is larger; otherwise leaves it
// unchanged.
3131 : static void UpdateBoundsCheck(int index, int* checked_up_to) {
3132 802794 : if (index > *checked_up_to) {
3133 416257 : *checked_up_to = index;
3134 : }
3135 : }
3136 :
3137 :
3138 : // We call this repeatedly to generate code for each pass over the text node.
3139 : // The passes are in increasing order of difficulty because we hope one
3140 : // of the first passes will fail in which case we are saved the work of the
3141 : // later passes. For example, for the case independent regexp /%[asdfghjkl]a/
3142 : // we will check the '%' in the first pass, the case independent 'a' in the
3143 : // second pass and the character class in the last pass.
3144 : //
3145 : // The passes are done from right to left, so for example to test for /bar/
3146 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3147 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3148 : // bounds check most of the time. In the example we only need to check for
3149 : // end-of-input when loading the putative 'r'.
3150 : //
3151 : // A slight complication involves the fact that the first character may already
3152 : // be fetched into a register by the previous node. In this case we want to
3153 : // do the test for that character first. We do this in separate passes. The
3154 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3155 : // pass has been performed then subsequent passes will have true in
3156 : // first_element_checked to indicate that that character does not need to be
3157 : // checked again.
3158 : //
3159 : // In addition to all this we are passed a Trace, which can
3160 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3161 : // object we can see details of any quick check that was already passed in
3162 : // order to get to the code we are now generating. The quick check can involve
3163 : // loading characters, which means we do not need to recheck the bounds
3164 : // up to the limit the quick check already checked. In addition the quick
3165 : // check can have involved a mask and compare operation which may simplify
3166 : // or obviate the need for further checks at some character positions.
// Emits the checks belonging to one pass (`pass`) over this text node.
//   preloaded             - when true, only element 0 / character 0 is visited.
//   trace                 - supplies the backtrack label, cp_offset and the
//                           quick check already performed.
//   first_element_checked - when true, the first character (or a character
//                           class at element 0) is skipped.
//   checked_up_to         - in/out: highest offset recorded as bounds-checked.
3167 2709889 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
3168 : TextEmitPassType pass,
3169 : bool preloaded,
3170 : Trace* trace,
3171 : bool first_element_checked,
3172 : int* checked_up_to) {
3173 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3174 : Isolate* isolate = assembler->isolate();
3175 : bool one_byte = compiler->one_byte();
3176 : Label* backtrack = trace->backtrack();
3177 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3178 : int element_count = elements()->length();
// When reading backward, all offsets are shifted back by the node's length.
3179 2709889 : int backward_offset = read_backward() ? -Length() : 0;
// Elements are visited from last to first; in a preloaded pass the loop
// starts at index 0 and therefore visits only the first element.
3180 5679531 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3181 2969667 : TextElement elm = elements()->at(i);
3182 2969667 : int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3183 2969667 : if (elm.text_type() == TextElement::ATOM) {
3184 1813231 : if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3185 : Vector<const uc16> quarks = elm.atom()->data();
3186 4637115 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3187 2592939 : if (first_element_checked && i == 0 && j == 0) continue;
3188 5024544 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3189 : EmitCharacterFunction* emit_function = nullptr;
3190 3410786 : uc16 quark = quarks[j];
3191 1705393 : if (elm.atom()->ignore_case()) {
3192 : // Everywhere else we assume that a non-Latin-1 character cannot match
3193 : // a Latin-1 character. Avoid the cases where this assumption is
3194 : // invalid by using the Latin1 equivalent instead.
3195 : quark = unibrow::Latin1::TryConvertToLatin1(quark);
3196 : }
// Select the emitter for this pass. NON_LATIN1_MATCH emits no check of its
// own: it backtracks unconditionally and stops as soon as a character above
// String::kMaxOneByteCharCode is found.
3197 1705393 : switch (pass) {
3198 : case NON_LATIN1_MATCH:
3199 : DCHECK(one_byte);
3200 518806 : if (quark > String::kMaxOneByteCharCode) {
3201 25 : assembler->GoTo(backtrack);
3202 : return;
3203 : }
3204 : break;
3205 : case NON_LETTER_CHARACTER_MATCH:
3206 : emit_function = &EmitAtomNonLetter;
3207 5619 : break;
3208 : case SIMPLE_CHARACTER_MATCH:
3209 : emit_function = &EmitSimpleCharacter;
3210 584865 : break;
3211 : case CASE_CHARACTER_MATCH:
3212 : emit_function = &EmitAtomLetter;
3213 5619 : break;
3214 : default:
3215 : break;
3216 : }
3217 1705368 : if (emit_function != nullptr) {
// A bounds check is requested when this offset lies beyond what has been
// checked so far, and always when reading backward.
3218 596103 : bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3219 : bool bound_checked =
3220 596103 : emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3221 596103 : bounds_check, preloaded);
3222 596103 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3223 : }
3224 : }
3225 : } else {
3226 : DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
// Character classes are only emitted in the CHARACTER_CLASS_MATCH pass.
3227 1156436 : if (pass == CHARACTER_CLASS_MATCH) {
3228 280098 : if (first_element_checked && i == 0) continue;
3229 240754 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3230 : RegExpCharacterClass* cc = elm.char_class();
3231 212367 : bool bounds_check = *checked_up_to < cp_offset || read_backward();
3232 212367 : EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3233 212367 : bounds_check, preloaded, zone());
3234 : UpdateBoundsCheck(cp_offset, checked_up_to);
3235 : }
3236 : }
3237 : }
3238 : }
3239 :
3240 :
3241 6958219 : int TextNode::Length() {
3242 6958219 : TextElement elm = elements()->last();
3243 : DCHECK_LE(0, elm.cp_offset());
3244 6958219 : return elm.cp_offset() + elm.length();
3245 : }
3246 :
3247 0 : bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3248 1813231 : if (ignore_case) {
3249 44992 : return pass == SIMPLE_CHARACTER_MATCH;
3250 : } else {
3251 1768239 : return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3252 : }
3253 : }
3254 :
3255 7207 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3256 : ZoneList<CharacterRange>* ranges,
3257 : bool read_backward,
3258 : RegExpNode* on_success,
3259 : JSRegExp::Flags flags) {
3260 : DCHECK_NOT_NULL(ranges);
3261 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3262 14414 : elms->Add(TextElement::CharClass(
3263 21621 : new (zone) RegExpCharacterClass(zone, ranges, flags)),
3264 7207 : zone);
3265 7207 : return new (zone) TextNode(elms, read_backward, on_success);
3266 : }
3267 :
3268 28195 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3269 : CharacterRange trail,
3270 : bool read_backward,
3271 : RegExpNode* on_success,
3272 : JSRegExp::Flags flags) {
3273 28195 : ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3274 28195 : ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3275 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3276 56390 : elms->Add(TextElement::CharClass(
3277 84585 : new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3278 28195 : zone);
3279 56390 : elms->Add(TextElement::CharClass(
3280 84585 : new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3281 28195 : zone);
3282 28195 : return new (zone) TextNode(elms, read_backward, on_success);
3283 : }
3284 :
3285 :
3286 : // This generates the code to match a text node. A text node can contain
3287 : // straight character sequences (possibly to be matched in a case-independent
3288 : // way) and character classes. For efficiency we do not do this in a single
3289 : // pass from left to right. Instead we pass over the text node several times,
3290 : // emitting code for some character positions every time. See the comment on
3291 : // TextEmitPass for details.
3292 620127 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3293 620127 : LimitResult limit_result = LimitVersions(compiler, trace);
3294 722032 : if (limit_result == DONE) return;
3295 : DCHECK(limit_result == CONTINUE);
3296 :
3297 518222 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3298 : compiler->SetRegExpTooBig();
3299 : return;
3300 : }
3301 :
3302 518222 : if (compiler->one_byte()) {
3303 323113 : int dummy = 0;
3304 323113 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3305 : }
3306 :
3307 : bool first_elt_done = false;
3308 518222 : int bound_checked_to = trace->cp_offset() - 1;
3309 518222 : bound_checked_to += trace->bound_checked_up_to();
3310 :
3311 : // If a character is preloaded into the current character register then
3312 : // check that now.
3313 518222 : if (trace->characters_preloaded() == 1) {
3314 706248 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3315 313888 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3316 313888 : false, &bound_checked_to);
3317 : }
3318 : first_elt_done = true;
3319 : }
3320 :
3321 4663998 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3322 2072888 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3323 2072888 : first_elt_done, &bound_checked_to);
3324 : }
3325 :
3326 518222 : Trace successor_trace(*trace);
3327 : // If we advance backward, we may end up at the start.
3328 523394 : successor_trace.AdvanceCurrentPositionInTrace(
3329 523394 : read_backward() ? -Length() : Length(), compiler);
3330 518222 : successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3331 : : Trace::FALSE_VALUE);
3332 : RecursionCheck rc(compiler);
3333 518222 : on_success()->Emit(compiler, &successor_trace);
3334 : }
3335 :
3336 :
3337 0 : void Trace::InvalidateCurrentCharacter() {
3338 228489 : characters_preloaded_ = 0;
3339 0 : }
3340 :
3341 :
3342 518222 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3343 : // We don't have an instruction for shifting the current character register
3344 : // down or for using a shifted value for anything so lets just forget that
3345 : // we preloaded any characters into it.
3346 518222 : characters_preloaded_ = 0;
3347 : // Adjust the offsets of the quick check performed information. This
3348 : // information is used to find out what we already determined about the
3349 : // characters by means of mask and compare.
3350 518222 : quick_check_performed_.Advance(by, compiler->one_byte());
3351 518222 : cp_offset_ += by;
3352 518222 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3353 : compiler->SetRegExpTooBig();
3354 0 : cp_offset_ = 0;
3355 : }
3356 1036444 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3357 518222 : }
3358 :
3359 :
3360 319047 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3361 : int element_count = elements()->length();
3362 1082697 : for (int i = 0; i < element_count; i++) {
3363 381825 : TextElement elm = elements()->at(i);
3364 381825 : if (elm.text_type() == TextElement::CHAR_CLASS) {
3365 : RegExpCharacterClass* cc = elm.char_class();
3366 : #ifdef V8_INTL_SUPPORT
3367 : bool case_equivalents_already_added =
3368 : NeedsUnicodeCaseEquivalents(cc->flags());
3369 : #else
3370 : bool case_equivalents_already_added = false;
3371 : #endif
3372 240106 : if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3373 : // None of the standard character classes is different in the case
3374 : // independent case and it slows us down if we don't know that.
3375 68981 : if (cc->is_standard(zone())) continue;
3376 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3377 66900 : CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3378 66900 : is_one_byte);
3379 : }
3380 : }
3381 : }
3382 319047 : }
3383 :
3384 :
3385 135273 : int TextNode::GreedyLoopTextLength() { return Length(); }
3386 :
3387 :
3388 85918 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3389 : RegExpCompiler* compiler) {
3390 85918 : if (read_backward()) return nullptr;
3391 85793 : if (elements()->length() != 1) return nullptr;
3392 85456 : TextElement elm = elements()->at(0);
3393 85456 : if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3394 : RegExpCharacterClass* node = elm.char_class();
3395 : ZoneList<CharacterRange>* ranges = node->ranges(zone());
3396 84101 : CharacterRange::Canonicalize(ranges);
3397 84101 : if (node->is_negated()) {
3398 117 : return ranges->length() == 0 ? on_success() : nullptr;
3399 : }
3400 83984 : if (ranges->length() != 1) return nullptr;
3401 : uint32_t max_char;
3402 83524 : if (compiler->one_byte()) {
3403 : max_char = String::kMaxOneByteCharCode;
3404 : } else {
3405 : max_char = String::kMaxUtf16CodeUnit;
3406 : }
3407 167048 : return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3408 : }
3409 :
3410 :
3411 : // Finds the fixed match length of a sequence of nodes that goes from
3412 : // this alternative and back to this choice node. If there are variable
3413 : // length nodes or other complications in the way then return a sentinel
3414 : // value indicating that a greedy loop cannot be constructed.
3415 225423 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
3416 : GuardedAlternative* alternative) {
3417 : int length = 0;
3418 : RegExpNode* node = alternative->node();
3419 : // Later we will generate code for all these text nodes using recursion
3420 : // so we have to limit the max number.
3421 : int recursion_depth = 0;
3422 495969 : while (node != this) {
3423 337310 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3424 : return kNodeIsTooComplexForGreedyLoops;
3425 : }
3426 337310 : int node_length = node->GreedyLoopTextLength();
3427 337310 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
3428 : return kNodeIsTooComplexForGreedyLoops;
3429 : }
3430 135273 : length += node_length;
3431 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3432 : node = seq_node->on_success();
3433 : }
3434 23386 : return read_backward() ? -length : length;
3435 : }
3436 :
3437 :
3438 0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3439 : DCHECK_NULL(loop_node_);
3440 : AddAlternative(alt);
3441 999457 : loop_node_ = alt.node();
3442 0 : }
3443 :
3444 :
3445 0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3446 : DCHECK_NULL(continue_node_);
3447 : AddAlternative(alt);
3448 999457 : continue_node_ = alt.node();
3449 0 : }
3450 :
3451 :
3452 334347 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3453 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3454 334347 : if (trace->stop_node() == this) {
3455 : // Back edge of greedy optimized loop node graph.
3456 : int text_length =
3457 11693 : GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3458 : DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3459 : // Update the counter-based backtracking info on the stack. This is an
3460 : // optimization for greedy loops (see below).
3461 : DCHECK(trace->cp_offset() == text_length);
3462 11693 : macro_assembler->AdvanceCurrentPosition(text_length);
3463 11693 : macro_assembler->GoTo(trace->loop_label());
3464 11693 : return;
3465 : }
3466 : DCHECK_NULL(trace->stop_node());
3467 322654 : if (!trace->is_trivial()) {
3468 120157 : trace->Flush(compiler, this);
3469 120157 : return;
3470 : }
3471 202497 : ChoiceNode::Emit(compiler, trace);
3472 : }
3473 :
3474 :
3475 213730 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3476 : int eats_at_least) {
3477 : int preload_characters = Min(4, eats_at_least);
3478 : DCHECK_LE(preload_characters, 4);
3479 213730 : if (compiler->macro_assembler()->CanReadUnaligned()) {
3480 : bool one_byte = compiler->one_byte();
3481 134435 : if (one_byte) {
3482 : // We can't preload 3 characters because there is no machine instruction
3483 : // to do that. We can't just load 4 because we could be reading
3484 : // beyond the end of the string, which could cause a memory fault.
3485 106849 : if (preload_characters == 3) preload_characters = 2;
3486 : } else {
3487 27586 : if (preload_characters > 2) preload_characters = 2;
3488 : }
3489 : } else {
3490 79295 : if (preload_characters > 1) preload_characters = 1;
3491 : }
3492 213730 : return preload_characters;
3493 : }
3494 :
3495 :
3496 : // This class is used when generating the alternatives in a choice node. It
3497 : // records the way the alternative is being code generated.
3498 : class AlternativeGeneration: public Malloced {
3499 : public:
3500 : AlternativeGeneration()
3501 : : possible_success(),
3502 : expects_preload(false),
3503 : after(),
3504 2181542 : quick_check_details() { }
3505 : Label possible_success;
3506 : bool expects_preload;
3507 : Label after;
3508 : QuickCheckDetails quick_check_details;
3509 : };
3510 :
3511 :
3512 : // Creates a list of AlternativeGenerations. If the list has a reasonable
3513 : // size then it is on the stack, otherwise the excess is on the heap.
3514 : class AlternativeGenerationList {
3515 : public:
3516 213730 : AlternativeGenerationList(int count, Zone* zone)
3517 2351030 : : alt_gens_(count, zone) {
3518 1361014 : for (int i = 0; i < count && i < kAFew; i++) {
3519 573642 : alt_gens_.Add(a_few_alt_gens_ + i, zone);
3520 : }
3521 302214 : for (int i = kAFew; i < count; i++) {
3522 44242 : alt_gens_.Add(new AlternativeGeneration(), zone);
3523 : }
3524 213730 : }
3525 427460 : ~AlternativeGenerationList() {
3526 302214 : for (int i = kAFew; i < alt_gens_.length(); i++) {
3527 44242 : delete alt_gens_[i];
3528 44242 : alt_gens_[i] = nullptr;
3529 : }
3530 213730 : }
3531 :
3532 : AlternativeGeneration* at(int i) {
3533 1628229 : return alt_gens_[i];
3534 : }
3535 :
3536 : private:
3537 : static const int kAFew = 10;
3538 : ZoneList<AlternativeGeneration*> alt_gens_;
3539 : AlternativeGeneration a_few_alt_gens_[kAFew];
3540 : };
3541 :
3542 :
3543 : static const uc32 kRangeEndMarker = 0x110000;
3544 :
3545 : // The '2' variant is has inclusive from and exclusive to.
3546 : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3547 : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3548 : static const int kSpaceRanges[] = {
3549 : '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3550 : 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3551 : 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3552 : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3553 :
3554 : static const int kWordRanges[] = {
3555 : '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3556 : static const int kWordRangeCount = arraysize(kWordRanges);
3557 : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3558 : static const int kDigitRangeCount = arraysize(kDigitRanges);
3559 : static const int kSurrogateRanges[] = {
3560 : kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3561 : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3562 : static const int kLineTerminatorRanges[] = {
3563 : 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3564 : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3565 :
3566 0 : void BoyerMoorePositionInfo::Set(int character) {
3567 86966 : SetInterval(Interval(character, character));
3568 0 : }
3569 :
3570 :
3571 248173 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3572 496346 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3573 496346 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3574 496346 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3575 : surrogate_ =
3576 496346 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3577 248173 : if (interval.to() - interval.from() >= kMapSize - 1) {
3578 13661 : if (map_count_ != kMapSize) {
3579 6372 : map_count_ = kMapSize;
3580 1637604 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3581 : }
3582 : return;
3583 : }
3584 1235656 : for (int i = interval.from(); i <= interval.to(); i++) {
3585 544913 : int mod_character = (i & kMask);
3586 1089826 : if (!map_->at(mod_character)) {
3587 374775 : map_count_++;
3588 374775 : map_->at(mod_character) = true;
3589 : }
3590 544913 : if (map_count_ == kMapSize) return;
3591 : }
3592 : }
3593 :
3594 :
3595 0 : void BoyerMoorePositionInfo::SetAll() {
3596 5507 : s_ = w_ = d_ = kLatticeUnknown;
3597 5507 : if (map_count_ != kMapSize) {
3598 5077 : map_count_ = kMapSize;
3599 1304789 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3600 : }
3601 0 : }
3602 :
3603 :
3604 80788 : BoyerMooreLookahead::BoyerMooreLookahead(
3605 : int length, RegExpCompiler* compiler, Zone* zone)
3606 : : length_(length),
3607 80788 : compiler_(compiler) {
3608 80788 : if (compiler->one_byte()) {
3609 10143 : max_char_ = String::kMaxOneByteCharCode;
3610 : } else {
3611 70645 : max_char_ = String::kMaxUtf16CodeUnit;
3612 : }
3613 80788 : bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3614 280168 : for (int i = 0; i < length; i++) {
3615 99690 : bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3616 : }
3617 80788 : }
3618 :
3619 :
3620 : // Find the longest range of lookahead that has the fewest number of different
3621 : // characters that can occur at a given position. Since we are optimizing two
3622 : // different parameters at once this is a tradeoff.
3623 0 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3624 : int biggest_points = 0;
3625 : // If more than 32 characters out of 128 can occur it is unlikely that we can
3626 : // be lucky enough to step forwards much of the time.
3627 : const int kMaxMax = 32;
3628 242073 : for (int max_number_of_chars = 4;
3629 322764 : max_number_of_chars < kMaxMax;
3630 : max_number_of_chars *= 2) {
3631 : biggest_points =
3632 242073 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
3633 : }
3634 80691 : if (biggest_points == 0) return false;
3635 0 : return true;
3636 : }
3637 :
3638 :
3639 : // Find the highest-points range between 0 and length_ where the character
3640 : // information is not too vague. 'Too vague' means that there are more than
3641 : // max_number_of_chars that can occur at this position. Calculates the number
3642 : // of points as the product of width-of-the-range and
3643 : // probability-of-finding-one-of-the-characters, where the probability is
3644 : // calculated using the frequency distribution of the sample subject string.
3645 242073 : int BoyerMooreLookahead::FindBestInterval(
3646 : int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3647 : int biggest_points = old_biggest_points;
3648 : static const int kSize = RegExpMacroAssembler::kTableSize;
3649 700377 : for (int i = 0; i < length_; ) {
3650 569318 : while (i < length_ && Count(i) > max_number_of_chars) i++;
3651 256944 : if (i == length_) break;
3652 : int remembered_from = i;
3653 : bool union_map[kSize];
3654 29560608 : for (int j = 0; j < kSize; j++) union_map[j] = false;
3655 1014789 : while (i < length_ && Count(i) <= max_number_of_chars) {
3656 513844 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3657 66028954 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3658 256922 : i++;
3659 : }
3660 : int frequency = 0;
3661 58892064 : for (int j = 0; j < kSize; j++) {
3662 29331456 : if (union_map[j]) {
3663 : // Add 1 to the frequency to give a small per-character boost for
3664 : // the cases where our sampling is not good enough and many
3665 : // characters have a frequency of zero. This means the frequency
3666 : // can theoretically be up to 2*kSize though we treat it mostly as
3667 : // a fraction of kSize.
3668 980164 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3669 : }
3670 : }
3671 : // We use the probability of skipping times the distance we are skipping to
3672 : // judge the effectiveness of this. Actually we have a cut-off: By
3673 : // dividing by 2 we switch off the skipping if the probability of skipping
3674 : // is less than 50%. This is because the multibyte mask-and-compare
3675 : // skipping in quickcheck is more likely to do well on this case.
3676 : bool in_quickcheck_range =
3677 231971 : ((i - remembered_from < 4) ||
3678 2819 : (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3679 : // Called 'probability' but it is only a rough estimate and can actually
3680 : // be outside the 0-kSize range.
3681 229152 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3682 229152 : int points = (i - remembered_from) * probability;
3683 229152 : if (points > biggest_points) {
3684 5878 : *from = remembered_from;
3685 5878 : *to = i - 1;
3686 : biggest_points = points;
3687 : }
3688 : }
3689 242073 : return biggest_points;
3690 : }
3691 :
3692 :
3693 : // Take all the characters that will not prevent a successful match if they
3694 : // occur in the subject string in the range between min_lookahead and
3695 : // max_lookahead (inclusive) measured from the current position. If the
3696 : // character at max_lookahead offset is not one of these characters, then we
3697 : // can safely skip forwards by the number of characters in the range.
3698 4467 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3699 : int max_lookahead,
3700 : Handle<ByteArray> boolean_skip_table) {
3701 : const int kSize = RegExpMacroAssembler::kTableSize;
3702 :
3703 : const int kSkipArrayEntry = 0;
3704 : const int kDontSkipArrayEntry = 1;
3705 :
3706 1148019 : for (int i = 0; i < kSize; i++) {
3707 : boolean_skip_table->set(i, kSkipArrayEntry);
3708 : }
3709 4467 : int skip = max_lookahead + 1 - min_lookahead;
3710 :
3711 23747 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3712 19280 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3713 2477480 : for (int j = 0; j < kSize; j++) {
3714 1233920 : if (map->at(j)) {
3715 : boolean_skip_table->set(j, kDontSkipArrayEntry);
3716 : }
3717 : }
3718 : }
3719 :
3720 4467 : return skip;
3721 : }
3722 :
3723 :
3724 : // See comment above on the implementation of GetSkipTable.
3725 80691 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3726 : const int kSize = RegExpMacroAssembler::kTableSize;
3727 :
3728 80691 : int min_lookahead = 0;
3729 80691 : int max_lookahead = 0;
3730 :
3731 156915 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3732 :
3733 : bool found_single_character = false;
3734 : int single_character = 0;
3735 9860 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3736 17684 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3737 17684 : if (map->map_count() > 1 ||
3738 3201 : (found_single_character && map->map_count() != 0)) {
3739 : found_single_character = false;
3740 : break;
3741 : }
3742 808195 : for (int j = 0; j < kSize; j++) {
3743 406209 : if (map->at(j)) {
3744 : found_single_character = true;
3745 : single_character = j;
3746 : break;
3747 : }
3748 : }
3749 : }
3750 :
3751 5485 : int lookahead_width = max_lookahead + 1 - min_lookahead;
3752 :
3753 5485 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3754 : // The mask-compare can probably handle this better.
3755 : return;
3756 : }
3757 :
3758 4563 : if (found_single_character) {
3759 96 : Label cont, again;
3760 96 : masm->Bind(&again);
3761 96 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3762 96 : if (max_char_ > kSize) {
3763 96 : masm->CheckCharacterAfterAnd(single_character,
3764 : RegExpMacroAssembler::kTableMask,
3765 192 : &cont);
3766 : } else {
3767 0 : masm->CheckCharacter(single_character, &cont);
3768 : }
3769 96 : masm->AdvanceCurrentPosition(lookahead_width);
3770 96 : masm->GoTo(&again);
3771 96 : masm->Bind(&cont);
3772 : return;
3773 : }
3774 :
3775 : Factory* factory = masm->isolate()->factory();
3776 : Handle<ByteArray> boolean_skip_table =
3777 4467 : factory->NewByteArray(kSize, AllocationType::kOld);
3778 4467 : int skip_distance = GetSkipTable(
3779 4467 : min_lookahead, max_lookahead, boolean_skip_table);
3780 : DCHECK_NE(0, skip_distance);
3781 :
3782 4467 : Label cont, again;
3783 4467 : masm->Bind(&again);
3784 4467 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3785 4467 : masm->CheckBitInTable(boolean_skip_table, &cont);
3786 4467 : masm->AdvanceCurrentPosition(skip_distance);
3787 4467 : masm->GoTo(&again);
3788 4467 : masm->Bind(&cont);
3789 : }
3790 :
3791 :
3792 : /* Code generation for choice nodes.
3793 : *
3794 : * We generate quick checks that do a mask and compare to eliminate a
3795 : * choice. If the quick check succeeds then it jumps to the continuation to
3796 : * do slow checks and check subsequent nodes. If it fails (the common case)
3797 : * it falls through to the next choice.
3798 : *
3799 : * Here is the desired flow graph. Nodes directly below each other imply
3800 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3801 : * 3 doesn't have a quick check so we have to call the slow check.
3802 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3803 : * regexp continuation is generated directly after the Sn node, up to the
3804 : * next GoTo if we decide to reuse some already generated code. Some
3805 : * nodes expect preload_characters to be preloaded into the current
3806 : * character register. R nodes do this preloading. Vertices are marked
3807 : * F for failures and S for success (possible success in the case of quick
3808 : * nodes). L, V, < and > are used as arrow heads.
3809 : *
3810 : * ----------> R
3811 : * |
3812 : * V
3813 : * Q1 -----> S1
3814 : * | S /
3815 : * F| /
3816 : * | F/
3817 : * | /
3818 : * | R
3819 : * | /
3820 : * V L
3821 : * Q2 -----> S2
3822 : * | S /
3823 : * F| /
3824 : * | F/
3825 : * | /
3826 : * | R
3827 : * | /
3828 : * V L
3829 : * S3
3830 : * |
3831 : * F|
3832 : * |
3833 : * R
3834 : * |
3835 : * backtrack V
3836 : * <----------Q4
3837 : * \ F |
3838 : * \ |S
3839 : * \ F V
3840 : * \-----S4
3841 : *
3842 : * For greedy loops we push the current position, then generate the code that
3843 : * eats the input specially in EmitGreedyLoop. The other choice (the
3844 : * continuation) is generated by the normal code in EmitChoices, and steps back
3845 : * in the input to the starting position when it fails to match. The loop code
3846 : * looks like this (U is the unwind code that steps back in the greedy loop).
3847 : *
3848 : * _____
3849 : * / \
3850 : * V |
3851 : * ----------> S1 |
3852 : * /| |
3853 : * / |S |
3854 : * F/ \_____/
3855 : * /
3856 : * |<-----
3857 : * | \
3858 : * V |S
3859 : * Q2 ---> U----->backtrack
3860 : * | F /
3861 : * S| /
3862 : * V F /
3863 : * S2--/
3864 : */
3865 :
3866 213730 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
3867 213730 : counter_backtrack_trace_.set_backtrack(&label_);
3868 213730 : if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3869 213730 : }
3870 :
3871 :
3872 0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3873 : #ifdef DEBUG
3874 : int choice_count = alternatives_->length();
3875 : for (int i = 0; i < choice_count - 1; i++) {
3876 : GuardedAlternative alternative = alternatives_->at(i);
3877 : ZoneList<Guard*>* guards = alternative.guards();
3878 : int guard_count = (guards == nullptr) ? 0 : guards->length();
3879 : for (int j = 0; j < guard_count; j++) {
3880 : DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3881 : }
3882 : }
3883 : #endif
3884 0 : }
3885 :
3886 :
3887 213730 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3888 : Trace* current_trace,
3889 : PreloadState* state) {
3890 213730 : if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3891 : // Save some time by looking at most one machine word ahead.
3892 : state->eats_at_least_ =
3893 131516 : EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3894 263032 : current_trace->at_start() == Trace::FALSE_VALUE);
3895 : }
3896 : state->preload_characters_ =
3897 213730 : CalculatePreloadCharacters(compiler, state->eats_at_least_);
3898 :
3899 : state->preload_is_current_ =
3900 213730 : (current_trace->characters_preloaded() == state->preload_characters_);
3901 213730 : state->preload_has_checked_bounds_ = state->preload_is_current_;
3902 213730 : }
3903 :
3904 :
3905 584273 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3906 584273 : int choice_count = alternatives_->length();
3907 :
3908 584273 : if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3909 1358 : alternatives_->at(0).node()->Emit(compiler, trace);
3910 1358 : return;
3911 : }
3912 :
3913 : AssertGuardsMentionRegisters(trace);
3914 :
3915 582915 : LimitResult limit_result = LimitVersions(compiler, trace);
3916 582915 : if (limit_result == DONE) return;
3917 : DCHECK(limit_result == CONTINUE);
3918 :
3919 : // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3920 : // other choice nodes we only flush if we are out of code size budget.
3921 215190 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3922 1460 : trace->Flush(compiler, this);
3923 1460 : return;
3924 : }
3925 :
3926 : RecursionCheck rc(compiler);
3927 :
3928 : PreloadState preload;
3929 : preload.init();
3930 213730 : GreedyLoopState greedy_loop_state(not_at_start());
3931 :
3932 213730 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3933 427460 : AlternativeGenerationList alt_gens(choice_count, zone());
3934 :
3935 213730 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3936 : trace = EmitGreedyLoop(compiler,
3937 : trace,
3938 : &alt_gens,
3939 : &preload,
3940 : &greedy_loop_state,
3941 11693 : text_length);
3942 : } else {
3943 : // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3944 : // match the traces produced pre-cleanup.
3945 202037 : Label second_choice;
3946 202037 : compiler->macro_assembler()->Bind(&second_choice);
3947 :
3948 202037 : preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3949 :
3950 : EmitChoices(compiler,
3951 : &alt_gens,
3952 : 0,
3953 : trace,
3954 202037 : &preload);
3955 : }
3956 :
3957 : // At this point we need to generate slow checks for the alternatives where
3958 : // the quick check was inlined. We can recognize these because the associated
3959 : // label was bound.
3960 213730 : int new_flush_budget = trace->flush_budget() / choice_count;
3961 1449498 : for (int i = 0; i < choice_count; i++) {
3962 : AlternativeGeneration* alt_gen = alt_gens.at(i);
3963 617884 : Trace new_trace(*trace);
3964 : // If there are actions to be flushed we have to limit how many times
3965 : // they are flushed. Take the budget of the parent trace and distribute
3966 : // it fairly amongst the children.
3967 617884 : if (new_trace.actions() != nullptr) {
3968 : new_trace.set_flush_budget(new_flush_budget);
3969 : }
3970 : bool next_expects_preload =
3971 1022038 : i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3972 617884 : EmitOutOfLineContinuation(compiler,
3973 : &new_trace,
3974 617884 : alternatives_->at(i),
3975 : alt_gen,
3976 : preload.preload_characters_,
3977 617884 : next_expects_preload);
3978 : }
3979 : }
3980 :
3981 :
3982 11693 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3983 : Trace* trace,
3984 : AlternativeGenerationList* alt_gens,
3985 : PreloadState* preload,
3986 : GreedyLoopState* greedy_loop_state,
3987 : int text_length) {
3988 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3989 : // Here we have special handling for greedy loops containing only text nodes
3990 : // and other simple nodes. These are handled by pushing the current
3991 : // position on the stack and then incrementing the current position each
3992 : // time around the switch. On backtrack we decrement the current position
3993 : // and check it against the pushed value. This avoids pushing backtrack
3994 : // information for each iteration of the loop, which could take up a lot of
3995 : // space.
3996 : DCHECK(trace->stop_node() == nullptr);
3997 11693 : macro_assembler->PushCurrentPosition();
3998 11693 : Label greedy_match_failed;
3999 : Trace greedy_match_trace;
4000 11693 : if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4001 : greedy_match_trace.set_backtrack(&greedy_match_failed);
4002 11693 : Label loop_label;
4003 11693 : macro_assembler->Bind(&loop_label);
4004 11693 : greedy_match_trace.set_stop_node(this);
4005 : greedy_match_trace.set_loop_label(&loop_label);
4006 11693 : alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4007 11693 : macro_assembler->Bind(&greedy_match_failed);
4008 :
4009 11693 : Label second_choice; // For use in greedy matches.
4010 11693 : macro_assembler->Bind(&second_choice);
4011 :
4012 : Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4013 :
4014 : EmitChoices(compiler,
4015 : alt_gens,
4016 : 1,
4017 : new_trace,
4018 11693 : preload);
4019 :
4020 23386 : macro_assembler->Bind(greedy_loop_state->label());
4021 : // If we have unwound to the bottom then backtrack.
4022 11693 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4023 : // Otherwise try the second priority at an earlier position.
4024 11693 : macro_assembler->AdvanceCurrentPosition(-text_length);
4025 11693 : macro_assembler->GoTo(&second_choice);
4026 11693 : return new_trace;
4027 : }
4028 :
4029 202037 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4030 : Trace* trace) {
4031 : int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4032 404074 : if (alternatives_->length() != 2) return eats_at_least;
4033 :
4034 165896 : GuardedAlternative alt1 = alternatives_->at(1);
4035 167316 : if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4036 : return eats_at_least;
4037 : }
4038 : RegExpNode* eats_anything_node = alt1.node();
4039 164476 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4040 : return eats_at_least;
4041 : }
4042 :
4043 : // Really we should be creating a new trace when we execute this function,
4044 : // but there is no need, because the code it generates cannot backtrack, and
4045 : // we always arrive here with a trivial trace (since it's the entry to a
4046 : // loop. That also implies that there are no preloaded characters, which is
4047 : // good, because it means we won't be violating any assumptions by
4048 : // overwriting those characters with new load instructions.
4049 : DCHECK(trace->is_trivial());
4050 :
4051 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4052 : Isolate* isolate = macro_assembler->isolate();
4053 : // At this point we know that we are at a non-greedy loop that will eat
4054 : // any character one at a time. Any non-anchored regexp has such a
4055 : // loop prepended to it in order to find where it starts. We look for
4056 : // a pattern of the form ...abc... where we can look 6 characters ahead
4057 : // and step forwards 3 if the character is not one of abc. Abc need
4058 : // not be atoms, they can be any reasonably limited character class or
4059 : // small alternation.
4060 : BoyerMooreLookahead* bm = bm_info(false);
4061 82214 : if (bm == nullptr) {
4062 82214 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4063 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4064 : kRecursionBudget,
4065 82214 : false));
4066 82214 : if (eats_at_least >= 1) {
4067 : bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4068 : compiler,
4069 80691 : zone());
4070 80691 : GuardedAlternative alt0 = alternatives_->at(0);
4071 80691 : alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4072 : }
4073 : }
4074 82214 : if (bm != nullptr) {
4075 80691 : bm->EmitSkipInstructions(macro_assembler);
4076 : }
4077 : return eats_at_least;
4078 : }
4079 :
4080 :
4081 213730 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4082 : AlternativeGenerationList* alt_gens,
4083 : int first_choice,
4084 : Trace* trace,
4085 : PreloadState* preload) {
4086 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4087 213730 : SetUpPreLoad(compiler, trace, preload);
4088 :
4089 : // For now we just call all choices one after the other. The idea ultimately
4090 : // is to use the Dispatch table to try only the relevant ones.
4091 213730 : int choice_count = alternatives_->length();
4092 :
4093 213730 : int new_flush_budget = trace->flush_budget() / choice_count;
4094 :
4095 1426112 : for (int i = first_choice; i < choice_count; i++) {
4096 606191 : bool is_last = i == choice_count - 1;
4097 606191 : bool fall_through_on_failure = !is_last;
4098 1212382 : GuardedAlternative alternative = alternatives_->at(i);
4099 : AlternativeGeneration* alt_gen = alt_gens->at(i);
4100 606191 : alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4101 : ZoneList<Guard*>* guards = alternative.guards();
4102 606191 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4103 606191 : Trace new_trace(*trace);
4104 606191 : new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4105 : preload->preload_characters_ :
4106 : 0);
4107 606191 : if (preload->preload_has_checked_bounds_) {
4108 400688 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4109 : }
4110 : new_trace.quick_check_performed()->Clear();
4111 606191 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4112 606191 : if (!is_last) {
4113 392461 : new_trace.set_backtrack(&alt_gen->after);
4114 : }
4115 606191 : alt_gen->expects_preload = preload->preload_is_current_;
4116 : bool generate_full_check_inline = false;
4117 1085771 : if (compiler->optimize() &&
4118 1082969 : try_to_emit_quick_check_for_alternative(i == 0) &&
4119 953556 : alternative.node()->EmitQuickCheck(
4120 476778 : compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4121 : &alt_gen->possible_success, &alt_gen->quick_check_details,
4122 : fall_through_on_failure)) {
4123 : // Quick check was generated for this choice.
4124 226926 : preload->preload_is_current_ = true;
4125 226926 : preload->preload_has_checked_bounds_ = true;
4126 : // If we generated the quick check to fall through on possible success,
4127 : // we now need to generate the full check inline.
4128 226926 : if (!fall_through_on_failure) {
4129 34428 : macro_assembler->Bind(&alt_gen->possible_success);
4130 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4131 34428 : new_trace.set_characters_preloaded(preload->preload_characters_);
4132 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4133 : generate_full_check_inline = true;
4134 : }
4135 379265 : } else if (alt_gen->quick_check_details.cannot_match()) {
4136 110 : if (!fall_through_on_failure) {
4137 22 : macro_assembler->GoTo(trace->backtrack());
4138 : }
4139 110 : continue;
4140 : } else {
4141 : // No quick check was generated. Put the full code here.
4142 : // If this is not the first choice then there could be slow checks from
4143 : // previous cases that go here when they fail. There's no reason to
4144 : // insist that they preload characters since the slow check we are about
4145 : // to generate probably can't use it.
4146 379155 : if (i != first_choice) {
4147 227954 : alt_gen->expects_preload = false;
4148 : new_trace.InvalidateCurrentCharacter();
4149 : }
4150 : generate_full_check_inline = true;
4151 : }
4152 606081 : if (generate_full_check_inline) {
4153 413583 : if (new_trace.actions() != nullptr) {
4154 : new_trace.set_flush_budget(new_flush_budget);
4155 : }
4156 418535 : for (int j = 0; j < guard_count; j++) {
4157 2476 : GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4158 : }
4159 413583 : alternative.node()->Emit(compiler, &new_trace);
4160 413583 : preload->preload_is_current_ = false;
4161 : }
4162 606081 : macro_assembler->Bind(&alt_gen->after);
4163 : }
4164 213730 : }
4165 :
4166 :
4167 617884 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4168 : Trace* trace,
4169 : GuardedAlternative alternative,
4170 : AlternativeGeneration* alt_gen,
4171 : int preload_characters,
4172 : bool next_expects_preload) {
4173 1043270 : if (!alt_gen->possible_success.is_linked()) return;
4174 :
4175 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4176 192498 : macro_assembler->Bind(&alt_gen->possible_success);
4177 192498 : Trace out_of_line_trace(*trace);
4178 : out_of_line_trace.set_characters_preloaded(preload_characters);
4179 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4180 192498 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4181 : ZoneList<Guard*>* guards = alternative.guards();
4182 192498 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4183 192498 : if (next_expects_preload) {
4184 162871 : Label reload_current_char;
4185 : out_of_line_trace.set_backtrack(&reload_current_char);
4186 165651 : for (int j = 0; j < guard_count; j++) {
4187 1390 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4188 : }
4189 162871 : alternative.node()->Emit(compiler, &out_of_line_trace);
4190 162871 : macro_assembler->Bind(&reload_current_char);
4191 : // Reload the current character, since the next quick check expects that.
4192 : // We don't need to check bounds here because we only get into this
4193 : // code through a quick check which already did the checked load.
4194 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4195 162871 : preload_characters);
4196 162871 : macro_assembler->GoTo(&(alt_gen->after));
4197 : } else {
4198 29627 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4199 29779 : for (int j = 0; j < guard_count; j++) {
4200 76 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4201 : }
4202 29627 : alternative.node()->Emit(compiler, &out_of_line_trace);
4203 : }
4204 : }
4205 :
4206 :
4207 495617 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4208 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4209 495617 : LimitResult limit_result = LimitVersions(compiler, trace);
4210 495617 : if (limit_result == DONE) return;
4211 : DCHECK(limit_result == CONTINUE);
4212 :
4213 : RecursionCheck rc(compiler);
4214 :
4215 266244 : switch (action_type_) {
4216 : case STORE_POSITION: {
4217 : Trace::DeferredCapture
4218 : new_capture(data_.u_position_register.reg,
4219 241749 : data_.u_position_register.is_capture,
4220 241749 : trace);
4221 241749 : Trace new_trace = *trace;
4222 : new_trace.add_action(&new_capture);
4223 241749 : on_success()->Emit(compiler, &new_trace);
4224 : break;
4225 : }
4226 : case INCREMENT_REGISTER: {
4227 : Trace::DeferredIncrementRegister
4228 3750 : new_increment(data_.u_increment_register.reg);
4229 3750 : Trace new_trace = *trace;
4230 : new_trace.add_action(&new_increment);
4231 3750 : on_success()->Emit(compiler, &new_trace);
4232 : break;
4233 : }
4234 : case SET_REGISTER: {
4235 : Trace::DeferredSetRegister
4236 3471 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4237 3471 : Trace new_trace = *trace;
4238 : new_trace.add_action(&new_set);
4239 3471 : on_success()->Emit(compiler, &new_trace);
4240 : break;
4241 : }
4242 : case CLEAR_CAPTURES: {
4243 : Trace::DeferredClearCaptures
4244 : new_capture(Interval(data_.u_clear_captures.range_from,
4245 2232 : data_.u_clear_captures.range_to));
4246 2232 : Trace new_trace = *trace;
4247 : new_trace.add_action(&new_capture);
4248 2232 : on_success()->Emit(compiler, &new_trace);
4249 : break;
4250 : }
4251 : case BEGIN_SUBMATCH:
4252 9483 : if (!trace->is_trivial()) {
4253 5046 : trace->Flush(compiler, this);
4254 : } else {
4255 4437 : assembler->WriteCurrentPositionToRegister(
4256 8874 : data_.u_submatch.current_position_register, 0);
4257 4437 : assembler->WriteStackPointerToRegister(
4258 8874 : data_.u_submatch.stack_pointer_register);
4259 4437 : on_success()->Emit(compiler, trace);
4260 : }
4261 : break;
4262 : case EMPTY_MATCH_CHECK: {
4263 973 : int start_pos_reg = data_.u_empty_match_check.start_register;
4264 973 : int stored_pos = 0;
4265 973 : int rep_reg = data_.u_empty_match_check.repetition_register;
4266 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4267 973 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4268 973 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4269 : // If we know we haven't advanced and there is no minimum we
4270 : // can just backtrack immediately.
4271 76 : assembler->GoTo(trace->backtrack());
4272 897 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4273 : // If we know we've advanced we can generate the continuation
4274 : // immediately.
4275 247 : on_success()->Emit(compiler, trace);
4276 650 : } else if (!trace->is_trivial()) {
4277 339 : trace->Flush(compiler, this);
4278 : } else {
4279 311 : Label skip_empty_check;
4280 : // If we have a minimum number of repetitions we check the current
4281 : // number first and skip the empty check if it's not enough.
4282 311 : if (has_minimum) {
4283 206 : int limit = data_.u_empty_match_check.repetition_limit;
4284 206 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4285 : }
4286 : // If the match is empty we bail out, otherwise we fall through
4287 : // to the on-success continuation.
4288 311 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4289 622 : trace->backtrack());
4290 311 : assembler->Bind(&skip_empty_check);
4291 311 : on_success()->Emit(compiler, trace);
4292 : }
4293 : break;
4294 : }
4295 : case POSITIVE_SUBMATCH_SUCCESS: {
4296 4586 : if (!trace->is_trivial()) {
4297 2956 : trace->Flush(compiler, this);
4298 2956 : return;
4299 : }
4300 1630 : assembler->ReadCurrentPositionFromRegister(
4301 3260 : data_.u_submatch.current_position_register);
4302 1630 : assembler->ReadStackPointerFromRegister(
4303 3260 : data_.u_submatch.stack_pointer_register);
4304 1630 : int clear_register_count = data_.u_submatch.clear_register_count;
4305 1630 : if (clear_register_count == 0) {
4306 1147 : on_success()->Emit(compiler, trace);
4307 1147 : return;
4308 : }
4309 483 : int clear_registers_from = data_.u_submatch.clear_register_from;
4310 483 : Label clear_registers_backtrack;
4311 483 : Trace new_trace = *trace;
4312 : new_trace.set_backtrack(&clear_registers_backtrack);
4313 483 : on_success()->Emit(compiler, &new_trace);
4314 :
4315 483 : assembler->Bind(&clear_registers_backtrack);
4316 483 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4317 483 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4318 :
4319 : DCHECK(trace->backtrack() == nullptr);
4320 483 : assembler->Backtrack();
4321 483 : return;
4322 : }
4323 : default:
4324 0 : UNREACHABLE();
4325 : }
4326 : }
4327 :
4328 :
4329 4745 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4330 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4331 4745 : if (!trace->is_trivial()) {
4332 2254 : trace->Flush(compiler, this);
4333 2254 : return;
4334 : }
4335 :
4336 2491 : LimitResult limit_result = LimitVersions(compiler, trace);
4337 2491 : if (limit_result == DONE) return;
4338 : DCHECK(limit_result == CONTINUE);
4339 :
4340 : RecursionCheck rc(compiler);
4341 :
4342 : DCHECK_EQ(start_reg_ + 1, end_reg_);
4343 2291 : if (IgnoreCase(flags_)) {
4344 1682 : assembler->CheckNotBackReferenceIgnoreCase(
4345 3364 : start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4346 : } else {
4347 609 : assembler->CheckNotBackReference(start_reg_, read_backward(),
4348 1218 : trace->backtrack());
4349 : }
4350 : // We are going to advance backward, so we may end up at the start.
4351 2291 : if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4352 :
4353 : // Check that the back reference does not end inside a surrogate pair.
4354 2291 : if (IsUnicode(flags_) && !compiler->one_byte()) {
4355 80 : assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4356 : }
4357 2291 : on_success()->Emit(compiler, trace);
4358 : }
4359 :
4360 :
4361 : // -------------------------------------------------------------------
4362 : // Dot/dotty output
4363 :
4364 :
4365 : #ifdef DEBUG
4366 :
4367 :
4368 : class DotPrinter: public NodeVisitor {
4369 : public:
4370 : DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4371 : : os_(os),
4372 : ignore_case_(ignore_case) {}
4373 : void PrintNode(const char* label, RegExpNode* node);
4374 : void Visit(RegExpNode* node);
4375 : void PrintAttributes(RegExpNode* from);
4376 : void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4377 : #define DECLARE_VISIT(Type) \
4378 : virtual void Visit##Type(Type##Node* that);
4379 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4380 : #undef DECLARE_VISIT
4381 : private:
4382 : std::ostream& os_;
4383 : bool ignore_case_;
4384 : };
4385 :
4386 :
4387 : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4388 : os_ << "digraph G {\n graph [label=\"";
4389 : for (int i = 0; label[i]; i++) {
4390 : switch (label[i]) {
4391 : case '\\':
4392 : os_ << "\\\\";
4393 : break;
4394 : case '"':
4395 : os_ << "\"";
4396 : break;
4397 : default:
4398 : os_ << label[i];
4399 : break;
4400 : }
4401 : }
4402 : os_ << "\"];\n";
4403 : Visit(node);
4404 : os_ << "}" << std::endl;
4405 : }
4406 :
4407 :
4408 : void DotPrinter::Visit(RegExpNode* node) {
4409 : if (node->info()->visited) return;
4410 : node->info()->visited = true;
4411 : node->Accept(this);
4412 : }
4413 :
4414 :
4415 : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4416 : os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4417 : Visit(on_failure);
4418 : }
4419 :
4420 :
4421 : class TableEntryBodyPrinter {
4422 : public:
4423 : TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4424 : : os_(os),
4425 : choice_(choice) {}
4426 : void Call(uc16 from, DispatchTable::Entry entry) {
4427 : OutSet* out_set = entry.out_set();
4428 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4429 : if (out_set->Get(i)) {
4430 : os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4431 : << choice()->alternatives()->at(i).node() << ";\n";
4432 : }
4433 : }
4434 : }
4435 : private:
4436 : ChoiceNode* choice() { return choice_; }
4437 : std::ostream& os_;
4438 : ChoiceNode* choice_;
4439 : };
4440 :
4441 :
4442 : class TableEntryHeaderPrinter {
4443 : public:
4444 : explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4445 : : first_(true),
4446 : os_(os) {}
4447 : void Call(uc16 from, DispatchTable::Entry entry) {
4448 : if (first_) {
4449 : first_ = false;
4450 : } else {
4451 : os_ << "|";
4452 : }
4453 : os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4454 : OutSet* out_set = entry.out_set();
4455 : int priority = 0;
4456 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4457 : if (out_set->Get(i)) {
4458 : if (priority > 0) os_ << "|";
4459 : os_ << "<s" << from << "o" << i << "> " << priority;
4460 : priority++;
4461 : }
4462 : }
4463 : os_ << "}}";
4464 : }
4465 :
4466 : private:
4467 : bool first_;
4468 : std::ostream& os_;
4469 : };
4470 :
4471 :
// Helper that writes "|"-separated record fields describing node attributes.
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Writes "|" before every field except the first one.
  void PrintSeparator() {
    if (!first_) os_ << "|";
    first_ = false;
  }

  // Writes "{name}" if |value| is true; writes nothing otherwise.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Writes "{name|value}" if |value| is not negative; writes nothing
  // otherwise.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
4499 :
4500 :
4501 : void DotPrinter::PrintAttributes(RegExpNode* that) {
4502 : os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4503 : << "margin=0.1, fontsize=10, label=\"{";
4504 : AttributePrinter printer(os_);
4505 : NodeInfo* info = that->info();
4506 : printer.PrintBit("NI", info->follows_newline_interest);
4507 : printer.PrintBit("WI", info->follows_word_interest);
4508 : printer.PrintBit("SI", info->follows_start_interest);
4509 : Label* label = that->label();
4510 : if (label->is_bound())
4511 : printer.PrintPositive("@", label->pos());
4512 : os_ << "}\"];\n"
4513 : << " a" << that << " -> n" << that
4514 : << " [style=dashed, color=grey, arrowhead=none];\n";
4515 : }
4516 :
4517 :
4518 : static const bool kPrintDispatchTable = false;
4519 : void DotPrinter::VisitChoice(ChoiceNode* that) {
4520 : if (kPrintDispatchTable) {
4521 : os_ << " n" << that << " [shape=Mrecord, label=\"";
4522 : TableEntryHeaderPrinter header_printer(os_);
4523 : that->GetTable(ignore_case_)->ForEach(&header_printer);
4524 : os_ << "\"]\n";
4525 : PrintAttributes(that);
4526 : TableEntryBodyPrinter body_printer(os_, that);
4527 : that->GetTable(ignore_case_)->ForEach(&body_printer);
4528 : } else {
4529 : os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4530 : for (int i = 0; i < that->alternatives()->length(); i++) {
4531 : GuardedAlternative alt = that->alternatives()->at(i);
4532 : os_ << " n" << that << " -> n" << alt.node();
4533 : }
4534 : }
4535 : for (int i = 0; i < that->alternatives()->length(); i++) {
4536 : GuardedAlternative alt = that->alternatives()->at(i);
4537 : alt.node()->Accept(this);
4538 : }
4539 : }
4540 :
4541 :
4542 : void DotPrinter::VisitText(TextNode* that) {
4543 : Zone* zone = that->zone();
4544 : os_ << " n" << that << " [label=\"";
4545 : for (int i = 0; i < that->elements()->length(); i++) {
4546 : if (i > 0) os_ << " ";
4547 : TextElement elm = that->elements()->at(i);
4548 : switch (elm.text_type()) {
4549 : case TextElement::ATOM: {
4550 : Vector<const uc16> data = elm.atom()->data();
4551 : for (int i = 0; i < data.length(); i++) {
4552 : os_ << static_cast<char>(data[i]);
4553 : }
4554 : break;
4555 : }
4556 : case TextElement::CHAR_CLASS: {
4557 : RegExpCharacterClass* node = elm.char_class();
4558 : os_ << "[";
4559 : if (node->is_negated()) os_ << "^";
4560 : for (int j = 0; j < node->ranges(zone)->length(); j++) {
4561 : CharacterRange range = node->ranges(zone)->at(j);
4562 : os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4563 : }
4564 : os_ << "]";
4565 : break;
4566 : }
4567 : default:
4568 : UNREACHABLE();
4569 : }
4570 : }
4571 : os_ << "\", shape=box, peripheries=2];\n";
4572 : PrintAttributes(that);
4573 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4574 : Visit(that->on_success());
4575 : }
4576 :
4577 :
4578 : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4579 : os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4580 : << that->end_register() << "\", shape=doubleoctagon];\n";
4581 : PrintAttributes(that);
4582 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4583 : Visit(that->on_success());
4584 : }
4585 :
4586 :
4587 : void DotPrinter::VisitEnd(EndNode* that) {
4588 : os_ << " n" << that << " [style=bold, shape=point];\n";
4589 : PrintAttributes(that);
4590 : }
4591 :
4592 :
4593 : void DotPrinter::VisitAssertion(AssertionNode* that) {
4594 : os_ << " n" << that << " [";
4595 : switch (that->assertion_type()) {
4596 : case AssertionNode::AT_END:
4597 : os_ << "label=\"$\", shape=septagon";
4598 : break;
4599 : case AssertionNode::AT_START:
4600 : os_ << "label=\"^\", shape=septagon";
4601 : break;
4602 : case AssertionNode::AT_BOUNDARY:
4603 : os_ << "label=\"\\b\", shape=septagon";
4604 : break;
4605 : case AssertionNode::AT_NON_BOUNDARY:
4606 : os_ << "label=\"\\B\", shape=septagon";
4607 : break;
4608 : case AssertionNode::AFTER_NEWLINE:
4609 : os_ << "label=\"(?<=\\n)\", shape=septagon";
4610 : break;
4611 : }
4612 : os_ << "];\n";
4613 : PrintAttributes(that);
4614 : RegExpNode* successor = that->on_success();
4615 : os_ << " n" << that << " -> n" << successor << ";\n";
4616 : Visit(successor);
4617 : }
4618 :
4619 :
4620 : void DotPrinter::VisitAction(ActionNode* that) {
4621 : os_ << " n" << that << " [";
4622 : switch (that->action_type_) {
4623 : case ActionNode::SET_REGISTER:
4624 : os_ << "label=\"$" << that->data_.u_store_register.reg
4625 : << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4626 : break;
4627 : case ActionNode::INCREMENT_REGISTER:
4628 : os_ << "label=\"$" << that->data_.u_increment_register.reg
4629 : << "++\", shape=octagon";
4630 : break;
4631 : case ActionNode::STORE_POSITION:
4632 : os_ << "label=\"$" << that->data_.u_position_register.reg
4633 : << ":=$pos\", shape=octagon";
4634 : break;
4635 : case ActionNode::BEGIN_SUBMATCH:
4636 : os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4637 : << ":=$pos,begin\", shape=septagon";
4638 : break;
4639 : case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4640 : os_ << "label=\"escape\", shape=septagon";
4641 : break;
4642 : case ActionNode::EMPTY_MATCH_CHECK:
4643 : os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4644 : << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4645 : << "<" << that->data_.u_empty_match_check.repetition_limit
4646 : << "?\", shape=septagon";
4647 : break;
4648 : case ActionNode::CLEAR_CAPTURES: {
4649 : os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4650 : << " to $" << that->data_.u_clear_captures.range_to
4651 : << "\", shape=septagon";
4652 : break;
4653 : }
4654 : }
4655 : os_ << "];\n";
4656 : PrintAttributes(that);
4657 : RegExpNode* successor = that->on_success();
4658 : os_ << " n" << that << " -> n" << successor << ";\n";
4659 : Visit(successor);
4660 : }
4661 :
4662 :
4663 : class DispatchTableDumper {
4664 : public:
4665 : explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4666 : void Call(uc16 key, DispatchTable::Entry entry);
4667 : private:
4668 : std::ostream& os_;
4669 : };
4670 :
4671 :
4672 : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4673 : os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4674 : OutSet* set = entry.out_set();
4675 : bool first = true;
4676 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4677 : if (set->Get(i)) {
4678 : if (first) {
4679 : first = false;
4680 : } else {
4681 : os_ << ", ";
4682 : }
4683 : os_ << i;
4684 : }
4685 : }
4686 : os_ << "}\n";
4687 : }
4688 :
4689 :
4690 : void DispatchTable::Dump() {
4691 : OFStream os(stderr);
4692 : DispatchTableDumper dumper(os);
4693 : tree()->ForEach(&dumper);
4694 : }
4695 :
4696 :
4697 : void RegExpEngine::DotPrint(const char* label,
4698 : RegExpNode* node,
4699 : bool ignore_case) {
4700 : StdoutStream os;
4701 : DotPrinter printer(os, ignore_case);
4702 : printer.PrintNode(label, node);
4703 : }
4704 :
4705 :
4706 : #endif // DEBUG
4707 :
4708 :
4709 : // -------------------------------------------------------------------
4710 : // Tree to graph conversion
4711 :
4712 992686 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4713 : RegExpNode* on_success) {
4714 : ZoneList<TextElement>* elms =
4715 : new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4716 992686 : elms->Add(TextElement::Atom(this), compiler->zone());
4717 : return new (compiler->zone())
4718 992686 : TextNode(elms, compiler->read_backward(), on_success);
4719 : }
4720 :
4721 :
4722 18669 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4723 : RegExpNode* on_success) {
4724 : return new (compiler->zone())
4725 18669 : TextNode(elements(), compiler->read_backward(), on_success);
4726 : }
4727 :
4728 :
4729 558166 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4730 : const int* special_class,
4731 : int length) {
4732 558166 : length--; // Remove final marker.
4733 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4734 : DCHECK_NE(0, ranges->length());
4735 : DCHECK_NE(0, length);
4736 : DCHECK_NE(0, special_class[0]);
4737 558166 : if (ranges->length() != (length >> 1) + 1) {
4738 : return false;
4739 : }
4740 10234 : CharacterRange range = ranges->at(0);
4741 10234 : if (range.from() != 0) {
4742 : return false;
4743 : }
4744 58424 : for (int i = 0; i < length; i += 2) {
4745 25579 : if (special_class[i] != (range.to() + 1)) {
4746 : return false;
4747 : }
4748 50048 : range = ranges->at((i >> 1) + 1);
4749 25024 : if (special_class[i+1] != range.from()) {
4750 : return false;
4751 : }
4752 : }
4753 7821 : if (range.to() != String::kMaxCodePoint) {
4754 : return false;
4755 : }
4756 7821 : return true;
4757 : }
4758 :
4759 :
4760 552542 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4761 : const int* special_class,
4762 : int length) {
4763 552542 : length--; // Remove final marker.
4764 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4765 552542 : if (ranges->length() * 2 != length) {
4766 : return false;
4767 : }
4768 27785 : for (int i = 0; i < length; i += 2) {
4769 29386 : CharacterRange range = ranges->at(i >> 1);
4770 25908 : if (range.from() != special_class[i] ||
4771 11215 : range.to() != special_class[i + 1] - 1) {
4772 : return false;
4773 : }
4774 : }
4775 : return true;
4776 : }
4777 :
4778 :
4779 198688 : bool RegExpCharacterClass::is_standard(Zone* zone) {
4780 : // TODO(lrn): Remove need for this function, by not throwing away information
4781 : // along the way.
4782 198688 : if (is_negated()) {
4783 : return false;
4784 : }
4785 193009 : if (set_.is_standard()) {
4786 : return true;
4787 : }
4788 189728 : if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4789 : set_.set_standard_set_type('s');
4790 607 : return true;
4791 : }
4792 189121 : if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4793 : set_.set_standard_set_type('S');
4794 207 : return true;
4795 : }
4796 188914 : if (CompareInverseRanges(set_.ranges(zone),
4797 : kLineTerminatorRanges,
4798 : kLineTerminatorRangeCount)) {
4799 : set_.set_standard_set_type('.');
4800 7502 : return true;
4801 : }
4802 181412 : if (CompareRanges(set_.ranges(zone),
4803 : kLineTerminatorRanges,
4804 : kLineTerminatorRangeCount)) {
4805 : set_.set_standard_set_type('n');
4806 10 : return true;
4807 : }
4808 181402 : if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4809 : set_.set_standard_set_type('w');
4810 1271 : return true;
4811 : }
4812 180131 : if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4813 : set_.set_standard_set_type('W');
4814 112 : return true;
4815 : }
4816 : return false;
4817 : }
4818 :
4819 :
4820 2587 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4821 : ZoneList<CharacterRange>* base)
4822 : : zone_(zone),
4823 : table_(zone),
4824 : bmp_(nullptr),
4825 : lead_surrogates_(nullptr),
4826 : trail_surrogates_(nullptr),
4827 5174 : non_bmp_(nullptr) {
4828 : // The unicode range splitter categorizes given character ranges into:
4829 : // - Code points from the BMP representable by one code unit.
4830 : // - Code points outside the BMP that need to be split into surrogate pairs.
4831 : // - Lone lead surrogates.
4832 : // - Lone trail surrogates.
4833 : // Lone surrogates are valid code points, even though no actual characters.
4834 : // They require special matching to make sure we do not split surrogate pairs.
4835 : // We use the dispatch table to accomplish this. The base range is split up
4836 : // by the table by the overlay ranges, and the Call callback is used to
4837 : // filter and collect ranges for each category.
4838 153083 : for (int i = 0; i < base->length(); i++) {
4839 150496 : table_.AddRange(base->at(i), kBase, zone_);
4840 : }
4841 : // Add overlay ranges.
4842 2587 : table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4843 2587 : kBmpCodePoints, zone_);
4844 2587 : table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4845 2587 : kLeadSurrogates, zone_);
4846 2587 : table_.AddRange(
4847 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4848 2587 : kTrailSurrogates, zone_);
4849 2587 : table_.AddRange(
4850 : CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4851 2587 : kBmpCodePoints, zone_);
4852 2587 : table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4853 2587 : kNonBmpCodePoints, zone_);
4854 : table_.ForEach(this);
4855 2587 : }
4856 :
4857 :
4858 161041 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4859 : OutSet* outset = entry.out_set();
4860 161041 : if (!outset->Get(kBase)) return;
4861 : ZoneList<CharacterRange>** target = nullptr;
4862 79253 : if (outset->Get(kBmpCodePoints)) {
4863 50598 : target = &bmp_;
4864 28655 : } else if (outset->Get(kLeadSurrogates)) {
4865 1175 : target = &lead_surrogates_;
4866 27480 : } else if (outset->Get(kTrailSurrogates)) {
4867 1175 : target = &trail_surrogates_;
4868 : } else {
4869 : DCHECK(outset->Get(kNonBmpCodePoints));
4870 26305 : target = &non_bmp_;
4871 : }
4872 79253 : if (*target == nullptr)
4873 18201 : *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4874 79253 : (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4875 : }
4876 :
4877 2582 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4878 : RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4879 : ZoneList<CharacterRange>* bmp = splitter->bmp();
4880 3037 : if (bmp == nullptr) return;
4881 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4882 2127 : result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4883 : compiler->zone(), bmp, compiler->read_backward(), on_success,
4884 : default_flags)));
4885 : }
4886 :
4887 2582 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4888 : RegExpNode* on_success,
4889 : UnicodeRangeSplitter* splitter) {
4890 : ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4891 3584 : if (non_bmp == nullptr) return;
4892 : DCHECK(!compiler->one_byte());
4893 : Zone* zone = compiler->zone();
4894 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4895 1580 : CharacterRange::Canonicalize(non_bmp);
4896 54180 : for (int i = 0; i < non_bmp->length(); i++) {
4897 : // Match surrogate pair.
4898 : // E.g. [\u10005-\u11005] becomes
4899 : // \ud800[\udc05-\udfff]|
4900 : // [\ud801-\ud803][\udc00-\udfff]|
4901 : // \ud804[\udc00-\udc05]
4902 : uc32 from = non_bmp->at(i).from();
4903 : uc32 to = non_bmp->at(i).to();
4904 26300 : uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4905 : uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4906 26300 : uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4907 : uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4908 26300 : if (from_l == to_l) {
4909 : // The lead surrogate is the same.
4910 : result->AddAlternative(
4911 23990 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4912 : zone, CharacterRange::Singleton(from_l),
4913 : CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4914 : on_success, default_flags)));
4915 : } else {
4916 2310 : if (from_t != kTrailSurrogateStart) {
4917 : // Add [from_l][from_t-\udfff]
4918 : result->AddAlternative(
4919 1180 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4920 : zone, CharacterRange::Singleton(from_l),
4921 : CharacterRange::Range(from_t, kTrailSurrogateEnd),
4922 : compiler->read_backward(), on_success, default_flags)));
4923 1180 : from_l++;
4924 : }
4925 2310 : if (to_t != kTrailSurrogateEnd) {
4926 : // Add [to_l][\udc00-to_t]
4927 : result->AddAlternative(
4928 925 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4929 : zone, CharacterRange::Singleton(to_l),
4930 : CharacterRange::Range(kTrailSurrogateStart, to_t),
4931 : compiler->read_backward(), on_success, default_flags)));
4932 925 : to_l--;
4933 : }
4934 2310 : if (from_l <= to_l) {
4935 : // Add [from_l-to_l][\udc00-\udfff]
4936 : result->AddAlternative(
4937 2100 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4938 : zone, CharacterRange::Range(from_l, to_l),
4939 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4940 : compiler->read_backward(), on_success, default_flags)));
4941 : }
4942 : }
4943 : }
4944 : }
4945 :
4946 1175 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4947 : RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4948 : ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4949 : JSRegExp::Flags flags) {
4950 : Zone* zone = compiler->zone();
4951 1175 : RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4952 1175 : zone, match, read_backward, on_success, flags);
4953 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4954 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4955 : RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4956 1175 : position_register);
4957 1175 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4958 2350 : zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4959 1175 : return lookaround.ForMatch(negative_match);
4960 : }
4961 :
4962 1165 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4963 : RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4964 : ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4965 : bool read_backward, JSRegExp::Flags flags) {
4966 : Zone* zone = compiler->zone();
4967 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4968 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4969 : RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4970 1165 : position_register);
4971 1165 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4972 1165 : zone, lookahead, read_backward, lookaround.on_match_success(), flags);
4973 1165 : return TextNode::CreateForCharacterRanges(
4974 1165 : zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4975 : }
4976 :
4977 2582 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4978 : RegExpNode* on_success,
4979 : UnicodeRangeSplitter* splitter) {
4980 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4981 : ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4982 3994 : if (lead_surrogates == nullptr) return;
4983 : Zone* zone = compiler->zone();
4984 : // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4985 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4986 1170 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4987 :
4988 : RegExpNode* match;
4989 1170 : if (compiler->read_backward()) {
4990 : // Reading backward. Assert that reading forward, there is no trail
4991 : // surrogate, and then backward match the lead surrogate.
4992 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
4993 : compiler, trail_surrogates, lead_surrogates, on_success, true,
4994 95 : default_flags);
4995 : } else {
4996 : // Reading forward. Forward match the lead surrogate and assert that
4997 : // no trail surrogate follows.
4998 : match = MatchAndNegativeLookaroundInReadDirection(
4999 : compiler, lead_surrogates, trail_surrogates, on_success, false,
5000 1075 : default_flags);
5001 : }
5002 : result->AddAlternative(GuardedAlternative(match));
5003 : }
5004 :
5005 2582 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5006 : RegExpNode* on_success,
5007 : UnicodeRangeSplitter* splitter) {
5008 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5009 : ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5010 3994 : if (trail_surrogates == nullptr) return;
5011 : Zone* zone = compiler->zone();
5012 : // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5013 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5014 1170 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5015 :
5016 : RegExpNode* match;
5017 1170 : if (compiler->read_backward()) {
5018 : // Reading backward. Backward match the trail surrogate and assert that no
5019 : // lead surrogate precedes it.
5020 : match = MatchAndNegativeLookaroundInReadDirection(
5021 : compiler, trail_surrogates, lead_surrogates, on_success, true,
5022 90 : default_flags);
5023 : } else {
5024 : // Reading forward. Assert that reading backward, there is no lead
5025 : // surrogate, and then forward match the trail surrogate.
5026 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5027 : compiler, lead_surrogates, trail_surrogates, on_success, false,
5028 1080 : default_flags);
5029 : }
5030 : result->AddAlternative(GuardedAlternative(match));
5031 : }
5032 :
5033 0 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5034 : RegExpNode* on_success) {
5035 : // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5036 : DCHECK(!compiler->read_backward());
5037 : Zone* zone = compiler->zone();
5038 : // Advance any character. If the character happens to be a lead surrogate and
5039 : // we advanced into the middle of a surrogate pair, it will work out, as
5040 : // nothing will match from there. We will have to advance again, consuming
5041 : // the associated trail surrogate.
5042 : ZoneList<CharacterRange>* range = CharacterRange::List(
5043 0 : zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5044 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5045 : return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5046 0 : default_flags);
5047 : }
5048 :
5049 1189 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5050 : #ifdef V8_INTL_SUPPORT
5051 : DCHECK(CharacterRange::IsCanonical(ranges));
5052 :
5053 : // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5054 : // See also https://crbug.com/v8/6727.
5055 : // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5056 : // which we use frequently internally. But large ranges can also easily be
5057 : // created by the user. We might want to have a more general caching mechanism
5058 : // for such ranges.
5059 1728 : if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5060 :
5061 : // Use ICU to compute the case fold closure over the ranges.
5062 2378 : icu::UnicodeSet set;
5063 247697 : for (int i = 0; i < ranges->length(); i++) {
5064 123254 : set.add(ranges->at(i).from(), ranges->at(i).to());
5065 : }
5066 : ranges->Clear();
5067 1189 : set.closeOver(USET_CASE_INSENSITIVE);
5068 : // Full case mapping map single characters to multiple characters.
5069 : // Those are represented as strings in the set. Remove them so that
5070 : // we end up with only simple and common case mappings.
5071 1189 : set.removeAllStrings();
5072 37731 : for (int i = 0; i < set.getRangeCount(); i++) {
5073 36542 : ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5074 18271 : zone);
5075 : }
5076 : // No errors and everything we collected have been ranges.
5077 1189 : CharacterRange::Canonicalize(ranges);
5078 : #endif // V8_INTL_SUPPORT
5079 : }
5080 :
5081 :
5082 176260 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5083 : RegExpNode* on_success) {
5084 : set_.Canonicalize();
5085 : Zone* zone = compiler->zone();
5086 : ZoneList<CharacterRange>* ranges = this->ranges(zone);
5087 176260 : if (NeedsUnicodeCaseEquivalents(flags_)) {
5088 949 : AddUnicodeCaseEquivalents(ranges, zone);
5089 : }
5090 178882 : if (IsUnicode(flags_) && !compiler->one_byte() &&
5091 : !contains_split_surrogate()) {
5092 2612 : if (is_negated()) {
5093 : ZoneList<CharacterRange>* negated =
5094 : new (zone) ZoneList<CharacterRange>(2, zone);
5095 140 : CharacterRange::Negate(ranges, negated, zone);
5096 : ranges = negated;
5097 : }
5098 2612 : if (ranges->length() == 0) {
5099 : JSRegExp::Flags default_flags;
5100 : RegExpCharacterClass* fail =
5101 60 : new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5102 60 : return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5103 : }
5104 2582 : if (standard_type() == '*') {
5105 0 : return UnanchoredAdvance(compiler, on_success);
5106 : } else {
5107 2582 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5108 2582 : UnicodeRangeSplitter splitter(zone, ranges);
5109 2582 : AddBmpCharacters(compiler, result, on_success, &splitter);
5110 2582 : AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5111 2582 : AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5112 2582 : AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5113 : return result;
5114 : }
5115 : } else {
5116 347296 : return new (zone) TextNode(this, compiler->read_backward(), on_success);
5117 : }
5118 : }
5119 :
5120 :
5121 146822 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5122 146822 : RegExpAtom* atom1 = (*a)->AsAtom();
5123 146822 : RegExpAtom* atom2 = (*b)->AsAtom();
5124 146822 : uc16 character1 = atom1->data().at(0);
5125 146822 : uc16 character2 = atom2->data().at(0);
5126 146822 : if (character1 < character2) return -1;
5127 129859 : if (character1 > character2) return 1;
5128 17383 : return 0;
5129 : }
5130 :
5131 : #ifdef V8_INTL_SUPPORT
5132 :
5133 : // Case Insensitve comparesion
5134 63041 : int CompareFirstCharCaseInsensitve(RegExpTree* const* a, RegExpTree* const* b) {
5135 63041 : RegExpAtom* atom1 = (*a)->AsAtom();
5136 63041 : RegExpAtom* atom2 = (*b)->AsAtom();
5137 126082 : icu::UnicodeString character1(atom1->data().at(0));
5138 126082 : return character1.caseCompare(atom2->data().at(0), U_FOLD_CASE_DEFAULT);
5139 : }
5140 :
5141 : #else
5142 :
5143 : static unibrow::uchar Canonical(
5144 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5145 : unibrow::uchar c) {
5146 : unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5147 : int length = canonicalize->get(c, '\0', chars);
5148 : DCHECK_LE(length, 1);
5149 : unibrow::uchar canonical = c;
5150 : if (length == 1) canonical = chars[0];
5151 : return canonical;
5152 : }
5153 :
5154 : int CompareFirstCharCaseIndependent(
5155 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5156 : RegExpTree* const* a, RegExpTree* const* b) {
5157 : RegExpAtom* atom1 = (*a)->AsAtom();
5158 : RegExpAtom* atom2 = (*b)->AsAtom();
5159 : unibrow::uchar character1 = atom1->data().at(0);
5160 : unibrow::uchar character2 = atom2->data().at(0);
5161 : if (character1 == character2) return 0;
5162 : if (character1 >= 'a' || character2 >= 'a') {
5163 : character1 = Canonical(canonicalize, character1);
5164 : character2 = Canonical(canonicalize, character2);
5165 : }
5166 : return static_cast<int>(character1) - static_cast<int>(character2);
5167 : }
5168 : #endif // V8_INTL_SUPPORT
5169 :
5170 : // We can stable sort runs of atoms, since the order does not matter if they
5171 : // start with different characters.
5172 : // Returns true if any consecutive atoms were found.
5173 9303 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5174 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5175 : int length = alternatives->length();
5176 : bool found_consecutive_atoms = false;
5177 26557 : for (int i = 0; i < length; i++) {
5178 62485 : while (i < length) {
5179 35145 : RegExpTree* alternative = alternatives->at(i);
5180 35145 : if (alternative->IsAtom()) break;
5181 26518 : i++;
5182 : }
5183 : // i is length or it is the index of an atom.
5184 9449 : if (i == length) break;
5185 : int first_atom = i;
5186 8627 : JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5187 8627 : i++;
5188 120869 : while (i < length) {
5189 56362 : RegExpTree* alternative = alternatives->at(i);
5190 56362 : if (!alternative->IsAtom()) break;
5191 56121 : if (alternative->AsAtom()->flags() != flags) break;
5192 56121 : i++;
5193 : }
5194 : // Sort atoms to get ones with common prefixes together.
5195 : // This step is more tricky if we are in a case-independent regexp,
5196 : // because it would change /is|I/ to /I|is/, and order matters when
5197 : // the regexp parts don't match only disjoint starting points. To fix
5198 : // this we have a version of CompareFirstChar that uses case-
5199 : // independent character classes for comparison.
5200 : DCHECK_LT(first_atom, alternatives->length());
5201 : DCHECK_LE(i, alternatives->length());
5202 : DCHECK_LE(first_atom, i);
5203 8627 : if (IgnoreCase(flags)) {
5204 : #ifdef V8_INTL_SUPPORT
5205 474 : alternatives->StableSort(CompareFirstCharCaseInsensitve, first_atom,
5206 474 : i - first_atom);
5207 : #else
5208 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5209 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5210 : auto compare_closure =
5211 : [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5212 : return CompareFirstCharCaseIndependent(canonicalize, a, b);
5213 : };
5214 : alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5215 : #endif // V8_INTL_SUPPORT
5216 : } else {
5217 8153 : alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5218 : }
5219 8627 : if (i - first_atom > 1) found_consecutive_atoms = true;
5220 : }
5221 9303 : return found_consecutive_atoms;
5222 : }
5223 :
5224 :
5225 : // Optimizes ab|ac|az to a(?:b|c|d).
5226 8370 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5227 : Zone* zone = compiler->zone();
5228 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5229 : int length = alternatives->length();
5230 :
5231 : int write_posn = 0;
5232 : int i = 0;
5233 73522 : while (i < length) {
5234 65152 : RegExpTree* alternative = alternatives->at(i);
5235 65152 : if (!alternative->IsAtom()) {
5236 18582 : alternatives->at(write_posn++) = alternatives->at(i);
5237 9291 : i++;
5238 9291 : continue;
5239 : }
5240 55861 : RegExpAtom* const atom = alternative->AsAtom();
5241 : JSRegExp::Flags flags = atom->flags();
5242 : #ifdef V8_INTL_SUPPORT
5243 111722 : icu::UnicodeString common_prefix(atom->data().at(0));
5244 : #else
5245 : unibrow::uchar common_prefix = atom->data().at(0);
5246 : #endif // V8_INTL_SUPPORT
5247 : int first_with_prefix = i;
5248 : int prefix_length = atom->length();
5249 55861 : i++;
5250 64491 : while (i < length) {
5251 56221 : alternative = alternatives->at(i);
5252 56221 : if (!alternative->IsAtom()) break;
5253 56121 : RegExpAtom* const atom = alternative->AsAtom();
5254 56121 : if (atom->flags() != flags) break;
5255 : #ifdef V8_INTL_SUPPORT
5256 64751 : icu::UnicodeString new_prefix(atom->data().at(0));
5257 56121 : if (new_prefix != common_prefix) {
5258 47645 : if (!IgnoreCase(flags)) break;
5259 4578 : if (common_prefix.caseCompare(new_prefix, U_FOLD_CASE_DEFAULT) != 0)
5260 : break;
5261 : }
5262 : #else
5263 : unibrow::uchar new_prefix = atom->data().at(0);
5264 : if (new_prefix != common_prefix) {
5265 : if (!IgnoreCase(flags)) break;
5266 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5267 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5268 : new_prefix = Canonical(canonicalize, new_prefix);
5269 : common_prefix = Canonical(canonicalize, common_prefix);
5270 : if (new_prefix != common_prefix) break;
5271 : }
5272 : #endif // V8_INTL_SUPPORT
5273 : prefix_length = Min(prefix_length, atom->length());
5274 8630 : i++;
5275 : }
5276 55861 : if (i > first_with_prefix + 2) {
5277 : // Found worthwhile run of alternatives with common prefix of at least one
5278 : // character. The sorting function above did not sort on more than one
5279 : // character for reasons of correctness, but there may still be a longer
5280 : // common prefix if the terms were similar or presorted in the input.
5281 : // Find out how long the common prefix is.
5282 268 : int run_length = i - first_with_prefix;
5283 268 : RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
5284 742 : for (int j = 1; j < run_length && prefix_length > 1; j++) {
5285 : RegExpAtom* old_atom =
5286 474 : alternatives->at(j + first_with_prefix)->AsAtom();
5287 477 : for (int k = 1; k < prefix_length; k++) {
5288 711 : if (atom->data().at(k) != old_atom->data().at(k)) {
5289 : prefix_length = k;
5290 : break;
5291 : }
5292 : }
5293 : }
5294 : RegExpAtom* prefix = new (zone)
5295 268 : RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5296 : ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5297 268 : pair->Add(prefix, zone);
5298 : ZoneList<RegExpTree*>* suffixes =
5299 : new (zone) ZoneList<RegExpTree*>(run_length, zone);
5300 17600 : for (int j = 0; j < run_length; j++) {
5301 : RegExpAtom* old_atom =
5302 17332 : alternatives->at(j + first_with_prefix)->AsAtom();
5303 : int len = old_atom->length();
5304 8666 : if (len == prefix_length) {
5305 302 : suffixes->Add(new (zone) RegExpEmpty(), zone);
5306 : } else {
5307 : RegExpTree* suffix = new (zone) RegExpAtom(
5308 8515 : old_atom->data().SubVector(prefix_length, old_atom->length()),
5309 8515 : flags);
5310 8515 : suffixes->Add(suffix, zone);
5311 : }
5312 : }
5313 268 : pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5314 536 : alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5315 : } else {
5316 : // Just copy any non-worthwhile alternatives.
5317 167243 : for (int j = first_with_prefix; j < i; j++) {
5318 111650 : alternatives->at(write_posn++) = alternatives->at(j);
5319 : }
5320 : }
5321 : }
5322 : alternatives->Rewind(write_posn); // Trim end of array.
5323 8370 : }
5324 :
5325 :
5326 : // Optimizes b|c|z to [bcz].
5327 9303 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
5328 : RegExpCompiler* compiler) {
5329 : Zone* zone = compiler->zone();
5330 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5331 : int length = alternatives->length();
5332 :
5333 : int write_posn = 0;
5334 : int i = 0;
5335 83975 : while (i < length) {
5336 74672 : RegExpTree* alternative = alternatives->at(i);
5337 74672 : if (!alternative->IsAtom()) {
5338 54054 : alternatives->at(write_posn++) = alternatives->at(i);
5339 27027 : i++;
5340 27027 : continue;
5341 : }
5342 47645 : RegExpAtom* const atom = alternative->AsAtom();
5343 47645 : if (atom->length() != 1) {
5344 78592 : alternatives->at(write_posn++) = alternatives->at(i);
5345 39296 : i++;
5346 39296 : continue;
5347 : }
5348 : JSRegExp::Flags flags = atom->flags();
5349 : DCHECK_IMPLIES(IsUnicode(flags),
5350 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5351 : bool contains_trail_surrogate =
5352 8349 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5353 : int first_in_run = i;
5354 8349 : i++;
5355 : // Find a run of single-character atom alternatives that have identical
5356 : // flags (case independence and unicode-ness).
5357 25223 : while (i < length) {
5358 16454 : alternative = alternatives->at(i);
5359 16454 : if (!alternative->IsAtom()) break;
5360 16223 : RegExpAtom* const atom = alternative->AsAtom();
5361 16223 : if (atom->length() != 1) break;
5362 8437 : if (atom->flags() != flags) break;
5363 : DCHECK_IMPLIES(IsUnicode(flags),
5364 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5365 : contains_trail_surrogate |=
5366 16874 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5367 8437 : i++;
5368 : }
5369 8349 : if (i > first_in_run + 1) {
5370 : // Found non-trivial run of single-character alternatives.
5371 271 : int run_length = i - first_in_run;
5372 : ZoneList<CharacterRange>* ranges =
5373 : new (zone) ZoneList<CharacterRange>(2, zone);
5374 17687 : for (int j = 0; j < run_length; j++) {
5375 17416 : RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5376 : DCHECK_EQ(old_atom->length(), 1);
5377 8708 : ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5378 : }
5379 : RegExpCharacterClass::CharacterClassFlags character_class_flags;
5380 271 : if (IsUnicode(flags) && contains_trail_surrogate) {
5381 : character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5382 : }
5383 271 : alternatives->at(write_posn++) = new (zone)
5384 813 : RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5385 : } else {
5386 : // Just copy any trivial alternatives.
5387 24234 : for (int j = first_in_run; j < i; j++) {
5388 16156 : alternatives->at(write_posn++) = alternatives->at(j);
5389 : }
5390 : }
5391 : }
5392 : alternatives->Rewind(write_posn); // Trim end of array.
5393 9303 : }
5394 :
5395 :
5396 10931 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5397 : RegExpNode* on_success) {
5398 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5399 :
5400 10931 : if (alternatives->length() > 2) {
5401 9303 : bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5402 9303 : if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5403 9303 : FixSingleCharacterDisjunctions(compiler);
5404 9303 : if (alternatives->length() == 1) {
5405 242 : return alternatives->at(0)->ToNode(compiler, on_success);
5406 : }
5407 : }
5408 :
5409 : int length = alternatives->length();
5410 :
5411 : ChoiceNode* result =
5412 10689 : new(compiler->zone()) ChoiceNode(length, compiler->zone());
5413 166055 : for (int i = 0; i < length; i++) {
5414 : GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5415 77683 : on_success));
5416 : result->AddAlternative(alternative);
5417 : }
5418 : return result;
5419 : }
5420 :
5421 :
5422 926334 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5423 : RegExpNode* on_success) {
5424 926334 : return ToNode(min(),
5425 : max(),
5426 : is_greedy(),
5427 : body(),
5428 : compiler,
5429 926334 : on_success);
5430 : }
5431 :
5432 :
5433 : // Scoped object to keep track of how much we unroll quantifier loops in the
5434 : // regexp graph generator.
5435 : class RegExpExpansionLimiter {
5436 : public:
5437 : static const int kMaxExpansionFactor = 6;
5438 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5439 : : compiler_(compiler),
5440 : saved_expansion_factor_(compiler->current_expansion_factor()),
5441 62510 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5442 : DCHECK_LT(0, factor);
5443 72065 : if (ok_to_expand_) {
5444 72065 : if (factor > kMaxExpansionFactor) {
5445 : // Avoid integer overflow of the current expansion factor.
5446 : ok_to_expand_ = false;
5447 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5448 : } else {
5449 71937 : int new_factor = saved_expansion_factor_ * factor;
5450 71937 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5451 : compiler->set_current_expansion_factor(new_factor);
5452 : }
5453 : }
5454 : }
5455 :
5456 : ~RegExpExpansionLimiter() {
5457 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
5458 : }
5459 :
5460 : bool ok_to_expand() { return ok_to_expand_; }
5461 :
5462 : private:
5463 : RegExpCompiler* compiler_;
5464 : int saved_expansion_factor_;
5465 : bool ok_to_expand_;
5466 :
5467 : DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5468 : };
5469 :
5470 :
5471 1013442 : RegExpNode* RegExpQuantifier::ToNode(int min,
5472 : int max,
5473 : bool is_greedy,
5474 : RegExpTree* body,
5475 : RegExpCompiler* compiler,
5476 : RegExpNode* on_success,
5477 : bool not_at_start) {
5478 : // x{f, t} becomes this:
5479 : //
5480 : // (r++)<-.
5481 : // | `
5482 : // | (x)
5483 : // v ^
5484 : // (r=0)-->(?)---/ [if r < t]
5485 : // |
5486 : // [if r >= f] \----> ...
5487 : //
5488 :
5489 : // 15.10.2.5 RepeatMatcher algorithm.
5490 : // The parser has already eliminated the case where max is 0. In the case
5491 : // where max_match is zero the parser has removed the quantifier if min was
5492 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5493 :
5494 : // If we know that we cannot match zero length then things are a little
5495 : // simpler since we don't need to make the special zero length match check
5496 : // from step 2.1. If the min and max are small we can unroll a little in
5497 : // this case.
5498 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5499 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5500 1013442 : if (max == 0) return on_success; // This can happen due to recursion.
5501 1013087 : bool body_can_be_empty = (body->min_match() == 0);
5502 : int body_start_reg = RegExpCompiler::kNoRegister;
5503 1013087 : Interval capture_registers = body->CaptureRegisters();
5504 : bool needs_capture_clearing = !capture_registers.is_empty();
5505 : Zone* zone = compiler->zone();
5506 :
5507 1013087 : if (body_can_be_empty) {
5508 : body_start_reg = compiler->AllocateRegister();
5509 1012550 : } else if (compiler->optimize() && !needs_capture_clearing) {
5510 : // Only unroll if there are no captures and the body can't be
5511 : // empty.
5512 : {
5513 : RegExpExpansionLimiter limiter(
5514 62510 : compiler, min + ((max != min) ? 1 : 0));
5515 62510 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5516 4336 : int new_max = (max == kInfinity) ? max : max - min;
5517 : // Recurse once to get the loop or optional matches after the fixed
5518 : // ones.
5519 4336 : RegExpNode* answer = ToNode(
5520 4336 : 0, new_max, is_greedy, body, compiler, on_success, true);
5521 : // Unroll the forced matches from 0 to min. This can cause chains of
5522 : // TextNodes (which the parser does not generate). These should be
5523 : // combined if it turns out they hinder good code generation.
5524 14408 : for (int i = 0; i < min; i++) {
5525 5036 : answer = body->ToNode(compiler, answer);
5526 : }
5527 : return answer;
5528 : }
5529 : }
5530 58174 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
5531 : DCHECK_LT(0, max); // Due to the 'if' above.
5532 : RegExpExpansionLimiter limiter(compiler, max);
5533 9555 : if (limiter.ok_to_expand()) {
5534 : // Unroll the optional matches up to max.
5535 : RegExpNode* answer = on_success;
5536 28076 : for (int i = 0; i < max; i++) {
5537 9391 : ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5538 9391 : if (is_greedy) {
5539 9245 : alternation->AddAlternative(
5540 9245 : GuardedAlternative(body->ToNode(compiler, answer)));
5541 : alternation->AddAlternative(GuardedAlternative(on_success));
5542 : } else {
5543 : alternation->AddAlternative(GuardedAlternative(on_success));
5544 146 : alternation->AddAlternative(
5545 146 : GuardedAlternative(body->ToNode(compiler, answer)));
5546 : }
5547 : answer = alternation;
5548 9391 : if (not_at_start && !compiler->read_backward()) {
5549 : alternation->set_not_at_start();
5550 : }
5551 : }
5552 : return answer;
5553 : }
5554 : }
5555 : }
5556 999457 : bool has_min = min > 0;
5557 999457 : bool has_max = max < RegExpTree::kInfinity;
5558 999457 : bool needs_counter = has_min || has_max;
5559 : int reg_ctr = needs_counter
5560 : ? compiler->AllocateRegister()
5561 999457 : : RegExpCompiler::kNoRegister;
5562 : LoopChoiceNode* center = new (zone)
5563 999457 : LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5564 999457 : if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5565 : RegExpNode* loop_return = needs_counter
5566 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5567 999457 : : static_cast<RegExpNode*>(center);
5568 999457 : if (body_can_be_empty) {
5569 : // If the body can be empty we need to check if it was and then
5570 : // backtrack.
5571 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5572 : reg_ctr,
5573 : min,
5574 537 : loop_return);
5575 : }
5576 999457 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
5577 999457 : if (body_can_be_empty) {
5578 : // If the body can be empty we need to store the start position
5579 : // so we can bail out if it was empty.
5580 537 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5581 : }
5582 999457 : if (needs_capture_clearing) {
5583 : // Before entering the body of this loop we need to clear captures.
5584 2382 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5585 : }
5586 : GuardedAlternative body_alt(body_node);
5587 999457 : if (has_max) {
5588 : Guard* body_guard =
5589 : new(zone) Guard(reg_ctr, Guard::LT, max);
5590 902620 : body_alt.AddGuard(body_guard, zone);
5591 : }
5592 : GuardedAlternative rest_alt(on_success);
5593 999457 : if (has_min) {
5594 : Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5595 1322 : rest_alt.AddGuard(rest_guard, zone);
5596 : }
5597 999457 : if (is_greedy) {
5598 : center->AddLoopAlternative(body_alt);
5599 : center->AddContinueAlternative(rest_alt);
5600 : } else {
5601 : center->AddContinueAlternative(rest_alt);
5602 : center->AddLoopAlternative(body_alt);
5603 : }
5604 999457 : if (needs_counter) {
5605 903365 : return ActionNode::SetRegister(reg_ctr, 0, center);
5606 : } else {
5607 : return center;
5608 : }
5609 : }
5610 :
5611 : namespace {
5612 : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5613 : // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
5614 80 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5615 : RegExpNode* on_success,
5616 : RegExpAssertion::AssertionType type,
5617 : JSRegExp::Flags flags) {
5618 : DCHECK(NeedsUnicodeCaseEquivalents(flags));
5619 : Zone* zone = compiler->zone();
5620 : ZoneList<CharacterRange>* word_range =
5621 : new (zone) ZoneList<CharacterRange>(2, zone);
5622 80 : CharacterRange::AddClassEscape('w', word_range, true, zone);
5623 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5624 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5625 80 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5626 : // Add two choices. The (non-)boundary could start with a word or
5627 : // a non-word-character.
5628 400 : for (int i = 0; i < 2; i++) {
5629 160 : bool lookbehind_for_word = i == 0;
5630 : bool lookahead_for_word =
5631 160 : (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5632 : // Look to the left.
5633 : RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5634 160 : stack_register, position_register);
5635 : RegExpNode* backward = TextNode::CreateForCharacterRanges(
5636 160 : zone, word_range, true, lookbehind.on_match_success(), flags);
5637 : // Look to the right.
5638 : RegExpLookaround::Builder lookahead(lookahead_for_word,
5639 : lookbehind.ForMatch(backward),
5640 160 : stack_register, position_register);
5641 : RegExpNode* forward = TextNode::CreateForCharacterRanges(
5642 160 : zone, word_range, false, lookahead.on_match_success(), flags);
5643 160 : result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5644 : }
5645 80 : return result;
5646 : }
5647 : } // anonymous namespace
5648 :
5649 5528 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5650 : RegExpNode* on_success) {
5651 : NodeInfo info;
5652 : Zone* zone = compiler->zone();
5653 :
5654 5528 : switch (assertion_type()) {
5655 : case START_OF_LINE:
5656 129 : return AssertionNode::AfterNewline(on_success);
5657 : case START_OF_INPUT:
5658 3055 : return AssertionNode::AtStart(on_success);
5659 : case BOUNDARY:
5660 : return NeedsUnicodeCaseEquivalents(flags_)
5661 : ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5662 : flags_)
5663 176 : : AssertionNode::AtBoundary(on_success);
5664 : case NON_BOUNDARY:
5665 : return NeedsUnicodeCaseEquivalents(flags_)
5666 : ? BoundaryAssertionAsLookaround(compiler, on_success,
5667 : NON_BOUNDARY, flags_)
5668 154 : : AssertionNode::AtNonBoundary(on_success);
5669 : case END_OF_INPUT:
5670 1915 : return AssertionNode::AtEnd(on_success);
5671 : case END_OF_LINE: {
5672 : // Compile $ in multiline regexps as an alternation with a positive
5673 : // lookahead in one side and an end-of-input on the other side.
5674 : // We need two registers for the lookahead.
5675 : int stack_pointer_register = compiler->AllocateRegister();
5676 : int position_register = compiler->AllocateRegister();
5677 : // The ChoiceNode to distinguish between a newline and end-of-input.
5678 99 : ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5679 : // Create a newline atom.
5680 : ZoneList<CharacterRange>* newline_ranges =
5681 : new(zone) ZoneList<CharacterRange>(3, zone);
5682 99 : CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5683 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5684 : RegExpCharacterClass* newline_atom =
5685 : new (zone) RegExpCharacterClass('n', default_flags);
5686 : TextNode* newline_matcher = new (zone) TextNode(
5687 : newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5688 : stack_pointer_register, position_register,
5689 : 0, // No captures inside.
5690 : -1, // Ignored if no captures.
5691 198 : on_success));
5692 : // Create an end-of-input matcher.
5693 : RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5694 : stack_pointer_register,
5695 : position_register,
5696 99 : newline_matcher);
5697 : // Add the two alternatives to the ChoiceNode.
5698 : GuardedAlternative eol_alternative(end_of_line);
5699 : result->AddAlternative(eol_alternative);
5700 99 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5701 : result->AddAlternative(end_alternative);
5702 : return result;
5703 : }
5704 : default:
5705 0 : UNREACHABLE();
5706 : }
5707 : return on_success;
5708 : }
5709 :
5710 :
5711 2376 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5712 : RegExpNode* on_success) {
5713 : return new (compiler->zone())
5714 : BackReferenceNode(RegExpCapture::StartRegister(index()),
5715 : RegExpCapture::EndRegister(index()), flags_,
5716 4752 : compiler->read_backward(), on_success);
5717 : }
5718 :
5719 :
5720 1050 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5721 : RegExpNode* on_success) {
5722 1050 : return on_success;
5723 : }
5724 :
5725 :
5726 4368 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5727 : int stack_pointer_register,
5728 : int position_register,
5729 : int capture_register_count,
5730 : int capture_register_start)
5731 : : is_positive_(is_positive),
5732 : on_success_(on_success),
5733 : stack_pointer_register_(stack_pointer_register),
5734 4368 : position_register_(position_register) {
5735 4368 : if (is_positive_) {
5736 1556 : on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5737 : stack_pointer_register, position_register, capture_register_count,
5738 1556 : capture_register_start, on_success_);
5739 : } else {
5740 : Zone* zone = on_success_->zone();
5741 : on_match_success_ = new (zone) NegativeSubmatchSuccess(
5742 : stack_pointer_register, position_register, capture_register_count,
5743 2812 : capture_register_start, zone);
5744 : }
5745 4368 : }
5746 :
5747 :
5748 4368 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5749 4368 : if (is_positive_) {
5750 1556 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5751 1556 : position_register_, match);
5752 : } else {
5753 2812 : Zone* zone = on_success_->zone();
5754 : // We use a ChoiceNode to represent the negative lookaround. The first
5755 : // alternative is the negative match. On success, the end node backtracks.
5756 : // On failure, the second alternative is tried and leads to success.
5757 : // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5758 : // first exit when calculating quick checks.
5759 : ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5760 2812 : GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5761 2812 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5762 2812 : position_register_, choice_node);
5763 : }
5764 : }
5765 :
5766 :
5767 1668 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5768 : RegExpNode* on_success) {
5769 : int stack_pointer_register = compiler->AllocateRegister();
5770 : int position_register = compiler->AllocateRegister();
5771 :
5772 : const int registers_per_capture = 2;
5773 : const int register_of_first_capture = 2;
5774 1668 : int register_count = capture_count_ * registers_per_capture;
5775 : int register_start =
5776 1668 : register_of_first_capture + capture_from_ * registers_per_capture;
5777 :
5778 : RegExpNode* result;
5779 : bool was_reading_backward = compiler->read_backward();
5780 1668 : compiler->set_read_backward(type() == LOOKBEHIND);
5781 : Builder builder(is_positive(), on_success, stack_pointer_register,
5782 1668 : position_register, register_count, register_start);
5783 1668 : RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5784 1668 : result = builder.ForMatch(match);
5785 : compiler->set_read_backward(was_reading_backward);
5786 1668 : return result;
5787 : }
5788 :
5789 :
5790 27196 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5791 : RegExpNode* on_success) {
5792 27196 : return ToNode(body(), index(), compiler, on_success);
5793 : }
5794 :
5795 :
5796 112935 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5797 : int index,
5798 : RegExpCompiler* compiler,
5799 : RegExpNode* on_success) {
5800 : DCHECK_NOT_NULL(body);
5801 : int start_reg = RegExpCapture::StartRegister(index);
5802 : int end_reg = RegExpCapture::EndRegister(index);
5803 112935 : if (compiler->read_backward()) std::swap(start_reg, end_reg);
5804 112935 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5805 112935 : RegExpNode* body_node = body->ToNode(compiler, store_end);
5806 112935 : return ActionNode::StorePosition(start_reg, true, body_node);
5807 : }
5808 :
5809 :
5810 21356 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5811 : RegExpNode* on_success) {
5812 : ZoneList<RegExpTree*>* children = nodes();
5813 : RegExpNode* current = on_success;
5814 21356 : if (compiler->read_backward()) {
5815 1905 : for (int i = 0; i < children->length(); i++) {
5816 790 : current = children->at(i)->ToNode(compiler, current);
5817 : }
5818 : } else {
5819 997883 : for (int i = children->length() - 1; i >= 0; i--) {
5820 976852 : current = children->at(i)->ToNode(compiler, current);
5821 : }
5822 : }
5823 21356 : return current;
5824 : }
5825 :
5826 :
5827 7425 : static void AddClass(const int* elmv,
5828 : int elmc,
5829 : ZoneList<CharacterRange>* ranges,
5830 : Zone* zone) {
5831 7425 : elmc--;
5832 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5833 71985 : for (int i = 0; i < elmc; i += 2) {
5834 : DCHECK(elmv[i] < elmv[i + 1]);
5835 32280 : ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5836 : }
5837 7425 : }
5838 :
5839 :
5840 20191 : static void AddClassNegated(const int *elmv,
5841 : int elmc,
5842 : ZoneList<CharacterRange>* ranges,
5843 : Zone* zone) {
5844 20191 : elmc--;
5845 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5846 : DCHECK_NE(0x0000, elmv[0]);
5847 : DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5848 : uc16 last = 0x0000;
5849 151855 : for (int i = 0; i < elmc; i += 2) {
5850 : DCHECK(last <= elmv[i] - 1);
5851 : DCHECK(elmv[i] < elmv[i + 1]);
5852 65832 : ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5853 65832 : last = elmv[i + 1];
5854 : }
5855 20191 : ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5856 20191 : }
5857 :
5858 110712 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5859 : bool add_unicode_case_equivalents,
5860 : Zone* zone) {
5861 110712 : if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5862 : // See #sec-runtime-semantics-wordcharacters-abstract-operation
5863 : // In case of unicode and ignore_case, we need to create the closure over
5864 : // case equivalent characters before negating.
5865 : ZoneList<CharacterRange>* new_ranges =
5866 : new (zone) ZoneList<CharacterRange>(2, zone);
5867 240 : AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5868 240 : AddUnicodeCaseEquivalents(new_ranges, zone);
5869 240 : if (type == 'W') {
5870 : ZoneList<CharacterRange>* negated =
5871 : new (zone) ZoneList<CharacterRange>(2, zone);
5872 90 : CharacterRange::Negate(new_ranges, negated, zone);
5873 : new_ranges = negated;
5874 : }
5875 : ranges->AddAll(*new_ranges, zone);
5876 : return;
5877 : }
5878 110472 : AddClassEscape(type, ranges, zone);
5879 : }
5880 :
5881 110507 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5882 : Zone* zone) {
5883 110507 : switch (type) {
5884 : case 's':
5885 1709 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5886 1709 : break;
5887 : case 'S':
5888 784 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5889 784 : break;
5890 : case 'w':
5891 2786 : AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5892 2786 : break;
5893 : case 'W':
5894 307 : AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5895 307 : break;
5896 : case 'd':
5897 2492 : AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5898 2492 : break;
5899 : case 'D':
5900 268 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5901 268 : break;
5902 : case '.':
5903 : AddClassNegated(kLineTerminatorRanges,
5904 : kLineTerminatorRangeCount,
5905 : ranges,
5906 18832 : zone);
5907 18832 : break;
5908 : // This is not a character range as defined by the spec but a
5909 : // convenient shorthand for a character class that matches any
5910 : // character.
5911 : case '*':
5912 83131 : ranges->Add(CharacterRange::Everything(), zone);
5913 83131 : break;
5914 : // This is the set of characters matched by the $ and ^ symbols
5915 : // in multiline mode.
5916 : case 'n':
5917 : AddClass(kLineTerminatorRanges,
5918 : kLineTerminatorRangeCount,
5919 : ranges,
5920 198 : zone);
5921 198 : break;
5922 : default:
5923 0 : UNREACHABLE();
5924 : }
5925 110507 : }
5926 :
5927 :
5928 0 : Vector<const int> CharacterRange::GetWordBounds() {
5929 0 : return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5930 : }
5931 :
// Extends |ranges| in place with the case-insensitive equivalents of the
// characters it already contains.
//
// |ranges| is canonicalized first; only the ranges present at that point are
// examined, and new ranges are appended behind them. The list is not
// canonicalized again before returning.
//
// Common to both implementations below:
//  - ranges starting above String::kMaxUtf16CodeUnit are skipped and the rest
//    are clamped to it;
//  - ranges lying entirely within the surrogate block are skipped;
//  - when |is_one_byte| is set and RangeContainsLatin1Equivalents() is false
//    for a range, that range is clamped to String::kMaxOneByteCharCode (and
//    skipped if it starts above it).
//
// With V8_INTL_SUPPORT the equivalents come from ICU and |isolate| is not
// used; otherwise they come from the isolate's unibrow mapping tables.
5932 : // static
5933 66930 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5934 : ZoneList<CharacterRange>* ranges,
5935 : bool is_one_byte) {
5936 66930 : CharacterRange::Canonicalize(ranges);
5937 : int range_count = ranges->length();
5938 : #ifdef V8_INTL_SUPPORT
// already_added: the (clamped) input ranges.
// others: case equivalents found for them; may overlap already_added until
// the removeAll() call below.
5939 133860 : icu::UnicodeSet already_added;
5940 133860 : icu::UnicodeSet others;
5941 210828 : for (int i = 0; i < range_count; i++) {
5942 71949 : CharacterRange range = ranges->at(i);
5943 : uc32 bottom = range.from();
5944 71949 : if (bottom > String::kMaxUtf16CodeUnit) continue;
5945 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5946 : // Nothing to be done for surrogates.
5947 71949 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5948 69902 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5949 1354 : if (bottom > String::kMaxOneByteCharCode) continue;
5950 1243 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5951 : }
5952 69791 : already_added.add(bottom, top);
// Visit every character of the range individually; |bottom| is the cursor.
5953 15709049 : while (bottom <= top) {
5954 15639258 : icu::UnicodeString upper(bottom);
5955 7819629 : upper.toUpper();
// Case-insensitive closure of the single character |bottom|.
5956 15639258 : icu::UnicodeSet expanded(bottom, bottom);
5957 7819629 : expanded.closeOver(USET_CASE_INSENSITIVE);
// Note: this |i| shadows the index of the enclosing range loop.
5958 23798823 : for (int32_t i = 0; i < expanded.getRangeCount(); i++) {
5959 7989597 : UChar32 start = expanded.getRangeStart(i);
5960 7989597 : UChar32 end = expanded.getRangeEnd(i);
5961 24239465 : while (start <= end) {
5962 16249868 : icu::UnicodeString upper2(start);
5963 8124934 : upper2.toUpper();
5964 : // Only add if the upper case are the same.
// Only the first code unit of each upper-cased string is compared.
5965 8124934 : if (upper[0] == upper2[0]) {
5966 8122177 : others.add(start);
5967 : }
5968 8124934 : start++;
5969 : }
5970 : }
5971 7819629 : bottom++;
5972 : }
5973 : }
// Drop equivalents that were already part of the input, then append what is
// left to |ranges|.
5974 66930 : others.removeAll(already_added);
5975 77310 : for (int32_t i = 0; i < others.getRangeCount(); i++) {
5976 5190 : UChar32 start = others.getRangeStart(i);
5977 5190 : UChar32 end = others.getRangeEnd(i);
5978 5190 : if (start == end) {
5979 3686 : ranges->Add(CharacterRange::Singleton(start), zone);
5980 : } else {
5981 1504 : ranges->Add(CharacterRange::Range(start, end), zone);
5982 : }
5983 : }
5984 : #else
5985 : for (int i = 0; i < range_count; i++) {
5986 : CharacterRange range = ranges->at(i);
5987 : uc32 bottom = range.from();
5988 : if (bottom > String::kMaxUtf16CodeUnit) continue;
5989 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5990 : // Nothing to be done for surrogates.
5991 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5992 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5993 : if (bottom > String::kMaxOneByteCharCode) continue;
5994 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5995 : }
5996 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5997 : if (top == bottom) {
5998 : // If this is a singleton we just expand the one character.
5999 : int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
// Note: this |i| shadows the index of the enclosing range loop.
6000 : for (int i = 0; i < length; i++) {
6001 : uc32 chr = chars[i];
6002 : if (chr != bottom) {
6003 : ranges->Add(CharacterRange::Singleton(chars[i]), zone);
6004 : }
6005 : }
6006 : } else {
6007 : // If this is a range we expand the characters block by block, expanding
6008 : // contiguous subranges (blocks) one at a time. The approach is as
6009 : // follows. For a given start character we look up the remainder of the
6010 : // block that contains it (represented by the end point), for instance we
6011 : // find 'z' if the character is 'c'. A block is characterized by the
6012 : // property that all characters uncanonicalize in the same way, except
6013 : // that each entry in the result is incremented by the distance from the
6014 : // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
6015 : // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
6016 : // we've found the end point we look up its uncanonicalization and
6017 : // produce a range for each element. For instance for [c-f] we look up
6018 : // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
6019 : // it is not already contained in the input, so [c-f] will be skipped but
6020 : // [C-F] will be added. If this range is not completely contained in a
6021 : // block we do this for all the blocks covered by the range (handling
6022 : // characters that are not in a block as a "singleton block").
6023 : unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6024 : int pos = bottom;
6025 : while (pos <= top) {
6026 : int length =
6027 : isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
6028 : uc32 block_end;
6029 : if (length == 0) {
// No table entry: |pos| is treated as a block of its own.
6030 : block_end = pos;
6031 : } else {
6032 : DCHECK_EQ(1, length);
6033 : block_end = equivalents[0];
6034 : }
// Part of the block that lies inside [bottom, top].
6035 : int end = (block_end > top) ? top : block_end;
6036 : length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
6037 : equivalents);
6038 : for (int i = 0; i < length; i++) {
6039 : uc32 c = equivalents[i];
// Shift the equivalent of |block_end| back so that it lines up with
// [pos, end].
6040 : uc32 range_from = c - (block_end - pos);
6041 : uc32 range_to = c - (block_end - end);
6042 : if (!(bottom <= range_from && range_to <= top)) {
6043 : ranges->Add(CharacterRange::Range(range_from, range_to), zone);
6044 : }
6045 : }
6046 : pos = end + 1;
6047 : }
6048 : }
6049 : }
6050 : #endif // V8_INTL_SUPPORT
6051 66930 : }
6052 :
6053 10 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
6054 : DCHECK_NOT_NULL(ranges);
6055 : int n = ranges->length();
6056 10 : if (n <= 1) return true;
6057 : int max = ranges->at(0).to();
6058 590 : for (int i = 1; i < n; i++) {
6059 290 : CharacterRange next_range = ranges->at(i);
6060 290 : if (next_range.from() <= max + 1) return false;
6061 : max = next_range.to();
6062 : }
6063 : return true;
6064 : }
6065 :
6066 :
6067 1915955 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
6068 1915955 : if (ranges_ == nullptr) {
6069 83020 : ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
6070 83020 : CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
6071 : }
6072 1915955 : return ranges_;
6073 : }
6074 :
6075 :
6076 : // Move a number of elements in a zonelist to another position
6077 : // in the same list. Handles overlapping source and target areas.
6078 93340 : static void MoveRanges(ZoneList<CharacterRange>* list,
6079 : int from,
6080 : int to,
6081 : int count) {
6082 : // Ranges are potentially overlapping.
6083 93340 : if (from < to) {
6084 10294689 : for (int i = count - 1; i >= 0; i--) {
6085 30639657 : list->at(to + i) = list->at(from + i);
6086 : }
6087 : } else {
6088 7322102 : for (int i = 0; i < count; i++) {
6089 10965348 : list->at(to + i) = list->at(from + i);
6090 : }
6091 : }
6092 93340 : }
6093 :
6094 :
6095 171443 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6096 : int count,
6097 : CharacterRange insert) {
6098 : // Inserts a range into list[0..count[, which must be sorted
6099 : // by from value and non-overlapping and non-adjacent, using at most
6100 : // list[0..count] for the result. Returns the number of resulting
6101 : // canonicalized ranges. Inserting a range may collapse existing ranges into
6102 : // fewer ranges, so the return value can be anything in the range 1..count+1.
6103 : uc32 from = insert.from();
6104 : uc32 to = insert.to();
6105 : int start_pos = 0;
6106 : int end_pos = count;
6107 18369192 : for (int i = count - 1; i >= 0; i--) {
6108 18284785 : CharacterRange current = list->at(i);
6109 18284785 : if (current.from() > to + 1) {
6110 : end_pos = i;
6111 136625 : } else if (current.to() + 1 < from) {
6112 87036 : start_pos = i + 1;
6113 : break;
6114 : }
6115 : }
6116 :
6117 : // Inserted range overlaps, or is adjacent to, ranges at positions
6118 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6119 : // not affected by the insertion.
6120 : // If start_pos == end_pos, the range must be inserted before start_pos.
6121 : // if start_pos < end_pos, the entire range from start_pos to end_pos
6122 : // must be merged with the insert range.
6123 :
6124 171443 : if (start_pos == end_pos) {
6125 : // Insert between existing ranges at position start_pos.
6126 134102 : if (start_pos < count) {
6127 81470 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6128 : }
6129 134102 : list->at(start_pos) = insert;
6130 134102 : return count + 1;
6131 : }
6132 37341 : if (start_pos + 1 == end_pos) {
6133 : // Replace single existing range at position start_pos.
6134 25338 : CharacterRange to_replace = list->at(start_pos);
6135 : int new_from = Min(to_replace.from(), from);
6136 : int new_to = Max(to_replace.to(), to);
6137 25338 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6138 : return count;
6139 : }
6140 : // Replace a number of existing ranges from start_pos to end_pos - 1.
6141 : // Move the remaining ranges down.
6142 :
6143 : int new_from = Min(list->at(start_pos).from(), from);
6144 12003 : int new_to = Max(list->at(end_pos - 1).to(), to);
6145 12003 : if (end_pos < count) {
6146 11870 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6147 : }
6148 12003 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6149 12003 : return count - (end_pos - start_pos) + 1;
6150 : }
6151 :
6152 :
6153 20 : void CharacterSet::Canonicalize() {
6154 : // Special/default classes are always considered canonical. The result
6155 : // of calling ranges() will be sorted.
6156 176280 : if (ranges_ == nullptr) return;
6157 93508 : CharacterRange::Canonicalize(ranges_);
6158 : }
6159 :
6160 :
6161 499726 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6162 499726 : if (character_ranges->length() <= 1) return;
6163 : // Check whether ranges are already canonical (increasing, non-overlapping,
6164 : // non-adjacent).
6165 : int n = character_ranges->length();
6166 : int max = character_ranges->at(0).to();
6167 : int i = 1;
6168 2645496 : while (i < n) {
6169 1299626 : CharacterRange current = character_ranges->at(i);
6170 1299626 : if (current.from() <= max + 1) {
6171 : break;
6172 : }
6173 : max = current.to();
6174 1290428 : i++;
6175 : }
6176 : // Canonical until the i'th range. If that's all of them, we are done.
6177 64640 : if (i == n) return;
6178 :
6179 : // The ranges at index i and forward are not canonicalized. Make them so by
6180 : // doing the equivalent of insertion sort (inserting each into the previous
6181 : // list, in order).
6182 : // Notice that inserting a range can reduce the number of ranges in the
6183 : // result due to combining of adjacent and overlapping ranges.
6184 : int read = i; // Range to insert.
6185 : int num_canonical = i; // Length of canonicalized part of list.
6186 : do {
6187 : num_canonical = InsertRangeInCanonicalList(character_ranges,
6188 : num_canonical,
6189 171443 : character_ranges->at(read));
6190 171443 : read++;
6191 171443 : } while (read < n);
6192 : character_ranges->Rewind(num_canonical);
6193 :
6194 : DCHECK(CharacterRange::IsCanonical(character_ranges));
6195 : }
6196 :
6197 :
6198 230 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6199 : ZoneList<CharacterRange>* negated_ranges,
6200 : Zone* zone) {
6201 : DCHECK(CharacterRange::IsCanonical(ranges));
6202 : DCHECK_EQ(0, negated_ranges->length());
6203 : int range_count = ranges->length();
6204 : uc32 from = 0;
6205 : int i = 0;
6206 230 : if (range_count > 0 && ranges->at(0).from() == 0) {
6207 40 : from = ranges->at(0).to() + 1;
6208 : i = 1;
6209 : }
6210 14890 : while (i < range_count) {
6211 7330 : CharacterRange range = ranges->at(i);
6212 7330 : negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6213 7330 : from = range.to() + 1;
6214 7330 : i++;
6215 : }
6216 230 : if (from < String::kMaxCodePoint) {
6217 360 : negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6218 180 : zone);
6219 : }
6220 230 : }
6221 :
6222 :
6223 : // -------------------------------------------------------------------
6224 : // Splay tree
6225 :
6226 :
6227 239769 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6228 239769 : if (Get(value))
6229 : return this;
6230 239764 : if (successors(zone) != nullptr) {
6231 633180 : for (int i = 0; i < successors(zone)->length(); i++) {
6232 422322 : OutSet* successor = successors(zone)->at(i);
6233 422322 : if (successor->Get(value))
6234 : return successor;
6235 : }
6236 : } else {
6237 5694 : successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6238 : }
6239 33994 : OutSet* result = new(zone) OutSet(first_, remaining_);
6240 16997 : result->Set(value, zone);
6241 16997 : successors(zone)->Add(result, zone);
6242 16997 : return result;
6243 : }
6244 :
6245 :
6246 715940 : void OutSet::Set(unsigned value, Zone *zone) {
6247 715940 : if (value < kFirstLimit) {
6248 391306 : first_ |= (1 << value);
6249 : } else {
6250 324634 : if (remaining_ == nullptr)
6251 84582 : remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6252 889320 : if (remaining_->is_empty() || !remaining_->Contains(value))
6253 323584 : remaining_->Add(value, zone);
6254 : }
6255 715940 : }
6256 :
6257 :
6258 90010 : bool OutSet::Get(unsigned value) const {
6259 30858331 : if (value < kFirstLimit) {
6260 6653273 : return (first_ & (1 << value)) != 0;
6261 24501487 : } else if (remaining_ == nullptr) {
6262 : return false;
6263 : } else {
6264 0 : return remaining_->Contains(value);
6265 : }
6266 : }
6267 :
6268 :
// Out-of-class definition of the Config::kNoKey constant used by the
// dispatch table's splay tree; it is set to the UTF-8 decoder's bad-character
// value. NOTE(review): presumably chosen because it can never be a real key -
// confirm against unibrow::Utf8::kBadChar.
6269 : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6270 :
6271 :
// Records |value| for every character in |full_range|.
//
// The table is a splay tree of entries keyed by their start character. Where
// the new range overlaps existing entries, those entries are split at the
// boundaries of the new range and |value| is added to the overlapping pieces;
// parts of the new range that no entry covers yet get fresh entries whose
// out-set is empty()->Extend(value, zone).
6272 89403 : void DispatchTable::AddRange(CharacterRange full_range, int value,
6273 : Zone* zone) {
// |current| is the part of the range that still has to be entered; its start
// is advanced as pieces are handled.
6274 89403 : CharacterRange current = full_range;
6275 89403 : if (tree()->is_empty()) {
6276 : // If this is the first range we just insert into the table.
6277 : ZoneSplayTree<Config>::Locator loc;
6278 2647 : bool inserted = tree()->Insert(current.from(), &loc);
6279 : DCHECK(inserted);
6280 : USE(inserted);
6281 5294 : loc.set_value(Entry(current.from(), current.to(),
6282 : empty()->Extend(value, zone)));
6283 : return;
6284 : }
6285 : // First see if there is a range to the left of this one that
6286 : // overlaps.
6287 : ZoneSplayTree<Config>::Locator loc;
6288 86756 : if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6289 : Entry* entry = &loc.value();
6290 : // If we've found a range that overlaps with this one, and it
6291 : // starts strictly to the left of this one, we have to fix it
6292 : // because the following code only handles ranges that start on
6293 : // or after the start point of the range we're adding.
6294 85154 : if (entry->from() < current.from() && entry->to() >= current.from()) {
6295 : // Snap the overlapping range in half around the start point of
6296 : // the range we're adding.
6297 : CharacterRange left =
6298 400 : CharacterRange::Range(entry->from(), current.from() - 1);
6299 : CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6300 : // The left part of the overlapping range doesn't overlap.
6301 : // Truncate the whole entry to be just the left part.
6302 : entry->set_to(left.to());
6303 : // The right part is the one that overlaps. We add this part
6304 : // to the map and let the next step deal with merging it with
6305 : // the range we're adding.
// Note: this locator shadows the outer |loc|; |entry| still refers to the
// entry found through the outer one.
6306 : ZoneSplayTree<Config>::Locator loc;
6307 400 : bool inserted = tree()->Insert(right.from(), &loc);
6308 : DCHECK(inserted);
6309 : USE(inserted);
6310 : loc.set_value(Entry(right.from(),
6311 : right.to(),
6312 : entry->out_set()));
6313 : }
6314 : }
// Walk over the remaining part of the range from left to right, one existing
// entry (or one uncovered gap) per iteration.
6315 168614 : while (current.is_valid()) {
6316 411141 : if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6317 244912 : (loc.value().from() <= current.to()) &&
6318 : (loc.value().to() >= current.from())) {
6319 : Entry* entry = &loc.value();
6320 : // We have overlap. If there is space between the start point of
6321 : // the range we're adding and where the overlapping range starts
6322 : // then we have to add a range covering just that space.
6323 81858 : if (current.from() < entry->from()) {
6324 : ZoneSplayTree<Config>::Locator ins;
6325 74068 : bool inserted = tree()->Insert(current.from(), &ins);
6326 : DCHECK(inserted);
6327 : USE(inserted);
6328 148136 : ins.set_value(Entry(current.from(),
6329 : entry->from() - 1,
6330 : empty()->Extend(value, zone)));
6331 : current.set_from(entry->from());
6332 : }
6333 : DCHECK_EQ(current.from(), entry->from());
6334 : // If the overlapping range extends beyond the one we want to add
6335 : // we have to snap the right part off and add it separately.
6336 81858 : if (entry->to() > current.to()) {
6337 : ZoneSplayTree<Config>::Locator ins;
6338 4430 : bool inserted = tree()->Insert(current.to() + 1, &ins);
6339 : DCHECK(inserted);
6340 : USE(inserted);
6341 : ins.set_value(Entry(current.to() + 1,
6342 : entry->to(),
6343 : entry->out_set()));
6344 : entry->set_to(current.to());
6345 : }
6346 : DCHECK(entry->to() <= current.to());
6347 : // The overlapping range is now completely contained by the range
6348 : // we're adding so we can just update it and move the start point
6349 : // of the range we're adding just past it.
6350 : entry->AddValue(value, zone);
6351 : DCHECK(entry->to() + 1 > current.from());
6352 81858 : current.set_from(entry->to() + 1);
6353 : } else {
6354 : // There is no overlap so we can just add the range
6355 : ZoneSplayTree<Config>::Locator ins;
6356 81196 : bool inserted = tree()->Insert(current.from(), &ins);
6357 : DCHECK(inserted);
6358 : USE(inserted);
6359 162392 : ins.set_value(Entry(current.from(),
6360 : current.to(),
6361 : empty()->Extend(value, zone)));
6362 : break;
6363 : }
6364 : }
6365 : }
6366 :
6367 :
6368 55010 : OutSet* DispatchTable::Get(uc32 value) {
6369 : ZoneSplayTree<Config>::Locator loc;
6370 55010 : if (!tree()->FindGreatestLessThan(value, &loc))
6371 0 : return empty();
6372 : Entry* entry = &loc.value();
6373 55010 : if (value <= entry->to())
6374 38885 : return entry->out_set();
6375 : else
6376 16125 : return empty();
6377 : }
6378 :
6379 :
6380 : // -------------------------------------------------------------------
6381 : // Analysis
6382 :
6383 :
6384 1078981 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
6385 : StackLimitCheck check(isolate());
6386 1078981 : if (check.HasOverflowed()) {
6387 : fail("Stack overflow");
6388 : return;
6389 : }
6390 1078641 : if (that->info()->been_analyzed || that->info()->being_analyzed)
6391 : return;
6392 873370 : that->info()->being_analyzed = true;
6393 873370 : that->Accept(this);
6394 873370 : that->info()->being_analyzed = false;
6395 873370 : that->info()->been_analyzed = true;
6396 : }
6397 :
6398 :
6399 88201 : void Analysis::VisitEnd(EndNode* that) {
6400 : // nothing to do
6401 88201 : }
6402 :
6403 :
6404 317143 : void TextNode::CalculateOffsets() {
6405 : int element_count = elements()->length();
6406 : // Set up the offsets of the elements relative to the start. This is a fixed
6407 : // quantity since a TextNode can only contain fixed-width things.
6408 : int cp_offset = 0;
6409 1076985 : for (int i = 0; i < element_count; i++) {
6410 : TextElement& elm = elements()->at(i);
6411 : elm.set_cp_offset(cp_offset);
6412 379921 : cp_offset += elm.length();
6413 : }
6414 317143 : }
6415 :
6416 :
6417 319047 : void Analysis::VisitText(TextNode* that) {
6418 319047 : that->MakeCaseIndependent(isolate(), is_one_byte_);
6419 319047 : EnsureAnalyzed(that->on_success());
6420 319047 : if (!has_failed()) {
6421 317143 : that->CalculateOffsets();
6422 : }
6423 319047 : }
6424 :
6425 :
6426 286673 : void Analysis::VisitAction(ActionNode* that) {
6427 : RegExpNode* target = that->on_success();
6428 286673 : EnsureAnalyzed(target);
6429 286673 : if (!has_failed()) {
6430 : // If the next node is interested in what it follows then this node
6431 : // has to be interested too so it can pass the information on.
6432 : that->info()->AddFromFollowing(target->info());
6433 : }
6434 286673 : }
6435 :
6436 :
6437 25781 : void Analysis::VisitChoice(ChoiceNode* that) {
6438 : NodeInfo* info = that->info();
6439 296373 : for (int i = 0; i < that->alternatives()->length(); i++) {
6440 : RegExpNode* node = that->alternatives()->at(i).node();
6441 135296 : EnsureAnalyzed(node);
6442 135296 : if (has_failed()) return;
6443 : // Anything the following nodes need to know has to be known by
6444 : // this node also, so it can pass it on.
6445 : info->AddFromFollowing(node->info());
6446 : }
6447 : }
6448 :
6449 :
6450 145900 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6451 : NodeInfo* info = that->info();
6452 634416 : for (int i = 0; i < that->alternatives()->length(); i++) {
6453 : RegExpNode* node = that->alternatives()->at(i).node();
6454 291600 : if (node != that->loop_node()) {
6455 145900 : EnsureAnalyzed(node);
6456 145900 : if (has_failed()) return;
6457 : info->AddFromFollowing(node->info());
6458 : }
6459 : }
6460 : // Check the loop last since it may need the value of this node
6461 : // to get a correct result.
6462 98558 : EnsureAnalyzed(that->loop_node());
6463 98558 : if (!has_failed()) {
6464 : info->AddFromFollowing(that->loop_node()->info());
6465 : }
6466 : }
6467 :
6468 :
6469 2321 : void Analysis::VisitBackReference(BackReferenceNode* that) {
6470 2321 : EnsureAnalyzed(that->on_success());
6471 2321 : }
6472 :
6473 :
6474 5447 : void Analysis::VisitAssertion(AssertionNode* that) {
6475 5447 : EnsureAnalyzed(that->on_success());
6476 5447 : }
6477 :
6478 :
6479 188 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6480 : BoyerMooreLookahead* bm,
6481 : bool not_at_start) {
6482 : // Working out the set of characters that a backreference can match is too
6483 : // hard, so we just say that any character can match.
6484 : bm->SetRest(offset);
6485 : SaveBMInfo(bm, not_at_start, offset);
6486 188 : }
6487 :
6488 :
6489 : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6490 : RegExpMacroAssembler::kTableSize);
6491 :
6492 :
6493 7838 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6494 : BoyerMooreLookahead* bm, bool not_at_start) {
6495 : ZoneList<GuardedAlternative>* alts = alternatives();
6496 15676 : budget = (budget - 1) / alts->length();
6497 91230 : for (int i = 0; i < alts->length(); i++) {
6498 : GuardedAlternative& alt = alts->at(i);
6499 42168 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6500 : bm->SetRest(offset); // Give up trying to fill in info.
6501 : SaveBMInfo(bm, not_at_start, offset);
6502 : return;
6503 : }
6504 41696 : alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6505 : }
6506 : SaveBMInfo(bm, not_at_start, offset);
6507 : }
6508 :
6509 :
6510 123610 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6511 : BoyerMooreLookahead* bm, bool not_at_start) {
6512 123610 : if (initial_offset >= bm->length()) return;
6513 : int offset = initial_offset;
6514 : int max_char = bm->max_char();
6515 405222 : for (int i = 0; i < elements()->length(); i++) {
6516 164635 : if (offset >= bm->length()) {
6517 17472 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6518 : return;
6519 : }
6520 147163 : TextElement text = elements()->at(i);
6521 147163 : if (text.text_type() == TextElement::ATOM) {
6522 : RegExpAtom* atom = text.atom();
6523 213973 : for (int j = 0; j < atom->length(); j++, offset++) {
6524 89183 : if (offset >= bm->length()) {
6525 6357 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6526 : return;
6527 : }
6528 165652 : uc16 character = atom->data()[j];
6529 82826 : if (IgnoreCase(atom->flags())) {
6530 : unibrow::uchar chars[4];
6531 4735 : int length = GetCaseIndependentLetters(
6532 : isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6533 4735 : chars, 4);
6534 22485 : for (int j = 0; j < length; j++) {
6535 8875 : bm->Set(offset, chars[j]);
6536 : }
6537 : } else {
6538 78091 : if (character <= max_char) bm->Set(offset, character);
6539 : }
6540 : }
6541 : } else {
6542 : DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6543 : RegExpCharacterClass* char_class = text.char_class();
6544 : ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6545 98842 : if (char_class->is_negated()) {
6546 4382 : bm->SetAll(offset);
6547 : } else {
6548 673422 : for (int k = 0; k < ranges->length(); k++) {
6549 : CharacterRange& range = ranges->at(k);
6550 289481 : if (range.from() > max_char) continue;
6551 : int to = Min(max_char, static_cast<int>(range.to()));
6552 161207 : bm->SetInterval(offset, Interval(range.from(), to));
6553 : }
6554 : }
6555 98842 : offset++;
6556 : }
6557 : }
6558 99781 : if (offset >= bm->length()) {
6559 90396 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6560 : return;
6561 : }
6562 9385 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6563 18770 : true); // Not at start after a text node.
6564 9385 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6565 : }
6566 :
6567 :
6568 : // -------------------------------------------------------------------
6569 : // Dispatch table construction
6570 :
6571 :
6572 0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
6573 : AddRange(CharacterRange::Everything());
6574 0 : }
6575 :
6576 :
6577 0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6578 : node->set_being_calculated(true);
6579 : ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6580 0 : for (int i = 0; i < alternatives->length(); i++) {
6581 : set_choice_index(i);
6582 0 : alternatives->at(i).node()->Accept(this);
6583 : }
6584 : node->set_being_calculated(false);
6585 0 : }
6586 :
6587 :
6588 : class AddDispatchRange {
6589 : public:
6590 : explicit AddDispatchRange(DispatchTableConstructor* constructor)
6591 0 : : constructor_(constructor) { }
6592 : void Call(uc32 from, DispatchTable::Entry entry);
6593 : private:
6594 : DispatchTableConstructor* constructor_;
6595 : };
6596 :
6597 :
6598 0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6599 0 : constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6600 0 : }
6601 :
6602 :
6603 0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6604 0 : if (node->being_calculated())
6605 0 : return;
6606 0 : DispatchTable* table = node->GetTable(ignore_case_);
6607 : AddDispatchRange adder(this);
6608 : table->ForEach(&adder);
6609 : }
6610 :
6611 :
6612 0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6613 : // TODO(160): Find the node that we refer back to and propagate its start
6614 : // set back to here. For now we just accept anything.
6615 : AddRange(CharacterRange::Everything());
6616 0 : }
6617 :
6618 :
6619 0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6620 : RegExpNode* target = that->on_success();
6621 0 : target->Accept(this);
6622 0 : }
6623 :
6624 :
6625 3935 : static int CompareRangeByFrom(const CharacterRange* a,
6626 : const CharacterRange* b) {
6627 7870 : return Compare<uc16>(a->from(), b->from());
6628 : }
6629 :
6630 :
6631 55 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6632 : ranges->Sort(CompareRangeByFrom);
6633 : uc16 last = 0;
6634 1665 : for (int i = 0; i < ranges->length(); i++) {
6635 805 : CharacterRange range = ranges->at(i);
6636 805 : if (last < range.from())
6637 525 : AddRange(CharacterRange::Range(last, range.from() - 1));
6638 805 : if (range.to() >= last) {
6639 715 : if (range.to() == String::kMaxCodePoint) {
6640 : return;
6641 : } else {
6642 715 : last = range.to() + 1;
6643 : }
6644 : }
6645 : }
6646 55 : AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6647 : }
6648 :
6649 :
6650 0 : void DispatchTableConstructor::VisitText(TextNode* that) {
6651 0 : TextElement elm = that->elements()->at(0);
6652 0 : switch (elm.text_type()) {
6653 : case TextElement::ATOM: {
6654 0 : uc16 c = elm.atom()->data()[0];
6655 0 : AddRange(CharacterRange::Range(c, c));
6656 : break;
6657 : }
6658 : case TextElement::CHAR_CLASS: {
6659 : RegExpCharacterClass* tree = elm.char_class();
6660 : ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6661 0 : if (tree->is_negated()) {
6662 0 : AddInverse(ranges);
6663 : } else {
6664 0 : for (int i = 0; i < ranges->length(); i++)
6665 : AddRange(ranges->at(i));
6666 : }
6667 : break;
6668 : }
6669 : default: {
6670 0 : UNIMPLEMENTED();
6671 : }
6672 : }
6673 0 : }
6674 :
6675 :
6676 0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
6677 : RegExpNode* target = that->on_success();
6678 0 : target->Accept(this);
6679 0 : }
6680 :
6681 40 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6682 : RegExpNode* on_success,
6683 : JSRegExp::Flags flags) {
6684 : // If the regexp matching starts within a surrogate pair, step back
6685 : // to the lead surrogate and start matching from there.
6686 : DCHECK(!compiler->read_backward());
6687 : Zone* zone = compiler->zone();
6688 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6689 40 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6690 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6691 40 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6692 :
6693 40 : ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6694 :
6695 : int stack_register = compiler->UnicodeLookaroundStackRegister();
6696 : int position_register = compiler->UnicodeLookaroundPositionRegister();
6697 : RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6698 40 : zone, lead_surrogates, true, on_success, flags);
6699 : RegExpLookaround::Builder builder(true, step_back, stack_register,
6700 40 : position_register);
6701 : RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6702 40 : zone, trail_surrogates, false, builder.on_match_success(), flags);
6703 :
6704 40 : optional_step_back->AddAlternative(
6705 : GuardedAlternative(builder.ForMatch(match_trail)));
6706 : optional_step_back->AddAlternative(GuardedAlternative(on_success));
6707 :
6708 40 : return optional_step_back;
6709 : }
6710 :
6711 :
6712 85748 : RegExpEngine::CompilationResult RegExpEngine::Compile(
6713 : Isolate* isolate, Zone* zone, RegExpCompileData* data,
6714 : JSRegExp::Flags flags, Handle<String> pattern,
6715 : Handle<String> sample_subject, bool is_one_byte) {
6716 85748 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6717 : return IrregexpRegExpTooBig(isolate);
6718 : }
6719 : bool is_sticky = IsSticky(flags);
6720 : bool is_global = IsGlobal(flags);
6721 : bool is_unicode = IsUnicode(flags);
6722 85739 : RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6723 :
6724 85739 : if (compiler.optimize())
6725 84574 : compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6726 :
6727 : // Sample some characters from the middle of the string.
6728 : static const int kSampleSize = 128;
6729 :
6730 85739 : sample_subject = String::Flatten(isolate, sample_subject);
6731 : int chars_sampled = 0;
6732 85739 : int half_way = (sample_subject->length() - kSampleSize) / 2;
6733 999657 : for (int i = Max(0, half_way);
6734 542698 : i < sample_subject->length() && chars_sampled < kSampleSize;
6735 : i++, chars_sampled++) {
6736 : compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6737 : }
6738 :
6739 : // Wrap the body of the regexp in capture #0.
6740 85739 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6741 : 0,
6742 : &compiler,
6743 85739 : compiler.accept());
6744 : RegExpNode* node = captured_body;
6745 85739 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6746 85739 : bool is_start_anchored = data->tree->IsAnchoredAtStart();
6747 85739 : int max_length = data->tree->max_match();
6748 85739 : if (!is_start_anchored && !is_sticky) {
6749 : // Add a .*? at the beginning, outside the body capture, unless
6750 : // this expression is anchored at the beginning or sticky.
6751 : JSRegExp::Flags default_flags = JSRegExp::Flags();
6752 82772 : RegExpNode* loop_node = RegExpQuantifier::ToNode(
6753 : 0, RegExpTree::kInfinity, false,
6754 : new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6755 165544 : captured_body, data->contains_anchor);
6756 :
6757 82772 : if (data->contains_anchor) {
6758 : // Unroll loop once, to take care of the case that might start
6759 : // at the start of input.
6760 149 : ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6761 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
6762 : first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6763 : new (zone) RegExpCharacterClass('*', default_flags), false,
6764 149 : loop_node)));
6765 : node = first_step_node;
6766 : } else {
6767 : node = loop_node;
6768 : }
6769 : }
6770 85739 : if (is_one_byte) {
6771 14663 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6772 : // Do it again to propagate the new nodes to places where they were not
6773 : // put because they had not been calculated yet.
6774 14663 : if (node != nullptr) {
6775 14363 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6776 : }
6777 71076 : } else if (is_unicode && (is_global || is_sticky)) {
6778 40 : node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6779 : }
6780 :
6781 85739 : if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6782 85739 : data->node = node;
6783 : Analysis analysis(isolate, is_one_byte);
6784 85739 : analysis.EnsureAnalyzed(node);
6785 85739 : if (analysis.has_failed()) {
6786 : const char* error_message = analysis.error_message();
6787 : return CompilationResult(isolate, error_message);
6788 : }
6789 :
6790 : // Create the correct assembler for the architecture.
6791 : std::unique_ptr<RegExpMacroAssembler> macro_assembler;
6792 85399 : if (!FLAG_regexp_interpret_all) {
6793 : // Native regexp implementation.
6794 : DCHECK(!FLAG_jitless);
6795 :
6796 : NativeRegExpMacroAssembler::Mode mode =
6797 : is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6798 82129 : : NativeRegExpMacroAssembler::UC16;
6799 :
6800 : #if V8_TARGET_ARCH_IA32
6801 : macro_assembler.reset(new RegExpMacroAssemblerIA32(
6802 : isolate, zone, mode, (data->capture_count + 1) * 2));
6803 : #elif V8_TARGET_ARCH_X64
6804 82129 : macro_assembler.reset(new RegExpMacroAssemblerX64(
6805 82129 : isolate, zone, mode, (data->capture_count + 1) * 2));
6806 : #elif V8_TARGET_ARCH_ARM
6807 : macro_assembler.reset(new RegExpMacroAssemblerARM(
6808 : isolate, zone, mode, (data->capture_count + 1) * 2));
6809 : #elif V8_TARGET_ARCH_ARM64
6810 : macro_assembler.reset(new RegExpMacroAssemblerARM64(
6811 : isolate, zone, mode, (data->capture_count + 1) * 2));
6812 : #elif V8_TARGET_ARCH_S390
6813 : macro_assembler.reset(new RegExpMacroAssemblerS390(
6814 : isolate, zone, mode, (data->capture_count + 1) * 2));
6815 : #elif V8_TARGET_ARCH_PPC
6816 : macro_assembler.reset(new RegExpMacroAssemblerPPC(
6817 : isolate, zone, mode, (data->capture_count + 1) * 2));
6818 : #elif V8_TARGET_ARCH_MIPS
6819 : macro_assembler.reset(new RegExpMacroAssemblerMIPS(
6820 : isolate, zone, mode, (data->capture_count + 1) * 2));
6821 : #elif V8_TARGET_ARCH_MIPS64
6822 : macro_assembler.reset(new RegExpMacroAssemblerMIPS(
6823 : isolate, zone, mode, (data->capture_count + 1) * 2));
6824 : #else
6825 : #error "Unsupported architecture"
6826 : #endif
6827 : } else {
6828 : DCHECK(FLAG_regexp_interpret_all);
6829 :
6830 : // Interpreted regexp implementation.
6831 3270 : macro_assembler.reset(new RegExpMacroAssemblerIrregexp(isolate, zone));
6832 : }
6833 :
6834 85399 : macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6835 :
6836 : // Inserted here, instead of in Assembler, because it depends on information
6837 : // in the AST that isn't replicated in the Node structure.
6838 : static const int kMaxBacksearchLimit = 1024;
6839 85942 : if (is_end_anchored && !is_start_anchored && !is_sticky &&
6840 543 : max_length < kMaxBacksearchLimit) {
6841 210 : macro_assembler->SetCurrentPositionFromEnd(max_length);
6842 : }
6843 :
6844 85399 : if (is_global) {
6845 : RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6846 3824 : if (data->tree->min_match() > 0) {
6847 : mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6848 138 : } else if (is_unicode) {
6849 : mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6850 : }
6851 : macro_assembler->set_global_mode(mode);
6852 : }
6853 :
6854 : return compiler.Assemble(isolate, macro_assembler.get(), node,
6855 85399 : data->capture_count, pattern);
6856 : }
6857 :
6858 169973 : bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6859 : Heap* heap = isolate->heap();
6860 169973 : bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6861 169973 : if (isolate->total_regexp_code_generated() >
6862 298526 : RegExpImpl::kRegExpCompiledLimit &&
6863 128553 : heap->CommittedMemoryExecutable() >
6864 : RegExpImpl::kRegExpExecutableMemoryLimit) {
6865 : too_much = true;
6866 : }
6867 169973 : return too_much;
6868 : }
6869 :
6870 36155 : Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
6871 : Object key_pattern,
6872 : FixedArray* last_match_cache,
6873 : ResultsCacheType type) {
6874 : FixedArray cache;
6875 36155 : if (!key_string->IsInternalizedString()) return Smi::kZero;
6876 5238 : if (type == STRING_SPLIT_SUBSTRINGS) {
6877 : DCHECK(key_pattern->IsString());
6878 5238 : if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6879 : cache = heap->string_split_cache();
6880 : } else {
6881 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6882 : DCHECK(key_pattern->IsFixedArray());
6883 : cache = heap->regexp_multiple_cache();
6884 : }
6885 :
6886 5238 : uint32_t hash = key_string->Hash();
6887 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6888 5238 : ~(kArrayEntriesPerCacheEntry - 1));
6889 14406 : if (cache->get(index + kStringOffset) != key_string ||
6890 3930 : cache->get(index + kPatternOffset) != key_pattern) {
6891 : index =
6892 1339 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6893 2694 : if (cache->get(index + kStringOffset) != key_string ||
6894 16 : cache->get(index + kPatternOffset) != key_pattern) {
6895 1334 : return Smi::kZero;
6896 : }
6897 : }
6898 :
6899 7808 : *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6900 3904 : return cache->get(index + kArrayOffset);
6901 : }
6902 :
6903 32251 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6904 : Handle<Object> key_pattern,
6905 : Handle<FixedArray> value_array,
6906 : Handle<FixedArray> last_match_cache,
6907 : ResultsCacheType type) {
6908 : Factory* factory = isolate->factory();
6909 : Handle<FixedArray> cache;
6910 32251 : if (!key_string->IsInternalizedString()) return;
6911 1334 : if (type == STRING_SPLIT_SUBSTRINGS) {
6912 : DCHECK(key_pattern->IsString());
6913 1334 : if (!key_pattern->IsInternalizedString()) return;
6914 : cache = factory->string_split_cache();
6915 : } else {
6916 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6917 : DCHECK(key_pattern->IsFixedArray());
6918 : cache = factory->regexp_multiple_cache();
6919 : }
6920 :
6921 1334 : uint32_t hash = key_string->Hash();
6922 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6923 1334 : ~(kArrayEntriesPerCacheEntry - 1));
6924 2668 : if (cache->get(index + kStringOffset) == Smi::kZero) {
6925 2260 : cache->set(index + kStringOffset, *key_string);
6926 2260 : cache->set(index + kPatternOffset, *key_pattern);
6927 2260 : cache->set(index + kArrayOffset, *value_array);
6928 2260 : cache->set(index + kLastMatchOffset, *last_match_cache);
6929 : } else {
6930 : uint32_t index2 =
6931 204 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6932 408 : if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6933 318 : cache->set(index2 + kStringOffset, *key_string);
6934 318 : cache->set(index2 + kPatternOffset, *key_pattern);
6935 318 : cache->set(index2 + kArrayOffset, *value_array);
6936 318 : cache->set(index2 + kLastMatchOffset, *last_match_cache);
6937 : } else {
6938 45 : cache->set(index2 + kStringOffset, Smi::kZero);
6939 90 : cache->set(index2 + kPatternOffset, Smi::kZero);
6940 90 : cache->set(index2 + kArrayOffset, Smi::kZero);
6941 90 : cache->set(index2 + kLastMatchOffset, Smi::kZero);
6942 90 : cache->set(index + kStringOffset, *key_string);
6943 90 : cache->set(index + kPatternOffset, *key_pattern);
6944 90 : cache->set(index + kArrayOffset, *value_array);
6945 90 : cache->set(index + kLastMatchOffset, *last_match_cache);
6946 : }
6947 : }
6948 : // If the array is a reasonably short list of substrings, convert it into a
6949 : // list of internalized strings.
6950 2668 : if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6951 13510 : for (int i = 0; i < value_array->length(); i++) {
6952 : Handle<String> str(String::cast(value_array->get(i)), isolate);
6953 6107 : Handle<String> internalized_str = factory->InternalizeString(str);
6954 12214 : value_array->set(i, *internalized_str);
6955 : }
6956 : }
6957 : // Convert backing store to a copy-on-write array.
6958 : value_array->set_map_no_write_barrier(
6959 : ReadOnlyRoots(isolate).fixed_cow_array_map());
6960 : }
6961 :
6962 137729 : void RegExpResultsCache::Clear(FixedArray cache) {
6963 70653677 : for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6964 35257975 : cache->set(i, Smi::kZero);
6965 : }
6966 137728 : }
6967 :
6968 : } // namespace internal
6969 121996 : } // namespace v8
|