Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/jsregexp.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "src/base/platform/platform.h"
11 : #include "src/code-tracer.h"
12 : #include "src/compilation-cache.h"
13 : #include "src/elements.h"
14 : #include "src/execution.h"
15 : #include "src/heap/factory.h"
16 : #include "src/heap/heap-inl.h"
17 : #include "src/isolate-inl.h"
18 : #include "src/message-template.h"
19 : #include "src/ostreams.h"
20 : #include "src/regexp/interpreter-irregexp.h"
21 : #include "src/regexp/jsregexp-inl.h"
22 : #include "src/regexp/regexp-macro-assembler-irregexp.h"
23 : #include "src/regexp/regexp-macro-assembler-tracer.h"
24 : #include "src/regexp/regexp-macro-assembler.h"
25 : #include "src/regexp/regexp-parser.h"
26 : #include "src/regexp/regexp-stack.h"
27 : #include "src/runtime/runtime.h"
28 : #include "src/splay-tree-inl.h"
29 : #include "src/string-search.h"
30 : #include "src/unicode-decoder.h"
31 : #include "src/unicode-inl.h"
32 : #include "src/zone/zone-list-inl.h"
33 :
34 : #ifdef V8_INTL_SUPPORT
35 : #include "unicode/uniset.h"
36 : #include "unicode/utypes.h"
37 : #endif // V8_INTL_SUPPORT
38 :
39 : #if V8_TARGET_ARCH_IA32
40 : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
41 : #elif V8_TARGET_ARCH_X64
42 : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
43 : #elif V8_TARGET_ARCH_ARM64
44 : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
45 : #elif V8_TARGET_ARCH_ARM
46 : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
47 : #elif V8_TARGET_ARCH_PPC
48 : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
49 : #elif V8_TARGET_ARCH_S390
50 : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
51 : #elif V8_TARGET_ARCH_MIPS
52 : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
53 : #elif V8_TARGET_ARCH_MIPS64
54 : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
55 : #else
56 : #error Unsupported target architecture.
57 : #endif
58 :
59 : namespace v8 {
60 : namespace internal {
61 :
62 : V8_WARN_UNUSED_RESULT
63 3224 : static inline MaybeHandle<Object> ThrowRegExpException(
64 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
65 : Handle<String> error_text) {
66 6448 : THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 : pattern, error_text),
68 : Object);
69 : }
70 :
71 363 : inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
72 : Handle<String> error_text) {
73 726 : USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
74 : error_text));
75 363 : }
76 :
77 :
78 0 : ContainedInLattice AddRange(ContainedInLattice containment,
79 : const int* ranges,
80 : int ranges_length,
81 : Interval new_range) {
82 : DCHECK_EQ(1, ranges_length & 1);
83 : DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
84 993752 : if (containment == kLatticeUnknown) return containment;
85 : bool inside = false;
86 : int last = 0;
87 10385574 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 : // Consider the range from last to ranges[i].
89 : // We haven't got to the new range yet.
90 5633991 : if (ranges[i] <= new_range.from()) continue;
91 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 : // inclusive, but the values in ranges are not.
93 882408 : if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 865124 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 : }
96 : return kLatticeUnknown;
97 : }
98 : return containment;
99 : }
100 :
101 : // More makes code generation slower, less makes V8 benchmark score lower.
102 : const int kMaxLookaheadForBoyerMoore = 8;
103 : // In a 3-character pattern you can maximally step forwards 3 characters
104 : // at a time, which is not always enough to pay for the extra logic.
105 : const int kPatternTooShortForBoyerMoore = 2;
106 :
107 : // Identifies the sort of regexps where the regexp engine is faster
108 : // than the code used for atom matches.
109 176062 : static bool HasFewDifferentCharacters(Handle<String> pattern) {
110 : int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
111 176062 : if (length <= kPatternTooShortForBoyerMoore) return false;
112 : const int kMod = 128;
113 : bool character_found[kMod];
114 : int different = 0;
115 : memset(&character_found[0], 0, sizeof(character_found));
116 851831 : for (int i = 0; i < length; i++) {
117 511601 : int ch = (pattern->Get(i) & (kMod - 1));
118 511601 : if (!character_found[ch]) {
119 511181 : character_found[ch] = true;
120 511181 : different++;
121 : // We declare a regexp low-alphabet if it has at least 3 times as many
122 : // characters as it has different characters.
123 511181 : if (different * 3 > length) return false;
124 : }
125 : }
126 : return true;
127 : }
128 :
129 : // Generic RegExp methods. Dispatches to implementation specific methods.
130 :
131 460090 : MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
132 : Handle<String> pattern,
133 : JSRegExp::Flags flags) {
134 : DCHECK(pattern->IsFlat());
135 :
136 920180 : Zone zone(isolate->allocator(), ZONE_NAME);
137 : CompilationCache* compilation_cache = isolate->compilation_cache();
138 : MaybeHandle<FixedArray> maybe_cached =
139 460090 : compilation_cache->LookupRegExp(pattern, flags);
140 : Handle<FixedArray> cached;
141 460090 : if (maybe_cached.ToHandle(&cached)) {
142 391598 : re->set_data(*cached);
143 195799 : return re;
144 : }
145 :
146 : PostponeInterruptsScope postpone(isolate);
147 : RegExpCompileData parse_result;
148 264291 : FlatStringReader reader(isolate, pattern);
149 : DCHECK(!isolate->has_pending_exception());
150 264291 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
151 : &parse_result)) {
152 : // Throw an exception if we fail to parse the pattern.
153 2831 : return ThrowRegExpException(isolate, re, pattern, parse_result.error);
154 : }
155 :
156 : bool has_been_compiled = false;
157 :
158 770062 : if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
159 169349 : !HasFewDifferentCharacters(pattern)) {
160 : // Parse-tree is a single atom that is equal to the pattern.
161 : AtomCompile(isolate, re, pattern, flags, pattern);
162 : has_been_compiled = true;
163 106678 : } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
164 7237 : parse_result.capture_count == 0) {
165 7227 : RegExpAtom* atom = parse_result.tree->AsAtom();
166 7227 : Vector<const uc16> atom_pattern = atom->data();
167 : Handle<String> atom_string;
168 14454 : ASSIGN_RETURN_ON_EXCEPTION(
169 : isolate, atom_string,
170 : isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
171 7227 : if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
172 : AtomCompile(isolate, re, pattern, flags, atom_string);
173 : has_been_compiled = true;
174 : }
175 : }
176 261460 : if (!has_been_compiled) {
177 85486 : IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
178 : }
179 : DCHECK(re->data()->IsFixedArray());
180 : // Compilation succeeded so the data is set on the regexp
181 : // and we can store it in the cache.
182 : Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
183 261460 : compilation_cache->PutRegExp(pattern, flags, data);
184 :
185 261460 : return re;
186 : }
187 :
188 4352029 : MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
189 : Handle<String> subject, int index,
190 : Handle<RegExpMatchInfo> last_match_info) {
191 4352029 : switch (regexp->TypeTag()) {
192 : case JSRegExp::ATOM:
193 286 : return AtomExec(isolate, regexp, subject, index, last_match_info);
194 : case JSRegExp::IRREGEXP: {
195 4351743 : return IrregexpExec(isolate, regexp, subject, index, last_match_info);
196 : }
197 : default:
198 0 : UNREACHABLE();
199 : }
200 : }
201 :
202 :
203 : // RegExp Atom implementation: Simple string search using indexOf.
204 :
205 0 : void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
206 : Handle<String> pattern, JSRegExp::Flags flags,
207 : Handle<String> match_pattern) {
208 175974 : isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
209 175974 : match_pattern);
210 0 : }
211 :
212 273 : static void SetAtomLastCapture(Isolate* isolate,
213 : Handle<RegExpMatchInfo> last_match_info,
214 : String subject, int from, int to) {
215 : SealHandleScope shs(isolate);
216 : last_match_info->SetNumberOfCaptureRegisters(2);
217 546 : last_match_info->SetLastSubject(subject);
218 546 : last_match_info->SetLastInput(subject);
219 : last_match_info->SetCapture(0, from);
220 : last_match_info->SetCapture(1, to);
221 273 : }
222 :
223 90541 : int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
224 : Handle<String> subject, int index, int32_t* output,
225 : int output_size) {
226 : DCHECK_LE(0, index);
227 : DCHECK_LE(index, subject->length());
228 :
229 90541 : subject = String::Flatten(isolate, subject);
230 : DisallowHeapAllocation no_gc; // ensure vectors stay valid
231 :
232 90541 : String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
233 : int needle_len = needle->length();
234 : DCHECK(needle->IsFlat());
235 : DCHECK_LT(0, needle_len);
236 :
237 181082 : if (index + needle_len > subject->length()) {
238 : return RegExpImpl::RE_FAILURE;
239 : }
240 :
241 273479 : for (int i = 0; i < output_size; i += 2) {
242 181736 : String::FlatContent needle_content = needle->GetFlatContent(no_gc);
243 181736 : String::FlatContent subject_content = subject->GetFlatContent(no_gc);
244 : DCHECK(needle_content.IsFlat());
245 : DCHECK(subject_content.IsFlat());
246 : // dispatch on type of strings
247 : index =
248 : (needle_content.IsOneByte()
249 : ? (subject_content.IsOneByte()
250 : ? SearchString(isolate, subject_content.ToOneByteVector(),
251 : needle_content.ToOneByteVector(), index)
252 : : SearchString(isolate, subject_content.ToUC16Vector(),
253 : needle_content.ToOneByteVector(), index))
254 : : (subject_content.IsOneByte()
255 : ? SearchString(isolate, subject_content.ToOneByteVector(),
256 : needle_content.ToUC16Vector(), index)
257 : : SearchString(isolate, subject_content.ToUC16Vector(),
258 363472 : needle_content.ToUC16Vector(), index)));
259 181736 : if (index == -1) {
260 90266 : return i / 2; // Return number of matches.
261 : } else {
262 91470 : output[i] = index;
263 91470 : output[i+1] = index + needle_len;
264 : index += needle_len;
265 : }
266 : }
267 273 : return output_size / 2;
268 : }
269 :
270 286 : Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
271 : Handle<String> subject, int index,
272 : Handle<RegExpMatchInfo> last_match_info) {
273 : static const int kNumRegisters = 2;
274 : STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
275 : int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
276 :
277 : int res =
278 286 : AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
279 :
280 299 : if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
281 :
282 : DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
283 : SealHandleScope shs(isolate);
284 273 : SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
285 273 : output_registers[1]);
286 273 : return last_match_info;
287 : }
288 :
289 :
290 : // Irregexp implementation.
291 :
292 : // Ensures that the regexp object contains a compiled version of the
293 : // source for either one-byte or two-byte subject strings.
294 : // If the compiled version doesn't already exist, it is compiled
295 : // from the source pattern.
296 : // If compilation fails, an exception is thrown and this function
297 : // returns false.
298 4472686 : bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
299 : Handle<String> sample_subject,
300 : bool is_one_byte) {
301 4472686 : Object compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
302 4472686 : if (compiled_code != Smi::FromInt(JSRegExp::kUninitializedValue)) {
303 : DCHECK(FLAG_regexp_interpret_all ? compiled_code->IsByteArray()
304 : : compiled_code->IsCode());
305 : return true;
306 : }
307 85700 : return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
308 : }
309 :
310 85700 : bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
311 : Handle<String> sample_subject,
312 : bool is_one_byte) {
313 : // Compile the RegExp.
314 171400 : Zone zone(isolate->allocator(), ZONE_NAME);
315 : PostponeInterruptsScope postpone(isolate);
316 : #ifdef DEBUG
317 : Object entry = re->DataAt(JSRegExp::code_index(is_one_byte));
318 : // When arriving here entry can only be a smi representing an uncompiled
319 : // regexp.
320 : DCHECK(entry->IsSmi());
321 : int entry_value = Smi::ToInt(entry);
322 : DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
323 : #endif
324 :
325 85700 : JSRegExp::Flags flags = re->GetFlags();
326 :
327 171400 : Handle<String> pattern(re->Pattern(), isolate);
328 85700 : pattern = String::Flatten(isolate, pattern);
329 : RegExpCompileData compile_data;
330 85700 : FlatStringReader reader(isolate, pattern);
331 85700 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
332 : &compile_data)) {
333 : // Throw an exception if we fail to parse the pattern.
334 : // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
335 30 : USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
336 30 : return false;
337 : }
338 : RegExpEngine::CompilationResult result =
339 : RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
340 85670 : sample_subject, is_one_byte);
341 85670 : if (result.error_message != nullptr) {
342 : // Unable to compile regexp.
343 363 : if (FLAG_abort_on_stack_or_string_length_overflow &&
344 0 : strncmp(result.error_message, "Stack overflow", 15) == 0) {
345 0 : FATAL("Aborting on stack overflow");
346 : }
347 726 : Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
348 363 : CStrVector(result.error_message)).ToHandleChecked();
349 363 : ThrowRegExpException(isolate, re, error_message);
350 : return false;
351 : }
352 :
353 : Handle<FixedArray> data =
354 : Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
355 85307 : data->set(JSRegExp::code_index(is_one_byte), result.code);
356 85307 : SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
357 : int register_max = IrregexpMaxRegisterCount(*data);
358 85307 : if (result.num_registers > register_max) {
359 : SetIrregexpMaxRegisterCount(*data, result.num_registers);
360 : }
361 :
362 : return true;
363 : }
364 :
365 0 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
366 : return Smi::cast(
367 0 : re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
368 : }
369 :
370 0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
371 : re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
372 0 : }
373 :
374 85307 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re,
375 : Handle<FixedArray> value) {
376 85307 : if (value.is_null()) {
377 84947 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
378 : } else {
379 360 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
380 : }
381 85307 : }
382 :
383 0 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
384 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
385 : }
386 :
387 0 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray re) {
388 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
389 : }
390 :
391 0 : ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
392 0 : return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
393 : }
394 :
395 0 : Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
396 0 : return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
397 : }
398 :
399 0 : void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
400 : Handle<String> pattern,
401 : JSRegExp::Flags flags, int capture_count) {
402 : // Initialize compiled code entries to null.
403 : isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
404 85486 : flags, capture_count);
405 0 : }
406 :
407 4359162 : int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
408 : Handle<String> subject) {
409 : DCHECK(subject->IsFlat());
410 :
411 : // Check representation of the underlying storage.
412 4359162 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
413 4359162 : if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
414 :
415 4358769 : if (FLAG_regexp_interpret_all) {
416 : // Byte-code regexp needs space allocated for all its registers.
417 : // The result captures are copied to the start of the registers array
418 : // if the match succeeds. This way those registers are not clobbered
419 : // when we set the last match info from last successful match.
420 : return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
421 4251103 : (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
422 : } else {
423 : // Native regexp only needs room to output captures. Registers are handled
424 : // internally.
425 107666 : return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
426 : }
427 : }
428 :
429 4394778 : int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
430 : Handle<String> subject, int index,
431 : int32_t* output, int output_size) {
432 : Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
433 :
434 : DCHECK_LE(0, index);
435 : DCHECK_LE(index, subject->length());
436 : DCHECK(subject->IsFlat());
437 :
438 4394778 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
439 :
440 4394778 : if (!FLAG_regexp_interpret_all) {
441 : DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
442 4 : do {
443 113523 : EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
444 : Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
445 : // The stack is used to allocate registers for the compiled regexp code.
446 : // This means that in case of failure, the output registers array is left
447 : // untouched and contains the capture results from the previous successful
448 : // match. We can use that to set the last match info lazily.
449 : int res = NativeRegExpMacroAssembler::Match(code, subject, output,
450 113523 : output_size, index, isolate);
451 113523 : if (res != NativeRegExpMacroAssembler::RETRY) {
452 : DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
453 : isolate->has_pending_exception());
454 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
455 : RE_SUCCESS);
456 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
457 : RE_FAILURE);
458 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
459 : RE_EXCEPTION);
460 113519 : return res;
461 : }
462 : // If result is RETRY, the string has changed representation, and we
463 : // must restart from scratch.
464 : // In this case, it means we must make sure we are prepared to handle
465 : // the, potentially, different subject (the string can switch between
466 : // being internal and external, and even between being Latin1 and UC16,
467 : // but the characters are always the same).
468 4 : IrregexpPrepare(isolate, regexp, subject);
469 4 : is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
470 : } while (true);
471 : UNREACHABLE();
472 : } else {
473 : DCHECK(FLAG_regexp_interpret_all);
474 : DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
475 : // We must have done EnsureCompiledIrregexp, so we can get the number of
476 : // registers.
477 : int number_of_capture_registers =
478 4281259 : (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
479 4281259 : int32_t* raw_output = &output[number_of_capture_registers];
480 :
481 1 : do {
482 : // We do not touch the actual capture result registers until we know there
483 : // has been a match so that we can use those capture results to set the
484 : // last match info.
485 13132916 : for (int i = number_of_capture_registers - 1; i >= 0; i--) {
486 8851656 : raw_output[i] = -1;
487 : }
488 : Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
489 : isolate);
490 :
491 : IrregexpInterpreter::Result result = IrregexpInterpreter::Match(
492 4281260 : isolate, byte_codes, subject, raw_output, index);
493 : DCHECK_IMPLIES(result == IrregexpInterpreter::EXCEPTION,
494 : isolate->has_pending_exception());
495 :
496 4281260 : switch (result) {
497 : case IrregexpInterpreter::SUCCESS:
498 : // Copy capture results to the start of the registers array.
499 : MemCopy(output, raw_output,
500 : number_of_capture_registers * sizeof(int32_t));
501 8387825 : return result;
502 : case IrregexpInterpreter::EXCEPTION:
503 : case IrregexpInterpreter::FAILURE:
504 : return result;
505 : case IrregexpInterpreter::RETRY:
506 : // The string has changed representation, and we must restart the
507 : // match.
508 1 : is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
509 1 : EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
510 1 : break;
511 : }
512 : } while (true);
513 : UNREACHABLE();
514 : }
515 : }
516 :
517 4351743 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
518 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
519 : int previous_index, Handle<RegExpMatchInfo> last_match_info) {
520 : DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
521 :
522 4351743 : subject = String::Flatten(isolate, subject);
523 :
524 : // Prepare space for the return values.
525 : #ifdef DEBUG
526 : if (FLAG_regexp_interpret_all && FLAG_trace_regexp_bytecodes) {
527 : String pattern = regexp->Pattern();
528 : PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
529 : PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
530 : }
531 : #endif
532 : int required_registers =
533 4351743 : RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
534 4351743 : if (required_registers < 0) {
535 : // Compiling failed with an exception.
536 : DCHECK(isolate->has_pending_exception());
537 298 : return MaybeHandle<Object>();
538 : }
539 :
540 : int32_t* output_registers = nullptr;
541 4351445 : if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
542 2844 : output_registers = NewArray<int32_t>(required_registers);
543 : }
544 : std::unique_ptr<int32_t[]> auto_release(output_registers);
545 4351445 : if (output_registers == nullptr) {
546 : output_registers = isolate->jsregexp_static_offsets_vector();
547 : }
548 :
549 : int res =
550 : RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
551 4351445 : output_registers, required_registers);
552 4351445 : if (res == RE_SUCCESS) {
553 : int capture_count =
554 : IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
555 : return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
556 4162519 : output_registers);
557 : }
558 188926 : if (res == RE_EXCEPTION) {
559 : DCHECK(isolate->has_pending_exception());
560 68 : return MaybeHandle<Object>();
561 : }
562 : DCHECK(res == RE_FAILURE);
563 188858 : return isolate->factory()->null_value();
564 : }
565 :
566 4257278 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
567 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
568 : Handle<String> subject, int capture_count, int32_t* match) {
569 : // This is the only place where match infos can grow. If, after executing the
570 : // regexp, RegExpExecStub finds that the match info is too small, it restarts
571 : // execution in RegExpImpl::Exec, which finally grows the match info right
572 : // here.
573 :
574 4257278 : int capture_register_count = (capture_count + 1) * 2;
575 : Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
576 4257278 : isolate, last_match_info, capture_register_count);
577 : result->SetNumberOfCaptureRegisters(capture_register_count);
578 :
579 4257278 : if (*result != *last_match_info) {
580 4248 : if (*last_match_info == *isolate->regexp_last_match_info()) {
581 : // This inner condition is only needed for special situations like the
582 : // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
583 : // RegExpImpl::Exec; there actually want to bypass the Isolate's match
584 : // info and execute the regexp without side effects.
585 4248 : isolate->native_context()->set_regexp_last_match_info(*result);
586 : }
587 : }
588 :
589 : DisallowHeapAllocation no_allocation;
590 4257278 : if (match != nullptr) {
591 15177758 : for (int i = 0; i < capture_register_count; i += 2) {
592 5460240 : result->SetCapture(i, match[i]);
593 5460240 : result->SetCapture(i + 1, match[i + 1]);
594 : }
595 : }
596 8514556 : result->SetLastSubject(*subject);
597 8514556 : result->SetLastInput(*subject);
598 4257278 : return result;
599 : }
600 :
601 95501 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
602 : Handle<String> subject, Isolate* isolate)
603 : : register_array_(nullptr),
604 : register_array_size_(0),
605 : regexp_(regexp),
606 : subject_(subject),
607 95501 : isolate_(isolate) {
608 95501 : bool interpreted = FLAG_regexp_interpret_all;
609 :
610 95501 : if (regexp_->TypeTag() == JSRegExp::ATOM) {
611 : static const int kAtomRegistersPerMatch = 2;
612 90255 : registers_per_match_ = kAtomRegistersPerMatch;
613 : // There is no distinction between interpreted and native for atom regexps.
614 : interpreted = false;
615 : } else {
616 : registers_per_match_ =
617 5246 : RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
618 5246 : if (registers_per_match_ < 0) {
619 95 : num_matches_ = -1; // Signal exception.
620 95 : return;
621 : }
622 : }
623 :
624 : DCHECK(IsGlobal(regexp->GetFlags()));
625 95406 : if (!interpreted) {
626 : register_array_size_ =
627 189500 : Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
628 94750 : max_matches_ = register_array_size_ / registers_per_match_;
629 : } else {
630 : // Global loop in interpreted regexp is not implemented. We choose
631 : // the size of the offsets vector so that it can only store one match.
632 656 : register_array_size_ = registers_per_match_;
633 656 : max_matches_ = 1;
634 : }
635 :
636 95406 : if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
637 1072 : register_array_ = NewArray<int32_t>(register_array_size_);
638 : } else {
639 94334 : register_array_ = isolate->jsregexp_static_offsets_vector();
640 : }
641 :
642 : // Set state so that fetching the results the first time triggers a call
643 : // to the compiled regexp.
644 95406 : current_match_index_ = max_matches_ - 1;
645 95406 : num_matches_ = max_matches_;
646 : DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
647 : DCHECK_GE(register_array_size_, registers_per_match_);
648 : int32_t* last_match =
649 95406 : ®ister_array_[current_match_index_ * registers_per_match_];
650 95406 : last_match[0] = -1;
651 95406 : last_match[1] = 0;
652 : }
653 :
654 7 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
655 14 : if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
656 7 : unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
657 : unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
658 : // Advance over the surrogate pair.
659 0 : return last_index + 2;
660 : }
661 7 : return last_index + 1;
662 : }
663 :
664 : // -------------------------------------------------------------------
665 : // Implementation of the Irregexp regular expression engine.
666 : //
667 : // The Irregexp regular expression engine is intended to be a complete
668 : // implementation of ECMAScript regular expressions. It generates either
669 : // bytecodes or native code.
670 :
671 : // The Irregexp regexp engine is structured in three steps.
672 : // 1) The parser generates an abstract syntax tree. See ast.cc.
673 : // 2) From the AST a node network is created. The nodes are all
674 : // subclasses of RegExpNode. The nodes represent states when
675 : // executing a regular expression. Several optimizations are
676 : // performed on the node network.
677 : // 3) From the nodes we generate either byte codes or native code
678 : // that can actually execute the regular expression (perform
679 : // the search). The code generation step is described in more
680 : // detail below.
681 :
682 : // Code generation.
683 : //
684 : // The nodes are divided into four main categories.
685 : // * Choice nodes
686 : // These represent places where the regular expression can
687 : // match in more than one way. For example on entry to an
688 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
689 : // * Action nodes
690 : // These represent places where some action should be
691 : // performed. Examples include recording the current position
692 : // in the input string to a register (in order to implement
693 : // captures) or other actions on register for example in order
694 : // to implement the counters needed for {} repetitions.
695 : // * Matching nodes
696 : // These attempt to match some element part of the input string.
697 : // Examples of elements include character classes, plain strings
698 : // or back references.
699 : // * End nodes
700 : // These are used to implement the actions required on finding
701 : // a successful match or failing to find a match.
702 : //
703 : // The code generated (whether as byte codes or native code) maintains
704 : // some state as it runs. This consists of the following elements:
705 : //
706 : // * The capture registers. Used for string captures.
707 : // * Other registers. Used for counters etc.
708 : // * The current position.
709 : // * The stack of backtracking information. Used when a matching node
710 : // fails to find a match and needs to try an alternative.
711 : //
712 : // Conceptual regular expression execution model:
713 : //
714 : // There is a simple conceptual model of regular expression execution
715 : // which will be presented first. The actual code generated is a more
716 : // efficient simulation of the simple conceptual model:
717 : //
718 : // * Choice nodes are implemented as follows:
719 : // For each choice except the last {
720 : // push current position
721 : // push backtrack code location
722 : // <generate code to test for choice>
723 : // backtrack code location:
724 : // pop current position
725 : // }
726 : // <generate code to test for last choice>
727 : //
728 : // * Action nodes are generated as follows
729 : // <push affected registers on backtrack stack>
730 : // <generate code to perform action>
731 : // push backtrack code location
732 : // <generate code to test for following nodes>
733 : // backtrack code location:
734 : // <pop affected registers to restore their state>
735 : // <pop backtrack location from stack and go to it>
736 : //
737 : // * Matching nodes are generated as follows:
738 : // if input string matches at current position
739 : // update current position
740 : // <generate code to test for following nodes>
741 : // else
742 : // <pop backtrack location from stack and go to it>
743 : //
744 : // Thus it can be seen that the current position is saved and restored
745 : // by the choice nodes, whereas the registers are saved and restored by
746 : // the action nodes that manipulate them.
747 : //
748 : // The other interesting aspect of this model is that nodes are generated
749 : // at the point where they are needed by a recursive call to Emit(). If
750 : // the node has already been code generated then the Emit() call will
751 : // generate a jump to the previously generated code instead. In order to
752 : // limit recursion it is possible for the Emit() function to put the node
753 : // on a work list for later generation and instead generate a jump. The
754 : // destination of the jump is resolved later when the code is generated.
755 : //
756 : // Actual regular expression code generation.
757 : //
758 : // Code generation is actually more complicated than the above. In order
759 : // to improve the efficiency of the generated code some optimizations are
760 : // performed:
761 : //
762 : // * Choice nodes have 1-character lookahead.
763 : // A choice node looks at the following character and eliminates some of
764 : // the choices immediately based on that character. This is not yet
765 : // implemented.
766 : // * Simple greedy loops store reduced backtracking information.
767 : // A quantifier like /.*foo/m will greedily match the whole input. It will
768 : // then need to backtrack to a point where it can match "foo". The naive
769 : // implementation of this would push each character position onto the
770 : // backtracking stack, then pop them off one by one. This would use space
771 : // proportional to the length of the input string. However since the "."
772 : // can only match in one way and always has a constant length (in this case
773 : // of 1) it suffices to store the current position on the top of the stack
774 : // once. Matching now becomes merely incrementing the current position and
775 : // backtracking becomes decrementing the current position and checking the
776 : // result against the stored current position. This is faster and saves
777 : // space.
778 : // * The current state is virtualized.
779 : // This is used to defer expensive operations until it is clear that they
780 : // are needed and to generate code for a node more than once, allowing
781 : // specialized and efficient versions of the code to be created. This is
782 : // explained in the section below.
783 : //
784 : // Execution state virtualization.
785 : //
786 : // Instead of emitting code, nodes that manipulate the state can record their
787 : // manipulation in an object called the Trace. The Trace object can record a
788 : // current position offset, an optional backtrack code location on the top of
789 : // the virtualized backtrack stack and some register changes. When a node is
790 : // to be emitted it can flush the Trace or update it. Flushing the Trace
791 : // will emit code to bring the actual state into line with the virtual state.
792 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
793 : // registers). Postponing work can save time when executing the regular
794 : // expression since it may be found that the work never has to be done as a
795 : // failure to match can occur. In addition it is much faster to jump to a
796 : // known backtrack code location than it is to pop an unknown backtrack
797 : // location from the stack and jump there.
798 : //
799 : // The virtual state found in the Trace affects code generation. For example
800 : // the virtual state contains the difference between the actual current
801 : // position and the virtual current position, and matching code needs to use
802 : // this offset to attempt a match in the correct location of the input
803 : // string. Therefore code generated for a non-trivial trace is specialized
804 : // to that trace. The code generator therefore has the ability to generate
805 : // code for each node several times. In order to limit the size of the
806 : // generated code there is an arbitrary limit on how many specialized sets of
807 : // code may be generated for a given node. If the limit is reached, the
808 : // trace is flushed and a generic version of the code for a node is emitted.
809 : // This is subsequently used for that node. The code emitted for a non-generic
810 : // trace is not recorded in the node and so it cannot currently be reused in
811 : // the event that code generation is requested for an identical trace.
812 :
813 :
814 0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
815 0 : UNREACHABLE();
816 : }
817 :
818 :
819 99371 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
820 : text->AddElement(TextElement::Atom(this), zone);
821 99371 : }
822 :
823 :
824 7635 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
825 : text->AddElement(TextElement::CharClass(this), zone);
826 7635 : }
827 :
828 :
829 0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
830 0 : for (int i = 0; i < elements()->length(); i++)
831 : text->AddElement(elements()->at(i), zone);
832 0 : }
833 :
834 :
835 0 : TextElement TextElement::Atom(RegExpAtom* atom) {
836 0 : return TextElement(ATOM, atom);
837 : }
838 :
839 :
840 0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
841 0 : return TextElement(CHAR_CLASS, char_class);
842 : }
843 :
844 :
845 7436807 : int TextElement::length() const {
846 7436807 : switch (text_type()) {
847 : case ATOM:
848 6603069 : return atom()->length();
849 :
850 : case CHAR_CLASS:
851 : return 1;
852 : }
853 0 : UNREACHABLE();
854 : }
855 :
856 :
857 0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
858 0 : if (table_ == nullptr) {
859 0 : table_ = new(zone()) DispatchTable(zone());
860 : DispatchTableConstructor cons(table_, ignore_case, zone());
861 : cons.BuildTable(this);
862 : }
863 0 : return table_;
864 : }
865 :
866 :
867 : class FrequencyCollator {
868 : public:
869 11050914 : FrequencyCollator() : total_samples_(0) {
870 22016162 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
871 10965248 : frequencies_[i] = CharacterFrequency(i);
872 : }
873 : }
874 :
875 : void CountCharacter(int character) {
876 456028 : int index = (character & RegExpMacroAssembler::kTableMask);
877 : frequencies_[index].Increment();
878 456028 : total_samples_++;
879 : }
880 :
881 : // Does not measure in percent, but rather per-128 (the table size from the
882 : // regexp macro assembler).
883 : int Frequency(int in_character) {
884 : DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
885 487101 : if (total_samples_ < 1) return 1; // Division by zero.
886 : int freq_in_per128 =
887 486836 : (frequencies_[in_character].counter() * 128) / total_samples_;
888 : return freq_in_per128;
889 : }
890 :
891 : private:
892 : class CharacterFrequency {
893 : public:
894 10965248 : CharacterFrequency() : counter_(0), character_(-1) { }
895 : explicit CharacterFrequency(int character)
896 : : counter_(0), character_(character) { }
897 :
898 456028 : void Increment() { counter_++; }
899 : int counter() { return counter_; }
900 : int character() { return character_; }
901 :
902 : private:
903 : int counter_;
904 : int character_;
905 : };
906 :
907 :
908 : private:
909 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
910 : int total_samples_;
911 : };
912 :
913 :
914 : class RegExpCompiler {
915 : public:
916 : RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
917 : bool is_one_byte);
918 :
919 : int AllocateRegister() {
920 909510 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
921 310203 : reg_exp_too_big_ = true;
922 : return next_register_;
923 : }
924 599307 : return next_register_++;
925 : }
926 :
927 : // Lookarounds to match lone surrogates for unicode character class matches
928 : // are never nested. We can therefore reuse registers.
929 : int UnicodeLookaroundStackRegister() {
930 2460 : if (unicode_lookaround_stack_register_ == kNoRegister) {
931 1040 : unicode_lookaround_stack_register_ = AllocateRegister();
932 : }
933 2460 : return unicode_lookaround_stack_register_;
934 : }
935 :
936 : int UnicodeLookaroundPositionRegister() {
937 2460 : if (unicode_lookaround_position_register_ == kNoRegister) {
938 1040 : unicode_lookaround_position_register_ = AllocateRegister();
939 : }
940 2460 : return unicode_lookaround_position_register_;
941 : }
942 :
943 : RegExpEngine::CompilationResult Assemble(Isolate* isolate,
944 : RegExpMacroAssembler* assembler,
945 : RegExpNode* start, int capture_count,
946 : Handle<String> pattern);
947 :
948 592608 : inline void AddWork(RegExpNode* node) {
949 946295 : if (!node->on_work_list() && !node->label()->is_bound()) {
950 : node->set_on_work_list(true);
951 210432 : work_list_->push_back(node);
952 : }
953 592608 : }
954 :
955 : static const int kImplementationOffset = 0;
956 : static const int kNumberOfRegistersOffset = 0;
957 : static const int kCodeOffset = 1;
958 :
959 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
960 : EndNode* accept() { return accept_; }
961 :
962 : static const int kMaxRecursion = 100;
963 : inline int recursion_depth() { return recursion_depth_; }
964 998060 : inline void IncrementRecursionDepth() { recursion_depth_++; }
965 998060 : inline void DecrementRecursionDepth() { recursion_depth_--; }
966 :
967 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
968 :
969 : inline bool one_byte() { return one_byte_; }
970 : inline bool optimize() { return optimize_; }
971 84502 : inline void set_optimize(bool value) { optimize_ = value; }
972 : inline bool limiting_recursion() { return limiting_recursion_; }
973 : inline void set_limiting_recursion(bool value) {
974 955270 : limiting_recursion_ = value;
975 : }
976 : bool read_backward() { return read_backward_; }
977 3336 : void set_read_backward(bool value) { read_backward_ = value; }
978 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
979 :
980 : int current_expansion_factor() { return current_expansion_factor_; }
981 : void set_current_expansion_factor(int value) {
982 85549 : current_expansion_factor_ = value;
983 : }
984 :
985 : Isolate* isolate() const { return isolate_; }
986 : Zone* zone() const { return zone_; }
987 :
988 : static const int kNoRegister = -1;
989 :
990 : private:
991 : EndNode* accept_;
992 : int next_register_;
993 : int unicode_lookaround_stack_register_;
994 : int unicode_lookaround_position_register_;
995 : std::vector<RegExpNode*>* work_list_;
996 : int recursion_depth_;
997 : RegExpMacroAssembler* macro_assembler_;
998 : bool one_byte_;
999 : bool reg_exp_too_big_;
1000 : bool limiting_recursion_;
1001 : bool optimize_;
1002 : bool read_backward_;
1003 : int current_expansion_factor_;
1004 : FrequencyCollator frequency_collator_;
1005 : Isolate* isolate_;
1006 : Zone* zone_;
1007 : };
1008 :
1009 :
1010 : class RecursionCheck {
1011 : public:
1012 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1013 : compiler->IncrementRecursionDepth();
1014 : }
1015 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1016 : private:
1017 : RegExpCompiler* compiler_;
1018 : };
1019 :
1020 :
1021 : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1022 : return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1023 : }
1024 :
1025 :
1026 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1027 : // a fixed array or a null handle depending on whether it succeeded.
1028 85666 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1029 : bool one_byte)
1030 85666 : : next_register_(2 * (capture_count + 1)),
1031 : unicode_lookaround_stack_register_(kNoRegister),
1032 : unicode_lookaround_position_register_(kNoRegister),
1033 : work_list_(nullptr),
1034 : recursion_depth_(0),
1035 : one_byte_(one_byte),
1036 : reg_exp_too_big_(false),
1037 : limiting_recursion_(false),
1038 : optimize_(FLAG_regexp_optimization),
1039 : read_backward_(false),
1040 : current_expansion_factor_(1),
1041 : frequency_collator_(),
1042 : isolate_(isolate),
1043 171332 : zone_(zone) {
1044 85666 : accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1045 : DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1046 85666 : }
1047 :
1048 85312 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1049 : Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1050 : int capture_count, Handle<String> pattern) {
1051 : #ifdef DEBUG
1052 : if (FLAG_trace_regexp_assembler)
1053 : macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1054 : else
1055 : #endif
1056 85312 : macro_assembler_ = macro_assembler;
1057 :
1058 : std::vector<RegExpNode*> work_list;
1059 85312 : work_list_ = &work_list;
1060 85312 : Label fail;
1061 85312 : macro_assembler_->PushBacktrack(&fail);
1062 : Trace new_trace;
1063 85312 : start->Emit(this, &new_trace);
1064 85312 : macro_assembler_->Bind(&fail);
1065 85312 : macro_assembler_->Fail();
1066 295744 : while (!work_list.empty()) {
1067 210432 : RegExpNode* node = work_list.back();
1068 : work_list.pop_back();
1069 : node->set_on_work_list(false);
1070 210432 : if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1071 : }
1072 85312 : if (reg_exp_too_big_) {
1073 0 : macro_assembler_->AbortedCodeGeneration();
1074 0 : return IrregexpRegExpTooBig(isolate_);
1075 : }
1076 :
1077 85312 : Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1078 170624 : isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1079 85312 : work_list_ = nullptr;
1080 : #ifdef ENABLE_DISASSEMBLER
1081 : if (FLAG_print_code && !FLAG_regexp_interpret_all) {
1082 : CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1083 : OFStream os(trace_scope.file());
1084 : Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1085 : }
1086 : #endif
1087 : #ifdef DEBUG
1088 : if (FLAG_trace_regexp_assembler) {
1089 : delete macro_assembler_;
1090 : }
1091 : #endif
1092 85312 : return RegExpEngine::CompilationResult(*code, next_register_);
1093 : }
1094 :
1095 :
1096 0 : bool Trace::DeferredAction::Mentions(int that) {
1097 2463221 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
1098 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1099 : return range.Contains(that);
1100 : } else {
1101 2415759 : return reg() == that;
1102 : }
1103 : }
1104 :
1105 :
1106 0 : bool Trace::mentions_reg(int reg) {
1107 0 : for (DeferredAction* action = actions_; action != nullptr;
1108 : action = action->next()) {
1109 0 : if (action->Mentions(reg))
1110 : return true;
1111 : }
1112 : return false;
1113 : }
1114 :
1115 :
1116 973 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1117 : DCHECK_EQ(0, *cp_offset);
1118 1516 : for (DeferredAction* action = actions_; action != nullptr;
1119 : action = action->next()) {
1120 953 : if (action->Mentions(reg)) {
1121 410 : if (action->action_type() == ActionNode::STORE_POSITION) {
1122 410 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1123 410 : return true;
1124 : } else {
1125 : return false;
1126 : }
1127 : }
1128 : }
1129 : return false;
1130 : }
1131 :
1132 :
1133 510785 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
1134 : Zone* zone) {
1135 : int max_register = RegExpCompiler::kNoRegister;
1136 930005 : for (DeferredAction* action = actions_; action != nullptr;
1137 : action = action->next()) {
1138 419220 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1139 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1140 93100 : for (int i = range.from(); i <= range.to(); i++)
1141 45072 : affected_registers->Set(i, zone);
1142 2956 : if (range.to() > max_register) max_register = range.to();
1143 : } else {
1144 416264 : affected_registers->Set(action->reg(), zone);
1145 416264 : if (action->reg() > max_register) max_register = action->reg();
1146 : }
1147 : }
1148 510785 : return max_register;
1149 : }
1150 :
1151 :
1152 510785 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1153 : int max_register,
1154 : const OutSet& registers_to_pop,
1155 : const OutSet& registers_to_clear) {
1156 20414187 : for (int reg = max_register; reg >= 0; reg--) {
1157 19903402 : if (registers_to_pop.Get(reg)) {
1158 52884 : assembler->PopRegister(reg);
1159 9898817 : } else if (registers_to_clear.Get(reg)) {
1160 : int clear_to = reg;
1161 470860 : while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1162 105386 : reg--;
1163 : }
1164 77351 : assembler->ClearRegisters(reg, clear_to);
1165 : }
1166 : }
1167 510785 : }
1168 :
1169 :
1170 510785 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1171 : int max_register,
1172 : const OutSet& affected_registers,
1173 : OutSet* registers_to_pop,
1174 : OutSet* registers_to_clear,
1175 : Zone* zone) {
1176 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1177 510785 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1178 :
1179 : // Count pushes performed to force a stack limit check occasionally.
1180 : int pushes = 0;
1181 :
1182 20624959 : for (int reg = 0; reg <= max_register; reg++) {
1183 20114174 : if (!affected_registers.Get(reg)) {
1184 : continue;
1185 : }
1186 :
1187 : // The chronologically first deferred action in the trace
1188 : // is used to infer the action needed to restore a register
1189 : // to its previous state (or not, if it's safe to ignore it).
1190 : enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1191 : DeferredActionUndoType undo_action = IGNORE;
1192 :
1193 : int value = 0;
1194 : bool absolute = false;
1195 : bool clear = false;
1196 : static const int kNoStore = kMinInt;
1197 : int store_position = kNoStore;
1198 : // This is a little tricky because we are scanning the actions in reverse
1199 : // historical order (newest first).
1200 2917045 : for (DeferredAction* action = actions_; action != nullptr;
1201 : action = action->next()) {
1202 2462268 : if (action->Mentions(reg)) {
1203 461336 : switch (action->action_type()) {
1204 : case ActionNode::SET_REGISTER: {
1205 : Trace::DeferredSetRegister* psr =
1206 : static_cast<Trace::DeferredSetRegister*>(action);
1207 3465 : if (!absolute) {
1208 3465 : value += psr->value();
1209 : absolute = true;
1210 : }
1211 : // SET_REGISTER is currently only used for newly introduced loop
1212 : // counters. They can have a significant previous value if they
1213 : // occur in a loop. TODO(lrn): Propagate this information, so
1214 : // we can set undo_action to IGNORE if we know there is no value to
1215 : // restore.
1216 : undo_action = RESTORE;
1217 : DCHECK_EQ(store_position, kNoStore);
1218 : DCHECK(!clear);
1219 : break;
1220 : }
1221 : case ActionNode::INCREMENT_REGISTER:
1222 3744 : if (!absolute) {
1223 3744 : value++;
1224 : }
1225 : DCHECK_EQ(store_position, kNoStore);
1226 : DCHECK(!clear);
1227 : undo_action = RESTORE;
1228 : break;
1229 : case ActionNode::STORE_POSITION: {
1230 : Trace::DeferredCapture* pc =
1231 : static_cast<Trace::DeferredCapture*>(action);
1232 409055 : if (!clear && store_position == kNoStore) {
1233 : store_position = pc->cp_offset();
1234 : }
1235 :
1236 : // For captures we know that stores and clears alternate.
1237 : // Other register, are never cleared, and if the occur
1238 : // inside a loop, they might be assigned more than once.
1239 409055 : if (reg <= 1) {
1240 : // Registers zero and one, aka "capture zero", is
1241 : // always set correctly if we succeed. There is no
1242 : // need to undo a setting on backtrack, because we
1243 : // will set it again or fail.
1244 : undo_action = IGNORE;
1245 : } else {
1246 189899 : undo_action = pc->is_capture() ? CLEAR : RESTORE;
1247 : }
1248 : DCHECK(!absolute);
1249 : DCHECK_EQ(value, 0);
1250 : break;
1251 : }
1252 : case ActionNode::CLEAR_CAPTURES: {
1253 : // Since we're scanning in reverse order, if we've already
1254 : // set the position we have to ignore historically earlier
1255 : // clearing operations.
1256 45072 : if (store_position == kNoStore) {
1257 : clear = true;
1258 : }
1259 : undo_action = RESTORE;
1260 : DCHECK(!absolute);
1261 : DCHECK_EQ(value, 0);
1262 : break;
1263 : }
1264 : default:
1265 0 : UNREACHABLE();
1266 : break;
1267 : }
1268 : }
1269 : }
1270 : // Prepare for the undo-action (e.g., push if it's going to be popped).
1271 454777 : if (undo_action == RESTORE) {
1272 52884 : pushes++;
1273 : RegExpMacroAssembler::StackCheckFlag stack_check =
1274 : RegExpMacroAssembler::kNoStackLimitCheck;
1275 52884 : if (pushes == push_limit) {
1276 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
1277 : pushes = 0;
1278 : }
1279 :
1280 52884 : assembler->PushRegister(reg, stack_check);
1281 52884 : registers_to_pop->Set(reg, zone);
1282 401893 : } else if (undo_action == CLEAR) {
1283 182737 : registers_to_clear->Set(reg, zone);
1284 : }
1285 : // Perform the chronologically last action (or accumulated increment)
1286 : // for the register.
1287 454777 : if (store_position != kNoStore) {
1288 409055 : assembler->WriteCurrentPositionToRegister(reg, store_position);
1289 45722 : } else if (clear) {
1290 38513 : assembler->ClearRegisters(reg, reg);
1291 7209 : } else if (absolute) {
1292 3465 : assembler->SetRegister(reg, value);
1293 3744 : } else if (value != 0) {
1294 3744 : assembler->AdvanceRegister(reg, value);
1295 : }
1296 : }
1297 510785 : }
1298 :
1299 :
1300 : // This is called as we come into a loop choice node and some other tricky
1301 : // nodes. It normalizes the state of the code generator to ensure we can
1302 : // generate generic code.
1303 700054 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1304 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1305 :
1306 : DCHECK(!is_trivial());
1307 :
1308 700054 : if (actions_ == nullptr && backtrack() == nullptr) {
1309 : // Here we just have some deferred cp advances to fix and we are back to
1310 : // a normal situation. We may also have to forget some information gained
1311 : // through a quick check that was already performed.
1312 189269 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1313 : // Create a new trivial state and generate the node with that.
1314 : Trace new_state;
1315 189269 : successor->Emit(compiler, &new_state);
1316 : return;
1317 : }
1318 :
1319 : // Generate deferred actions here along with code to undo them again.
1320 : OutSet affected_registers;
1321 :
1322 510785 : if (backtrack() != nullptr) {
1323 : // Here we have a concrete backtrack location. These are set up by choice
1324 : // nodes and so they indicate that we have a deferred save of the current
1325 : // position which we may need to emit here.
1326 400521 : assembler->PushCurrentPosition();
1327 : }
1328 :
1329 : int max_register = FindAffectedRegisters(&affected_registers,
1330 510785 : compiler->zone());
1331 : OutSet registers_to_pop;
1332 : OutSet registers_to_clear;
1333 : PerformDeferredActions(assembler,
1334 : max_register,
1335 : affected_registers,
1336 : ®isters_to_pop,
1337 : ®isters_to_clear,
1338 510785 : compiler->zone());
1339 510785 : if (cp_offset_ != 0) {
1340 294792 : assembler->AdvanceCurrentPosition(cp_offset_);
1341 : }
1342 :
1343 : // Create a new trivial state and generate the node with that.
1344 510785 : Label undo;
1345 510785 : assembler->PushBacktrack(&undo);
1346 510785 : if (successor->KeepRecursing(compiler)) {
1347 : Trace new_state;
1348 137709 : successor->Emit(compiler, &new_state);
1349 : } else {
1350 373076 : compiler->AddWork(successor);
1351 746152 : assembler->GoTo(successor->label());
1352 : }
1353 :
1354 : // On backtrack we need to restore state.
1355 510785 : assembler->Bind(&undo);
1356 : RestoreAffectedRegisters(assembler,
1357 : max_register,
1358 : registers_to_pop,
1359 510785 : registers_to_clear);
1360 510785 : if (backtrack() == nullptr) {
1361 110264 : assembler->Backtrack();
1362 : } else {
1363 400521 : assembler->PopCurrentPosition();
1364 400521 : assembler->GoTo(backtrack());
1365 : }
1366 : }
1367 :
1368 :
1369 2843 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1370 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1371 :
1372 : // Omit flushing the trace. We discard the entire stack frame anyway.
1373 :
1374 2843 : if (!label()->is_bound()) {
1375 : // We are completely independent of the trace, since we ignore it,
1376 : // so this code can be used as the generic version.
1377 5604 : assembler->Bind(label());
1378 : }
1379 :
1380 : // Throw away everything on the backtrack stack since the start
1381 : // of the negative submatch and restore the character position.
1382 2843 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
1383 2843 : assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1384 2843 : if (clear_capture_count_ > 0) {
1385 : // Clear any captures that might have been performed during the success
1386 : // of the body of the negative look-ahead.
1387 107 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1388 107 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1389 : }
1390 : // Now that we have unwound the stack we find at the top of the stack the
1391 : // backtrack that the BeginSubmatch node got.
1392 2843 : assembler->Backtrack();
1393 2843 : }
1394 :
1395 :
1396 182600 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1397 182600 : if (!trace->is_trivial()) {
1398 91150 : trace->Flush(compiler, this);
1399 91150 : return;
1400 : }
1401 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1402 91450 : if (!label()->is_bound()) {
1403 170602 : assembler->Bind(label());
1404 : }
1405 91450 : switch (action_) {
1406 : case ACCEPT:
1407 91150 : assembler->Succeed();
1408 91150 : return;
1409 : case BACKTRACK:
1410 300 : assembler->GoTo(trace->backtrack());
1411 300 : return;
1412 : case NEGATIVE_SUBMATCH_SUCCESS:
1413 : // This case is handled in a different virtual method.
1414 0 : UNREACHABLE();
1415 : }
1416 0 : UNIMPLEMENTED();
1417 : }
1418 :
1419 :
1420 903936 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1421 1807872 : if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1422 903936 : guards_->Add(guard, zone);
1423 903936 : }
1424 :
1425 :
1426 903359 : ActionNode* ActionNode::SetRegister(int reg,
1427 : int val,
1428 : RegExpNode* on_success) {
1429 : ActionNode* result =
1430 : new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1431 903359 : result->data_.u_store_register.reg = reg;
1432 903359 : result->data_.u_store_register.value = val;
1433 903359 : return result;
1434 : }
1435 :
1436 :
1437 903359 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1438 : ActionNode* result =
1439 : new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1440 903359 : result->data_.u_increment_register.reg = reg;
1441 903359 : return result;
1442 : }
1443 :
1444 :
1445 226183 : ActionNode* ActionNode::StorePosition(int reg,
1446 : bool is_capture,
1447 : RegExpNode* on_success) {
1448 : ActionNode* result =
1449 : new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1450 226183 : result->data_.u_position_register.reg = reg;
1451 226183 : result->data_.u_position_register.is_capture = is_capture;
1452 226183 : return result;
1453 : }
1454 :
1455 :
1456 2376 : ActionNode* ActionNode::ClearCaptures(Interval range,
1457 : RegExpNode* on_success) {
1458 : ActionNode* result =
1459 : new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1460 2376 : result->data_.u_clear_captures.range_from = range.from();
1461 2376 : result->data_.u_clear_captures.range_to = range.to();
1462 2376 : return result;
1463 : }
1464 :
1465 :
1466 4467 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1467 : int position_reg,
1468 : RegExpNode* on_success) {
1469 : ActionNode* result =
1470 : new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1471 4467 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1472 4467 : result->data_.u_submatch.current_position_register = position_reg;
1473 4467 : return result;
1474 : }
1475 :
1476 :
1477 1655 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1478 : int position_reg,
1479 : int clear_register_count,
1480 : int clear_register_from,
1481 : RegExpNode* on_success) {
1482 : ActionNode* result =
1483 : new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1484 1655 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1485 1655 : result->data_.u_submatch.current_position_register = position_reg;
1486 1655 : result->data_.u_submatch.clear_register_count = clear_register_count;
1487 1655 : result->data_.u_submatch.clear_register_from = clear_register_from;
1488 1655 : return result;
1489 : }
1490 :
1491 :
1492 537 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1493 : int repetition_register,
1494 : int repetition_limit,
1495 : RegExpNode* on_success) {
1496 : ActionNode* result =
1497 : new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1498 537 : result->data_.u_empty_match_check.start_register = start_register;
1499 537 : result->data_.u_empty_match_check.repetition_register = repetition_register;
1500 537 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1501 537 : return result;
1502 : }
1503 :
1504 :
1505 : #define DEFINE_ACCEPT(Type) \
1506 : void Type##Node::Accept(NodeVisitor* visitor) { \
1507 : visitor->Visit##Type(this); \
1508 : }
1509 725523 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1510 : #undef DEFINE_ACCEPT
1511 :
1512 :
1513 145800 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1514 145800 : visitor->VisitLoopChoice(this);
1515 145800 : }
1516 :
1517 :
1518 : // -------------------------------------------------------------------
1519 : // Emit code.
1520 :
1521 :
1522 3936 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1523 : Guard* guard,
1524 : Trace* trace) {
1525 3936 : switch (guard->op()) {
1526 : case Guard::LT:
1527 : DCHECK(!trace->mentions_reg(guard->reg()));
1528 : macro_assembler->IfRegisterGE(guard->reg(),
1529 : guard->value(),
1530 2614 : trace->backtrack());
1531 2614 : break;
1532 : case Guard::GEQ:
1533 : DCHECK(!trace->mentions_reg(guard->reg()));
1534 : macro_assembler->IfRegisterLT(guard->reg(),
1535 : guard->value(),
1536 1322 : trace->backtrack());
1537 1322 : break;
1538 : }
1539 3936 : }
1540 :
1541 :
1542 : // Returns the number of characters in the equivalence class, omitting those
1543 : // that cannot occur in the source string because it is Latin1.
1544 21870 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1545 : bool one_byte_subject,
1546 : unibrow::uchar* letters) {
1547 : int length =
1548 43740 : isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1549 : // Unibrow returns 0 or 1 for characters where case independence is
1550 : // trivial.
1551 21870 : if (length == 0) {
1552 2773 : letters[0] = character;
1553 : length = 1;
1554 : }
1555 :
1556 21870 : if (one_byte_subject) {
1557 : int new_length = 0;
1558 80415 : for (int i = 0; i < length; i++) {
1559 31770 : if (letters[i] <= String::kMaxOneByteCharCode) {
1560 31360 : letters[new_length++] = letters[i];
1561 : }
1562 : }
1563 : length = new_length;
1564 : }
1565 :
1566 21870 : return length;
1567 : }
1568 :
1569 :
1570 584679 : static inline bool EmitSimpleCharacter(Isolate* isolate,
1571 : RegExpCompiler* compiler,
1572 : uc16 c,
1573 : Label* on_failure,
1574 : int cp_offset,
1575 : bool check,
1576 : bool preloaded) {
1577 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1578 : bool bound_checked = false;
1579 584679 : if (!preloaded) {
1580 584679 : assembler->LoadCurrentCharacter(
1581 : cp_offset,
1582 : on_failure,
1583 1169358 : check);
1584 : bound_checked = true;
1585 : }
1586 584679 : assembler->CheckNotCharacter(c, on_failure);
1587 584679 : return bound_checked;
1588 : }
1589 :
1590 :
1591 : // Only emits non-letters (things that don't have case). Only used for case
1592 : // independent matches.
1593 5503 : static inline bool EmitAtomNonLetter(Isolate* isolate,
1594 : RegExpCompiler* compiler,
1595 : uc16 c,
1596 : Label* on_failure,
1597 : int cp_offset,
1598 : bool check,
1599 : bool preloaded) {
1600 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1601 : bool one_byte = compiler->one_byte();
1602 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1603 5503 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1604 5503 : if (length < 1) {
1605 : // This can't match. Must be an one-byte subject and a non-one-byte
1606 : // character. We do not need to do anything since the one-byte pass
1607 : // already handled this.
1608 : return false; // Bounds not checked.
1609 : }
1610 : bool checked = false;
1611 : // We handle the length > 1 case in a later pass.
1612 5498 : if (length == 1) {
1613 371 : if (one_byte && c > String::kMaxOneByteCharCodeU) {
1614 : // Can't match - see above.
1615 : return false; // Bounds not checked.
1616 : }
1617 371 : if (!preloaded) {
1618 371 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1619 : checked = check;
1620 : }
1621 371 : macro_assembler->CheckNotCharacter(c, on_failure);
1622 : }
1623 : return checked;
1624 : }
1625 :
1626 :
1627 4785 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1628 : bool one_byte, uc16 c1, uc16 c2,
1629 : Label* on_failure) {
1630 : uc16 char_mask;
1631 4785 : if (one_byte) {
1632 : char_mask = String::kMaxOneByteCharCode;
1633 : } else {
1634 : char_mask = String::kMaxUtf16CodeUnit;
1635 : }
1636 4785 : uc16 exor = c1 ^ c2;
1637 : // Check whether exor has only one bit set.
1638 4785 : if (((exor - 1) & exor) == 0) {
1639 : // If c1 and c2 differ only by one bit.
1640 : // Ecma262UnCanonicalize always gives the highest number last.
1641 : DCHECK(c2 > c1);
1642 4690 : uc16 mask = char_mask ^ exor;
1643 4690 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1644 4690 : return true;
1645 : }
1646 : DCHECK(c2 > c1);
1647 95 : uc16 diff = c2 - c1;
1648 95 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
1649 : // If the characters differ by 2^n but don't differ by one bit then
1650 : // subtract the difference from the found character, then do the or
1651 : // trick. We avoid the theoretical case where negative numbers are
1652 : // involved in order to simplify code generation.
1653 85 : uc16 mask = char_mask ^ diff;
1654 85 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1655 : diff,
1656 : mask,
1657 170 : on_failure);
1658 85 : return true;
1659 : }
1660 : return false;
1661 : }
1662 :
1663 :
1664 : typedef bool EmitCharacterFunction(Isolate* isolate,
1665 : RegExpCompiler* compiler,
1666 : uc16 c,
1667 : Label* on_failure,
1668 : int cp_offset,
1669 : bool check,
1670 : bool preloaded);
1671 :
1672 : // Only emits letters (things that have case). Only used for case independent
1673 : // matches.
1674 5503 : static inline bool EmitAtomLetter(Isolate* isolate,
1675 : RegExpCompiler* compiler,
1676 : uc16 c,
1677 : Label* on_failure,
1678 : int cp_offset,
1679 : bool check,
1680 : bool preloaded) {
1681 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1682 : bool one_byte = compiler->one_byte();
1683 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1684 5503 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1685 5503 : if (length <= 1) return false;
1686 : // We may not need to check against the end of the input string
1687 : // if this character lies before a character that matched.
1688 5127 : if (!preloaded) {
1689 4792 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1690 : }
1691 5127 : Label ok;
1692 : DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1693 5127 : switch (length) {
1694 : case 2: {
1695 4785 : if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1696 4785 : chars[1], on_failure)) {
1697 : } else {
1698 10 : macro_assembler->CheckCharacter(chars[0], &ok);
1699 10 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
1700 10 : macro_assembler->Bind(&ok);
1701 : }
1702 : break;
1703 : }
1704 : case 4:
1705 25 : macro_assembler->CheckCharacter(chars[3], &ok);
1706 : V8_FALLTHROUGH;
1707 : case 3:
1708 342 : macro_assembler->CheckCharacter(chars[0], &ok);
1709 342 : macro_assembler->CheckCharacter(chars[1], &ok);
1710 342 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
1711 342 : macro_assembler->Bind(&ok);
1712 342 : break;
1713 : default:
1714 0 : UNREACHABLE();
1715 : break;
1716 : }
1717 : return true;
1718 : }
1719 :
1720 :
1721 8607 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1722 : int border,
1723 : Label* fall_through,
1724 : Label* above_or_equal,
1725 : Label* below) {
1726 8607 : if (below != fall_through) {
1727 8243 : masm->CheckCharacterLT(border, below);
1728 8243 : if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1729 : } else {
1730 364 : masm->CheckCharacterGT(border - 1, above_or_equal);
1731 : }
1732 8607 : }
1733 :
1734 :
1735 158958 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1736 : int first,
1737 : int last,
1738 : Label* fall_through,
1739 : Label* in_range,
1740 : Label* out_of_range) {
1741 158958 : if (in_range == fall_through) {
1742 107375 : if (first == last) {
1743 14492 : masm->CheckNotCharacter(first, out_of_range);
1744 : } else {
1745 92883 : masm->CheckCharacterNotInRange(first, last, out_of_range);
1746 : }
1747 : } else {
1748 51583 : if (first == last) {
1749 28274 : masm->CheckCharacter(first, in_range);
1750 : } else {
1751 23309 : masm->CheckCharacterInRange(first, last, in_range);
1752 : }
1753 51583 : if (out_of_range != fall_through) masm->GoTo(out_of_range);
1754 : }
1755 158958 : }
1756 :
1757 :
1758 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1759 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1760 5861 : static void EmitUseLookupTable(
1761 : RegExpMacroAssembler* masm,
1762 : ZoneList<int>* ranges,
1763 : int start_index,
1764 : int end_index,
1765 : int min_char,
1766 : Label* fall_through,
1767 : Label* even_label,
1768 : Label* odd_label) {
1769 : static const int kSize = RegExpMacroAssembler::kTableSize;
1770 : static const int kMask = RegExpMacroAssembler::kTableMask;
1771 :
1772 : int base = (min_char & ~kMask);
1773 : USE(base);
1774 :
1775 : // Assert that everything is on one kTableSize page.
1776 : for (int i = start_index; i <= end_index; i++) {
1777 : DCHECK_EQ(ranges->at(i) & ~kMask, base);
1778 : }
1779 : DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1780 :
1781 : char templ[kSize];
1782 : Label* on_bit_set;
1783 : Label* on_bit_clear;
1784 : int bit;
1785 5861 : if (even_label == fall_through) {
1786 : on_bit_set = odd_label;
1787 : on_bit_clear = even_label;
1788 : bit = 1;
1789 : } else {
1790 : on_bit_set = even_label;
1791 : on_bit_clear = odd_label;
1792 : bit = 0;
1793 : }
1794 252413 : for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1795 123276 : templ[i] = bit;
1796 : }
1797 : int j = 0;
1798 5861 : bit ^= 1;
1799 95816 : for (int i = start_index; i < end_index; i++) {
1800 1204670 : for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1801 512380 : templ[j] = bit;
1802 : }
1803 89955 : bit ^= 1;
1804 : }
1805 234965 : for (int i = j; i < kSize; i++) {
1806 114552 : templ[i] = bit;
1807 : }
1808 : Factory* factory = masm->isolate()->factory();
1809 : // TODO(erikcorry): Cache these.
1810 5861 : Handle<ByteArray> ba = factory->NewByteArray(kSize, AllocationType::kOld);
1811 1506277 : for (int i = 0; i < kSize; i++) {
1812 750208 : ba->set(i, templ[i]);
1813 : }
1814 5861 : masm->CheckBitInTable(ba, on_bit_set);
1815 5861 : if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1816 5861 : }
1817 :
1818 :
1819 35749 : static void CutOutRange(RegExpMacroAssembler* masm,
1820 : ZoneList<int>* ranges,
1821 : int start_index,
1822 : int end_index,
1823 : int cut_index,
1824 : Label* even_label,
1825 : Label* odd_label) {
1826 35749 : bool odd = (((cut_index - start_index) & 1) == 1);
1827 35749 : Label* in_range_label = odd ? odd_label : even_label;
1828 35749 : Label dummy;
1829 71498 : EmitDoubleBoundaryTest(masm,
1830 : ranges->at(cut_index),
1831 71498 : ranges->at(cut_index + 1) - 1,
1832 : &dummy,
1833 : in_range_label,
1834 35749 : &dummy);
1835 : DCHECK(!dummy.is_linked());
1836 : // Cut out the single range by rewriting the array. This creates a new
1837 : // range that is a merger of the two ranges on either side of the one we
1838 : // are cutting out. The oddity of the labels is preserved.
1839 74921 : for (int j = cut_index; j > start_index; j--) {
1840 39172 : ranges->at(j) = ranges->at(j - 1);
1841 : }
1842 181431 : for (int j = cut_index + 1; j < end_index; j++) {
1843 145682 : ranges->at(j) = ranges->at(j + 1);
1844 : }
1845 35749 : }
1846 :
1847 :
1848 : // Unicode case. Split the search space into kSize spaces that are handled
1849 : // with recursion.
1850 19767 : static void SplitSearchSpace(ZoneList<int>* ranges,
1851 : int start_index,
1852 : int end_index,
1853 : int* new_start_index,
1854 : int* new_end_index,
1855 : int* border) {
1856 : static const int kSize = RegExpMacroAssembler::kTableSize;
1857 : static const int kMask = RegExpMacroAssembler::kTableMask;
1858 :
1859 19767 : int first = ranges->at(start_index);
1860 19767 : int last = ranges->at(end_index) - 1;
1861 :
1862 19767 : *new_start_index = start_index;
1863 19767 : *border = (ranges->at(start_index) & ~kMask) + kSize;
1864 278407 : while (*new_start_index < end_index) {
1865 147860 : if (ranges->at(*new_start_index) > *border) break;
1866 129320 : (*new_start_index)++;
1867 : }
1868 : // new_start_index is the index of the first edge that is beyond the
1869 : // current kSize space.
1870 :
1871 : // For very large search spaces we do a binary chop search of the non-Latin1
1872 : // space instead of just going to the end of the current kSize space. The
1873 : // heuristics are complicated a little by the fact that any 128-character
1874 : // encoding space can be quickly tested with a table lookup, so we don't
1875 : // wish to do binary chop search at a smaller granularity than that. A
1876 : // 128-character space can take up a lot of space in the ranges array if,
1877 : // for example, we only want to match every second character (eg. the lower
1878 : // case characters on some Unicode pages).
1879 19767 : int binary_chop_index = (end_index + start_index) / 2;
1880 : // The first test ensures that we get to the code that handles the Latin1
1881 : // range with a single not-taken branch, speeding up this important
1882 : // character range (even non-Latin1 charset-based text has spaces and
1883 : // punctuation).
1884 54426 : if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1885 27629 : end_index - start_index > (*new_start_index - start_index) * 2 &&
1886 56007 : last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1887 23248 : ranges->at(binary_chop_index) >= first + 2 * kSize) {
1888 : int scan_forward_for_section_border = binary_chop_index;;
1889 9782 : int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1890 :
1891 124026 : while (scan_forward_for_section_border < end_index) {
1892 65012 : if (ranges->at(scan_forward_for_section_border) > new_border) {
1893 7890 : *new_start_index = scan_forward_for_section_border;
1894 7890 : *border = new_border;
1895 7890 : break;
1896 : }
1897 57122 : scan_forward_for_section_border++;
1898 : }
1899 : }
1900 :
1901 : DCHECK(*new_start_index > start_index);
1902 19767 : *new_end_index = *new_start_index - 1;
1903 19767 : if (ranges->at(*new_end_index) == *border) {
1904 2958 : (*new_end_index)--;
1905 : }
1906 39534 : if (*border >= ranges->at(end_index)) {
1907 1225 : *border = ranges->at(end_index);
1908 1225 : *new_start_index = end_index; // Won't be used.
1909 1225 : *new_end_index = end_index - 1;
1910 : }
1911 19767 : }
1912 :
1913 : // Gets a series of segment boundaries representing a character class. If the
1914 : // character is in the range between an even and an odd boundary (counting from
1915 : // start_index) then go to even_label, otherwise go to odd_label. We already
1916 : // know that the character is in the range of min_char to max_char inclusive.
1917 : // Either label can be nullptr indicating backtracking. Either label can also
1918 : // be equal to the fall_through label.
1919 201209 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1920 : int start_index, int end_index, uc32 min_char,
1921 : uc32 max_char, Label* fall_through,
1922 : Label* even_label, Label* odd_label) {
1923 : DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1924 : DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1925 :
1926 201209 : int first = ranges->at(start_index);
1927 201209 : int last = ranges->at(end_index) - 1;
1928 :
1929 : DCHECK_LT(min_char, first);
1930 :
1931 : // Just need to test if the character is before or on-or-after
1932 : // a particular character.
1933 201209 : if (start_index == end_index) {
1934 8607 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1935 8607 : return;
1936 : }
1937 :
1938 : // Another almost trivial case: There is one interval in the middle that is
1939 : // different from the end intervals.
1940 192602 : if (start_index + 1 == end_index) {
1941 : EmitDoubleBoundaryTest(
1942 123209 : masm, first, last, fall_through, even_label, odd_label);
1943 123209 : return;
1944 : }
1945 :
1946 : // It's not worth using table lookup if there are very few intervals in the
1947 : // character class.
1948 69393 : if (end_index - start_index <= 6) {
1949 : // It is faster to test for individual characters, so we look for those
1950 : // first, then try arbitrary ranges in the second round.
1951 : static int kNoCutIndex = -1;
1952 35749 : int cut = kNoCutIndex;
1953 109745 : for (int i = start_index; i < end_index; i++) {
1954 185992 : if (ranges->at(i) == ranges->at(i + 1) - 1) {
1955 : cut = i;
1956 : break;
1957 : }
1958 : }
1959 35749 : if (cut == kNoCutIndex) cut = start_index;
1960 : CutOutRange(
1961 35749 : masm, ranges, start_index, end_index, cut, even_label, odd_label);
1962 : DCHECK_GE(end_index - start_index, 2);
1963 35749 : GenerateBranches(masm,
1964 : ranges,
1965 : start_index + 1,
1966 : end_index - 1,
1967 : min_char,
1968 : max_char,
1969 : fall_through,
1970 : even_label,
1971 35749 : odd_label);
1972 35749 : return;
1973 : }
1974 :
1975 : // If there are a lot of intervals in the regexp, then we will use tables to
1976 : // determine whether the character is inside or outside the character class.
1977 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1978 :
1979 33644 : if ((max_char >> kBits) == (min_char >> kBits)) {
1980 : EmitUseLookupTable(masm,
1981 : ranges,
1982 : start_index,
1983 : end_index,
1984 : min_char,
1985 : fall_through,
1986 : even_label,
1987 5861 : odd_label);
1988 5861 : return;
1989 : }
1990 :
1991 27783 : if ((min_char >> kBits) != (first >> kBits)) {
1992 8016 : masm->CheckCharacterLT(first, odd_label);
1993 : GenerateBranches(masm,
1994 : ranges,
1995 : start_index + 1,
1996 : end_index,
1997 : first,
1998 : max_char,
1999 : fall_through,
2000 : odd_label,
2001 8016 : even_label);
2002 8016 : return;
2003 : }
2004 :
2005 19767 : int new_start_index = 0;
2006 19767 : int new_end_index = 0;
2007 19767 : int border = 0;
2008 :
2009 : SplitSearchSpace(ranges,
2010 : start_index,
2011 : end_index,
2012 : &new_start_index,
2013 : &new_end_index,
2014 19767 : &border);
2015 :
2016 19767 : Label handle_rest;
2017 : Label* above = &handle_rest;
2018 19767 : if (border == last + 1) {
2019 : // We didn't find any section that started after the limit, so everything
2020 : // above the border is one of the terminal labels.
2021 1225 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2022 : DCHECK(new_end_index == end_index - 1);
2023 : }
2024 :
2025 : DCHECK_LE(start_index, new_end_index);
2026 : DCHECK_LE(new_start_index, end_index);
2027 : DCHECK_LT(start_index, new_start_index);
2028 : DCHECK_LT(new_end_index, end_index);
2029 : DCHECK(new_end_index + 1 == new_start_index ||
2030 : (new_end_index + 2 == new_start_index &&
2031 : border == ranges->at(new_end_index + 1)));
2032 : DCHECK_LT(min_char, border - 1);
2033 : DCHECK_LT(border, max_char);
2034 : DCHECK_LT(ranges->at(new_end_index), border);
2035 : DCHECK(border < ranges->at(new_start_index) ||
2036 : (border == ranges->at(new_start_index) &&
2037 : new_start_index == end_index &&
2038 : new_end_index == end_index - 1 &&
2039 : border == last + 1));
2040 : DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2041 :
2042 19767 : masm->CheckCharacterGT(border - 1, above);
2043 19767 : Label dummy;
2044 19767 : GenerateBranches(masm,
2045 : ranges,
2046 : start_index,
2047 : new_end_index,
2048 : min_char,
2049 : border - 1,
2050 : &dummy,
2051 : even_label,
2052 19767 : odd_label);
2053 19767 : if (handle_rest.is_linked()) {
2054 18542 : masm->Bind(&handle_rest);
2055 18542 : bool flip = (new_start_index & 1) != (start_index & 1);
2056 18542 : GenerateBranches(masm,
2057 : ranges,
2058 : new_start_index,
2059 : end_index,
2060 : border,
2061 : max_char,
2062 : &dummy,
2063 : flip ? odd_label : even_label,
2064 18542 : flip ? even_label : odd_label);
2065 : }
2066 : }
2067 :
2068 :
2069 210321 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2070 : RegExpCharacterClass* cc, bool one_byte,
2071 : Label* on_failure, int cp_offset, bool check_offset,
2072 : bool preloaded, Zone* zone) {
2073 : ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2074 210321 : CharacterRange::Canonicalize(ranges);
2075 :
2076 : int max_char;
2077 210321 : if (one_byte) {
2078 : max_char = String::kMaxOneByteCharCode;
2079 : } else {
2080 : max_char = String::kMaxUtf16CodeUnit;
2081 : }
2082 :
2083 : int range_count = ranges->length();
2084 :
2085 210321 : int last_valid_range = range_count - 1;
2086 571569 : while (last_valid_range >= 0) {
2087 : CharacterRange& range = ranges->at(last_valid_range);
2088 390910 : if (range.from() <= max_char) {
2089 : break;
2090 : }
2091 180624 : last_valid_range--;
2092 : }
2093 :
2094 210321 : if (last_valid_range < 0) {
2095 35 : if (!cc->is_negated()) {
2096 10 : macro_assembler->GoTo(on_failure);
2097 : }
2098 35 : if (check_offset) {
2099 33 : macro_assembler->CheckPosition(cp_offset, on_failure);
2100 : }
2101 91186 : return;
2102 : }
2103 :
2104 398551 : if (last_valid_range == 0 &&
2105 : ranges->at(0).IsEverything(max_char)) {
2106 82556 : if (cc->is_negated()) {
2107 31 : macro_assembler->GoTo(on_failure);
2108 : } else {
2109 : // This is a common case hit by non-anchored expressions.
2110 82525 : if (check_offset) {
2111 53744 : macro_assembler->CheckPosition(cp_offset, on_failure);
2112 : }
2113 : }
2114 : return;
2115 : }
2116 :
2117 127730 : if (!preloaded) {
2118 115595 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2119 : }
2120 :
2121 138399 : if (cc->is_standard(zone) &&
2122 10669 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2123 10669 : on_failure)) {
2124 : return;
2125 : }
2126 :
2127 :
2128 : // A new list with ascending entries. Each entry is a code unit
2129 : // where there is a boundary between code units that are part of
2130 : // the class and code units that are not. Normally we insert an
2131 : // entry at zero which goes to the failure label, but if there
2132 : // was already one there we fall through for success on that entry.
2133 : // Subsequent entries have alternating meaning (success/failure).
2134 : ZoneList<int>* range_boundaries =
2135 : new(zone) ZoneList<int>(last_valid_range, zone);
2136 :
2137 119135 : bool zeroth_entry_is_failure = !cc->is_negated();
2138 :
2139 560951 : for (int i = 0; i <= last_valid_range; i++) {
2140 : CharacterRange& range = ranges->at(i);
2141 220908 : if (range.from() == 0) {
2142 : DCHECK_EQ(i, 0);
2143 3340 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
2144 : } else {
2145 217568 : range_boundaries->Add(range.from(), zone);
2146 : }
2147 220908 : range_boundaries->Add(range.to() + 1, zone);
2148 : }
2149 119135 : int end_index = range_boundaries->length() - 1;
2150 119135 : if (range_boundaries->at(end_index) > max_char) {
2151 3938 : end_index--;
2152 : }
2153 :
2154 119135 : Label fall_through;
2155 119135 : GenerateBranches(macro_assembler,
2156 : range_boundaries,
2157 : 0, // start_index.
2158 : end_index,
2159 : 0, // min_char.
2160 : max_char,
2161 : &fall_through,
2162 : zeroth_entry_is_failure ? &fall_through : on_failure,
2163 119135 : zeroth_entry_is_failure ? on_failure : &fall_through);
2164 119135 : macro_assembler->Bind(&fall_through);
2165 : }
2166 :
// Out-of-line definition of the defaulted RegExpNode destructor.
2167             : RegExpNode::~RegExpNode() = default;
2168 :
2169 1696687 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2170 : Trace* trace) {
2171 : // If we are generating a greedy loop then don't stop and don't reuse code.
2172 1696687 : if (trace->stop_node() != nullptr) {
2173 : return CONTINUE;
2174 : }
2175 :
2176 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2177 1685008 : if (trace->is_trivial()) {
2178 1059316 : if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2179 : // If a generic version is already scheduled to be generated or we have
2180 : // recursed too deeply then just generate a jump to that code.
2181 219532 : macro_assembler->GoTo(&label_);
2182 : // This will queue it up for generation of a generic version if it hasn't
2183 : // already been queued.
2184 219532 : compiler->AddWork(this);
2185 219532 : return DONE;
2186 : }
2187 : // Generate generic version of the node and bind the label for later use.
2188 391381 : macro_assembler->Bind(&label_);
2189 391381 : return CONTINUE;
2190 : }
2191 :
2192 : // We are being asked to make a non-generic version. Keep track of how many
2193 : // non-generic versions we generate so as not to overdo it.
2194 1074095 : trace_count_++;
2195 1074095 : if (KeepRecursing(compiler) && compiler->optimize() &&
2196 : trace_count_ < kMaxCopiesCodeGenerated) {
2197 : return CONTINUE;
2198 : }
2199 :
2200 : // If we get here code has been generated for this node too many times or
2201 : // recursion is too deep. Time to switch to a generic version. The code for
2202 : // generic versions above can handle deep recursion properly.
2203 : bool was_limiting = compiler->limiting_recursion();
2204 : compiler->set_limiting_recursion(true);
2205 477635 : trace->Flush(compiler, this);
2206 : compiler->set_limiting_recursion(was_limiting);
2207 477635 : return DONE;
2208 : }
2209 :
2210 :
2211 0 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2212 2033283 : return !compiler->limiting_recursion() &&
2213 0 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2214 : }
2215 :
2216 :
2217 583324 : int ActionNode::EatsAtLeast(int still_to_find,
2218 : int budget,
2219 : bool not_at_start) {
2220 583324 : if (budget <= 0) return 0;
2221 570316 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2222 565178 : return on_success()->EatsAtLeast(still_to_find,
2223 : budget - 1,
2224 1130356 : not_at_start);
2225 : }
2226 :
2227 :
2228 90541 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2229 : BoyerMooreLookahead* bm, bool not_at_start) {
2230 90541 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2231 90541 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2232 : }
2233 : SaveBMInfo(bm, not_at_start, offset);
2234 90541 : }
2235 :
2236 :
2237 10161 : int AssertionNode::EatsAtLeast(int still_to_find,
2238 : int budget,
2239 : bool not_at_start) {
2240 10161 : if (budget <= 0) return 0;
2241 : // If we know we are not at the start and we are asked "how many characters
2242 : // will you match if you succeed?" then we can answer anything since false
2243 : // implies false. So lets just return the max answer (still_to_find) since
2244 : // that won't prevent us from preloading a lot of characters for the other
2245 : // branches in the node graph.
2246 9152 : if (assertion_type() == AT_START && not_at_start) return still_to_find;
2247 8930 : return on_success()->EatsAtLeast(still_to_find,
2248 : budget - 1,
2249 17860 : not_at_start);
2250 : }
2251 :
2252 :
2253 379 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2254 : BoyerMooreLookahead* bm, bool not_at_start) {
2255 : // Match the behaviour of EatsAtLeast on this node.
2256 379 : if (assertion_type() == AT_START && not_at_start) return;
2257 363 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2258 : SaveBMInfo(bm, not_at_start, offset);
2259 : }
2260 :
2261 :
2262 3111 : int BackReferenceNode::EatsAtLeast(int still_to_find,
2263 : int budget,
2264 : bool not_at_start) {
2265 3111 : if (read_backward()) return 0;
2266 3001 : if (budget <= 0) return 0;
2267 3001 : return on_success()->EatsAtLeast(still_to_find,
2268 : budget - 1,
2269 6002 : not_at_start);
2270 : }
2271 :
2272 :
2273 5764439 : int TextNode::EatsAtLeast(int still_to_find,
2274 : int budget,
2275 : bool not_at_start) {
2276 5764439 : if (read_backward()) return 0;
2277 5762587 : int answer = Length();
2278 5762587 : if (answer >= still_to_find) return answer;
2279 3398629 : if (budget <= 0) return answer;
2280 : // We are not at start after this node so we set the last argument to 'true'.
2281 2373988 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
2282 : budget - 1,
2283 4747976 : true);
2284 : }
2285 :
2286 :
2287 9503 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2288 : bool not_at_start) {
2289 9503 : if (budget <= 0) return 0;
2290 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2291 : // afterwards.
2292 9291 : RegExpNode* node = alternatives_->at(1).node();
2293 9291 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2294 : }
2295 :
2296 :
2297 3556 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2298 : QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2299 : bool not_at_start) {
2300 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2301 : // afterwards.
2302 3556 : RegExpNode* node = alternatives_->at(1).node();
2303 3556 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2304 : }
2305 :
2306 :
2307 6945187 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2308 : int budget,
2309 : RegExpNode* ignore_this_node,
2310 : bool not_at_start) {
2311 6945187 : if (budget <= 0) return 0;
2312 : int min = 100;
2313 4818926 : int choice_count = alternatives_->length();
2314 4818926 : budget = (budget - 1) / choice_count;
2315 16627180 : for (int i = 0; i < choice_count; i++) {
2316 10401544 : RegExpNode* node = alternatives_->at(i).node();
2317 10401544 : if (node == ignore_this_node) continue;
2318 : int node_eats_at_least =
2319 10256739 : node->EatsAtLeast(still_to_find, budget, not_at_start);
2320 10256739 : if (node_eats_at_least < min) min = node_eats_at_least;
2321 10256739 : if (min == 0) return 0;
2322 : }
2323 : return min;
2324 : }
2325 :
2326 :
2327 153250 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
2328 : int budget,
2329 : bool not_at_start) {
2330 153250 : return EatsAtLeastHelper(still_to_find,
2331 : budget - 1,
2332 : loop_node_,
2333 153250 : not_at_start);
2334 : }
2335 :
2336 :
2337 6791937 : int ChoiceNode::EatsAtLeast(int still_to_find,
2338 : int budget,
2339 : bool not_at_start) {
2340 6791937 : return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2341 : }
2342 :
2343 :
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
// Shifting by 1, 2, 4, 8 and 16 in turn covers all 32 bit positions.
static inline uint32_t SmearBitsRight(uint32_t v) {
  for (int shift = 1; shift <= 16; shift <<= 1) {
    v |= v >> shift;
  }
  return v;
}
2353 :
2354 :
2355 270850 : bool QuickCheckDetails::Rationalize(bool asc) {
2356 : bool found_useful_op = false;
2357 : uint32_t char_mask;
2358 270850 : if (asc) {
2359 : char_mask = String::kMaxOneByteCharCode;
2360 : } else {
2361 : char_mask = String::kMaxUtf16CodeUnit;
2362 : }
2363 270850 : mask_ = 0;
2364 270850 : value_ = 0;
2365 : int char_shift = 0;
2366 1133670 : for (int i = 0; i < characters_; i++) {
2367 : Position* pos = &positions_[i];
2368 431410 : if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2369 : found_useful_op = true;
2370 : }
2371 431410 : mask_ |= (pos->mask & char_mask) << char_shift;
2372 431410 : value_ |= (pos->value & char_mask) << char_shift;
2373 431410 : char_shift += asc ? 8 : 16;
2374 : }
2375 270850 : return found_useful_op;
2376 : }
2377 :
2378 :
2379 475413 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2380 : Trace* bounds_check_trace,
2381 : Trace* trace,
2382 : bool preload_has_checked_bounds,
2383 : Label* on_possible_success,
2384 : QuickCheckDetails* details,
2385 : bool fall_through_on_failure) {
2386 475413 : if (details->characters() == 0) return false;
2387 270960 : GetQuickCheckDetails(
2388 541920 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2389 270960 : if (details->cannot_match()) return false;
2390 270850 : if (!details->Rationalize(compiler->one_byte())) return false;
2391 : DCHECK(details->characters() == 1 ||
2392 : compiler->macro_assembler()->CanReadUnaligned());
2393 : uint32_t mask = details->mask();
2394 : uint32_t value = details->value();
2395 :
2396 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2397 :
2398 226208 : if (trace->characters_preloaded() != details->characters()) {
2399 : DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2400 : // We are attempting to preload the minimum number of characters
2401 : // any choice would eat, so if the bounds check fails, then none of the
2402 : // choices can succeed, so we can just immediately backtrack, rather
2403 : // than go to the next choice.
2404 63014 : assembler->LoadCurrentCharacter(trace->cp_offset(),
2405 : bounds_check_trace->backtrack(),
2406 63014 : !preload_has_checked_bounds,
2407 126028 : details->characters());
2408 : }
2409 :
2410 :
2411 : bool need_mask = true;
2412 :
2413 226208 : if (details->characters() == 1) {
2414 : // If number of characters preloaded is 1 then we used a byte or 16 bit
2415 : // load so the value is already masked down.
2416 : uint32_t char_mask;
2417 82209 : if (compiler->one_byte()) {
2418 : char_mask = String::kMaxOneByteCharCode;
2419 : } else {
2420 : char_mask = String::kMaxUtf16CodeUnit;
2421 : }
2422 82209 : if ((mask & char_mask) == char_mask) need_mask = false;
2423 : mask &= char_mask;
2424 : } else {
2425 : // For 2-character preloads in one-byte mode or 1-character preloads in
2426 : // two-byte mode we also use a 16 bit load with zero extend.
2427 : static const uint32_t kTwoByteMask = 0xFFFF;
2428 : static const uint32_t kFourByteMask = 0xFFFFFFFF;
2429 143999 : if (details->characters() == 2 && compiler->one_byte()) {
2430 127878 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2431 16121 : } else if (details->characters() == 1 && !compiler->one_byte()) {
2432 0 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2433 : } else {
2434 16121 : if (mask == kFourByteMask) need_mask = false;
2435 : }
2436 : }
2437 :
2438 226208 : if (fall_through_on_failure) {
2439 191792 : if (need_mask) {
2440 46376 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2441 : } else {
2442 145416 : assembler->CheckCharacter(value, on_possible_success);
2443 : }
2444 : } else {
2445 34416 : if (need_mask) {
2446 3739 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2447 : } else {
2448 30677 : assembler->CheckNotCharacter(value, trace->backtrack());
2449 : }
2450 : }
2451 : return true;
2452 : }
2453 :
2454 :
2455 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
2456 : // super-class in the .h file).
2457 : //
2458 : // We iterate along the text object, building up for each character a
2459 : // mask and value that can be used to test for a quick failure to match.
2460 : // The masks and values for the positions will be combined into a single
2461 : // machine word for the current character width in order to be used in
2462 : // generating a quick check.
2463 461861 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2464 : RegExpCompiler* compiler,
2465 : int characters_filled_in,
2466 : bool not_at_start) {
2467 : // Do not collect any quick check details if the text node reads backward,
2468 : // since it reads in the opposite direction than we use for quick checks.
2469 461861 : if (read_backward()) return;
2470 : Isolate* isolate = compiler->macro_assembler()->isolate();
2471 : DCHECK(characters_filled_in < details->characters());
2472 : int characters = details->characters();
2473 : int char_mask;
2474 461861 : if (compiler->one_byte()) {
2475 : char_mask = String::kMaxOneByteCharCode;
2476 : } else {
2477 : char_mask = String::kMaxUtf16CodeUnit;
2478 : }
2479 546813 : for (int k = 0; k < elements()->length(); k++) {
2480 466390 : TextElement elm = elements()->at(k);
2481 466390 : if (elm.text_type() == TextElement::ATOM) {
2482 : Vector<const uc16> quarks = elm.atom()->data();
2483 1163457 : for (int i = 0; i < characters && i < quarks.length(); i++) {
2484 : QuickCheckDetails::Position* pos =
2485 : details->positions(characters_filled_in);
2486 949236 : uc16 c = quarks[i];
2487 474618 : if (elm.atom()->ignore_case()) {
2488 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2489 6250 : int length = GetCaseIndependentLetters(isolate, c,
2490 6250 : compiler->one_byte(), chars);
2491 6250 : if (length == 0) {
2492 : // This can happen because all case variants are non-Latin1, but we
2493 : // know the input is Latin1.
2494 : details->set_cannot_match();
2495 25 : pos->determines_perfectly = false;
2496 25 : return;
2497 : }
2498 6225 : if (length == 1) {
2499 : // This letter has no case equivalents, so it's nice and simple
2500 : // and the mask-compare will determine definitely whether we have
2501 : // a match at this character position.
2502 1227 : pos->mask = char_mask;
2503 1227 : pos->value = c;
2504 1227 : pos->determines_perfectly = true;
2505 : } else {
2506 4998 : uint32_t common_bits = char_mask;
2507 4998 : uint32_t bits = chars[0];
2508 15700 : for (int j = 1; j < length; j++) {
2509 5351 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2510 5351 : common_bits ^= differing_bits;
2511 5351 : bits &= common_bits;
2512 : }
2513 : // If length is 2 and common bits has only one zero in it then
2514 : // our mask and compare instruction will determine definitely
2515 : // whether we have a match at this character position. Otherwise
2516 : // it can only be an approximate check.
2517 4998 : uint32_t one_zero = (common_bits | ~char_mask);
2518 4998 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2519 4595 : pos->determines_perfectly = true;
2520 : }
2521 4998 : pos->mask = common_bits;
2522 4998 : pos->value = bits;
2523 : }
2524 : } else {
2525 : // Don't ignore case. Nice simple case where the mask-compare will
2526 : // determine definitely whether we have a match at this character
2527 : // position.
2528 468368 : if (c > char_mask) {
2529 : details->set_cannot_match();
2530 25 : pos->determines_perfectly = false;
2531 25 : return;
2532 : }
2533 468343 : pos->mask = char_mask;
2534 468343 : pos->value = c;
2535 468343 : pos->determines_perfectly = true;
2536 : }
2537 474568 : characters_filled_in++;
2538 : DCHECK(characters_filled_in <= details->characters());
2539 474568 : if (characters_filled_in == details->characters()) {
2540 : return;
2541 : }
2542 : }
2543 : } else {
2544 : QuickCheckDetails::Position* pos =
2545 : details->positions(characters_filled_in);
2546 : RegExpCharacterClass* tree = elm.char_class();
2547 : ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2548 : DCHECK(!ranges->is_empty());
2549 124450 : if (tree->is_negated()) {
2550 : // A quick check uses multi-character mask and compare. There is no
2551 : // useful way to incorporate a negative char class into this scheme
2552 : // so we just conservatively create a mask and value that will always
2553 : // succeed.
2554 3504 : pos->mask = 0;
2555 3504 : pos->value = 0;
2556 : } else {
2557 : int first_range = 0;
2558 120996 : while (ranges->at(first_range).from() > char_mask) {
2559 100 : first_range++;
2560 100 : if (first_range == ranges->length()) {
2561 : details->set_cannot_match();
2562 50 : pos->determines_perfectly = false;
2563 : return;
2564 : }
2565 : }
2566 120896 : CharacterRange range = ranges->at(first_range);
2567 120896 : uc16 from = range.from();
2568 120896 : uc16 to = range.to();
2569 120896 : if (to > char_mask) {
2570 15027 : to = char_mask;
2571 : }
2572 120896 : uint32_t differing_bits = (from ^ to);
2573 : // A mask and compare is only perfect if the differing bits form a
2574 : // number like 00011111 with one single block of trailing 1s.
2575 224625 : if ((differing_bits & (differing_bits + 1)) == 0 &&
2576 103729 : from + differing_bits == to) {
2577 94046 : pos->determines_perfectly = true;
2578 : }
2579 120896 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
2580 120896 : uint32_t bits = (from & common_bits);
2581 748850 : for (int i = first_range + 1; i < ranges->length(); i++) {
2582 253529 : CharacterRange range = ranges->at(i);
2583 253529 : uc16 from = range.from();
2584 253529 : uc16 to = range.to();
2585 253529 : if (from > char_mask) continue;
2586 119197 : if (to > char_mask) to = char_mask;
2587 : // Here we are combining more ranges into the mask and compare
2588 : // value. With each new range the mask becomes more sparse and
2589 : // so the chances of a false positive rise. A character class
2590 : // with multiple ranges is assumed never to be equivalent to a
2591 : // mask and compare operation.
2592 119197 : pos->determines_perfectly = false;
2593 119197 : uint32_t new_common_bits = (from ^ to);
2594 119197 : new_common_bits = ~SmearBitsRight(new_common_bits);
2595 119197 : common_bits &= new_common_bits;
2596 119197 : bits &= new_common_bits;
2597 119197 : uint32_t differing_bits = (from & common_bits) ^ bits;
2598 119197 : common_bits ^= differing_bits;
2599 119197 : bits &= common_bits;
2600 : }
2601 120896 : pos->mask = common_bits;
2602 120896 : pos->value = bits;
2603 : }
2604 124400 : characters_filled_in++;
2605 : DCHECK(characters_filled_in <= details->characters());
2606 124400 : if (characters_filled_in == details->characters()) {
2607 : return;
2608 : }
2609 : }
2610 : }
2611 : DCHECK(characters_filled_in != details->characters());
2612 37947 : if (!details->cannot_match()) {
2613 : on_success()-> GetQuickCheckDetails(details,
2614 : compiler,
2615 : characters_filled_in,
2616 37947 : true);
2617 : }
2618 : }
2619 :
2620 :
2621 0 : void QuickCheckDetails::Clear() {
2622 1796875 : for (int i = 0; i < characters_; i++) {
2623 349336 : positions_[i].mask = 0;
2624 349336 : positions_[i].value = 0;
2625 349336 : positions_[i].determines_perfectly = false;
2626 : }
2627 1098203 : characters_ = 0;
2628 0 : }
2629 :
2630 :
2631 516610 : void QuickCheckDetails::Advance(int by, bool one_byte) {
2632 516610 : if (by >= characters_ || by < 0) {
2633 : DCHECK_IMPLIES(by < 0, characters_ == 0);
2634 : Clear();
2635 : return;
2636 : }
2637 : DCHECK_LE(characters_ - by, 4);
2638 : DCHECK_LE(characters_, 4);
2639 72957 : for (int i = 0; i < characters_ - by; i++) {
2640 24886 : positions_[i] = positions_[by + i];
2641 : }
2642 72949 : for (int i = characters_ - by; i < characters_; i++) {
2643 24882 : positions_[i].mask = 0;
2644 24882 : positions_[i].value = 0;
2645 24882 : positions_[i].determines_perfectly = false;
2646 : }
2647 23185 : characters_ -= by;
2648 : // We could change mask_ and value_ here but we would never advance unless
2649 : // they had already been used in a check and they won't be used again because
2650 : // it would gain us nothing. So there's no point.
2651 : }
2652 :
2653 :
2654 156974 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2655 : DCHECK(characters_ == other->characters_);
2656 156974 : if (other->cannot_match_) {
2657 : return;
2658 : }
2659 156900 : if (cannot_match_) {
2660 247 : *this = *other;
2661 247 : return;
2662 : }
2663 504267 : for (int i = from_index; i < characters_; i++) {
2664 : QuickCheckDetails::Position* pos = positions(i);
2665 : QuickCheckDetails::Position* other_pos = other->positions(i);
2666 206414 : if (pos->mask != other_pos->mask ||
2667 42713 : pos->value != other_pos->value ||
2668 10106 : !other_pos->determines_perfectly) {
2669 : // Our mask-compare operation will be approximate unless we have the
2670 : // exact same operation on both sides of the alternation.
2671 166503 : pos->determines_perfectly = false;
2672 : }
2673 173807 : pos->mask &= other_pos->mask;
2674 173807 : pos->value &= pos->mask;
2675 173807 : other_pos->value &= pos->mask;
2676 173807 : uc16 differing_bits = (pos->value ^ other_pos->value);
2677 173807 : pos->mask &= ~differing_bits;
2678 173807 : pos->value &= pos->mask;
2679 : }
2680 : }
2681 :
2682 :
2683 : class VisitMarker {
2684 : public:
2685 : explicit VisitMarker(NodeInfo* info) : info_(info) {
2686 : DCHECK(!info->visited);
2687 196642 : info->visited = true;
2688 : }
2689 : ~VisitMarker() {
2690 172716 : info_->visited = false;
2691 : }
2692 : private:
2693 : NodeInfo* info_;
2694 : };
2695 :
2696 98819 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2697 98819 : if (info()->replacement_calculated) return replacement();
2698 72206 : if (depth < 0) return this;
2699 : DCHECK(!info()->visited);
2700 : VisitMarker marker(info());
2701 : return FilterSuccessor(depth - 1);
2702 : }
2703 :
2704 0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2705 132271 : RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2706 132271 : if (next == nullptr) return set_replacement(nullptr);
2707 131789 : on_success_ = next;
2708 131789 : return set_replacement(this);
2709 : }
2710 :
2711 : // We need to check for the following characters: 0x39C 0x3BC 0x178.
2712 1451 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2713 : // TODO(dcarney): this could be a lot more efficient.
2714 4227 : return range.Contains(0x039C) || range.Contains(0x03BC) ||
2715 1451 : range.Contains(0x0178);
2716 : }
2717 :
2718 :
2719 41 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2720 81 : for (int i = 0; i < ranges->length(); i++) {
2721 : // TODO(dcarney): this could be a lot more efficient.
2722 46 : if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2723 : }
2724 : return false;
2725 : }
2726 :
2727 64942 : RegExpNode* TextNode::FilterOneByte(int depth) {
2728 64942 : if (info()->replacement_calculated) return replacement();
2729 60709 : if (depth < 0) return this;
2730 : DCHECK(!info()->visited);
2731 : VisitMarker marker(info());
2732 : int element_count = elements()->length();
2733 190134 : for (int i = 0; i < element_count; i++) {
2734 65170 : TextElement elm = elements()->at(i);
2735 65170 : if (elm.text_type() == TextElement::ATOM) {
2736 : Vector<const uc16> quarks = elm.atom()->data();
2737 144979 : for (int j = 0; j < quarks.length(); j++) {
2738 114828 : uint16_t c = quarks[j];
2739 57414 : if (elm.atom()->ignore_case()) {
2740 : c = unibrow::Latin1::TryConvertToLatin1(c);
2741 : }
2742 57414 : if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2743 : // Replace quark in case we converted to Latin-1.
2744 : uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2745 57248 : writable_quarks[j] = c;
2746 : }
2747 : } else {
2748 : DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2749 : RegExpCharacterClass* cc = elm.char_class();
2750 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2751 34687 : CharacterRange::Canonicalize(ranges);
2752 : // Now they are in order so we only need to look at the first.
2753 : int range_count = ranges->length();
2754 34687 : if (cc->is_negated()) {
2755 8390 : if (range_count != 0 &&
2756 4373 : ranges->at(0).from() == 0 &&
2757 : ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2758 : // This will be handled in a later filter.
2759 40 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2760 : continue;
2761 39 : return set_replacement(nullptr);
2762 : }
2763 : } else {
2764 30492 : if (range_count == 0 ||
2765 : ranges->at(0).from() > String::kMaxOneByteCharCode) {
2766 : // This will be handled in a later filter.
2767 255 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2768 : continue;
2769 230 : return set_replacement(nullptr);
2770 : }
2771 : }
2772 : }
2773 : }
2774 60229 : return FilterSuccessor(depth - 1);
2775 : }
2776 :
2777 59694 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2778 59694 : if (info()->replacement_calculated) return replacement();
2779 46066 : if (depth < 0) return this;
2780 45976 : if (info()->visited) return this;
2781 : {
2782 : VisitMarker marker(info());
2783 :
2784 24274 : RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2785 : // If we can't continue after the loop then there is no sense in doing the
2786 : // loop.
2787 24274 : if (continue_replacement == nullptr) return set_replacement(nullptr);
2788 : }
2789 :
2790 23926 : return ChoiceNode::FilterOneByte(depth - 1);
2791 : }
2792 :
2793 29721 : RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2794 29721 : if (info()->replacement_calculated) return replacement();
2795 27678 : if (depth < 0) return this;
2796 27583 : if (info()->visited) return this;
2797 : VisitMarker marker(info());
2798 27583 : int choice_count = alternatives_->length();
2799 :
2800 143743 : for (int i = 0; i < choice_count; i++) {
2801 60452 : GuardedAlternative alternative = alternatives_->at(i);
2802 62824 : if (alternative.guards() != nullptr &&
2803 : alternative.guards()->length() != 0) {
2804 2372 : set_replacement(this);
2805 : return this;
2806 : }
2807 : }
2808 :
2809 : int surviving = 0;
2810 : RegExpNode* survivor = nullptr;
2811 140667 : for (int i = 0; i < choice_count; i++) {
2812 115456 : GuardedAlternative alternative = alternatives_->at(i);
2813 57728 : RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2814 : DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2815 57728 : if (replacement != nullptr) {
2816 57582 : alternatives_->at(i).set_node(replacement);
2817 57582 : surviving++;
2818 : survivor = replacement;
2819 : }
2820 : }
2821 25277 : if (surviving < 2) return set_replacement(survivor);
2822 :
2823 25145 : set_replacement(this);
2824 25145 : if (surviving == choice_count) {
2825 : return this;
2826 : }
2827 : // Only some of the nodes survived the filtering. We need to rebuild the
2828 : // alternatives list.
2829 : ZoneList<GuardedAlternative>* new_alternatives =
2830 : new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2831 380 : for (int i = 0; i < choice_count; i++) {
2832 : RegExpNode* replacement =
2833 360 : alternatives_->at(i).node()->FilterOneByte(depth - 1);
2834 180 : if (replacement != nullptr) {
2835 130 : alternatives_->at(i).set_node(replacement);
2836 260 : new_alternatives->Add(alternatives_->at(i), zone());
2837 : }
2838 : }
2839 20 : alternatives_ = new_alternatives;
2840 20 : return this;
2841 : }
2842 :
2843 357 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2844 357 : if (info()->replacement_calculated) return replacement();
2845 357 : if (depth < 0) return this;
2846 357 : if (info()->visited) return this;
2847 : VisitMarker marker(info());
2848 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2849 : // afterwards.
2850 357 : RegExpNode* node = alternatives_->at(1).node();
2851 357 : RegExpNode* replacement = node->FilterOneByte(depth - 1);
2852 362 : if (replacement == nullptr) return set_replacement(nullptr);
2853 352 : alternatives_->at(1).set_node(replacement);
2854 :
2855 352 : RegExpNode* neg_node = alternatives_->at(0).node();
2856 352 : RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2857 : // If the negative lookahead is always going to fail then
2858 : // we don't need to check it.
2859 357 : if (neg_replacement == nullptr) return set_replacement(replacement);
2860 347 : alternatives_->at(0).set_node(neg_replacement);
2861 694 : return set_replacement(this);
2862 : }
2863 :
2864 :
2865 14889 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2866 : RegExpCompiler* compiler,
2867 : int characters_filled_in,
2868 : bool not_at_start) {
2869 14889 : if (body_can_be_zero_length_ || info()->visited) return;
2870 : VisitMarker marker(info());
2871 11722 : return ChoiceNode::GetQuickCheckDetails(details,
2872 : compiler,
2873 : characters_filled_in,
2874 11722 : not_at_start);
2875 : }
2876 :
2877 :
2878 5126 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2879 : BoyerMooreLookahead* bm, bool not_at_start) {
2880 5126 : if (body_can_be_zero_length_ || budget <= 0) {
2881 : bm->SetRest(offset);
2882 : SaveBMInfo(bm, not_at_start, offset);
2883 : return;
2884 : }
2885 4910 : ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2886 : SaveBMInfo(bm, not_at_start, offset);
2887 : }
2888 :
2889 :
2890 38749 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2891 : RegExpCompiler* compiler,
2892 : int characters_filled_in,
2893 : bool not_at_start) {
2894 38749 : not_at_start = (not_at_start || not_at_start_);
2895 38749 : int choice_count = alternatives_->length();
2896 : DCHECK_LT(0, choice_count);
2897 38749 : alternatives_->at(0).node()->GetQuickCheckDetails(details,
2898 : compiler,
2899 : characters_filled_in,
2900 77498 : not_at_start);
2901 352697 : for (int i = 1; i < choice_count; i++) {
2902 : QuickCheckDetails new_details(details->characters());
2903 156974 : RegExpNode* node = alternatives_->at(i).node();
2904 : node->GetQuickCheckDetails(&new_details, compiler,
2905 : characters_filled_in,
2906 156974 : not_at_start);
2907 : // Here we merge the quick match details of the two branches.
2908 156974 : details->Merge(&new_details, characters_filled_in);
2909 : }
2910 38749 : }
2911 :
2912 :
2913 : // Check for [0-9A-Z_a-z].
2914 557 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
2915 : Label* word,
2916 : Label* non_word,
2917 : bool fall_through_on_word) {
2918 557 : if (assembler->CheckSpecialCharacterClass(
2919 : fall_through_on_word ? 'w' : 'W',
2920 557 : fall_through_on_word ? non_word : word)) {
2921 : // Optimized implementation available.
2922 : return;
2923 : }
2924 99 : assembler->CheckCharacterGT('z', non_word);
2925 99 : assembler->CheckCharacterLT('0', non_word);
2926 99 : assembler->CheckCharacterGT('a' - 1, word);
2927 99 : assembler->CheckCharacterLT('9' + 1, word);
2928 99 : assembler->CheckCharacterLT('A', non_word);
2929 99 : assembler->CheckCharacterLT('Z' + 1, word);
2930 99 : if (fall_through_on_word) {
2931 34 : assembler->CheckNotCharacter('_', non_word);
2932 : } else {
2933 65 : assembler->CheckCharacter('_', word);
2934 : }
2935 : }
2936 :
2937 :
2938 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2939 : // that matches newline or the start of input).
2940 129 : static void EmitHat(RegExpCompiler* compiler,
2941 : RegExpNode* on_success,
2942 : Trace* trace) {
2943 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2944 : // We will be loading the previous character into the current character
2945 : // register.
2946 129 : Trace new_trace(*trace);
2947 : new_trace.InvalidateCurrentCharacter();
2948 :
2949 129 : Label ok;
2950 129 : if (new_trace.cp_offset() == 0) {
2951 : // The start of input counts as a newline in this context, so skip to
2952 : // ok if we are at the start.
2953 119 : assembler->CheckAtStart(&ok);
2954 : }
2955 : // We already checked that we are not at the start of input so it must be
2956 : // OK to load the previous character.
2957 129 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2958 : new_trace.backtrack(),
2959 258 : false);
2960 129 : if (!assembler->CheckSpecialCharacterClass('n',
2961 129 : new_trace.backtrack())) {
2962 : // Newline means \n, \r, 0x2028 or 0x2029.
2963 24 : if (!compiler->one_byte()) {
2964 2 : assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2965 : }
2966 24 : assembler->CheckCharacter('\n', &ok);
2967 24 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2968 : }
2969 129 : assembler->Bind(&ok);
2970 129 : on_success->Emit(compiler, &new_trace);
2971 129 : }
2972 :
2973 :
2974 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2975 255 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2976 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2977 : Isolate* isolate = assembler->isolate();
2978 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2979 255 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2980 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2981 255 : if (lookahead == nullptr) {
2982 : int eats_at_least =
2983 202 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2984 : kRecursionBudget,
2985 202 : not_at_start));
2986 202 : if (eats_at_least >= 1) {
2987 : BoyerMooreLookahead* bm =
2988 97 : new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
2989 97 : FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2990 97 : if (bm->at(0)->is_non_word())
2991 : next_is_word_character = Trace::FALSE_VALUE;
2992 97 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2993 : }
2994 : } else {
2995 53 : if (lookahead->at(0)->is_non_word())
2996 : next_is_word_character = Trace::FALSE_VALUE;
2997 53 : if (lookahead->at(0)->is_word())
2998 : next_is_word_character = Trace::TRUE_VALUE;
2999 : }
3000 255 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3001 255 : if (next_is_word_character == Trace::UNKNOWN) {
3002 151 : Label before_non_word;
3003 151 : Label before_word;
3004 151 : if (trace->characters_preloaded() != 1) {
3005 150 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3006 : }
3007 : // Fall through on non-word.
3008 151 : EmitWordCheck(assembler, &before_word, &before_non_word, false);
3009 : // Next character is not a word character.
3010 151 : assembler->Bind(&before_non_word);
3011 151 : Label ok;
3012 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3013 151 : assembler->GoTo(&ok);
3014 :
3015 151 : assembler->Bind(&before_word);
3016 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3017 151 : assembler->Bind(&ok);
3018 104 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3019 79 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3020 : } else {
3021 : DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3022 25 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3023 : }
3024 255 : }
3025 :
3026 :
3027 406 : void AssertionNode::BacktrackIfPrevious(
3028 : RegExpCompiler* compiler,
3029 : Trace* trace,
3030 : AssertionNode::IfPrevious backtrack_if_previous) {
3031 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3032 406 : Trace new_trace(*trace);
3033 : new_trace.InvalidateCurrentCharacter();
3034 :
3035 406 : Label fall_through, dummy;
3036 :
3037 : Label* non_word = backtrack_if_previous == kIsNonWord ?
3038 : new_trace.backtrack() :
3039 406 : &fall_through;
3040 : Label* word = backtrack_if_previous == kIsNonWord ?
3041 : &fall_through :
3042 406 : new_trace.backtrack();
3043 :
3044 406 : if (new_trace.cp_offset() == 0) {
3045 : // The start of input counts as a non-word character, so the question is
3046 : // decided if we are at the start.
3047 169 : assembler->CheckAtStart(non_word);
3048 : }
3049 : // We already checked that we are not at the start of input so it must be
3050 : // OK to load the previous character.
3051 406 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3052 406 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3053 :
3054 406 : assembler->Bind(&fall_through);
3055 406 : on_success()->Emit(compiler, &new_trace);
3056 406 : }
3057 :
3058 :
3059 1935 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3060 : RegExpCompiler* compiler,
3061 : int filled_in,
3062 : bool not_at_start) {
3063 1935 : if (assertion_type_ == AT_START && not_at_start) {
3064 : details->set_cannot_match();
3065 : return;
3066 : }
3067 1604 : return on_success()->GetQuickCheckDetails(details,
3068 : compiler,
3069 : filled_in,
3070 3208 : not_at_start);
3071 : }
3072 :
3073 :
3074 5747 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3075 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3076 5747 : switch (assertion_type_) {
3077 : case AT_END: {
3078 2325 : Label ok;
3079 2325 : assembler->CheckPosition(trace->cp_offset(), &ok);
3080 2325 : assembler->GoTo(trace->backtrack());
3081 2325 : assembler->Bind(&ok);
3082 : break;
3083 : }
3084 : case AT_START: {
3085 3038 : if (trace->at_start() == Trace::FALSE_VALUE) {
3086 9 : assembler->GoTo(trace->backtrack());
3087 9 : return;
3088 : }
3089 3029 : if (trace->at_start() == Trace::UNKNOWN) {
3090 3029 : assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3091 3029 : Trace at_start_trace = *trace;
3092 : at_start_trace.set_at_start(Trace::TRUE_VALUE);
3093 3029 : on_success()->Emit(compiler, &at_start_trace);
3094 : return;
3095 : }
3096 : }
3097 : break;
3098 : case AFTER_NEWLINE:
3099 129 : EmitHat(compiler, on_success(), trace);
3100 129 : return;
3101 : case AT_BOUNDARY:
3102 : case AT_NON_BOUNDARY: {
3103 255 : EmitBoundaryCheck(compiler, trace);
3104 255 : return;
3105 : }
3106 : }
3107 2325 : on_success()->Emit(compiler, trace);
3108 : }
3109 :
3110 :
3111 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3112 2749326 : if (quick_check == nullptr) return false;
3113 2749326 : if (offset >= quick_check->characters()) return false;
3114 853525 : return quick_check->positions(offset)->determines_perfectly;
3115 : }
3116 :
3117 :
// Raises |*checked_up_to| to |index| if |index| is larger; never lowers it.
static void UpdateBoundsCheck(int index, int* checked_up_to) {
  if (*checked_up_to >= index) return;
  *checked_up_to = index;
}
3123 :
3124 :
3125 : // We call this repeatedly to generate code for each pass over the text node.
3126 : // The passes are in increasing order of difficulty because we hope one
3127 : // of the first passes will fail in which case we are saved the work of the
3128 : // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3129 : // we will check the '%' in the first pass, the case independent 'a' in the
3130 : // second pass and the character class in the last pass.
3131 : //
3132 : // The passes are done from right to left, so for example to test for /bar/
3133 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3134 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3135 : // bounds check most of the time. In the example we only need to check for
3136 : // end-of-input when loading the putative 'r'.
3137 : //
3138 : // A slight complication involves the fact that the first character may already
3139 : // be fetched into a register by the previous node. In this case we want to
3140 : // do the test for that character first. We do this in separate passes. The
3141 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3142 : // pass has been performed then subsequent passes will have true in
3143 : // first_element_checked to indicate that that character does not need to be
3144 : // checked again.
3145 : //
3146 : // In addition to all this we are passed a Trace, which can
3147 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3148 : // object we can see details of any quick check that was already passed in
3149 : // order to get to the code we are now generating. The quick check can involve
3150 : // loading characters, which means we do not need to recheck the bounds
3151 : // up to the limit the quick check already checked. In addition the quick
3152 : // check can have involved a mask and compare operation which may simplify
3153 : // or obviate the need for further checks at some character positions.
3154 2700376 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
3155 : TextEmitPassType pass,
3156 : bool preloaded,
3157 : Trace* trace,
3158 : bool first_element_checked,
3159 : int* checked_up_to) {
3160 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3161 : Isolate* isolate = assembler->isolate();
3162 : bool one_byte = compiler->one_byte();
3163 : Label* backtrack = trace->backtrack();
3164 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3165 : int element_count = elements()->length();
3166 2700376 : int backward_offset = read_backward() ? -Length() : 0;
3167 5656005 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3168 2955654 : TextElement elm = elements()->at(i);
3169 2955654 : int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3170 2955654 : if (elm.text_type() == TextElement::ATOM) {
3171 1812552 : if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3172 : Vector<const uc16> quarks = elm.atom()->data();
3173 4635131 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3174 2591804 : if (first_element_checked && i == 0 && j == 0) continue;
3175 5022364 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3176 : EmitCharacterFunction* emit_function = nullptr;
3177 3408926 : uc16 quark = quarks[j];
3178 1704463 : if (elm.atom()->ignore_case()) {
3179 : // Everywhere else we assume that a non-Latin-1 character cannot match
3180 : // a Latin-1 character. Avoid the cases where this is assumption is
3181 : // invalid by using the Latin1 equivalent instead.
3182 : quark = unibrow::Latin1::TryConvertToLatin1(quark);
3183 : }
3184 1704463 : switch (pass) {
3185 : case NON_LATIN1_MATCH:
3186 : DCHECK(one_byte);
3187 518596 : if (quark > String::kMaxOneByteCharCode) {
3188 25 : assembler->GoTo(backtrack);
3189 : return;
3190 : }
3191 : break;
3192 : case NON_LETTER_CHARACTER_MATCH:
3193 : emit_function = &EmitAtomNonLetter;
3194 5503 : break;
3195 : case SIMPLE_CHARACTER_MATCH:
3196 : emit_function = &EmitSimpleCharacter;
3197 584679 : break;
3198 : case CASE_CHARACTER_MATCH:
3199 : emit_function = &EmitAtomLetter;
3200 5503 : break;
3201 : default:
3202 : break;
3203 : }
3204 1704438 : if (emit_function != nullptr) {
3205 595685 : bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3206 : bool bound_checked =
3207 595685 : emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3208 595685 : bounds_check, preloaded);
3209 595685 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3210 : }
3211 : }
3212 : } else {
3213 : DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3214 1143102 : if (pass == CHARACTER_CLASS_MATCH) {
3215 276849 : if (first_element_checked && i == 0) continue;
3216 238144 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3217 : RegExpCharacterClass* cc = elm.char_class();
3218 210321 : bool bounds_check = *checked_up_to < cp_offset || read_backward();
3219 210321 : EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3220 210321 : bounds_check, preloaded, zone());
3221 : UpdateBoundsCheck(cp_offset, checked_up_to);
3222 : }
3223 : }
3224 : }
3225 : }
3226 :
3227 :
3228 6952599 : int TextNode::Length() {
3229 6952599 : TextElement elm = elements()->last();
3230 : DCHECK_LE(0, elm.cp_offset());
3231 6952599 : return elm.cp_offset() + elm.length();
3232 : }
3233 :
3234 0 : bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3235 1812552 : if (ignore_case) {
3236 44755 : return pass == SIMPLE_CHARACTER_MATCH;
3237 : } else {
3238 1767797 : return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3239 : }
3240 : }
3241 :
3242 7207 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3243 : ZoneList<CharacterRange>* ranges,
3244 : bool read_backward,
3245 : RegExpNode* on_success,
3246 : JSRegExp::Flags flags) {
3247 : DCHECK_NOT_NULL(ranges);
3248 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3249 14414 : elms->Add(TextElement::CharClass(
3250 21621 : new (zone) RegExpCharacterClass(zone, ranges, flags)),
3251 7207 : zone);
3252 7207 : return new (zone) TextNode(elms, read_backward, on_success);
3253 : }
3254 :
3255 27070 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3256 : CharacterRange trail,
3257 : bool read_backward,
3258 : RegExpNode* on_success,
3259 : JSRegExp::Flags flags) {
3260 27070 : ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3261 27070 : ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3262 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3263 54140 : elms->Add(TextElement::CharClass(
3264 81210 : new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3265 27070 : zone);
3266 54140 : elms->Add(TextElement::CharClass(
3267 81210 : new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3268 27070 : zone);
3269 27070 : return new (zone) TextNode(elms, read_backward, on_success);
3270 : }
3271 :
3272 :
3273 : // This generates the code to match a text node. A text node can contain
3274 : // straight character sequences (possibly to be matched in a case-independent
3275 : // way) and character classes. For efficiency we do not do this in a single
3276 : // pass from left to right. Instead we pass over the text node several times,
3277 : // emitting code for some character positions every time. See the comment on
3278 : // TextEmitPass for details.
3279 618231 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3280 618231 : LimitResult limit_result = LimitVersions(compiler, trace);
3281 719852 : if (limit_result == DONE) return;
3282 : DCHECK(limit_result == CONTINUE);
3283 :
3284 516610 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3285 : compiler->SetRegExpTooBig();
3286 : return;
3287 : }
3288 :
3289 516610 : if (compiler->one_byte()) {
3290 322668 : int dummy = 0;
3291 322668 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3292 : }
3293 :
3294 : bool first_elt_done = false;
3295 516610 : int bound_checked_to = trace->cp_offset() - 1;
3296 516610 : bound_checked_to += trace->bound_checked_up_to();
3297 :
3298 : // If a character is preloaded into the current character register then
3299 : // check that now.
3300 516610 : if (trace->characters_preloaded() == 1) {
3301 700353 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3302 311268 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3303 311268 : false, &bound_checked_to);
3304 : }
3305 : first_elt_done = true;
3306 : }
3307 :
3308 4649490 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3309 2066440 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3310 2066440 : first_elt_done, &bound_checked_to);
3311 : }
3312 :
3313 516610 : Trace successor_trace(*trace);
3314 : // If we advance backward, we may end up at the start.
3315 521782 : successor_trace.AdvanceCurrentPositionInTrace(
3316 521782 : read_backward() ? -Length() : Length(), compiler);
3317 516610 : successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3318 : : Trace::FALSE_VALUE);
3319 : RecursionCheck rc(compiler);
3320 516610 : on_success()->Emit(compiler, &successor_trace);
3321 : }
3322 :
3323 :
3324 0 : void Trace::InvalidateCurrentCharacter() {
3325 227920 : characters_preloaded_ = 0;
3326 0 : }
3327 :
3328 :
3329 516610 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3330 : // We don't have an instruction for shifting the current character register
3331 : // down or for using a shifted value for anything so lets just forget that
3332 : // we preloaded any characters into it.
3333 516610 : characters_preloaded_ = 0;
3334 : // Adjust the offsets of the quick check performed information. This
3335 : // information is used to find out what we already determined about the
3336 : // characters by means of mask and compare.
3337 516610 : quick_check_performed_.Advance(by, compiler->one_byte());
3338 516610 : cp_offset_ += by;
3339 516610 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3340 : compiler->SetRegExpTooBig();
3341 0 : cp_offset_ = 0;
3342 : }
3343 1033220 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3344 516610 : }
3345 :
3346 :
3347 317597 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3348 : int element_count = elements()->length();
3349 1076097 : for (int i = 0; i < element_count; i++) {
3350 379250 : TextElement elm = elements()->at(i);
3351 379250 : if (elm.text_type() == TextElement::CHAR_CLASS) {
3352 : RegExpCharacterClass* cc = elm.char_class();
3353 : #ifdef V8_INTL_SUPPORT
3354 : bool case_equivalents_already_added =
3355 : NeedsUnicodeCaseEquivalents(cc->flags());
3356 : #else
3357 : bool case_equivalents_already_added = false;
3358 : #endif
3359 237631 : if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3360 : // None of the standard character classes is different in the case
3361 : // independent case and it slows us down if we don't know that.
3362 68981 : if (cc->is_standard(zone())) continue;
3363 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3364 66900 : CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3365 66900 : is_one_byte);
3366 : }
3367 : }
3368 : }
3369 317597 : }
3370 :
3371 :
3372 135183 : int TextNode::GreedyLoopTextLength() { return Length(); }
3373 :
3374 :
3375 85868 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3376 : RegExpCompiler* compiler) {
3377 85868 : if (read_backward()) return nullptr;
3378 85743 : if (elements()->length() != 1) return nullptr;
3379 85381 : TextElement elm = elements()->at(0);
3380 85381 : if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3381 : RegExpCharacterClass* node = elm.char_class();
3382 : ZoneList<CharacterRange>* ranges = node->ranges(zone());
3383 84026 : CharacterRange::Canonicalize(ranges);
3384 84026 : if (node->is_negated()) {
3385 111 : return ranges->length() == 0 ? on_success() : nullptr;
3386 : }
3387 83915 : if (ranges->length() != 1) return nullptr;
3388 : uint32_t max_char;
3389 83455 : if (compiler->one_byte()) {
3390 : max_char = String::kMaxOneByteCharCode;
3391 : } else {
3392 : max_char = String::kMaxUtf16CodeUnit;
3393 : }
3394 166910 : return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3395 : }
3396 :
3397 :
3398 : // Finds the fixed match length of a sequence of nodes that goes from
3399 : // this alternative and back to this choice node. If there are variable
3400 : // length nodes or other complications in the way then return a sentinel
3401 : // value indicating that a greedy loop cannot be constructed.
3402 225246 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
3403 : GuardedAlternative* alternative) {
3404 : int length = 0;
3405 : RegExpNode* node = alternative->node();
3406 : // Later we will generate code for all these text nodes using recursion
3407 : // so we have to limit the max number.
3408 : int recursion_depth = 0;
3409 495612 : while (node != this) {
3410 337091 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3411 : return kNodeIsTooComplexForGreedyLoops;
3412 : }
3413 337091 : int node_length = node->GreedyLoopTextLength();
3414 337091 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
3415 : return kNodeIsTooComplexForGreedyLoops;
3416 : }
3417 135183 : length += node_length;
3418 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3419 : node = seq_node->on_success();
3420 : }
3421 23338 : return read_backward() ? -length : length;
3422 : }
3423 :
3424 :
3425 0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3426 : DCHECK_NULL(loop_node_);
3427 : AddAlternative(alt);
3428 999366 : loop_node_ = alt.node();
3429 0 : }
3430 :
3431 :
3432 0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3433 : DCHECK_NULL(continue_node_);
3434 : AddAlternative(alt);
3435 999366 : continue_node_ = alt.node();
3436 0 : }
3437 :
3438 :
3439 332426 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3440 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3441 332426 : if (trace->stop_node() == this) {
3442 : // Back edge of greedy optimized loop node graph.
3443 : int text_length =
3444 11669 : GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3445 : DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3446 : // Update the counter-based backtracking info on the stack. This is an
3447 : // optimization for greedy loops (see below).
3448 : DCHECK(trace->cp_offset() == text_length);
3449 11669 : macro_assembler->AdvanceCurrentPosition(text_length);
3450 11669 : macro_assembler->GoTo(trace->loop_label());
3451 11669 : return;
3452 : }
3453 : DCHECK_NULL(trace->stop_node());
3454 320757 : if (!trace->is_trivial()) {
3455 119240 : trace->Flush(compiler, this);
3456 119240 : return;
3457 : }
3458 201517 : ChoiceNode::Emit(compiler, trace);
3459 : }
3460 :
3461 :
3462 213577 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3463 : int eats_at_least) {
3464 : int preload_characters = Min(4, eats_at_least);
3465 : DCHECK_LE(preload_characters, 4);
3466 213577 : if (compiler->macro_assembler()->CanReadUnaligned()) {
3467 : bool one_byte = compiler->one_byte();
3468 134307 : if (one_byte) {
3469 : // We can't preload 3 characters because there is no machine instruction
3470 : // to do that. We can't just load 4 because we could be reading
3471 : // beyond the end of the string, which could cause a memory fault.
3472 106724 : if (preload_characters == 3) preload_characters = 2;
3473 : } else {
3474 27583 : if (preload_characters > 2) preload_characters = 2;
3475 : }
3476 : } else {
3477 79270 : if (preload_characters > 1) preload_characters = 1;
3478 : }
3479 213577 : return preload_characters;
3480 : }
3481 :
3482 :
3483 : // This class is used when generating the alternatives in a choice node. It
3484 : // records the way the alternative is being code generated.
3485 : class AlternativeGeneration: public Malloced {
3486 : public:
3487 : AlternativeGeneration()
3488 : : possible_success(),
3489 : expects_preload(false),
3490 : after(),
3491 2178987 : quick_check_details() { }
3492 : Label possible_success;
3493 : bool expects_preload;
3494 : Label after;
3495 : QuickCheckDetails quick_check_details;
3496 : };
3497 :
3498 :
3499 : // Creates a list of AlternativeGenerations. If the list has a reasonable
3500 : // size then it is on the stack, otherwise the excess is on the heap.
3501 : class AlternativeGenerationList {
3502 : public:
3503 213577 : AlternativeGenerationList(int count, Zone* zone)
3504 2349347 : : alt_gens_(count, zone) {
3505 1360037 : for (int i = 0; i < count && i < kAFew; i++) {
3506 573230 : alt_gens_.Add(a_few_alt_gens_ + i, zone);
3507 : }
3508 300011 : for (int i = kAFew; i < count; i++) {
3509 43217 : alt_gens_.Add(new AlternativeGeneration(), zone);
3510 : }
3511 213577 : }
3512 427154 : ~AlternativeGenerationList() {
3513 300011 : for (int i = kAFew; i < alt_gens_.length(); i++) {
3514 43217 : delete alt_gens_[i];
3515 43217 : alt_gens_[i] = nullptr;
3516 : }
3517 213577 : }
3518 :
3519 : AlternativeGeneration* at(int i) {
3520 1624095 : return alt_gens_[i];
3521 : }
3522 :
3523 : private:
3524 : static const int kAFew = 10;
3525 : ZoneList<AlternativeGeneration*> alt_gens_;
3526 : AlternativeGeneration a_few_alt_gens_[kAFew];
3527 : };
3528 :
3529 :
3530 : static const uc32 kRangeEndMarker = 0x110000;
3531 :
3532 : // The '2' variant is has inclusive from and exclusive to.
3533 : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3534 : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3535 : static const int kSpaceRanges[] = {
3536 : '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3537 : 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3538 : 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3539 : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3540 :
3541 : static const int kWordRanges[] = {
3542 : '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3543 : static const int kWordRangeCount = arraysize(kWordRanges);
3544 : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3545 : static const int kDigitRangeCount = arraysize(kDigitRanges);
3546 : static const int kSurrogateRanges[] = {
3547 : kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3548 : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3549 : static const int kLineTerminatorRanges[] = {
3550 : 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3551 : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3552 :
3553 0 : void BoyerMoorePositionInfo::Set(int character) {
3554 86629 : SetInterval(Interval(character, character));
3555 0 : }
3556 :
3557 :
3558 248438 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3559 496876 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3560 496876 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3561 496876 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3562 : surrogate_ =
3563 496876 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3564 248438 : if (interval.to() - interval.from() >= kMapSize - 1) {
3565 13580 : if (map_count_ != kMapSize) {
3566 6296 : map_count_ = kMapSize;
3567 1618072 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3568 : }
3569 : return;
3570 : }
3571 1234098 : for (int i = interval.from(); i <= interval.to(); i++) {
3572 544274 : int mod_character = (i & kMask);
3573 1088548 : if (!map_->at(mod_character)) {
3574 372553 : map_count_++;
3575 372553 : map_->at(mod_character) = true;
3576 : }
3577 544274 : if (map_count_ == kMapSize) return;
3578 : }
3579 : }
3580 :
3581 :
3582 0 : void BoyerMoorePositionInfo::SetAll() {
3583 5496 : s_ = w_ = d_ = kLatticeUnknown;
3584 5496 : if (map_count_ != kMapSize) {
3585 5066 : map_count_ = kMapSize;
3586 1301962 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3587 : }
3588 0 : }
3589 :
3590 :
3591 80731 : BoyerMooreLookahead::BoyerMooreLookahead(
3592 : int length, RegExpCompiler* compiler, Zone* zone)
3593 : : length_(length),
3594 80731 : compiler_(compiler) {
3595 80731 : if (compiler->one_byte()) {
3596 10107 : max_char_ = String::kMaxOneByteCharCode;
3597 : } else {
3598 70624 : max_char_ = String::kMaxUtf16CodeUnit;
3599 : }
3600 80731 : bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3601 279659 : for (int i = 0; i < length; i++) {
3602 99464 : bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3603 : }
3604 80731 : }
3605 :
3606 :
3607 : // Find the longest range of lookahead that has the fewest number of different
3608 : // characters that can occur at a given position. Since we are optimizing two
3609 : // different parameters at once this is a tradeoff.
3610 0 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3611 : int biggest_points = 0;
3612 : // If more than 32 characters out of 128 can occur it is unlikely that we can
3613 : // be lucky enough to step forwards much of the time.
3614 : const int kMaxMax = 32;
3615 241902 : for (int max_number_of_chars = 4;
3616 322536 : max_number_of_chars < kMaxMax;
3617 : max_number_of_chars *= 2) {
3618 : biggest_points =
3619 241902 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
3620 : }
3621 80634 : if (biggest_points == 0) return false;
3622 0 : return true;
3623 : }
3624 :
3625 :
3626 : // Find the highest-points range between 0 and length_ where the character
3627 : // information is not too vague. 'Too vague' means that there are more than
3628 : // max_number_of_chars that can occur at this position. Calculates the number
3629 : // of points as the product of width-of-the-range and
3630 : // probability-of-finding-one-of-the-characters, where the probability is
3631 : // calculated using the frequency distribution of the sample subject string.
3632 241902 : int BoyerMooreLookahead::FindBestInterval(
3633 : int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3634 : int biggest_points = old_biggest_points;
3635 : static const int kSize = RegExpMacroAssembler::kTableSize;
3636 699994 : for (int i = 0; i < length_; ) {
3637 568407 : while (i < length_ && Count(i) > max_number_of_chars) i++;
3638 256761 : if (i == length_) break;
3639 : int remembered_from = i;
3640 : bool union_map[kSize];
3641 29546934 : for (int j = 0; j < kSize; j++) union_map[j] = false;
3642 1013570 : while (i < length_ && Count(i) <= max_number_of_chars) {
3643 513110 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3644 65934635 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3645 256555 : i++;
3646 : }
3647 : int frequency = 0;
3648 58864822 : for (int j = 0; j < kSize; j++) {
3649 29317888 : if (union_map[j]) {
3650 : // Add 1 to the frequency to give a small per-character boost for
3651 : // the cases where our sampling is not good enough and many
3652 : // characters have a frequency of zero. This means the frequency
3653 : // can theoretically be up to 2*kSize though we treat it mostly as
3654 : // a fraction of kSize.
3655 974202 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3656 : }
3657 : }
3658 : // We use the probability of skipping times the distance we are skipping to
3659 : // judge the effectiveness of this. Actually we have a cut-off: By
3660 : // dividing by 2 we switch off the skipping if the probability of skipping
3661 : // is less than 50%. This is because the multibyte mask-and-compare
3662 : // skipping in quickcheck is more likely to do well on this case.
3663 : bool in_quickcheck_range =
3664 231850 : ((i - remembered_from < 4) ||
3665 2804 : (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3666 : // Called 'probability' but it is only a rough estimate and can actually
3667 : // be outside the 0-kSize range.
3668 229046 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3669 229046 : int points = (i - remembered_from) * probability;
3670 229046 : if (points > biggest_points) {
3671 5873 : *from = remembered_from;
3672 5873 : *to = i - 1;
3673 : biggest_points = points;
3674 : }
3675 : }
3676 241902 : return biggest_points;
3677 : }
3678 :
3679 :
3680 : // Take all the characters that will not prevent a successful match if they
3681 : // occur in the subject string in the range between min_lookahead and
3682 : // max_lookahead (inclusive) measured from the current position. If the
3683 : // character at max_lookahead offset is not one of these characters, then we
3684 : // can safely skip forwards by the number of characters in the range.
3685 4463 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3686 : int max_lookahead,
3687 : Handle<ByteArray> boolean_skip_table) {
3688 : const int kSize = RegExpMacroAssembler::kTableSize;
3689 :
3690 : const int kSkipArrayEntry = 0;
3691 : const int kDontSkipArrayEntry = 1;
3692 :
3693 1146991 : for (int i = 0; i < kSize; i++) {
3694 : boolean_skip_table->set(i, kSkipArrayEntry);
3695 : }
3696 4463 : int skip = max_lookahead + 1 - min_lookahead;
3697 :
3698 23725 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3699 19262 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3700 2475167 : for (int j = 0; j < kSize; j++) {
3701 1232768 : if (map->at(j)) {
3702 : boolean_skip_table->set(j, kDontSkipArrayEntry);
3703 : }
3704 : }
3705 : }
3706 :
3707 4463 : return skip;
3708 : }
3709 :
3710 :
3711 : // See comment above on the implementation of GetSkipTable.
3712 80634 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3713 : const int kSize = RegExpMacroAssembler::kTableSize;
3714 :
3715 80634 : int min_lookahead = 0;
3716 80634 : int max_lookahead = 0;
3717 :
3718 156805 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3719 :
3720 : bool found_single_character = false;
3721 : int single_character = 0;
3722 9850 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3723 17666 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3724 17666 : if (map->map_count() > 1 ||
3725 3197 : (found_single_character && map->map_count() != 0)) {
3726 : found_single_character = false;
3727 : break;
3728 : }
3729 807192 : for (int j = 0; j < kSize; j++) {
3730 405705 : if (map->at(j)) {
3731 : found_single_character = true;
3732 : single_character = j;
3733 : break;
3734 : }
3735 : }
3736 : }
3737 :
3738 5480 : int lookahead_width = max_lookahead + 1 - min_lookahead;
3739 :
3740 5480 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3741 : // The mask-compare can probably handle this better.
3742 : return;
3743 : }
3744 :
3745 4559 : if (found_single_character) {
3746 96 : Label cont, again;
3747 96 : masm->Bind(&again);
3748 96 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3749 96 : if (max_char_ > kSize) {
3750 96 : masm->CheckCharacterAfterAnd(single_character,
3751 : RegExpMacroAssembler::kTableMask,
3752 192 : &cont);
3753 : } else {
3754 0 : masm->CheckCharacter(single_character, &cont);
3755 : }
3756 96 : masm->AdvanceCurrentPosition(lookahead_width);
3757 96 : masm->GoTo(&again);
3758 96 : masm->Bind(&cont);
3759 : return;
3760 : }
3761 :
3762 : Factory* factory = masm->isolate()->factory();
3763 : Handle<ByteArray> boolean_skip_table =
3764 4463 : factory->NewByteArray(kSize, AllocationType::kOld);
3765 4463 : int skip_distance = GetSkipTable(
3766 4463 : min_lookahead, max_lookahead, boolean_skip_table);
3767 : DCHECK_NE(0, skip_distance);
3768 :
3769 4463 : Label cont, again;
3770 4463 : masm->Bind(&again);
3771 4463 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3772 4463 : masm->CheckBitInTable(boolean_skip_table, &cont);
3773 4463 : masm->AdvanceCurrentPosition(skip_distance);
3774 4463 : masm->GoTo(&again);
3775 4463 : masm->Bind(&cont);
3776 : }
3777 :
3778 :
3779 : /* Code generation for choice nodes.
3780 : *
3781 : * We generate quick checks that do a mask and compare to eliminate a
3782 : * choice. If the quick check succeeds then it jumps to the continuation to
3783 : * do slow checks and check subsequent nodes. If it fails (the common case)
3784 : * it falls through to the next choice.
3785 : *
3786 : * Here is the desired flow graph. Nodes directly below each other imply
3787 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3788 : * 3 doesn't have a quick check so we have to call the slow check.
3789 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3790 : * regexp continuation is generated directly after the Sn node, up to the
3791 : * next GoTo if we decide to reuse some already generated code. Some
3792 : * nodes expect preload_characters to be preloaded into the current
3793 : * character register. R nodes do this preloading. Vertices are marked
3794 : * F for failures and S for success (possible success in the case of quick
3795 : * nodes). L, V, < and > are used as arrow heads.
3796 : *
3797 : * ----------> R
3798 : * |
3799 : * V
3800 : * Q1 -----> S1
3801 : * | S /
3802 : * F| /
3803 : * | F/
3804 : * | /
3805 : * | R
3806 : * | /
3807 : * V L
3808 : * Q2 -----> S2
3809 : * | S /
3810 : * F| /
3811 : * | F/
3812 : * | /
3813 : * | R
3814 : * | /
3815 : * V L
3816 : * S3
3817 : * |
3818 : * F|
3819 : * |
3820 : * R
3821 : * |
3822 : * backtrack V
3823 : * <----------Q4
3824 : * \ F |
3825 : * \ |S
3826 : * \ F V
3827 : * \-----S4
3828 : *
3829 : * For greedy loops we push the current position, then generate the code that
3830 : * eats the input specially in EmitGreedyLoop. The other choice (the
3831 : * continuation) is generated by the normal code in EmitChoices, and steps back
3832 : * in the input to the starting position when it fails to match. The loop code
3833 : * looks like this (U is the unwind code that steps back in the greedy loop).
3834 : *
3835 : * _____
3836 : * / \
3837 : * V |
3838 : * ----------> S1 |
3839 : * /| |
3840 : * / |S |
3841 : * F/ \_____/
3842 : * /
3843 : * |<-----
3844 : * | \
3845 : * V |S
3846 : * Q2 ---> U----->backtrack
3847 : * | F /
3848 : * S| /
3849 : * V F /
3850 : * S2--/
3851 : */
3852 :
3853 213577 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
3854 213577 : counter_backtrack_trace_.set_backtrack(&label_);
3855 213577 : if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3856 213577 : }
3857 :
3858 :
3859 0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3860 : #ifdef DEBUG
3861 : int choice_count = alternatives_->length();
3862 : for (int i = 0; i < choice_count - 1; i++) {
3863 : GuardedAlternative alternative = alternatives_->at(i);
3864 : ZoneList<Guard*>* guards = alternative.guards();
3865 : int guard_count = (guards == nullptr) ? 0 : guards->length();
3866 : for (int j = 0; j < guard_count; j++) {
3867 : DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3868 : }
3869 : }
3870 : #endif
3871 0 : }
3872 :
3873 :
3874 213577 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3875 : Trace* current_trace,
3876 : PreloadState* state) {
3877 213577 : if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3878 : // Save some time by looking at most one machine word ahead.
3879 : state->eats_at_least_ =
3880 131432 : EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3881 262864 : current_trace->at_start() == Trace::FALSE_VALUE);
3882 : }
3883 : state->preload_characters_ =
3884 213577 : CalculatePreloadCharacters(compiler, state->eats_at_least_);
3885 :
3886 : state->preload_is_current_ =
3887 213577 : (current_trace->characters_preloaded() == state->preload_characters_);
3888 213577 : state->preload_has_checked_bounds_ = state->preload_is_current_;
3889 213577 : }
3890 :
3891 :
3892 583225 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3893 583225 : int choice_count = alternatives_->length();
3894 :
3895 583225 : if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3896 1338 : alternatives_->at(0).node()->Emit(compiler, trace);
3897 1338 : return;
3898 : }
3899 :
3900 : AssertGuardsMentionRegisters(trace);
3901 :
3902 581887 : LimitResult limit_result = LimitVersions(compiler, trace);
3903 581887 : if (limit_result == DONE) return;
3904 : DCHECK(limit_result == CONTINUE);
3905 :
3906 : // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3907 : // other choice nodes we only flush if we are out of code size budget.
3908 215037 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3909 1460 : trace->Flush(compiler, this);
3910 1460 : return;
3911 : }
3912 :
3913 : RecursionCheck rc(compiler);
3914 :
3915 : PreloadState preload;
3916 : preload.init();
3917 213577 : GreedyLoopState greedy_loop_state(not_at_start());
3918 :
3919 213577 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3920 427154 : AlternativeGenerationList alt_gens(choice_count, zone());
3921 :
3922 213577 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3923 : trace = EmitGreedyLoop(compiler,
3924 : trace,
3925 : &alt_gens,
3926 : &preload,
3927 : &greedy_loop_state,
3928 11669 : text_length);
3929 : } else {
3930 : // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3931 : // match the traces produced pre-cleanup.
3932 201908 : Label second_choice;
3933 201908 : compiler->macro_assembler()->Bind(&second_choice);
3934 :
3935 201908 : preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3936 :
3937 : EmitChoices(compiler,
3938 : &alt_gens,
3939 : 0,
3940 : trace,
3941 201908 : &preload);
3942 : }
3943 :
3944 : // At this point we need to generate slow checks for the alternatives where
3945 : // the quick check was inlined. We can recognize these because the associated
3946 : // label was bound.
3947 213577 : int new_flush_budget = trace->flush_budget() / choice_count;
3948 1446471 : for (int i = 0; i < choice_count; i++) {
3949 : AlternativeGeneration* alt_gen = alt_gens.at(i);
3950 616447 : Trace new_trace(*trace);
3951 : // If there are actions to be flushed we have to limit how many times
3952 : // they are flushed. Take the budget of the parent trace and distribute
3953 : // it fairly amongst the children.
3954 616447 : if (new_trace.actions() != nullptr) {
3955 : new_trace.set_flush_budget(new_flush_budget);
3956 : }
3957 : bool next_expects_preload =
3958 1019317 : i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3959 616447 : EmitOutOfLineContinuation(compiler,
3960 : &new_trace,
3961 616447 : alternatives_->at(i),
3962 : alt_gen,
3963 : preload.preload_characters_,
3964 616447 : next_expects_preload);
3965 : }
3966 : }
3967 :
3968 :
3969 11669 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3970 : Trace* trace,
3971 : AlternativeGenerationList* alt_gens,
3972 : PreloadState* preload,
3973 : GreedyLoopState* greedy_loop_state,
3974 : int text_length) {
3975 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3976 : // Here we have special handling for greedy loops containing only text nodes
3977 : // and other simple nodes. These are handled by pushing the current
3978 : // position on the stack and then incrementing the current position each
3979 : // time around the switch. On backtrack we decrement the current position
3980 : // and check it against the pushed value. This avoids pushing backtrack
3981 : // information for each iteration of the loop, which could take up a lot of
3982 : // space.
3983 : DCHECK(trace->stop_node() == nullptr);
3984 11669 : macro_assembler->PushCurrentPosition();
3985 11669 : Label greedy_match_failed;
3986 : Trace greedy_match_trace;
3987 11669 : if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3988 : greedy_match_trace.set_backtrack(&greedy_match_failed);
3989 11669 : Label loop_label;
3990 11669 : macro_assembler->Bind(&loop_label);
3991 11669 : greedy_match_trace.set_stop_node(this);
3992 : greedy_match_trace.set_loop_label(&loop_label);
3993 11669 : alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3994 11669 : macro_assembler->Bind(&greedy_match_failed);
3995 :
3996 11669 : Label second_choice; // For use in greedy matches.
3997 11669 : macro_assembler->Bind(&second_choice);
3998 :
3999 : Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4000 :
4001 : EmitChoices(compiler,
4002 : alt_gens,
4003 : 1,
4004 : new_trace,
4005 11669 : preload);
4006 :
4007 23338 : macro_assembler->Bind(greedy_loop_state->label());
4008 : // If we have unwound to the bottom then backtrack.
4009 11669 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4010 : // Otherwise try the second priority at an earlier position.
4011 11669 : macro_assembler->AdvanceCurrentPosition(-text_length);
4012 11669 : macro_assembler->GoTo(&second_choice);
4013 11669 : return new_trace;
4014 : }
4015 :
4016 201908 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4017 : Trace* trace) {
4018 : int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4019 403816 : if (alternatives_->length() != 2) return eats_at_least;
4020 :
4021 165798 : GuardedAlternative alt1 = alternatives_->at(1);
4022 167218 : if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4023 : return eats_at_least;
4024 : }
4025 : RegExpNode* eats_anything_node = alt1.node();
4026 164378 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4027 : return eats_at_least;
4028 : }
4029 :
4030 : // Really we should be creating a new trace when we execute this function,
4031 : // but there is no need, because the code it generates cannot backtrack, and
4032 : // we always arrive here with a trivial trace (since it's the entry to a
4033 : // loop. That also implies that there are no preloaded characters, which is
4034 : // good, because it means we won't be violating any assumptions by
4035 : // overwriting those characters with new load instructions.
4036 : DCHECK(trace->is_trivial());
4037 :
4038 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4039 : Isolate* isolate = macro_assembler->isolate();
4040 : // At this point we know that we are at a non-greedy loop that will eat
4041 : // any character one at a time. Any non-anchored regexp has such a
4042 : // loop prepended to it in order to find where it starts. We look for
4043 : // a pattern of the form ...abc... where we can look 6 characters ahead
4044 : // and step forwards 3 if the character is not one of abc. Abc need
4045 : // not be atoms, they can be any reasonably limited character class or
4046 : // small alternation.
4047 : BoyerMooreLookahead* bm = bm_info(false);
4048 82145 : if (bm == nullptr) {
4049 82145 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4050 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4051 : kRecursionBudget,
4052 82145 : false));
4053 82145 : if (eats_at_least >= 1) {
4054 : bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4055 : compiler,
4056 80634 : zone());
4057 80634 : GuardedAlternative alt0 = alternatives_->at(0);
4058 80634 : alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4059 : }
4060 : }
4061 82145 : if (bm != nullptr) {
4062 80634 : bm->EmitSkipInstructions(macro_assembler);
4063 : }
4064 : return eats_at_least;
4065 : }
4066 :
4067 :
4068 213577 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4069 : AlternativeGenerationList* alt_gens,
4070 : int first_choice,
4071 : Trace* trace,
4072 : PreloadState* preload) {
4073 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4074 213577 : SetUpPreLoad(compiler, trace, preload);
4075 :
4076 : // For now we just call all choices one after the other. The idea ultimately
4077 : // is to use the Dispatch table to try only the relevant ones.
4078 213577 : int choice_count = alternatives_->length();
4079 :
4080 213577 : int new_flush_budget = trace->flush_budget() / choice_count;
4081 :
4082 1423133 : for (int i = first_choice; i < choice_count; i++) {
4083 604778 : bool is_last = i == choice_count - 1;
4084 604778 : bool fall_through_on_failure = !is_last;
4085 1209556 : GuardedAlternative alternative = alternatives_->at(i);
4086 : AlternativeGeneration* alt_gen = alt_gens->at(i);
4087 604778 : alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4088 : ZoneList<Guard*>* guards = alternative.guards();
4089 604778 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4090 604778 : Trace new_trace(*trace);
4091 604778 : new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4092 : preload->preload_characters_ :
4093 : 0);
4094 604778 : if (preload->preload_has_checked_bounds_) {
4095 399376 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4096 : }
4097 : new_trace.quick_check_performed()->Clear();
4098 604778 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4099 604778 : if (!is_last) {
4100 391201 : new_trace.set_backtrack(&alt_gen->after);
4101 : }
4102 604778 : alt_gen->expects_preload = preload->preload_is_current_;
4103 : bool generate_full_check_inline = false;
4104 1082993 : if (compiler->optimize() &&
4105 1080191 : try_to_emit_quick_check_for_alternative(i == 0) &&
4106 950826 : alternative.node()->EmitQuickCheck(
4107 475413 : compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4108 : &alt_gen->possible_success, &alt_gen->quick_check_details,
4109 : fall_through_on_failure)) {
4110 : // Quick check was generated for this choice.
4111 226208 : preload->preload_is_current_ = true;
4112 226208 : preload->preload_has_checked_bounds_ = true;
4113 : // If we generated the quick check to fall through on possible success,
4114 : // we now need to generate the full check inline.
4115 226208 : if (!fall_through_on_failure) {
4116 34416 : macro_assembler->Bind(&alt_gen->possible_success);
4117 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4118 34416 : new_trace.set_characters_preloaded(preload->preload_characters_);
4119 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4120 : generate_full_check_inline = true;
4121 : }
4122 378570 : } else if (alt_gen->quick_check_details.cannot_match()) {
4123 110 : if (!fall_through_on_failure) {
4124 22 : macro_assembler->GoTo(trace->backtrack());
4125 : }
4126 110 : continue;
4127 : } else {
4128 : // No quick check was generated. Put the full code here.
4129 : // If this is not the first choice then there could be slow checks from
4130 : // previous cases that go here when they fail. There's no reason to
4131 : // insist that they preload characters since the slow check we are about
4132 : // to generate probably can't use it.
4133 378460 : if (i != first_choice) {
4134 227385 : alt_gen->expects_preload = false;
4135 : new_trace.InvalidateCurrentCharacter();
4136 : }
4137 : generate_full_check_inline = true;
4138 : }
4139 604668 : if (generate_full_check_inline) {
4140 412876 : if (new_trace.actions() != nullptr) {
4141 : new_trace.set_flush_budget(new_flush_budget);
4142 : }
4143 417828 : for (int j = 0; j < guard_count; j++) {
4144 2476 : GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4145 : }
4146 412876 : alternative.node()->Emit(compiler, &new_trace);
4147 412876 : preload->preload_is_current_ = false;
4148 : }
4149 604668 : macro_assembler->Bind(&alt_gen->after);
4150 : }
4151 213577 : }
4152 :
4153 :
4154 616447 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4155 : Trace* trace,
4156 : GuardedAlternative alternative,
4157 : AlternativeGeneration* alt_gen,
4158 : int preload_characters,
4159 : bool next_expects_preload) {
4160 1041102 : if (!alt_gen->possible_success.is_linked()) return;
4161 :
4162 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4163 191792 : macro_assembler->Bind(&alt_gen->possible_success);
4164 191792 : Trace out_of_line_trace(*trace);
4165 : out_of_line_trace.set_characters_preloaded(preload_characters);
4166 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4167 191792 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4168 : ZoneList<Guard*>* guards = alternative.guards();
4169 191792 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4170 191792 : if (next_expects_preload) {
4171 162190 : Label reload_current_char;
4172 : out_of_line_trace.set_backtrack(&reload_current_char);
4173 164958 : for (int j = 0; j < guard_count; j++) {
4174 1384 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4175 : }
4176 162190 : alternative.node()->Emit(compiler, &out_of_line_trace);
4177 162190 : macro_assembler->Bind(&reload_current_char);
4178 : // Reload the current character, since the next quick check expects that.
4179 : // We don't need to check bounds here because we only get into this
4180 : // code through a quick check which already did the checked load.
4181 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4182 162190 : preload_characters);
4183 162190 : macro_assembler->GoTo(&(alt_gen->after));
4184 : } else {
4185 29602 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4186 29754 : for (int j = 0; j < guard_count; j++) {
4187 76 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4188 : }
4189 29602 : alternative.node()->Emit(compiler, &out_of_line_trace);
4190 : }
4191 : }
4192 :
4193 :
4194 494079 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4195 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4196 494079 : LimitResult limit_result = LimitVersions(compiler, trace);
4197 494079 : if (limit_result == DONE) return;
4198 : DCHECK(limit_result == CONTINUE);
4199 :
4200 : RecursionCheck rc(compiler);
4201 :
4202 265583 : switch (action_type_) {
4203 : case STORE_POSITION: {
4204 : Trace::DeferredCapture
4205 : new_capture(data_.u_position_register.reg,
4206 241131 : data_.u_position_register.is_capture,
4207 241131 : trace);
4208 241131 : Trace new_trace = *trace;
4209 : new_trace.add_action(&new_capture);
4210 241131 : on_success()->Emit(compiler, &new_trace);
4211 : break;
4212 : }
4213 : case INCREMENT_REGISTER: {
4214 : Trace::DeferredIncrementRegister
4215 3744 : new_increment(data_.u_increment_register.reg);
4216 3744 : Trace new_trace = *trace;
4217 : new_trace.add_action(&new_increment);
4218 3744 : on_success()->Emit(compiler, &new_trace);
4219 : break;
4220 : }
4221 : case SET_REGISTER: {
4222 : Trace::DeferredSetRegister
4223 3465 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4224 3465 : Trace new_trace = *trace;
4225 : new_trace.add_action(&new_set);
4226 3465 : on_success()->Emit(compiler, &new_trace);
4227 : break;
4228 : }
4229 : case CLEAR_CAPTURES: {
4230 : Trace::DeferredClearCaptures
4231 : new_capture(Interval(data_.u_clear_captures.range_from,
4232 2226 : data_.u_clear_captures.range_to));
4233 2226 : Trace new_trace = *trace;
4234 : new_trace.add_action(&new_capture);
4235 2226 : on_success()->Emit(compiler, &new_trace);
4236 : break;
4237 : }
4238 : case BEGIN_SUBMATCH:
4239 9478 : if (!trace->is_trivial()) {
4240 5041 : trace->Flush(compiler, this);
4241 : } else {
4242 4437 : assembler->WriteCurrentPositionToRegister(
4243 8874 : data_.u_submatch.current_position_register, 0);
4244 4437 : assembler->WriteStackPointerToRegister(
4245 8874 : data_.u_submatch.stack_pointer_register);
4246 4437 : on_success()->Emit(compiler, trace);
4247 : }
4248 : break;
4249 : case EMPTY_MATCH_CHECK: {
4250 973 : int start_pos_reg = data_.u_empty_match_check.start_register;
4251 973 : int stored_pos = 0;
4252 973 : int rep_reg = data_.u_empty_match_check.repetition_register;
4253 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4254 973 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4255 973 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4256 : // If we know we haven't advanced and there is no minimum we
4257 : // can just backtrack immediately.
4258 76 : assembler->GoTo(trace->backtrack());
4259 897 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4260 : // If we know we've advanced we can generate the continuation
4261 : // immediately.
4262 247 : on_success()->Emit(compiler, trace);
4263 650 : } else if (!trace->is_trivial()) {
4264 339 : trace->Flush(compiler, this);
4265 : } else {
4266 311 : Label skip_empty_check;
4267 : // If we have a minimum number of repetitions we check the current
4268 : // number first and skip the empty check if it's not enough.
4269 311 : if (has_minimum) {
4270 206 : int limit = data_.u_empty_match_check.repetition_limit;
4271 206 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4272 : }
4273 : // If the match is empty we bail out, otherwise we fall through
4274 : // to the on-success continuation.
4275 311 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4276 622 : trace->backtrack());
4277 311 : assembler->Bind(&skip_empty_check);
4278 311 : on_success()->Emit(compiler, trace);
4279 : }
4280 : break;
4281 : }
4282 : case POSITIVE_SUBMATCH_SUCCESS: {
4283 4566 : if (!trace->is_trivial()) {
4284 2936 : trace->Flush(compiler, this);
4285 2936 : return;
4286 : }
4287 1630 : assembler->ReadCurrentPositionFromRegister(
4288 3260 : data_.u_submatch.current_position_register);
4289 1630 : assembler->ReadStackPointerFromRegister(
4290 3260 : data_.u_submatch.stack_pointer_register);
4291 1630 : int clear_register_count = data_.u_submatch.clear_register_count;
4292 1630 : if (clear_register_count == 0) {
4293 1147 : on_success()->Emit(compiler, trace);
4294 1147 : return;
4295 : }
4296 483 : int clear_registers_from = data_.u_submatch.clear_register_from;
4297 483 : Label clear_registers_backtrack;
4298 483 : Trace new_trace = *trace;
4299 : new_trace.set_backtrack(&clear_registers_backtrack);
4300 483 : on_success()->Emit(compiler, &new_trace);
4301 :
4302 483 : assembler->Bind(&clear_registers_backtrack);
4303 483 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4304 483 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4305 :
4306 : DCHECK(trace->backtrack() == nullptr);
4307 483 : assembler->Backtrack();
4308 483 : return;
4309 : }
4310 : default:
4311 0 : UNREACHABLE();
4312 : }
4313 : }
4314 :
4315 :
4316 4743 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4317 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4318 4743 : if (!trace->is_trivial()) {
4319 2253 : trace->Flush(compiler, this);
4320 2253 : return;
4321 : }
4322 :
4323 2490 : LimitResult limit_result = LimitVersions(compiler, trace);
4324 2490 : if (limit_result == DONE) return;
4325 : DCHECK(limit_result == CONTINUE);
4326 :
4327 : RecursionCheck rc(compiler);
4328 :
4329 : DCHECK_EQ(start_reg_ + 1, end_reg_);
4330 2290 : if (IgnoreCase(flags_)) {
4331 1681 : assembler->CheckNotBackReferenceIgnoreCase(
4332 3362 : start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4333 : } else {
4334 609 : assembler->CheckNotBackReference(start_reg_, read_backward(),
4335 1218 : trace->backtrack());
4336 : }
4337 : // We are going to advance backward, so we may end up at the start.
4338 2290 : if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4339 :
4340 : // Check that the back reference does not end inside a surrogate pair.
4341 2290 : if (IsUnicode(flags_) && !compiler->one_byte()) {
4342 80 : assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4343 : }
4344 2290 : on_success()->Emit(compiler, trace);
4345 : }
4346 :
4347 :
4348 : // -------------------------------------------------------------------
4349 : // Dot/dotty output
4350 :
4351 :
4352 : #ifdef DEBUG
4353 :
4354 :
4355 : class DotPrinter: public NodeVisitor {
4356 : public:
4357 : DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4358 : : os_(os),
4359 : ignore_case_(ignore_case) {}
4360 : void PrintNode(const char* label, RegExpNode* node);
4361 : void Visit(RegExpNode* node);
4362 : void PrintAttributes(RegExpNode* from);
4363 : void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4364 : #define DECLARE_VISIT(Type) \
4365 : virtual void Visit##Type(Type##Node* that);
4366 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4367 : #undef DECLARE_VISIT
4368 : private:
4369 : std::ostream& os_;
4370 : bool ignore_case_;
4371 : };
4372 :
4373 :
4374 : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4375 : os_ << "digraph G {\n graph [label=\"";
4376 : for (int i = 0; label[i]; i++) {
4377 : switch (label[i]) {
4378 : case '\\':
4379 : os_ << "\\\\";
4380 : break;
4381 : case '"':
4382 : os_ << "\"";
4383 : break;
4384 : default:
4385 : os_ << label[i];
4386 : break;
4387 : }
4388 : }
4389 : os_ << "\"];\n";
4390 : Visit(node);
4391 : os_ << "}" << std::endl;
4392 : }
4393 :
4394 :
4395 : void DotPrinter::Visit(RegExpNode* node) {
4396 : if (node->info()->visited) return;
4397 : node->info()->visited = true;
4398 : node->Accept(this);
4399 : }
4400 :
4401 :
4402 : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4403 : os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4404 : Visit(on_failure);
4405 : }
4406 :
4407 :
4408 : class TableEntryBodyPrinter {
4409 : public:
4410 : TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4411 : : os_(os),
4412 : choice_(choice) {}
4413 : void Call(uc16 from, DispatchTable::Entry entry) {
4414 : OutSet* out_set = entry.out_set();
4415 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4416 : if (out_set->Get(i)) {
4417 : os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4418 : << choice()->alternatives()->at(i).node() << ";\n";
4419 : }
4420 : }
4421 : }
4422 : private:
4423 : ChoiceNode* choice() { return choice_; }
4424 : std::ostream& os_;
4425 : ChoiceNode* choice_;
4426 : };
4427 :
4428 :
4429 : class TableEntryHeaderPrinter {
4430 : public:
4431 : explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4432 : : first_(true),
4433 : os_(os) {}
4434 : void Call(uc16 from, DispatchTable::Entry entry) {
4435 : if (first_) {
4436 : first_ = false;
4437 : } else {
4438 : os_ << "|";
4439 : }
4440 : os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4441 : OutSet* out_set = entry.out_set();
4442 : int priority = 0;
4443 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4444 : if (out_set->Get(i)) {
4445 : if (priority > 0) os_ << "|";
4446 : os_ << "<s" << from << "o" << i << "> " << priority;
4447 : priority++;
4448 : }
4449 : }
4450 : os_ << "}}";
4451 : }
4452 :
4453 : private:
4454 : bool first_;
4455 : std::ostream& os_;
4456 : };
4457 :
4458 :
// Helper that prints "|"-separated attribute fields of the form {name} or
// {name|value} to a stream.
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Writes "|" before every field except the first one.
  void PrintSeparator() {
    if (!first_) os_ << "|";
    first_ = false;
  }

  // Prints {name} when |value| is set; prints nothing otherwise.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Prints {name|value} when |value| is not negative; prints nothing
  // otherwise.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
4486 :
4487 :
4488 : void DotPrinter::PrintAttributes(RegExpNode* that) {
4489 : os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4490 : << "margin=0.1, fontsize=10, label=\"{";
4491 : AttributePrinter printer(os_);
4492 : NodeInfo* info = that->info();
4493 : printer.PrintBit("NI", info->follows_newline_interest);
4494 : printer.PrintBit("WI", info->follows_word_interest);
4495 : printer.PrintBit("SI", info->follows_start_interest);
4496 : Label* label = that->label();
4497 : if (label->is_bound())
4498 : printer.PrintPositive("@", label->pos());
4499 : os_ << "}\"];\n"
4500 : << " a" << that << " -> n" << that
4501 : << " [style=dashed, color=grey, arrowhead=none];\n";
4502 : }
4503 :
4504 :
4505 : static const bool kPrintDispatchTable = false;
4506 : void DotPrinter::VisitChoice(ChoiceNode* that) {
4507 : if (kPrintDispatchTable) {
4508 : os_ << " n" << that << " [shape=Mrecord, label=\"";
4509 : TableEntryHeaderPrinter header_printer(os_);
4510 : that->GetTable(ignore_case_)->ForEach(&header_printer);
4511 : os_ << "\"]\n";
4512 : PrintAttributes(that);
4513 : TableEntryBodyPrinter body_printer(os_, that);
4514 : that->GetTable(ignore_case_)->ForEach(&body_printer);
4515 : } else {
4516 : os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4517 : for (int i = 0; i < that->alternatives()->length(); i++) {
4518 : GuardedAlternative alt = that->alternatives()->at(i);
4519 : os_ << " n" << that << " -> n" << alt.node();
4520 : }
4521 : }
4522 : for (int i = 0; i < that->alternatives()->length(); i++) {
4523 : GuardedAlternative alt = that->alternatives()->at(i);
4524 : alt.node()->Accept(this);
4525 : }
4526 : }
4527 :
4528 :
4529 : void DotPrinter::VisitText(TextNode* that) {
4530 : Zone* zone = that->zone();
4531 : os_ << " n" << that << " [label=\"";
4532 : for (int i = 0; i < that->elements()->length(); i++) {
4533 : if (i > 0) os_ << " ";
4534 : TextElement elm = that->elements()->at(i);
4535 : switch (elm.text_type()) {
4536 : case TextElement::ATOM: {
4537 : Vector<const uc16> data = elm.atom()->data();
4538 : for (int i = 0; i < data.length(); i++) {
4539 : os_ << static_cast<char>(data[i]);
4540 : }
4541 : break;
4542 : }
4543 : case TextElement::CHAR_CLASS: {
4544 : RegExpCharacterClass* node = elm.char_class();
4545 : os_ << "[";
4546 : if (node->is_negated()) os_ << "^";
4547 : for (int j = 0; j < node->ranges(zone)->length(); j++) {
4548 : CharacterRange range = node->ranges(zone)->at(j);
4549 : os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4550 : }
4551 : os_ << "]";
4552 : break;
4553 : }
4554 : default:
4555 : UNREACHABLE();
4556 : }
4557 : }
4558 : os_ << "\", shape=box, peripheries=2];\n";
4559 : PrintAttributes(that);
4560 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4561 : Visit(that->on_success());
4562 : }
4563 :
4564 :
4565 : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4566 : os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4567 : << that->end_register() << "\", shape=doubleoctagon];\n";
4568 : PrintAttributes(that);
4569 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4570 : Visit(that->on_success());
4571 : }
4572 :
4573 :
4574 : void DotPrinter::VisitEnd(EndNode* that) {
4575 : os_ << " n" << that << " [style=bold, shape=point];\n";
4576 : PrintAttributes(that);
4577 : }
4578 :
4579 :
4580 : void DotPrinter::VisitAssertion(AssertionNode* that) {
4581 : os_ << " n" << that << " [";
4582 : switch (that->assertion_type()) {
4583 : case AssertionNode::AT_END:
4584 : os_ << "label=\"$\", shape=septagon";
4585 : break;
4586 : case AssertionNode::AT_START:
4587 : os_ << "label=\"^\", shape=septagon";
4588 : break;
4589 : case AssertionNode::AT_BOUNDARY:
4590 : os_ << "label=\"\\b\", shape=septagon";
4591 : break;
4592 : case AssertionNode::AT_NON_BOUNDARY:
4593 : os_ << "label=\"\\B\", shape=septagon";
4594 : break;
4595 : case AssertionNode::AFTER_NEWLINE:
4596 : os_ << "label=\"(?<=\\n)\", shape=septagon";
4597 : break;
4598 : }
4599 : os_ << "];\n";
4600 : PrintAttributes(that);
4601 : RegExpNode* successor = that->on_success();
4602 : os_ << " n" << that << " -> n" << successor << ";\n";
4603 : Visit(successor);
4604 : }
4605 :
4606 :
4607 : void DotPrinter::VisitAction(ActionNode* that) {
4608 : os_ << " n" << that << " [";
4609 : switch (that->action_type_) {
4610 : case ActionNode::SET_REGISTER:
4611 : os_ << "label=\"$" << that->data_.u_store_register.reg
4612 : << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4613 : break;
4614 : case ActionNode::INCREMENT_REGISTER:
4615 : os_ << "label=\"$" << that->data_.u_increment_register.reg
4616 : << "++\", shape=octagon";
4617 : break;
4618 : case ActionNode::STORE_POSITION:
4619 : os_ << "label=\"$" << that->data_.u_position_register.reg
4620 : << ":=$pos\", shape=octagon";
4621 : break;
4622 : case ActionNode::BEGIN_SUBMATCH:
4623 : os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4624 : << ":=$pos,begin\", shape=septagon";
4625 : break;
4626 : case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4627 : os_ << "label=\"escape\", shape=septagon";
4628 : break;
4629 : case ActionNode::EMPTY_MATCH_CHECK:
4630 : os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4631 : << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4632 : << "<" << that->data_.u_empty_match_check.repetition_limit
4633 : << "?\", shape=septagon";
4634 : break;
4635 : case ActionNode::CLEAR_CAPTURES: {
4636 : os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4637 : << " to $" << that->data_.u_clear_captures.range_to
4638 : << "\", shape=septagon";
4639 : break;
4640 : }
4641 : }
4642 : os_ << "];\n";
4643 : PrintAttributes(that);
4644 : RegExpNode* successor = that->on_success();
4645 : os_ << " n" << that << " -> n" << successor << ";\n";
4646 : Visit(successor);
4647 : }
4648 :
4649 :
4650 : class DispatchTableDumper {
4651 : public:
4652 : explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4653 : void Call(uc16 key, DispatchTable::Entry entry);
4654 : private:
4655 : std::ostream& os_;
4656 : };
4657 :
4658 :
4659 : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4660 : os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4661 : OutSet* set = entry.out_set();
4662 : bool first = true;
4663 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4664 : if (set->Get(i)) {
4665 : if (first) {
4666 : first = false;
4667 : } else {
4668 : os_ << ", ";
4669 : }
4670 : os_ << i;
4671 : }
4672 : }
4673 : os_ << "}\n";
4674 : }
4675 :
4676 :
4677 : void DispatchTable::Dump() {
4678 : OFStream os(stderr);
4679 : DispatchTableDumper dumper(os);
4680 : tree()->ForEach(&dumper);
4681 : }
4682 :
4683 :
4684 : void RegExpEngine::DotPrint(const char* label,
4685 : RegExpNode* node,
4686 : bool ignore_case) {
4687 : StdoutStream os;
4688 : DotPrinter printer(os, ignore_case);
4689 : printer.PrintNode(label, node);
4690 : }
4691 :
4692 :
4693 : #endif // DEBUG
4694 :
4695 :
4696 : // -------------------------------------------------------------------
4697 : // Tree to graph conversion
4698 :
4699 992591 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4700 : RegExpNode* on_success) {
4701 : ZoneList<TextElement>* elms =
4702 : new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4703 992591 : elms->Add(TextElement::Atom(this), compiler->zone());
4704 : return new (compiler->zone())
4705 992591 : TextNode(elms, compiler->read_backward(), on_success);
4706 : }
4707 :
4708 :
4709 18669 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4710 : RegExpNode* on_success) {
4711 : return new (compiler->zone())
4712 18669 : TextNode(elements(), compiler->read_backward(), on_success);
4713 : }
4714 :
4715 :
4716 552541 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4717 : const int* special_class,
4718 : int length) {
4719 552541 : length--; // Remove final marker.
4720 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4721 : DCHECK_NE(0, ranges->length());
4722 : DCHECK_NE(0, length);
4723 : DCHECK_NE(0, special_class[0]);
4724 552541 : if (ranges->length() != (length >> 1) + 1) {
4725 : return false;
4726 : }
4727 9969 : CharacterRange range = ranges->at(0);
4728 9969 : if (range.from() != 0) {
4729 : return false;
4730 : }
4731 56744 : for (int i = 0; i < length; i += 2) {
4732 24859 : if (special_class[i] != (range.to() + 1)) {
4733 : return false;
4734 : }
4735 48608 : range = ranges->at((i >> 1) + 1);
4736 24304 : if (special_class[i+1] != range.from()) {
4737 : return false;
4738 : }
4739 : }
4740 7581 : if (range.to() != String::kMaxCodePoint) {
4741 : return false;
4742 : }
4743 7581 : return true;
4744 : }
4745 :
4746 :
4747 547157 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4748 : const int* special_class,
4749 : int length) {
4750 547157 : length--; // Remove final marker.
4751 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4752 547157 : if (ranges->length() * 2 != length) {
4753 : return false;
4754 : }
4755 27760 : for (int i = 0; i < length; i += 2) {
4756 29336 : CharacterRange range = ranges->at(i >> 1);
4757 25883 : if (range.from() != special_class[i] ||
4758 11215 : range.to() != special_class[i + 1] - 1) {
4759 : return false;
4760 : }
4761 : }
4762 : return true;
4763 : }
4764 :
4765 :
4766 196711 : bool RegExpCharacterClass::is_standard(Zone* zone) {
4767 : // TODO(lrn): Remove need for this function, by not throwing away information
4768 : // along the way.
4769 196711 : if (is_negated()) {
4770 : return false;
4771 : }
4772 191054 : if (set_.is_standard()) {
4773 : return true;
4774 : }
4775 187773 : if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4776 : set_.set_standard_set_type('s');
4777 607 : return true;
4778 : }
4779 187166 : if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4780 : set_.set_standard_set_type('S');
4781 207 : return true;
4782 : }
4783 186959 : if (CompareInverseRanges(set_.ranges(zone),
4784 : kLineTerminatorRanges,
4785 : kLineTerminatorRangeCount)) {
4786 : set_.set_standard_set_type('.');
4787 7262 : return true;
4788 : }
4789 179697 : if (CompareRanges(set_.ranges(zone),
4790 : kLineTerminatorRanges,
4791 : kLineTerminatorRangeCount)) {
4792 : set_.set_standard_set_type('n');
4793 10 : return true;
4794 : }
4795 179687 : if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4796 : set_.set_standard_set_type('w');
4797 1271 : return true;
4798 : }
4799 178416 : if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4800 : set_.set_standard_set_type('W');
4801 112 : return true;
4802 : }
4803 : return false;
4804 : }
4805 :
4806 :
4807 2587 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4808 : ZoneList<CharacterRange>* base)
4809 : : zone_(zone),
4810 : table_(zone),
4811 : bmp_(nullptr),
4812 : lead_surrogates_(nullptr),
4813 : trail_surrogates_(nullptr),
4814 5174 : non_bmp_(nullptr) {
4815 : // The unicode range splitter categorizes given character ranges into:
4816 : // - Code points from the BMP representable by one code unit.
4817 : // - Code points outside the BMP that need to be split into surrogate pairs.
4818 : // - Lone lead surrogates.
4819 : // - Lone trail surrogates.
4820 : // Lone surrogates are valid code points, even though no actual characters.
4821 : // They require special matching to make sure we do not split surrogate pairs.
4822 : // We use the dispatch table to accomplish this. The base range is split up
4823 : // by the table by the overlay ranges, and the Call callback is used to
4824 : // filter and collect ranges for each category.
4825 151143 : for (int i = 0; i < base->length(); i++) {
4826 148556 : table_.AddRange(base->at(i), kBase, zone_);
4827 : }
4828 : // Add overlay ranges.
4829 2587 : table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4830 2587 : kBmpCodePoints, zone_);
4831 2587 : table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4832 2587 : kLeadSurrogates, zone_);
4833 2587 : table_.AddRange(
4834 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4835 2587 : kTrailSurrogates, zone_);
4836 2587 : table_.AddRange(
4837 : CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4838 2587 : kBmpCodePoints, zone_);
4839 2587 : table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4840 2587 : kNonBmpCodePoints, zone_);
4841 : table_.ForEach(this);
4842 2587 : }
4843 :
4844 :
// Dispatch-table visitor callback (the constructor passes |this| to
// table_.ForEach). Entries whose out-set lacks kBase are skipped; every other
// entry's [from, to] range is appended to the list that corresponds to the
// overlay category found in its out-set.
// NOTE: the |from| parameter is not read here; the range bounds are taken
// from |entry|.
4845 159101 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4846 : OutSet* outset = entry.out_set();
4847 159101 : if (!outset->Get(kBase)) return;
4848 : ZoneList<CharacterRange>** target = nullptr;
4849 78283 : if (outset->Get(kBmpCodePoints)) {
4850 50718 : target = &bmp_;
4851 27565 : } else if (outset->Get(kLeadSurrogates)) {
4852 1175 : target = &lead_surrogates_;
4853 26390 : } else if (outset->Get(kTrailSurrogates)) {
4854 1175 : target = &trail_surrogates_;
4855 : } else {
4856 : DCHECK(outset->Get(kNonBmpCodePoints));
4857 25215 : target = &non_bmp_;
4858 : }
// Allocate the category list lazily, on the first range that falls into it.
4859 78283 : if (*target == nullptr)
4860 18201 : *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4861 78283 : (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4862 : }
4863 :
// Adds a single alternative to |result| that matches the splitter's BMP
// ranges and continues at |on_success|. Does nothing if the splitter
// collected no BMP ranges. The text node is created with default flags.
4864 2582 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4865 : RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4866 : ZoneList<CharacterRange>* bmp = splitter->bmp();
4867 3037 : if (bmp == nullptr) return;
4868 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4869 2127 : result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4870 : compiler->zone(), bmp, compiler->read_backward(), on_success,
4871 : default_flags)));
4872 : }
4873 :
// Adds alternatives to |result| that match the splitter's non-BMP ranges as
// lead/trail surrogate pairs. A range whose endpoints share a lead surrogate
// yields one alternative; otherwise it yields up to three (partial first
// lead, partial last lead, and the block of full leads in between).
// Does nothing if the splitter collected no non-BMP ranges.
4874 2582 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4875 : RegExpNode* on_success,
4876 : UnicodeRangeSplitter* splitter) {
4877 : ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4878 3584 : if (non_bmp == nullptr) return;
4879 : DCHECK(!compiler->one_byte());
4880 : Zone* zone = compiler->zone();
4881 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4882 1580 : CharacterRange::Canonicalize(non_bmp);
4883 52000 : for (int i = 0; i < non_bmp->length(); i++) {
4884 : // Match surrogate pair.
4885 : // E.g. [\u{10005}-\u{11005}] becomes
4886 : // \ud800[\udc05-\udfff]|
4887 : // [\ud801-\ud803][\udc00-\udfff]|
4888 : // \ud804[\udc00-\udc05]
4889 : uc32 from = non_bmp->at(i).from();
4890 : uc32 to = non_bmp->at(i).to();
4891 25210 : uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4892 : uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4893 25210 : uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4894 : uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4895 25210 : if (from_l == to_l) {
4896 : // The lead surrogate is the same.
4897 : result->AddAlternative(
4898 22930 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4899 : zone, CharacterRange::Singleton(from_l),
4900 : CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4901 : on_success, default_flags)));
4902 : } else {
// If the range starts mid-way through the first lead's trail block, emit
// that partial block separately and exclude the lead from the middle part.
4903 2280 : if (from_t != kTrailSurrogateStart) {
4904 : // Add [from_l][from_t-\udfff]
4905 : result->AddAlternative(
4906 1155 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4907 : zone, CharacterRange::Singleton(from_l),
4908 : CharacterRange::Range(from_t, kTrailSurrogateEnd),
4909 : compiler->read_backward(), on_success, default_flags)));
4910 1155 : from_l++;
4911 : }
// Likewise for a range that ends mid-way through the last lead's block.
4912 2280 : if (to_t != kTrailSurrogateEnd) {
4913 : // Add [to_l][\udc00-to_t]
4914 : result->AddAlternative(
4915 895 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4916 : zone, CharacterRange::Singleton(to_l),
4917 : CharacterRange::Range(kTrailSurrogateStart, to_t),
4918 : compiler->read_backward(), on_success, default_flags)));
4919 895 : to_l--;
4920 : }
// Whatever leads remain (possibly none) are paired with the full trail range.
4921 2280 : if (from_l <= to_l) {
4922 : // Add [from_l-to_l][\udc00-\udfff]
4923 : result->AddAlternative(
4924 2090 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4925 : zone, CharacterRange::Range(from_l, to_l),
4926 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4927 : compiler->read_backward(), on_success, default_flags)));
4928 : }
4929 : }
4930 : }
4931 : }
4932 :
// Builds a node that wraps a lookaround over |lookbehind|, read in the
// direction opposite to |read_backward|, in front of a node that matches
// |match| in the read direction and then continues at |on_success|.
// NOTE(review): the first Builder argument (false) presumably selects a
// negative lookaround, as the function name suggests - confirm against
// RegExpLookaround::Builder.
4933 1175 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4934 : RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4935 : ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4936 : JSRegExp::Flags flags) {
4937 : Zone* zone = compiler->zone();
// The node that runs once the lookaround has been passed.
4938 1175 : RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4939 1175 : zone, match, read_backward, on_success, flags);
4940 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4941 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4942 : RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4943 1175 : position_register);
// The lookaround body reads against the read direction (!read_backward).
4944 1175 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4945 2350 : zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4946 1175 : return lookaround.ForMatch(negative_match);
4947 : }
4948 :
// Builds a node that matches |match| in the read direction, followed by a
// lookaround over |lookahead| in the same direction, and then continues at
// |on_success|.
// NOTE(review): the first Builder argument (false) presumably selects a
// negative lookaround, as the function name suggests - confirm against
// RegExpLookaround::Builder.
4949 1165 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4950 : RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4951 : ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4952 : bool read_backward, JSRegExp::Flags flags) {
4953 : Zone* zone = compiler->zone();
4954 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4955 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4956 : RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4957 1165 : position_register);
4958 1165 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4959 1165 : zone, lookahead, read_backward, lookaround.on_match_success(), flags);
// The match node comes first; its continuation is the lookaround.
4960 1165 : return TextNode::CreateForCharacterRanges(
4961 1165 : zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4962 : }
4963 :
// Adds one alternative to |result| that matches the splitter's lead
// surrogate ranges only where they are not followed by a trail surrogate.
// Does nothing if the splitter collected no lead surrogate ranges.
4964 2582 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4965 : RegExpNode* on_success,
4966 : UnicodeRangeSplitter* splitter) {
4967 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4968 : ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4969 3994 : if (lead_surrogates == nullptr) return;
4970 : Zone* zone = compiler->zone();
4971 : // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4972 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4973 1170 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4974 :
4975 : RegExpNode* match;
4976 1170 : if (compiler->read_backward()) {
4977 : // Reading backward. Assert that reading forward, there is no trail
4978 : // surrogate, and then backward match the lead surrogate.
4979 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
4980 : compiler, trail_surrogates, lead_surrogates, on_success, true,
4981 95 : default_flags);
4982 : } else {
4983 : // Reading forward. Forward match the lead surrogate and assert that
4984 : // no trail surrogate follows.
4985 : match = MatchAndNegativeLookaroundInReadDirection(
4986 : compiler, lead_surrogates, trail_surrogates, on_success, false,
4987 1075 : default_flags);
4988 : }
4989 : result->AddAlternative(GuardedAlternative(match));
4990 : }
4991 :
// Adds one alternative to |result| that matches the splitter's trail
// surrogate ranges only where they are not preceded by a lead surrogate.
// Does nothing if the splitter collected no trail surrogate ranges.
4992 2582 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4993 : RegExpNode* on_success,
4994 : UnicodeRangeSplitter* splitter) {
4995 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4996 : ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
4997 3994 : if (trail_surrogates == nullptr) return;
4998 : Zone* zone = compiler->zone();
4999 : // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5000 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5001 1170 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5002 :
5003 : RegExpNode* match;
5004 1170 : if (compiler->read_backward()) {
5005 : // Reading backward. Backward match the trail surrogate and assert that no
5006 : // lead surrogate precedes it.
5007 : match = MatchAndNegativeLookaroundInReadDirection(
5008 : compiler, trail_surrogates, lead_surrogates, on_success, true,
5009 90 : default_flags);
5010 : } else {
5011 : // Reading forward. Assert that reading backward, there is no lead
5012 : // surrogate, and then forward match the trail surrogate.
5013 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5014 : compiler, lead_surrogates, trail_surrogates, on_success, false,
5015 1080 : default_flags);
5016 : }
5017 : result->AddAlternative(GuardedAlternative(match));
5018 : }
5019 :
// Returns a forward-reading text node that consumes any single UTF-16 code
// unit (0 .. String::kMaxUtf16CodeUnit) and then continues at |on_success|.
// Must only be used when the compiler is not reading backward (DCHECKed).
5020 0 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5021 : RegExpNode* on_success) {
5022 : // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5023 : DCHECK(!compiler->read_backward());
5024 : Zone* zone = compiler->zone();
5025 : // Advance any character. If the character happens to be a lead surrogate and
5026 : // we advanced into the middle of a surrogate pair, it will work out, as
5027 : // nothing will match from there. We will have to advance again, consuming
5028 : // the associated trail surrogate.
5029 : ZoneList<CharacterRange>* range = CharacterRange::List(
5030 0 : zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5031 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5032 : return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5033 0 : default_flags);
5034 : }
5035 :
// Replaces the contents of |ranges| in place with its case-insensitive
// closure as computed by ICU, then re-canonicalizes the list. |ranges| must
// already be canonical on entry (DCHECKed). Without V8_INTL_SUPPORT the
// function body is empty and |ranges| is left untouched.
5036 1189 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5037 : #ifdef V8_INTL_SUPPORT
5038 : DCHECK(CharacterRange::IsCanonical(ranges));
5039 :
5040 : // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5041 : // See also https://crbug.com/v8/6727.
5042 : // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5043 : // which we use frequently internally. But large ranges can also easily be
5044 : // created by the user. We might want to have a more general caching mechanism
5045 : // for such ranges.
5046 1728 : if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5047 :
5048 : // Use ICU to compute the case fold closure over the ranges.
5049 2378 : icu::UnicodeSet set;
5050 246407 : for (int i = 0; i < ranges->length(); i++) {
5051 122609 : set.add(ranges->at(i).from(), ranges->at(i).to());
5052 : }
// The input has been copied into |set|; the list is refilled from it below.
5053 : ranges->Clear();
5054 1189 : set.closeOver(USET_CASE_INSENSITIVE);
5055 : // Full case mapping maps single characters to multiple characters.
5056 : // Those are represented as strings in the set. Remove them so that
5057 : // we end up with only simple and common case mappings.
5058 1189 : set.removeAllStrings();
5059 37531 : for (int i = 0; i < set.getRangeCount(); i++) {
5060 36342 : ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5061 18171 : zone);
5062 : }
5063 : // No errors, and everything we collected is a range.
5064 1189 : CharacterRange::Canonicalize(ranges);
5065 : #endif // V8_INTL_SUPPORT
5066 : }
5067 :
5068 :
// Compiles this character class into a node that continues at |on_success|.
// When the unicode flag is set, the compiler is not in one-byte mode and the
// class does not contain split surrogates, the ranges are split into BMP,
// surrogate-pair and lone-surrogate alternatives under a ChoiceNode.
// Otherwise a plain TextNode over this class is returned.
5069 176130 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5070 : RegExpNode* on_success) {
5071 : set_.Canonicalize();
5072 : Zone* zone = compiler->zone();
5073 : ZoneList<CharacterRange>* ranges = this->ranges(zone);
5074 176130 : if (NeedsUnicodeCaseEquivalents(flags_)) {
5075 949 : AddUnicodeCaseEquivalents(ranges, zone);
5076 : }
5077 178752 : if (IsUnicode(flags_) && !compiler->one_byte() &&
5078 : !contains_split_surrogate()) {
// Negation is resolved here by materializing the complement range list.
5079 2612 : if (is_negated()) {
5080 : ZoneList<CharacterRange>* negated =
5081 : new (zone) ZoneList<CharacterRange>(2, zone);
5082 140 : CharacterRange::Negate(ranges, negated, zone);
5083 : ranges = negated;
5084 : }
// An empty range list is compiled as a text node over an empty class.
5085 2612 : if (ranges->length() == 0) {
5086 : JSRegExp::Flags default_flags;
5087 : RegExpCharacterClass* fail =
5088 60 : new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5089 60 : return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5090 : }
5091 2582 : if (standard_type() == '*') {
5092 0 : return UnanchoredAdvance(compiler, on_success);
5093 : } else {
// One alternative group per category produced by the splitter.
5094 2582 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5095 2582 : UnicodeRangeSplitter splitter(zone, ranges);
5096 2582 : AddBmpCharacters(compiler, result, on_success, &splitter);
5097 2582 : AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5098 2582 : AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5099 2582 : AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5100 : return result;
5101 : }
5102 : } else {
5103 347036 : return new (zone) TextNode(this, compiler->read_backward(), on_success);
5104 : }
5105 : }
5106 :
5107 :
// Three-way comparator on the first code unit of two atoms: returns -1, 0 or
// 1. Both |*a| and |*b| are converted with AsAtom() and their data is indexed
// at 0, so callers must only pass atoms with at least one character.
5108 146822 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5109 146822 : RegExpAtom* atom1 = (*a)->AsAtom();
5110 146822 : RegExpAtom* atom2 = (*b)->AsAtom();
5111 146822 : uc16 character1 = atom1->data().at(0);
5112 146822 : uc16 character2 = atom2->data().at(0);
5113 146822 : if (character1 < character2) return -1;
5114 129859 : if (character1 > character2) return 1;
5115 17383 : return 0;
5116 : }
5117 :
5118 :
// Returns the canonical form of |c| according to |canonicalize|, or |c|
// itself when the mapping does not yield exactly one character.
5119 : static unibrow::uchar Canonical(
5120 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5121 : unibrow::uchar c) {
5122 : unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5123 101306 : int length = canonicalize->get(c, '\0', chars);
5124 : DCHECK_LE(length, 1);
5125 : unibrow::uchar canonical = c;
5126 101306 : if (length == 1) canonical = chars[0];
5127 : return canonical;
5128 : }
5129 :
5130 :
// Comparator on the first code unit of two atoms that treats characters with
// the same canonical form as equal. Returns 0 for equal, otherwise the signed
// difference of the (possibly canonicalized) characters.
5131 63973 : int CompareFirstCharCaseIndependent(
5132 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5133 : RegExpTree* const* a, RegExpTree* const* b) {
5134 63973 : RegExpAtom* atom1 = (*a)->AsAtom();
5135 63973 : RegExpAtom* atom2 = (*b)->AsAtom();
5136 63973 : unibrow::uchar character1 = atom1->data().at(0);
5137 63973 : unibrow::uchar character2 = atom2->data().at(0);
5138 63973 : if (character1 == character2) return 0;
// Canonicalization is skipped when both characters are below 'a'.
5139 46025 : if (character1 >= 'a' || character2 >= 'a') {
5140 : character1 = Canonical(canonicalize, character1);
5141 : character2 = Canonical(canonicalize, character2);
5142 : }
5143 46025 : return static_cast<int>(character1) - static_cast<int>(character2);
5144 : }
5145 :
5146 :
5147 : // We can stable sort runs of atoms, since the order does not matter if they
5148 : // start with different characters.
5149 : // Returns true if any consecutive atoms were found.
// A run is a maximal sequence of adjacent atom alternatives with identical
// flags; each run is sorted in place by its first character.
5150 9297 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5151 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5152 : int length = alternatives->length();
5153 : bool found_consecutive_atoms = false;
5154 26551 : for (int i = 0; i < length; i++) {
// Skip ahead to the next atom.
5155 62443 : while (i < length) {
5156 35127 : RegExpTree* alternative = alternatives->at(i);
5157 35127 : if (alternative->IsAtom()) break;
5158 26500 : i++;
5159 : }
5160 : // i is length or it is the index of an atom.
5161 9443 : if (i == length) break;
5162 : int first_atom = i;
5163 8627 : JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5164 8627 : i++;
// Extend the run while the following alternatives are atoms with equal flags.
5165 120869 : while (i < length) {
5166 56362 : RegExpTree* alternative = alternatives->at(i);
5167 56362 : if (!alternative->IsAtom()) break;
5168 56121 : if (alternative->AsAtom()->flags() != flags) break;
5169 56121 : i++;
5170 : }
5171 : // Sort atoms to get ones with common prefixes together.
5172 : // This step is more tricky if we are in a case-independent regexp,
5173 : // because it would change /is|I/ to /I|is/, and order matters when
5174 : // the regexp parts don't match only disjoint starting points. To fix
5175 : // this we have a version of CompareFirstChar that uses case-
5176 : // independent character classes for comparison.
5177 : DCHECK_LT(first_atom, alternatives->length());
5178 : DCHECK_LE(i, alternatives->length());
5179 : DCHECK_LE(first_atom, i);
5180 8627 : if (IgnoreCase(flags)) {
5181 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5182 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5183 : auto compare_closure =
5184 63973 : [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5185 : return CompareFirstCharCaseIndependent(canonicalize, a, b);
5186 63973 : };
5187 474 : alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5188 : } else {
5189 8153 : alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5190 : }
// A run of two or more atoms counts as "consecutive atoms".
5191 8627 : if (i - first_atom > 1) found_consecutive_atoms = true;
5192 : }
5193 9297 : return found_consecutive_atoms;
5194 : }
5195 :
5196 :
5197 : // Optimizes ab|ac|az to a(?:b|c|z).
// Rewrites the alternatives list in place: runs of three or more adjacent
// atoms with equal flags and a common first character (compared
// case-independently when the ignore-case flag is set) are replaced by a
// single alternative consisting of the common prefix followed by a
// disjunction of the remaining suffixes. Everything else is copied through
// unchanged, and the list is truncated to the new length.
5198 8370 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5199 : Zone* zone = compiler->zone();
5200 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5201 : int length = alternatives->length();
5202 :
// write_posn trails i: entries are compacted towards the front of the list.
5203 : int write_posn = 0;
5204 : int i = 0;
5205 73522 : while (i < length) {
5206 65152 : RegExpTree* alternative = alternatives->at(i);
5207 65152 : if (!alternative->IsAtom()) {
5208 18582 : alternatives->at(write_posn++) = alternatives->at(i);
5209 9291 : i++;
5210 : continue;
5211 : }
5212 55861 : RegExpAtom* const atom = alternative->AsAtom();
5213 : JSRegExp::Flags flags = atom->flags();
5214 55861 : unibrow::uchar common_prefix = atom->data().at(0);
5215 : int first_with_prefix = i;
// Upper bound on the shared prefix: the length of the shortest atom in the run.
5216 : int prefix_length = atom->length();
5217 55861 : i++;
5218 73121 : while (i < length) {
5219 56221 : alternative = alternatives->at(i);
5220 56221 : if (!alternative->IsAtom()) break;
5221 56121 : RegExpAtom* const atom = alternative->AsAtom();
5222 56121 : if (atom->flags() != flags) break;
5223 56121 : unibrow::uchar new_prefix = atom->data().at(0);
5224 56121 : if (new_prefix != common_prefix) {
5225 47695 : if (!IgnoreCase(flags)) break;
5226 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5227 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5228 : new_prefix = Canonical(canonicalize, new_prefix);
5229 : common_prefix = Canonical(canonicalize, common_prefix);
5230 4628 : if (new_prefix != common_prefix) break;
5231 : }
5232 : prefix_length = Min(prefix_length, atom->length());
5233 8630 : i++;
5234 : }
// Only runs of at least three atoms are factored.
5235 55861 : if (i > first_with_prefix + 2) {
5236 : // Found worthwhile run of alternatives with common prefix of at least one
5237 : // character. The sorting function above did not sort on more than one
5238 : // character for reasons of correctness, but there may still be a longer
5239 : // common prefix if the terms were similar or presorted in the input.
5240 : // Find out how long the common prefix is.
5241 268 : int run_length = i - first_with_prefix;
5242 268 : RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
// Compare every later atom against the first one, shrinking prefix_length
// to the first position at which they differ.
5243 742 : for (int j = 1; j < run_length && prefix_length > 1; j++) {
5244 : RegExpAtom* old_atom =
5245 474 : alternatives->at(j + first_with_prefix)->AsAtom();
5246 477 : for (int k = 1; k < prefix_length; k++) {
5247 711 : if (atom->data().at(k) != old_atom->data().at(k)) {
5248 : prefix_length = k;
5249 : break;
5250 : }
5251 : }
5252 : }
5253 : RegExpAtom* prefix = new (zone)
5254 268 : RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5255 : ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5256 268 : pair->Add(prefix, zone);
5257 : ZoneList<RegExpTree*>* suffixes =
5258 : new (zone) ZoneList<RegExpTree*>(run_length, zone);
5259 17600 : for (int j = 0; j < run_length; j++) {
5260 : RegExpAtom* old_atom =
5261 17332 : alternatives->at(j + first_with_prefix)->AsAtom();
5262 : int len = old_atom->length();
// An atom that equals the prefix leaves an empty suffix.
5263 8666 : if (len == prefix_length) {
5264 302 : suffixes->Add(new (zone) RegExpEmpty(), zone);
5265 : } else {
5266 : RegExpTree* suffix = new (zone) RegExpAtom(
5267 8515 : old_atom->data().SubVector(prefix_length, old_atom->length()),
5268 8515 : flags);
5269 8515 : suffixes->Add(suffix, zone);
5270 : }
5271 : }
5272 268 : pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5273 536 : alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5274 : } else {
5275 : // Just copy any non-worthwhile alternatives.
5276 167243 : for (int j = first_with_prefix; j < i; j++) {
5277 111650 : alternatives->at(write_posn++) = alternatives->at(j);
5278 : }
5279 : }
5280 : }
5281 : alternatives->Rewind(write_posn); // Trim end of array.
5282 8370 : }
5283 :
5284 :
5285 : // Optimizes b|c|z to [bcz].
// Rewrites the alternatives list in place: runs of two or more adjacent
// single-character atoms with identical flags are replaced by one character
// class containing those characters. Everything else is copied through
// unchanged, and the list is truncated to the new length.
5286 9297 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
5287 : RegExpCompiler* compiler) {
5288 : Zone* zone = compiler->zone();
5289 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5290 : int length = alternatives->length();
5291 :
// write_posn trails i: entries are compacted towards the front of the list.
5292 : int write_posn = 0;
5293 : int i = 0;
5294 83951 : while (i < length) {
5295 74654 : RegExpTree* alternative = alternatives->at(i);
5296 74654 : if (!alternative->IsAtom()) {
5297 54018 : alternatives->at(write_posn++) = alternatives->at(i);
5298 27009 : i++;
5299 27009 : continue;
5300 : }
5301 47645 : RegExpAtom* const atom = alternative->AsAtom();
5302 47645 : if (atom->length() != 1) {
5303 78592 : alternatives->at(write_posn++) = alternatives->at(i);
5304 39296 : i++;
5305 39296 : continue;
5306 : }
5307 : JSRegExp::Flags flags = atom->flags();
5308 : DCHECK_IMPLIES(IsUnicode(flags),
5309 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
// Tracks whether any character in the run is a trail surrogate; used below
// to flag the resulting class in unicode mode.
5310 : bool contains_trail_surrogate =
5311 8349 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5312 : int first_in_run = i;
5313 8349 : i++;
5314 : // Find a run of single-character atom alternatives that have identical
5315 : // flags (case independence and unicode-ness).
5316 25223 : while (i < length) {
5317 16454 : alternative = alternatives->at(i);
5318 16454 : if (!alternative->IsAtom()) break;
5319 16223 : RegExpAtom* const atom = alternative->AsAtom();
5320 16223 : if (atom->length() != 1) break;
5321 8437 : if (atom->flags() != flags) break;
5322 : DCHECK_IMPLIES(IsUnicode(flags),
5323 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5324 : contains_trail_surrogate |=
5325 16874 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5326 8437 : i++;
5327 : }
5328 8349 : if (i > first_in_run + 1) {
5329 : // Found non-trivial run of single-character alternatives.
5330 271 : int run_length = i - first_in_run;
5331 : ZoneList<CharacterRange>* ranges =
5332 : new (zone) ZoneList<CharacterRange>(2, zone);
5333 17687 : for (int j = 0; j < run_length; j++) {
5334 17416 : RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5335 : DCHECK_EQ(old_atom->length(), 1);
5336 8708 : ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5337 : }
5338 : RegExpCharacterClass::CharacterClassFlags character_class_flags;
5339 271 : if (IsUnicode(flags) && contains_trail_surrogate) {
5340 : character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5341 : }
5342 271 : alternatives->at(write_posn++) = new (zone)
5343 813 : RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5344 : } else {
5345 : // Just copy any trivial alternatives.
5346 24234 : for (int j = first_in_run; j < i; j++) {
5347 16156 : alternatives->at(write_posn++) = alternatives->at(j);
5348 : }
5349 : }
5350 : }
5351 : alternatives->Rewind(write_posn); // Trim end of array.
5352 9297 : }
5353 :
5354 :
// Compiles the disjunction into a ChoiceNode with one alternative per
// sub-tree, each continuing at |on_success|. Disjunctions with more than two
// alternatives are first simplified in place (atom sorting, common-prefix
// factoring, single-character merging); if that leaves a single alternative,
// that alternative's node is returned directly.
5355 10913 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5356 : RegExpNode* on_success) {
5357 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5358 :
5359 10913 : if (alternatives->length() > 2) {
5360 9297 : bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5361 9297 : if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5362 9297 : FixSingleCharacterDisjunctions(compiler);
5363 9297 : if (alternatives->length() == 1) {
5364 242 : return alternatives->at(0)->ToNode(compiler, on_success);
5365 : }
5366 : }
5367 :
// Re-read the length: the simplification passes above may have shrunk the list.
5368 : int length = alternatives->length();
5369 :
5370 : ChoiceNode* result =
5371 10671 : new(compiler->zone()) ChoiceNode(length, compiler->zone());
5372 165953 : for (int i = 0; i < length; i++) {
5373 : GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5374 77641 : on_success));
5375 : result->AddAlternative(alternative);
5376 : }
5377 : return result;
5378 : }
5379 :
5380 :
// Compiles this quantifier by forwarding its own min, max, greediness and
// body to the multi-argument ToNode overload.
5381 926280 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5382 : RegExpNode* on_success) {
5383 926280 : return ToNode(min(),
5384 : max(),
5385 : is_greedy(),
5386 : body(),
5387 : compiler,
5388 926280 : on_success);
5389 : }
5390 :
5391 :
5392 : // Scoped object to keep track of how much we unroll quantifier loops in the
5393 : // regexp graph generator.
// On construction it multiplies the compiler's current expansion factor by
// |factor|; on destruction it restores the value saved at construction.
// ok_to_expand() reports whether the resulting factor stays within
// kMaxExpansionFactor.
5394 : class RegExpExpansionLimiter {
5395 : public:
5396 : static const int kMaxExpansionFactor = 6;
5397 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5398 : : compiler_(compiler),
5399 : saved_expansion_factor_(compiler->current_expansion_factor()),
5400 62408 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5401 : DCHECK_LT(0, factor);
// If the limit was already exceeded, the compiler's factor is left untouched.
5402 71951 : if (ok_to_expand_) {
5403 71951 : if (factor > kMaxExpansionFactor) {
5404 : // Avoid integer overflow of the current expansion factor.
5405 : ok_to_expand_ = false;
5406 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5407 : } else {
5408 71823 : int new_factor = saved_expansion_factor_ * factor;
5409 71823 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5410 : compiler->set_current_expansion_factor(new_factor);
5411 : }
5412 : }
5413 : }
5414 :
// Restores the expansion factor that was current when this object was created.
5415 : ~RegExpExpansionLimiter() {
5416 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
5417 : }
5418 :
5419 : bool ok_to_expand() { return ok_to_expand_; }
5420 :
5421 : private:
5422 : RegExpCompiler* compiler_;
5423 : int saved_expansion_factor_;
5424 : bool ok_to_expand_;
5425 :
5426 : DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5427 : };
5428 :
5429 :
// Builds the node graph for |body| repeated between |min| and |max| times
// (|max| may be kInfinity), continuing at |on_success|.
// - Returns |on_success| directly when |max| is 0.
// - When optimizing, and the body cannot match empty and has no captures,
//   small counts are unrolled into a chain of body nodes / two-way choices.
// - Otherwise a LoopChoiceNode is built, with a counter register when a
//   lower or finite upper bound has to be enforced.
// |not_at_start| is propagated to the generated choice nodes unless the
// compiler is reading backward.
5430 1013319 : RegExpNode* RegExpQuantifier::ToNode(int min,
5431 : int max,
5432 : bool is_greedy,
5433 : RegExpTree* body,
5434 : RegExpCompiler* compiler,
5435 : RegExpNode* on_success,
5436 : bool not_at_start) {
5437 : // x{f, t} becomes this:
5438 : //
5439 : // (r++)<-.
5440 : // | `
5441 : // | (x)
5442 : // v ^
5443 : // (r=0)-->(?)---/ [if r < t]
5444 : // |
5445 : // [if r >= f] \----> ...
5446 : //
5447 :
5448 : // 15.10.2.5 RepeatMatcher algorithm.
5449 : // The parser has already eliminated the case where max is 0. In the case
5450 : // where max_match is zero the parser has removed the quantifier if min was
5451 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5452 :
5453 : // If we know that we cannot match zero length then things are a little
5454 : // simpler since we don't need to make the special zero length match check
5455 : // from step 2.1. If the min and max are small we can unroll a little in
5456 : // this case.
5457 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5458 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5459 1013319 : if (max == 0) return on_success; // This can happen due to recursion.
5460 1012964 : bool body_can_be_empty = (body->min_match() == 0);
5461 : int body_start_reg = RegExpCompiler::kNoRegister;
5462 1012964 : Interval capture_registers = body->CaptureRegisters();
5463 : bool needs_capture_clearing = !capture_registers.is_empty();
5464 : Zone* zone = compiler->zone();
5465 :
5466 1012964 : if (body_can_be_empty) {
// Register used below to remember where the body started, for the
// empty-match check.
5467 : body_start_reg = compiler->AllocateRegister();
5468 1012427 : } else if (compiler->optimize() && !needs_capture_clearing) {
5469 : // Only unroll if there are no captures and the body can't be
5470 : // empty.
5471 : {
// The limiter is scoped to this block, so the expansion factor is restored
// before the optional-match unrolling below is attempted.
5472 : RegExpExpansionLimiter limiter(
5473 62408 : compiler, min + ((max != min) ? 1 : 0));
5474 62408 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5475 4316 : int new_max = (max == kInfinity) ? max : max - min;
5476 : // Recurse once to get the loop or optional matches after the fixed
5477 : // ones.
5478 4316 : RegExpNode* answer = ToNode(
5479 4316 : 0, new_max, is_greedy, body, compiler, on_success, true);
5480 : // Unroll the forced matches from 0 to min. This can cause chains of
5481 : // TextNodes (which the parser does not generate). These should be
5482 : // combined if it turns out they hinder good code generation.
5483 14348 : for (int i = 0; i < min; i++) {
5484 5016 : answer = body->ToNode(compiler, answer);
5485 : }
5486 : return answer;
5487 : }
5488 : }
5489 58092 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
5490 : DCHECK_LT(0, max); // Due to the 'if' above.
5491 : RegExpExpansionLimiter limiter(compiler, max);
5492 9543 : if (limiter.ok_to_expand()) {
5493 : // Unroll the optional matches up to max.
5494 : RegExpNode* answer = on_success;
5495 28040 : for (int i = 0; i < max; i++) {
5496 9379 : ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
// Greedy tries the body first; non-greedy tries skipping to on_success first.
5497 9379 : if (is_greedy) {
5498 9233 : alternation->AddAlternative(
5499 9233 : GuardedAlternative(body->ToNode(compiler, answer)));
5500 : alternation->AddAlternative(GuardedAlternative(on_success));
5501 : } else {
5502 : alternation->AddAlternative(GuardedAlternative(on_success));
5503 146 : alternation->AddAlternative(
5504 146 : GuardedAlternative(body->ToNode(compiler, answer)));
5505 : }
5506 : answer = alternation;
5507 9379 : if (not_at_start && !compiler->read_backward()) {
5508 : alternation->set_not_at_start();
5509 : }
5510 : }
5511 : return answer;
5512 : }
5513 : }
5514 : }
// General case: build an explicit loop around the body.
5515 999366 : bool has_min = min > 0;
5516 999366 : bool has_max = max < RegExpTree::kInfinity;
5517 999366 : bool needs_counter = has_min || has_max;
5518 : int reg_ctr = needs_counter
5519 : ? compiler->AllocateRegister()
5520 999366 : : RegExpCompiler::kNoRegister;
5521 : LoopChoiceNode* center = new (zone)
5522 999366 : LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5523 999366 : if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
// After each body match control returns to the loop center, incrementing
// the counter first when one is in use.
5524 : RegExpNode* loop_return = needs_counter
5525 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5526 999366 : : static_cast<RegExpNode*>(center);
5527 999366 : if (body_can_be_empty) {
5528 : // If the body can be empty we need to check if it was and then
5529 : // backtrack.
5530 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5531 : reg_ctr,
5532 : min,
5533 537 : loop_return);
5534 : }
5535 999366 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
5536 999366 : if (body_can_be_empty) {
5537 : // If the body can be empty we need to store the start position
5538 : // so we can bail out if it was empty.
5539 537 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5540 : }
5541 999366 : if (needs_capture_clearing) {
5542 : // Before entering the body of this loop we need to clear captures.
5543 2376 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5544 : }
// Guards: the body may be entered only while counter < max, and the loop
// may be left only once counter >= min.
5545 : GuardedAlternative body_alt(body_node);
5546 999366 : if (has_max) {
5547 : Guard* body_guard =
5548 : new(zone) Guard(reg_ctr, Guard::LT, max);
5549 902614 : body_alt.AddGuard(body_guard, zone);
5550 : }
5551 : GuardedAlternative rest_alt(on_success);
5552 999366 : if (has_min) {
5553 : Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5554 1322 : rest_alt.AddGuard(rest_guard, zone);
5555 : }
// Alternative order encodes greediness: greedy tries the loop body first.
5556 999366 : if (is_greedy) {
5557 : center->AddLoopAlternative(body_alt);
5558 : center->AddContinueAlternative(rest_alt);
5559 : } else {
5560 : center->AddContinueAlternative(rest_alt);
5561 : center->AddLoopAlternative(body_alt);
5562 : }
// With a counter, the loop is entered through a node that resets it to 0.
5563 999366 : if (needs_counter) {
5564 903359 : return ActionNode::SetRegister(reg_ctr, 0, center);
5565 : } else {
5566 : return center;
5567 : }
5568 : }
5569 :
5570 : namespace {
5571 : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5572 : // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
// Returns a two-way ChoiceNode, one alternative per possible class (word /
// non-word) of the character to the left. Each alternative chains a lookahead
// and a lookbehind over the \w ranges and continues at |on_success|.
// Only valid for flags that need unicode case equivalents (DCHECKed).
5573 80 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5574 : RegExpNode* on_success,
5575 : RegExpAssertion::AssertionType type,
5576 : JSRegExp::Flags flags) {
5577 : DCHECK(NeedsUnicodeCaseEquivalents(flags));
5578 : Zone* zone = compiler->zone();
5579 : ZoneList<CharacterRange>* word_range =
5580 : new (zone) ZoneList<CharacterRange>(2, zone);
5581 80 : CharacterRange::AddClassEscape('w', word_range, true, zone);
5582 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5583 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5584 80 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5585 : // Add two choices. The (non-)boundary could start with a word or
5586 : // a non-word-character.
5587 400 : for (int i = 0; i < 2; i++) {
5588 160 : bool lookbehind_for_word = i == 0;
// For BOUNDARY the two sides must differ; for NON_BOUNDARY they must agree.
5589 : bool lookahead_for_word =
5590 160 : (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5591 : // Look to the left.
5592 : RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5593 160 : stack_register, position_register);
5594 : RegExpNode* backward = TextNode::CreateForCharacterRanges(
5595 160 : zone, word_range, true, lookbehind.on_match_success(), flags);
5596 : // Look to the right.
5597 : RegExpLookaround::Builder lookahead(lookahead_for_word,
5598 : lookbehind.ForMatch(backward),
5599 160 : stack_register, position_register);
5600 : RegExpNode* forward = TextNode::CreateForCharacterRanges(
5601 160 : zone, word_range, false, lookahead.on_match_success(), flags);
5602 160 : result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5603 : }
5604 80 : return result;
5605 : }
5606 : } // anonymous namespace
5607 :
5608 5506 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5609 : RegExpNode* on_success) {
5610 : NodeInfo info;
5611 : Zone* zone = compiler->zone();
5612 :
5613 5506 : switch (assertion_type()) {
5614 : case START_OF_LINE:
5615 129 : return AssertionNode::AfterNewline(on_success);
5616 : case START_OF_INPUT:
5617 3041 : return AssertionNode::AtStart(on_success);
5618 : case BOUNDARY:
5619 : return NeedsUnicodeCaseEquivalents(flags_)
5620 : ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5621 : flags_)
5622 176 : : AssertionNode::AtBoundary(on_success);
5623 : case NON_BOUNDARY:
5624 : return NeedsUnicodeCaseEquivalents(flags_)
5625 : ? BoundaryAssertionAsLookaround(compiler, on_success,
5626 : NON_BOUNDARY, flags_)
5627 154 : : AssertionNode::AtNonBoundary(on_success);
5628 : case END_OF_INPUT:
5629 1907 : return AssertionNode::AtEnd(on_success);
5630 : case END_OF_LINE: {
5631 : // Compile $ in multiline regexps as an alternation with a positive
5632 : // lookahead in one side and an end-of-input on the other side.
5633 : // We need two registers for the lookahead.
5634 : int stack_pointer_register = compiler->AllocateRegister();
5635 : int position_register = compiler->AllocateRegister();
5636 : // The ChoiceNode to distinguish between a newline and end-of-input.
5637 99 : ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5638 : // Create a newline atom.
5639 : ZoneList<CharacterRange>* newline_ranges =
5640 : new(zone) ZoneList<CharacterRange>(3, zone);
5641 99 : CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5642 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5643 : RegExpCharacterClass* newline_atom =
5644 : new (zone) RegExpCharacterClass('n', default_flags);
5645 : TextNode* newline_matcher = new (zone) TextNode(
5646 : newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5647 : stack_pointer_register, position_register,
5648 : 0, // No captures inside.
5649 : -1, // Ignored if no captures.
5650 198 : on_success));
5651 : // Create an end-of-input matcher.
5652 : RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5653 : stack_pointer_register,
5654 : position_register,
5655 99 : newline_matcher);
5656 : // Add the two alternatives to the ChoiceNode.
5657 : GuardedAlternative eol_alternative(end_of_line);
5658 : result->AddAlternative(eol_alternative);
5659 99 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5660 : result->AddAlternative(end_alternative);
5661 : return result;
5662 : }
5663 : default:
5664 0 : UNREACHABLE();
5665 : }
5666 : return on_success;
5667 : }
5668 :
5669 :
5670 2380 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5671 : RegExpNode* on_success) {
5672 : return new (compiler->zone())
5673 : BackReferenceNode(RegExpCapture::StartRegister(index()),
5674 : RegExpCapture::EndRegister(index()), flags_,
5675 4760 : compiler->read_backward(), on_success);
5676 : }
5677 :
5678 :
5679 1026 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5680 : RegExpNode* on_success) {
5681 1026 : return on_success;
5682 : }
5683 :
5684 :
5685 4368 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5686 : int stack_pointer_register,
5687 : int position_register,
5688 : int capture_register_count,
5689 : int capture_register_start)
5690 : : is_positive_(is_positive),
5691 : on_success_(on_success),
5692 : stack_pointer_register_(stack_pointer_register),
5693 4368 : position_register_(position_register) {
5694 4368 : if (is_positive_) {
5695 1556 : on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5696 : stack_pointer_register, position_register, capture_register_count,
5697 1556 : capture_register_start, on_success_);
5698 : } else {
5699 : Zone* zone = on_success_->zone();
5700 : on_match_success_ = new (zone) NegativeSubmatchSuccess(
5701 : stack_pointer_register, position_register, capture_register_count,
5702 2812 : capture_register_start, zone);
5703 : }
5704 4368 : }
5705 :
5706 :
5707 4368 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5708 4368 : if (is_positive_) {
5709 1556 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5710 1556 : position_register_, match);
5711 : } else {
5712 2812 : Zone* zone = on_success_->zone();
5713 : // We use a ChoiceNode to represent the negative lookaround. The first
5714 : // alternative is the negative match. On success, the end node backtracks.
5715 : // On failure, the second alternative is tried and leads to success.
5716 : // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5717 : // first exit when calculating quick checks.
5718 : ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5719 2812 : GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5720 2812 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5721 2812 : position_register_, choice_node);
5722 : }
5723 : }
5724 :
5725 :
5726 1668 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5727 : RegExpNode* on_success) {
5728 : int stack_pointer_register = compiler->AllocateRegister();
5729 : int position_register = compiler->AllocateRegister();
5730 :
5731 : const int registers_per_capture = 2;
5732 : const int register_of_first_capture = 2;
5733 1668 : int register_count = capture_count_ * registers_per_capture;
5734 : int register_start =
5735 1668 : register_of_first_capture + capture_from_ * registers_per_capture;
5736 :
5737 : RegExpNode* result;
5738 : bool was_reading_backward = compiler->read_backward();
5739 1668 : compiler->set_read_backward(type() == LOOKBEHIND);
5740 : Builder builder(is_positive(), on_success, stack_pointer_register,
5741 1668 : position_register, register_count, register_start);
5742 1668 : RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5743 1668 : result = builder.ForMatch(match);
5744 : compiler->set_read_backward(was_reading_backward);
5745 1668 : return result;
5746 : }
5747 :
5748 :
5749 27157 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5750 : RegExpNode* on_success) {
5751 27157 : return ToNode(body(), index(), compiler, on_success);
5752 : }
5753 :
5754 :
5755 112823 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5756 : int index,
5757 : RegExpCompiler* compiler,
5758 : RegExpNode* on_success) {
5759 : DCHECK_NOT_NULL(body);
5760 : int start_reg = RegExpCapture::StartRegister(index);
5761 : int end_reg = RegExpCapture::EndRegister(index);
5762 112823 : if (compiler->read_backward()) std::swap(start_reg, end_reg);
5763 112823 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5764 112823 : RegExpNode* body_node = body->ToNode(compiler, store_end);
5765 112823 : return ActionNode::StorePosition(start_reg, true, body_node);
5766 : }
5767 :
5768 :
5769 21323 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5770 : RegExpNode* on_success) {
5771 : ZoneList<RegExpTree*>* children = nodes();
5772 : RegExpNode* current = on_success;
5773 21323 : if (compiler->read_backward()) {
5774 1905 : for (int i = 0; i < children->length(); i++) {
5775 790 : current = children->at(i)->ToNode(compiler, current);
5776 : }
5777 : } else {
5778 997716 : for (int i = children->length() - 1; i >= 0; i--) {
5779 976718 : current = children->at(i)->ToNode(compiler, current);
5780 : }
5781 : }
5782 21323 : return current;
5783 : }
5784 :
5785 :
5786 7409 : static void AddClass(const int* elmv,
5787 : int elmc,
5788 : ZoneList<CharacterRange>* ranges,
5789 : Zone* zone) {
5790 7409 : elmc--;
5791 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5792 71721 : for (int i = 0; i < elmc; i += 2) {
5793 : DCHECK(elmv[i] < elmv[i + 1]);
5794 32156 : ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5795 : }
5796 7409 : }
5797 :
5798 :
5799 20035 : static void AddClassNegated(const int *elmv,
5800 : int elmc,
5801 : ZoneList<CharacterRange>* ranges,
5802 : Zone* zone) {
5803 20035 : elmc--;
5804 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5805 : DCHECK_NE(0x0000, elmv[0]);
5806 : DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5807 : uc16 last = 0x0000;
5808 150595 : for (int i = 0; i < elmc; i += 2) {
5809 : DCHECK(last <= elmv[i] - 1);
5810 : DCHECK(elmv[i] < elmv[i + 1]);
5811 65280 : ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5812 65280 : last = elmv[i + 1];
5813 : }
5814 20035 : ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5815 20035 : }
5816 :
5817 110491 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5818 : bool add_unicode_case_equivalents,
5819 : Zone* zone) {
5820 110491 : if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5821 : // See #sec-runtime-semantics-wordcharacters-abstract-operation
5822 : // In case of unicode and ignore_case, we need to create the closure over
5823 : // case equivalent characters before negating.
5824 : ZoneList<CharacterRange>* new_ranges =
5825 : new (zone) ZoneList<CharacterRange>(2, zone);
5826 240 : AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5827 240 : AddUnicodeCaseEquivalents(new_ranges, zone);
5828 240 : if (type == 'W') {
5829 : ZoneList<CharacterRange>* negated =
5830 : new (zone) ZoneList<CharacterRange>(2, zone);
5831 90 : CharacterRange::Negate(new_ranges, negated, zone);
5832 : new_ranges = negated;
5833 : }
5834 : ranges->AddAll(*new_ranges, zone);
5835 : return;
5836 : }
5837 110251 : AddClassEscape(type, ranges, zone);
5838 : }
5839 :
5840 110286 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5841 : Zone* zone) {
5842 110286 : switch (type) {
5843 : case 's':
5844 1697 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5845 1697 : break;
5846 : case 'S':
5847 772 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5848 772 : break;
5849 : case 'w':
5850 2786 : AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5851 2786 : break;
5852 : case 'W':
5853 307 : AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5854 307 : break;
5855 : case 'd':
5856 2488 : AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5857 2488 : break;
5858 : case 'D':
5859 268 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5860 268 : break;
5861 : case '.':
5862 : AddClassNegated(kLineTerminatorRanges,
5863 : kLineTerminatorRangeCount,
5864 : ranges,
5865 18688 : zone);
5866 18688 : break;
5867 : // This is not a character range as defined by the spec but a
5868 : // convenient shorthand for a character class that matches any
5869 : // character.
5870 : case '*':
5871 83082 : ranges->Add(CharacterRange::Everything(), zone);
5872 83082 : break;
5873 : // This is the set of characters matched by the $ and ^ symbols
5874 : // in multiline mode.
5875 : case 'n':
5876 : AddClass(kLineTerminatorRanges,
5877 : kLineTerminatorRangeCount,
5878 : ranges,
5879 198 : zone);
5880 198 : break;
5881 : default:
5882 0 : UNREACHABLE();
5883 : }
5884 110286 : }
5885 :
5886 :
5887 0 : Vector<const int> CharacterRange::GetWordBounds() {
5888 0 : return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5889 : }
5890 :
5891 : // static
// Appends to |ranges| the case-equivalent characters of the ranges it already
// contains, using the isolate's jsregexp_uncanonicalize() and
// jsregexp_canonrange() mappings.
//
// |ranges| is canonicalized first. Only the ranges present at that point are
// expanded (the loop bound is captured before anything is appended). The
// appended ranges are not re-canonicalized by this function.
//
// Only the part of each range at or below String::kMaxUtf16CodeUnit is
// considered, and ranges lying entirely inside the surrogate block are
// skipped. When |is_one_byte| is set and the range has no Latin1 equivalents
// (per RangeContainsLatin1Equivalents), expansion is limited to characters up
// to String::kMaxOneByteCharCode.
5892 66955 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5893 : ZoneList<CharacterRange>* ranges,
5894 : bool is_one_byte) {
5895 66955 : CharacterRange::Canonicalize(ranges);
// Snapshot the length: ranges appended below must not be expanded again.
5896 : int range_count = ranges->length();
5897 210861 : for (int i = 0; i < range_count; i++) {
5898 71953 : CharacterRange range = ranges->at(i);
5899 : uc32 bottom = range.from();
5900 74100 : if (bottom > String::kMaxUtf16CodeUnit) continue;
5901 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5902 : // Nothing to be done for surrogates.
5903 71953 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5904 69906 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5905 1343 : if (bottom > String::kMaxOneByteCharCode) continue;
5906 1243 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5907 : }
5908 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5909 69806 : if (top == bottom) {
5910 : // If this is a singleton we just expand the one character.
5911 9216 : int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
// Note: this loop index shadows the outer loop's i.
5912 10016 : for (int i = 0; i < length; i++) {
5913 2704 : uc32 chr = chars[i];
5914 2704 : if (chr != bottom) {
5915 1382 : ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5916 : }
5917 : }
5918 : } else {
5919 : // If this is a range we expand the characters block by block, expanding
5920 : // contiguous subranges (blocks) one at a time. The approach is as
5921 : // follows. For a given start character we look up the remainder of the
5922 : // block that contains it (represented by the end point), for instance we
5923 : // find 'z' if the character is 'c'. A block is characterized by the
5924 : // property that all characters uncanonicalize in the same way, except
5925 : // that each entry in the result is incremented by the distance from the
5926 : // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5927 : // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5928 : // we've found the end point we look up its uncanonicalization and
5929 : // produce a range for each element. For instance for [c-f] we look up
5930 : // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5931 : // it is not already contained in the input, so [c-f] will be skipped but
5932 : // [C-F] will be added. If this range is not completely contained in a
5933 : // block we do this for all the blocks covered by the range (handling
5934 : // characters that are not in a block as a "singleton block").
5935 : unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5936 : int pos = bottom;
5937 15528016 : while (pos <= top) {
5938 : int length =
5939 15462818 : isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5940 : uc32 block_end;
5941 7731409 : if (length == 0) {
5942 : block_end = pos;
5943 : } else {
5944 : DCHECK_EQ(1, length);
5945 6349 : block_end = equivalents[0];
5946 : }
// Clamp the block to the part that lies inside [bottom, top].
5947 7731409 : int end = (block_end > top) ? top : block_end;
5948 7731409 : length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5949 7731409 : equivalents);
5950 8398373 : for (int i = 0; i < length; i++) {
5951 333482 : uc32 c = equivalents[i];
// Shift the block end's equivalent back by the distances to pos and end.
5952 333482 : uc32 range_from = c - (block_end - pos);
5953 333482 : uc32 range_to = c - (block_end - end);
5954 333482 : if (!(bottom <= range_from && range_to <= top)) {
5955 6669 : ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5956 : }
5957 : }
5958 7731409 : pos = end + 1;
5959 : }
5960 : }
5961 : }
5962 66955 : }
5963 :
5964 :
5965 10 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5966 : DCHECK_NOT_NULL(ranges);
5967 : int n = ranges->length();
5968 10 : if (n <= 1) return true;
5969 : int max = ranges->at(0).to();
5970 590 : for (int i = 1; i < n; i++) {
5971 290 : CharacterRange next_range = ranges->at(i);
5972 290 : if (next_range.from() <= max + 1) return false;
5973 : max = next_range.to();
5974 : }
5975 : return true;
5976 : }
5977 :
5978 :
5979 1900724 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5980 1900724 : if (ranges_ == nullptr) {
5981 82971 : ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5982 82971 : CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5983 : }
5984 1900724 : return ranges_;
5985 : }
5986 :
5987 :
5988 : // Move a number of elements in a zonelist to another position
5989 : // in the same list. Handles overlapping source and target areas.
5990 92250 : static void MoveRanges(ZoneList<CharacterRange>* list,
5991 : int from,
5992 : int to,
5993 : int count) {
5994 : // Ranges are potentially overlapping.
5995 92250 : if (from < to) {
5996 10119845 : for (int i = count - 1; i >= 0; i--) {
5997 30118101 : list->at(to + i) = list->at(from + i);
5998 : }
5999 : } else {
6000 7210016 : for (int i = 0; i < count; i++) {
6001 10797366 : list->at(to + i) = list->at(from + i);
6002 : }
6003 : }
6004 92250 : }
6005 :
6006 :
// Merges |insert| into the canonical prefix list[0..count[ in place and
// returns the new length of the canonical prefix. Three cases are handled:
// no existing range is touched (shift the tail up by one and insert),
// exactly one range is touched (widen it), or several ranges are touched
// (collapse them into one and shift the tail down).
6007 173176 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6008 : int count,
6009 : CharacterRange insert) {
6010 : // Inserts a range into list[0..count[, which must be sorted
6011 : // by from value and non-overlapping and non-adjacent, using at most
6012 : // list[0..count] for the result. Returns the number of resulting
6013 : // canonicalized ranges. Inserting a range may collapse existing ranges into
6014 : // fewer ranges, so the return value can be anything in the range 1..count+1.
6015 : uc32 from = insert.from();
6016 : uc32 to = insert.to();
6017 : int start_pos = 0;
6018 : int end_pos = count;
// Scan from the back: ranges entirely above the insert lower end_pos; the
// first range entirely below it fixes start_pos and ends the scan.
6019 18092687 : for (int i = count - 1; i >= 0; i--) {
6020 18009142 : CharacterRange current = list->at(i);
6021 18009142 : if (current.from() > to + 1) {
6022 : end_pos = i;
6023 142194 : } else if (current.to() + 1 < from) {
6024 89631 : start_pos = i + 1;
6025 : break;
6026 : }
6027 : }
6028 :
6029 : // Inserted range overlaps, or is adjacent to, ranges at positions
6030 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6031 : // not affected by the insertion.
6032 : // If start_pos == end_pos, the range must be inserted before start_pos.
6033 : // if start_pos < end_pos, the entire range from start_pos to end_pos
6034 : // must be merged with the insert range.
6035 :
6036 173176 : if (start_pos == end_pos) {
6037 : // Insert between existing ranges at position start_pos.
6038 132771 : if (start_pos < count) {
6039 80478 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6040 : }
6041 132771 : list->at(start_pos) = insert;
6042 132771 : return count + 1;
6043 : }
6044 40405 : if (start_pos + 1 == end_pos) {
6045 : // Replace single existing range at position start_pos.
6046 28492 : CharacterRange to_replace = list->at(start_pos);
6047 : int new_from = Min(to_replace.from(), from);
6048 : int new_to = Max(to_replace.to(), to);
6049 28492 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6050 : return count;
6051 : }
6052 : // Replace a number of existing ranges from start_pos to end_pos - 1.
6053 : // Move the remaining ranges down.
6054 :
6055 : int new_from = Min(list->at(start_pos).from(), from);
6056 11913 : int new_to = Max(list->at(end_pos - 1).to(), to);
6057 11913 : if (end_pos < count) {
6058 11772 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6059 : }
6060 11913 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6061 11913 : return count - (end_pos - start_pos) + 1;
6062 : }
6063 :
6064 :
6065 20 : void CharacterSet::Canonicalize() {
6066 : // Special/default classes are always considered canonical. The result
6067 : // of calling ranges() will be sorted.
6068 176150 : if (ranges_ == nullptr) return;
6069 93427 : CharacterRange::Canonicalize(ranges_);
6070 : }
6071 :
6072 :
6073 497441 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6074 497441 : if (character_ranges->length() <= 1) return;
6075 : // Check whether ranges are already canonical (increasing, non-overlapping,
6076 : // non-adjacent).
6077 : int n = character_ranges->length();
6078 : int max = character_ranges->at(0).to();
6079 : int i = 1;
6080 2619592 : while (i < n) {
6081 1286968 : CharacterRange current = character_ranges->at(i);
6082 1286968 : if (current.from() <= max + 1) {
6083 : break;
6084 : }
6085 : max = current.to();
6086 1277854 : i++;
6087 : }
6088 : // Canonical until the i'th range. If that's all of them, we are done.
6089 63884 : if (i == n) return;
6090 :
6091 : // The ranges at index i and forward are not canonicalized. Make them so by
6092 : // doing the equivalent of insertion sort (inserting each into the previous
6093 : // list, in order).
6094 : // Notice that inserting a range can reduce the number of ranges in the
6095 : // result due to combining of adjacent and overlapping ranges.
6096 : int read = i; // Range to insert.
6097 : int num_canonical = i; // Length of canonicalized part of list.
6098 : do {
6099 : num_canonical = InsertRangeInCanonicalList(character_ranges,
6100 : num_canonical,
6101 173176 : character_ranges->at(read));
6102 173176 : read++;
6103 173176 : } while (read < n);
6104 : character_ranges->Rewind(num_canonical);
6105 :
6106 : DCHECK(CharacterRange::IsCanonical(character_ranges));
6107 : }
6108 :
6109 :
6110 230 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6111 : ZoneList<CharacterRange>* negated_ranges,
6112 : Zone* zone) {
6113 : DCHECK(CharacterRange::IsCanonical(ranges));
6114 : DCHECK_EQ(0, negated_ranges->length());
6115 : int range_count = ranges->length();
6116 : uc32 from = 0;
6117 : int i = 0;
6118 230 : if (range_count > 0 && ranges->at(0).from() == 0) {
6119 40 : from = ranges->at(0).to() + 1;
6120 : i = 1;
6121 : }
6122 14730 : while (i < range_count) {
6123 7250 : CharacterRange range = ranges->at(i);
6124 7250 : negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6125 7250 : from = range.to() + 1;
6126 7250 : i++;
6127 : }
6128 230 : if (from < String::kMaxCodePoint) {
6129 360 : negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6130 180 : zone);
6131 : }
6132 230 : }
6133 :
6134 :
6135 : // -------------------------------------------------------------------
6136 : // Splay tree
6137 :
6138 :
6139 236859 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6140 236859 : if (Get(value))
6141 : return this;
6142 236854 : if (successors(zone) != nullptr) {
6143 619610 : for (int i = 0; i < successors(zone)->length(); i++) {
6144 414087 : OutSet* successor = successors(zone)->at(i);
6145 414087 : if (successor->Get(value))
6146 : return successor;
6147 : }
6148 : } else {
6149 5694 : successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6150 : }
6151 33984 : OutSet* result = new(zone) OutSet(first_, remaining_);
6152 16992 : result->Set(value, zone);
6153 16992 : successors(zone)->Add(result, zone);
6154 16992 : return result;
6155 : }
6156 :
6157 :
6158 713949 : void OutSet::Set(unsigned value, Zone *zone) {
6159 713949 : if (value < kFirstLimit) {
6160 389315 : first_ |= (1 << value);
6161 : } else {
6162 324634 : if (remaining_ == nullptr)
6163 84582 : remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6164 889320 : if (remaining_->is_empty() || !remaining_->Contains(value))
6165 323584 : remaining_->Add(value, zone);
6166 : }
6167 713949 : }
6168 :
6169 :
6170 90010 : bool OutSet::Get(unsigned value) const {
6171 30831298 : if (value < kFirstLimit) {
6172 6621150 : return (first_ & (1 << value)) != 0;
6173 24501487 : } else if (remaining_ == nullptr) {
6174 : return false;
6175 : } else {
6176 0 : return remaining_->Contains(value);
6177 : }
6178 : }
6179 :
6180 :
// Out-of-line definition of the kNoKey constant of the dispatch table's
// splay tree configuration, set to unibrow::Utf8::kBadChar.
// NOTE(review): presumably the tree's "no key" sentinel - confirm against
// the splay tree's Config requirements.
6181 : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6182 :
6183 :
// Records |value| for every character in |full_range|.
//
// The table's tree holds entries keyed by their start character, each with a
// [from, to] span and an OutSet. Adding a range splits existing entries that
// only partially overlap it, adds |value| to the out-set of every entry that
// ends up fully covered, and inserts new entries (out-set empty() extended by
// |value|) for the uncovered parts.
6184 88433 : void DispatchTable::AddRange(CharacterRange full_range, int value,
6185 : Zone* zone) {
6186 88433 : CharacterRange current = full_range;
6187 88433 : if (tree()->is_empty()) {
6188 : // If this is the first range we just insert into the table.
6189 : ZoneSplayTree<Config>::Locator loc;
6190 2647 : bool inserted = tree()->Insert(current.from(), &loc);
6191 : DCHECK(inserted);
6192 : USE(inserted);
6193 5294 : loc.set_value(Entry(current.from(), current.to(),
6194 : empty()->Extend(value, zone)));
6195 : return;
6196 : }
6197 : // First see if there is a range to the left of this one that
6198 : // overlaps.
6199 : ZoneSplayTree<Config>::Locator loc;
6200 85786 : if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6201 : Entry* entry = &loc.value();
6202 : // If we've found a range that overlaps with this one, and it
6203 : // starts strictly to the left of this one, we have to fix it
6204 : // because the following code only handles ranges that start on
6205 : // or after the start point of the range we're adding.
6206 84184 : if (entry->from() < current.from() && entry->to() >= current.from()) {
6207 : // Snap the overlapping range in half around the start point of
6208 : // the range we're adding.
6209 : CharacterRange left =
6210 400 : CharacterRange::Range(entry->from(), current.from() - 1);
6211 : CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6212 : // The left part of the overlapping range doesn't overlap.
6213 : // Truncate the whole entry to be just the left part.
6214 : entry->set_to(left.to());
6215 : // The right part is the one that overlaps. We add this part
6216 : // to the map and let the next step deal with merging it with
6217 : // the range we're adding.
// Note: this locator shadows the outer loc declared above.
6218 : ZoneSplayTree<Config>::Locator loc;
6219 400 : bool inserted = tree()->Insert(right.from(), &loc);
6220 : DCHECK(inserted);
6221 : USE(inserted);
6222 : loc.set_value(Entry(right.from(),
6223 : right.to(),
6224 : entry->out_set()));
6225 : }
6226 : }
// Walk the range left to right. Each iteration either handles one
// overlapping entry and advances current's start past it, or inserts the
// rest of the range as a new entry and stops.
6227 166674 : while (current.is_valid()) {
6228 406291 : if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6229 242002 : (loc.value().from() <= current.to()) &&
6230 : (loc.value().to() >= current.from())) {
6231 : Entry* entry = &loc.value();
6232 : // We have overlap. If there is space between the start point of
6233 : // the range we're adding and where the overlapping range starts
6234 : // then we have to add a range covering just that space.
6235 80888 : if (current.from() < entry->from()) {
6236 : ZoneSplayTree<Config>::Locator ins;
6237 73098 : bool inserted = tree()->Insert(current.from(), &ins);
6238 : DCHECK(inserted);
6239 : USE(inserted);
6240 146196 : ins.set_value(Entry(current.from(),
6241 : entry->from() - 1,
6242 : empty()->Extend(value, zone)));
6243 : current.set_from(entry->from());
6244 : }
6245 : DCHECK_EQ(current.from(), entry->from());
6246 : // If the overlapping range extends beyond the one we want to add
6247 : // we have to snap the right part off and add it separately.
6248 80888 : if (entry->to() > current.to()) {
6249 : ZoneSplayTree<Config>::Locator ins;
6250 4430 : bool inserted = tree()->Insert(current.to() + 1, &ins);
6251 : DCHECK(inserted);
6252 : USE(inserted);
6253 : ins.set_value(Entry(current.to() + 1,
6254 : entry->to(),
6255 : entry->out_set()));
6256 : entry->set_to(current.to());
6257 : }
6258 : DCHECK(entry->to() <= current.to());
6259 : // The overlapping range is now completely contained by the range
6260 : // we're adding so we can just update it and move the start point
6261 : // of the range we're adding just past it.
6262 : entry->AddValue(value, zone);
6263 : DCHECK(entry->to() + 1 > current.from());
6264 80888 : current.set_from(entry->to() + 1);
6265 : } else {
6266 : // There is no overlap so we can just add the range
6267 : ZoneSplayTree<Config>::Locator ins;
6268 80226 : bool inserted = tree()->Insert(current.from(), &ins);
6269 : DCHECK(inserted);
6270 : USE(inserted);
6271 160452 : ins.set_value(Entry(current.from(),
6272 : current.to(),
6273 : empty()->Extend(value, zone)));
6274 : break;
6275 : }
6276 : }
6277 : }
6278 :
6279 :
6280 55010 : OutSet* DispatchTable::Get(uc32 value) {
6281 : ZoneSplayTree<Config>::Locator loc;
6282 55010 : if (!tree()->FindGreatestLessThan(value, &loc))
6283 0 : return empty();
6284 : Entry* entry = &loc.value();
6285 55010 : if (value <= entry->to())
6286 38885 : return entry->out_set();
6287 : else
6288 16125 : return empty();
6289 : }
6290 :
6291 :
6292 : // -------------------------------------------------------------------
6293 : // Analysis
6294 :
6295 :
6296 1075682 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
6297 : StackLimitCheck check(isolate());
6298 1075682 : if (check.HasOverflowed()) {
6299 : fail("Stack overflow");
6300 : return;
6301 : }
6302 1075328 : if (that->info()->been_analyzed || that->info()->being_analyzed)
6303 : return;
6304 871323 : that->info()->being_analyzed = true;
6305 871323 : that->Accept(this);
6306 871323 : that->info()->being_analyzed = false;
6307 871323 : that->info()->been_analyzed = true;
6308 : }
6309 :
6310 :
6311 88114 : void Analysis::VisitEnd(EndNode* that) {
6312 : // nothing to do
6313 88114 : }
6314 :
6315 :
6316 315549 : void TextNode::CalculateOffsets() {
6317 : int element_count = elements()->length();
6318 : // Set up the offsets of the elements relative to the start. This is a fixed
6319 : // quantity since a TextNode can only contain fixed-width things.
6320 : int cp_offset = 0;
6321 1069953 : for (int i = 0; i < element_count; i++) {
6322 : TextElement& elm = elements()->at(i);
6323 : elm.set_cp_offset(cp_offset);
6324 377202 : cp_offset += elm.length();
6325 : }
6326 315549 : }
6327 :
6328 :
6329 317597 : void Analysis::VisitText(TextNode* that) {
6330 317597 : that->MakeCaseIndependent(isolate(), is_one_byte_);
6331 317597 : EnsureAnalyzed(that->on_success());
6332 317597 : if (!has_failed()) {
6333 315549 : that->CalculateOffsets();
6334 : }
6335 317597 : }
6336 :
6337 :
6338 286316 : void Analysis::VisitAction(ActionNode* that) {
6339 : RegExpNode* target = that->on_success();
6340 286316 : EnsureAnalyzed(target);
6341 286316 : if (!has_failed()) {
6342 : // If the next node is interested in what it follows then this node
6343 : // has to be interested too so it can pass the information on.
6344 : that->info()->AddFromFollowing(target->info());
6345 : }
6346 286316 : }
6347 :
6348 :
6349 25751 : void Analysis::VisitChoice(ChoiceNode* that) {
6350 : NodeInfo* info = that->info();
6351 293961 : for (int i = 0; i < that->alternatives()->length(); i++) {
6352 : RegExpNode* node = that->alternatives()->at(i).node();
6353 134105 : EnsureAnalyzed(node);
6354 134105 : if (has_failed()) return;
6355 : // Anything the following nodes need to know has to be known by
6356 : // this node also, so it can pass it on.
6357 : info->AddFromFollowing(node->info());
6358 : }
6359 : }
6360 :
6361 :
6362 145800 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6363 : NodeInfo* info = that->info();
6364 633878 : for (int i = 0; i < that->alternatives()->length(); i++) {
6365 : RegExpNode* node = that->alternatives()->at(i).node();
6366 291386 : if (node != that->loop_node()) {
6367 145800 : EnsureAnalyzed(node);
6368 145800 : if (has_failed()) return;
6369 : info->AddFromFollowing(node->info());
6370 : }
6371 : }
6372 : // Check the loop last since it may need the value of this node
6373 : // to get a correct result.
6374 98453 : EnsureAnalyzed(that->loop_node());
6375 98453 : if (!has_failed()) {
6376 : info->AddFromFollowing(that->loop_node()->info());
6377 : }
6378 : }
6379 :
6380 :
6381 2320 : void Analysis::VisitBackReference(BackReferenceNode* that) {
6382 2320 : EnsureAnalyzed(that->on_success());
6383 2320 : }
6384 :
6385 :
6386 5425 : void Analysis::VisitAssertion(AssertionNode* that) {
6387 5425 : EnsureAnalyzed(that->on_success());
6388 5425 : }
6389 :
6390 :
6391 188 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6392 : BoyerMooreLookahead* bm,
6393 : bool not_at_start) {
6394 : // Working out the set of characters that a backreference can match is too
6395 : // hard, so we just say that any character can match.
6396 : bm->SetRest(offset);
6397 : SaveBMInfo(bm, not_at_start, offset);
6398 188 : }
6399 :
6400 :
6401 : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6402 : RegExpMacroAssembler::kTableSize);
6403 :
6404 :
6405 7809 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6406 : BoyerMooreLookahead* bm, bool not_at_start) {
6407 : ZoneList<GuardedAlternative>* alts = alternatives();
6408 15618 : budget = (budget - 1) / alts->length();
6409 90195 : for (int i = 0; i < alts->length(); i++) {
6410 : GuardedAlternative& alt = alts->at(i);
6411 41665 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6412 : bm->SetRest(offset); // Give up trying to fill in info.
6413 : SaveBMInfo(bm, not_at_start, offset);
6414 : return;
6415 : }
6416 41193 : alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6417 : }
6418 : SaveBMInfo(bm, not_at_start, offset);
6419 : }
6420 :
6421 :
6422 122981 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6423 : BoyerMooreLookahead* bm, bool not_at_start) {
6424 122981 : if (initial_offset >= bm->length()) return;
6425 : int offset = initial_offset;
6426 : int max_char = bm->max_char();
6427 403303 : for (int i = 0; i < elements()->length(); i++) {
6428 163519 : if (offset >= bm->length()) {
6429 17007 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6430 : return;
6431 : }
6432 146512 : TextElement text = elements()->at(i);
6433 146512 : if (text.text_type() == TextElement::ATOM) {
6434 : RegExpAtom* atom = text.atom();
6435 213580 : for (int j = 0; j < atom->length(); j++, offset++) {
6436 89008 : if (offset >= bm->length()) {
6437 6351 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6438 : return;
6439 : }
6440 165314 : uc16 character = atom->data()[j];
6441 82657 : if (IgnoreCase(atom->flags())) {
6442 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6443 4614 : int length = GetCaseIndependentLetters(
6444 : isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6445 4614 : chars);
6446 21786 : for (int j = 0; j < length; j++) {
6447 8586 : bm->Set(offset, chars[j]);
6448 : }
6449 : } else {
6450 78043 : if (character <= max_char) bm->Set(offset, character);
6451 : }
6452 : }
6453 : } else {
6454 : DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6455 : RegExpCharacterClass* char_class = text.char_class();
6456 : ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6457 98246 : if (char_class->is_negated()) {
6458 4373 : bm->SetAll(offset);
6459 : } else {
6460 671633 : for (int k = 0; k < ranges->length(); k++) {
6461 : CharacterRange& range = ranges->at(k);
6462 288880 : if (range.from() > max_char) continue;
6463 : int to = Min(max_char, static_cast<int>(range.to()));
6464 161809 : bm->SetInterval(offset, Interval(range.from(), to));
6465 : }
6466 : }
6467 98246 : offset++;
6468 : }
6469 : }
6470 99623 : if (offset >= bm->length()) {
6471 90337 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6472 : return;
6473 : }
6474 9286 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6475 18572 : true); // Not at start after a text node.
6476 9286 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6477 : }
6478 :
6479 :
6480 : // -------------------------------------------------------------------
6481 : // Dispatch table construction
6482 :
6483 :
6484 0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
6485 : AddRange(CharacterRange::Everything());
6486 0 : }
6487 :
6488 :
6489 0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6490 : node->set_being_calculated(true);
6491 : ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6492 0 : for (int i = 0; i < alternatives->length(); i++) {
6493 : set_choice_index(i);
6494 0 : alternatives->at(i).node()->Accept(this);
6495 : }
6496 : node->set_being_calculated(false);
6497 0 : }
6498 :
6499 :
6500 : class AddDispatchRange {
6501 : public:
6502 : explicit AddDispatchRange(DispatchTableConstructor* constructor)
6503 0 : : constructor_(constructor) { }
6504 : void Call(uc32 from, DispatchTable::Entry entry);
6505 : private:
6506 : DispatchTableConstructor* constructor_;
6507 : };
6508 :
6509 :
6510 0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6511 0 : constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6512 0 : }
6513 :
6514 :
6515 0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6516 0 : if (node->being_calculated())
6517 0 : return;
6518 0 : DispatchTable* table = node->GetTable(ignore_case_);
6519 : AddDispatchRange adder(this);
6520 : table->ForEach(&adder);
6521 : }
6522 :
6523 :
6524 0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6525 : // TODO(160): Find the node that we refer back to and propagate its start
6526 : // set back to here. For now we just accept anything.
6527 : AddRange(CharacterRange::Everything());
6528 0 : }
6529 :
6530 :
6531 0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6532 : RegExpNode* target = that->on_success();
6533 0 : target->Accept(this);
6534 0 : }
6535 :
6536 :
6537 3935 : static int CompareRangeByFrom(const CharacterRange* a,
6538 : const CharacterRange* b) {
6539 7870 : return Compare<uc16>(a->from(), b->from());
6540 : }
6541 :
6542 :
6543 55 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6544 : ranges->Sort(CompareRangeByFrom);
6545 : uc16 last = 0;
6546 1665 : for (int i = 0; i < ranges->length(); i++) {
6547 805 : CharacterRange range = ranges->at(i);
6548 805 : if (last < range.from())
6549 525 : AddRange(CharacterRange::Range(last, range.from() - 1));
6550 805 : if (range.to() >= last) {
6551 715 : if (range.to() == String::kMaxCodePoint) {
6552 : return;
6553 : } else {
6554 715 : last = range.to() + 1;
6555 : }
6556 : }
6557 : }
6558 55 : AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6559 : }
6560 :
6561 :
6562 0 : void DispatchTableConstructor::VisitText(TextNode* that) {
6563 0 : TextElement elm = that->elements()->at(0);
6564 0 : switch (elm.text_type()) {
6565 : case TextElement::ATOM: {
6566 0 : uc16 c = elm.atom()->data()[0];
6567 0 : AddRange(CharacterRange::Range(c, c));
6568 : break;
6569 : }
6570 : case TextElement::CHAR_CLASS: {
6571 : RegExpCharacterClass* tree = elm.char_class();
6572 : ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6573 0 : if (tree->is_negated()) {
6574 0 : AddInverse(ranges);
6575 : } else {
6576 0 : for (int i = 0; i < ranges->length(); i++)
6577 : AddRange(ranges->at(i));
6578 : }
6579 : break;
6580 : }
6581 : default: {
6582 0 : UNIMPLEMENTED();
6583 : }
6584 : }
6585 0 : }
6586 :
6587 :
6588 0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
6589 : RegExpNode* target = that->on_success();
6590 0 : target->Accept(this);
6591 0 : }
6592 :
6593 40 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6594 : RegExpNode* on_success,
6595 : JSRegExp::Flags flags) {
6596 : // If the regexp matching starts within a surrogate pair, step back
6597 : // to the lead surrogate and start matching from there.
6598 : DCHECK(!compiler->read_backward());
6599 : Zone* zone = compiler->zone();
6600 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6601 40 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6602 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6603 40 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6604 :
6605 40 : ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6606 :
6607 : int stack_register = compiler->UnicodeLookaroundStackRegister();
6608 : int position_register = compiler->UnicodeLookaroundPositionRegister();
6609 : RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6610 40 : zone, lead_surrogates, true, on_success, flags);
6611 : RegExpLookaround::Builder builder(true, step_back, stack_register,
6612 40 : position_register);
6613 : RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6614 40 : zone, trail_surrogates, false, builder.on_match_success(), flags);
6615 :
6616 40 : optional_step_back->AddAlternative(
6617 : GuardedAlternative(builder.ForMatch(match_trail)));
6618 : optional_step_back->AddAlternative(GuardedAlternative(on_success));
6619 :
6620 40 : return optional_step_back;
6621 : }
6622 :
6623 :
6624 85675 : RegExpEngine::CompilationResult RegExpEngine::Compile(
6625 : Isolate* isolate, Zone* zone, RegExpCompileData* data,
6626 : JSRegExp::Flags flags, Handle<String> pattern,
6627 : Handle<String> sample_subject, bool is_one_byte) {
6628 85675 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6629 : return IrregexpRegExpTooBig(isolate);
6630 : }
6631 : bool is_sticky = IsSticky(flags);
6632 : bool is_global = IsGlobal(flags);
6633 : bool is_unicode = IsUnicode(flags);
6634 85666 : RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6635 :
6636 85666 : if (compiler.optimize())
6637 84502 : compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6638 :
6639 : // Sample some characters from the middle of the string.
6640 : static const int kSampleSize = 128;
6641 :
6642 85666 : sample_subject = String::Flatten(isolate, sample_subject);
6643 : int chars_sampled = 0;
6644 85666 : int half_way = (sample_subject->length() - kSampleSize) / 2;
6645 997722 : for (int i = Max(0, half_way);
6646 541694 : i < sample_subject->length() && chars_sampled < kSampleSize;
6647 : i++, chars_sampled++) {
6648 : compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6649 : }
6650 :
6651 : // Wrap the body of the regexp in capture #0.
6652 85666 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6653 : 0,
6654 : &compiler,
6655 85666 : compiler.accept());
6656 : RegExpNode* node = captured_body;
6657 85666 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6658 85666 : bool is_start_anchored = data->tree->IsAnchoredAtStart();
6659 85666 : int max_length = data->tree->max_match();
6660 85666 : if (!is_start_anchored && !is_sticky) {
6661 : // Add a .*? at the beginning, outside the body capture, unless
6662 : // this expression is anchored at the beginning or sticky.
6663 : JSRegExp::Flags default_flags = JSRegExp::Flags();
6664 82723 : RegExpNode* loop_node = RegExpQuantifier::ToNode(
6665 : 0, RegExpTree::kInfinity, false,
6666 : new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6667 165446 : captured_body, data->contains_anchor);
6668 :
6669 82723 : if (data->contains_anchor) {
6670 : // Unroll loop once, to take care of the case that might start
6671 : // at the start of input.
6672 149 : ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6673 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
6674 : first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6675 : new (zone) RegExpCharacterClass('*', default_flags), false,
6676 149 : loop_node)));
6677 : node = first_step_node;
6678 : } else {
6679 : node = loop_node;
6680 : }
6681 : }
6682 85666 : if (is_one_byte) {
6683 14611 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6684 : // Do it again to propagate the new nodes to places where they were not
6685 : // put because they had not been calculated yet.
6686 14611 : if (node != nullptr) {
6687 14311 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6688 : }
6689 71055 : } else if (is_unicode && (is_global || is_sticky)) {
6690 40 : node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6691 : }
6692 :
6693 85666 : if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6694 85666 : data->node = node;
6695 : Analysis analysis(isolate, is_one_byte);
6696 85666 : analysis.EnsureAnalyzed(node);
6697 85666 : if (analysis.has_failed()) {
6698 : const char* error_message = analysis.error_message();
6699 : return CompilationResult(isolate, error_message);
6700 : }
6701 :
6702 : // Create the correct assembler for the architecture.
6703 : std::unique_ptr<RegExpMacroAssembler> macro_assembler;
6704 85312 : if (!FLAG_regexp_interpret_all) {
6705 : // Native regexp implementation.
6706 : DCHECK(!FLAG_jitless);
6707 :
6708 : NativeRegExpMacroAssembler::Mode mode =
6709 : is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6710 82055 : : NativeRegExpMacroAssembler::UC16;
6711 :
6712 : #if V8_TARGET_ARCH_IA32
6713 : macro_assembler.reset(new RegExpMacroAssemblerIA32(
6714 : isolate, zone, mode, (data->capture_count + 1) * 2));
6715 : #elif V8_TARGET_ARCH_X64
6716 82055 : macro_assembler.reset(new RegExpMacroAssemblerX64(
6717 82055 : isolate, zone, mode, (data->capture_count + 1) * 2));
6718 : #elif V8_TARGET_ARCH_ARM
6719 : macro_assembler.reset(new RegExpMacroAssemblerARM(
6720 : isolate, zone, mode, (data->capture_count + 1) * 2));
6721 : #elif V8_TARGET_ARCH_ARM64
6722 : macro_assembler.reset(new RegExpMacroAssemblerARM64(
6723 : isolate, zone, mode, (data->capture_count + 1) * 2));
6724 : #elif V8_TARGET_ARCH_S390
6725 : macro_assembler.reset(new RegExpMacroAssemblerS390(
6726 : isolate, zone, mode, (data->capture_count + 1) * 2));
6727 : #elif V8_TARGET_ARCH_PPC
6728 : macro_assembler.reset(new RegExpMacroAssemblerPPC(
6729 : isolate, zone, mode, (data->capture_count + 1) * 2));
6730 : #elif V8_TARGET_ARCH_MIPS
6731 : macro_assembler.reset(new RegExpMacroAssemblerMIPS(
6732 : isolate, zone, mode, (data->capture_count + 1) * 2));
6733 : #elif V8_TARGET_ARCH_MIPS64
6734 : macro_assembler.reset(new RegExpMacroAssemblerMIPS(
6735 : isolate, zone, mode, (data->capture_count + 1) * 2));
6736 : #else
6737 : #error "Unsupported architecture"
6738 : #endif
6739 : } else {
6740 : DCHECK(FLAG_regexp_interpret_all);
6741 :
6742 : // Interpreted regexp implementation.
6743 3257 : macro_assembler.reset(new RegExpMacroAssemblerIrregexp(isolate, zone));
6744 : }
6745 :
6746 85312 : macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6747 :
6748 : // Inserted here, instead of in Assembler, because it depends on information
6749 : // in the AST that isn't replicated in the Node structure.
6750 : static const int kMaxBacksearchLimit = 1024;
6751 85855 : if (is_end_anchored && !is_start_anchored && !is_sticky &&
6752 543 : max_length < kMaxBacksearchLimit) {
6753 210 : macro_assembler->SetCurrentPositionFromEnd(max_length);
6754 : }
6755 :
6756 85312 : if (is_global) {
6757 : RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6758 3819 : if (data->tree->min_match() > 0) {
6759 : mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6760 138 : } else if (is_unicode) {
6761 : mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6762 : }
6763 : macro_assembler->set_global_mode(mode);
6764 : }
6765 :
6766 : return compiler.Assemble(isolate, macro_assembler.get(), node,
6767 85312 : data->capture_count, pattern);
6768 : }
6769 :
6770 169814 : bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6771 : Heap* heap = isolate->heap();
6772 169814 : bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6773 169814 : if (isolate->total_regexp_code_generated() >
6774 298367 : RegExpImpl::kRegExpCompiledLimit &&
6775 128553 : heap->CommittedMemoryExecutable() >
6776 : RegExpImpl::kRegExpExecutableMemoryLimit) {
6777 : too_much = true;
6778 : }
6779 169814 : return too_much;
6780 : }
6781 :
6782 36147 : Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
6783 : Object key_pattern,
6784 : FixedArray* last_match_cache,
6785 : ResultsCacheType type) {
6786 : FixedArray cache;
6787 36147 : if (!key_string->IsInternalizedString()) return Smi::kZero;
6788 5230 : if (type == STRING_SPLIT_SUBSTRINGS) {
6789 : DCHECK(key_pattern->IsString());
6790 5230 : if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6791 : cache = heap->string_split_cache();
6792 : } else {
6793 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6794 : DCHECK(key_pattern->IsFixedArray());
6795 : cache = heap->regexp_multiple_cache();
6796 : }
6797 :
6798 5230 : uint32_t hash = key_string->Hash();
6799 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6800 5230 : ~(kArrayEntriesPerCacheEntry - 1));
6801 14432 : if (cache->get(index + kStringOffset) != key_string ||
6802 3972 : cache->get(index + kPatternOffset) != key_pattern) {
6803 : index =
6804 1289 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6805 2597 : if (cache->get(index + kStringOffset) != key_string ||
6806 19 : cache->get(index + kPatternOffset) != key_pattern) {
6807 1281 : return Smi::kZero;
6808 : }
6809 : }
6810 :
6811 7898 : *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6812 3949 : return cache->get(index + kArrayOffset);
6813 : }
6814 :
6815 32198 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6816 : Handle<Object> key_pattern,
6817 : Handle<FixedArray> value_array,
6818 : Handle<FixedArray> last_match_cache,
6819 : ResultsCacheType type) {
6820 : Factory* factory = isolate->factory();
6821 : Handle<FixedArray> cache;
6822 32198 : if (!key_string->IsInternalizedString()) return;
6823 1281 : if (type == STRING_SPLIT_SUBSTRINGS) {
6824 : DCHECK(key_pattern->IsString());
6825 1281 : if (!key_pattern->IsInternalizedString()) return;
6826 : cache = factory->string_split_cache();
6827 : } else {
6828 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6829 : DCHECK(key_pattern->IsFixedArray());
6830 : cache = factory->regexp_multiple_cache();
6831 : }
6832 :
6833 1281 : uint32_t hash = key_string->Hash();
6834 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6835 1281 : ~(kArrayEntriesPerCacheEntry - 1));
6836 2562 : if (cache->get(index + kStringOffset) == Smi::kZero) {
6837 2288 : cache->set(index + kStringOffset, *key_string);
6838 2288 : cache->set(index + kPatternOffset, *key_pattern);
6839 2288 : cache->set(index + kArrayOffset, *value_array);
6840 2288 : cache->set(index + kLastMatchOffset, *last_match_cache);
6841 : } else {
6842 : uint32_t index2 =
6843 137 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6844 274 : if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6845 188 : cache->set(index2 + kStringOffset, *key_string);
6846 188 : cache->set(index2 + kPatternOffset, *key_pattern);
6847 188 : cache->set(index2 + kArrayOffset, *value_array);
6848 188 : cache->set(index2 + kLastMatchOffset, *last_match_cache);
6849 : } else {
6850 43 : cache->set(index2 + kStringOffset, Smi::kZero);
6851 86 : cache->set(index2 + kPatternOffset, Smi::kZero);
6852 86 : cache->set(index2 + kArrayOffset, Smi::kZero);
6853 86 : cache->set(index2 + kLastMatchOffset, Smi::kZero);
6854 86 : cache->set(index + kStringOffset, *key_string);
6855 86 : cache->set(index + kPatternOffset, *key_pattern);
6856 86 : cache->set(index + kArrayOffset, *value_array);
6857 86 : cache->set(index + kLastMatchOffset, *last_match_cache);
6858 : }
6859 : }
6860 : // If the array is a reasonably short list of substrings, convert it into a
6861 : // list of internalized strings.
6862 2562 : if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6863 12729 : for (int i = 0; i < value_array->length(); i++) {
6864 : Handle<String> str(String::cast(value_array->get(i)), isolate);
6865 5740 : Handle<String> internalized_str = factory->InternalizeString(str);
6866 11480 : value_array->set(i, *internalized_str);
6867 : }
6868 : }
6869 : // Convert backing store to a copy-on-write array.
6870 : value_array->set_map_no_write_barrier(
6871 : ReadOnlyRoots(isolate).fixed_cow_array_map());
6872 : }
6873 :
6874 147910 : void RegExpResultsCache::Clear(FixedArray cache) {
6875 75877830 : for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6876 37864960 : cache->set(i, Smi::kZero);
6877 : }
6878 147910 : }
6879 :
6880 : } // namespace internal
6881 120216 : } // namespace v8
|