Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/jsregexp.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "src/base/platform/platform.h"
11 : #include "src/code-tracer.h"
12 : #include "src/compilation-cache.h"
13 : #include "src/elements.h"
14 : #include "src/execution.h"
15 : #include "src/heap/factory.h"
16 : #include "src/heap/heap-inl.h"
17 : #include "src/isolate-inl.h"
18 : #include "src/message-template.h"
19 : #include "src/ostreams.h"
20 : #include "src/regexp/interpreter-irregexp.h"
21 : #include "src/regexp/jsregexp-inl.h"
22 : #include "src/regexp/regexp-macro-assembler-irregexp.h"
23 : #include "src/regexp/regexp-macro-assembler-tracer.h"
24 : #include "src/regexp/regexp-macro-assembler.h"
25 : #include "src/regexp/regexp-parser.h"
26 : #include "src/regexp/regexp-stack.h"
27 : #include "src/runtime/runtime.h"
28 : #include "src/splay-tree-inl.h"
29 : #include "src/string-search.h"
30 : #include "src/unicode-decoder.h"
31 : #include "src/unicode-inl.h"
32 : #include "src/zone/zone-list-inl.h"
33 :
34 : #ifdef V8_INTL_SUPPORT
35 : #include "unicode/uniset.h"
36 : #include "unicode/utypes.h"
37 : #endif // V8_INTL_SUPPORT
38 :
39 : #if V8_TARGET_ARCH_IA32
40 : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
41 : #elif V8_TARGET_ARCH_X64
42 : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
43 : #elif V8_TARGET_ARCH_ARM64
44 : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
45 : #elif V8_TARGET_ARCH_ARM
46 : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
47 : #elif V8_TARGET_ARCH_PPC
48 : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
49 : #elif V8_TARGET_ARCH_S390
50 : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
51 : #elif V8_TARGET_ARCH_MIPS
52 : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
53 : #elif V8_TARGET_ARCH_MIPS64
54 : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
55 : #else
56 : #error Unsupported target architecture.
57 : #endif
58 :
59 : namespace v8 {
60 : namespace internal {
61 :
62 : V8_WARN_UNUSED_RESULT
63 3167 : static inline MaybeHandle<Object> ThrowRegExpException(
64 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
65 : Handle<String> error_text) {
66 3167 : THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 : pattern, error_text),
68 : Object);
69 : }
70 :
71 319 : inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
72 : Handle<String> error_text) {
73 638 : USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
74 : error_text));
75 319 : }
76 :
77 :
78 964900 : ContainedInLattice AddRange(ContainedInLattice containment,
79 : const int* ranges,
80 : int ranges_length,
81 : Interval new_range) {
82 : DCHECK_EQ(1, ranges_length & 1);
83 : DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
84 964900 : if (containment == kLatticeUnknown) return containment;
85 : bool inside = false;
86 : int last = 0;
87 4569493 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 : // Consider the range from last to ranges[i].
89 : // We haven't got to the new range yet.
90 5424679 : if (ranges[i] <= new_range.from()) continue;
91 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 : // inclusive, but the values in ranges are not.
93 855186 : if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 1676032 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 : }
96 : return kLatticeUnknown;
97 : }
98 : return containment;
99 : }
100 :
101 : // More makes code generation slower, less makes V8 benchmark score lower.
102 : const int kMaxLookaheadForBoyerMoore = 8;
103 : // In a 3-character pattern you can maximally step forwards 3 characters
104 : // at a time, which is not always enough to pay for the extra logic.
105 : const int kPatternTooShortForBoyerMoore = 2;
106 :
107 : // Identifies the sort of regexps where the regexp engine is faster
108 : // than the code used for atom matches.
109 177201 : static bool HasFewDifferentCharacters(Handle<String> pattern) {
110 : int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
111 177201 : if (length <= kPatternTooShortForBoyerMoore) return false;
112 : const int kMod = 128;
113 : bool character_found[kMod];
114 : int different = 0;
115 : memset(&character_found[0], 0, sizeof(character_found));
116 515267 : for (int i = 0; i < length; i++) {
117 1030358 : int ch = (pattern->Get(i) & (kMod - 1));
118 515179 : if (!character_found[ch]) {
119 514759 : character_found[ch] = true;
120 514759 : different++;
121 : // We declare a regexp low-alphabet if it has at least 3 times as many
122 : // characters as it has different characters.
123 514759 : if (different * 3 > length) return false;
124 : }
125 : }
126 : return true;
127 : }
128 :
129 : // Generic RegExp methods. Dispatches to implementation specific methods.
130 :
131 919626 : MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
132 : Handle<String> pattern,
133 : JSRegExp::Flags flags) {
134 : DCHECK(pattern->IsFlat());
135 :
136 459813 : Zone zone(isolate->allocator(), ZONE_NAME);
137 : CompilationCache* compilation_cache = isolate->compilation_cache();
138 : MaybeHandle<FixedArray> maybe_cached =
139 459813 : compilation_cache->LookupRegExp(pattern, flags);
140 : Handle<FixedArray> cached;
141 459813 : if (maybe_cached.ToHandle(&cached)) {
142 389038 : re->set_data(*cached);
143 194519 : return re;
144 : }
145 :
146 : PostponeInterruptsScope postpone(isolate);
147 : RegExpCompileData parse_result;
148 265294 : FlatStringReader reader(isolate, pattern);
149 : DCHECK(!isolate->has_pending_exception());
150 265294 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
151 265294 : &parse_result)) {
152 : // Throw an exception if we fail to parse the pattern.
153 2798 : return ThrowRegExpException(isolate, re, pattern, parse_result.error);
154 : }
155 :
156 : bool has_been_compiled = false;
157 :
158 774665 : if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
159 170538 : !HasFewDifferentCharacters(pattern)) {
160 : // Parse-tree is a single atom that is equal to the pattern.
161 : AtomCompile(isolate, re, pattern, flags, pattern);
162 : has_been_compiled = true;
163 106425 : } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
164 7187 : parse_result.capture_count == 0) {
165 7177 : RegExpAtom* atom = parse_result.tree->AsAtom();
166 7177 : Vector<const uc16> atom_pattern = atom->data();
167 : Handle<String> atom_string;
168 14354 : ASSIGN_RETURN_ON_EXCEPTION(
169 : isolate, atom_string,
170 : isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
171 7177 : if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
172 : AtomCompile(isolate, re, pattern, flags, atom_string);
173 : has_been_compiled = true;
174 : }
175 : }
176 262496 : if (!has_been_compiled) {
177 85383 : IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
178 : }
179 : DCHECK(re->data()->IsFixedArray());
180 : // Compilation succeeded so the data is set on the regexp
181 : // and we can store it in the cache.
182 : Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
183 262496 : compilation_cache->PutRegExp(pattern, flags, data);
184 :
185 722309 : return re;
186 : }
187 :
188 4353070 : MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
189 : Handle<String> subject, int index,
190 : Handle<RegExpMatchInfo> last_match_info) {
191 4353070 : switch (regexp->TypeTag()) {
192 : case JSRegExp::ATOM:
193 286 : return AtomExec(isolate, regexp, subject, index, last_match_info);
194 : case JSRegExp::IRREGEXP: {
195 4352784 : return IrregexpExec(isolate, regexp, subject, index, last_match_info);
196 : }
197 : default:
198 0 : UNREACHABLE();
199 : }
200 : }
201 :
202 :
203 : // RegExp Atom implementation: Simple string search using indexOf.
204 :
205 0 : void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
206 : Handle<String> pattern, JSRegExp::Flags flags,
207 : Handle<String> match_pattern) {
208 : isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
209 177113 : match_pattern);
210 0 : }
211 :
212 273 : static void SetAtomLastCapture(Isolate* isolate,
213 : Handle<RegExpMatchInfo> last_match_info,
214 : String subject, int from, int to) {
215 : SealHandleScope shs(isolate);
216 : last_match_info->SetNumberOfCaptureRegisters(2);
217 546 : last_match_info->SetLastSubject(subject);
218 546 : last_match_info->SetLastInput(subject);
219 273 : last_match_info->SetCapture(0, from);
220 273 : last_match_info->SetCapture(1, to);
221 273 : }
222 :
223 90541 : int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
224 : Handle<String> subject, int index, int32_t* output,
225 : int output_size) {
226 : DCHECK_LE(0, index);
227 : DCHECK_LE(index, subject->length());
228 :
229 90541 : subject = String::Flatten(isolate, subject);
230 : DisallowHeapAllocation no_gc; // ensure vectors stay valid
231 :
232 181082 : String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
233 : int needle_len = needle->length();
234 : DCHECK(needle->IsFlat());
235 : DCHECK_LT(0, needle_len);
236 :
237 181082 : if (index + needle_len > subject->length()) {
238 : return RegExpImpl::RE_FAILURE;
239 : }
240 :
241 91470 : for (int i = 0; i < output_size; i += 2) {
242 181736 : String::FlatContent needle_content = needle->GetFlatContent(no_gc);
243 181736 : String::FlatContent subject_content = subject->GetFlatContent(no_gc);
244 : DCHECK(needle_content.IsFlat());
245 : DCHECK(subject_content.IsFlat());
246 : // dispatch on type of strings
247 : index =
248 181736 : (needle_content.IsOneByte()
249 : ? (subject_content.IsOneByte()
250 : ? SearchString(isolate, subject_content.ToOneByteVector(),
251 : needle_content.ToOneByteVector(), index)
252 : : SearchString(isolate, subject_content.ToUC16Vector(),
253 : needle_content.ToOneByteVector(), index))
254 : : (subject_content.IsOneByte()
255 : ? SearchString(isolate, subject_content.ToOneByteVector(),
256 : needle_content.ToUC16Vector(), index)
257 : : SearchString(isolate, subject_content.ToUC16Vector(),
258 363472 : needle_content.ToUC16Vector(), index)));
259 181736 : if (index == -1) {
260 90266 : return i / 2; // Return number of matches.
261 : } else {
262 91470 : output[i] = index;
263 91470 : output[i+1] = index + needle_len;
264 : index += needle_len;
265 : }
266 : }
267 273 : return output_size / 2;
268 : }
269 :
270 286 : Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
271 : Handle<String> subject, int index,
272 : Handle<RegExpMatchInfo> last_match_info) {
273 : static const int kNumRegisters = 2;
274 : STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
275 286 : int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
276 :
277 : int res =
278 286 : AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
279 :
280 299 : if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
281 :
282 : DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
283 : SealHandleScope shs(isolate);
284 : SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
285 546 : output_registers[1]);
286 273 : return last_match_info;
287 : }
288 :
289 :
290 : // Irregexp implementation.
291 :
292 : // Ensures that the regexp object contains a compiled version of the
293 : // source for either one-byte or two-byte subject strings.
294 : // If the compiled version doesn't already exist, it is compiled
295 : // from the source pattern.
296 : // If compilation fails, an exception is thrown and this function
297 : // returns false.
298 4471338 : bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
299 : Handle<String> sample_subject,
300 : bool is_one_byte) {
301 4471338 : Object compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
302 4471338 : if (compiled_code != Smi::FromInt(JSRegExp::kUninitializedValue)) {
303 : DCHECK(FLAG_regexp_interpret_all ? compiled_code->IsByteArray()
304 : : compiled_code->IsCode());
305 : return true;
306 : }
307 85591 : return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
308 : }
309 :
310 85591 : bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
311 : Handle<String> sample_subject,
312 : bool is_one_byte) {
313 : // Compile the RegExp.
314 85591 : Zone zone(isolate->allocator(), ZONE_NAME);
315 : PostponeInterruptsScope postpone(isolate);
316 : #ifdef DEBUG
317 : Object entry = re->DataAt(JSRegExp::code_index(is_one_byte));
318 : // When arriving here entry can only be a smi representing an uncompiled
319 : // regexp.
320 : DCHECK(entry->IsSmi());
321 : int entry_value = Smi::ToInt(entry);
322 : DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
323 : #endif
324 :
325 85591 : JSRegExp::Flags flags = re->GetFlags();
326 :
327 171182 : Handle<String> pattern(re->Pattern(), isolate);
328 85591 : pattern = String::Flatten(isolate, pattern);
329 : RegExpCompileData compile_data;
330 85591 : FlatStringReader reader(isolate, pattern);
331 85591 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
332 85591 : &compile_data)) {
333 : // Throw an exception if we fail to parse the pattern.
334 : // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
335 50 : USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
336 50 : return false;
337 : }
338 : RegExpEngine::CompilationResult result =
339 : RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
340 85541 : sample_subject, is_one_byte);
341 85541 : if (result.error_message != nullptr) {
342 : // Unable to compile regexp.
343 319 : if (FLAG_abort_on_stack_or_string_length_overflow &&
344 0 : strncmp(result.error_message, "Stack overflow", 15) == 0) {
345 0 : FATAL("Aborting on stack overflow");
346 : }
347 : Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
348 638 : CStrVector(result.error_message)).ToHandleChecked();
349 319 : ThrowRegExpException(isolate, re, error_message);
350 : return false;
351 : }
352 :
353 : Handle<FixedArray> data =
354 : Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
355 85222 : data->set(JSRegExp::code_index(is_one_byte), result.code);
356 85222 : SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
357 85222 : int register_max = IrregexpMaxRegisterCount(*data);
358 85222 : if (result.num_registers > register_max) {
359 : SetIrregexpMaxRegisterCount(*data, result.num_registers);
360 : }
361 :
362 85591 : return true;
363 : }
364 :
365 85222 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
366 : return Smi::cast(
367 85222 : re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
368 : }
369 :
370 0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
371 : re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
372 0 : }
373 :
374 85222 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re,
375 : Handle<FixedArray> value) {
376 85222 : if (value.is_null()) {
377 84862 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
378 : } else {
379 360 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
380 : }
381 85222 : }
382 :
383 12805375 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
384 12805375 : return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
385 : }
386 :
387 4254530 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray re) {
388 4254530 : return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
389 : }
390 :
391 4284682 : ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
392 4284682 : return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
393 : }
394 :
395 111152 : Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
396 111152 : return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
397 : }
398 :
399 0 : void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
400 : Handle<String> pattern,
401 : JSRegExp::Flags flags, int capture_count) {
402 : // Initialize compiled code entries to null.
403 : isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
404 85383 : flags, capture_count);
405 0 : }
406 :
407 4360186 : int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
408 : Handle<String> subject) {
409 : DCHECK(subject->IsFlat());
410 :
411 : // Check representation of the underlying storage.
412 4360186 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
413 4360186 : if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
414 :
415 4359817 : if (FLAG_regexp_interpret_all) {
416 : // Byte-code regexp needs space allocated for all its registers.
417 : // The result captures are copied to the start of the registers array
418 : // if the match succeeds. This way those registers are not clobbered
419 : // when we set the last match info from last successful match.
420 4254530 : return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
421 4254530 : (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
422 : } else {
423 : // Native regexp only needs room to output captures. Registers are handled
424 : // internally.
425 105287 : return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
426 : }
427 : }
428 :
429 4395834 : int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
430 : Handle<String> subject, int index,
431 : int32_t* output, int output_size) {
432 : Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
433 :
434 : DCHECK_LE(0, index);
435 : DCHECK_LE(index, subject->length());
436 : DCHECK(subject->IsFlat());
437 :
438 4395834 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
439 :
440 4395834 : if (!FLAG_regexp_interpret_all) {
441 : DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
442 : do {
443 111152 : EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
444 111152 : Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
445 : // The stack is used to allocate registers for the compiled regexp code.
446 : // This means that in case of failure, the output registers array is left
447 : // untouched and contains the capture results from the previous successful
448 : // match. We can use that to set the last match info lazily.
449 : int res = NativeRegExpMacroAssembler::Match(code, subject, output,
450 111152 : output_size, index, isolate);
451 111152 : if (res != NativeRegExpMacroAssembler::RETRY) {
452 : DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
453 : isolate->has_pending_exception());
454 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
455 : RE_SUCCESS);
456 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
457 : RE_FAILURE);
458 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
459 : RE_EXCEPTION);
460 111152 : return res;
461 : }
462 : // If result is RETRY, the string has changed representation, and we
463 : // must restart from scratch.
464 : // In this case, it means we must make sure we are prepared to handle
465 : // the, potentially, different subject (the string can switch between
466 : // being internal and external, and even between being Latin1 and UC16,
467 : // but the characters are always the same).
468 0 : IrregexpPrepare(isolate, regexp, subject);
469 0 : is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
470 : } while (true);
471 0 : UNREACHABLE();
472 : } else {
473 : DCHECK(FLAG_regexp_interpret_all);
474 : DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
475 : // We must have done EnsureCompiledIrregexp, so we can get the number of
476 : // registers.
477 : int number_of_capture_registers =
478 4284682 : (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
479 4284682 : int32_t* raw_output = &output[number_of_capture_registers];
480 : // We do not touch the actual capture result registers until we know there
481 : // has been a match so that we can use those capture results to set the
482 : // last match info.
483 13154376 : for (int i = number_of_capture_registers - 1; i >= 0; i--) {
484 8869694 : raw_output[i] = -1;
485 : }
486 : Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
487 8569364 : isolate);
488 :
489 : IrregexpResult result = IrregexpInterpreter::Match(
490 4284682 : isolate, byte_codes, subject, raw_output, index);
491 4284682 : if (result == RE_SUCCESS) {
492 : // Copy capture results to the start of the registers array.
493 : MemCopy(output, raw_output,
494 : number_of_capture_registers * sizeof(int32_t));
495 : }
496 4284682 : if (result == RE_EXCEPTION) {
497 : DCHECK(!isolate->has_pending_exception());
498 1 : isolate->StackOverflow();
499 : }
500 : return result;
501 : }
502 : }
503 :
504 4352784 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
505 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
506 : int previous_index, Handle<RegExpMatchInfo> last_match_info) {
507 : DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
508 :
509 4352784 : subject = String::Flatten(isolate, subject);
510 :
511 : // Prepare space for the return values.
512 : #ifdef DEBUG
513 : if (FLAG_regexp_interpret_all && FLAG_trace_regexp_bytecodes) {
514 : String pattern = regexp->Pattern();
515 : PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
516 : PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
517 : }
518 : #endif
519 : int required_registers =
520 4352784 : RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
521 4352784 : if (required_registers < 0) {
522 : // Compiling failed with an exception.
523 : DCHECK(isolate->has_pending_exception());
524 279 : return MaybeHandle<Object>();
525 : }
526 :
527 : int32_t* output_registers = nullptr;
528 4352505 : if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
529 2844 : output_registers = NewArray<int32_t>(required_registers);
530 : }
531 : std::unique_ptr<int32_t[]> auto_release(output_registers);
532 4352505 : if (output_registers == nullptr) {
533 4349661 : output_registers = isolate->jsregexp_static_offsets_vector();
534 : }
535 :
536 : int res =
537 : RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
538 4352505 : output_registers, required_registers);
539 4352505 : if (res == RE_SUCCESS) {
540 : int capture_count =
541 4160876 : IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
542 : return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
543 4160876 : output_registers);
544 : }
545 191629 : if (res == RE_EXCEPTION) {
546 : DCHECK(isolate->has_pending_exception());
547 49 : return MaybeHandle<Object>();
548 : }
549 : DCHECK(res == RE_FAILURE);
550 191580 : return isolate->factory()->null_value();
551 : }
552 :
553 4255636 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
554 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
555 : Handle<String> subject, int capture_count, int32_t* match) {
556 : // This is the only place where match infos can grow. If, after executing the
557 : // regexp, RegExpExecStub finds that the match info is too small, it restarts
558 : // execution in RegExpImpl::Exec, which finally grows the match info right
559 : // here.
560 :
561 4255636 : int capture_register_count = (capture_count + 1) * 2;
562 : Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
563 4255636 : isolate, last_match_info, capture_register_count);
564 : result->SetNumberOfCaptureRegisters(capture_register_count);
565 :
566 4255636 : if (*result != *last_match_info) {
567 : DCHECK_EQ(*last_match_info, *isolate->regexp_last_match_info());
568 4174 : isolate->native_context()->set_regexp_last_match_info(*result);
569 : }
570 :
571 : DisallowHeapAllocation no_allocation;
572 4255636 : if (match != nullptr) {
573 5458387 : for (int i = 0; i < capture_register_count; i += 2) {
574 10916774 : result->SetCapture(i, match[i]);
575 10916774 : result->SetCapture(i + 1, match[i + 1]);
576 : }
577 : }
578 8511272 : result->SetLastSubject(*subject);
579 8511272 : result->SetLastInput(*subject);
580 4255636 : return result;
581 : }
582 :
583 95497 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
584 : Handle<String> subject, Isolate* isolate)
585 : : register_array_(nullptr),
586 : register_array_size_(0),
587 : regexp_(regexp),
588 : subject_(subject),
589 95497 : isolate_(isolate) {
590 95497 : bool interpreted = FLAG_regexp_interpret_all;
591 :
592 95497 : if (regexp_->TypeTag() == JSRegExp::ATOM) {
593 : static const int kAtomRegistersPerMatch = 2;
594 90255 : registers_per_match_ = kAtomRegistersPerMatch;
595 : // There is no distinction between interpreted and native for atom regexps.
596 : interpreted = false;
597 : } else {
598 : registers_per_match_ =
599 5242 : RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
600 5242 : if (registers_per_match_ < 0) {
601 90 : num_matches_ = -1; // Signal exception.
602 95587 : return;
603 : }
604 : }
605 :
606 : DCHECK(IsGlobal(regexp->GetFlags()));
607 95407 : if (!interpreted) {
608 : register_array_size_ =
609 189500 : Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
610 94750 : max_matches_ = register_array_size_ / registers_per_match_;
611 : } else {
612 : // Global loop in interpreted regexp is not implemented. We choose
613 : // the size of the offsets vector so that it can only store one match.
614 657 : register_array_size_ = registers_per_match_;
615 657 : max_matches_ = 1;
616 : }
617 :
618 95407 : if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
619 1072 : register_array_ = NewArray<int32_t>(register_array_size_);
620 : } else {
621 94335 : register_array_ = isolate->jsregexp_static_offsets_vector();
622 : }
623 :
624 : // Set state so that fetching the results the first time triggers a call
625 : // to the compiled regexp.
626 95407 : current_match_index_ = max_matches_ - 1;
627 95407 : num_matches_ = max_matches_;
628 : DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
629 : DCHECK_GE(register_array_size_, registers_per_match_);
630 : int32_t* last_match =
631 95407 : ®ister_array_[current_match_index_ * registers_per_match_];
632 95407 : last_match[0] = -1;
633 95407 : last_match[1] = 0;
634 : }
635 :
636 7 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
637 14 : if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
638 14 : unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
639 7 : unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
640 : // Advance over the surrogate pair.
641 0 : return last_index + 2;
642 : }
643 7 : return last_index + 1;
644 : }
645 :
646 : // -------------------------------------------------------------------
647 : // Implementation of the Irregexp regular expression engine.
648 : //
649 : // The Irregexp regular expression engine is intended to be a complete
650 : // implementation of ECMAScript regular expressions. It generates either
651 : // bytecodes or native code.
652 :
653 : // The Irregexp regexp engine is structured in three steps.
654 : // 1) The parser generates an abstract syntax tree. See ast.cc.
655 : // 2) From the AST a node network is created. The nodes are all
656 : // subclasses of RegExpNode. The nodes represent states when
657 : // executing a regular expression. Several optimizations are
658 : // performed on the node network.
659 : // 3) From the nodes we generate either byte codes or native code
660 : // that can actually execute the regular expression (perform
661 : // the search). The code generation step is described in more
662 : // detail below.
663 :
664 : // Code generation.
665 : //
666 : // The nodes are divided into four main categories.
667 : // * Choice nodes
668 : // These represent places where the regular expression can
669 : // match in more than one way. For example on entry to an
670 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
671 : // * Action nodes
672 : // These represent places where some action should be
673 : // performed. Examples include recording the current position
674 : // in the input string to a register (in order to implement
675 : // captures) or other actions on register for example in order
676 : // to implement the counters needed for {} repetitions.
677 : // * Matching nodes
678 : // These attempt to match some element part of the input string.
679 : // Examples of elements include character classes, plain strings
680 : // or back references.
681 : // * End nodes
682 : // These are used to implement the actions required on finding
683 : // a successful match or failing to find a match.
684 : //
685 : // The code generated (whether as byte codes or native code) maintains
686 : // some state as it runs. This consists of the following elements:
687 : //
688 : // * The capture registers. Used for string captures.
689 : // * Other registers. Used for counters etc.
690 : // * The current position.
691 : // * The stack of backtracking information. Used when a matching node
692 : // fails to find a match and needs to try an alternative.
693 : //
694 : // Conceptual regular expression execution model:
695 : //
696 : // There is a simple conceptual model of regular expression execution
697 : // which will be presented first. The actual code generated is a more
698 : // efficient simulation of the simple conceptual model:
699 : //
700 : // * Choice nodes are implemented as follows:
701 : // For each choice except the last {
702 : // push current position
703 : // push backtrack code location
704 : // <generate code to test for choice>
705 : // backtrack code location:
706 : // pop current position
707 : // }
708 : // <generate code to test for last choice>
709 : //
710 : // * Action nodes are generated as follows:
711 : // <push affected registers on backtrack stack>
712 : // <generate code to perform action>
713 : // push backtrack code location
714 : // <generate code to test for following nodes>
715 : // backtrack code location:
716 : // <pop affected registers to restore their state>
717 : // <pop backtrack location from stack and go to it>
718 : //
719 : // * Matching nodes are generated as follows:
720 : // if input string matches at current position
721 : // update current position
722 : // <generate code to test for following nodes>
723 : // else
724 : // <pop backtrack location from stack and go to it>
725 : //
726 : // Thus it can be seen that the current position is saved and restored
727 : // by the choice nodes, whereas the registers are saved and restored by
728 : // the action nodes that manipulate them.
729 : //
730 : // The other interesting aspect of this model is that nodes are generated
731 : // at the point where they are needed by a recursive call to Emit(). If
732 : // the node has already been code generated then the Emit() call will
733 : // generate a jump to the previously generated code instead. In order to
734 : // limit recursion it is possible for the Emit() function to put the node
735 : // on a work list for later generation and instead generate a jump. The
736 : // destination of the jump is resolved later when the code is generated.
737 : //
738 : // Actual regular expression code generation.
739 : //
740 : // Code generation is actually more complicated than the above. In order
741 : // to improve the efficiency of the generated code some optimizations are
742 : // performed:
743 : //
744 : // * Choice nodes have 1-character lookahead.
745 : // A choice node looks at the following character and eliminates some of
746 : // the choices immediately based on that character. This is not yet
747 : // implemented.
748 : // * Simple greedy loops store reduced backtracking information.
749 : // A quantifier like /.*foo/m will greedily match the whole input. It will
750 : // then need to backtrack to a point where it can match "foo". The naive
751 : // implementation of this would push each character position onto the
752 : // backtracking stack, then pop them off one by one. This would use space
753 : // proportional to the length of the input string. However since the "."
754 : // can only match in one way and always has a constant length (in this case
755 : // of 1) it suffices to store the current position on the top of the stack
756 : // once. Matching now becomes merely incrementing the current position and
757 : // backtracking becomes decrementing the current position and checking the
758 : // result against the stored current position. This is faster and saves
759 : // space.
760 : // * The current state is virtualized.
761 : // This is used to defer expensive operations until it is clear that they
762 : // are needed and to generate code for a node more than once, allowing
763 : // specialized and efficient versions of the code to be created. This is
764 : // explained in the section below.
765 : //
766 : // Execution state virtualization.
767 : //
768 : // Instead of emitting code, nodes that manipulate the state can record their
769 : // manipulation in an object called the Trace. The Trace object can record a
770 : // current position offset, an optional backtrack code location on the top of
771 : // the virtualized backtrack stack and some register changes. When a node is
772 : // to be emitted it can flush the Trace or update it. Flushing the Trace
773 : // will emit code to bring the actual state into line with the virtual state.
774 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
775 : // registers). Postponing work can save time when executing the regular
776 : // expression since it may be found that the work never has to be done as a
777 : // failure to match can occur. In addition it is much faster to jump to a
778 : // known backtrack code location than it is to pop an unknown backtrack
779 : // location from the stack and jump there.
780 : //
781 : // The virtual state found in the Trace affects code generation. For example
782 : // the virtual state contains the difference between the actual current
783 : // position and the virtual current position, and matching code needs to use
784 : // this offset to attempt a match in the correct location of the input
785 : // string. Therefore code generated for a non-trivial trace is specialized
786 : // to that trace. The code generator therefore has the ability to generate
787 : // code for each node several times. In order to limit the size of the
788 : // generated code there is an arbitrary limit on how many specialized sets of
789 : // code may be generated for a given node. If the limit is reached, the
790 : // trace is flushed and a generic version of the code for a node is emitted.
791 : // This is subsequently used for that node. The code emitted for a
792 : // non-generic trace is not recorded in the node and so it cannot currently
793 : // be reused in the event that code generation is requested for an identical trace.
794 :
795 :
// Fallback AppendToText: unconditionally hits UNREACHABLE(), so it must never
// be called. The RegExpAtom, RegExpCharacterClass and RegExpText versions in
// this file are the ones that actually add elements to |text|.
796 0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
797 0 : UNREACHABLE();
798 : }
799 :
800 :
// Appends this atom to |text| as a single TextElement::Atom element; |zone| is
// forwarded to AddElement().
801 86555 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
802 86555 : text->AddElement(TextElement::Atom(this), zone);
803 86555 : }
804 :
805 :
// Appends this character class to |text| as a single TextElement::CharClass
// element; |zone| is forwarded to AddElement().
806 7557 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
807 7557 : text->AddElement(TextElement::CharClass(this), zone);
808 7557 : }
809 :
810 :
// Appends every element of this text to |text|, in index order, one
// AddElement() call per element.
811 0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
812 0 : for (int i = 0; i < elements()->length(); i++)
813 0 : text->AddElement(elements()->at(i), zone);
814 0 : }
815 :
816 :
// Factory: builds a TextElement of type ATOM that refers to |atom|.
817 0 : TextElement TextElement::Atom(RegExpAtom* atom) {
818 0 : return TextElement(ATOM, atom);
819 : }
820 :
821 :
// Factory: builds a TextElement of type CHAR_CLASS that refers to
// |char_class|.
822 0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
823 0 : return TextElement(CHAR_CLASS, char_class);
824 : }
825 :
826 :
// Returns the length of this element: the atom's own length() for ATOM, and
// always 1 for CHAR_CLASS. Any other text_type() value is UNREACHABLE().
827 7411807 : int TextElement::length() const {
828 7411807 : switch (text_type()) {
829 : case ATOM:
830 6578216 : return atom()->length();
831 :
832 : case CHAR_CLASS:
833 : return 1;
834 : }
835 0 : UNREACHABLE();
836 : }
837 :
838 :
// Returns this node's dispatch table, building and caching it in table_ on
// the first call. Note that |ignore_case| is only consulted when the table is
// first built; later calls return the cached table whatever flag they pass.
839 0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
840 0 : if (table_ == nullptr) {
841 0 : table_ = new(zone()) DispatchTable(zone());
842 : DispatchTableConstructor cons(table_, ignore_case, zone());
843 0 : cons.BuildTable(this);
844 : }
845 0 : return table_;
846 : }
847 :
848 :
// Tallies how often each character bucket has been seen so that Frequency()
// can report a bucket's share of all samples. Characters are folded into
// RegExpMacroAssembler::kTableSize buckets by masking with kTableMask.
849 : class FrequencyCollator {
850 : public:
// Starts with zero samples; every bucket gets a zeroed counter tagged with
// its own index.
851 11034273 : FrequencyCollator() : total_samples_(0) {
852 10948736 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
853 10948736 : frequencies_[i] = CharacterFrequency(i);
854 : }
855 : }
856 :
// Records one occurrence of |character| in the bucket selected by
// (character & kTableMask) and bumps the total sample count.
857 : void CountCharacter(int character) {
858 453565 : int index = (character & RegExpMacroAssembler::kTableMask);
859 453565 : frequencies_[index].Increment();
860 453565 : total_samples_++;
861 : }
862 :
863 : // Does not measure in percent, but rather per-128 (the table size from the
864 : // regexp macro assembler). |in_character| must already be a bucket index
// (the DCHECK requires it to be unchanged by kTableMask). The result is
// truncated by integer division. Returns 1 when nothing has been sampled.
// Note the scale factor is the literal 128 rather than kTableSize.
865 : int Frequency(int in_character) {
866 : DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
867 486722 : if (total_samples_ < 1) return 1; // Avoid division by zero.
868 : int freq_in_per128 =
869 486457 : (frequencies_[in_character].counter() * 128) / total_samples_;
870 : return freq_in_per128;
871 : }
872 :
873 : private:
// One bucket: a hit counter plus the bucket index it was created for
// (-1 when default-constructed).
874 : class CharacterFrequency {
875 : public:
876 10948736 : CharacterFrequency() : counter_(0), character_(-1) { }
877 : explicit CharacterFrequency(int character)
878 : : counter_(0), character_(character) { }
879 :
880 453565 : void Increment() { counter_++; }
881 : int counter() { return counter_; }
882 : int character() { return character_; }
883 :
884 : private:
885 : int counter_;
886 : int character_;
887 : };
888 :
889 :
890 : private:
// Per-bucket counters, indexed by (character & kTableMask).
891 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
// Total number of CountCharacter() calls so far.
892 : int total_samples_;
893 : };
894 :
895 :
// Holds the state shared by all nodes while code is generated for one regular
// expression: the register allocator, the work list of nodes still to be
// emitted, the recursion depth, the macro assembler in use and assorted flags.
896 : class RegExpCompiler {
897 : public:
898 : RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
899 : bool is_one_byte);
900 :
// Hands out the next free register index. Once next_register_ has reached
// RegExpMacroAssembler::kMaxRegister the "too big" flag is set and the same
// index is returned on every call (no further increment).
901 : int AllocateRegister() {
902 909433 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
903 310203 : reg_exp_too_big_ = true;
904 : return next_register_;
905 : }
906 599230 : return next_register_++;
907 : }
908 :
909 : // Lookarounds to match lone surrogates for unicode character class matches
910 : // are never nested. We can therefore reuse registers. The register is
// allocated on first use and cached.
911 : int UnicodeLookaroundStackRegister() {
912 2460 : if (unicode_lookaround_stack_register_ == kNoRegister) {
913 1040 : unicode_lookaround_stack_register_ = AllocateRegister();
914 : }
915 2460 : return unicode_lookaround_stack_register_;
916 : }
917 :
// Same lazy allocate-and-cache scheme as above, for the position register.
918 : int UnicodeLookaroundPositionRegister() {
919 2460 : if (unicode_lookaround_position_register_ == kNoRegister) {
920 1040 : unicode_lookaround_position_register_ = AllocateRegister();
921 : }
922 2460 : return unicode_lookaround_position_register_;
923 : }
924 :
925 : RegExpEngine::CompilationResult Assemble(Isolate* isolate,
926 : RegExpMacroAssembler* assembler,
927 : RegExpNode* start, int capture_count,
928 : Handle<String> pattern);
929 :
// Queues |node| for later emission unless it is already queued or its label
// is already bound (i.e. code for it has been emitted). Dereferences
// work_list_, which Assemble() points at its local list while it runs.
930 592179 : inline void AddWork(RegExpNode* node) {
931 592179 : if (!node->on_work_list() && !node->label()->is_bound()) {
932 : node->set_on_work_list(true);
933 211392 : work_list_->push_back(node);
934 : }
935 592179 : }
936 :
// NOTE(review): kImplementationOffset and kNumberOfRegistersOffset are both
// 0; no use of either is visible in this section - confirm they are still
// needed.
937 : static const int kImplementationOffset = 0;
938 : static const int kNumberOfRegistersOffset = 0;
939 : static const int kCodeOffset = 1;
940 :
941 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
942 : EndNode* accept() { return accept_; }
943 :
944 : static const int kMaxRecursion = 100;
945 : inline int recursion_depth() { return recursion_depth_; }
946 996073 : inline void IncrementRecursionDepth() { recursion_depth_++; }
947 996073 : inline void DecrementRecursionDepth() { recursion_depth_--; }
948 :
// Marks the regexp as too big; Assemble() checks the flag after emitting
// all nodes and then aborts code generation.
949 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
950 :
951 : inline bool one_byte() { return one_byte_; }
952 : inline bool optimize() { return optimize_; }
953 84372 : inline void set_optimize(bool value) { optimize_ = value; }
954 : inline bool limiting_recursion() { return limiting_recursion_; }
955 : inline void set_limiting_recursion(bool value) {
956 954610 : limiting_recursion_ = value;
957 : }
958 : bool read_backward() { return read_backward_; }
959 3336 : void set_read_backward(bool value) { read_backward_ = value; }
960 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
961 :
962 : int current_expansion_factor() { return current_expansion_factor_; }
963 : void set_current_expansion_factor(int value) {
964 85120 : current_expansion_factor_ = value;
965 : }
966 :
967 : Isolate* isolate() const { return isolate_; }
968 : Zone* zone() const { return zone_; }
969 :
// Sentinel meaning "no register allocated yet".
970 : static const int kNoRegister = -1;
971 :
972 : private:
973 : EndNode* accept_;
974 : int next_register_;
975 : int unicode_lookaround_stack_register_;
976 : int unicode_lookaround_position_register_;
// Only valid while Assemble() is running (it points at a local of Assemble).
977 : std::vector<RegExpNode*>* work_list_;
978 : int recursion_depth_;
// Assigned by Assemble(); the constructor does not initialize it.
979 : RegExpMacroAssembler* macro_assembler_;
980 : bool one_byte_;
981 : bool reg_exp_too_big_;
982 : bool limiting_recursion_;
983 : bool optimize_;
984 : bool read_backward_;
985 : int current_expansion_factor_;
986 : FrequencyCollator frequency_collator_;
987 : Isolate* isolate_;
988 : Zone* zone_;
989 : };
990 :
991 :
// Scope guard for the compiler's recursion depth: the constructor increments
// the depth of |compiler| and the destructor decrements it again, so the
// count stays balanced on every exit path of the enclosing scope.
992 : class RecursionCheck {
993 : public:
994 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
995 : compiler->IncrementRecursionDepth();
996 : }
997 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
998 : private:
999 : RegExpCompiler* compiler_;
1000 : };
1001 :
1002 :
// Builds the CompilationResult reported when a regexp cannot be compiled
// because it is too big; the result carries the message "RegExp too big".
1003 : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1004 9 : return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1005 : }
1006 :
1007 :
1008 : // Sets up the per-compilation state. The first freely allocatable register
1009 : // is 2 * (capture_count + 1); both unicode lookaround registers start out
// unallocated (kNoRegister). The ACCEPT end node is allocated in |zone|.
// Note that macro_assembler_ is not initialized here; Assemble() assigns it.
1010 85537 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1011 : bool one_byte)
1012 85537 : : next_register_(2 * (capture_count + 1)),
1013 : unicode_lookaround_stack_register_(kNoRegister),
1014 : unicode_lookaround_position_register_(kNoRegister),
1015 : work_list_(nullptr),
1016 : recursion_depth_(0),
1017 : one_byte_(one_byte),
1018 : reg_exp_too_big_(false),
1019 : limiting_recursion_(false),
1020 : optimize_(FLAG_regexp_optimization),
1021 : read_backward_(false),
1022 : current_expansion_factor_(1),
1023 : frequency_collator_(),
1024 : isolate_(isolate),
1025 171074 : zone_(zone) {
1026 85537 : accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
// The highest register index reserved so far must not exceed kMaxRegister.
1027 : DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1028 85537 : }
1029 :
// Drives code generation for the node graph rooted at |start| using
// |macro_assembler| and returns the generated code together with the number
// of registers allocated (next_register_), or the "RegExp too big" result if
// the too-big flag was raised during emission.
//
// Steps: push a backtrack target that leads to Fail(), emit |start| with a
// fresh trace, then keep emitting nodes from the work list until it is empty.
//
// NOTE(review): the too-big early return leaves work_list_ pointing at the
// local |work_list| (it is only reset to nullptr on the success path) and, in
// DEBUG builds with FLAG_trace_regexp_assembler, skips the delete of the
// tracer allocated at the top of this function.
1030 85227 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1031 : Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1032 : int capture_count, Handle<String> pattern) {
1033 : #ifdef DEBUG
// In debug builds the assembler may be wrapped in a tracer; the wrapper is
// heap-allocated here and deleted at the end of this function.
1034 : if (FLAG_trace_regexp_assembler)
1035 : macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1036 : else
1037 : #endif
1038 85227 : macro_assembler_ = macro_assembler;
1039 :
1040 : std::vector<RegExpNode*> work_list;
1041 85227 : work_list_ = &work_list;
// Bottom of the backtrack stack: backtracking past every choice ends at
// |fail|, which is bound just before the Fail() instruction below.
1042 85227 : Label fail;
1043 85227 : macro_assembler_->PushBacktrack(&fail);
1044 85227 : Trace new_trace;
1045 85227 : start->Emit(this, &new_trace);
1046 85227 : macro_assembler_->Bind(&fail);
1047 85227 : macro_assembler_->Fail();
// Emit the nodes that were deferred via AddWork(). A node may have been
// emitted in the meantime, in which case its label is bound and it is
// skipped. Emitting a node can push further nodes onto the list.
1048 381846 : while (!work_list.empty()) {
1049 211392 : RegExpNode* node = work_list.back();
1050 : work_list.pop_back();
1051 : node->set_on_work_list(false);
1052 211392 : if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1053 : }
1054 85227 : if (reg_exp_too_big_) {
1055 0 : macro_assembler_->AbortedCodeGeneration();
1056 0 : return IrregexpRegExpTooBig(isolate_);
1057 : }
1058 :
1059 85227 : Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1060 170454 : isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1061 85227 : work_list_ = nullptr;
1062 : #ifdef ENABLE_DISASSEMBLER
1063 : if (FLAG_print_code && !FLAG_regexp_interpret_all) {
1064 : CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1065 : OFStream os(trace_scope.file());
1066 : Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1067 : }
1068 : #endif
1069 : #ifdef DEBUG
1070 : if (FLAG_trace_regexp_assembler) {
1071 : delete macro_assembler_;
1072 : }
1073 : #endif
1074 85227 : return RegExpEngine::CompilationResult(*code, next_register_);
1075 : }
1076 :
1077 :
// Returns true if this deferred action affects register |that|. A
// CLEAR_CAPTURES action covers a whole register interval, so it is tested
// with range().Contains(); every other action type names a single register.
1078 4874172 : bool Trace::DeferredAction::Mentions(int that) {
1079 2460691 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
1080 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1081 : return range.Contains(that);
1082 : } else {
1083 2413481 : return reg() == that;
1084 : }
1085 : }
1086 :
1087 :
// Returns true if any deferred action recorded in this trace mentions
// register |reg| (linear walk over the actions_ list).
1088 0 : bool Trace::mentions_reg(int reg) {
1089 0 : for (DeferredAction* action = actions_; action != nullptr;
1090 : action = action->next()) {
1091 0 : if (action->Mentions(reg))
1092 : return true;
1093 : }
1094 : return false;
1095 : }
1096 :
1097 :
// Looks at the first action in the actions_ list that mentions |reg|. If
// that action is a STORE_POSITION, writes its cp_offset to |*cp_offset| and
// returns true. Returns false - leaving |*cp_offset| untouched - if that
// first mention is any other action type or if no action mentions |reg|.
// The caller must pass |*cp_offset| initialized to 0 (checked by DCHECK).
1098 909 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1099 : DCHECK_EQ(0, *cp_offset);
1100 1830 : for (DeferredAction* action = actions_; action != nullptr;
1101 : action = action->next()) {
1102 921 : if (action->Mentions(reg)) {
1103 410 : if (action->action_type() == ActionNode::STORE_POSITION) {
1104 410 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1105 410 : return true;
1106 : } else {
1107 : return false;
1108 : }
1109 : }
1110 : }
1111 : return false;
1112 : }
1113 :
1114 :
// Adds every register touched by a deferred action in this trace to
// |affected_registers| and returns the highest such register index, or
// RegExpCompiler::kNoRegister if the trace has no actions. CLEAR_CAPTURES
// contributes every register in its inclusive [from, to] range.
1115 509763 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
1116 : Zone* zone) {
1117 : int max_register = RegExpCompiler::kNoRegister;
1118 1756943 : for (DeferredAction* action = actions_; action != nullptr;
1119 : action = action->next()) {
1120 417646 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1121 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1122 47765 : for (int i = range.from(); i <= range.to(); i++)
1123 44886 : affected_registers->Set(i, zone);
1124 2879 : if (range.to() > max_register) max_register = range.to();
1125 : } else {
1126 414767 : affected_registers->Set(action->reg(), zone);
1127 414767 : if (action->reg() > max_register) max_register = action->reg();
1128 : }
1129 : }
1130 509763 : return max_register;
1131 : }
1132 :
1133 :
// Emits the undo code for the registers handled by PerformDeferredActions().
// Registers are visited from |max_register| down to 0, i.e. in the reverse of
// the ascending order in which PerformDeferredActions() pushed them, so the
// pops match the pushes. A run of adjacent registers in |registers_to_clear|
// is folded into a single ClearRegisters(low, high) call.
1134 509763 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1135 : int max_register,
1136 : const OutSet& registers_to_pop,
1137 : const OutSet& registers_to_clear) {
1138 10462520 : for (int reg = max_register; reg >= 0; reg--) {
1139 9952757 : if (registers_to_pop.Get(reg)) {
1140 52596 : assembler->PopRegister(reg);
1141 9900161 : } else if (registers_to_clear.Get(reg)) {
// Extend the run downwards; |reg| ends up at its lowest member, so the
// outer loop continues below the whole run.
1142 : int clear_to = reg;
1143 182945 : while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1144 105435 : reg--;
1145 : }
1146 77510 : assembler->ClearRegisters(reg, clear_to);
1147 : }
1148 : }
1149 509763 : }
1150 :
1151 :
// Emits code that applies the net effect of all deferred actions to each
// register in |affected_registers| (indices 0..max_register, ascending), and
// records how each register must be undone on backtrack: registers that were
// pushed are added to |registers_to_pop|, registers that only need clearing
// are added to |registers_to_clear|. A stack-limit check is requested on
// every push_limit-th push.
1152 509763 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1153 : int max_register,
1154 : const OutSet& affected_registers,
1155 : OutSet* registers_to_pop,
1156 : OutSet* registers_to_clear,
1157 : Zone* zone) {
1158 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1159 509763 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1160 :
1161 : // Count pushes performed to force a stack limit check occasionally.
1162 : int pushes = 0;
1163 :
1164 10567955 : for (int reg = 0; reg <= max_register; reg++) {
1165 10058192 : if (!affected_registers.Get(reg)) {
1166 : continue;
1167 : }
1168 :
1169 : // The chronologically first deferred action in the trace
1170 : // is used to infer the action needed to restore a register
1171 : // to its previous state (or not, if it's safe to ignore it).
// Because the list is walked newest-first, undo_action is overwritten on
// every match and ends up holding the value chosen for the oldest action.
1172 : enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1173 : DeferredActionUndoType undo_action = IGNORE;
1174 :
// Net effect accumulated for this register: either an absolute/relative
// |value|, a pending |clear|, or a |store_position| (kNoStore if none).
1175 : int value = 0;
1176 : bool absolute = false;
1177 : bool clear = false;
1178 : static const int kNoStore = kMinInt;
1179 : int store_position = kNoStore;
1180 : // This is a little tricky because we are scanning the actions in reverse
1181 : // historical order (newest first).
1182 2912985 : for (DeferredAction* action = actions_; action != nullptr;
1183 : action = action->next()) {
1184 2459770 : if (action->Mentions(reg)) {
1185 459653 : switch (action->action_type()) {
1186 : case ActionNode::SET_REGISTER: {
1187 3430 : Trace::DeferredSetRegister* psr =
1188 : static_cast<Trace::DeferredSetRegister*>(action);
// Only the newest SET counts; increments seen so far (newer than this
// SET) are already in |value| and are added on top of it.
1189 3430 : if (!absolute) {
1190 3430 : value += psr->value();
1191 : absolute = true;
1192 : }
1193 : // SET_REGISTER is currently only used for newly introduced loop
1194 : // counters. They can have a significant previous value if they
1195 : // occur in a loop. TODO(lrn): Propagate this information, so
1196 : // we can set undo_action to IGNORE if we know there is no value to
1197 : // restore.
1198 : undo_action = RESTORE;
1199 : DCHECK_EQ(store_position, kNoStore);
1200 : DCHECK(!clear);
1201 : break;
1202 : }
1203 : case ActionNode::INCREMENT_REGISTER:
// Increments older than an already-seen SET are overridden by it.
1204 3709 : if (!absolute) {
1205 3709 : value++;
1206 : }
1207 : DCHECK_EQ(store_position, kNoStore);
1208 : DCHECK(!clear);
1209 : undo_action = RESTORE;
1210 : break;
1211 : case ActionNode::STORE_POSITION: {
1212 597582 : Trace::DeferredCapture* pc =
1213 : static_cast<Trace::DeferredCapture*>(action);
// Keep only the newest store, and only if no newer clear was seen.
1214 407628 : if (!clear && store_position == kNoStore) {
1215 : store_position = pc->cp_offset();
1216 : }
1217 :
1218 : // For captures we know that stores and clears alternate.
1219 : // Other registers are never cleared, and if they occur
1220 : // inside a loop, they might be assigned more than once.
1221 407628 : if (reg <= 1) {
1222 : // Registers zero and one, aka "capture zero", is
1223 : // always set correctly if we succeed. There is no
1224 : // need to undo a setting on backtrack, because we
1225 : // will set it again or fail.
1226 : undo_action = IGNORE;
1227 : } else {
1228 189954 : undo_action = pc->is_capture() ? CLEAR : RESTORE;
1229 : }
1230 : DCHECK(!absolute);
1231 : DCHECK_EQ(value, 0);
1232 : break;
1233 : }
1234 : case ActionNode::CLEAR_CAPTURES: {
1235 : // Since we're scanning in reverse order, if we've already
1236 : // set the position we have to ignore historically earlier
1237 : // clearing operations.
1238 44886 : if (store_position == kNoStore) {
1239 : clear = true;
1240 : }
1241 : undo_action = RESTORE;
1242 : DCHECK(!absolute);
1243 : DCHECK_EQ(value, 0);
1244 : break;
1245 : }
1246 : default:
1247 0 : UNREACHABLE();
1248 : break;
1249 : }
1250 : }
1251 : }
1252 : // Prepare for the undo-action (e.g., push if it's going to be popped).
1253 453215 : if (undo_action == RESTORE) {
1254 52596 : pushes++;
1255 : RegExpMacroAssembler::StackCheckFlag stack_check =
1256 : RegExpMacroAssembler::kNoStackLimitCheck;
1257 52596 : if (pushes == push_limit) {
1258 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
1259 : pushes = 0;
1260 : }
1261 :
1262 52596 : assembler->PushRegister(reg, stack_check);
1263 52596 : registers_to_pop->Set(reg, zone);
1264 400619 : } else if (undo_action == CLEAR) {
1265 182945 : registers_to_clear->Set(reg, zone);
1266 : }
1267 : // Perform the chronologically last action (or accumulated increment)
1268 : // for the register.
1269 453215 : if (store_position != kNoStore) {
1270 407628 : assembler->WriteCurrentPositionToRegister(reg, store_position);
1271 45587 : } else if (clear) {
1272 38448 : assembler->ClearRegisters(reg, reg);
1273 7139 : } else if (absolute) {
1274 3430 : assembler->SetRegister(reg, value);
1275 3709 : } else if (value != 0) {
1276 3709 : assembler->AdvanceRegister(reg, value);
1277 : }
1278 : }
1279 509763 : }
1280 :
1281 :
1282 : // This is called as we come into a loop choice node and some other tricky
1283 : // nodes. It normalizes the state of the code generator to ensure we can
1284 : // generate generic code.
//
// Emits code that brings the real machine state in line with this (non-
// trivial) trace and then continues with |successor| under a fresh, trivial
// trace. If register actions or a backtrack location are pending, an undo
// sequence is emitted as well and installed as the backtrack target, so that
// a later failure restores the registers and the current position before
// control goes on to the trace's own backtrack location.
1285 3595576 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1286 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1287 :
1288 : DCHECK(!is_trivial());
1289 :
1290 1157478 : if (actions_ == nullptr && backtrack() == nullptr) {
1291 : // Here we just have some deferred cp advances to fix and we are back to
1292 : // a normal situation. We may also have to forget some information gained
1293 : // through a quick check that was already performed.
1294 189615 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1295 : // Create a new trivial state and generate the node with that.
1296 189615 : Trace new_state;
1297 189615 : successor->Emit(compiler, &new_state);
1298 699378 : return;
1299 : }
1300 :
1301 : // Generate deferred actions here along with code to undo them again.
1302 : OutSet affected_registers;
1303 :
1304 509763 : if (backtrack() != nullptr) {
1305 : // Here we have a concrete backtrack location. These are set up by choice
1306 : // nodes and so they indicate that we have a deferred save of the current
1307 : // position which we may need to emit here.
1308 399046 : assembler->PushCurrentPosition();
1309 : }
1310 :
1311 : int max_register = FindAffectedRegisters(&affected_registers,
1312 509763 : compiler->zone());
// Filled in by PerformDeferredActions() and consumed by
// RestoreAffectedRegisters() in the undo sequence below.
1313 : OutSet registers_to_pop;
1314 : OutSet registers_to_clear;
1315 : PerformDeferredActions(assembler,
1316 : max_register,
1317 : affected_registers,
1318 : &registers_to_pop,
1319 : &registers_to_clear,
1320 509763 : compiler->zone());
1321 509763 : if (cp_offset_ != 0) {
1322 293179 : assembler->AdvanceCurrentPosition(cp_offset_);
1323 : }
1324 :
1325 : // Create a new trivial state and generate the node with that.
1326 509763 : Label undo;
1327 509763 : assembler->PushBacktrack(&undo);
// Either emit the successor inline, or defer it to the work list and jump
// to its (not yet bound) label.
1328 509763 : if (successor->KeepRecursing(compiler)) {
1329 137417 : Trace new_state;
1330 137417 : successor->Emit(compiler, &new_state);
1331 : } else {
1332 372346 : compiler->AddWork(successor);
1333 372346 : assembler->GoTo(successor->label());
1334 : }
1335 :
1336 : // On backtrack we need to restore state.
1337 509763 : assembler->Bind(&undo);
1338 : RestoreAffectedRegisters(assembler,
1339 : max_register,
1340 : registers_to_pop,
1341 509763 : registers_to_clear);
// Without a known backtrack label, pop the next location off the backtrack
// stack; otherwise undo the position push from above and jump directly.
1342 509763 : if (backtrack() == nullptr) {
1343 110717 : assembler->Backtrack();
1344 : } else {
1345 399046 : assembler->PopCurrentPosition();
1346 798092 : assembler->GoTo(backtrack());
1347 : }
1348 : }
1349 :
1350 :
// Emits the code run when the body of a negative submatch has matched:
// restore the current position and the backtrack stack pointer from the
// registers they were saved in, clear any captures set by the body, and
// backtrack. |trace| is deliberately ignored (see the comments below).
1351 2843 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1352 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1353 :
1354 : // Omit flushing the trace. We discard the entire stack frame anyway.
1355 :
1356 2843 : if (!label()->is_bound()) {
1357 : // We are completely independent of the trace, since we ignore it,
1358 : // so this code can be used as the generic version.
1359 2802 : assembler->Bind(label());
1360 : }
1361 :
1362 : // Throw away everything on the backtrack stack since the start
1363 : // of the negative submatch and restore the character position.
1364 2843 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
1365 2843 : assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1366 2843 : if (clear_capture_count_ > 0) {
1367 : // Clear any captures that might have been performed during the success
1368 : // of the body of the negative look-ahead.
// The range passed to ClearRegisters is inclusive, hence the "- 1".
1369 107 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1370 107 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1371 : }
1372 : // Now that we have unwound the stack we find at the top of the stack the
1373 : // backtrack that the BeginSubmatch node got.
1374 2843 : assembler->Backtrack();
1375 2843 : }
1376 :
1377 :
// Emits the code for an end node. A non-trivial trace is flushed first;
// Trace::Flush() then re-enters this node with a trivial trace. With a
// trivial trace the node's label is bound (once) and the action is emitted:
// Succeed() for ACCEPT, a jump to the trace's backtrack label for BACKTRACK.
// NEGATIVE_SUBMATCH_SUCCESS nodes must not reach this method.
1378 274053 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1379 182402 : if (!trace->is_trivial()) {
1380 91051 : trace->Flush(compiler, this);
1381 91051 : return;
1382 : }
1383 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1384 91351 : if (!label()->is_bound()) {
1385 85216 : assembler->Bind(label());
1386 : }
1387 91351 : switch (action_) {
1388 : case ACCEPT:
1389 91051 : assembler->Succeed();
1390 91051 : return;
1391 : case BACKTRACK:
1392 600 : assembler->GoTo(trace->backtrack());
1393 300 : return;
1394 : case NEGATIVE_SUBMATCH_SUCCESS:
1395 : // This case is handled in a different virtual method.
1396 0 : UNREACHABLE();
1397 : }
1398 0 : UNIMPLEMENTED();
1399 : }
1400 :
1401 :
// Appends |guard| to this alternative's guard list. The list is created in
// |zone| on first use, with an initial capacity of 1.
1402 903901 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1403 1807802 : if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1404 903901 : guards_->Add(guard, zone);
1405 903901 : }
1406 :
1407 :
// Factory: creates a SET_REGISTER action node that records register |reg|
// and value |val| and continues with |on_success|. The node is allocated in
// the zone of |on_success|.
1408 903324 : ActionNode* ActionNode::SetRegister(int reg,
1409 : int val,
1410 903324 : RegExpNode* on_success) {
1411 : ActionNode* result =
1412 : new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1413 903324 : result->data_.u_store_register.reg = reg;
1414 903324 : result->data_.u_store_register.value = val;
1415 903324 : return result;
1416 : }
1417 :
1418 :
// Factory: creates an INCREMENT_REGISTER action node for register |reg| that
// continues with |on_success|. The node is allocated in the zone of
// |on_success|.
1419 903324 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1420 : ActionNode* result =
1421 : new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1422 903324 : result->data_.u_increment_register.reg = reg;
1423 903324 : return result;
1424 : }
1425 :
1426 :
// Factory: creates a STORE_POSITION action node for register |reg| that
// continues with |on_success|. |is_capture| is recorded in the node; when
// deferred actions are flushed it selects whether the register is cleared or
// restored on backtrack. The node is allocated in the zone of |on_success|.
1427 225603 : ActionNode* ActionNode::StorePosition(int reg,
1428 : bool is_capture,
1429 225603 : RegExpNode* on_success) {
1430 : ActionNode* result =
1431 : new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1432 225603 : result->data_.u_position_register.reg = reg;
1433 225603 : result->data_.u_position_register.is_capture = is_capture;
1434 225603 : return result;
1435 : }
1436 :
1437 :
// Factory: creates a CLEAR_CAPTURES action node covering the registers
// range.from()..range.to() that continues with |on_success|. The node is
// allocated in the zone of |on_success|.
1438 2304 : ActionNode* ActionNode::ClearCaptures(Interval range,
1439 2304 : RegExpNode* on_success) {
1440 : ActionNode* result =
1441 : new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1442 2304 : result->data_.u_clear_captures.range_from = range.from();
1443 2304 : result->data_.u_clear_captures.range_to = range.to();
1444 2304 : return result;
1445 : }
1446 :
1447 :
// Factory: creates a BEGIN_SUBMATCH action node that records the registers
// used for the stack pointer (|stack_reg|) and the current position
// (|position_reg|) and continues with |on_success|. The node is allocated in
// the zone of |on_success|.
1448 4462 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1449 : int position_reg,
1450 4462 : RegExpNode* on_success) {
1451 : ActionNode* result =
1452 : new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1453 4462 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1454 4462 : result->data_.u_submatch.current_position_register = position_reg;
1455 4462 : return result;
1456 : }
1457 :
1458 :
// Factory: creates a POSITIVE_SUBMATCH_SUCCESS action node. It records the
// stack-pointer and current-position registers plus a register range to
// clear, given as a start index (|clear_register_from|) and a count
// (|clear_register_count|), and continues with |on_success|. The node is
// allocated in the zone of |on_success|.
1459 1650 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1460 : int position_reg,
1461 : int clear_register_count,
1462 : int clear_register_from,
1463 1650 : RegExpNode* on_success) {
1464 : ActionNode* result =
1465 : new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1466 1650 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1467 1650 : result->data_.u_submatch.current_position_register = position_reg;
1468 1650 : result->data_.u_submatch.clear_register_count = clear_register_count;
1469 1650 : result->data_.u_submatch.clear_register_from = clear_register_from;
1470 1650 : return result;
1471 : }
1472 :
1473 :
// Factory: creates an EMPTY_MATCH_CHECK action node that records
// |start_register|, |repetition_register| and |repetition_limit| and
// continues with |on_success|. The node is allocated in the zone of
// |on_success|.
1474 505 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1475 : int repetition_register,
1476 : int repetition_limit,
1477 505 : RegExpNode* on_success) {
1478 : ActionNode* result =
1479 : new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1480 505 : result->data_.u_empty_match_check.start_register = start_register;
1481 505 : result->data_.u_empty_match_check.repetition_register = repetition_register;
1482 505 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1483 505 : return result;
1484 : }
1485 :
1486 :
// Generates <Type>Node::Accept for every node type listed by
// FOR_EACH_NODE_TYPE: each one dispatches to the matching
// NodeVisitor::Visit<Type>. The helper macro is undefined again right after.
1487 : #define DEFINE_ACCEPT(Type) \
1488 : void Type##Node::Accept(NodeVisitor* visitor) { \
1489 : visitor->Visit##Type(this); \
1490 : }
1491 727360 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1492 : #undef DEFINE_ACCEPT
1493 :
1494 :
// Visitor dispatch for loop choice nodes, written out by hand; presumably
// LoopChoice is not part of FOR_EACH_NODE_TYPE - confirm against the header.
1495 149820 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1496 149820 : visitor->VisitLoopChoice(this);
1497 149820 : }
1498 :
1499 :
1500 : // -------------------------------------------------------------------
1501 : // Emit code.
1502 :
1503 :
// Emits the register test for |guard|. The emitted branch is the negation of
// the guard: for a "reg < value" guard (Guard::LT) it emits IfRegisterGE, for
// a "reg >= value" guard (Guard::GEQ) it emits IfRegisterLT, in both cases
// with the trace's backtrack label as the target. The guarded register must
// not have deferred actions pending in |trace| (checked by DCHECK).
1504 3901 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1505 7802 : Guard* guard,
1506 3901 : Trace* trace) {
1507 3901 : switch (guard->op()) {
1508 : case Guard::LT:
1509 : DCHECK(!trace->mentions_reg(guard->reg()));
1510 : macro_assembler->IfRegisterGE(guard->reg(),
1511 : guard->value(),
1512 5160 : trace->backtrack());
1513 2580 : break;
1514 : case Guard::GEQ:
1515 : DCHECK(!trace->mentions_reg(guard->reg()));
1516 : macro_assembler->IfRegisterLT(guard->reg(),
1517 : guard->value(),
1518 2642 : trace->backtrack());
1519 1321 : break;
1520 : }
1521 3901 : }
1522 :
1523 :
1524 : // Returns the number of characters in the equivalence class, omitting those
1525 : // that cannot occur in the source string because it is Latin1.
// The class members are written to |letters|; if the uncanonicalize lookup
// yields nothing, |character| itself is stored as the only member. For
// one-byte subjects the entries above String::kMaxOneByteCharCode are
// removed in place, which can leave the result at 0.
1526 21870 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1527 : bool one_byte_subject,
1528 : unibrow::uchar* letters) {
1529 : int length =
1530 21870 : isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1531 : // Unibrow returns 0 or 1 for characters where case independence is
1532 : // trivial.
1533 21870 : if (length == 0) {
1534 2773 : letters[0] = character;
1535 : length = 1;
1536 : }
1537 :
1538 21870 : if (one_byte_subject) {
// Compact |letters| in place, keeping only one-byte characters.
1539 : int new_length = 0;
1540 31770 : for (int i = 0; i < length; i++) {
1541 31770 : if (letters[i] <= String::kMaxOneByteCharCode) {
1542 31360 : letters[new_length++] = letters[i];
1543 : }
1544 : }
1545 : length = new_length;
1546 : }
1547 :
1548 21870 : return length;
1549 : }
1550 :
1551 :
// Emits a case-sensitive test for character |c|: unless the character is
// already |preloaded|, it is first loaded from |cp_offset| (with |check| and
// |on_failure| passed on to LoadCurrentCharacter); then CheckNotCharacter
// branches to |on_failure| on mismatch. Returns true iff a load was emitted.
// |isolate| is unused here.
// NOTE(review): the return value is true whenever a load is emitted, even if
// |check| is false, whereas EmitAtomNonLetter returns |check| in the same
// situation - confirm which is intended.
1552 574876 : static inline bool EmitSimpleCharacter(Isolate* isolate,
1553 574876 : RegExpCompiler* compiler,
1554 : uc16 c,
1555 : Label* on_failure,
1556 : int cp_offset,
1557 : bool check,
1558 : bool preloaded) {
1559 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1560 : bool bound_checked = false;
1561 574876 : if (!preloaded) {
1562 : assembler->LoadCurrentCharacter(
1563 : cp_offset,
1564 : on_failure,
1565 574876 : check);
1566 : bound_checked = true;
1567 : }
1568 574876 : assembler->CheckNotCharacter(c, on_failure);
1569 574876 : return bound_checked;
1570 : }
1571 :
1572 :
1573 : // Only emits non-letters (things that don't have case). Only used for case
1574 : // independent matches.
// Code is emitted only when the case-equivalence class of |c| has exactly one
// member: the character is loaded (unless |preloaded|) and compared with
// CheckNotCharacter. Returns |check| if a load was emitted by this call and
// false in every other case, including when nothing is emitted at all.
1575 5503 : static inline bool EmitAtomNonLetter(Isolate* isolate,
1576 5503 : RegExpCompiler* compiler,
1577 : uc16 c,
1578 : Label* on_failure,
1579 : int cp_offset,
1580 : bool check,
1581 : bool preloaded) {
1582 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1583 : bool one_byte = compiler->one_byte();
1584 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1585 5503 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1586 5503 : if (length < 1) {
1587 : // This can't match. Must be a one-byte subject and a non-one-byte
1588 : // character. We do not need to do anything since the one-byte pass
1589 : // already handled this.
1590 : return false; // Bounds not checked.
1591 : }
1592 : bool checked = false;
1593 : // We handle the length > 1 case in a later pass.
1594 5498 : if (length == 1) {
1595 371 : if (one_byte && c > String::kMaxOneByteCharCodeU) {
1596 : // Can't match - see above.
1597 : return false; // Bounds not checked.
1598 : }
1599 371 : if (!preloaded) {
1600 371 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1601 : checked = check;
1602 : }
1603 371 : macro_assembler->CheckNotCharacter(c, on_failure);
1604 : }
1605 5498 : return checked;
1606 : }
1607 :
1608 :
1609 4785 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1610 : bool one_byte, uc16 c1, uc16 c2,
1611 : Label* on_failure) {
1612 : uc16 char_mask;
1613 4785 : if (one_byte) {
1614 : char_mask = String::kMaxOneByteCharCode;
1615 : } else {
1616 : char_mask = String::kMaxUtf16CodeUnit;
1617 : }
1618 4785 : uc16 exor = c1 ^ c2;
1619 : // Check whether exor has only one bit set.
1620 4785 : if (((exor - 1) & exor) == 0) {
1621 : // If c1 and c2 differ only by one bit.
1622 : // Ecma262UnCanonicalize always gives the highest number last.
1623 : DCHECK(c2 > c1);
1624 4690 : uc16 mask = char_mask ^ exor;
1625 4690 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1626 4690 : return true;
1627 : }
1628 : DCHECK(c2 > c1);
1629 95 : uc16 diff = c2 - c1;
1630 95 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
1631 : // If the characters differ by 2^n but don't differ by one bit then
1632 : // subtract the difference from the found character, then do the or
1633 : // trick. We avoid the theoretical case where negative numbers are
1634 : // involved in order to simplify code generation.
1635 85 : uc16 mask = char_mask ^ diff;
1636 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1637 : diff,
1638 : mask,
1639 85 : on_failure);
1640 85 : return true;
1641 : }
1642 : return false;
1643 : }
1644 :
1645 :
1646 : typedef bool EmitCharacterFunction(Isolate* isolate,
1647 : RegExpCompiler* compiler,
1648 : uc16 c,
1649 : Label* on_failure,
1650 : int cp_offset,
1651 : bool check,
1652 : bool preloaded);
1653 :
1654 : // Only emits letters (things that have case). Only used for case independent
1655 : // matches.
1656 5503 : static inline bool EmitAtomLetter(Isolate* isolate,
1657 5503 : RegExpCompiler* compiler,
1658 : uc16 c,
1659 : Label* on_failure,
1660 : int cp_offset,
1661 : bool check,
1662 : bool preloaded) {
1663 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1664 : bool one_byte = compiler->one_byte();
1665 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1666 5503 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1667 5503 : if (length <= 1) return false;
1668 : // We may not need to check against the end of the input string
1669 : // if this character lies before a character that matched.
1670 5127 : if (!preloaded) {
1671 4792 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1672 : }
1673 5127 : Label ok;
1674 : DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1675 5127 : switch (length) {
1676 : case 2: {
1677 9570 : if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1678 9570 : chars[1], on_failure)) {
1679 : } else {
1680 10 : macro_assembler->CheckCharacter(chars[0], &ok);
1681 10 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
1682 10 : macro_assembler->Bind(&ok);
1683 : }
1684 : break;
1685 : }
1686 : case 4:
1687 25 : macro_assembler->CheckCharacter(chars[3], &ok);
1688 : V8_FALLTHROUGH;
1689 : case 3:
1690 342 : macro_assembler->CheckCharacter(chars[0], &ok);
1691 342 : macro_assembler->CheckCharacter(chars[1], &ok);
1692 342 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
1693 342 : macro_assembler->Bind(&ok);
1694 342 : break;
1695 : default:
1696 0 : UNREACHABLE();
1697 : break;
1698 : }
1699 : return true;
1700 : }
1701 :
1702 :
1703 8607 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1704 : int border,
1705 : Label* fall_through,
1706 : Label* above_or_equal,
1707 : Label* below) {
1708 8607 : if (below != fall_through) {
1709 8243 : masm->CheckCharacterLT(border, below);
1710 8243 : if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1711 : } else {
1712 364 : masm->CheckCharacterGT(border - 1, above_or_equal);
1713 : }
1714 8607 : }
1715 :
1716 :
1717 158857 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1718 : int first,
1719 : int last,
1720 : Label* fall_through,
1721 : Label* in_range,
1722 : Label* out_of_range) {
1723 158857 : if (in_range == fall_through) {
1724 107322 : if (first == last) {
1725 14494 : masm->CheckNotCharacter(first, out_of_range);
1726 : } else {
1727 92828 : masm->CheckCharacterNotInRange(first, last, out_of_range);
1728 : }
1729 : } else {
1730 51535 : if (first == last) {
1731 28361 : masm->CheckCharacter(first, in_range);
1732 : } else {
1733 23174 : masm->CheckCharacterInRange(first, last, in_range);
1734 : }
1735 51535 : if (out_of_range != fall_through) masm->GoTo(out_of_range);
1736 : }
1737 158857 : }
1738 :
1739 :
1740 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1741 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1742 5861 : static void EmitUseLookupTable(
1743 5861 : RegExpMacroAssembler* masm,
1744 : ZoneList<int>* ranges,
1745 : int start_index,
1746 : int end_index,
1747 : int min_char,
1748 : Label* fall_through,
1749 : Label* even_label,
1750 : Label* odd_label) {
1751 : static const int kSize = RegExpMacroAssembler::kTableSize;
1752 : static const int kMask = RegExpMacroAssembler::kTableMask;
1753 :
1754 : int base = (min_char & ~kMask);
1755 : USE(base);
1756 :
1757 : // Assert that everything is on one kTableSize page.
1758 : for (int i = start_index; i <= end_index; i++) {
1759 : DCHECK_EQ(ranges->at(i) & ~kMask, base);
1760 : }
1761 : DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1762 :
1763 : char templ[kSize];
1764 : Label* on_bit_set;
1765 : Label* on_bit_clear;
1766 : int bit;
1767 5861 : if (even_label == fall_through) {
1768 : on_bit_set = odd_label;
1769 : on_bit_clear = even_label;
1770 : bit = 1;
1771 : } else {
1772 : on_bit_set = even_label;
1773 : on_bit_clear = odd_label;
1774 : bit = 0;
1775 : }
1776 252413 : for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1777 123276 : templ[i] = bit;
1778 : }
1779 : int j = 0;
1780 5861 : bit ^= 1;
1781 95816 : for (int i = start_index; i < end_index; i++) {
1782 1204670 : for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1783 512380 : templ[j] = bit;
1784 : }
1785 89955 : bit ^= 1;
1786 : }
1787 114552 : for (int i = j; i < kSize; i++) {
1788 114552 : templ[i] = bit;
1789 : }
1790 : Factory* factory = masm->isolate()->factory();
1791 : // TODO(erikcorry): Cache these.
1792 5861 : Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1793 750208 : for (int i = 0; i < kSize; i++) {
1794 750208 : ba->set(i, templ[i]);
1795 : }
1796 5861 : masm->CheckBitInTable(ba, on_bit_set);
1797 5861 : if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1798 5861 : }
1799 :
1800 :
1801 35651 : static void CutOutRange(RegExpMacroAssembler* masm,
1802 : ZoneList<int>* ranges,
1803 : int start_index,
1804 : int end_index,
1805 : int cut_index,
1806 : Label* even_label,
1807 : Label* odd_label) {
1808 35651 : bool odd = (((cut_index - start_index) & 1) == 1);
1809 35651 : Label* in_range_label = odd ? odd_label : even_label;
1810 35651 : Label dummy;
1811 : EmitDoubleBoundaryTest(masm,
1812 : ranges->at(cut_index),
1813 35651 : ranges->at(cut_index + 1) - 1,
1814 : &dummy,
1815 : in_range_label,
1816 71302 : &dummy);
1817 : DCHECK(!dummy.is_linked());
1818 : // Cut out the single range by rewriting the array. This creates a new
1819 : // range that is a merger of the two ranges on either side of the one we
1820 : // are cutting out. The oddity of the labels is preserved.
1821 90888 : for (int j = cut_index; j > start_index; j--) {
1822 39172 : ranges->at(j) = ranges->at(j - 1);
1823 : }
1824 108156 : for (int j = cut_index + 1; j < end_index; j++) {
1825 145010 : ranges->at(j) = ranges->at(j + 1);
1826 : }
1827 35651 : }
1828 :
1829 :
1830 : // Unicode case. Split the search space into kSize spaces that are handled
1831 : // with recursion.
1832 19767 : static void SplitSearchSpace(ZoneList<int>* ranges,
1833 : int start_index,
1834 : int end_index,
1835 : int* new_start_index,
1836 : int* new_end_index,
1837 : int* border) {
1838 : static const int kSize = RegExpMacroAssembler::kTableSize;
1839 : static const int kMask = RegExpMacroAssembler::kTableMask;
1840 :
1841 19767 : int first = ranges->at(start_index);
1842 19767 : int last = ranges->at(end_index) - 1;
1843 :
1844 19767 : *new_start_index = start_index;
1845 19767 : *border = (ranges->at(start_index) & ~kMask) + kSize;
1846 168854 : while (*new_start_index < end_index) {
1847 147860 : if (ranges->at(*new_start_index) > *border) break;
1848 129320 : (*new_start_index)++;
1849 : }
1850 : // new_start_index is the index of the first edge that is beyond the
1851 : // current kSize space.
1852 :
1853 : // For very large search spaces we do a binary chop search of the non-Latin1
1854 : // space instead of just going to the end of the current kSize space. The
1855 : // heuristics are complicated a little by the fact that any 128-character
1856 : // encoding space can be quickly tested with a table lookup, so we don't
1857 : // wish to do binary chop search at a smaller granularity than that. A
1858 : // 128-character space can take up a lot of space in the ranges array if,
1859 : // for example, we only want to match every second character (eg. the lower
1860 : // case characters on some Unicode pages).
1861 19767 : int binary_chop_index = (end_index + start_index) / 2;
1862 : // The first test ensures that we get to the code that handles the Latin1
1863 : // range with a single not-taken branch, speeding up this important
1864 : // character range (even non-Latin1 charset-based text has spaces and
1865 : // punctuation).
1866 54426 : if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1867 27629 : end_index - start_index > (*new_start_index - start_index) * 2 &&
1868 56007 : last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1869 23248 : ranges->at(binary_chop_index) >= first + 2 * kSize) {
1870 : int scan_forward_for_section_border = binary_chop_index;;
1871 9782 : int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1872 :
1873 76686 : while (scan_forward_for_section_border < end_index) {
1874 65012 : if (ranges->at(scan_forward_for_section_border) > new_border) {
1875 7890 : *new_start_index = scan_forward_for_section_border;
1876 7890 : *border = new_border;
1877 7890 : break;
1878 : }
1879 57122 : scan_forward_for_section_border++;
1880 : }
1881 : }
1882 :
1883 : DCHECK(*new_start_index > start_index);
1884 19767 : *new_end_index = *new_start_index - 1;
1885 19767 : if (ranges->at(*new_end_index) == *border) {
1886 2958 : (*new_end_index)--;
1887 : }
1888 39534 : if (*border >= ranges->at(end_index)) {
1889 1225 : *border = ranges->at(end_index);
1890 1225 : *new_start_index = end_index; // Won't be used.
1891 1225 : *new_end_index = end_index - 1;
1892 : }
1893 19767 : }
1894 :
1895 : // Gets a series of segment boundaries representing a character class. If the
1896 : // character is in the range between an even and an odd boundary (counting from
1897 : // start_index) then go to even_label, otherwise go to odd_label. We already
1898 : // know that the character is in the range of min_char to max_char inclusive.
1899 : // Either label can be nullptr indicating backtracking. Either label can also
1900 : // be equal to the fall_through label.
1901 201108 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1902 : int start_index, int end_index, uc32 min_char,
1903 : uc32 max_char, Label* fall_through,
1904 : Label* even_label, Label* odd_label) {
1905 : DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1906 : DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1907 :
1908 201108 : int first = ranges->at(start_index);
1909 201108 : int last = ranges->at(end_index) - 1;
1910 :
1911 : DCHECK_LT(min_char, first);
1912 :
1913 : // Just need to test if the character is before or on-or-after
1914 : // a particular character.
1915 201108 : if (start_index == end_index) {
1916 8607 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1917 8607 : return;
1918 : }
1919 :
1920 : // Another almost trivial case: There is one interval in the middle that is
1921 : // different from the end intervals.
1922 192501 : if (start_index + 1 == end_index) {
1923 : EmitDoubleBoundaryTest(
1924 123206 : masm, first, last, fall_through, even_label, odd_label);
1925 123206 : return;
1926 : }
1927 :
1928 : // It's not worth using table lookup if there are very few intervals in the
1929 : // character class.
1930 69295 : if (end_index - start_index <= 6) {
1931 : // It is faster to test for individual characters, so we look for those
1932 : // first, then try arbitrary ranges in the second round.
1933 : static int kNoCutIndex = -1;
1934 35651 : int cut = kNoCutIndex;
1935 144754 : for (int i = start_index; i < end_index; i++) {
1936 184980 : if (ranges->at(i) == ranges->at(i + 1) - 1) {
1937 : cut = i;
1938 : break;
1939 : }
1940 : }
1941 35651 : if (cut == kNoCutIndex) cut = start_index;
1942 : CutOutRange(
1943 35651 : masm, ranges, start_index, end_index, cut, even_label, odd_label);
1944 : DCHECK_GE(end_index - start_index, 2);
1945 : GenerateBranches(masm,
1946 : ranges,
1947 : start_index + 1,
1948 : end_index - 1,
1949 : min_char,
1950 : max_char,
1951 : fall_through,
1952 : even_label,
1953 35651 : odd_label);
1954 35651 : return;
1955 : }
1956 :
1957 : // If there are a lot of intervals in the regexp, then we will use tables to
1958 : // determine whether the character is inside or outside the character class.
1959 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1960 :
1961 33644 : if ((max_char >> kBits) == (min_char >> kBits)) {
1962 : EmitUseLookupTable(masm,
1963 : ranges,
1964 : start_index,
1965 : end_index,
1966 : min_char,
1967 : fall_through,
1968 : even_label,
1969 5861 : odd_label);
1970 5861 : return;
1971 : }
1972 :
1973 27783 : if ((min_char >> kBits) != (first >> kBits)) {
1974 8016 : masm->CheckCharacterLT(first, odd_label);
1975 : GenerateBranches(masm,
1976 : ranges,
1977 : start_index + 1,
1978 : end_index,
1979 : first,
1980 : max_char,
1981 : fall_through,
1982 : odd_label,
1983 8016 : even_label);
1984 8016 : return;
1985 : }
1986 :
1987 19767 : int new_start_index = 0;
1988 19767 : int new_end_index = 0;
1989 19767 : int border = 0;
1990 :
1991 : SplitSearchSpace(ranges,
1992 : start_index,
1993 : end_index,
1994 : &new_start_index,
1995 : &new_end_index,
1996 19767 : &border);
1997 :
1998 19767 : Label handle_rest;
1999 : Label* above = &handle_rest;
2000 19767 : if (border == last + 1) {
2001 : // We didn't find any section that started after the limit, so everything
2002 : // above the border is one of the terminal labels.
2003 1225 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2004 : DCHECK(new_end_index == end_index - 1);
2005 : }
2006 :
2007 : DCHECK_LE(start_index, new_end_index);
2008 : DCHECK_LE(new_start_index, end_index);
2009 : DCHECK_LT(start_index, new_start_index);
2010 : DCHECK_LT(new_end_index, end_index);
2011 : DCHECK(new_end_index + 1 == new_start_index ||
2012 : (new_end_index + 2 == new_start_index &&
2013 : border == ranges->at(new_end_index + 1)));
2014 : DCHECK_LT(min_char, border - 1);
2015 : DCHECK_LT(border, max_char);
2016 : DCHECK_LT(ranges->at(new_end_index), border);
2017 : DCHECK(border < ranges->at(new_start_index) ||
2018 : (border == ranges->at(new_start_index) &&
2019 : new_start_index == end_index &&
2020 : new_end_index == end_index - 1 &&
2021 : border == last + 1));
2022 : DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2023 :
2024 19767 : masm->CheckCharacterGT(border - 1, above);
2025 19767 : Label dummy;
2026 : GenerateBranches(masm,
2027 : ranges,
2028 : start_index,
2029 : new_end_index,
2030 : min_char,
2031 : border - 1,
2032 : &dummy,
2033 : even_label,
2034 19767 : odd_label);
2035 19767 : if (handle_rest.is_linked()) {
2036 18542 : masm->Bind(&handle_rest);
2037 18542 : bool flip = (new_start_index & 1) != (start_index & 1);
2038 : GenerateBranches(masm,
2039 : ranges,
2040 : new_start_index,
2041 : end_index,
2042 : border,
2043 : max_char,
2044 : &dummy,
2045 : flip ? odd_label : even_label,
2046 18542 : flip ? even_label : odd_label);
2047 : }
2048 : }
2049 :
2050 :
2051 210399 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2052 : RegExpCharacterClass* cc, bool one_byte,
2053 : Label* on_failure, int cp_offset, bool check_offset,
2054 : bool preloaded, Zone* zone) {
2055 210399 : ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2056 210399 : CharacterRange::Canonicalize(ranges);
2057 :
2058 : int max_char;
2059 210399 : if (one_byte) {
2060 : max_char = String::kMaxOneByteCharCode;
2061 : } else {
2062 : max_char = String::kMaxUtf16CodeUnit;
2063 : }
2064 :
2065 : int range_count = ranges->length();
2066 :
2067 210399 : int last_valid_range = range_count - 1;
2068 601617 : while (last_valid_range >= 0) {
2069 391183 : CharacterRange& range = ranges->at(last_valid_range);
2070 391183 : if (range.from() <= max_char) {
2071 : break;
2072 : }
2073 180819 : last_valid_range--;
2074 : }
2075 :
2076 210399 : if (last_valid_range < 0) {
2077 35 : if (!cc->is_negated()) {
2078 10 : macro_assembler->GoTo(on_failure);
2079 : }
2080 35 : if (check_offset) {
2081 33 : macro_assembler->CheckPosition(cp_offset, on_failure);
2082 : }
2083 91267 : return;
2084 : }
2085 :
2086 398597 : if (last_valid_range == 0 &&
2087 : ranges->at(0).IsEverything(max_char)) {
2088 82499 : if (cc->is_negated()) {
2089 31 : macro_assembler->GoTo(on_failure);
2090 : } else {
2091 : // This is a common case hit by non-anchored expressions.
2092 82468 : if (check_offset) {
2093 53861 : macro_assembler->CheckPosition(cp_offset, on_failure);
2094 : }
2095 : }
2096 : return;
2097 : }
2098 :
2099 127865 : if (!preloaded) {
2100 115810 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2101 : }
2102 :
2103 138711 : if (cc->is_standard(zone) &&
2104 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2105 21692 : on_failure)) {
2106 : return;
2107 : }
2108 :
2109 :
2110 : // A new list with ascending entries. Each entry is a code unit
2111 : // where there is a boundary between code units that are part of
2112 : // the class and code units that are not. Normally we insert an
2113 : // entry at zero which goes to the failure label, but if there
2114 : // was already one there we fall through for success on that entry.
2115 : // Subsequent entries have alternating meaning (success/failure).
2116 119132 : ZoneList<int>* range_boundaries =
2117 119132 : new(zone) ZoneList<int>(last_valid_range, zone);
2118 :
2119 119132 : bool zeroth_entry_is_failure = !cc->is_negated();
2120 :
2121 339979 : for (int i = 0; i <= last_valid_range; i++) {
2122 441694 : CharacterRange& range = ranges->at(i);
2123 220847 : if (range.from() == 0) {
2124 : DCHECK_EQ(i, 0);
2125 3380 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
2126 : } else {
2127 217467 : range_boundaries->Add(range.from(), zone);
2128 : }
2129 220847 : range_boundaries->Add(range.to() + 1, zone);
2130 : }
2131 119132 : int end_index = range_boundaries->length() - 1;
2132 119132 : if (range_boundaries->at(end_index) > max_char) {
2133 3978 : end_index--;
2134 : }
2135 :
2136 119132 : Label fall_through;
2137 : GenerateBranches(macro_assembler,
2138 : range_boundaries,
2139 : 0, // start_index.
2140 : end_index,
2141 : 0, // min_char.
2142 : max_char,
2143 : &fall_through,
2144 : zeroth_entry_is_failure ? &fall_through : on_failure,
2145 119132 : zeroth_entry_is_failure ? on_failure : &fall_through);
2146 119132 : macro_assembler->Bind(&fall_through);
2147 : }
2148 :
// Out-of-line definition of the RegExpNode destructor; the implementation is
// the compiler-generated (defaulted) one.
2149 : RegExpNode::~RegExpNode() = default;
2150 :
2151 4443622 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2152 2172285 : Trace* trace) {
2153 : // If we are generating a greedy loop then don't stop and don't reuse code.
2154 1694671 : if (trace->stop_node() != nullptr) {
2155 : return CONTINUE;
2156 : }
2157 :
2158 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2159 1683064 : if (trace->is_trivial()) {
2160 1539203 : if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2161 : // If a generic version is already scheduled to be generated or we have
2162 : // recursed too deeply then just generate a jump to that code.
2163 219833 : macro_assembler->GoTo(&label_);
2164 : // This will queue it up for generation of a generic version if it hasn't
2165 : // already been queued.
2166 219833 : compiler->AddWork(this);
2167 219833 : return DONE;
2168 : }
2169 : // Generate generic version of the node and bind the label for later use.
2170 392167 : macro_assembler->Bind(&label_);
2171 392167 : return CONTINUE;
2172 : }
2173 :
2174 : // We are being asked to make a non-generic version. Keep track of how many
2175 : // non-generic versions we generate so as not to overdo it.
2176 1071064 : trace_count_++;
2177 2136951 : if (KeepRecursing(compiler) && compiler->optimize() &&
2178 : trace_count_ < kMaxCopiesCodeGenerated) {
2179 : return CONTINUE;
2180 : }
2181 :
2182 : // If we get here code has been generated for this node too many times or
2183 : // recursion is too deep. Time to switch to a generic version. The code for
2184 : // generic versions above can handle deep recursion properly.
2185 : bool was_limiting = compiler->limiting_recursion();
2186 : compiler->set_limiting_recursion(true);
2187 477305 : trace->Flush(compiler, this);
2188 : compiler->set_limiting_recursion(was_limiting);
2189 477305 : return DONE;
2190 : }
2191 :
2192 :
2193 3631109 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2194 3631109 : return !compiler->limiting_recursion() &&
2195 0 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2196 : }
2197 :
2198 :
2199 583319 : int ActionNode::EatsAtLeast(int still_to_find,
2200 : int budget,
2201 : bool not_at_start) {
2202 583319 : if (budget <= 0) return 0;
2203 570257 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2204 565129 : return on_success()->EatsAtLeast(still_to_find,
2205 : budget - 1,
2206 565129 : not_at_start);
2207 : }
2208 :
2209 :
2210 90528 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2211 : BoyerMooreLookahead* bm, bool not_at_start) {
2212 90528 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2213 90528 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2214 : }
2215 : SaveBMInfo(bm, not_at_start, offset);
2216 90528 : }
2217 :
2218 :
2219 10241 : int AssertionNode::EatsAtLeast(int still_to_find,
2220 : int budget,
2221 9226 : bool not_at_start) {
2222 10241 : if (budget <= 0) return 0;
2223 : // If we know we are not at the start and we are asked "how many characters
2224 : // will you match if you succeed?" then we can answer anything since false
2225 : // implies false. So lets just return the max answer (still_to_find) since
2226 : // that won't prevent us from preloading a lot of characters for the other
2227 : // branches in the node graph.
2228 9226 : if (assertion_type() == AT_START && not_at_start) return still_to_find;
2229 9004 : return on_success()->EatsAtLeast(still_to_find,
2230 : budget - 1,
2231 9004 : not_at_start);
2232 : }
2233 :
2234 :
2235 379 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2236 379 : BoyerMooreLookahead* bm, bool not_at_start) {
2237 : // Match the behaviour of EatsAtLeast on this node.
2238 758 : if (assertion_type() == AT_START && not_at_start) return;
2239 363 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2240 : SaveBMInfo(bm, not_at_start, offset);
2241 : }
2242 :
2243 :
2244 3111 : int BackReferenceNode::EatsAtLeast(int still_to_find,
2245 : int budget,
2246 3111 : bool not_at_start) {
2247 3111 : if (read_backward()) return 0;
2248 3001 : if (budget <= 0) return 0;
2249 3001 : return on_success()->EatsAtLeast(still_to_find,
2250 : budget - 1,
2251 3001 : not_at_start);
2252 : }
2253 :
2254 :
2255 5761783 : int TextNode::EatsAtLeast(int still_to_find,
2256 : int budget,
2257 5761783 : bool not_at_start) {
2258 5761783 : if (read_backward()) return 0;
2259 5759931 : int answer = Length();
2260 5759931 : if (answer >= still_to_find) return answer;
2261 3397867 : if (budget <= 0) return answer;
2262 : // We are not at start after this node so we set the last argument to 'true'.
2263 2373950 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
2264 : budget - 1,
2265 2373950 : true);
2266 : }
2267 :
2268 :
2269 9503 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2270 : bool not_at_start) {
2271 9503 : if (budget <= 0) return 0;
2272 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2273 : // afterwards.
2274 18582 : RegExpNode* node = alternatives_->at(1).node();
2275 9291 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2276 : }
2277 :
2278 :
2279 3556 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2280 : QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2281 : bool not_at_start) {
2282 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2283 : // afterwards.
2284 7112 : RegExpNode* node = alternatives_->at(1).node();
2285 3556 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2286 : }
2287 :
2288 :
2289 6945162 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2290 : int budget,
2291 : RegExpNode* ignore_this_node,
2292 : bool not_at_start) {
2293 6945162 : if (budget <= 0) return 0;
2294 : int min = 100;
2295 4818856 : int choice_count = alternatives_->length();
2296 4818856 : budget = (budget - 1) / choice_count;
2297 10720050 : for (int i = 0; i < choice_count; i++) {
2298 20797614 : RegExpNode* node = alternatives_->at(i).node();
2299 10398807 : if (node == ignore_this_node) continue;
2300 : int node_eats_at_least =
2301 10254200 : node->EatsAtLeast(still_to_find, budget, not_at_start);
2302 10254200 : if (node_eats_at_least < min) min = node_eats_at_least;
2303 10254200 : if (min == 0) return 0;
2304 : }
2305 : return min;
2306 : }
2307 :
2308 :
2309 153117 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
2310 : int budget,
2311 : bool not_at_start) {
2312 : return EatsAtLeastHelper(still_to_find,
2313 : budget - 1,
2314 : loop_node_,
2315 153117 : not_at_start);
2316 : }
2317 :
2318 :
2319 6792045 : int ChoiceNode::EatsAtLeast(int still_to_find,
2320 : int budget,
2321 : bool not_at_start) {
2322 6792045 : return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2323 : }
2324 :
2325 :
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
// Doubling the shift each round (1, 2, 4, 8, 16) covers all 32 bit positions.
static inline uint32_t SmearBitsRight(uint32_t v) {
  for (int shift = 1; shift <= 16; shift *= 2) {
    v |= v >> shift;
  }
  return v;
}
2335 :
2336 :
2337 268936 : bool QuickCheckDetails::Rationalize(bool asc) {
2338 : bool found_useful_op = false;
2339 : uint32_t char_mask;
2340 268936 : if (asc) {
2341 : char_mask = String::kMaxOneByteCharCode;
2342 : } else {
2343 : char_mask = String::kMaxUtf16CodeUnit;
2344 : }
2345 268936 : mask_ = 0;
2346 268936 : value_ = 0;
2347 : int char_shift = 0;
2348 697115 : for (int i = 0; i < characters_; i++) {
2349 428179 : Position* pos = &positions_[i];
2350 428179 : if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2351 : found_useful_op = true;
2352 : }
2353 428179 : mask_ |= (pos->mask & char_mask) << char_shift;
2354 428179 : value_ |= (pos->value & char_mask) << char_shift;
2355 428179 : char_shift += asc ? 8 : 16;
2356 : }
2357 268936 : return found_useful_op;
2358 : }
2359 :
2360 :
2361 1188603 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2362 62751 : Trace* bounds_check_trace,
2363 590739 : Trace* trace,
2364 : bool preload_has_checked_bounds,
2365 : Label* on_possible_success,
2366 1416393 : QuickCheckDetails* details,
2367 : bool fall_through_on_failure) {
2368 473502 : if (details->characters() == 0) return false;
2369 : GetQuickCheckDetails(
2370 538092 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2371 269046 : if (details->cannot_match()) return false;
2372 268936 : if (!details->Rationalize(compiler->one_byte())) return false;
2373 : DCHECK(details->characters() == 1 ||
2374 : compiler->macro_assembler()->CanReadUnaligned());
2375 : uint32_t mask = details->mask();
2376 : uint32_t value = details->value();
2377 :
2378 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2379 :
2380 224615 : if (trace->characters_preloaded() != details->characters()) {
2381 : DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2382 : // We are attempting to preload the minimum number of characters
2383 : // any choice would eat, so if the bounds check fails, then none of the
2384 : // choices can succeed, so we can just immediately backtrack, rather
2385 : // than go to the next choice.
2386 : assembler->LoadCurrentCharacter(trace->cp_offset(),
2387 : bounds_check_trace->backtrack(),
2388 62751 : !preload_has_checked_bounds,
2389 188253 : details->characters());
2390 : }
2391 :
2392 :
2393 : bool need_mask = true;
2394 :
2395 224615 : if (details->characters() == 1) {
2396 : // If number of characters preloaded is 1 then we used a byte or 16 bit
2397 : // load so the value is already masked down.
2398 : uint32_t char_mask;
2399 81750 : if (compiler->one_byte()) {
2400 : char_mask = String::kMaxOneByteCharCode;
2401 : } else {
2402 : char_mask = String::kMaxUtf16CodeUnit;
2403 : }
2404 81750 : if ((mask & char_mask) == char_mask) need_mask = false;
2405 : mask &= char_mask;
2406 : } else {
2407 : // For 2-character preloads in one-byte mode or 1-character preloads in
2408 : // two-byte mode we also use a 16 bit load with zero extend.
2409 : static const uint32_t kTwoByteMask = 0xFFFF;
2410 : static const uint32_t kFourByteMask = 0xFFFFFFFF;
2411 282665 : if (details->characters() == 2 && compiler->one_byte()) {
2412 127814 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2413 15051 : } else if (details->characters() == 1 && !compiler->one_byte()) {
2414 0 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2415 : } else {
2416 15051 : if (mask == kFourByteMask) need_mask = false;
2417 : }
2418 : }
2419 :
2420 224615 : if (fall_through_on_failure) {
2421 190288 : if (need_mask) {
2422 46215 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2423 : } else {
2424 144073 : assembler->CheckCharacter(value, on_possible_success);
2425 : }
2426 : } else {
2427 34327 : if (need_mask) {
2428 7400 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2429 : } else {
2430 61254 : assembler->CheckNotCharacter(value, trace->backtrack());
2431 : }
2432 : }
2433 : return true;
2434 : }
2435 :
2436 :
2437 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
2438 : // super-class in the .h file).
2439 : //
2440 : // We iterate along the text object, building up for each character a
2441 : // mask and value that can be used to test for a quick failure to match.
2442 : // The masks and values for the positions will be combined into a single
2443 : // machine word for the current character width in order to be used in
2444 : // generating a quick check.
//
// Parameters:
//   details              - receives one mask/value/determines_perfectly
//                          triple per character position; may also be marked
//                          cannot_match.
//   compiler             - supplies the subject encoding (one_byte()) and the
//                          isolate used for case-variant lookup.
//   characters_filled_in - index of the next position in |details| to fill.
//   not_at_start         - not consulted in this body; the successor node is
//                          always queried with not_at_start == true.
2445 1545271 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2446 921326 : RegExpCompiler* compiler,
2447 : int characters_filled_in,
2448 957423 : bool not_at_start) {
2449 : // Do not collect any quick check details if the text node reads backward,
2450 : // since it reads in the opposite direction than we use for quick checks.
2451 457538 : if (read_backward()) return;
2452 457538 : Isolate* isolate = compiler->macro_assembler()->isolate();
2453 : DCHECK(characters_filled_in < details->characters());
2454 : int characters = details->characters();
// char_mask is the largest code unit representable in the subject encoding;
// anything above it cannot occur in the input.
2455 : int char_mask;
2456 457538 : if (compiler->one_byte()) {
2457 : char_mask = String::kMaxOneByteCharCode;
2458 : } else {
2459 : char_mask = String::kMaxUtf16CodeUnit;
2460 : }
2461 999770 : for (int k = 0; k < elements()->length(); k++) {
2462 462068 : TextElement elm = elements()->at(k);
2463 462068 : if (elm.text_type() == TextElement::ATOM) {
// Atom: one position is filled per code unit of the literal, up to the
// number of characters requested in |details|.
2464 : Vector<const uc16> quarks = elm.atom()->data();
2465 991436 : for (int i = 0; i < characters && i < quarks.length(); i++) {
2466 : QuickCheckDetails::Position* pos =
2467 468548 : details->positions(characters_filled_in);
2468 937096 : uc16 c = quarks[i];
2469 468548 : if (elm.atom()->ignore_case()) {
2470 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2471 : int length = GetCaseIndependentLetters(isolate, c,
2472 6250 : compiler->one_byte(), chars);
2473 6250 : if (length == 0) {
2474 : // This can happen because all case variants are non-Latin1, but we
2475 : // know the input is Latin1.
2476 : details->set_cannot_match();
2477 25 : pos->determines_perfectly = false;
2478 25 : return;
2479 : }
2480 6225 : if (length == 1) {
2481 : // This letter has no case equivalents, so it's nice and simple
2482 : // and the mask-compare will determine definitely whether we have
2483 : // a match at this character position.
2484 1227 : pos->mask = char_mask;
2485 1227 : pos->value = c;
2486 1227 : pos->determines_perfectly = true;
2487 : } else {
// Several case variants: fold them into a single mask/value pair by
// clearing from common_bits every bit on which a variant disagrees with
// the bits accumulated so far.
2488 4998 : uint32_t common_bits = char_mask;
2489 4998 : uint32_t bits = chars[0];
2490 10349 : for (int j = 1; j < length; j++) {
2491 5351 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2492 5351 : common_bits ^= differing_bits;
2493 5351 : bits &= common_bits;
2494 : }
2495 : // If length is 2 and common bits has only one zero in it then
2496 : // our mask and compare instruction will determine definitely
2497 : // whether we have a match at this character position. Otherwise
2498 : // it can only be an approximate check.
// ~one_zero holds the bits inside char_mask that were dropped from
// common_bits; x & (x - 1) == 0 holds exactly when x has at most one bit set.
2499 4998 : uint32_t one_zero = (common_bits | ~char_mask);
2500 4998 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2501 4595 : pos->determines_perfectly = true;
2502 : }
2503 4998 : pos->mask = common_bits;
2504 4998 : pos->value = bits;
2505 : }
2506 : } else {
2507 : // Don't ignore case. Nice simple case where the mask-compare will
2508 : // determine definitely whether we have a match at this character
2509 : // position.
// A literal code unit above char_mask can never appear in the subject.
2510 462298 : if (c > char_mask) {
2511 : details->set_cannot_match();
2512 25 : pos->determines_perfectly = false;
2513 25 : return;
2514 : }
2515 462273 : pos->mask = char_mask;
2516 462273 : pos->value = c;
2517 462273 : pos->determines_perfectly = true;
2518 : }
2519 468498 : characters_filled_in++;
2520 : DCHECK(characters_filled_in <= details->characters());
// Stop as soon as every requested position has been described.
2521 468498 : if (characters_filled_in == details->characters()) {
2522 : return;
2523 : }
2524 : }
2525 : } else {
// Character class: exactly one position is filled for the whole class.
2526 : QuickCheckDetails::Position* pos =
2527 123930 : details->positions(characters_filled_in);
2528 : RegExpCharacterClass* tree = elm.char_class();
2529 497748 : ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2530 : DCHECK(!ranges->is_empty());
2531 123930 : if (tree->is_negated()) {
2532 : // A quick check uses multi-character mask and compare. There is no
2533 : // useful way to incorporate a negative char class into this scheme
2534 : // so we just conservatively create a mask and value that will always
2535 : // succeed.
2536 3506 : pos->mask = 0;
2537 3506 : pos->value = 0;
2538 : } else {
// Skip leading ranges that start above char_mask; if every range does,
// the class cannot match any code unit of the subject.
2539 : int first_range = 0;
2540 120474 : while (ranges->at(first_range).from() > char_mask) {
2541 100 : first_range++;
2542 100 : if (first_range == ranges->length()) {
2543 : details->set_cannot_match();
2544 50 : pos->determines_perfectly = false;
2545 : return;
2546 : }
2547 : }
2548 120374 : CharacterRange range = ranges->at(first_range);
2549 120374 : uc16 from = range.from();
2550 120374 : uc16 to = range.to();
// Clamp the upper bound to what the subject encoding can contain.
2551 120374 : if (to > char_mask) {
2552 14941 : to = char_mask;
2553 : }
2554 120374 : uint32_t differing_bits = (from ^ to);
2555 : // A mask and compare is only perfect if the differing bits form a
2556 : // number like 00011111 with one single block of trailing 1s.
2557 223703 : if ((differing_bits & (differing_bits + 1)) == 0 &&
2558 103329 : from + differing_bits == to) {
2559 93721 : pos->determines_perfectly = true;
2560 : }
// NOTE(review): SmearBitsRight is defined elsewhere; presumably it sets every
// bit below the highest set bit, so ~Smear(...) keeps only the prefix bits
// shared by |from| and |to| - confirm against its definition.
2561 120374 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
2562 120374 : uint32_t bits = (from & common_bits);
2563 747436 : for (int i = first_range + 1; i < ranges->length(); i++) {
2564 253344 : CharacterRange range = ranges->at(i);
2565 253344 : uc16 from = range.from();
2566 253344 : uc16 to = range.to();
2567 253344 : if (from > char_mask) continue;
2568 119038 : if (to > char_mask) to = char_mask;
2569 : // Here we are combining more ranges into the mask and compare
2570 : // value. With each new range the mask becomes more sparse and
2571 : // so the chances of a false positive rise. A character class
2572 : // with multiple ranges is assumed never to be equivalent to a
2573 : // mask and compare operation.
2574 119038 : pos->determines_perfectly = false;
2575 119038 : uint32_t new_common_bits = (from ^ to);
2576 119038 : new_common_bits = ~SmearBitsRight(new_common_bits);
2577 119038 : common_bits &= new_common_bits;
2578 119038 : bits &= new_common_bits;
2579 119038 : uint32_t differing_bits = (from & common_bits) ^ bits;
2580 119038 : common_bits ^= differing_bits;
2581 119038 : bits &= common_bits;
2582 : }
2583 120374 : pos->mask = common_bits;
2584 120374 : pos->value = bits;
2585 : }
2586 123880 : characters_filled_in++;
2587 : DCHECK(characters_filled_in <= details->characters());
2588 123880 : if (characters_filled_in == details->characters()) {
2589 : return;
2590 : }
2591 : }
2592 : }
// The text node ran out before all requested positions were filled: let the
// successor node describe the remaining positions, unless a match has
// already been ruled out.
2593 : DCHECK(characters_filled_in != details->characters());
2594 37817 : if (!details->cannot_match()) {
2595 37817 : on_success()-> GetQuickCheckDetails(details,
2596 : compiler,
2597 : characters_filled_in,
2598 37817 : true);
2599 : }
2600 : }
2601 :
2602 :
2603 0 : void QuickCheckDetails::Clear() {
2604 344277 : for (int i = 0; i < characters_; i++) {
2605 344277 : positions_[i].mask = 0;
2606 344277 : positions_[i].value = 0;
2607 344277 : positions_[i].determines_perfectly = false;
2608 : }
2609 1095252 : characters_ = 0;
2610 0 : }
2611 :
2612 :
2613 515206 : void QuickCheckDetails::Advance(int by, bool one_byte) {
2614 515206 : if (by >= characters_ || by < 0) {
2615 : DCHECK_IMPLIES(by < 0, characters_ == 0);
2616 : Clear();
2617 515206 : return;
2618 : }
2619 : DCHECK_LE(characters_ - by, 4);
2620 : DCHECK_LE(characters_, 4);
2621 24842 : for (int i = 0; i < characters_ - by; i++) {
2622 24842 : positions_[i] = positions_[by + i];
2623 : }
2624 24838 : for (int i = characters_ - by; i < characters_; i++) {
2625 24838 : positions_[i].mask = 0;
2626 24838 : positions_[i].value = 0;
2627 24838 : positions_[i].determines_perfectly = false;
2628 : }
2629 23141 : characters_ -= by;
2630 : // We could change mask_ and value_ here but we would never advance unless
2631 : // they had already been used in a check and they won't be used again because
2632 : // it would gain us nothing. So there's no point.
2633 : }
2634 :
2635 :
2636 154567 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2637 : DCHECK(characters_ == other->characters_);
2638 154567 : if (other->cannot_match_) {
2639 : return;
2640 : }
2641 154493 : if (cannot_match_) {
2642 247 : *this = *other;
2643 247 : return;
2644 : }
2645 170261 : for (int i = from_index; i < characters_; i++) {
2646 170261 : QuickCheckDetails::Position* pos = positions(i);
2647 170261 : QuickCheckDetails::Position* other_pos = other->positions(i);
2648 201790 : if (pos->mask != other_pos->mask ||
2649 40559 : pos->value != other_pos->value ||
2650 9030 : !other_pos->determines_perfectly) {
2651 : // Our mask-compare operation will be approximate unless we have the
2652 : // exact same operation on both sides of the alternation.
2653 164011 : pos->determines_perfectly = false;
2654 : }
2655 170261 : pos->mask &= other_pos->mask;
2656 170261 : pos->value &= pos->mask;
2657 170261 : other_pos->value &= pos->mask;
2658 170261 : uc16 differing_bits = (pos->value ^ other_pos->value);
2659 170261 : pos->mask &= ~differing_bits;
2660 170261 : pos->value &= pos->mask;
2661 : }
2662 : }
2663 :
2664 :
2665 : class VisitMarker {
2666 : public:
2667 : explicit VisitMarker(NodeInfo* info) : info_(info) {
2668 : DCHECK(!info->visited);
2669 195230 : info->visited = true;
2670 : }
2671 : ~VisitMarker() {
2672 171515 : info_->visited = false;
2673 : }
2674 : private:
2675 : NodeInfo* info_;
2676 : };
2677 :
2678 98071 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2679 98071 : if (info()->replacement_calculated) return replacement();
2680 71627 : if (depth < 0) return this;
2681 : DCHECK(!info()->visited);
2682 71462 : VisitMarker marker(info());
2683 : return FilterSuccessor(depth - 1);
2684 : }
2685 :
2686 0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2687 131361 : RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2688 131361 : if (next == nullptr) return set_replacement(nullptr);
2689 130879 : on_success_ = next;
2690 130879 : return set_replacement(this);
2691 : }
2692 :
2693 : // We need to check for the following characters: 0x39C 0x3BC 0x178.
2694 1451 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2695 : // TODO(dcarney): this could be a lot more efficient.
2696 4227 : return range.Contains(0x039C) || range.Contains(0x03BC) ||
2697 1451 : range.Contains(0x0178);
2698 : }
2699 :
2700 :
2701 102 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2702 122 : for (int i = 0; i < ranges->length(); i++) {
2703 : // TODO(dcarney): this could be a lot more efficient.
2704 46 : if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2705 : }
2706 : return false;
2707 : }
2708 :
2709 189705 : RegExpNode* TextNode::FilterOneByte(int depth) {
2710 99041 : if (info()->replacement_calculated) return replacement();
2711 60379 : if (depth < 0) return this;
2712 : DCHECK(!info()->visited);
2713 60334 : VisitMarker marker(info());
2714 60334 : int element_count = elements()->length();
2715 124687 : for (int i = 0; i < element_count; i++) {
2716 64788 : TextElement elm = elements()->at(i);
2717 64788 : if (elm.text_type() == TextElement::ATOM) {
2718 : Vector<const uc16> quarks = elm.atom()->data();
2719 173528 : for (int j = 0; j < quarks.length(); j++) {
2720 113200 : uint16_t c = quarks[j];
2721 56600 : if (elm.atom()->ignore_case()) {
2722 : c = unibrow::Latin1::TryConvertToLatin1(c);
2723 : }
2724 56600 : if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2725 : // Replace quark in case we converted to Latin-1.
2726 : uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2727 56434 : writable_quarks[j] = c;
2728 : }
2729 : } else {
2730 : DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2731 : RegExpCharacterClass* cc = elm.char_class();
2732 34458 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2733 34458 : CharacterRange::Canonicalize(ranges);
2734 : // Now they are in order so we only need to look at the first.
2735 : int range_count = ranges->length();
2736 34458 : if (cc->is_negated()) {
2737 8410 : if (range_count != 0 &&
2738 8588 : ranges->at(0).from() == 0 &&
2739 178 : ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2740 : // This will be handled in a later filter.
2741 40 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2742 : continue;
2743 39 : return set_replacement(nullptr);
2744 : }
2745 : } else {
2746 60506 : if (range_count == 0 ||
2747 30253 : ranges->at(0).from() > String::kMaxOneByteCharCode) {
2748 : // This will be handled in a later filter.
2749 255 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2750 : continue;
2751 230 : return set_replacement(nullptr);
2752 : }
2753 : }
2754 : }
2755 : }
2756 59899 : return FilterSuccessor(depth - 1);
2757 : }
2758 :
2759 59218 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2760 59218 : if (info()->replacement_calculated) return replacement();
2761 45679 : if (depth < 0) return this;
2762 45589 : if (info()->visited) return this;
2763 : {
2764 24063 : VisitMarker marker(info());
2765 :
2766 24063 : RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2767 : // If we can't continue after the loop then there is no sense in doing the
2768 : // loop.
2769 24063 : if (continue_replacement == nullptr) return set_replacement(nullptr);
2770 : }
2771 :
2772 23715 : return ChoiceNode::FilterOneByte(depth - 1);
2773 : }
2774 :
2775 29515 : RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2776 29665 : if (info()->replacement_calculated) return replacement();
2777 27472 : if (depth < 0) return this;
2778 27377 : if (info()->visited) return this;
2779 27377 : VisitMarker marker(info());
2780 27377 : int choice_count = alternatives_->length();
2781 :
2782 85117 : for (int i = 0; i < choice_count; i++) {
2783 60077 : GuardedAlternative alternative = alternatives_->at(i);
2784 62414 : if (alternative.guards() != nullptr &&
2785 2337 : alternative.guards()->length() != 0) {
2786 2337 : set_replacement(this);
2787 : return this;
2788 : }
2789 : }
2790 :
2791 : int surviving = 0;
2792 : RegExpNode* survivor = nullptr;
2793 57389 : for (int i = 0; i < choice_count; i++) {
2794 114778 : GuardedAlternative alternative = alternatives_->at(i);
2795 57389 : RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2796 : DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2797 57389 : if (replacement != nullptr) {
2798 57243 : alternatives_->at(i).set_node(replacement);
2799 57243 : surviving++;
2800 : survivor = replacement;
2801 : }
2802 : }
2803 25106 : if (surviving < 2) return set_replacement(survivor);
2804 :
2805 24974 : set_replacement(this);
2806 24974 : if (surviving == choice_count) {
2807 : return this;
2808 : }
2809 : // Only some of the nodes survived the filtering. We need to rebuild the
2810 : // alternatives list.
2811 : ZoneList<GuardedAlternative>* new_alternatives =
2812 20 : new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2813 200 : for (int i = 0; i < choice_count; i++) {
2814 : RegExpNode* replacement =
2815 360 : alternatives_->at(i).node()->FilterOneByte(depth - 1);
2816 180 : if (replacement != nullptr) {
2817 130 : alternatives_->at(i).set_node(replacement);
2818 260 : new_alternatives->Add(alternatives_->at(i), zone());
2819 : }
2820 : }
2821 20 : alternatives_ = new_alternatives;
2822 20 : return this;
2823 : }
2824 :
2825 357 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2826 357 : if (info()->replacement_calculated) return replacement();
2827 357 : if (depth < 0) return this;
2828 357 : if (info()->visited) return this;
2829 357 : VisitMarker marker(info());
2830 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2831 : // afterwards.
2832 714 : RegExpNode* node = alternatives_->at(1).node();
2833 357 : RegExpNode* replacement = node->FilterOneByte(depth - 1);
2834 362 : if (replacement == nullptr) return set_replacement(nullptr);
2835 352 : alternatives_->at(1).set_node(replacement);
2836 :
2837 704 : RegExpNode* neg_node = alternatives_->at(0).node();
2838 352 : RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2839 : // If the negative lookahead is always going to fail then
2840 : // we don't need to check it.
2841 357 : if (neg_replacement == nullptr) return set_replacement(replacement);
2842 347 : alternatives_->at(0).set_node(neg_replacement);
2843 694 : return set_replacement(this);
2844 : }
2845 :
2846 :
2847 14676 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2848 : RegExpCompiler* compiler,
2849 : int characters_filled_in,
2850 : bool not_at_start) {
2851 14676 : if (body_can_be_zero_length_ || info()->visited) return;
2852 11637 : VisitMarker marker(info());
2853 : return ChoiceNode::GetQuickCheckDetails(details,
2854 : compiler,
2855 : characters_filled_in,
2856 11637 : not_at_start);
2857 : }
2858 :
2859 :
2860 5001 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2861 : BoyerMooreLookahead* bm, bool not_at_start) {
2862 5001 : if (body_can_be_zero_length_ || budget <= 0) {
2863 : bm->SetRest(offset);
2864 : SaveBMInfo(bm, not_at_start, offset);
2865 5001 : return;
2866 : }
2867 4815 : ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2868 : SaveBMInfo(bm, not_at_start, offset);
2869 : }
2870 :
2871 :
2872 193231 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2873 : RegExpCompiler* compiler,
2874 : int characters_filled_in,
2875 : bool not_at_start) {
2876 38664 : not_at_start = (not_at_start || not_at_start_);
2877 38664 : int choice_count = alternatives_->length();
2878 : DCHECK_LT(0, choice_count);
2879 38664 : alternatives_->at(0).node()->GetQuickCheckDetails(details,
2880 : compiler,
2881 : characters_filled_in,
2882 38664 : not_at_start);
2883 193231 : for (int i = 1; i < choice_count; i++) {
2884 : QuickCheckDetails new_details(details->characters());
2885 309134 : RegExpNode* node = alternatives_->at(i).node();
2886 : node->GetQuickCheckDetails(&new_details, compiler,
2887 : characters_filled_in,
2888 154567 : not_at_start);
2889 : // Here we merge the quick match details of the two branches.
2890 154567 : details->Merge(&new_details, characters_filled_in);
2891 : }
2892 38664 : }
2893 :
2894 :
2895 : // Check for [0-9A-Z_a-z].
2896 557 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
2897 : Label* word,
2898 : Label* non_word,
2899 : bool fall_through_on_word) {
2900 557 : if (assembler->CheckSpecialCharacterClass(
2901 : fall_through_on_word ? 'w' : 'W',
2902 557 : fall_through_on_word ? non_word : word)) {
2903 : // Optimized implementation available.
2904 557 : return;
2905 : }
2906 99 : assembler->CheckCharacterGT('z', non_word);
2907 99 : assembler->CheckCharacterLT('0', non_word);
2908 99 : assembler->CheckCharacterGT('a' - 1, word);
2909 99 : assembler->CheckCharacterLT('9' + 1, word);
2910 99 : assembler->CheckCharacterLT('A', non_word);
2911 99 : assembler->CheckCharacterLT('Z' + 1, word);
2912 99 : if (fall_through_on_word) {
2913 34 : assembler->CheckNotCharacter('_', non_word);
2914 : } else {
2915 65 : assembler->CheckCharacter('_', word);
2916 : }
2917 : }
2918 :
2919 :
2920 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2921 : // that matches newline or the start of input).
2922 153 : static void EmitHat(RegExpCompiler* compiler,
2923 : RegExpNode* on_success,
2924 : Trace* trace) {
2925 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2926 : // We will be loading the previous character into the current character
2927 : // register.
2928 129 : Trace new_trace(*trace);
2929 : new_trace.InvalidateCurrentCharacter();
2930 :
2931 129 : Label ok;
2932 129 : if (new_trace.cp_offset() == 0) {
2933 : // The start of input counts as a newline in this context, so skip to
2934 : // ok if we are at the start.
2935 119 : assembler->CheckAtStart(&ok);
2936 : }
2937 : // We already checked that we are not at the start of input so it must be
2938 : // OK to load the previous character.
2939 129 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2940 : new_trace.backtrack(),
2941 258 : false);
2942 129 : if (!assembler->CheckSpecialCharacterClass('n',
2943 129 : new_trace.backtrack())) {
2944 : // Newline means \n, \r, 0x2028 or 0x2029.
2945 24 : if (!compiler->one_byte()) {
2946 2 : assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2947 : }
2948 24 : assembler->CheckCharacter('\n', &ok);
2949 24 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2950 : }
2951 129 : assembler->Bind(&ok);
2952 129 : on_success->Emit(compiler, &new_trace);
2953 129 : }
2954 :
2955 :
2956 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2957 811 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2958 255 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2959 : Isolate* isolate = assembler->isolate();
2960 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2961 255 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2962 150 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2963 255 : if (lookahead == nullptr) {
2964 : int eats_at_least =
2965 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2966 : kRecursionBudget,
2967 202 : not_at_start));
2968 202 : if (eats_at_least >= 1) {
2969 97 : BoyerMooreLookahead* bm =
2970 97 : new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
2971 97 : FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2972 97 : if (bm->at(0)->is_non_word())
2973 : next_is_word_character = Trace::FALSE_VALUE;
2974 97 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2975 : }
2976 : } else {
2977 53 : if (lookahead->at(0)->is_non_word())
2978 : next_is_word_character = Trace::FALSE_VALUE;
2979 53 : if (lookahead->at(0)->is_word())
2980 : next_is_word_character = Trace::TRUE_VALUE;
2981 : }
2982 255 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
2983 255 : if (next_is_word_character == Trace::UNKNOWN) {
2984 151 : Label before_non_word;
2985 151 : Label before_word;
2986 151 : if (trace->characters_preloaded() != 1) {
2987 300 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
2988 : }
2989 : // Fall through on non-word.
2990 151 : EmitWordCheck(assembler, &before_word, &before_non_word, false);
2991 : // Next character is not a word character.
2992 151 : assembler->Bind(&before_non_word);
2993 151 : Label ok;
2994 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
2995 151 : assembler->GoTo(&ok);
2996 :
2997 151 : assembler->Bind(&before_word);
2998 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
2999 151 : assembler->Bind(&ok);
3000 104 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3001 79 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3002 : } else {
3003 : DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3004 25 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3005 : }
3006 255 : }
3007 :
3008 :
3009 406 : void AssertionNode::BacktrackIfPrevious(
3010 406 : RegExpCompiler* compiler,
3011 : Trace* trace,
3012 : AssertionNode::IfPrevious backtrack_if_previous) {
3013 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3014 406 : Trace new_trace(*trace);
3015 : new_trace.InvalidateCurrentCharacter();
3016 :
3017 406 : Label fall_through, dummy;
3018 :
3019 : Label* non_word = backtrack_if_previous == kIsNonWord ?
3020 194 : new_trace.backtrack() :
3021 406 : &fall_through;
3022 : Label* word = backtrack_if_previous == kIsNonWord ?
3023 : &fall_through :
3024 406 : new_trace.backtrack();
3025 :
3026 406 : if (new_trace.cp_offset() == 0) {
3027 : // The start of input counts as a non-word character, so the question is
3028 : // decided if we are at the start.
3029 169 : assembler->CheckAtStart(non_word);
3030 : }
3031 : // We already checked that we are not at the start of input so it must be
3032 : // OK to load the previous character.
3033 406 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3034 406 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3035 :
3036 406 : assembler->Bind(&fall_through);
3037 406 : on_success()->Emit(compiler, &new_trace);
3038 406 : }
3039 :
3040 :
3041 1935 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3042 : RegExpCompiler* compiler,
3043 : int filled_in,
3044 : bool not_at_start) {
3045 1935 : if (assertion_type_ == AT_START && not_at_start) {
3046 : details->set_cannot_match();
3047 : return;
3048 : }
3049 1604 : return on_success()->GetQuickCheckDetails(details,
3050 : compiler,
3051 : filled_in,
3052 1604 : not_at_start);
3053 : }
3054 :
3055 :
3056 16374 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3057 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3058 5714 : switch (assertion_type_) {
3059 : case AT_END: {
3060 2322 : Label ok;
3061 4644 : assembler->CheckPosition(trace->cp_offset(), &ok);
3062 4644 : assembler->GoTo(trace->backtrack());
3063 2322 : assembler->Bind(&ok);
3064 : break;
3065 : }
3066 : case AT_START: {
3067 3008 : if (trace->at_start() == Trace::FALSE_VALUE) {
3068 18 : assembler->GoTo(trace->backtrack());
3069 9 : return;
3070 : }
3071 2999 : if (trace->at_start() == Trace::UNKNOWN) {
3072 5998 : assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3073 2999 : Trace at_start_trace = *trace;
3074 : at_start_trace.set_at_start(Trace::TRUE_VALUE);
3075 5321 : on_success()->Emit(compiler, &at_start_trace);
3076 : return;
3077 : }
3078 : }
3079 : break;
3080 : case AFTER_NEWLINE:
3081 129 : EmitHat(compiler, on_success(), trace);
3082 129 : return;
3083 : case AT_BOUNDARY:
3084 : case AT_NON_BOUNDARY: {
3085 255 : EmitBoundaryCheck(compiler, trace);
3086 255 : return;
3087 : }
3088 : }
3089 2322 : on_success()->Emit(compiler, trace);
3090 : }
3091 :
3092 :
3093 2723814 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3094 2723814 : if (quick_check == nullptr) return false;
3095 2723814 : if (offset >= quick_check->characters()) return false;
3096 847971 : return quick_check->positions(offset)->determines_perfectly;
3097 : }
3098 :
3099 :
// Raises |*checked_up_to| to |index| when |index| is larger; never lowers it.
static void UpdateBoundsCheck(int index, int* checked_up_to) {
  if (*checked_up_to >= index) return;
  *checked_up_to = index;
}
3105 :
3106 :
3107 : // We call this repeatedly to generate code for each pass over the text node.
3108 : // The passes are in increasing order of difficulty because we hope one
3109 : // of the first passes will fail in which case we are saved the work of the
3110 : // later passes. For example, for the case-independent regexp /%[asdfghjkl]a/
3111 : // we will check the '%' in the first pass, the case independent 'a' in the
3112 : // second pass and the character class in the last pass.
3113 : //
3114 : // The passes are done from right to left, so for example to test for /bar/
3115 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3116 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3117 : // bounds check most of the time. In the example we only need to check for
3118 : // end-of-input when loading the putative 'r'.
3119 : //
3120 : // A slight complication involves the fact that the first character may already
3121 : // be fetched into a register by the previous node. In this case we want to
3122 : // do the test for that character first. We do this in separate passes. The
3123 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3124 : // pass has been performed then subsequent passes will have true in
3125 : // first_element_checked to indicate that that character does not need to be
3126 : // checked again.
3127 : //
3128 : // In addition to all this we are passed a Trace, which can
3129 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3130 : // object we can see details of any quick check that was already passed in
3131 : // order to get to the code we are now generating. The quick check can involve
3132 : // loading characters, which means we do not need to recheck the bounds
3133 : // up to the limit the quick check already checked. In addition the quick
3134 : // check can have involved a mask and compare operation which may simplify
3135 : // or obviate the need for further checks at some character positions.
//
// checked_up_to is an in/out parameter: it is read to decide whether a
// bounds check is needed at an offset and raised (via UpdateBoundsCheck)
// after code covering a higher offset has been emitted.
3136 5385758 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
3137 : TextEmitPassType pass,
3138 : bool preloaded,
3139 5620616 : Trace* trace,
3140 : bool first_element_checked,
3141 8691354 : int* checked_up_to) {
3142 2692879 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3143 : Isolate* isolate = assembler->isolate();
3144 : bool one_byte = compiler->one_byte();
3145 : Label* backtrack = trace->backtrack();
3146 2692879 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3147 2692879 : int element_count = elements()->length();
// When reading backward, all offsets are shifted back by the node's length.
3148 2692879 : int backward_offset = read_backward() ? -Length() : 0;
// In a preloaded pass the loop starts at index 0 and counts down, so only
// the first element is visited; otherwise elements are visited last-to-first.
3149 5620591 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3150 2927737 : TextElement elm = elements()->at(i);
3151 2927737 : int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3152 2927737 : if (elm.text_type() == TextElement::ATOM) {
3153 1785180 : if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3154 : Vector<const uc16> quarks = elm.atom()->data();
// Same start-index rule as the outer loop: a preloaded pass only looks at
// the first code unit of the atom.
3155 4582104 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
// Skip the very first character if an earlier preloaded pass handled it,
// and skip positions the quick check already decided perfectly.
3156 2565673 : if (first_element_checked && i == 0 && j == 0) continue;
3157 4971326 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3158 : EmitCharacterFunction* emit_function = nullptr;
3159 3368552 : uc16 quark = quarks[j];
3160 1684276 : if (elm.atom()->ignore_case()) {
3161 : // Everywhere else we assume that a non-Latin-1 character cannot match
3162 : // a Latin-1 character. Avoid the cases where this assumption is
3163 : // invalid by using the Latin1 equivalent instead.
3164 : quark = unibrow::Latin1::TryConvertToLatin1(quark);
3165 : }
// Select the emitter for this pass. NON_LATIN1_MATCH emits no per-character
// test: it only emits an unconditional backtrack (and stops) when a code
// unit lies above the one-byte range.
3166 1684276 : switch (pass) {
3167 : case NON_LATIN1_MATCH:
3168 : DCHECK(one_byte);
3169 518015 : if (quark > String::kMaxOneByteCharCode) {
3170 25 : assembler->GoTo(backtrack);
3171 2692879 : return;
3172 : }
3173 : break;
3174 : case NON_LETTER_CHARACTER_MATCH:
3175 : emit_function = &EmitAtomNonLetter;
3176 5503 : break;
3177 : case SIMPLE_CHARACTER_MATCH:
3178 : emit_function = &EmitSimpleCharacter;
3179 574876 : break;
3180 : case CASE_CHARACTER_MATCH:
3181 : emit_function = &EmitAtomLetter;
3182 5503 : break;
3183 : default:
3184 : break;
3185 : }
3186 1684251 : if (emit_function != nullptr) {
// A bounds check is requested when this offset lies beyond what has been
// checked so far, and always when reading backward.
3187 917148 : bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3188 : bool bound_checked =
3189 : emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3190 585882 : bounds_check, preloaded);
3191 585882 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3192 : }
3193 : }
3194 : } else {
3195 : DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
// Character classes are only handled in the CHARACTER_CLASS_MATCH pass.
3196 1142557 : if (pass == CHARACTER_CLASS_MATCH) {
3197 276705 : if (first_element_checked && i == 0) continue;
3198 238151 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3199 : RegExpCharacterClass* cc = elm.char_class();
3200 256992 : bool bounds_check = *checked_up_to < cp_offset || read_backward();
3201 : EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3202 210399 : bounds_check, preloaded, zone());
// Unlike the atom case, the checked limit is updated unconditionally here.
3203 : UpdateBoundsCheck(cp_offset, checked_up_to);
3204 : }
3205 : }
3206 : }
3207 : }
3208 :
3209 :
3210 6946989 : int TextNode::Length() {
3211 6946989 : TextElement elm = elements()->last();
3212 : DCHECK_LE(0, elm.cp_offset());
3213 6946989 : return elm.cp_offset() + elm.length();
3214 : }
3215 :
3216 0 : bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3217 1785180 : if (ignore_case) {
3218 44755 : return pass == SIMPLE_CHARACTER_MATCH;
3219 : } else {
3220 1740425 : return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3221 : }
3222 : }
3223 :
3224 7207 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3225 : ZoneList<CharacterRange>* ranges,
3226 : bool read_backward,
3227 : RegExpNode* on_success,
3228 : JSRegExp::Flags flags) {
3229 : DCHECK_NOT_NULL(ranges);
3230 7207 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3231 : elms->Add(TextElement::CharClass(
3232 21621 : new (zone) RegExpCharacterClass(zone, ranges, flags)),
3233 7207 : zone);
3234 7207 : return new (zone) TextNode(elms, read_backward, on_success);
3235 : }
3236 :
3237 27070 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3238 : CharacterRange trail,
3239 : bool read_backward,
3240 : RegExpNode* on_success,
3241 : JSRegExp::Flags flags) {
3242 27070 : ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3243 27070 : ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3244 27070 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3245 : elms->Add(TextElement::CharClass(
3246 81210 : new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3247 27070 : zone);
3248 : elms->Add(TextElement::CharClass(
3249 81210 : new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3250 27070 : zone);
3251 27070 : return new (zone) TextNode(elms, read_backward, on_success);
3252 : }
3253 :
3254 :
3255 : // This generates the code to match a text node. A text node can contain
3256 : // straight character sequences (possibly to be matched in a case-independent
3257 : // way) and character classes. For efficiency we do not do this in a single
3258 : // pass from left to right. Instead we pass over the text node several times,
3259 : // emitting code for some character positions every time. See the comment on
3260 : // TextEmitPass for details.
3261 4223669 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3262 617227 : LimitResult limit_result = LimitVersions(compiler, trace);
3263 719248 : if (limit_result == DONE) return;
3264 : DCHECK(limit_result == CONTINUE);
3265 :
3266 515206 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3267 : compiler->SetRegExpTooBig();
3268 : return;
3269 : }
3270 :
3271 515206 : if (compiler->one_byte()) {
3272 322615 : int dummy = 0;
3273 322615 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3274 : }
3275 :
3276 : bool first_elt_done = false;
3277 515206 : int bound_checked_to = trace->cp_offset() - 1;
3278 515206 : bound_checked_to += trace->bound_checked_up_to();
3279 :
3280 : // If a character is preloaded into the current character register then
3281 : // check that now.
3282 515206 : if (trace->characters_preloaded() == 1) {
3283 309440 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3284 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3285 309440 : false, &bound_checked_to);
3286 : }
3287 : first_elt_done = true;
3288 : }
3289 :
3290 2576030 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3291 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3292 2060824 : first_elt_done, &bound_checked_to);
3293 : }
3294 :
3295 515206 : Trace successor_trace(*trace);
3296 : // If we advance backward, we may end up at the start.
3297 : successor_trace.AdvanceCurrentPositionInTrace(
3298 515206 : read_backward() ? -Length() : Length(), compiler);
3299 : successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3300 515206 : : Trace::FALSE_VALUE);
3301 : RecursionCheck rc(compiler);
3302 515206 : on_success()->Emit(compiler, &successor_trace);
3303 : }
3304 :
3305 :
// Resets the count of preloaded characters recorded in this trace to zero,
// i.e. forgets that anything was loaded into the current character register.
3306 0 : void Trace::InvalidateCurrentCharacter() {
3307 227872 : characters_preloaded_ = 0;
3308 0 : }
3309 :
3310 :
3311 1030412 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3312 : // We don't have an instruction for shifting the current character register
3313 : // down or for using a shifted value for anything so lets just forget that
3314 : // we preloaded any characters into it.
3315 515206 : characters_preloaded_ = 0;
3316 : // Adjust the offsets of the quick check performed information. This
3317 : // information is used to find out what we already determined about the
3318 : // characters by means of mask and compare.
3319 515206 : quick_check_performed_.Advance(by, compiler->one_byte());
3320 515206 : cp_offset_ += by;
3321 515206 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3322 : compiler->SetRegExpTooBig();
3323 0 : cp_offset_ = 0;
3324 : }
3325 1030412 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3326 515206 : }
3327 :
3328 :
3329 688601 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3330 316020 : int element_count = elements()->length();
3331 688601 : for (int i = 0; i < element_count; i++) {
3332 372581 : TextElement elm = elements()->at(i);
3333 372581 : if (elm.text_type() == TextElement::CHAR_CLASS) {
3334 : RegExpCharacterClass* cc = elm.char_class();
3335 : #ifdef V8_INTL_SUPPORT
3336 : bool case_equivalents_already_added =
3337 : NeedsUnicodeCaseEquivalents(cc->flags());
3338 : #else
3339 : bool case_equivalents_already_added = false;
3340 : #endif
3341 237485 : if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3342 : // None of the standard character classes is different in the case
3343 : // independent case and it slows us down if we don't know that.
3344 135881 : if (cc->is_standard(zone())) continue;
3345 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3346 : CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3347 133800 : is_one_byte);
3348 : }
3349 : }
3350 : }
3351 316020 : }
3352 :
3353 :
// For a text node, the length it contributes to a greedy loop body is simply
// its Length().
3354 135037 : int TextNode::GreedyLoopTextLength() { return Length(); }
3355 :
3356 :
3357 85789 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3358 254856 : RegExpCompiler* compiler) {
3359 85789 : if (read_backward()) return nullptr;
3360 85664 : if (elements()->length() != 1) return nullptr;
3361 85302 : TextElement elm = elements()->at(0);
3362 85302 : if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3363 : RegExpCharacterClass* node = elm.char_class();
3364 167954 : ZoneList<CharacterRange>* ranges = node->ranges(zone());
3365 83977 : CharacterRange::Canonicalize(ranges);
3366 83977 : if (node->is_negated()) {
3367 82361 : return ranges->length() == 0 ? on_success() : nullptr;
3368 : }
3369 83863 : if (ranges->length() != 1) return nullptr;
3370 : uint32_t max_char;
3371 83403 : if (compiler->one_byte()) {
3372 : max_char = String::kMaxOneByteCharCode;
3373 : } else {
3374 : max_char = String::kMaxUtf16CodeUnit;
3375 : }
3376 250209 : return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3377 : }
3378 :
3379 :
3380 : // Finds the fixed match length of a sequence of nodes that goes from
3381 : // this alternative and back to this choice node. If there are variable
3382 : // length nodes or other complications in the way then return a sentinel
3383 : // value indicating that a greedy loop cannot be constructed.
3384 224986 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
3385 224986 : GuardedAlternative* alternative) {
3386 : int length = 0;
3387 : RegExpNode* node = alternative->node();
3388 : // Later we will generate code for all these text nodes using recursion
3389 : // so we have to limit the max number.
3390 : int recursion_depth = 0;
3391 585009 : while (node != this) {
3392 336829 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3393 : return kNodeIsTooComplexForGreedyLoops;
3394 : }
3395 336829 : int node_length = node->GreedyLoopTextLength();
3396 336829 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
3397 : return kNodeIsTooComplexForGreedyLoops;
3398 : }
3399 135037 : length += node_length;
3400 135037 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3401 : node = seq_node->on_success();
3402 : }
3403 23194 : return read_backward() ? -length : length;
3404 : }
3405 :
3406 :
// Registers |alt| as an alternative of this choice node and remembers its
// node as the loop node. May only be called while no loop node is set.
3407 0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3408 : DCHECK_NULL(loop_node_);
3409 : AddAlternative(alt);
3410 999129 : loop_node_ = alt.node();
3411 0 : }
3412 :
3413 :
// Registers |alt| as an alternative of this choice node and remembers its
// node as the continue node. May only be called while no continue node is
// set.
3414 0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3415 : DCHECK_NULL(continue_node_);
3416 : AddAlternative(alt);
3417 999129 : continue_node_ = alt.node();
3418 0 : }
3419 :
3420 :
3421 343454 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3422 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3423 331857 : if (trace->stop_node() == this) {
3424 : // Back edge of greedy optimized loop node graph.
3425 : int text_length =
3426 23194 : GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3427 : DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3428 : // Update the counter-based backtracking info on the stack. This is an
3429 : // optimization for greedy loops (see below).
3430 : DCHECK(trace->cp_offset() == text_length);
3431 11597 : macro_assembler->AdvanceCurrentPosition(text_length);
3432 23194 : macro_assembler->GoTo(trace->loop_label());
3433 11597 : return;
3434 : }
3435 : DCHECK_NULL(trace->stop_node());
3436 320260 : if (!trace->is_trivial()) {
3437 119035 : trace->Flush(compiler, this);
3438 119035 : return;
3439 : }
3440 201225 : ChoiceNode::Emit(compiler, trace);
3441 : }
3442 :
3443 :
3444 560792 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3445 : int eats_at_least) {
3446 : int preload_characters = Min(4, eats_at_least);
3447 : DCHECK_LE(preload_characters, 4);
3448 213389 : if (compiler->macro_assembler()->CanReadUnaligned()) {
3449 : bool one_byte = compiler->one_byte();
3450 134014 : if (one_byte) {
3451 : // We can't preload 3 characters because there is no machine instruction
3452 : // to do that. We can't just load 4 because we could be reading
3453 : // beyond the end of the string, which could cause a memory fault.
3454 106609 : if (preload_characters == 3) preload_characters = 2;
3455 : } else {
3456 27405 : if (preload_characters > 2) preload_characters = 2;
3457 : }
3458 : } else {
3459 79375 : if (preload_characters > 1) preload_characters = 1;
3460 : }
3461 213389 : return preload_characters;
3462 : }
3463 :
3464 :
3465 : // This class is used when generating the alternatives in a choice node. It
3466 : // records the way the alternative is being code generated.
3467 : class AlternativeGeneration: public Malloced {
3468 : public:
3469 : AlternativeGeneration()
3470 : : possible_success(),
3471 : expects_preload(false),
3472 : after(),
3473 2175817 : quick_check_details() { }
3474 : Label possible_success;
3475 : bool expects_preload;
3476 : Label after;
3477 : QuickCheckDetails quick_check_details;
3478 : };
3479 :
3480 :
3481 : // Creates a list of AlternativeGenerations. If the list has a reasonable
3482 : // size then it is on the stack, otherwise the excess is on the heap.
3483 : class AlternativeGenerationList {
3484 : public:
3485 213389 : AlternativeGenerationList(int count, Zone* zone)
3486 2347279 : : alt_gens_(count, zone) {
3487 572857 : for (int i = 0; i < count && i < kAFew; i++) {
3488 572857 : alt_gens_.Add(a_few_alt_gens_ + i, zone);
3489 : }
3490 41927 : for (int i = kAFew; i < count; i++) {
3491 41927 : alt_gens_.Add(new AlternativeGeneration(), zone);
3492 : }
3493 213389 : }
3494 213389 : ~AlternativeGenerationList() {
3495 510632 : for (int i = kAFew; i < alt_gens_.length(); i++) {
3496 381097 : delete alt_gens_[i];
3497 41927 : alt_gens_[i] = nullptr;
3498 : }
3499 213389 : }
3500 :
3501 : AlternativeGeneration* at(int i) {
3502 2837337 : return alt_gens_[i];
3503 : }
3504 :
3505 : private:
3506 : static const int kAFew = 10;
3507 : ZoneList<AlternativeGeneration*> alt_gens_;
3508 : AlternativeGeneration a_few_alt_gens_[kAFew];
3509 : };
3510 :
3511 :
3512 : static const uc32 kRangeEndMarker = 0x110000;
3513 :
3514 : // The '2' variant is has inclusive from and exclusive to.
3515 : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3516 : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3517 : static const int kSpaceRanges[] = {
3518 : '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3519 : 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3520 : 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3521 : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3522 :
3523 : static const int kWordRanges[] = {
3524 : '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3525 : static const int kWordRangeCount = arraysize(kWordRanges);
3526 : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3527 : static const int kDigitRangeCount = arraysize(kDigitRanges);
3528 : static const int kSurrogateRanges[] = {
3529 : kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3530 : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3531 : static const int kLineTerminatorRanges[] = {
3532 : 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3533 : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3534 :
// Records a single character by forwarding to SetInterval with the one-element
// interval [character, character].
3535 0 : void BoyerMoorePositionInfo::Set(int character) {
3536 79581 : SetInterval(Interval(character, character));
3537 0 : }
3538 :
3539 :
3540 1202668 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3541 241225 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3542 241225 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3543 241225 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3544 : surrogate_ =
3545 241225 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3546 241225 : if (interval.to() - interval.from() >= kMapSize - 1) {
3547 13515 : if (map_count_ != kMapSize) {
3548 6264 : map_count_ = kMapSize;
3549 808056 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3550 : }
3551 : return;
3552 : }
3553 1212726 : for (int i = interval.from(); i <= interval.to(); i++) {
3554 536676 : int mod_character = (i & kMask);
3555 1073352 : if (!map_->at(mod_character)) {
3556 371881 : map_count_++;
3557 371881 : map_->at(mod_character) = true;
3558 : }
3559 536676 : if (map_count_ == kMapSize) return;
3560 : }
3561 : }
3562 :
3563 :
3564 0 : void BoyerMoorePositionInfo::SetAll() {
3565 5457 : s_ = w_ = d_ = kLatticeUnknown;
3566 5457 : if (map_count_ != kMapSize) {
3567 5027 : map_count_ = kMapSize;
3568 1286912 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3569 : }
3570 0 : }
3571 :
3572 :
3573 80680 : BoyerMooreLookahead::BoyerMooreLookahead(
3574 80680 : int length, RegExpCompiler* compiler, Zone* zone)
3575 : : length_(length),
3576 80680 : compiler_(compiler) {
3577 80680 : if (compiler->one_byte()) {
3578 10066 : max_char_ = String::kMaxOneByteCharCode;
3579 : } else {
3580 70614 : max_char_ = String::kMaxUtf16CodeUnit;
3581 : }
3582 80680 : bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3583 179931 : for (int i = 0; i < length; i++) {
3584 99251 : bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3585 : }
3586 80680 : }
3587 :
3588 :
3589 : // Find the longest range of lookahead that has the fewest number of different
3590 : // characters that can occur at a given position. Since we are optimizing two
3591 : // different parameters at once this is a tradeoff.
3592 80583 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3593 : int biggest_points = 0;
3594 : // If more than 32 characters out of 128 can occur it is unlikely that we can
3595 : // be lucky enough to step forwards much of the time.
3596 : const int kMaxMax = 32;
3597 322332 : for (int max_number_of_chars = 4;
3598 : max_number_of_chars < kMaxMax;
3599 : max_number_of_chars *= 2) {
3600 : biggest_points =
3601 241749 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
3602 : }
3603 80583 : if (biggest_points == 0) return false;
3604 5455 : return true;
3605 : }
3606 :
3607 :
3608 : // Find the highest-points range between 0 and length_ where the character
3609 : // information is not too vague. 'Too vague' means that there are more than
3610 : // max_number_of_chars that can occur at this position. Calculates the number
3611 : // of points as the product of width-of-the-range and
3612 : // probability-of-finding-one-of-the-characters, where the probability is
3613 : // calculated using the frequency distribution of the sample subject string.
3614 241749 : int BoyerMooreLookahead::FindBestInterval(
3615 541035 : int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3616 : int biggest_points = old_biggest_points;
3617 : static const int kSize = RegExpMacroAssembler::kTableSize;
3618 712482 : for (int i = 0; i < length_; ) {
3619 311098 : while (i < length_ && Count(i) > max_number_of_chars) i++;
3620 256584 : if (i == length_) break;
3621 : int remembered_from = i;
3622 : bool union_map[kSize];
3623 29309952 : for (int j = 0; j < kSize; j++) union_map[j] = false;
3624 756137 : while (i < length_ && Count(i) <= max_number_of_chars) {
3625 33300670 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3626 33044511 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3627 256159 : i++;
3628 : }
3629 : int frequency = 0;
3630 29309952 : for (int j = 0; j < kSize; j++) {
3631 29309952 : if (union_map[j]) {
3632 : // Add 1 to the frequency to give a small per-character boost for
3633 : // the cases where our sampling is not good enough and many
3634 : // characters have a frequency of zero. This means the frequency
3635 : // can theoretically be up to 2*kSize though we treat it mostly as
3636 : // a fraction of kSize.
3637 976200 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3638 : }
3639 : }
3640 : // We use the probability of skipping times the distance we are skipping to
3641 : // judge the effectiveness of this. Actually we have a cut-off: By
3642 : // dividing by 2 we switch off the skipping if the probability of skipping
3643 : // is less than 50%. This is because the multibyte mask-and-compare
3644 : // skipping in quickcheck is more likely to do well on this case.
3645 : bool in_quickcheck_range =
3646 231740 : ((i - remembered_from < 4) ||
3647 2756 : (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3648 : // Called 'probability' but it is only a rough estimate and can actually
3649 : // be outside the 0-kSize range.
3650 228984 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3651 228984 : int points = (i - remembered_from) * probability;
3652 228984 : if (points > biggest_points) {
3653 5848 : *from = remembered_from;
3654 5848 : *to = i - 1;
3655 : biggest_points = points;
3656 : }
3657 : }
3658 241749 : return biggest_points;
3659 : }
3660 :
3661 :
3662 : // Take all the characters that will not prevent a successful match if they
3663 : // occur in the subject string in the range between min_lookahead and
3664 : // max_lookahead (inclusive) measured from the current position. If the
3665 : // character at max_lookahead offset is not one of these characters, then we
3666 : // can safely skip forwards by the number of characters in the range.
3667 4443 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3668 : int max_lookahead,
3669 : Handle<ByteArray> boolean_skip_table) {
3670 : const int kSize = RegExpMacroAssembler::kTableSize;
3671 :
3672 : const int kSkipArrayEntry = 0;
3673 : const int kDontSkipArrayEntry = 1;
3674 :
3675 573147 : for (int i = 0; i < kSize; i++) {
3676 : boolean_skip_table->set(i, kSkipArrayEntry);
3677 : }
3678 4443 : int skip = max_lookahead + 1 - min_lookahead;
3679 :
3680 13934 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3681 1233830 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3682 1224339 : for (int j = 0; j < kSize; j++) {
3683 1214848 : if (map->at(j)) {
3684 : boolean_skip_table->set(j, kDontSkipArrayEntry);
3685 : }
3686 : }
3687 : }
3688 :
3689 4443 : return skip;
3690 : }
3691 :
3692 :
// Emits a skip loop in front of the match code: load the character at
// max_lookahead and, while it cannot be part of a match, advance the current
// position and try again. Emits nothing when FindWorthwhileInterval finds no
// interval, or when a single-position, single-character case near the start
// is left to the mask-compare instead.
3693 : // See comment above on the implementation of GetSkipTable.
3694 85026 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3695 : const int kSize = RegExpMacroAssembler::kTableSize;
3696 :
3697 80583 : int min_lookahead = 0;
3698 80583 : int max_lookahead = 0;
3699 :
3700 156723 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3701 :
// Check whether the whole interval boils down to one map slot: at most one
// position may have a character set, and that position may have only one.
3702 : bool found_single_character = false;
3703 : int single_character = 0;
3704 9805 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3705 421510 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3706 17586 : if (map->map_count() > 1 ||
3707 3182 : (found_single_character && map->map_count() != 0)) {
3708 : found_single_character = false;
3709 : break;
3710 : }
// Remember the first (and, given the check above, only) slot that is set.
3711 399650 : for (int j = 0; j < kSize; j++) {
3712 403924 : if (map->at(j)) {
3713 : found_single_character = true;
3714 : single_character = j;
3715 : break;
3716 : }
3717 : }
3718 : }
3719 :
3720 5455 : int lookahead_width = max_lookahead + 1 - min_lookahead;
3721 :
3722 5455 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3723 : // The mask-compare can probably handle this better.
3724 : return;
3725 : }
3726 :
// Single-character case: compare directly instead of building a table. When
// the subject's maximum character exceeds the table size, compare under
// kTableMask, since single_character is a map slot rather than a full code.
3727 4539 : if (found_single_character) {
3728 96 : Label cont, again;
3729 96 : masm->Bind(&again);
3730 96 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3731 96 : if (max_char_ > kSize) {
3732 : masm->CheckCharacterAfterAnd(single_character,
3733 : RegExpMacroAssembler::kTableMask,
3734 96 : &cont);
3735 : } else {
3736 0 : masm->CheckCharacter(single_character, &cont);
3737 : }
3738 96 : masm->AdvanceCurrentPosition(lookahead_width);
3739 96 : masm->GoTo(&again);
3740 96 : masm->Bind(&cont);
3741 : return;
3742 : }
3743 :
// General case: build a kSize-entry table via GetSkipTable and loop, testing
// the loaded character against it.
3744 : Factory* factory = masm->isolate()->factory();
3745 4443 : Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3746 : int skip_distance = GetSkipTable(
3747 4443 : min_lookahead, max_lookahead, boolean_skip_table);
3748 : DCHECK_NE(0, skip_distance);
3749 :
3750 4443 : Label cont, again;
3751 4443 : masm->Bind(&again);
3752 4443 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3753 4443 : masm->CheckBitInTable(boolean_skip_table, &cont);
3754 4443 : masm->AdvanceCurrentPosition(skip_distance);
3755 4443 : masm->GoTo(&again);
3756 4443 : masm->Bind(&cont);
3757 : }
3758 :
3759 :
3760 : /* Code generation for choice nodes.
3761 : *
3762 : * We generate quick checks that do a mask and compare to eliminate a
3763 : * choice. If the quick check succeeds then it jumps to the continuation to
3764 : * do slow checks and check subsequent nodes. If it fails (the common case)
3765 : * it falls through to the next choice.
3766 : *
3767 : * Here is the desired flow graph. Nodes directly below each other imply
3768 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3769 : * 3 doesn't have a quick check so we have to call the slow check.
3770 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3771 : * regexp continuation is generated directly after the Sn node, up to the
3772 : * next GoTo if we decide to reuse some already generated code. Some
3773 : * nodes expect preload_characters to be preloaded into the current
3774 : * character register. R nodes do this preloading. Vertices are marked
3775 : * F for failures and S for success (possible success in the case of quick
3776 : * nodes). L, V, < and > are used as arrow heads.
3777 : *
3778 : * ----------> R
3779 : * |
3780 : * V
3781 : * Q1 -----> S1
3782 : * | S /
3783 : * F| /
3784 : * | F/
3785 : * | /
3786 : * | R
3787 : * | /
3788 : * V L
3789 : * Q2 -----> S2
3790 : * | S /
3791 : * F| /
3792 : * | F/
3793 : * | /
3794 : * | R
3795 : * | /
3796 : * V L
3797 : * S3
3798 : * |
3799 : * F|
3800 : * |
3801 : * R
3802 : * |
3803 : * backtrack V
3804 : * <----------Q4
3805 : * \ F |
3806 : * \ |S
3807 : * \ F V
3808 : * \-----S4
3809 : *
3810 : * For greedy loops we push the current position, then generate the code that
3811 : * eats the input specially in EmitGreedyLoop. The other choice (the
3812 : * continuation) is generated by the normal code in EmitChoices, and steps back
3813 : * in the input to the starting position when it fails to match. The loop code
3814 : * looks like this (U is the unwind code that steps back in the greedy loop).
3815 : *
3816 : * _____
3817 : * / \
3818 : * V |
3819 : * ----------> S1 |
3820 : * /| |
3821 : * / |S |
3822 : * F/ \_____/
3823 : * /
3824 : * |<-----
3825 : * | \
3826 : * V |S
3827 : * Q2 ---> U----->backtrack
3828 : * | F /
3829 : * S| /
3830 : * V F /
3831 : * S2--/
3832 : */
3833 :
// Points the counter backtrack trace at this state's own label, and marks the
// trace as not at the start of input when |not_at_start| is set.
3834 213389 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
3835 0 : counter_backtrack_trace_.set_backtrack(&label_);
3836 213389 : if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3837 0 : }
3838 :
3839 :
3840 0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3841 : #ifdef DEBUG
3842 : int choice_count = alternatives_->length();
3843 : for (int i = 0; i < choice_count - 1; i++) {
3844 : GuardedAlternative alternative = alternatives_->at(i);
3845 : ZoneList<Guard*>* guards = alternative.guards();
3846 : int guard_count = (guards == nullptr) ? 0 : guards->length();
3847 : for (int j = 0; j < guard_count; j++) {
3848 : DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3849 : }
3850 : }
3851 : #endif
3852 0 : }
3853 :
3854 :
3855 344685 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3856 344685 : Trace* current_trace,
3857 : PreloadState* state) {
3858 213389 : if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3859 : // Save some time by looking at most one machine word ahead.
3860 : state->eats_at_least_ =
3861 : EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3862 393888 : current_trace->at_start() == Trace::FALSE_VALUE);
3863 : }
3864 : state->preload_characters_ =
3865 213389 : CalculatePreloadCharacters(compiler, state->eats_at_least_);
3866 :
3867 : state->preload_is_current_ =
3868 213389 : (current_trace->characters_preloaded() == state->preload_characters_);
3869 213389 : state->preload_has_checked_bounds_ = state->preload_is_current_;
3870 213389 : }
3871 :
3872 :
// Emits code for a choice node. In order:
//   1. A single unguarded alternative is emitted directly.
//   2. LimitVersions may decide that nothing more needs emitting here.
//   3. With no flush budget left and pending actions, the trace is flushed.
//   4. The alternatives are emitted, either through EmitGreedyLoop (when the
//      first alternative has a fixed text length and there is more than one
//      choice) or through EmitChoices.
//   5. An out-of-line continuation is emitted for every alternative.
3873 1427817 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3874 582938 : int choice_count = alternatives_->length();
3875 :
3876 584276 : if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3877 1338 : alternatives_->at(0).node()->Emit(compiler, trace);
3878 1338 : return;
3879 : }
3880 :
3881 : AssertGuardsMentionRegisters(trace);
3882 :
3883 794989 : LimitResult limit_result = LimitVersions(compiler, trace);
3884 581600 : if (limit_result == DONE) return;
3885 : DCHECK(limit_result == CONTINUE);
3886 :
3887 : // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3888 : // other choice nodes we only flush if we are out of code size budget.
3889 216309 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3890 1460 : trace->Flush(compiler, this);
3891 1460 : return;
3892 : }
3893 :
3894 : RecursionCheck rc(compiler);
3895 :
3896 : PreloadState preload;
3897 : preload.init();
3898 : GreedyLoopState greedy_loop_state(not_at_start());
3899 :
3900 426778 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3901 426778 : AlternativeGenerationList alt_gens(choice_count, zone());
3902 :
// Note that the greedy path replaces |trace| with the trace returned by
// EmitGreedyLoop; everything below uses that replacement.
3903 213389 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3904 : trace = EmitGreedyLoop(compiler,
3905 : trace,
3906 : &alt_gens,
3907 : &preload,
3908 : &greedy_loop_state,
3909 11597 : text_length);
3910 : } else {
3911 : // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3912 : // match the traces produced pre-cleanup.
3913 201792 : Label second_choice;
3914 201792 : compiler->macro_assembler()->Bind(&second_choice);
3915 :
3916 201792 : preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3917 :
3918 : EmitChoices(compiler,
3919 : &alt_gens,
3920 : 0,
3921 : trace,
3922 201792 : &preload);
3923 : }
3924 :
3925 : // At this point we need to generate slow checks for the alternatives where
3926 : // the quick check was inlined. We can recognize these because the associated
3927 : // label was bound.
3928 213389 : int new_flush_budget = trace->flush_budget() / choice_count;
3929 828173 : for (int i = 0; i < choice_count; i++) {
3930 : AlternativeGeneration* alt_gen = alt_gens.at(i);
3931 614784 : Trace new_trace(*trace);
3932 : // If there are actions to be flushed we have to limit how many times
3933 : // they are flushed. Take the budget of the parent trace and distribute
3934 : // it fairly amongst the children.
3935 614784 : if (new_trace.actions() != nullptr) {
3936 : new_trace.set_flush_budget(new_flush_budget);
3937 : }
// The last alternative has no successor, so nothing expects a preload after
// it.
3938 : bool next_expects_preload =
3939 1016179 : i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3940 : EmitOutOfLineContinuation(compiler,
3941 : &new_trace,
3942 614784 : alternatives_->at(i),
3943 : alt_gen,
3944 : preload.preload_characters_,
3945 1229568 : next_expects_preload);
3946 : }
3947 : }
3948 :
3949 :
// Emits the specialised code for a greedy loop whose body has the fixed
// length |text_length|. Pushes the current position, emits the loop body
// (alternative 0) with this node as the trace's stop node, emits the
// remaining alternatives through EmitChoices starting at index 1, and finally
// emits the unwind code that steps back by |text_length| and retries them.
// Returns the counter backtrack trace owned by |greedy_loop_state|.
3950 11597 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3951 11597 : Trace* trace,
3952 : AlternativeGenerationList* alt_gens,
3953 : PreloadState* preload,
3954 : GreedyLoopState* greedy_loop_state,
3955 11597 : int text_length) {
3956 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3957 : // Here we have special handling for greedy loops containing only text nodes
3958 : // and other simple nodes. These are handled by pushing the current
3959 : // position on the stack and then incrementing the current position each
3960 : // time around the switch. On backtrack we decrement the current position
3961 : // and check it against the pushed value. This avoids pushing backtrack
3962 : // information for each iteration of the loop, which could take up a lot of
3963 : // space.
3964 : DCHECK(trace->stop_node() == nullptr);
3965 11597 : macro_assembler->PushCurrentPosition();
3966 11597 : Label greedy_match_failed;
3967 11597 : Trace greedy_match_trace;
3968 11597 : if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3969 : greedy_match_trace.set_backtrack(&greedy_match_failed);
3970 11597 : Label loop_label;
3971 11597 : macro_assembler->Bind(&loop_label);
// Setting the stop node makes LoopChoiceNode::Emit treat the next visit of
// this node as the loop's back edge and jump to loop_label.
3972 11597 : greedy_match_trace.set_stop_node(this);
3973 : greedy_match_trace.set_loop_label(&loop_label);
3974 23194 : alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3975 11597 : macro_assembler->Bind(&greedy_match_failed);
3976 :
3977 11597 : Label second_choice; // For use in greedy matches.
3978 11597 : macro_assembler->Bind(&second_choice);
3979 :
3980 11597 : Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3981 :
3982 : EmitChoices(compiler,
3983 : alt_gens,
3984 : 1,
3985 : new_trace,
3986 11597 : preload);
3987 :
// Unwind code, bound to the greedy loop state's label.
3988 11597 : macro_assembler->Bind(greedy_loop_state->label());
3989 : // If we have unwound to the bottom then backtrack.
3990 23194 : macro_assembler->CheckGreedyLoop(trace->backtrack());
3991 : // Otherwise try the second priority at an earlier position.
3992 11597 : macro_assembler->AdvanceCurrentPosition(-text_length);
3993 11597 : macro_assembler->GoTo(&second_choice);
3994 11597 : return new_trace;
3995 : }
3996 :
3997 283885 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
3998 : Trace* trace) {
3999 : int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4000 201792 : if (alternatives_->length() != 2) return eats_at_least;
4001 :
4002 165679 : GuardedAlternative alt1 = alternatives_->at(1);
4003 165679 : if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4004 : return eats_at_least;
4005 : }
4006 : RegExpNode* eats_anything_node = alt1.node();
4007 244843 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4008 : return eats_at_least;
4009 : }
4010 :
4011 : // Really we should be creating a new trace when we execute this function,
4012 : // but there is no need, because the code it generates cannot backtrack, and
4013 : // we always arrive here with a trivial trace (since it's the entry to a
4014 : // loop. That also implies that there are no preloaded characters, which is
4015 : // good, because it means we won't be violating any assumptions by
4016 : // overwriting those characters with new load instructions.
4017 : DCHECK(trace->is_trivial());
4018 :
4019 82093 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4020 : Isolate* isolate = macro_assembler->isolate();
4021 : // At this point we know that we are at a non-greedy loop that will eat
4022 : // any character one at a time. Any non-anchored regexp has such a
4023 : // loop prepended to it in order to find where it starts. We look for
4024 : // a pattern of the form ...abc... where we can look 6 characters ahead
4025 : // and step forwards 3 if the character is not one of abc. Abc need
4026 : // not be atoms, they can be any reasonably limited character class or
4027 : // small alternation.
4028 : BoyerMooreLookahead* bm = bm_info(false);
4029 82093 : if (bm == nullptr) {
4030 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4031 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4032 : kRecursionBudget,
4033 82093 : false));
4034 82093 : if (eats_at_least >= 1) {
4035 : bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4036 : compiler,
4037 80583 : zone());
4038 161166 : GuardedAlternative alt0 = alternatives_->at(0);
4039 80583 : alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4040 : }
4041 : }
4042 82093 : if (bm != nullptr) {
4043 80583 : bm->EmitSkipInstructions(macro_assembler);
4044 : }
4045 82093 : return eats_at_least;
4046 : }
4047 :
4048 :
// Emits code for the alternatives of this choice node, from index
// |first_choice| up to and including the last one, in order.
//
// For every alternative a copy of |trace| is specialised (number of preloaded
// characters, bounds-check state, at-start knowledge, backtrack target) and
// then exactly one of three things happens:
//   - a quick check is emitted by EmitQuickCheck;
//   - the alternative is skipped because its quick check details report that
//     it cannot match;
//   - the alternative's full code is emitted inline.
// |preload| is updated while code is emitted so that later alternatives know
// whether the preloaded characters are still current and bounds-checked.
// Per-alternative results (labels, quick check details, expects_preload) are
// stored in the matching entry of |alt_gens|.
4049 816576 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4050 : AlternativeGenerationList* alt_gens,
4051 : int first_choice,
4052 213411 : Trace* trace,
4053 : PreloadState* preload) {
4054 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4055 213389 : SetUpPreLoad(compiler, trace, preload);
4056 :
4057 : // For now we just call all choices one after the other. The idea ultimately
4058 : // is to use the Dispatch table to try only the relevant ones.
4059 213389 : int choice_count = alternatives_->length();
4060 :
// The flush budget is divided by the total number of alternatives of this
// node, not by the number emitted in this call.
4061 213389 : int new_flush_budget = trace->flush_budget() / choice_count;
4062 :
4063 816576 : for (int i = first_choice; i < choice_count; i++) {
4064 603187 : bool is_last = i == choice_count - 1;
4065 603187 : bool fall_through_on_failure = !is_last;
4066 1206374 : GuardedAlternative alternative = alternatives_->at(i);
4067 : AlternativeGeneration* alt_gen = alt_gens->at(i);
4068 981759 : alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4069 3901 : ZoneList<Guard*>* guards = alternative.guards();
4070 603187 : int guard_count = (guards == nullptr) ? 0 : guards->length();
// Start from a copy of the incoming trace and describe what is known about
// the current character(s) at the point this alternative is entered.
4071 603187 : Trace new_trace(*trace);
4072 : new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4073 : preload->preload_characters_ :
4074 603187 : 0);
4075 603187 : if (preload->preload_has_checked_bounds_) {
4076 397875 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4077 : }
4078 : new_trace.quick_check_performed()->Clear();
4079 603187 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
// Every alternative except the last fails over to the code bound at its own
// |after| label; the last one keeps the backtrack target copied from |trace|.
4080 603187 : if (!is_last) {
4081 389798 : new_trace.set_backtrack(&alt_gen->after);
4082 : }
4083 603187 : alt_gen->expects_preload = preload->preload_is_current_;
4084 : bool generate_full_check_inline = false;
// EmitQuickCheck is only called when the two conditions before it hold, so
// whatever it emits or records is conditional on them (short-circuit &&).
4085 1079491 : if (compiler->optimize() &&
4086 1076689 : try_to_emit_quick_check_for_alternative(i == 0) &&
4087 : alternative.node()->EmitQuickCheck(
4088 : compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4089 : &alt_gen->possible_success, &alt_gen->quick_check_details,
4090 473502 : fall_through_on_failure)) {
4091 : // Quick check was generated for this choice.
4092 224615 : preload->preload_is_current_ = true;
4093 224615 : preload->preload_has_checked_bounds_ = true;
4094 : // If we generated the quick check to fall through on possible success,
4095 : // we now need to generate the full check inline.
4096 224615 : if (!fall_through_on_failure) {
4097 34327 : macro_assembler->Bind(&alt_gen->possible_success);
4098 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4099 34327 : new_trace.set_characters_preloaded(preload->preload_characters_);
4100 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4101 : generate_full_check_inline = true;
4102 : }
4103 378572 : } else if (alt_gen->quick_check_details.cannot_match()) {
// This alternative can never match.  If it is the last one, control must go
// straight to the incoming backtrack target; otherwise nothing is emitted.
// Note that |alt_gen->after| is not bound on this path because of the
// continue.
4104 110 : if (!fall_through_on_failure) {
4105 44 : macro_assembler->GoTo(trace->backtrack());
4106 : }
4107 110 : continue;
4108 : } else {
4109 : // No quick check was generated. Put the full code here.
4110 : // If this is not the first choice then there could be slow checks from
4111 : // previous cases that go here when they fail. There's no reason to
4112 : // insist that they preload characters since the slow check we are about
4113 : // to generate probably can't use it.
4114 378462 : if (i != first_choice) {
4115 227337 : alt_gen->expects_preload = false;
4116 : new_trace.InvalidateCurrentCharacter();
4117 : }
4118 : generate_full_check_inline = true;
4119 : }
4120 603077 : if (generate_full_check_inline) {
// The reduced flush budget is only applied when the trace carries actions.
4121 412789 : if (new_trace.actions() != nullptr) {
4122 : new_trace.set_flush_budget(new_flush_budget);
4123 : }
4124 2475 : for (int j = 0; j < guard_count; j++) {
4125 2475 : GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4126 : }
4127 412789 : alternative.node()->Emit(compiler, &new_trace);
// After the full code has been emitted the preloaded characters are no
// longer treated as current for the next alternative.
4128 412789 : preload->preload_is_current_ = false;
4129 : }
4130 603077 : macro_assembler->Bind(&alt_gen->after);
4131 : }
4132 213389 : }
4133 :
4134 :
4135 805072 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4136 160860 : Trace* trace,
4137 : GuardedAlternative alternative,
4138 : AlternativeGeneration* alt_gen,
4139 : int preload_characters,
4140 : bool next_expects_preload) {
4141 1039280 : if (!alt_gen->possible_success.is_linked()) return;
4142 :
4143 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4144 190288 : macro_assembler->Bind(&alt_gen->possible_success);
4145 190288 : Trace out_of_line_trace(*trace);
4146 : out_of_line_trace.set_characters_preloaded(preload_characters);
4147 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4148 190288 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4149 191714 : ZoneList<Guard*>* guards = alternative.guards();
4150 190288 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4151 190288 : if (next_expects_preload) {
4152 160860 : Label reload_current_char;
4153 : out_of_line_trace.set_backtrack(&reload_current_char);
4154 162210 : for (int j = 0; j < guard_count; j++) {
4155 1350 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4156 : }
4157 160860 : alternative.node()->Emit(compiler, &out_of_line_trace);
4158 160860 : macro_assembler->Bind(&reload_current_char);
4159 : // Reload the current character, since the next quick check expects that.
4160 : // We don't need to check bounds here because we only get into this
4161 : // code through a quick check which already did the checked load.
4162 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4163 321720 : preload_characters);
4164 160860 : macro_assembler->GoTo(&(alt_gen->after));
4165 : } else {
4166 29428 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4167 29504 : for (int j = 0; j < guard_count; j++) {
4168 76 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4169 : }
4170 29428 : alternative.node()->Emit(compiler, &out_of_line_trace);
4171 : }
4172 : }
4173 :
4174 :
4175 494222 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4176 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4177 493354 : LimitResult limit_result = LimitVersions(compiler, trace);
4178 493354 : if (limit_result == DONE) return;
4179 : DCHECK(limit_result == CONTINUE);
4180 :
4181 : RecursionCheck rc(compiler);
4182 :
4183 265188 : switch (action_type_) {
4184 : case STORE_POSITION: {
4185 : Trace::DeferredCapture
4186 : new_capture(data_.u_position_register.reg,
4187 : data_.u_position_register.is_capture,
4188 240962 : trace);
4189 240962 : Trace new_trace = *trace;
4190 : new_trace.add_action(&new_capture);
4191 256838 : on_success()->Emit(compiler, &new_trace);
4192 : break;
4193 : }
4194 : case INCREMENT_REGISTER: {
4195 : Trace::DeferredIncrementRegister
4196 3709 : new_increment(data_.u_increment_register.reg);
4197 3709 : Trace new_trace = *trace;
4198 : new_trace.add_action(&new_increment);
4199 3709 : on_success()->Emit(compiler, &new_trace);
4200 : break;
4201 : }
4202 : case SET_REGISTER: {
4203 : Trace::DeferredSetRegister
4204 3430 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4205 3430 : Trace new_trace = *trace;
4206 : new_trace.add_action(&new_set);
4207 3430 : on_success()->Emit(compiler, &new_trace);
4208 : break;
4209 : }
4210 : case CLEAR_CAPTURES: {
4211 : Trace::DeferredClearCaptures
4212 : new_capture(Interval(data_.u_clear_captures.range_from,
4213 2154 : data_.u_clear_captures.range_to));
4214 2154 : Trace new_trace = *trace;
4215 : new_trace.add_action(&new_capture);
4216 2154 : on_success()->Emit(compiler, &new_trace);
4217 : break;
4218 : }
4219 : case BEGIN_SUBMATCH:
4220 9468 : if (!trace->is_trivial()) {
4221 5036 : trace->Flush(compiler, this);
4222 : } else {
4223 : assembler->WriteCurrentPositionToRegister(
4224 4432 : data_.u_submatch.current_position_register, 0);
4225 : assembler->WriteStackPointerToRegister(
4226 4432 : data_.u_submatch.stack_pointer_register);
4227 4432 : on_success()->Emit(compiler, trace);
4228 : }
4229 : break;
4230 : case EMPTY_MATCH_CHECK: {
4231 909 : int start_pos_reg = data_.u_empty_match_check.start_register;
4232 909 : int stored_pos = 0;
4233 909 : int rep_reg = data_.u_empty_match_check.repetition_register;
4234 909 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4235 909 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4236 1088 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4237 : // If we know we haven't advanced and there is no minimum we
4238 : // can just backtrack immediately.
4239 152 : assembler->GoTo(trace->backtrack());
4240 1167 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4241 : // If we know we've advanced we can generate the continuation
4242 : // immediately.
4243 247 : on_success()->Emit(compiler, trace);
4244 586 : } else if (!trace->is_trivial()) {
4245 307 : trace->Flush(compiler, this);
4246 : } else {
4247 279 : Label skip_empty_check;
4248 : // If we have a minimum number of repetitions we check the current
4249 : // number first and skip the empty check if it's not enough.
4250 279 : if (has_minimum) {
4251 206 : int limit = data_.u_empty_match_check.repetition_limit;
4252 206 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4253 : }
4254 : // If the match is empty we bail out, otherwise we fall through
4255 : // to the on-success continuation.
4256 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4257 558 : trace->backtrack());
4258 279 : assembler->Bind(&skip_empty_check);
4259 279 : on_success()->Emit(compiler, trace);
4260 : }
4261 : break;
4262 : }
4263 : case POSITIVE_SUBMATCH_SUCCESS: {
4264 4556 : if (!trace->is_trivial()) {
4265 2931 : trace->Flush(compiler, this);
4266 2931 : return;
4267 : }
4268 : assembler->ReadCurrentPositionFromRegister(
4269 1625 : data_.u_submatch.current_position_register);
4270 : assembler->ReadStackPointerFromRegister(
4271 1625 : data_.u_submatch.stack_pointer_register);
4272 1625 : int clear_register_count = data_.u_submatch.clear_register_count;
4273 1625 : if (clear_register_count == 0) {
4274 1142 : on_success()->Emit(compiler, trace);
4275 1142 : return;
4276 : }
4277 483 : int clear_registers_from = data_.u_submatch.clear_register_from;
4278 483 : Label clear_registers_backtrack;
4279 483 : Trace new_trace = *trace;
4280 : new_trace.set_backtrack(&clear_registers_backtrack);
4281 483 : on_success()->Emit(compiler, &new_trace);
4282 :
4283 483 : assembler->Bind(&clear_registers_backtrack);
4284 483 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4285 483 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4286 :
4287 : DCHECK(trace->backtrack() == nullptr);
4288 483 : assembler->Backtrack();
4289 483 : return;
4290 : }
4291 : default:
4292 0 : UNREACHABLE();
4293 : }
4294 : }
4295 :
4296 :
4297 11249 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4298 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4299 4743 : if (!trace->is_trivial()) {
4300 2253 : trace->Flush(compiler, this);
4301 2253 : return;
4302 : }
4303 :
4304 2490 : LimitResult limit_result = LimitVersions(compiler, trace);
4305 2490 : if (limit_result == DONE) return;
4306 : DCHECK(limit_result == CONTINUE);
4307 :
4308 : RecursionCheck rc(compiler);
4309 :
4310 : DCHECK_EQ(start_reg_ + 1, end_reg_);
4311 2290 : if (IgnoreCase(flags_)) {
4312 : assembler->CheckNotBackReferenceIgnoreCase(
4313 5043 : start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4314 : } else {
4315 : assembler->CheckNotBackReference(start_reg_, read_backward(),
4316 1218 : trace->backtrack());
4317 : }
4318 : // We are going to advance backward, so we may end up at the start.
4319 2290 : if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4320 :
4321 : // Check that the back reference does not end inside a surrogate pair.
4322 2455 : if (IsUnicode(flags_) && !compiler->one_byte()) {
4323 80 : assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4324 : }
4325 2290 : on_success()->Emit(compiler, trace);
4326 : }
4327 :
4328 :
4329 : // -------------------------------------------------------------------
4330 : // Dot/dotty output
4331 :
4332 :
4333 : #ifdef DEBUG
4334 :
4335 :
4336 : class DotPrinter: public NodeVisitor {
4337 : public:
4338 : DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4339 : : os_(os),
4340 : ignore_case_(ignore_case) {}
4341 : void PrintNode(const char* label, RegExpNode* node);
4342 : void Visit(RegExpNode* node);
4343 : void PrintAttributes(RegExpNode* from);
4344 : void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4345 : #define DECLARE_VISIT(Type) \
4346 : virtual void Visit##Type(Type##Node* that);
4347 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4348 : #undef DECLARE_VISIT
4349 : private:
4350 : std::ostream& os_;
4351 : bool ignore_case_;
4352 : };
4353 :
4354 :
4355 : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4356 : os_ << "digraph G {\n graph [label=\"";
4357 : for (int i = 0; label[i]; i++) {
4358 : switch (label[i]) {
4359 : case '\\':
4360 : os_ << "\\\\";
4361 : break;
4362 : case '"':
4363 : os_ << "\"";
4364 : break;
4365 : default:
4366 : os_ << label[i];
4367 : break;
4368 : }
4369 : }
4370 : os_ << "\"];\n";
4371 : Visit(node);
4372 : os_ << "}" << std::endl;
4373 : }
4374 :
4375 :
4376 : void DotPrinter::Visit(RegExpNode* node) {
4377 : if (node->info()->visited) return;
4378 : node->info()->visited = true;
4379 : node->Accept(this);
4380 : }
4381 :
4382 :
4383 : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4384 : os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4385 : Visit(on_failure);
4386 : }
4387 :
4388 :
4389 : class TableEntryBodyPrinter {
4390 : public:
4391 : TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4392 : : os_(os),
4393 : choice_(choice) {}
4394 : void Call(uc16 from, DispatchTable::Entry entry) {
4395 : OutSet* out_set = entry.out_set();
4396 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4397 : if (out_set->Get(i)) {
4398 : os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4399 : << choice()->alternatives()->at(i).node() << ";\n";
4400 : }
4401 : }
4402 : }
4403 : private:
4404 : ChoiceNode* choice() { return choice_; }
4405 : std::ostream& os_;
4406 : ChoiceNode* choice_;
4407 : };
4408 :
4409 :
4410 : class TableEntryHeaderPrinter {
4411 : public:
4412 : explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4413 : : first_(true),
4414 : os_(os) {}
4415 : void Call(uc16 from, DispatchTable::Entry entry) {
4416 : if (first_) {
4417 : first_ = false;
4418 : } else {
4419 : os_ << "|";
4420 : }
4421 : os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4422 : OutSet* out_set = entry.out_set();
4423 : int priority = 0;
4424 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4425 : if (out_set->Get(i)) {
4426 : if (priority > 0) os_ << "|";
4427 : os_ << "<s" << from << "o" << i << "> " << priority;
4428 : priority++;
4429 : }
4430 : }
4431 : os_ << "}}";
4432 : }
4433 :
4434 : private:
4435 : bool first_;
4436 : std::ostream& os_;
4437 : };
4438 :
4439 :
// Writes "|"-separated record fields to an output stream.  Fields whose value
// is false (PrintBit) or negative (PrintPositive) are omitted entirely.
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Writes "|" before every field except the first one.
  void PrintSeparator() {
    if (!first_) os_ << "|";
    first_ = false;
  }

  // Writes "{name}" when |value| is true.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Writes "{name|value}" when |value| is not negative.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
4467 :
4468 :
4469 : void DotPrinter::PrintAttributes(RegExpNode* that) {
4470 : os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4471 : << "margin=0.1, fontsize=10, label=\"{";
4472 : AttributePrinter printer(os_);
4473 : NodeInfo* info = that->info();
4474 : printer.PrintBit("NI", info->follows_newline_interest);
4475 : printer.PrintBit("WI", info->follows_word_interest);
4476 : printer.PrintBit("SI", info->follows_start_interest);
4477 : Label* label = that->label();
4478 : if (label->is_bound())
4479 : printer.PrintPositive("@", label->pos());
4480 : os_ << "}\"];\n"
4481 : << " a" << that << " -> n" << that
4482 : << " [style=dashed, color=grey, arrowhead=none];\n";
4483 : }
4484 :
4485 :
4486 : static const bool kPrintDispatchTable = false;
4487 : void DotPrinter::VisitChoice(ChoiceNode* that) {
4488 : if (kPrintDispatchTable) {
4489 : os_ << " n" << that << " [shape=Mrecord, label=\"";
4490 : TableEntryHeaderPrinter header_printer(os_);
4491 : that->GetTable(ignore_case_)->ForEach(&header_printer);
4492 : os_ << "\"]\n";
4493 : PrintAttributes(that);
4494 : TableEntryBodyPrinter body_printer(os_, that);
4495 : that->GetTable(ignore_case_)->ForEach(&body_printer);
4496 : } else {
4497 : os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4498 : for (int i = 0; i < that->alternatives()->length(); i++) {
4499 : GuardedAlternative alt = that->alternatives()->at(i);
4500 : os_ << " n" << that << " -> n" << alt.node();
4501 : }
4502 : }
4503 : for (int i = 0; i < that->alternatives()->length(); i++) {
4504 : GuardedAlternative alt = that->alternatives()->at(i);
4505 : alt.node()->Accept(this);
4506 : }
4507 : }
4508 :
4509 :
4510 : void DotPrinter::VisitText(TextNode* that) {
4511 : Zone* zone = that->zone();
4512 : os_ << " n" << that << " [label=\"";
4513 : for (int i = 0; i < that->elements()->length(); i++) {
4514 : if (i > 0) os_ << " ";
4515 : TextElement elm = that->elements()->at(i);
4516 : switch (elm.text_type()) {
4517 : case TextElement::ATOM: {
4518 : Vector<const uc16> data = elm.atom()->data();
4519 : for (int i = 0; i < data.length(); i++) {
4520 : os_ << static_cast<char>(data[i]);
4521 : }
4522 : break;
4523 : }
4524 : case TextElement::CHAR_CLASS: {
4525 : RegExpCharacterClass* node = elm.char_class();
4526 : os_ << "[";
4527 : if (node->is_negated()) os_ << "^";
4528 : for (int j = 0; j < node->ranges(zone)->length(); j++) {
4529 : CharacterRange range = node->ranges(zone)->at(j);
4530 : os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4531 : }
4532 : os_ << "]";
4533 : break;
4534 : }
4535 : default:
4536 : UNREACHABLE();
4537 : }
4538 : }
4539 : os_ << "\", shape=box, peripheries=2];\n";
4540 : PrintAttributes(that);
4541 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4542 : Visit(that->on_success());
4543 : }
4544 :
4545 :
4546 : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4547 : os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4548 : << that->end_register() << "\", shape=doubleoctagon];\n";
4549 : PrintAttributes(that);
4550 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4551 : Visit(that->on_success());
4552 : }
4553 :
4554 :
4555 : void DotPrinter::VisitEnd(EndNode* that) {
4556 : os_ << " n" << that << " [style=bold, shape=point];\n";
4557 : PrintAttributes(that);
4558 : }
4559 :
4560 :
4561 : void DotPrinter::VisitAssertion(AssertionNode* that) {
4562 : os_ << " n" << that << " [";
4563 : switch (that->assertion_type()) {
4564 : case AssertionNode::AT_END:
4565 : os_ << "label=\"$\", shape=septagon";
4566 : break;
4567 : case AssertionNode::AT_START:
4568 : os_ << "label=\"^\", shape=septagon";
4569 : break;
4570 : case AssertionNode::AT_BOUNDARY:
4571 : os_ << "label=\"\\b\", shape=septagon";
4572 : break;
4573 : case AssertionNode::AT_NON_BOUNDARY:
4574 : os_ << "label=\"\\B\", shape=septagon";
4575 : break;
4576 : case AssertionNode::AFTER_NEWLINE:
4577 : os_ << "label=\"(?<=\\n)\", shape=septagon";
4578 : break;
4579 : }
4580 : os_ << "];\n";
4581 : PrintAttributes(that);
4582 : RegExpNode* successor = that->on_success();
4583 : os_ << " n" << that << " -> n" << successor << ";\n";
4584 : Visit(successor);
4585 : }
4586 :
4587 :
4588 : void DotPrinter::VisitAction(ActionNode* that) {
4589 : os_ << " n" << that << " [";
4590 : switch (that->action_type_) {
4591 : case ActionNode::SET_REGISTER:
4592 : os_ << "label=\"$" << that->data_.u_store_register.reg
4593 : << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4594 : break;
4595 : case ActionNode::INCREMENT_REGISTER:
4596 : os_ << "label=\"$" << that->data_.u_increment_register.reg
4597 : << "++\", shape=octagon";
4598 : break;
4599 : case ActionNode::STORE_POSITION:
4600 : os_ << "label=\"$" << that->data_.u_position_register.reg
4601 : << ":=$pos\", shape=octagon";
4602 : break;
4603 : case ActionNode::BEGIN_SUBMATCH:
4604 : os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4605 : << ":=$pos,begin\", shape=septagon";
4606 : break;
4607 : case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4608 : os_ << "label=\"escape\", shape=septagon";
4609 : break;
4610 : case ActionNode::EMPTY_MATCH_CHECK:
4611 : os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4612 : << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4613 : << "<" << that->data_.u_empty_match_check.repetition_limit
4614 : << "?\", shape=septagon";
4615 : break;
4616 : case ActionNode::CLEAR_CAPTURES: {
4617 : os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4618 : << " to $" << that->data_.u_clear_captures.range_to
4619 : << "\", shape=septagon";
4620 : break;
4621 : }
4622 : }
4623 : os_ << "];\n";
4624 : PrintAttributes(that);
4625 : RegExpNode* successor = that->on_success();
4626 : os_ << " n" << that << " -> n" << successor << ";\n";
4627 : Visit(successor);
4628 : }
4629 :
4630 :
4631 : class DispatchTableDumper {
4632 : public:
4633 : explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4634 : void Call(uc16 key, DispatchTable::Entry entry);
4635 : private:
4636 : std::ostream& os_;
4637 : };
4638 :
4639 :
4640 : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4641 : os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4642 : OutSet* set = entry.out_set();
4643 : bool first = true;
4644 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4645 : if (set->Get(i)) {
4646 : if (first) {
4647 : first = false;
4648 : } else {
4649 : os_ << ", ";
4650 : }
4651 : os_ << i;
4652 : }
4653 : }
4654 : os_ << "}\n";
4655 : }
4656 :
4657 :
4658 : void DispatchTable::Dump() {
4659 : OFStream os(stderr);
4660 : DispatchTableDumper dumper(os);
4661 : tree()->ForEach(&dumper);
4662 : }
4663 :
4664 :
4665 : void RegExpEngine::DotPrint(const char* label,
4666 : RegExpNode* node,
4667 : bool ignore_case) {
4668 : StdoutStream os;
4669 : DotPrinter printer(os, ignore_case);
4670 : printer.PrintNode(label, node);
4671 : }
4672 :
4673 :
4674 : #endif // DEBUG
4675 :
4676 :
4677 : // -------------------------------------------------------------------
4678 : // Tree to graph conversion
4679 :
4680 2977278 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4681 : RegExpNode* on_success) {
4682 : ZoneList<TextElement>* elms =
4683 992426 : new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4684 992426 : elms->Add(TextElement::Atom(this), compiler->zone());
4685 : return new (compiler->zone())
4686 992426 : TextNode(elms, compiler->read_backward(), on_success);
4687 : }
4688 :
4689 :
4690 34706 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4691 : RegExpNode* on_success) {
4692 : return new (compiler->zone())
4693 34706 : TextNode(elements(), compiler->read_backward(), on_success);
4694 : }
4695 :
4696 :
4697 1105534 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4698 : const int* special_class,
4699 : int length) {
4700 552767 : length--; // Remove final marker.
4701 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4702 : DCHECK_NE(0, ranges->length());
4703 : DCHECK_NE(0, length);
4704 : DCHECK_NE(0, special_class[0]);
4705 552767 : if (ranges->length() != (length >> 1) + 1) {
4706 : return false;
4707 : }
4708 10160 : CharacterRange range = ranges->at(0);
4709 10160 : if (range.from() != 0) {
4710 : return false;
4711 : }
4712 24877 : for (int i = 0; i < length; i += 2) {
4713 25432 : if (special_class[i] != (range.to() + 1)) {
4714 : return false;
4715 : }
4716 49754 : range = ranges->at((i >> 1) + 1);
4717 24877 : if (special_class[i+1] != range.from()) {
4718 : return false;
4719 : }
4720 : }
4721 7772 : if (range.to() != String::kMaxCodePoint) {
4722 : return false;
4723 : }
4724 7772 : return true;
4725 : }
4726 :
4727 :
4728 1094384 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4729 : const int* special_class,
4730 : int length) {
4731 547192 : length--; // Remove final marker.
4732 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4733 547192 : if (ranges->length() * 2 != length) {
4734 : return false;
4735 : }
4736 11204 : for (int i = 0; i < length; i += 2) {
4737 29200 : CharacterRange range = ranges->at(i >> 1);
4738 25815 : if (range.from() != special_class[i] ||
4739 11215 : range.to() != special_class[i + 1] - 1) {
4740 : return false;
4741 : }
4742 : }
4743 : return true;
4744 : }
4745 :
4746 :
4747 196846 : bool RegExpCharacterClass::is_standard(Zone* zone) {
4748 : // TODO(lrn): Remove need for this function, by not throwing away information
4749 : // along the way.
4750 196846 : if (is_negated()) {
4751 : return false;
4752 : }
4753 191179 : if (set_.is_standard()) {
4754 : return true;
4755 : }
4756 187912 : if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4757 : set_.set_standard_set_type('s');
4758 607 : return true;
4759 : }
4760 187305 : if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4761 : set_.set_standard_set_type('S');
4762 207 : return true;
4763 : }
4764 187098 : if (CompareInverseRanges(set_.ranges(zone),
4765 : kLineTerminatorRanges,
4766 187098 : kLineTerminatorRangeCount)) {
4767 : set_.set_standard_set_type('.');
4768 7453 : return true;
4769 : }
4770 179645 : if (CompareRanges(set_.ranges(zone),
4771 : kLineTerminatorRanges,
4772 179645 : kLineTerminatorRangeCount)) {
4773 : set_.set_standard_set_type('n');
4774 10 : return true;
4775 : }
4776 179635 : if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4777 : set_.set_standard_set_type('w');
4778 1271 : return true;
4779 : }
4780 178364 : if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4781 : set_.set_standard_set_type('W');
4782 112 : return true;
4783 : }
4784 : return false;
4785 : }
4786 :
4787 :
4788 2587 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4789 76865 : ZoneList<CharacterRange>* base)
4790 : : zone_(zone),
4791 : table_(zone),
4792 : bmp_(nullptr),
4793 : lead_surrogates_(nullptr),
4794 : trail_surrogates_(nullptr),
4795 5174 : non_bmp_(nullptr) {
4796 : // The unicode range splitter categorizes given character ranges into:
4797 : // - Code points from the BMP representable by one code unit.
4798 : // - Code points outside the BMP that need to be split into surrogate pairs.
4799 : // - Lone lead surrogates.
4800 : // - Lone trail surrogates.
4801 : // Lone surrogates are valid code points, even though no actual characters.
4802 : // They require special matching to make sure we do not split surrogate pairs.
4803 : // We use the dispatch table to accomplish this. The base range is split up
4804 : // by the table by the overlay ranges, and the Call callback is used to
4805 : // filter and collect ranges for each category.
4806 153730 : for (int i = 0; i < base->length(); i++) {
4807 148556 : table_.AddRange(base->at(i), kBase, zone_);
4808 : }
4809 : // Add overlay ranges.
4810 : table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4811 2587 : kBmpCodePoints, zone_);
4812 : table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4813 2587 : kLeadSurrogates, zone_);
4814 : table_.AddRange(
4815 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4816 2587 : kTrailSurrogates, zone_);
4817 : table_.AddRange(
4818 : CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4819 2587 : kBmpCodePoints, zone_);
4820 : table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4821 2587 : kNonBmpCodePoints, zone_);
4822 : table_.ForEach(this);
4823 2587 : }
4824 :
4825 :
// Dispatch-table callback: files the range of one table entry into the list
// for its category (BMP, lead surrogates, trail surrogates or non-BMP).
// Entries whose out-set does not contain kBase are ignored. The |from|
// argument is not used here; the range bounds are read from |entry| itself.
4826 159101 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4827 159101 : OutSet* outset = entry.out_set();
4828 318202 : if (!outset->Get(kBase)) return;
4829 : ZoneList<CharacterRange>** target = nullptr;
// Pick the destination list from the overlay marker present in the out-set.
4830 78283 : if (outset->Get(kBmpCodePoints)) {
4831 50718 : target = &bmp_;
4832 27565 : } else if (outset->Get(kLeadSurrogates)) {
4833 1175 : target = &lead_surrogates_;
4834 26390 : } else if (outset->Get(kTrailSurrogates)) {
4835 1175 : target = &trail_surrogates_;
4836 : } else {
4837 : DCHECK(outset->Get(kNonBmpCodePoints));
4838 25215 : target = &non_bmp_;
4839 : }
// The per-category lists are allocated lazily, so a category that never
// receives a range keeps a null list.
4840 78283 : if (*target == nullptr)
4841 12134 : *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4842 78283 : (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4843 : }
4844 :
// Adds one alternative to |result| that matches the splitter's BMP ranges in
// the compiler's read direction and then continues with |on_success|.
// Does nothing when the splitter collected no BMP ranges. The text node is
// created with default (empty) flags.
4845 6836 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4846 2582 : RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4847 : ZoneList<CharacterRange>* bmp = splitter->bmp();
4848 3037 : if (bmp == nullptr) return;
4849 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4850 : result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4851 : compiler->zone(), bmp, compiler->read_backward(), on_success,
4852 4254 : default_flags)));
4853 : }
4854 :
// Adds alternatives to |result| that match the splitter's non-BMP ranges as
// lead/trail surrogate pairs, each continuing with |on_success|.
// Does nothing when the splitter collected no non-BMP ranges. Each range
// yields between one and three alternatives, depending on whether its two
// end points share a lead surrogate and whether they fall on the first/last
// trail surrogate.
4855 31232 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4856 : RegExpNode* on_success,
4857 2582 : UnicodeRangeSplitter* splitter) {
4858 26790 : ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4859 3584 : if (non_bmp == nullptr) return;
4860 : DCHECK(!compiler->one_byte());
4861 : Zone* zone = compiler->zone();
4862 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4863 1580 : CharacterRange::Canonicalize(non_bmp);
4864 53580 : for (int i = 0; i < non_bmp->length(); i++) {
4865 : // Match surrogate pair.
4866 : // E.g. [\u10005-\u11005] becomes
4867 : // \ud800[\udc05-\udfff]|
4868 : // [\ud801-\ud803][\udc00-\udfff]|
4869 : // \ud804[\udc00-\udc05]
4870 25210 : uc32 from = non_bmp->at(i).from();
4871 25210 : uc32 to = non_bmp->at(i).to();
// Split both end points into their lead (_l) and trail (_t) code units.
4872 25210 : uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4873 : uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4874 25210 : uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4875 : uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4876 25210 : if (from_l == to_l) {
4877 : // The lead surrogate is the same.
4878 : result->AddAlternative(
4879 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4880 : zone, CharacterRange::Singleton(from_l),
4881 : CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4882 22930 : on_success, default_flags)));
4883 : } else {
// Partial first lead: handled separately, then excluded from the middle
// block by advancing from_l.
4884 2280 : if (from_t != kTrailSurrogateStart) {
4885 : // Add [from_l][from_t-\udfff]
4886 : result->AddAlternative(
4887 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4888 : zone, CharacterRange::Singleton(from_l),
4889 : CharacterRange::Range(from_t, kTrailSurrogateEnd),
4890 1155 : compiler->read_backward(), on_success, default_flags)));
4891 1155 : from_l++;
4892 : }
// Partial last lead: handled separately, then excluded from the middle
// block by decrementing to_l.
4893 2280 : if (to_t != kTrailSurrogateEnd) {
4894 : // Add [to_l][\udc00-to_t]
4895 : result->AddAlternative(
4896 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4897 : zone, CharacterRange::Singleton(to_l),
4898 : CharacterRange::Range(kTrailSurrogateStart, to_t),
4899 895 : compiler->read_backward(), on_success, default_flags)));
4900 895 : to_l--;
4901 : }
// Whatever leads remain are paired with the full trail surrogate range. The
// interval can be empty after the two adjustments above.
4902 2280 : if (from_l <= to_l) {
4903 : // Add [from_l-to_l][\udc00-\udfff]
4904 : result->AddAlternative(
4905 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4906 : zone, CharacterRange::Range(from_l, to_l),
4907 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4908 2090 : compiler->read_backward(), on_success, default_flags)));
4909 : }
4910 : }
4911 : }
4912 : }
4913 :
// Builds a node that first runs a lookaround over |lookbehind|, read against
// the read direction (!read_backward), and then matches |match| in the read
// direction before continuing with |on_success|.
// The lookaround uses the compiler's shared unicode-lookaround stack and
// position registers.
// NOTE(review): the first Builder argument (false) presumably selects a
// negative lookaround, as the function name suggests -- confirm against
// RegExpLookaround::Builder.
4914 1175 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4915 1175 : RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4916 : ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4917 : JSRegExp::Flags flags) {
4918 : Zone* zone = compiler->zone();
// The actual match is built first because it is the continuation of the
// lookaround.
4919 : RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4920 1175 : zone, match, read_backward, on_success, flags);
4921 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4922 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4923 : RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4924 1175 : position_register);
4925 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4926 1175 : zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4927 1175 : return lookaround.ForMatch(negative_match);
4928 : }
4929 :
// Builds a node that first matches |match| in the read direction and then
// runs a lookaround over |lookahead| in the same direction before continuing
// with |on_success|.
// The lookaround uses the compiler's shared unicode-lookaround stack and
// position registers.
// NOTE(review): the first Builder argument (false) presumably selects a
// negative lookaround, as the function name suggests -- confirm against
// RegExpLookaround::Builder.
4930 1165 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4931 1165 : RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4932 : ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4933 : bool read_backward, JSRegExp::Flags flags) {
4934 : Zone* zone = compiler->zone();
4935 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4936 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4937 : RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4938 1165 : position_register);
4939 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4940 1165 : zone, lookahead, read_backward, lookaround.on_match_success(), flags);
// The match node is the entry point; the lookaround is its continuation.
4941 : return TextNode::CreateForCharacterRanges(
4942 1165 : zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4943 : }
4944 :
// Adds one alternative to |result| that matches the splitter's lead
// surrogate ranges only where no trail surrogate follows, i.e. where the
// lead surrogate is not the first half of a surrogate pair.
// Does nothing when the splitter collected no lead surrogate ranges. Which
// helper builds the node depends on the compiler's read direction.
4945 4922 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4946 : RegExpNode* on_success,
4947 2582 : UnicodeRangeSplitter* splitter) {
4948 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4949 : ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4950 3994 : if (lead_surrogates == nullptr) return;
4951 : Zone* zone = compiler->zone();
4952 : // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4953 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4954 1170 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4955 :
4956 : RegExpNode* match;
4957 1170 : if (compiler->read_backward()) {
4958 : // Reading backward. Assert that reading forward, there is no trail
4959 : // surrogate, and then backward match the lead surrogate.
4960 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
4961 : compiler, trail_surrogates, lead_surrogates, on_success, true,
4962 95 : default_flags);
4963 : } else {
4964 : // Reading forward. Forward match the lead surrogate and assert that
4965 : // no trail surrogate follows.
4966 : match = MatchAndNegativeLookaroundInReadDirection(
4967 : compiler, lead_surrogates, trail_surrogates, on_success, false,
4968 1075 : default_flags);
4969 : }
4970 : result->AddAlternative(GuardedAlternative(match));
4971 : }
4972 :
// Adds one alternative to |result| that matches the splitter's trail
// surrogate ranges only where no lead surrogate precedes, i.e. where the
// trail surrogate is not the second half of a surrogate pair.
// Does nothing when the splitter collected no trail surrogate ranges. Which
// helper builds the node depends on the compiler's read direction.
4973 4922 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4974 : RegExpNode* on_success,
4975 2582 : UnicodeRangeSplitter* splitter) {
4976 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4977 : ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
4978 3994 : if (trail_surrogates == nullptr) return;
4979 : Zone* zone = compiler->zone();
4980 : // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4981 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
4982 1170 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
4983 :
4984 : RegExpNode* match;
4985 1170 : if (compiler->read_backward()) {
4986 : // Reading backward. Backward match the trail surrogate and assert that no
4987 : // lead surrogate precedes it.
4988 : match = MatchAndNegativeLookaroundInReadDirection(
4989 : compiler, trail_surrogates, lead_surrogates, on_success, true,
4990 90 : default_flags);
4991 : } else {
4992 : // Reading forward. Assert that reading backward, there is no lead
4993 : // surrogate, and then forward match the trail surrogate.
4994 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
4995 : compiler, lead_surrogates, trail_surrogates, on_success, false,
4996 1080 : default_flags);
4997 : }
4998 : result->AddAlternative(GuardedAlternative(match));
4999 : }
5000 :
// Returns a forward-reading text node that consumes a single code unit in
// the range [0, String::kMaxUtf16CodeUnit] and then continues with
// |on_success|. Must not be used while compiling a backward-reading
// sub-expression (see the DCHECK).
5001 0 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5002 : RegExpNode* on_success) {
5003 : // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5004 : DCHECK(!compiler->read_backward());
5005 : Zone* zone = compiler->zone();
5006 : // Advance any character. If the character happens to be a lead surrogate and
5007 : // we advanced into the middle of a surrogate pair, it will work out, as
5008 : // nothing will match from there. We will have to advance again, consuming
5009 : // the associated trail surrogate.
5010 : ZoneList<CharacterRange>* range = CharacterRange::List(
5011 0 : zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5012 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5013 : return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5014 0 : default_flags);
5015 : }
5016 :
// Replaces the contents of |ranges| in place with its case-insensitive
// closure as computed by ICU (simple and common case mappings only).
// |ranges| is expected to be canonical on entry (DCHECK) and is canonicalized
// again before returning. Without V8_INTL_SUPPORT the function body is empty
// and |ranges| is left untouched.
5017 124987 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5018 : #ifdef V8_INTL_SUPPORT
5019 : DCHECK(CharacterRange::IsCanonical(ranges));
5020 :
5021 : // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5022 : // See also https://crbug.com/v8/6727.
5023 : // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5024 : // which we use frequently internally. But large ranges can also easily be
5025 : // created by the user. We might want to have a more general caching mechanism
5026 : // for such ranges.
5027 1728 : if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5028 :
5029 : // Use ICU to compute the case fold closure over the ranges.
5030 1189 : icu::UnicodeSet set;
5031 247596 : for (int i = 0; i < ranges->length(); i++) {
5032 122609 : set.add(ranges->at(i).from(), ranges->at(i).to());
5033 : }
// The input has been copied into |set|; the list is refilled from the
// closed-over set below.
5034 : ranges->Clear();
5035 1189 : set.closeOver(USET_CASE_INSENSITIVE);
5036 : // Full case mappings map single characters to multiple characters.
5037 : // Those are represented as strings in the set. Remove them so that
5038 : // we end up with only simple and common case mappings.
5039 1189 : set.removeAllStrings();
5040 19360 : for (int i = 0; i < set.getRangeCount(); i++) {
5041 18171 : ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5042 18171 : zone);
5043 : }
5044 : // No errors, and everything we collected has been a range.
5045 1189 : CharacterRange::Canonicalize(ranges);
5046 : #endif // V8_INTL_SUPPORT
5047 : }
5048 :
5049 :
// Compiles this character class into a node graph that continues with
// |on_success|.
// For unicode patterns on two-byte subjects (and when the class is not
// flagged as containing a split surrogate) the ranges are negated here if
// needed and split into BMP, surrogate-pair, lone-lead and lone-trail
// alternatives of a ChoiceNode. In every other case the class is wrapped in
// a single TextNode.
5050 529211 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5051 : RegExpNode* on_success) {
5052 : set_.Canonicalize();
5053 : Zone* zone = compiler->zone();
5054 2612 : ZoneList<CharacterRange>* ranges = this->ranges(zone);
5055 175908 : if (NeedsUnicodeCaseEquivalents(flags_)) {
5056 949 : AddUnicodeCaseEquivalents(ranges, zone);
5057 : }
5058 182599 : if (IsUnicode(flags_) && !compiler->one_byte() &&
5059 : !contains_split_surrogate()) {
// Negation is resolved up front so that the splitter below only ever sees
// the set of code points that should match.
5060 2612 : if (is_negated()) {
5061 : ZoneList<CharacterRange>* negated =
5062 140 : new (zone) ZoneList<CharacterRange>(2, zone);
5063 140 : CharacterRange::Negate(ranges, negated, zone);
5064 : ranges = negated;
5065 : }
// An empty range list is compiled as a text node over a class with no
// ranges.
// NOTE(review): presumably such a node can never match (hence the name
// |fail|) -- confirm.
5066 2612 : if (ranges->length() == 0) {
5067 : JSRegExp::Flags default_flags;
5068 : RegExpCharacterClass* fail =
5069 60 : new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5070 60 : return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5071 : }
5072 2582 : if (standard_type() == '*') {
5073 0 : return UnanchoredAdvance(compiler, on_success);
5074 : } else {
// One alternative (or group of alternatives) per category; each helper is a
// no-op when the splitter found nothing in its category.
5075 2582 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5076 2582 : UnicodeRangeSplitter splitter(zone, ranges);
5077 2582 : AddBmpCharacters(compiler, result, on_success, &splitter);
5078 2582 : AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5079 2582 : AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5080 2582 : AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5081 : return result;
5082 : }
5083 : } else {
5084 346592 : return new (zone) TextNode(this, compiler->read_backward(), on_success);
5085 : }
5086 : }
5087 :
5088 :
// Three-way comparator on the first code unit of two atoms: returns -1, 1 or
// 0. Both trees are accessed through AsAtom() and element 0 of their data is
// read, so callers must pass non-empty atoms.
5089 146822 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5090 146822 : RegExpAtom* atom1 = (*a)->AsAtom();
5091 146822 : RegExpAtom* atom2 = (*b)->AsAtom();
5092 146822 : uc16 character1 = atom1->data().at(0);
5093 146822 : uc16 character2 = atom2->data().at(0);
5094 146822 : if (character1 < character2) return -1;
5095 129859 : if (character1 > character2) return 1;
5096 17383 : return 0;
5097 : }
5098 :
5099 :
// Returns the canonical form of |c| according to |canonicalize|, or |c|
// itself when the mapping yields no replacement character (length != 1).
5100 : static unibrow::uchar Canonical(
5101 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5102 : unibrow::uchar c) {
5103 : unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5104 101306 : int length = canonicalize->get(c, '\0', chars);
5105 : DCHECK_LE(length, 1);
5106 : unibrow::uchar canonical = c;
5107 101306 : if (length == 1) canonical = chars[0];
5108 : return canonical;
5109 : }
5110 :
5111 :
// Comparator on the first code unit of two atoms that treats characters with
// the same canonical form as equal. Returns zero for equal characters and
// otherwise the signed difference of the (possibly canonicalized) code
// units. Canonicalization is skipped when both characters are below 'a'.
// Both trees are accessed through AsAtom() and element 0 of their data is
// read, so callers must pass non-empty atoms.
5112 63973 : int CompareFirstCharCaseIndependent(
5113 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5114 : RegExpTree* const* a, RegExpTree* const* b) {
5115 63973 : RegExpAtom* atom1 = (*a)->AsAtom();
5116 63973 : RegExpAtom* atom2 = (*b)->AsAtom();
5117 63973 : unibrow::uchar character1 = atom1->data().at(0);
5118 63973 : unibrow::uchar character2 = atom2->data().at(0);
5119 63973 : if (character1 == character2) return 0;
5120 46025 : if (character1 >= 'a' || character2 >= 'a') {
5121 : character1 = Canonical(canonicalize, character1);
5122 : character2 = Canonical(canonicalize, character2);
5123 : }
5124 46025 : return static_cast<int>(character1) - static_cast<int>(character2);
5125 : }
5126 :
5127 :
5128 : // We can stable sort runs of atoms, since the order does not matter if they
5129 : // start with different characters.
5130 : // Returns true if any consecutive atoms were found.
// A run is a maximal sequence of adjacent atom alternatives that all carry
// the same flags; each run is sorted in place by its first character only.
5131 9774 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5132 9300 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5133 : int length = alternatives->length();
5134 : bool found_consecutive_atoms = false;
5135 17927 : for (int i = 0; i < length; i++) {
// Skip ahead to the next atom.
5136 34665 : while (i < length) {
5137 33846 : RegExpTree* alternative = alternatives->at(i);
5138 33846 : if (alternative->IsAtom()) break;
5139 25219 : i++;
5140 : }
5141 : // i is length or it is the index of an atom.
5142 9446 : if (i == length) break;
5143 : int first_atom = i;
5144 8627 : JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5145 8627 : i++;
// Extend the run while the alternatives are atoms with identical flags.
5146 73375 : while (i < length) {
5147 56362 : RegExpTree* alternative = alternatives->at(i);
5148 56362 : if (!alternative->IsAtom()) break;
5149 56121 : if (alternative->AsAtom()->flags() != flags) break;
5150 56121 : i++;
5151 : }
5152 : // Sort atoms to get ones with common prefixes together.
5153 : // This step is more tricky if we are in a case-independent regexp,
5154 : // because it would change /is|I/ to /I|is/, and order matters when
5155 : // the regexp parts don't match only disjoint starting points. To fix
5156 : // this we have a version of CompareFirstChar that uses case-
5157 : // independent character classes for comparison.
5158 : DCHECK_LT(first_atom, alternatives->length());
5159 : DCHECK_LE(i, alternatives->length());
5160 : DCHECK_LE(first_atom, i);
5161 8627 : if (IgnoreCase(flags)) {
5162 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5163 474 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5164 : auto compare_closure =
5165 : [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5166 63973 : return CompareFirstCharCaseIndependent(canonicalize, a, b);
5167 63973 : };
5168 474 : alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5169 : } else {
5170 8153 : alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5171 : }
5172 8627 : if (i - first_atom > 1) found_consecutive_atoms = true;
5173 : }
5174 9300 : return found_consecutive_atoms;
5175 : }
5176 :
5177 :
5178 : // Optimizes ab|ac|az to a(?:b|c|z).
// Rewrites the alternatives list in place: every run of more than two
// adjacent atoms that have identical flags and the same first character
// (compared case-independently for ignore-case atoms) is replaced by a
// single alternative consisting of the common prefix followed by a
// disjunction of the remaining suffixes. All other alternatives are kept in
// their original order, and the list is truncated to the new length.
5179 12998 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5180 : Zone* zone = compiler->zone();
5181 8370 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5182 : int length = alternatives->length();
5183 :
// |i| is the read cursor and |write_posn| the write cursor into the same
// list; write_posn never overtakes i.
5184 : int write_posn = 0;
5185 : int i = 0;
5186 80602 : while (i < length) {
5187 63862 : RegExpTree* alternative = alternatives->at(i);
5188 63862 : if (!alternative->IsAtom()) {
5189 16002 : alternatives->at(write_posn++) = alternatives->at(i);
5190 8001 : i++;
5191 : continue;
5192 : }
5193 55861 : RegExpAtom* const atom = alternative->AsAtom();
5194 : JSRegExp::Flags flags = atom->flags();
5195 55861 : unibrow::uchar common_prefix = atom->data().at(0);
5196 : int first_with_prefix = i;
// Upper bound for the common prefix: the length of the shortest atom in the
// run.
5197 : int prefix_length = atom->length();
5198 55861 : i++;
5199 120352 : while (i < length) {
5200 56221 : alternative = alternatives->at(i);
5201 56221 : if (!alternative->IsAtom()) break;
5202 56121 : RegExpAtom* const atom = alternative->AsAtom();
5203 56121 : if (atom->flags() != flags) break;
5204 56121 : unibrow::uchar new_prefix = atom->data().at(0);
5205 56121 : if (new_prefix != common_prefix) {
5206 47695 : if (!IgnoreCase(flags)) break;
5207 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5208 4628 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5209 : new_prefix = Canonical(canonicalize, new_prefix);
5210 : common_prefix = Canonical(canonicalize, common_prefix);
5211 4628 : if (new_prefix != common_prefix) break;
5212 : }
5213 : prefix_length = Min(prefix_length, atom->length());
5214 8630 : i++;
5215 : }
// Only runs of at least three atoms are rewritten.
5216 55861 : if (i > first_with_prefix + 2) {
5217 : // Found worthwhile run of alternatives with common prefix of at least one
5218 : // character. The sorting function above did not sort on more than one
5219 : // character for reasons of correctness, but there may still be a longer
5220 : // common prefix if the terms were similar or presorted in the input.
5221 : // Find out how long the common prefix is.
5222 268 : int run_length = i - first_with_prefix;
5223 268 : RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
// Characters after the first are compared exactly against the first atom of
// the run; the prefix is cut at the first mismatch.
5224 505 : for (int j = 1; j < run_length && prefix_length > 1; j++) {
5225 : RegExpAtom* old_atom =
5226 474 : alternatives->at(j + first_with_prefix)->AsAtom();
5227 357 : for (int k = 1; k < prefix_length; k++) {
5228 711 : if (atom->data().at(k) != old_atom->data().at(k)) {
5229 : prefix_length = k;
5230 : break;
5231 : }
5232 : }
5233 : }
// The prefix text is taken from the first atom of the run.
5234 : RegExpAtom* prefix = new (zone)
5235 268 : RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5236 268 : ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5237 268 : pair->Add(prefix, zone);
5238 : ZoneList<RegExpTree*>* suffixes =
5239 268 : new (zone) ZoneList<RegExpTree*>(run_length, zone);
5240 8934 : for (int j = 0; j < run_length; j++) {
5241 : RegExpAtom* old_atom =
5242 17332 : alternatives->at(j + first_with_prefix)->AsAtom();
5243 : int len = old_atom->length();
// An atom that consists of the prefix only contributes an empty suffix.
5244 8666 : if (len == prefix_length) {
5245 302 : suffixes->Add(new (zone) RegExpEmpty(), zone);
5246 : } else {
5247 : RegExpTree* suffix = new (zone) RegExpAtom(
5248 8515 : old_atom->data().SubVector(prefix_length, old_atom->length()),
5249 8515 : flags);
5250 8515 : suffixes->Add(suffix, zone);
5251 : }
5252 : }
5253 268 : pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5254 536 : alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5255 : } else {
5256 : // Just copy any non-worthwhile alternatives.
5257 55825 : for (int j = first_with_prefix; j < i; j++) {
5258 111650 : alternatives->at(write_posn++) = alternatives->at(j);
5259 : }
5260 : }
5261 : }
5262 : alternatives->Rewind(write_posn); // Trim end of array.
5263 8370 : }
5264 :
5265 :
5266 : // Optimizes b|c|z to [bcz].
// Rewrites the alternatives list in place: every run of two or more adjacent
// single-character atoms with identical flags is replaced by one character
// class holding those characters. All other alternatives are kept in their
// original order, and the list is truncated to the new length.
5267 9300 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
5268 9300 : RegExpCompiler* compiler) {
5269 : Zone* zone = compiler->zone();
5270 9300 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5271 : int length = alternatives->length();
5272 :
// |i| is the read cursor and |write_posn| the write cursor into the same
// list; write_posn never overtakes i.
5273 : int write_posn = 0;
5274 : int i = 0;
5275 91973 : while (i < length) {
5276 73373 : RegExpTree* alternative = alternatives->at(i);
5277 73373 : if (!alternative->IsAtom()) {
5278 51456 : alternatives->at(write_posn++) = alternatives->at(i);
5279 25728 : i++;
5280 25728 : continue;
5281 : }
5282 47645 : RegExpAtom* const atom = alternative->AsAtom();
5283 47645 : if (atom->length() != 1) {
5284 78592 : alternatives->at(write_posn++) = alternatives->at(i);
5285 39296 : i++;
5286 39296 : continue;
5287 : }
5288 : JSRegExp::Flags flags = atom->flags();
5289 : DCHECK_IMPLIES(IsUnicode(flags),
5290 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
// Remember whether any atom of the run is a trail surrogate; this decides
// the character class flags below.
5291 : bool contains_trail_surrogate =
5292 8349 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5293 : int first_in_run = i;
5294 8349 : i++;
5295 : // Find a run of single-character atom alternatives that have identical
5296 : // flags (case independence and unicode-ness).
5297 25135 : while (i < length) {
5298 16454 : alternative = alternatives->at(i);
5299 16454 : if (!alternative->IsAtom()) break;
5300 16223 : RegExpAtom* const atom = alternative->AsAtom();
5301 16223 : if (atom->length() != 1) break;
5302 8437 : if (atom->flags() != flags) break;
5303 : DCHECK_IMPLIES(IsUnicode(flags),
5304 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5305 : contains_trail_surrogate |=
5306 16874 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5307 8437 : i++;
5308 : }
5309 8349 : if (i > first_in_run + 1) {
5310 : // Found non-trivial run of single-character alternatives.
5311 271 : int run_length = i - first_in_run;
5312 : ZoneList<CharacterRange>* ranges =
5313 271 : new (zone) ZoneList<CharacterRange>(2, zone);
5314 8979 : for (int j = 0; j < run_length; j++) {
5315 17416 : RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5316 : DCHECK_EQ(old_atom->length(), 1);
5317 8708 : ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5318 : }
// In unicode mode a class built from a trail surrogate atom is marked as
// containing a split surrogate; RegExpCharacterClass::ToNode checks
// contains_split_surrogate() before splitting the class by category.
5319 : RegExpCharacterClass::CharacterClassFlags character_class_flags;
5320 271 : if (IsUnicode(flags) && contains_trail_surrogate) {
5321 : character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5322 : }
5323 271 : alternatives->at(write_posn++) = new (zone)
5324 813 : RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5325 : } else {
5326 : // Just copy any trivial alternatives.
5327 8078 : for (int j = first_in_run; j < i; j++) {
5328 16156 : alternatives->at(write_posn++) = alternatives->at(j);
5329 : }
5330 : }
5331 : }
5332 : alternatives->Rewind(write_posn); // Trim end of array.
5333 9300 : }
5334 :
5335 :
// Compiles this disjunction into a ChoiceNode with one alternative per
// remaining sub-expression, each continuing with |on_success|.
// Disjunctions with more than two alternatives are simplified in place
// first (atoms sorted, common prefixes factored out, single characters
// merged into classes). If that leaves a single alternative, its node is
// returned directly without a wrapping ChoiceNode.
5336 21592 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5337 10917 : RegExpNode* on_success) {
5338 30892 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5339 :
5340 10917 : if (alternatives->length() > 2) {
5341 9300 : bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5342 9300 : if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5343 9300 : FixSingleCharacterDisjunctions(compiler);
5344 9300 : if (alternatives->length() == 1) {
5345 242 : return alternatives->at(0)->ToNode(compiler, on_success);
5346 : }
5347 : }
5348 :
// Re-read the length: the simplification passes above may have shortened the
// list.
5349 : int length = alternatives->length();
5350 :
5351 : ChoiceNode* result =
5352 10675 : new(compiler->zone()) ChoiceNode(length, compiler->zone());
5353 87037 : for (int i = 0; i < length; i++) {
5354 : GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5355 76362 : on_success));
5356 : result->AddAlternative(alternative);
5357 : }
5358 : return result;
5359 : }
5360 :
5361 :
// Compiles this quantifier by forwarding its own min/max/greediness and body
// to the general ToNode overload.
// NOTE(review): the trailing |not_at_start| argument is omitted here, so it
// presumably has a default value in the declaration -- confirm in the header.
5362 926139 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5363 1852278 : RegExpNode* on_success) {
5364 : return ToNode(min(),
5365 : max(),
5366 : is_greedy(),
5367 : body(),
5368 : compiler,
5369 1852278 : on_success);
5370 : }
5371 :
5372 :
5373 : // Scoped object to keep track of how much we unroll quantifier loops in the
5374 : // regexp graph generator.
// On construction the compiler's current expansion factor is multiplied by
// |factor|; on destruction the previous value is restored. ok_to_expand()
// reports whether the resulting factor stays within kMaxExpansionFactor.
5375 : class RegExpExpansionLimiter {
5376 : public:
5377 : static const int kMaxExpansionFactor = 6;
5378 62037 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5379 : : compiler_(compiler),
5380 : saved_expansion_factor_(compiler->current_expansion_factor()),
5381 62037 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5382 : DCHECK_LT(0, factor);
// If the limit was already exceeded on entry, the compiler's factor is left
// unchanged.
5383 71577 : if (ok_to_expand_) {
5384 71577 : if (factor > kMaxExpansionFactor) {
5385 : // Avoid integer overflow of the current expansion factor.
5386 : ok_to_expand_ = false;
5387 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5388 : } else {
5389 71449 : int new_factor = saved_expansion_factor_ * factor;
5390 71449 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5391 : compiler->set_current_expansion_factor(new_factor);
5392 : }
5393 : }
5394 : }
5395 :
// Restores the expansion factor that was current at construction time.
5396 : ~RegExpExpansionLimiter() {
5397 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
5398 : }
5399 :
5400 : bool ok_to_expand() { return ok_to_expand_; }
5401 :
5402 : private:
5403 : RegExpCompiler* compiler_;
5404 : int saved_expansion_factor_;
5405 : bool ok_to_expand_;
5406 :
5407 : DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5408 : };
5409 :
5410 :
// Builds the node graph for |body| repeated between |min| and |max| times
// (greedy or lazy according to |is_greedy|), continuing with |on_success|.
// Small repetition counts over bodies that cannot match empty and have no
// captures are unrolled when the compiler is optimizing; everything else is
// compiled as a LoopChoiceNode, with a counter register when a lower or
// upper bound has to be enforced. When |not_at_start| is set and the
// compiler is not reading backward, the created choice nodes are marked via
// set_not_at_start().
5411 1013027 : RegExpNode* RegExpQuantifier::ToNode(int min,
5412 : int max,
5413 : bool is_greedy,
5414 : RegExpTree* body,
5415 3029430 : RegExpCompiler* compiler,
5416 : RegExpNode* on_success,
5417 : bool not_at_start) {
5418 : // x{f, t} becomes this:
5419 : //
5420 : // (r++)<-.
5421 : // | `
5422 : // | (x)
5423 : // v ^
5424 : // (r=0)-->(?)---/ [if r < t]
5425 : // |
5426 : // [if r >= f] \----> ...
5427 : //
5428 :
5429 : // 15.10.2.5 RepeatMatcher algorithm.
5430 : // The parser has already eliminated the case where max is 0. In the case
5431 : // where max_match is zero the parser has removed the quantifier if min was
5432 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5433 :
5434 : // If we know that we cannot match zero length then things are a little
5435 : // simpler since we don't need to make the special zero length match check
5436 : // from step 2.1. If the min and max are small we can unroll a little in
5437 : // this case.
5438 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5439 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5440 1013027 : if (max == 0) return on_success; // This can happen due to recursion.
5441 1012672 : bool body_can_be_empty = (body->min_match() == 0);
5442 : int body_start_reg = RegExpCompiler::kNoRegister;
5443 1012672 : Interval capture_registers = body->CaptureRegisters();
5444 1012672 : bool needs_capture_clearing = !capture_registers.is_empty();
5445 : Zone* zone = compiler->zone();
5446 :
5447 1012672 : if (body_can_be_empty) {
5448 : body_start_reg = compiler->AllocateRegister();
5449 1012167 : } else if (compiler->optimize() && !needs_capture_clearing) {
5450 : // Only unroll if there are no captures and the body can't be
5451 : // empty.
// The limiter lives in its own scope so that the expansion factor is
// restored before the second unrolling attempt below.
5452 : {
5453 : RegExpExpansionLimiter limiter(
5454 62037 : compiler, min + ((max != min) ? 1 : 0));
5455 62037 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5456 4264 : int new_max = (max == kInfinity) ? max : max - min;
5457 : // Recurse once to get the loop or optional matches after the fixed
5458 : // ones.
5459 : RegExpNode* answer = ToNode(
5460 4264 : 0, new_max, is_greedy, body, compiler, on_success, true);
5461 : // Unroll the forced matches from 0 to min. This can cause chains of
5462 : // TextNodes (which the parser does not generate). These should be
5463 : // combined if it turns out they hinder good code generation.
5464 9228 : for (int i = 0; i < min; i++) {
5465 4964 : answer = body->ToNode(compiler, answer);
5466 : }
5467 : return answer;
5468 : }
5469 : }
5470 57773 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
5471 : DCHECK_LT(0, max); // Due to the 'if' above.
5472 : RegExpExpansionLimiter limiter(compiler, max);
5473 9540 : if (limiter.ok_to_expand()) {
5474 : // Unroll the optional matches up to max.
// Each iteration wraps the chain built so far in a two-way choice between
// "match the body once more" and "stop here"; the order of the two
// alternatives encodes greediness.
5475 : RegExpNode* answer = on_success;
5476 9376 : for (int i = 0; i < max; i++) {
5477 9376 : ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5478 9376 : if (is_greedy) {
5479 : alternation->AddAlternative(
5480 9230 : GuardedAlternative(body->ToNode(compiler, answer)));
5481 : alternation->AddAlternative(GuardedAlternative(on_success));
5482 : } else {
5483 : alternation->AddAlternative(GuardedAlternative(on_success));
5484 : alternation->AddAlternative(
5485 146 : GuardedAlternative(body->ToNode(compiler, answer)));
5486 : }
5487 : answer = alternation;
5488 9688 : if (not_at_start && !compiler->read_backward()) {
5489 : alternation->set_not_at_start();
5490 : }
5491 : }
5492 : return answer;
5493 : }
5494 : }
5495 : }
// General case: build the loop shown in the diagram at the top of the
// function. A counter register is only needed when min or max must be
// enforced.
5496 999129 : bool has_min = min > 0;
5497 999129 : bool has_max = max < RegExpTree::kInfinity;
5498 999129 : bool needs_counter = has_min || has_max;
5499 : int reg_ctr = needs_counter
5500 : ? compiler->AllocateRegister()
5501 999129 : : RegExpCompiler::kNoRegister;
5502 : LoopChoiceNode* center = new (zone)
5503 999129 : LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5504 1002958 : if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
// |loop_return| is where the body continues after one iteration: back to the
// loop head, optionally through the counter increment.
5505 : RegExpNode* loop_return = needs_counter
5506 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5507 999129 : : static_cast<RegExpNode*>(center);
5508 999129 : if (body_can_be_empty) {
5509 : // If the body can be empty we need to check if it was and then
5510 : // backtrack.
5511 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5512 : reg_ctr,
5513 : min,
5514 505 : loop_return);
5515 : }
5516 999129 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
5517 999129 : if (body_can_be_empty) {
5518 : // If the body can be empty we need to store the start position
5519 : // so we can bail out if it was empty.
5520 505 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5521 : }
5522 999129 : if (needs_capture_clearing) {
5523 : // Before entering the body of this loop we need to clear captures.
5524 2304 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5525 : }
// Entering the body is guarded by counter < max; leaving the loop is guarded
// by counter >= min.
5526 : GuardedAlternative body_alt(body_node);
5527 999129 : if (has_max) {
5528 : Guard* body_guard =
5529 : new(zone) Guard(reg_ctr, Guard::LT, max);
5530 902580 : body_alt.AddGuard(body_guard, zone);
5531 : }
5532 : GuardedAlternative rest_alt(on_success);
5533 999129 : if (has_min) {
5534 : Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5535 1321 : rest_alt.AddGuard(rest_guard, zone);
5536 : }
// Greedy loops try the body first; lazy loops try the continuation first.
5537 999129 : if (is_greedy) {
5538 : center->AddLoopAlternative(body_alt);
5539 : center->AddContinueAlternative(rest_alt);
5540 : } else {
5541 : center->AddContinueAlternative(rest_alt);
5542 : center->AddLoopAlternative(body_alt);
5543 : }
// With a counter, the loop is entered through a node that resets it to zero.
5544 999129 : if (needs_counter) {
5545 903324 : return ActionNode::SetRegister(reg_ctr, 0, center);
5546 : } else {
5547 : return center;
5548 : }
5549 : }
5550 :
5551 : namespace {
5552 : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5553 : // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
// Only used for flags that need unicode case equivalents (see the DCHECK).
// Returns a two-way ChoiceNode; each alternative combines a backward-reading
// and a forward-reading lookaround over the \w class and then continues
// with |on_success|. Both lookarounds share the compiler's
// unicode-lookaround stack and position registers.
// NOTE(review): the first Builder argument presumably selects a positive
// (true) or negative (false) lookaround -- confirm against
// RegExpLookaround::Builder.
5554 80 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5555 : RegExpNode* on_success,
5556 : RegExpAssertion::AssertionType type,
5557 : JSRegExp::Flags flags) {
5558 : DCHECK(NeedsUnicodeCaseEquivalents(flags));
5559 : Zone* zone = compiler->zone();
5560 : ZoneList<CharacterRange>* word_range =
5561 80 : new (zone) ZoneList<CharacterRange>(2, zone);
5562 80 : CharacterRange::AddClassEscape('w', word_range, true, zone);
5563 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5564 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5565 80 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5566 : // Add two choices. The (non-)boundary could start with a word or
5567 : // a non-word-character.
5568 240 : for (int i = 0; i < 2; i++) {
5569 160 : bool lookbehind_for_word = i == 0;
// For BOUNDARY the two sides must differ, for NON_BOUNDARY they must agree;
// the XOR expresses both cases.
5570 : bool lookahead_for_word =
5571 160 : (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5572 : // Look to the left.
5573 : RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5574 160 : stack_register, position_register);
5575 : RegExpNode* backward = TextNode::CreateForCharacterRanges(
5576 160 : zone, word_range, true, lookbehind.on_match_success(), flags);
5577 : // Look to the right.
// The lookahead continues into the lookbehind, so the lookahead is evaluated
// first when the node graph runs.
5578 : RegExpLookaround::Builder lookahead(lookahead_for_word,
5579 : lookbehind.ForMatch(backward),
5580 160 : stack_register, position_register);
5581 : RegExpNode* forward = TextNode::CreateForCharacterRanges(
5582 160 : zone, word_range, false, lookahead.on_match_success(), flags);
5583 160 : result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5584 : }
5585 80 : return result;
5586 : }
5587 : } // anonymous namespace
5588 :
5589 5473 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5590 5473 : RegExpNode* on_success) {
5591 : NodeInfo info;
5592 : Zone* zone = compiler->zone();
5593 :
5594 5473 : switch (assertion_type()) {
5595 : case START_OF_LINE:
5596 129 : return AssertionNode::AfterNewline(on_success);
5597 : case START_OF_INPUT:
5598 3011 : return AssertionNode::AtStart(on_success);
5599 : case BOUNDARY:
5600 : return NeedsUnicodeCaseEquivalents(flags_)
5601 : ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5602 : flags_)
5603 176 : : AssertionNode::AtBoundary(on_success);
5604 : case NON_BOUNDARY:
5605 : return NeedsUnicodeCaseEquivalents(flags_)
5606 : ? BoundaryAssertionAsLookaround(compiler, on_success,
5607 : NON_BOUNDARY, flags_)
5608 154 : : AssertionNode::AtNonBoundary(on_success);
5609 : case END_OF_INPUT:
5610 1909 : return AssertionNode::AtEnd(on_success);
5611 : case END_OF_LINE: {
5612 : // Compile $ in multiline regexps as an alternation with a positive
5613 : // lookahead in one side and an end-of-input on the other side.
5614 : // We need two registers for the lookahead.
5615 : int stack_pointer_register = compiler->AllocateRegister();
5616 : int position_register = compiler->AllocateRegister();
5617 : // The ChoiceNode to distinguish between a newline and end-of-input.
5618 94 : ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5619 : // Create a newline atom.
5620 : ZoneList<CharacterRange>* newline_ranges =
5621 94 : new(zone) ZoneList<CharacterRange>(3, zone);
5622 94 : CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5623 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5624 : RegExpCharacterClass* newline_atom =
5625 : new (zone) RegExpCharacterClass('n', default_flags);
5626 : TextNode* newline_matcher = new (zone) TextNode(
5627 : newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5628 : stack_pointer_register, position_register,
5629 : 0, // No captures inside.
5630 : -1, // Ignored if no captures.
5631 188 : on_success));
5632 : // Create an end-of-input matcher.
5633 : RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5634 : stack_pointer_register,
5635 : position_register,
5636 94 : newline_matcher);
5637 : // Add the two alternatives to the ChoiceNode.
5638 : GuardedAlternative eol_alternative(end_of_line);
5639 : result->AddAlternative(eol_alternative);
5640 94 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5641 : result->AddAlternative(end_alternative);
5642 : return result;
5643 : }
5644 : default:
5645 0 : UNREACHABLE();
5646 : }
5647 : return on_success;
5648 : }
5649 :
5650 :
5651 4740 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5652 2370 : RegExpNode* on_success) {
5653 : return new (compiler->zone())
5654 : BackReferenceNode(RegExpCapture::StartRegister(index()),
5655 : RegExpCapture::EndRegister(index()), flags_,
5656 4740 : compiler->read_backward(), on_success);
5657 : }
5658 :
5659 :
5660 1036 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5661 : RegExpNode* on_success) {
5662 1036 : return on_success;
5663 : }
5664 :
5665 :
5666 4368 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5667 : int stack_pointer_register,
5668 : int position_register,
5669 : int capture_register_count,
5670 : int capture_register_start)
5671 : : is_positive_(is_positive),
5672 : on_success_(on_success),
5673 : stack_pointer_register_(stack_pointer_register),
5674 4368 : position_register_(position_register) {
5675 4368 : if (is_positive_) {
5676 : on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5677 : stack_pointer_register, position_register, capture_register_count,
5678 1556 : capture_register_start, on_success_);
5679 : } else {
5680 : Zone* zone = on_success_->zone();
5681 : on_match_success_ = new (zone) NegativeSubmatchSuccess(
5682 : stack_pointer_register, position_register, capture_register_count,
5683 2812 : capture_register_start, zone);
5684 : }
5685 4368 : }
5686 :
5687 :
5688 4368 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5689 4368 : if (is_positive_) {
5690 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5691 1556 : position_register_, match);
5692 : } else {
5693 2812 : Zone* zone = on_success_->zone();
5694 : // We use a ChoiceNode to represent the negative lookaround. The first
5695 : // alternative is the negative match. On success, the end node backtracks.
5696 : // On failure, the second alternative is tried and leads to success.
5697 : // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5698 : // first exit when calculating quick checks.
5699 : ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5700 2812 : GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5701 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5702 2812 : position_register_, choice_node);
5703 : }
5704 : }
5705 :
5706 :
5707 3336 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5708 3336 : RegExpNode* on_success) {
5709 : int stack_pointer_register = compiler->AllocateRegister();
5710 : int position_register = compiler->AllocateRegister();
5711 :
5712 : const int registers_per_capture = 2;
5713 : const int register_of_first_capture = 2;
5714 1668 : int register_count = capture_count_ * registers_per_capture;
5715 : int register_start =
5716 1668 : register_of_first_capture + capture_from_ * registers_per_capture;
5717 :
5718 : RegExpNode* result;
5719 : bool was_reading_backward = compiler->read_backward();
5720 1668 : compiler->set_read_backward(type() == LOOKBEHIND);
5721 : Builder builder(is_positive(), on_success, stack_pointer_register,
5722 1668 : position_register, register_count, register_start);
5723 1668 : RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5724 1668 : result = builder.ForMatch(match);
5725 : compiler->set_read_backward(was_reading_backward);
5726 1668 : return result;
5727 : }
5728 :
5729 :
5730 27012 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5731 27012 : RegExpNode* on_success) {
5732 27012 : return ToNode(body(), index(), compiler, on_success);
5733 : }
5734 :
5735 :
5736 112549 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5737 : int index,
5738 112549 : RegExpCompiler* compiler,
5739 : RegExpNode* on_success) {
5740 : DCHECK_NOT_NULL(body);
5741 : int start_reg = RegExpCapture::StartRegister(index);
5742 : int end_reg = RegExpCapture::EndRegister(index);
5743 112549 : if (compiler->read_backward()) std::swap(start_reg, end_reg);
5744 112549 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5745 112549 : RegExpNode* body_node = body->ToNode(compiler, store_end);
5746 112549 : return ActionNode::StorePosition(start_reg, true, body_node);
5747 : }
5748 :
5749 :
5750 42404 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5751 21202 : RegExpNode* on_success) {
5752 21992 : ZoneList<RegExpTree*>* children = nodes();
5753 : RegExpNode* current = on_success;
5754 21202 : if (compiler->read_backward()) {
5755 1905 : for (int i = 0; i < children->length(); i++) {
5756 790 : current = children->at(i)->ToNode(compiler, current);
5757 : }
5758 : } else {
5759 997301 : for (int i = children->length() - 1; i >= 0; i--) {
5760 976424 : current = children->at(i)->ToNode(compiler, current);
5761 : }
5762 : }
5763 21202 : return current;
5764 : }
5765 :
5766 :
5767 7397 : static void AddClass(const int* elmv,
5768 : int elmc,
5769 : ZoneList<CharacterRange>* ranges,
5770 : Zone* zone) {
5771 7397 : elmc--;
5772 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5773 39575 : for (int i = 0; i < elmc; i += 2) {
5774 : DCHECK(elmv[i] < elmv[i + 1]);
5775 32178 : ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5776 : }
5777 7397 : }
5778 :
5779 :
5780 19915 : static void AddClassNegated(const int *elmv,
5781 : int elmc,
5782 : ZoneList<CharacterRange>* ranges,
5783 : Zone* zone) {
5784 19915 : elmc--;
5785 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5786 : DCHECK_NE(0x0000, elmv[0]);
5787 : DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5788 : uc16 last = 0x0000;
5789 84877 : for (int i = 0; i < elmc; i += 2) {
5790 : DCHECK(last <= elmv[i] - 1);
5791 : DCHECK(elmv[i] < elmv[i + 1]);
5792 64962 : ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5793 64962 : last = elmv[i + 1];
5794 : }
5795 19915 : ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5796 19915 : }
5797 :
5798 110260 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5799 : bool add_unicode_case_equivalents,
5800 : Zone* zone) {
5801 110260 : if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5802 : // See #sec-runtime-semantics-wordcharacters-abstract-operation
5803 : // In case of unicode and ignore_case, we need to create the closure over
5804 : // case equivalent characters before negating.
5805 : ZoneList<CharacterRange>* new_ranges =
5806 240 : new (zone) ZoneList<CharacterRange>(2, zone);
5807 240 : AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5808 240 : AddUnicodeCaseEquivalents(new_ranges, zone);
5809 240 : if (type == 'W') {
5810 : ZoneList<CharacterRange>* negated =
5811 90 : new (zone) ZoneList<CharacterRange>(2, zone);
5812 90 : CharacterRange::Negate(new_ranges, negated, zone);
5813 : new_ranges = negated;
5814 : }
5815 : ranges->AddAll(*new_ranges, zone);
5816 110260 : return;
5817 : }
5818 110020 : AddClassEscape(type, ranges, zone);
5819 : }
5820 :
5821 110055 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5822 : Zone* zone) {
5823 110055 : switch (type) {
5824 : case 's':
5825 1703 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5826 1703 : break;
5827 : case 'S':
5828 778 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5829 778 : break;
5830 : case 'w':
5831 2786 : AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5832 2786 : break;
5833 : case 'W':
5834 307 : AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5835 307 : break;
5836 : case 'd':
5837 2480 : AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5838 2480 : break;
5839 : case 'D':
5840 268 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5841 268 : break;
5842 : case '.':
5843 : AddClassNegated(kLineTerminatorRanges,
5844 : kLineTerminatorRangeCount,
5845 : ranges,
5846 18562 : zone);
5847 18562 : break;
5848 : // This is not a character range as defined by the spec but a
5849 : // convenient shorthand for a character class that matches any
5850 : // character.
5851 : case '*':
5852 82983 : ranges->Add(CharacterRange::Everything(), zone);
5853 82983 : break;
5854 : // This is the set of characters matched by the $ and ^ symbols
5855 : // in multiline mode.
5856 : case 'n':
5857 : AddClass(kLineTerminatorRanges,
5858 : kLineTerminatorRangeCount,
5859 : ranges,
5860 188 : zone);
5861 188 : break;
5862 : default:
5863 0 : UNREACHABLE();
5864 : }
5865 110055 : }
5866 :
5867 :
5868 0 : Vector<const int> CharacterRange::GetWordBounds() {
5869 0 : return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5870 : }
5871 :
5872 : // static
// Extends |ranges| with the case equivalents of the characters it contains.
// The list is canonicalized first; equivalents are then appended to the end
// of the same list. Only the ranges present after canonicalization are
// examined (the count is captured before anything is appended), and the list
// is not canonicalized again before returning.
// Ranges lying entirely above String::kMaxUtf16CodeUnit or entirely inside
// the surrogate block are skipped. For one-byte subjects, ranges without
// Latin1 equivalents are clamped to String::kMaxOneByteCharCode.
5873 66955 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5874 66955 : ZoneList<CharacterRange>* ranges,
5875 : bool is_one_byte) {
5876 66955 : CharacterRange::Canonicalize(ranges);
// Snapshot of the length: ranges appended below are not visited by this loop.
5877 : int range_count = ranges->length();
5878 138908 : for (int i = 0; i < range_count; i++) {
// Copied by value, so appending to |ranges| below cannot invalidate it.
5879 71953 : CharacterRange range = ranges->at(i);
5880 : uc32 bottom = range.from();
5881 74100 : if (bottom > String::kMaxUtf16CodeUnit) continue;
5882 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5883 : // Nothing to be done for surrogates.
5884 71953 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5885 69906 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
// Restrict the work to the one-byte part of the range.
5886 1343 : if (bottom > String::kMaxOneByteCharCode) continue;
5887 1243 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5888 : }
5889 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5890 69806 : if (top == bottom) {
5891 : // If this is a singleton we just expand the one character.
5892 4608 : int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
// Note: this |i| shadows the index of the enclosing loop.
5893 7312 : for (int i = 0; i < length; i++) {
5894 2704 : uc32 chr = chars[i];
5895 2704 : if (chr != bottom) {
5896 1382 : ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5897 : }
5898 : }
5899 : } else {
5900 : // If this is a range we expand the characters block by block, expanding
5901 : // contiguous subranges (blocks) one at a time. The approach is as
5902 : // follows. For a given start character we look up the remainder of the
5903 : // block that contains it (represented by the end point), for instance we
5904 : // find 'z' if the character is 'c'. A block is characterized by the
5905 : // property that all characters uncanonicalize in the same way, except
5906 : // that each entry in the result is incremented by the distance from the
5907 : // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5908 : // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5909 : // we've found the end point we look up its uncanonicalization and
5910 : // produce a range for each element. For instance for [c-f] we look up
5911 : // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5912 : // it is not already contained in the input, so [c-f] will be skipped but
5913 : // [C-F] will be added. If this range is not completely contained in a
5914 : // block we do this for all the blocks covered by the range (handling
5915 : // characters that are not in a block as a "singleton block").
5916 : unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5917 : int pos = bottom;
5918 7796607 : while (pos <= top) {
5919 : int length =
5920 7731409 : isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5921 : uc32 block_end;
5922 7731409 : if (length == 0) {
// No block information for |pos|: treat it as a block of its own.
5923 : block_end = pos;
5924 : } else {
5925 : DCHECK_EQ(1, length);
5926 6349 : block_end = equivalents[0];
5927 : }
// Last position of the current block that still lies inside [bottom, top].
5928 7731409 : int end = (block_end > top) ? top : block_end;
5929 : length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5930 7731409 : equivalents);
5931 8064891 : for (int i = 0; i < length; i++) {
5932 333482 : uc32 c = equivalents[i];
// Shift the equivalents of the block end back so that they line up with
// [pos, end].
5933 333482 : uc32 range_from = c - (block_end - pos);
5934 333482 : uc32 range_to = c - (block_end - end);
// Only add the range if it is not already contained in [bottom, top].
5935 333482 : if (!(bottom <= range_from && range_to <= top)) {
5936 6669 : ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5937 : }
5938 : }
5939 7731409 : pos = end + 1;
5940 : }
5941 : }
5942 : }
5943 66955 : }
5944 :
5945 :
5946 10 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5947 : DCHECK_NOT_NULL(ranges);
5948 : int n = ranges->length();
5949 10 : if (n <= 1) return true;
5950 10 : int max = ranges->at(0).to();
5951 300 : for (int i = 1; i < n; i++) {
5952 290 : CharacterRange next_range = ranges->at(i);
5953 290 : if (next_range.from() <= max + 1) return false;
5954 : max = next_range.to();
5955 : }
5956 : return true;
5957 : }
5958 :
5959 :
5960 1899986 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5961 1899986 : if (ranges_ == nullptr) {
5962 82867 : ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5963 82867 : CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5964 : }
5965 1899986 : return ranges_;
5966 : }
5967 :
5968 :
5969 : // Move a number of elements in a zonelist to another position
5970 : // in the same list. Handles overlapping source and target areas.
5971 92240 : static void MoveRanges(ZoneList<CharacterRange>* list,
5972 : int from,
5973 : int to,
5974 : int count) {
5975 : // Ranges are potentially overlapping.
5976 92240 : if (from < to) {
5977 10119743 : for (int i = count - 1; i >= 0; i--) {
5978 30117897 : list->at(to + i) = list->at(from + i);
5979 : }
5980 : } else {
5981 3599230 : for (int i = 0; i < count; i++) {
5982 10797690 : list->at(to + i) = list->at(from + i);
5983 : }
5984 : }
5985 92240 : }
5986 :
5987 :
5988 173178 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5989 : int count,
5990 : CharacterRange insert) {
5991 : // Inserts a range into list[0..count[, which must be sorted
5992 : // by from value and non-overlapping and non-adjacent, using at most
5993 : // list[0..count] for the result. Returns the number of resulting
5994 : // canonicalized ranges. Inserting a range may collapse existing ranges into
5995 : // fewer ranges, so the return value can be anything in the range 1..count+1.
5996 173178 : uc32 from = insert.from();
5997 173178 : uc32 to = insert.to();
5998 : int start_pos = 0;
5999 : int end_pos = count;
6000 18092819 : for (int i = count - 1; i >= 0; i--) {
6001 18009272 : CharacterRange current = list->at(i);
6002 18009272 : if (current.from() > to + 1) {
6003 : end_pos = i;
6004 142257 : } else if (current.to() + 1 < from) {
6005 89631 : start_pos = i + 1;
6006 : break;
6007 : }
6008 : }
6009 :
6010 : // Inserted range overlaps, or is adjacent to, ranges at positions
6011 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6012 : // not affected by the insertion.
6013 : // If start_pos == end_pos, the range must be inserted before start_pos.
6014 : // if start_pos < end_pos, the entire range from start_pos to end_pos
6015 : // must be merged with the insert range.
6016 :
6017 173178 : if (start_pos == end_pos) {
6018 : // Insert between existing ranges at position start_pos.
6019 132737 : if (start_pos < count) {
6020 80444 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6021 : }
6022 132737 : list->at(start_pos) = insert;
6023 132737 : return count + 1;
6024 : }
6025 40441 : if (start_pos + 1 == end_pos) {
6026 : // Replace single existing range at position start_pos.
6027 28501 : CharacterRange to_replace = list->at(start_pos);
6028 : int new_from = Min(to_replace.from(), from);
6029 : int new_to = Max(to_replace.to(), to);
6030 28501 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6031 : return count;
6032 : }
6033 : // Replace a number of existing ranges from start_pos to end_pos - 1.
6034 : // Move the remaining ranges down.
6035 :
6036 11940 : int new_from = Min(list->at(start_pos).from(), from);
6037 23880 : int new_to = Max(list->at(end_pos - 1).to(), to);
6038 11940 : if (end_pos < count) {
6039 11796 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6040 : }
6041 11940 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6042 11940 : return count - (end_pos - start_pos) + 1;
6043 : }
6044 :
6045 :
6046 20 : void CharacterSet::Canonicalize() {
6047 : // Special/default classes are always considered canonical. The result
6048 : // of calling ranges() will be sorted.
6049 175948 : if (ranges_ == nullptr) return;
6050 93304 : CharacterRange::Canonicalize(ranges_);
6051 : }
6052 :
6053 :
6054 497118 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6055 497118 : if (character_ranges->length() <= 1) return;
6056 : // Check whether ranges are already canonical (increasing, non-overlapping,
6057 : // non-adjacent).
6058 : int n = character_ranges->length();
6059 63837 : int max = character_ranges->at(0).to();
6060 : int i = 1;
6061 1405638 : while (i < n) {
6062 1287050 : CharacterRange current = character_ranges->at(i);
6063 1287050 : if (current.from() <= max + 1) {
6064 : break;
6065 : }
6066 : max = current.to();
6067 1277964 : i++;
6068 : }
6069 : // Canonical until the i'th range. If that's all of them, we are done.
6070 63837 : if (i == n) return;
6071 :
6072 : // The ranges at index i and forward are not canonicalized. Make them so by
6073 : // doing the equivalent of insertion sort (inserting each into the previous
6074 : // list, in order).
6075 : // Notice that inserting a range can reduce the number of ranges in the
6076 : // result due to combining of adjacent and overlapping ranges.
6077 : int read = i; // Range to insert.
6078 : int num_canonical = i; // Length of canonicalized part of list.
6079 173178 : do {
6080 : num_canonical = InsertRangeInCanonicalList(character_ranges,
6081 : num_canonical,
6082 173178 : character_ranges->at(read));
6083 173178 : read++;
6084 : } while (read < n);
6085 : character_ranges->Rewind(num_canonical);
6086 :
6087 : DCHECK(CharacterRange::IsCanonical(character_ranges));
6088 : }
6089 :
6090 :
6091 230 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6092 : ZoneList<CharacterRange>* negated_ranges,
6093 : Zone* zone) {
6094 : DCHECK(CharacterRange::IsCanonical(ranges));
6095 : DCHECK_EQ(0, negated_ranges->length());
6096 : int range_count = ranges->length();
6097 : uc32 from = 0;
6098 : int i = 0;
6099 460 : if (range_count > 0 && ranges->at(0).from() == 0) {
6100 40 : from = ranges->at(0).to() + 1;
6101 : i = 1;
6102 : }
6103 7480 : while (i < range_count) {
6104 7250 : CharacterRange range = ranges->at(i);
6105 7250 : negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6106 7250 : from = range.to() + 1;
6107 7250 : i++;
6108 : }
6109 230 : if (from < String::kMaxCodePoint) {
6110 : negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6111 180 : zone);
6112 : }
6113 230 : }
6114 :
6115 :
6116 : // -------------------------------------------------------------------
6117 : // Splay tree
6118 :
6119 :
6120 490705 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6121 236859 : if (Get(value))
6122 : return this;
6123 236854 : if (successors(zone) != nullptr) {
6124 194225 : for (int i = 0; i < successors(zone)->length(); i++) {
6125 414087 : OutSet* successor = successors(zone)->at(i);
6126 414087 : if (successor->Get(value))
6127 : return successor;
6128 : }
6129 : } else {
6130 5694 : successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6131 : }
6132 33984 : OutSet* result = new(zone) OutSet(first_, remaining_);
6133 16992 : result->Set(value, zone);
6134 16992 : successors(zone)->Add(result, zone);
6135 16992 : return result;
6136 : }
6137 :
6138 :
6139 712186 : void OutSet::Set(unsigned value, Zone *zone) {
6140 712186 : if (value < kFirstLimit) {
6141 387552 : first_ |= (1 << value);
6142 : } else {
6143 889320 : if (remaining_ == nullptr)
6144 84582 : remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6145 889320 : if (remaining_->is_empty() || !remaining_->Contains(value))
6146 323584 : remaining_->Add(value, zone);
6147 : }
6148 712186 : }
6149 :
6150 :
6151 31126350 : bool OutSet::Get(unsigned value) const {
6152 31126350 : if (value < kFirstLimit) {
6153 6624863 : return (first_ & (1 << value)) != 0;
6154 24501487 : } else if (remaining_ == nullptr) {
6155 : return false;
6156 : } else {
6157 16427384 : return remaining_->Contains(value);
6158 : }
6159 : }
6160 :
6161 :
// Out-of-class definition of the dispatch table's "no key" constant; it is
// set to unibrow::Utf8::kBadChar.
// NOTE(review): presumably the key value the splay tree configuration treats
// as "absent" - confirm against ZoneSplayTree<Config>.
6162 : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6163 :
6164 :
// Records |value| for every character in |full_range|. Table entries are
// keyed by their start character in a splay tree. Existing entries that
// partially overlap the new range are split at the range's boundaries, so
// that each entry ends up either completely inside the new range (and gets
// |value| added to its out-set) or completely outside it. Parts of the range
// not covered by any entry get new entries whose out-set is the empty set
// extended by |value|.
6165 88433 : void DispatchTable::AddRange(CharacterRange full_range, int value,
6166 : Zone* zone) {
// |current| is the part of the range that still has to be processed; its
// start is advanced as entries are handled.
6167 88433 : CharacterRange current = full_range;
6168 88433 : if (tree()->is_empty()) {
6169 : // If this is the first range we just insert into the table.
6170 : ZoneSplayTree<Config>::Locator loc;
6171 2647 : bool inserted = tree()->Insert(current.from(), &loc);
6172 : DCHECK(inserted);
6173 : USE(inserted);
6174 : loc.set_value(Entry(current.from(), current.to(),
6175 2647 : empty()->Extend(value, zone)));
6176 88433 : return;
6177 : }
6178 : // First see if there is a range to the left of this one that
6179 : // overlaps.
6180 : ZoneSplayTree<Config>::Locator loc;
6181 85786 : if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6182 163758 : Entry* entry = &loc.value();
6183 : // If we've found a range that overlaps with this one, and it
6184 : // starts strictly to the left of this one, we have to fix it
6185 : // because the following code only handles ranges that start on
6186 : // or after the start point of the range we're adding.
6187 162958 : if (entry->from() < current.from() && entry->to() >= current.from()) {
6188 : // Snap the overlapping range in half around the start point of
6189 : // the range we're adding.
6190 : CharacterRange left =
6191 400 : CharacterRange::Range(entry->from(), current.from() - 1);
6192 : CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6193 : // The left part of the overlapping range doesn't overlap.
6194 : // Truncate the whole entry to be just the left part.
6195 : entry->set_to(left.to());
6196 : // The right part is the one that overlaps. We add this part
6197 : // to the map and let the next step deal with merging it with
6198 : // the range we're adding.
// Note: this locator shadows the outer |loc|; |entry| still refers to the
// truncated left entry and supplies the out-set for the right part.
6199 : ZoneSplayTree<Config>::Locator loc;
6200 400 : bool inserted = tree()->Insert(right.from(), &loc);
6201 : DCHECK(inserted);
6202 : USE(inserted);
6203 : loc.set_value(Entry(right.from(),
6204 : right.to(),
6205 : entry->out_set()));
6206 : }
6207 : }
// Walk over the remaining range from left to right, one existing entry (or
// one uncovered gap) at a time.
6208 166674 : while (current.is_valid()) {
6209 406291 : if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6210 326065 : (loc.value().from() <= current.to()) &&
6211 80888 : (loc.value().to() >= current.from())) {
6212 320192 : Entry* entry = &loc.value();
6213 : // We have overlap. If there is space between the start point of
6214 : // the range we're adding and where the overlapping range starts
6215 : // then we have to add a range covering just that space.
6216 80888 : if (current.from() < entry->from()) {
6217 : ZoneSplayTree<Config>::Locator ins;
6218 73098 : bool inserted = tree()->Insert(current.from(), &ins);
6219 : DCHECK(inserted);
6220 : USE(inserted);
6221 : ins.set_value(Entry(current.from(),
6222 : entry->from() - 1,
6223 146196 : empty()->Extend(value, zone)));
6224 : current.set_from(entry->from());
6225 : }
6226 : DCHECK_EQ(current.from(), entry->from());
6227 : // If the overlapping range extends beyond the one we want to add
6228 : // we have to snap the right part off and add it separately.
6229 80888 : if (entry->to() > current.to()) {
6230 : ZoneSplayTree<Config>::Locator ins;
6231 4430 : bool inserted = tree()->Insert(current.to() + 1, &ins);
6232 : DCHECK(inserted);
6233 : USE(inserted);
// The snapped-off right part keeps the entry's current out-set, i.e. without
// |value|, which is only added to |entry| below.
6234 : ins.set_value(Entry(current.to() + 1,
6235 : entry->to(),
6236 : entry->out_set()));
6237 : entry->set_to(current.to());
6238 : }
6239 : DCHECK(entry->to() <= current.to());
6240 : // The overlapping range is now completely contained by the range
6241 : // we're adding so we can just update it and move the start point
6242 : // of the range we're adding just past it.
6243 : entry->AddValue(value, zone);
6244 : DCHECK(entry->to() + 1 > current.from());
6245 80888 : current.set_from(entry->to() + 1);
6246 : } else {
6247 : // There is no overlap so we can just add the range
6248 : ZoneSplayTree<Config>::Locator ins;
6249 80226 : bool inserted = tree()->Insert(current.from(), &ins);
6250 : DCHECK(inserted);
6251 : USE(inserted);
6252 : ins.set_value(Entry(current.from(),
6253 : current.to(),
6254 80226 : empty()->Extend(value, zone)));
6255 : break;
6256 : }
6257 : }
6258 : }
6259 :
6260 :
6261 55010 : OutSet* DispatchTable::Get(uc32 value) {
6262 : ZoneSplayTree<Config>::Locator loc;
6263 55010 : if (!tree()->FindGreatestLessThan(value, &loc))
6264 0 : return empty();
6265 93895 : Entry* entry = &loc.value();
6266 55010 : if (value <= entry->to())
6267 38885 : return entry->out_set();
6268 : else
6269 16125 : return empty();
6270 : }
6271 :
6272 :
6273 : // -------------------------------------------------------------------
6274 : // Analysis
6275 :
6276 :
6277 1080011 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
6278 : StackLimitCheck check(isolate());
6279 1080011 : if (check.HasOverflowed()) {
6280 : fail("Stack overflow");
6281 : return;
6282 : }
6283 1079701 : if (that->info()->been_analyzed || that->info()->being_analyzed)
6284 : return;
6285 877180 : that->info()->being_analyzed = true;
6286 877180 : that->Accept(this);
6287 877180 : that->info()->being_analyzed = false;
6288 877180 : that->info()->been_analyzed = true;
6289 : }
6290 :
6291 :
6292 88029 : void Analysis::VisitEnd(EndNode* that) {
6293 : // nothing to do
6294 88029 : }
6295 :
6296 :
6297 684851 : void TextNode::CalculateOffsets() {
6298 314145 : int element_count = elements()->length();
6299 : // Set up the offsets of the elements relative to the start. This is a fixed
6300 : // quantity since a TextNode can only contain fixed-width things.
6301 : int cp_offset = 0;
6302 684851 : for (int i = 0; i < element_count; i++) {
6303 : TextElement& elm = elements()->at(i);
6304 : elm.set_cp_offset(cp_offset);
6305 370706 : cp_offset += elm.length();
6306 : }
6307 314145 : }
6308 :
6309 :
6310 948060 : void Analysis::VisitText(TextNode* that) {
6311 632040 : that->MakeCaseIndependent(isolate(), is_one_byte_);
6312 316020 : EnsureAnalyzed(that->on_success());
6313 316020 : if (!has_failed()) {
6314 314145 : that->CalculateOffsets();
6315 : }
6316 316020 : }
6317 :
6318 :
6319 579724 : void Analysis::VisitAction(ActionNode* that) {
6320 289862 : RegExpNode* target = that->on_success();
6321 289862 : EnsureAnalyzed(target);
6322 289862 : if (!has_failed()) {
6323 : // If the next node is interested in what it follows then this node
6324 : // has to be interested too so it can pass the information on.
6325 : that->info()->AddFromFollowing(target->info());
6326 : }
6327 289862 : }
6328 :
6329 :
6330 317114 : void Analysis::VisitChoice(ChoiceNode* that) {
6331 : NodeInfo* info = that->info();
6332 317114 : for (int i = 0; i < that->alternatives()->length(); i++) {
6333 132810 : RegExpNode* node = that->alternatives()->at(i).node();
6334 132810 : EnsureAnalyzed(node);
6335 158557 : if (has_failed()) return;
6336 : // Anything the following nodes need to know has to be known by
6337 : // this node also, so it can pass it on.
6338 : info->AddFromFollowing(node->info());
6339 : }
6340 : }
6341 :
6342 :
6343 893890 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6344 : NodeInfo* info = that->info();
6345 795460 : for (int i = 0; i < that->alternatives()->length(); i++) {
6346 697200 : RegExpNode* node = that->alternatives()->at(i).node();
6347 299470 : if (node != that->loop_node()) {
6348 149820 : EnsureAnalyzed(node);
6349 299640 : if (has_failed()) return;
6350 : info->AddFromFollowing(node->info());
6351 : }
6352 : }
6353 : // Check the loop last since it may need the value of this node
6354 : // to get a correct result.
6355 98260 : EnsureAnalyzed(that->loop_node());
6356 98260 : if (!has_failed()) {
6357 : info->AddFromFollowing(that->loop_node()->info());
6358 : }
6359 : }
6360 :
6361 :
6362 2310 : void Analysis::VisitBackReference(BackReferenceNode* that) {
6363 2310 : EnsureAnalyzed(that->on_success());
6364 2310 : }
6365 :
6366 :
6367 5392 : void Analysis::VisitAssertion(AssertionNode* that) {
6368 5392 : EnsureAnalyzed(that->on_success());
6369 5392 : }
6370 :
6371 :
6372 188 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6373 : BoyerMooreLookahead* bm,
6374 : bool not_at_start) {
6375 : // Working out the set of characters that a backreference can match is too
6376 : // hard, so we just say that any character can match.
6377 : bm->SetRest(offset);
6378 : SaveBMInfo(bm, not_at_start, offset);
6379 188 : }
6380 :
6381 :
6382 : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6383 : RegExpMacroAssembler::kTableSize);
6384 :
6385 :
6386 7705 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6387 7705 : BoyerMooreLookahead* bm, bool not_at_start) {
6388 55106 : ZoneList<GuardedAlternative>* alts = alternatives();
6389 15410 : budget = (budget - 1) / alts->length();
6390 94802 : for (int i = 0; i < alts->length(); i++) {
6391 79627 : GuardedAlternative& alt = alts->at(i);
6392 39931 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6393 : bm->SetRest(offset); // Give up trying to fill in info.
6394 : SaveBMInfo(bm, not_at_start, offset);
6395 7705 : return;
6396 : }
6397 39696 : alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6398 : }
6399 : SaveBMInfo(bm, not_at_start, offset);
6400 : }
6401 :
6402 :
6403 121520 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6404 1090963 : BoyerMooreLookahead* bm, bool not_at_start) {
6405 121520 : if (initial_offset >= bm->length()) return;
6406 : int offset = initial_offset;
6407 : int max_char = bm->max_char();
6408 516116 : for (int i = 0; i < elements()->length(); i++) {
6409 158635 : if (offset >= bm->length()) {
6410 114306 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6411 : return;
6412 : }
6413 142518 : TextElement text = elements()->at(i);
6414 142518 : if (text.text_type() == TextElement::ATOM) {
6415 : RegExpAtom* atom = text.atom();
6416 195547 : for (int j = 0; j < atom->length(); j++, offset++) {
6417 81589 : if (offset >= bm->length()) {
6418 5980 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6419 : return;
6420 : }
6421 151218 : uc16 character = atom->data()[j];
6422 75609 : if (IgnoreCase(atom->flags())) {
6423 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6424 : int length = GetCaseIndependentLetters(
6425 : isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6426 4614 : chars);
6427 13200 : for (int j = 0; j < length; j++) {
6428 17172 : bm->Set(offset, chars[j]);
6429 : }
6430 : } else {
6431 141990 : if (character <= max_char) bm->Set(offset, character);
6432 : }
6433 : }
6434 : } else {
6435 : DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6436 : RegExpCharacterClass* char_class = text.char_class();
6437 382501 : ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6438 98189 : if (char_class->is_negated()) {
6439 4379 : bm->SetAll(offset);
6440 : } else {
6441 671192 : for (int k = 0; k < ranges->length(); k++) {
6442 450335 : CharacterRange& range = ranges->at(k);
6443 288691 : if (range.from() > max_char) continue;
6444 : int to = Min(max_char, static_cast<int>(range.to()));
6445 161644 : bm->SetInterval(offset, Interval(range.from(), to));
6446 : }
6447 : }
6448 98189 : offset++;
6449 : }
6450 : }
6451 99423 : if (offset >= bm->length()) {
6452 90184 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6453 : return;
6454 : }
6455 9239 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6456 9239 : true); // Not at start after a text node.
6457 9239 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6458 : }
6459 :
6460 :
6461 : // -------------------------------------------------------------------
6462 : // Dispatch table construction
6463 :
6464 :
6465 0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
6466 : AddRange(CharacterRange::Everything());
6467 0 : }
6468 :
6469 :
6470 0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6471 : node->set_being_calculated(true);
6472 0 : ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6473 0 : for (int i = 0; i < alternatives->length(); i++) {
6474 : set_choice_index(i);
6475 0 : alternatives->at(i).node()->Accept(this);
6476 : }
6477 : node->set_being_calculated(false);
6478 0 : }
6479 :
6480 :
6481 : class AddDispatchRange {
6482 : public:
6483 : explicit AddDispatchRange(DispatchTableConstructor* constructor)
6484 0 : : constructor_(constructor) { }
6485 : void Call(uc32 from, DispatchTable::Entry entry);
6486 : private:
6487 : DispatchTableConstructor* constructor_;
6488 : };
6489 :
6490 :
6491 0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6492 0 : constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6493 0 : }
6494 :
6495 :
6496 0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6497 0 : if (node->being_calculated())
6498 0 : return;
6499 0 : DispatchTable* table = node->GetTable(ignore_case_);
6500 : AddDispatchRange adder(this);
6501 : table->ForEach(&adder);
6502 : }
6503 :
6504 :
6505 0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6506 : // TODO(160): Find the node that we refer back to and propagate its start
6507 : // set back to here. For now we just accept anything.
6508 : AddRange(CharacterRange::Everything());
6509 0 : }
6510 :
6511 :
6512 0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6513 0 : RegExpNode* target = that->on_success();
6514 0 : target->Accept(this);
6515 0 : }
6516 :
6517 :
6518 7870 : static int CompareRangeByFrom(const CharacterRange* a,
6519 3935 : const CharacterRange* b) {
6520 11805 : return Compare<uc16>(a->from(), b->from());
6521 : }
6522 :
6523 :
6524 915 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6525 : ranges->Sort(CompareRangeByFrom);
6526 : uc16 last = 0;
6527 1720 : for (int i = 0; i < ranges->length(); i++) {
6528 805 : CharacterRange range = ranges->at(i);
6529 805 : if (last < range.from())
6530 525 : AddRange(CharacterRange::Range(last, range.from() - 1));
6531 805 : if (range.to() >= last) {
6532 715 : if (range.to() == String::kMaxCodePoint) {
6533 55 : return;
6534 : } else {
6535 715 : last = range.to() + 1;
6536 : }
6537 : }
6538 : }
6539 55 : AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6540 : }
6541 :
6542 :
6543 0 : void DispatchTableConstructor::VisitText(TextNode* that) {
6544 0 : TextElement elm = that->elements()->at(0);
6545 0 : switch (elm.text_type()) {
6546 : case TextElement::ATOM: {
6547 0 : uc16 c = elm.atom()->data()[0];
6548 0 : AddRange(CharacterRange::Range(c, c));
6549 : break;
6550 : }
6551 : case TextElement::CHAR_CLASS: {
6552 : RegExpCharacterClass* tree = elm.char_class();
6553 0 : ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6554 0 : if (tree->is_negated()) {
6555 0 : AddInverse(ranges);
6556 : } else {
6557 0 : for (int i = 0; i < ranges->length(); i++)
6558 : AddRange(ranges->at(i));
6559 : }
6560 : break;
6561 : }
6562 : default: {
6563 0 : UNIMPLEMENTED();
6564 : }
6565 : }
6566 0 : }
6567 :
6568 :
6569 0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
6570 0 : RegExpNode* target = that->on_success();
6571 0 : target->Accept(this);
6572 0 : }
6573 :
6574 40 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6575 : RegExpNode* on_success,
6576 : JSRegExp::Flags flags) {
6577 : // If the regexp matching starts within a surrogate pair, step back
6578 : // to the lead surrogate and start matching from there.
6579 : DCHECK(!compiler->read_backward());
6580 : Zone* zone = compiler->zone();
6581 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6582 40 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6583 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6584 40 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6585 :
6586 40 : ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6587 :
6588 : int stack_register = compiler->UnicodeLookaroundStackRegister();
6589 : int position_register = compiler->UnicodeLookaroundPositionRegister();
6590 : RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6591 40 : zone, lead_surrogates, true, on_success, flags);
6592 : RegExpLookaround::Builder builder(true, step_back, stack_register,
6593 40 : position_register);
6594 : RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6595 40 : zone, trail_surrogates, false, builder.on_match_success(), flags);
6596 :
6597 : optional_step_back->AddAlternative(
6598 40 : GuardedAlternative(builder.ForMatch(match_trail)));
6599 : optional_step_back->AddAlternative(GuardedAlternative(on_success));
6600 :
6601 40 : return optional_step_back;
6602 : }
6603 :
6604 :
6605 85546 : RegExpEngine::CompilationResult RegExpEngine::Compile(
6606 : Isolate* isolate, Zone* zone, RegExpCompileData* data,
6607 : JSRegExp::Flags flags, Handle<String> pattern,
6608 : Handle<String> sample_subject, bool is_one_byte) {
6609 85546 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6610 : return IrregexpRegExpTooBig(isolate);
6611 : }
6612 : bool is_sticky = IsSticky(flags);
6613 : bool is_global = IsGlobal(flags);
6614 : bool is_unicode = IsUnicode(flags);
6615 85537 : RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6616 :
6617 85537 : if (compiler.optimize())
6618 84372 : compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6619 :
6620 : // Sample some characters from the middle of the string.
6621 : static const int kSampleSize = 128;
6622 :
6623 85537 : sample_subject = String::Flatten(isolate, sample_subject);
6624 : int chars_sampled = 0;
6625 85537 : int half_way = (sample_subject->length() - kSampleSize) / 2;
6626 1078204 : for (int i = Max(0, half_way);
6627 539102 : i < sample_subject->length() && chars_sampled < kSampleSize;
6628 : i++, chars_sampled++) {
6629 907130 : compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6630 : }
6631 :
6632 : // Wrap the body of the regexp in capture #0.
6633 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6634 : 0,
6635 : &compiler,
6636 85537 : compiler.accept());
6637 : RegExpNode* node = captured_body;
6638 85537 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6639 85537 : bool is_start_anchored = data->tree->IsAnchoredAtStart();
6640 85537 : int max_length = data->tree->max_match();
6641 85537 : if (!is_start_anchored && !is_sticky) {
6642 : // Add a .*? at the beginning, outside the body capture, unless
6643 : // this expression is anchored at the beginning or sticky.
6644 : JSRegExp::Flags default_flags = JSRegExp::Flags();
6645 : RegExpNode* loop_node = RegExpQuantifier::ToNode(
6646 : 0, RegExpTree::kInfinity, false,
6647 : new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6648 165248 : captured_body, data->contains_anchor);
6649 :
6650 82624 : if (data->contains_anchor) {
6651 : // Unroll loop once, to take care of the case that might start
6652 : // at the start of input.
6653 149 : ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6654 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
6655 : first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6656 : new (zone) RegExpCharacterClass('*', default_flags), false,
6657 149 : loop_node)));
6658 : node = first_step_node;
6659 : } else {
6660 : node = loop_node;
6661 : }
6662 : }
6663 85537 : if (is_one_byte) {
6664 14492 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6665 : // Do it again to propagate the new nodes to places where they were not
6666 : // put because they had not been calculated yet.
6667 14492 : if (node != nullptr) {
6668 14192 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6669 : }
6670 71045 : } else if (is_unicode && (is_global || is_sticky)) {
6671 40 : node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6672 : }
6673 :
6674 85537 : if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6675 85537 : data->node = node;
6676 : Analysis analysis(isolate, is_one_byte);
6677 85537 : analysis.EnsureAnalyzed(node);
6678 85537 : if (analysis.has_failed()) {
6679 : const char* error_message = analysis.error_message();
6680 310 : return CompilationResult(isolate, error_message);
6681 : }
6682 :
6683 : // Create the correct assembler for the architecture.
6684 : std::unique_ptr<RegExpMacroAssembler> macro_assembler;
6685 85227 : if (!FLAG_regexp_interpret_all) {
6686 : // Native regexp implementation.
6687 : DCHECK(!FLAG_jitless);
6688 :
6689 : NativeRegExpMacroAssembler::Mode mode =
6690 : is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6691 81986 : : NativeRegExpMacroAssembler::UC16;
6692 :
6693 : #if V8_TARGET_ARCH_IA32
6694 : macro_assembler.reset(new RegExpMacroAssemblerIA32(
6695 : isolate, zone, mode, (data->capture_count + 1) * 2));
6696 : #elif V8_TARGET_ARCH_X64
6697 : macro_assembler.reset(new RegExpMacroAssemblerX64(
6698 81986 : isolate, zone, mode, (data->capture_count + 1) * 2));
6699 : #elif V8_TARGET_ARCH_ARM
6700 : macro_assembler.reset(new RegExpMacroAssemblerARM(
6701 : isolate, zone, mode, (data->capture_count + 1) * 2));
6702 : #elif V8_TARGET_ARCH_ARM64
6703 : macro_assembler.reset(new RegExpMacroAssemblerARM64(
6704 : isolate, zone, mode, (data->capture_count + 1) * 2));
6705 : #elif V8_TARGET_ARCH_S390
6706 : macro_assembler.reset(new RegExpMacroAssemblerS390(
6707 : isolate, zone, mode, (data->capture_count + 1) * 2));
6708 : #elif V8_TARGET_ARCH_PPC
6709 : macro_assembler.reset(new RegExpMacroAssemblerPPC(
6710 : isolate, zone, mode, (data->capture_count + 1) * 2));
6711 : #elif V8_TARGET_ARCH_MIPS
6712 : macro_assembler.reset(new RegExpMacroAssemblerMIPS(
6713 : isolate, zone, mode, (data->capture_count + 1) * 2));
6714 : #elif V8_TARGET_ARCH_MIPS64
6715 : macro_assembler.reset(new RegExpMacroAssemblerMIPS(
6716 : isolate, zone, mode, (data->capture_count + 1) * 2));
6717 : #else
6718 : #error "Unsupported architecture"
6719 : #endif
6720 : } else {
6721 : DCHECK(FLAG_regexp_interpret_all);
6722 :
6723 : // Interpreted regexp implementation.
6724 3241 : macro_assembler.reset(new RegExpMacroAssemblerIrregexp(isolate, zone));
6725 : }
6726 :
6727 85227 : macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6728 :
6729 : // Inserted here, instead of in Assembler, because it depends on information
6730 : // in the AST that isn't replicated in the Node structure.
6731 : static const int kMaxBacksearchLimit = 1024;
6732 85769 : if (is_end_anchored && !is_start_anchored && !is_sticky &&
6733 542 : max_length < kMaxBacksearchLimit) {
6734 210 : macro_assembler->SetCurrentPositionFromEnd(max_length);
6735 : }
6736 :
6737 85227 : if (is_global) {
6738 : RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6739 3819 : if (data->tree->min_match() > 0) {
6740 : mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6741 138 : } else if (is_unicode) {
6742 : mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6743 : }
6744 : macro_assembler->set_global_mode(mode);
6745 : }
6746 :
6747 : return compiler.Assemble(isolate, macro_assembler.get(), node,
6748 85227 : data->capture_count, pattern);
6749 : }
6750 :
6751 339198 : bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6752 169599 : Heap* heap = isolate->heap();
6753 169599 : bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6754 169599 : if (isolate->total_regexp_code_generated() >
6755 298152 : RegExpImpl::kRegExpCompiledLimit &&
6756 128553 : heap->CommittedMemoryExecutable() >
6757 : RegExpImpl::kRegExpExecutableMemoryLimit) {
6758 : too_much = true;
6759 : }
6760 169599 : return too_much;
6761 : }
6762 :
6763 36142 : Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
6764 : Object key_pattern,
6765 : FixedArray* last_match_cache,
6766 : ResultsCacheType type) {
6767 : FixedArray cache;
6768 36142 : if (!key_string->IsInternalizedString()) return Smi::kZero;
6769 5230 : if (type == STRING_SPLIT_SUBSTRINGS) {
6770 : DCHECK(key_pattern->IsString());
6771 5230 : if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6772 5230 : cache = heap->string_split_cache();
6773 : } else {
6774 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6775 : DCHECK(key_pattern->IsFixedArray());
6776 0 : cache = heap->regexp_multiple_cache();
6777 : }
6778 :
6779 5230 : uint32_t hash = key_string->Hash();
6780 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6781 5230 : ~(kArrayEntriesPerCacheEntry - 1));
6782 14339 : if (cache->get(index + kStringOffset) != key_string ||
6783 3879 : cache->get(index + kPatternOffset) != key_pattern) {
6784 : index =
6785 1382 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6786 2855 : if (cache->get(index + kStringOffset) != key_string ||
6787 91 : cache->get(index + kPatternOffset) != key_pattern) {
6788 1302 : return Smi::kZero;
6789 : }
6790 : }
6791 :
6792 7856 : *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6793 3928 : return cache->get(index + kArrayOffset);
6794 : }
6795 :
6796 32214 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6797 : Handle<Object> key_pattern,
6798 : Handle<FixedArray> value_array,
6799 : Handle<FixedArray> last_match_cache,
6800 : ResultsCacheType type) {
6801 : Factory* factory = isolate->factory();
6802 : Handle<FixedArray> cache;
6803 64428 : if (!key_string->IsInternalizedString()) return;
6804 1302 : if (type == STRING_SPLIT_SUBSTRINGS) {
6805 : DCHECK(key_pattern->IsString());
6806 2604 : if (!key_pattern->IsInternalizedString()) return;
6807 : cache = factory->string_split_cache();
6808 : } else {
6809 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6810 : DCHECK(key_pattern->IsFixedArray());
6811 : cache = factory->regexp_multiple_cache();
6812 : }
6813 :
6814 1302 : uint32_t hash = key_string->Hash();
6815 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6816 1302 : ~(kArrayEntriesPerCacheEntry - 1));
6817 2604 : if (cache->get(index + kStringOffset) == Smi::kZero) {
6818 2306 : cache->set(index + kStringOffset, *key_string);
6819 2306 : cache->set(index + kPatternOffset, *key_pattern);
6820 2306 : cache->set(index + kArrayOffset, *value_array);
6821 2306 : cache->set(index + kLastMatchOffset, *last_match_cache);
6822 : } else {
6823 : uint32_t index2 =
6824 149 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6825 298 : if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6826 194 : cache->set(index2 + kStringOffset, *key_string);
6827 194 : cache->set(index2 + kPatternOffset, *key_pattern);
6828 194 : cache->set(index2 + kArrayOffset, *value_array);
6829 194 : cache->set(index2 + kLastMatchOffset, *last_match_cache);
6830 : } else {
6831 52 : cache->set(index2 + kStringOffset, Smi::kZero);
6832 104 : cache->set(index2 + kPatternOffset, Smi::kZero);
6833 104 : cache->set(index2 + kArrayOffset, Smi::kZero);
6834 104 : cache->set(index2 + kLastMatchOffset, Smi::kZero);
6835 104 : cache->set(index + kStringOffset, *key_string);
6836 104 : cache->set(index + kPatternOffset, *key_pattern);
6837 104 : cache->set(index + kArrayOffset, *value_array);
6838 104 : cache->set(index + kLastMatchOffset, *last_match_cache);
6839 : }
6840 : }
6841 : // If the array is a reasonably short list of substrings, convert it into a
6842 : // list of internalized strings.
6843 2604 : if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6844 13203 : for (int i = 0; i < value_array->length(); i++) {
6845 : Handle<String> str(String::cast(value_array->get(i)), isolate);
6846 5968 : Handle<String> internalized_str = factory->InternalizeString(str);
6847 11936 : value_array->set(i, *internalized_str);
6848 : }
6849 : }
6850 : // Convert backing store to a copy-on-write array.
6851 : value_array->set_map_no_write_barrier(
6852 : ReadOnlyRoots(isolate).fixed_cow_array_map());
6853 : }
6854 :
6855 149020 : void RegExpResultsCache::Clear(FixedArray cache) {
6856 38298140 : for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6857 38149120 : cache->set(i, Smi::kZero);
6858 : }
6859 149020 : }
6860 :
6861 : } // namespace internal
6862 178779 : } // namespace v8
|