Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/jsregexp.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "src/base/platform/platform.h"
11 : #include "src/code-tracer.h"
12 : #include "src/compilation-cache.h"
13 : #include "src/elements.h"
14 : #include "src/execution.h"
15 : #include "src/heap/factory.h"
16 : #include "src/isolate-inl.h"
17 : #include "src/message-template.h"
18 : #include "src/ostreams.h"
19 : #include "src/regexp/interpreter-irregexp.h"
20 : #include "src/regexp/jsregexp-inl.h"
21 : #include "src/regexp/regexp-macro-assembler-irregexp.h"
22 : #include "src/regexp/regexp-macro-assembler-tracer.h"
23 : #include "src/regexp/regexp-macro-assembler.h"
24 : #include "src/regexp/regexp-parser.h"
25 : #include "src/regexp/regexp-stack.h"
26 : #include "src/runtime/runtime.h"
27 : #include "src/splay-tree-inl.h"
28 : #include "src/string-search.h"
29 : #include "src/unicode-decoder.h"
30 : #include "src/unicode-inl.h"
31 :
32 : #ifdef V8_INTL_SUPPORT
33 : #include "unicode/uniset.h"
34 : #include "unicode/utypes.h"
35 : #endif // V8_INTL_SUPPORT
36 :
37 : #ifndef V8_INTERPRETED_REGEXP
38 : #if V8_TARGET_ARCH_IA32
39 : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
40 : #elif V8_TARGET_ARCH_X64
41 : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
42 : #elif V8_TARGET_ARCH_ARM64
43 : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
44 : #elif V8_TARGET_ARCH_ARM
45 : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
46 : #elif V8_TARGET_ARCH_PPC
47 : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
48 : #elif V8_TARGET_ARCH_S390
49 : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
50 : #elif V8_TARGET_ARCH_MIPS
51 : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
52 : #elif V8_TARGET_ARCH_MIPS64
53 : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
54 : #else
55 : #error Unsupported target architecture.
56 : #endif
57 : #endif
58 :
59 :
60 : namespace v8 {
61 : namespace internal {
62 :
63 : V8_WARN_UNUSED_RESULT
64 3307 : static inline MaybeHandle<Object> ThrowRegExpException(
65 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
66 : Handle<String> error_text) {
67 3307 : THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
68 : pattern, error_text),
69 : Object);
70 : }
71 :
72 436 : inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
73 : Handle<String> error_text) {
74 872 : USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
75 : error_text));
76 436 : }
77 :
78 :
79 965320 : ContainedInLattice AddRange(ContainedInLattice containment,
80 : const int* ranges,
81 : int ranges_length,
82 : Interval new_range) {
83 : DCHECK_EQ(1, ranges_length & 1);
84 : DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
85 965320 : if (containment == kLatticeUnknown) return containment;
86 : bool inside = false;
87 : int last = 0;
88 4568459 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
89 : // Consider the range from last to ranges[i].
90 : // We haven't got to the new range yet.
91 5423796 : if (ranges[i] <= new_range.from()) continue;
92 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
93 : // inclusive, but the values in ranges are not.
94 855337 : if (last <= new_range.from() && new_range.to() < ranges[i]) {
95 1676116 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
96 : }
97 : return kLatticeUnknown;
98 : }
99 : return containment;
100 : }
101 :
102 : // More makes code generation slower, less makes V8 benchmark score lower.
103 : const int kMaxLookaheadForBoyerMoore = 8;
104 : // In a 3-character pattern you can maximally step forwards 3 characters
105 : // at a time, which is not always enough to pay for the extra logic.
106 : const int kPatternTooShortForBoyerMoore = 2;
107 :
108 : // Identifies the sort of regexps where the regexp engine is faster
109 : // than the code used for atom matches.
110 205476 : static bool HasFewDifferentCharacters(Handle<String> pattern) {
111 : int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112 205476 : if (length <= kPatternTooShortForBoyerMoore) return false;
113 : const int kMod = 128;
114 : bool character_found[kMod];
115 : int different = 0;
116 : memset(&character_found[0], 0, sizeof(character_found));
117 599581 : for (int i = 0; i < length; i++) {
118 1198986 : int ch = (pattern->Get(i) & (kMod - 1));
119 599493 : if (!character_found[ch]) {
120 599066 : character_found[ch] = true;
121 599066 : different++;
122 : // We declare a regexp low-alphabet if it has at least 3 times as many
123 : // characters as it has different characters.
124 599066 : if (different * 3 > length) return false;
125 : }
126 : }
127 : return true;
128 : }
129 :
130 : // Generic RegExp methods. Dispatches to implementation specific methods.
131 :
132 922966 : MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
133 : Handle<String> pattern,
134 : JSRegExp::Flags flags) {
135 : DCHECK(pattern->IsFlat());
136 :
137 461483 : Zone zone(isolate->allocator(), ZONE_NAME);
138 : CompilationCache* compilation_cache = isolate->compilation_cache();
139 : MaybeHandle<FixedArray> maybe_cached =
140 461483 : compilation_cache->LookupRegExp(pattern, flags);
141 : Handle<FixedArray> cached;
142 461483 : if (maybe_cached.ToHandle(&cached)) {
143 335684 : re->set_data(*cached);
144 167842 : return re;
145 : }
146 :
147 : PostponeInterruptsScope postpone(isolate);
148 : RegExpCompileData parse_result;
149 293641 : FlatStringReader reader(isolate, pattern);
150 : DCHECK(!isolate->has_pending_exception());
151 293641 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
152 293641 : &parse_result)) {
153 : // Throw an exception if we fail to parse the pattern.
154 2826 : return ThrowRegExpException(isolate, re, pattern, parse_result.error);
155 : }
156 :
157 : bool has_been_compiled = false;
158 :
159 887588 : if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
160 198739 : !HasFewDifferentCharacters(pattern)) {
161 : // Parse-tree is a single atom that is equal to the pattern.
162 : AtomCompile(isolate, re, pattern, flags, pattern);
163 : has_been_compiled = true;
164 106693 : } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
165 7262 : parse_result.capture_count == 0) {
166 7252 : RegExpAtom* atom = parse_result.tree->AsAtom();
167 7252 : Vector<const uc16> atom_pattern = atom->data();
168 : Handle<String> atom_string;
169 14504 : ASSIGN_RETURN_ON_EXCEPTION(
170 : isolate, atom_string,
171 : isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
172 7252 : if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
173 : AtomCompile(isolate, re, pattern, flags, atom_string);
174 : has_been_compiled = true;
175 : }
176 : }
177 290815 : if (!has_been_compiled) {
178 85427 : IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
179 : }
180 : DCHECK(re->data()->IsFixedArray());
181 : // Compilation succeeded so the data is set on the regexp
182 : // and we can store it in the cache.
183 581630 : Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
184 290815 : compilation_cache->PutRegExp(pattern, flags, data);
185 :
186 752298 : return re;
187 : }
188 :
189 104937 : MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
190 : Handle<String> subject, int index,
191 : Handle<RegExpMatchInfo> last_match_info) {
192 104937 : switch (regexp->TypeTag()) {
193 : case JSRegExp::ATOM:
194 286 : return AtomExec(isolate, regexp, subject, index, last_match_info);
195 : case JSRegExp::IRREGEXP: {
196 104651 : return IrregexpExec(isolate, regexp, subject, index, last_match_info);
197 : }
198 : default:
199 0 : UNREACHABLE();
200 : }
201 : }
202 :
203 :
204 : // RegExp Atom implementation: Simple string search using indexOf.
205 :
206 0 : void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
207 : Handle<String> pattern, JSRegExp::Flags flags,
208 : Handle<String> match_pattern) {
209 : isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
210 205388 : match_pattern);
211 0 : }
212 :
213 273 : static void SetAtomLastCapture(Isolate* isolate,
214 : Handle<RegExpMatchInfo> last_match_info,
215 : String subject, int from, int to) {
216 : SealHandleScope shs(isolate);
217 : last_match_info->SetNumberOfCaptureRegisters(2);
218 546 : last_match_info->SetLastSubject(subject);
219 546 : last_match_info->SetLastInput(subject);
220 273 : last_match_info->SetCapture(0, from);
221 273 : last_match_info->SetCapture(1, to);
222 273 : }
223 :
224 90541 : int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
225 : Handle<String> subject, int index, int32_t* output,
226 : int output_size) {
227 : DCHECK_LE(0, index);
228 : DCHECK_LE(index, subject->length());
229 :
230 90541 : subject = String::Flatten(isolate, subject);
231 : DisallowHeapAllocation no_gc; // ensure vectors stay valid
232 :
233 181082 : String needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
234 : int needle_len = needle->length();
235 : DCHECK(needle->IsFlat());
236 : DCHECK_LT(0, needle_len);
237 :
238 181082 : if (index + needle_len > subject->length()) {
239 : return RegExpImpl::RE_FAILURE;
240 : }
241 :
242 91470 : for (int i = 0; i < output_size; i += 2) {
243 181736 : String::FlatContent needle_content = needle->GetFlatContent(no_gc);
244 181736 : String::FlatContent subject_content = subject->GetFlatContent(no_gc);
245 : DCHECK(needle_content.IsFlat());
246 : DCHECK(subject_content.IsFlat());
247 : // dispatch on type of strings
248 : index =
249 181736 : (needle_content.IsOneByte()
250 : ? (subject_content.IsOneByte()
251 : ? SearchString(isolate, subject_content.ToOneByteVector(),
252 : needle_content.ToOneByteVector(), index)
253 : : SearchString(isolate, subject_content.ToUC16Vector(),
254 : needle_content.ToOneByteVector(), index))
255 : : (subject_content.IsOneByte()
256 : ? SearchString(isolate, subject_content.ToOneByteVector(),
257 : needle_content.ToUC16Vector(), index)
258 : : SearchString(isolate, subject_content.ToUC16Vector(),
259 363472 : needle_content.ToUC16Vector(), index)));
260 181736 : if (index == -1) {
261 90266 : return i / 2; // Return number of matches.
262 : } else {
263 91470 : output[i] = index;
264 91470 : output[i+1] = index + needle_len;
265 : index += needle_len;
266 : }
267 : }
268 273 : return output_size / 2;
269 : }
270 :
271 286 : Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
272 : Handle<String> subject, int index,
273 : Handle<RegExpMatchInfo> last_match_info) {
274 : static const int kNumRegisters = 2;
275 : STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
276 286 : int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
277 :
278 : int res =
279 286 : AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
280 :
281 299 : if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
282 :
283 : DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
284 : SealHandleScope shs(isolate);
285 : SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
286 546 : output_registers[1]);
287 273 : return last_match_info;
288 : }
289 :
290 :
291 : // Irregexp implementation.
292 :
293 : // Ensures that the regexp object contains a compiled version of the
294 : // source for either one-byte or two-byte subject strings.
295 : // If the compiled version doesn't already exist, it is compiled
296 : // from the source pattern.
297 : // If compilation fails, an exception is thrown and this function
298 : // returns false.
299 230147 : bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
300 : Handle<String> sample_subject,
301 : bool is_one_byte) {
302 230147 : Object compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
303 : #ifdef V8_INTERPRETED_REGEXP
304 : if (compiled_code->IsByteArray()) return true;
305 : #else // V8_INTERPRETED_REGEXP (RegExp native code)
306 230147 : if (compiled_code->IsCode()) return true;
307 : #endif
308 85754 : return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
309 : }
310 :
311 85754 : bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
312 : Handle<String> sample_subject,
313 : bool is_one_byte) {
314 : // Compile the RegExp.
315 85754 : Zone zone(isolate->allocator(), ZONE_NAME);
316 : PostponeInterruptsScope postpone(isolate);
317 : #ifdef DEBUG
318 : Object entry = re->DataAt(JSRegExp::code_index(is_one_byte));
319 : // When arriving here entry can only be a smi representing an uncompiled
320 : // regexp.
321 : DCHECK(entry->IsSmi());
322 : int entry_value = Smi::ToInt(entry);
323 : DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
324 : #endif
325 :
326 85754 : JSRegExp::Flags flags = re->GetFlags();
327 :
328 171508 : Handle<String> pattern(re->Pattern(), isolate);
329 85754 : pattern = String::Flatten(isolate, pattern);
330 : RegExpCompileData compile_data;
331 85754 : FlatStringReader reader(isolate, pattern);
332 85754 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
333 85754 : &compile_data)) {
334 : // Throw an exception if we fail to parse the pattern.
335 : // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
336 45 : USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
337 45 : return false;
338 : }
339 : RegExpEngine::CompilationResult result =
340 : RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
341 85709 : sample_subject, is_one_byte);
342 85709 : if (result.error_message != nullptr) {
343 : // Unable to compile regexp.
344 436 : if (FLAG_abort_on_stack_or_string_length_overflow &&
345 0 : strncmp(result.error_message, "Stack overflow", 15) == 0) {
346 0 : FATAL("Aborting on stack overflow");
347 : }
348 : Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
349 872 : CStrVector(result.error_message)).ToHandleChecked();
350 436 : ThrowRegExpException(isolate, re, error_message);
351 : return false;
352 : }
353 :
354 : Handle<FixedArray> data =
355 170546 : Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
356 85273 : data->set(JSRegExp::code_index(is_one_byte), result.code);
357 85273 : SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
358 85273 : int register_max = IrregexpMaxRegisterCount(*data);
359 85273 : if (result.num_registers > register_max) {
360 : SetIrregexpMaxRegisterCount(*data, result.num_registers);
361 : }
362 :
363 85754 : return true;
364 : }
365 :
366 85273 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray re) {
367 : return Smi::cast(
368 85273 : re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
369 : }
370 :
371 0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray re, int value) {
372 : re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
373 0 : }
374 :
375 85273 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray re,
376 : Handle<FixedArray> value) {
377 85273 : if (value.is_null()) {
378 84913 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
379 : } else {
380 360 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
381 : }
382 85273 : }
383 :
384 198663 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray re) {
385 198663 : return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
386 : }
387 :
388 0 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray re) {
389 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
390 : }
391 :
392 0 : ByteArray RegExpImpl::IrregexpByteCode(FixedArray re, bool is_one_byte) {
393 0 : return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
394 : }
395 :
396 118099 : Code RegExpImpl::IrregexpNativeCode(FixedArray re, bool is_one_byte) {
397 118099 : return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
398 : }
399 :
400 0 : void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
401 : Handle<String> pattern,
402 : JSRegExp::Flags flags, int capture_count) {
403 : // Initialize compiled code entries to null.
404 : isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
405 85427 : flags, capture_count);
406 0 : }
407 :
408 112048 : int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
409 : Handle<String> subject) {
410 : DCHECK(subject->IsFlat());
411 :
412 : // Check representation of the underlying storage.
413 112048 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
414 112048 : if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
415 :
416 : #ifdef V8_INTERPRETED_REGEXP
417 : // Byte-code regexp needs space allocated for all its registers.
418 : // The result captures are copied to the start of the registers array
419 : // if the match succeeds. This way those registers are not clobbered
420 : // when we set the last match info from last successful match.
421 : return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
422 : (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
423 : #else // V8_INTERPRETED_REGEXP
424 : // Native regexp only needs room to output captures. Registers are handled
425 : // internally.
426 223134 : return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
427 : #endif // V8_INTERPRETED_REGEXP
428 : }
429 :
430 118099 : int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
431 : Handle<String> subject, int index,
432 : int32_t* output, int output_size) {
433 236198 : Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
434 :
435 : DCHECK_LE(0, index);
436 : DCHECK_LE(index, subject->length());
437 : DCHECK(subject->IsFlat());
438 :
439 118099 : bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
440 :
441 : #ifndef V8_INTERPRETED_REGEXP
442 : DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
443 : do {
444 118099 : EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
445 118099 : Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
446 : // The stack is used to allocate registers for the compiled regexp code.
447 : // This means that in case of failure, the output registers array is left
448 : // untouched and contains the capture results from the previous successful
449 : // match. We can use that to set the last match info lazily.
450 : NativeRegExpMacroAssembler::Result res =
451 : NativeRegExpMacroAssembler::Match(code,
452 : subject,
453 : output,
454 : output_size,
455 : index,
456 118099 : isolate);
457 118099 : if (res != NativeRegExpMacroAssembler::RETRY) {
458 : DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
459 : isolate->has_pending_exception());
460 : STATIC_ASSERT(
461 : static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
462 : STATIC_ASSERT(
463 : static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
464 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
465 : == RE_EXCEPTION);
466 118099 : return static_cast<IrregexpResult>(res);
467 : }
468 : // If result is RETRY, the string has changed representation, and we
469 : // must restart from scratch.
470 : // In this case, it means we must make sure we are prepared to handle
471 : // the, potentially, different subject (the string can switch between
472 : // being internal and external, and even between being Latin1 and UC16,
473 : // but the characters are always the same).
474 0 : IrregexpPrepare(isolate, regexp, subject);
475 0 : is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
476 : } while (true);
477 0 : UNREACHABLE();
478 : #else // V8_INTERPRETED_REGEXP
479 :
480 : DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
481 : // We must have done EnsureCompiledIrregexp, so we can get the number of
482 : // registers.
483 : int number_of_capture_registers =
484 : (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
485 : int32_t* raw_output = &output[number_of_capture_registers];
486 : // We do not touch the actual capture result registers until we know there
487 : // has been a match so that we can use those capture results to set the
488 : // last match info.
489 : for (int i = number_of_capture_registers - 1; i >= 0; i--) {
490 : raw_output[i] = -1;
491 : }
492 : Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
493 : isolate);
494 :
495 : IrregexpResult result = IrregexpInterpreter::Match(isolate,
496 : byte_codes,
497 : subject,
498 : raw_output,
499 : index);
500 : if (result == RE_SUCCESS) {
501 : // Copy capture results to the start of the registers array.
502 : MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
503 : }
504 : if (result == RE_EXCEPTION) {
505 : DCHECK(!isolate->has_pending_exception());
506 : isolate->StackOverflow();
507 : }
508 : return result;
509 : #endif // V8_INTERPRETED_REGEXP
510 : }
511 :
512 104651 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
513 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
514 : int previous_index, Handle<RegExpMatchInfo> last_match_info) {
515 : DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
516 :
517 104651 : subject = String::Flatten(isolate, subject);
518 :
519 : // Prepare space for the return values.
520 : #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
521 : if (FLAG_trace_regexp_bytecodes) {
522 : String pattern = regexp->Pattern();
523 : PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
524 : PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
525 : }
526 : #endif
527 : int required_registers =
528 104651 : RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
529 104651 : if (required_registers < 0) {
530 : // Compiling failed with an exception.
531 : DCHECK(isolate->has_pending_exception());
532 351 : return MaybeHandle<Object>();
533 : }
534 :
535 : int32_t* output_registers = nullptr;
536 104300 : if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
537 2801 : output_registers = NewArray<int32_t>(required_registers);
538 : }
539 : std::unique_ptr<int32_t[]> auto_release(output_registers);
540 104300 : if (output_registers == nullptr) {
541 101499 : output_registers = isolate->jsregexp_static_offsets_vector();
542 : }
543 :
544 : int res =
545 : RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
546 104300 : output_registers, required_registers);
547 104300 : if (res == RE_SUCCESS) {
548 : int capture_count =
549 174192 : IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
550 : return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
551 87096 : output_registers);
552 : }
553 17204 : if (res == RE_EXCEPTION) {
554 : DCHECK(isolate->has_pending_exception());
555 52 : return MaybeHandle<Object>();
556 : }
557 : DCHECK(res == RE_FAILURE);
558 17152 : return isolate->factory()->null_value();
559 : }
560 :
561 181946 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
562 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
563 : Handle<String> subject, int capture_count, int32_t* match) {
564 : // This is the only place where match infos can grow. If, after executing the
565 : // regexp, RegExpExecStub finds that the match info is too small, it restarts
566 : // execution in RegExpImpl::Exec, which finally grows the match info right
567 : // here.
568 :
569 181946 : int capture_register_count = (capture_count + 1) * 2;
570 : Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
571 181946 : isolate, last_match_info, capture_register_count);
572 : result->SetNumberOfCaptureRegisters(capture_register_count);
573 :
574 181946 : if (*result != *last_match_info) {
575 : // The match info has been reallocated, update the corresponding reference
576 : // on the native context.
577 4234 : if (*last_match_info == *isolate->regexp_last_match_info()) {
578 4216 : isolate->native_context()->set_regexp_last_match_info(*result);
579 18 : } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
580 18 : isolate->native_context()->set_regexp_internal_match_info(*result);
581 : }
582 : }
583 :
584 : DisallowHeapAllocation no_allocation;
585 181946 : if (match != nullptr) {
586 1344670 : for (int i = 0; i < capture_register_count; i += 2) {
587 2689340 : result->SetCapture(i, match[i]);
588 2689340 : result->SetCapture(i + 1, match[i + 1]);
589 : }
590 : }
591 363892 : result->SetLastSubject(*subject);
592 363892 : result->SetLastInput(*subject);
593 181946 : return result;
594 : }
595 :
596 95492 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
597 : Handle<String> subject, Isolate* isolate)
598 : : register_array_(nullptr),
599 : register_array_size_(0),
600 : regexp_(regexp),
601 : subject_(subject),
602 95492 : isolate_(isolate) {
603 : #ifdef V8_INTERPRETED_REGEXP
604 : bool interpreted = true;
605 : #else
606 : bool interpreted = false;
607 : #endif // V8_INTERPRETED_REGEXP
608 :
609 95492 : if (regexp_->TypeTag() == JSRegExp::ATOM) {
610 : static const int kAtomRegistersPerMatch = 2;
611 90255 : registers_per_match_ = kAtomRegistersPerMatch;
612 : // There is no distinction between interpreted and native for atom regexps.
613 : interpreted = false;
614 : } else {
615 : registers_per_match_ =
616 5237 : RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
617 5237 : if (registers_per_match_ < 0) {
618 130 : num_matches_ = -1; // Signal exception.
619 95622 : return;
620 : }
621 : }
622 :
623 : DCHECK(IsGlobal(regexp->GetFlags()));
624 : if (!interpreted) {
625 : register_array_size_ =
626 190724 : Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
627 95362 : max_matches_ = register_array_size_ / registers_per_match_;
628 : } else {
629 : // Global loop in interpreted regexp is not implemented. We choose
630 : // the size of the offsets vector so that it can only store one match.
631 : register_array_size_ = registers_per_match_;
632 : max_matches_ = 1;
633 : }
634 :
635 95362 : if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
636 1032 : register_array_ = NewArray<int32_t>(register_array_size_);
637 : } else {
638 94330 : register_array_ = isolate->jsregexp_static_offsets_vector();
639 : }
640 :
641 : // Set state so that fetching the results the first time triggers a call
642 : // to the compiled regexp.
643 95362 : current_match_index_ = max_matches_ - 1;
644 95362 : num_matches_ = max_matches_;
645 : DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
646 : DCHECK_GE(register_array_size_, registers_per_match_);
647 : int32_t* last_match =
648 95362 : ®ister_array_[current_match_index_ * registers_per_match_];
649 95362 : last_match[0] = -1;
650 95362 : last_match[1] = 0;
651 : }
652 :
653 0 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
654 0 : if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
655 0 : unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
656 0 : unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
657 : // Advance over the surrogate pair.
658 0 : return last_index + 2;
659 : }
660 0 : return last_index + 1;
661 : }
662 :
663 : // -------------------------------------------------------------------
664 : // Implementation of the Irregexp regular expression engine.
665 : //
666 : // The Irregexp regular expression engine is intended to be a complete
667 : // implementation of ECMAScript regular expressions. It generates either
668 : // bytecodes or native code.
669 :
670 : // The Irregexp regexp engine is structured in three steps.
671 : // 1) The parser generates an abstract syntax tree. See ast.cc.
672 : // 2) From the AST a node network is created. The nodes are all
673 : // subclasses of RegExpNode. The nodes represent states when
674 : // executing a regular expression. Several optimizations are
675 : // performed on the node network.
676 : // 3) From the nodes we generate either byte codes or native code
677 : // that can actually execute the regular expression (perform
678 : // the search). The code generation step is described in more
679 : // detail below.
680 :
681 : // Code generation.
682 : //
683 : // The nodes are divided into four main categories.
684 : // * Choice nodes
685 : // These represent places where the regular expression can
686 : // match in more than one way. For example on entry to an
687 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
688 : // * Action nodes
689 : // These represent places where some action should be
690 : // performed. Examples include recording the current position
691 : // in the input string to a register (in order to implement
692 : // captures) or other actions on register for example in order
693 : // to implement the counters needed for {} repetitions.
694 : // * Matching nodes
695 : // These attempt to match some element part of the input string.
696 : // Examples of elements include character classes, plain strings
697 : // or back references.
698 : // * End nodes
699 : // These are used to implement the actions required on finding
700 : // a successful match or failing to find a match.
701 : //
702 : // The code generated (whether as byte codes or native code) maintains
703 : // some state as it runs. This consists of the following elements:
704 : //
705 : // * The capture registers. Used for string captures.
706 : // * Other registers. Used for counters etc.
707 : // * The current position.
708 : // * The stack of backtracking information. Used when a matching node
709 : // fails to find a match and needs to try an alternative.
710 : //
711 : // Conceptual regular expression execution model:
712 : //
713 : // There is a simple conceptual model of regular expression execution
714 : // which will be presented first. The actual code generated is a more
715 : // efficient simulation of the simple conceptual model:
716 : //
717 : // * Choice nodes are implemented as follows:
718 : // For each choice except the last {
719 : // push current position
720 : // push backtrack code location
721 : // <generate code to test for choice>
722 : // backtrack code location:
723 : // pop current position
724 : // }
725 : // <generate code to test for last choice>
726 : //
727 : // * Actions nodes are generated as follows
728 : // <push affected registers on backtrack stack>
729 : // <generate code to perform action>
730 : // push backtrack code location
731 : // <generate code to test for following nodes>
732 : // backtrack code location:
733 : // <pop affected registers to restore their state>
734 : // <pop backtrack location from stack and go to it>
735 : //
736 : // * Matching nodes are generated as follows:
737 : // if input string matches at current position
738 : // update current position
739 : // <generate code to test for following nodes>
740 : // else
741 : // <pop backtrack location from stack and go to it>
742 : //
743 : // Thus it can be seen that the current position is saved and restored
744 : // by the choice nodes, whereas the registers are saved and restored by
745 : // by the action nodes that manipulate them.
746 : //
747 : // The other interesting aspect of this model is that nodes are generated
748 : // at the point where they are needed by a recursive call to Emit(). If
749 : // the node has already been code generated then the Emit() call will
750 : // generate a jump to the previously generated code instead. In order to
751 : // limit recursion it is possible for the Emit() function to put the node
752 : // on a work list for later generation and instead generate a jump. The
753 : // destination of the jump is resolved later when the code is generated.
754 : //
755 : // Actual regular expression code generation.
756 : //
757 : // Code generation is actually more complicated than the above. In order
758 : // to improve the efficiency of the generated code some optimizations are
759 : // performed
760 : //
761 : // * Choice nodes have 1-character lookahead.
762 : // A choice node looks at the following character and eliminates some of
763 : // the choices immediately based on that character. This is not yet
764 : // implemented.
765 : // * Simple greedy loops store reduced backtracking information.
766 : // A quantifier like /.*foo/m will greedily match the whole input. It will
767 : // then need to backtrack to a point where it can match "foo". The naive
768 : // implementation of this would push each character position onto the
769 : // backtracking stack, then pop them off one by one. This would use space
770 : // proportional to the length of the input string. However since the "."
771 : // can only match in one way and always has a constant length (in this case
772 : // of 1) it suffices to store the current position on the top of the stack
773 : // once. Matching now becomes merely incrementing the current position and
774 : // backtracking becomes decrementing the current position and checking the
775 : // result against the stored current position. This is faster and saves
776 : // space.
777 : // * The current state is virtualized.
778 : // This is used to defer expensive operations until it is clear that they
779 : // are needed and to generate code for a node more than once, allowing
780 : // specialized an efficient versions of the code to be created. This is
781 : // explained in the section below.
782 : //
783 : // Execution state virtualization.
784 : //
785 : // Instead of emitting code, nodes that manipulate the state can record their
786 : // manipulation in an object called the Trace. The Trace object can record a
787 : // current position offset, an optional backtrack code location on the top of
788 : // the virtualized backtrack stack and some register changes. When a node is
789 : // to be emitted it can flush the Trace or update it. Flushing the Trace
790 : // will emit code to bring the actual state into line with the virtual state.
791 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
792 : // registers). Postponing work can save time when executing the regular
793 : // expression since it may be found that the work never has to be done as a
794 : // failure to match can occur. In addition it is much faster to jump to a
795 : // known backtrack code location than it is to pop an unknown backtrack
796 : // location from the stack and jump there.
797 : //
798 : // The virtual state found in the Trace affects code generation. For example
799 : // the virtual state contains the difference between the actual current
800 : // position and the virtual current position, and matching code needs to use
801 : // this offset to attempt a match in the correct location of the input
802 : // string. Therefore code generated for a non-trivial trace is specialized
803 : // to that trace. The code generator therefore has the ability to generate
804 : // code for each node several times. In order to limit the size of the
805 : // generated code there is an arbitrary limit on how many specialized sets of
806 : // code may be generated for a given node. If the limit is reached, the
807 : // trace is flushed and a generic version of the code for a node is emitted.
808 : // This is subsequently used for that node. The code emitted for non-generic
809 : // trace is not recorded in the node and so it cannot currently be reused in
810 : // the event that code generation is requested for an identical trace.
811 :
812 :
813 0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
814 0 : UNREACHABLE();
815 : }
816 :
817 :
818 86701 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
819 86701 : text->AddElement(TextElement::Atom(this), zone);
820 86701 : }
821 :
822 :
823 7670 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
824 7670 : text->AddElement(TextElement::CharClass(this), zone);
825 7670 : }
826 :
827 :
828 0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
829 0 : for (int i = 0; i < elements()->length(); i++)
830 0 : text->AddElement(elements()->at(i), zone);
831 0 : }
832 :
833 :
834 0 : TextElement TextElement::Atom(RegExpAtom* atom) {
835 0 : return TextElement(ATOM, atom);
836 : }
837 :
838 :
839 0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
840 0 : return TextElement(CHAR_CLASS, char_class);
841 : }
842 :
843 :
844 7414741 : int TextElement::length() const {
845 7414741 : switch (text_type()) {
846 : case ATOM:
847 6579694 : return atom()->length();
848 :
849 : case CHAR_CLASS:
850 : return 1;
851 : }
852 0 : UNREACHABLE();
853 : }
854 :
855 :
856 0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
857 0 : if (table_ == nullptr) {
858 0 : table_ = new(zone()) DispatchTable(zone());
859 : DispatchTableConstructor cons(table_, ignore_case, zone());
860 0 : cons.BuildTable(this);
861 : }
862 0 : return table_;
863 : }
864 :
865 :
866 : class FrequencyCollator {
867 : public:
868 11055945 : FrequencyCollator() : total_samples_(0) {
869 10970240 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
870 10970240 : frequencies_[i] = CharacterFrequency(i);
871 : }
872 : }
873 :
874 : void CountCharacter(int character) {
875 455056 : int index = (character & RegExpMacroAssembler::kTableMask);
876 455056 : frequencies_[index].Increment();
877 455056 : total_samples_++;
878 : }
879 :
880 : // Does not measure in percent, but rather per-128 (the table size from the
881 : // regexp macro assembler).
882 : int Frequency(int in_character) {
883 : DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
884 486436 : if (total_samples_ < 1) return 1; // Division by zero.
885 : int freq_in_per128 =
886 486171 : (frequencies_[in_character].counter() * 128) / total_samples_;
887 : return freq_in_per128;
888 : }
889 :
890 : private:
891 : class CharacterFrequency {
892 : public:
893 10970240 : CharacterFrequency() : counter_(0), character_(-1) { }
894 : explicit CharacterFrequency(int character)
895 : : counter_(0), character_(character) { }
896 :
897 455056 : void Increment() { counter_++; }
898 : int counter() { return counter_; }
899 : int character() { return character_; }
900 :
901 : private:
902 : int counter_;
903 : int character_;
904 : };
905 :
906 :
907 : private:
908 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
909 : int total_samples_;
910 : };
911 :
912 :
913 : class RegExpCompiler {
914 : public:
915 : RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
916 : bool is_one_byte);
917 :
918 : int AllocateRegister() {
919 909470 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
920 310203 : reg_exp_too_big_ = true;
921 : return next_register_;
922 : }
923 599267 : return next_register_++;
924 : }
925 :
926 : // Lookarounds to match lone surrogates for unicode character class matches
927 : // are never nested. We can therefore reuse registers.
928 : int UnicodeLookaroundStackRegister() {
929 2460 : if (unicode_lookaround_stack_register_ == kNoRegister) {
930 1040 : unicode_lookaround_stack_register_ = AllocateRegister();
931 : }
932 2460 : return unicode_lookaround_stack_register_;
933 : }
934 :
935 : int UnicodeLookaroundPositionRegister() {
936 2460 : if (unicode_lookaround_position_register_ == kNoRegister) {
937 1040 : unicode_lookaround_position_register_ = AllocateRegister();
938 : }
939 2460 : return unicode_lookaround_position_register_;
940 : }
941 :
942 : RegExpEngine::CompilationResult Assemble(Isolate* isolate,
943 : RegExpMacroAssembler* assembler,
944 : RegExpNode* start, int capture_count,
945 : Handle<String> pattern);
946 :
947 592598 : inline void AddWork(RegExpNode* node) {
948 592598 : if (!node->on_work_list() && !node->label()->is_bound()) {
949 : node->set_on_work_list(true);
950 211647 : work_list_->push_back(node);
951 : }
952 592598 : }
953 :
954 : static const int kImplementationOffset = 0;
955 : static const int kNumberOfRegistersOffset = 0;
956 : static const int kCodeOffset = 1;
957 :
958 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
959 : EndNode* accept() { return accept_; }
960 :
961 : static const int kMaxRecursion = 100;
962 : inline int recursion_depth() { return recursion_depth_; }
963 997552 : inline void IncrementRecursionDepth() { recursion_depth_++; }
964 997552 : inline void DecrementRecursionDepth() { recursion_depth_--; }
965 :
966 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
967 :
968 : inline bool one_byte() { return one_byte_; }
969 : inline bool optimize() { return optimize_; }
970 84553 : inline void set_optimize(bool value) { optimize_ = value; }
971 : inline bool limiting_recursion() { return limiting_recursion_; }
972 : inline void set_limiting_recursion(bool value) {
973 955120 : limiting_recursion_ = value;
974 : }
975 : bool read_backward() { return read_backward_; }
976 3336 : void set_read_backward(bool value) { read_backward_ = value; }
977 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
978 :
979 : int current_expansion_factor() { return current_expansion_factor_; }
980 : void set_current_expansion_factor(int value) {
981 85643 : current_expansion_factor_ = value;
982 : }
983 :
984 : Isolate* isolate() const { return isolate_; }
985 : Zone* zone() const { return zone_; }
986 :
987 : static const int kNoRegister = -1;
988 :
989 : private:
990 : EndNode* accept_;
991 : int next_register_;
992 : int unicode_lookaround_stack_register_;
993 : int unicode_lookaround_position_register_;
994 : std::vector<RegExpNode*>* work_list_;
995 : int recursion_depth_;
996 : RegExpMacroAssembler* macro_assembler_;
997 : bool one_byte_;
998 : bool reg_exp_too_big_;
999 : bool limiting_recursion_;
1000 : bool optimize_;
1001 : bool read_backward_;
1002 : int current_expansion_factor_;
1003 : FrequencyCollator frequency_collator_;
1004 : Isolate* isolate_;
1005 : Zone* zone_;
1006 : };
1007 :
1008 :
1009 : class RecursionCheck {
1010 : public:
1011 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1012 : compiler->IncrementRecursionDepth();
1013 : }
1014 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1015 : private:
1016 : RegExpCompiler* compiler_;
1017 : };
1018 :
1019 :
1020 : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1021 9 : return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1022 : }
1023 :
1024 :
1025 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1026 : // a fixed array or a null handle depending on whether it succeeded.
1027 85705 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1028 : bool one_byte)
1029 85705 : : next_register_(2 * (capture_count + 1)),
1030 : unicode_lookaround_stack_register_(kNoRegister),
1031 : unicode_lookaround_position_register_(kNoRegister),
1032 : work_list_(nullptr),
1033 : recursion_depth_(0),
1034 : one_byte_(one_byte),
1035 : reg_exp_too_big_(false),
1036 : limiting_recursion_(false),
1037 : optimize_(FLAG_regexp_optimization),
1038 : read_backward_(false),
1039 : current_expansion_factor_(1),
1040 : frequency_collator_(),
1041 : isolate_(isolate),
1042 171410 : zone_(zone) {
1043 85705 : accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1044 : DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1045 85705 : }
1046 :
1047 85278 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1048 : Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1049 : int capture_count, Handle<String> pattern) {
1050 : #ifdef DEBUG
1051 : if (FLAG_trace_regexp_assembler)
1052 : macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1053 : else
1054 : #endif
1055 85278 : macro_assembler_ = macro_assembler;
1056 :
1057 : std::vector<RegExpNode*> work_list;
1058 85278 : work_list_ = &work_list;
1059 85278 : Label fail;
1060 85278 : macro_assembler_->PushBacktrack(&fail);
1061 85278 : Trace new_trace;
1062 85278 : start->Emit(this, &new_trace);
1063 85278 : macro_assembler_->Bind(&fail);
1064 85278 : macro_assembler_->Fail();
1065 382203 : while (!work_list.empty()) {
1066 211647 : RegExpNode* node = work_list.back();
1067 : work_list.pop_back();
1068 : node->set_on_work_list(false);
1069 211647 : if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1070 : }
1071 85278 : if (reg_exp_too_big_) {
1072 0 : macro_assembler_->AbortedCodeGeneration();
1073 0 : return IrregexpRegExpTooBig(isolate_);
1074 : }
1075 :
1076 85278 : Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1077 170556 : isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1078 85278 : work_list_ = nullptr;
1079 : #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP)
1080 : if (FLAG_print_code) {
1081 : CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1082 : OFStream os(trace_scope.file());
1083 : Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1084 : }
1085 : #endif
1086 : #ifdef DEBUG
1087 : if (FLAG_trace_regexp_assembler) {
1088 : delete macro_assembler_;
1089 : }
1090 : #endif
1091 85278 : return RegExpEngine::CompilationResult(*code, next_register_);
1092 : }
1093 :
1094 :
1095 4879067 : bool Trace::DeferredAction::Mentions(int that) {
1096 2463192 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
1097 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1098 : return range.Contains(that);
1099 : } else {
1100 2415875 : return reg() == that;
1101 : }
1102 : }
1103 :
1104 :
1105 0 : bool Trace::mentions_reg(int reg) {
1106 0 : for (DeferredAction* action = actions_; action != nullptr;
1107 : action = action->next()) {
1108 0 : if (action->Mentions(reg))
1109 : return true;
1110 : }
1111 : return false;
1112 : }
1113 :
1114 :
1115 913 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1116 : DCHECK_EQ(0, *cp_offset);
1117 1836 : for (DeferredAction* action = actions_; action != nullptr;
1118 : action = action->next()) {
1119 923 : if (action->Mentions(reg)) {
1120 410 : if (action->action_type() == ActionNode::STORE_POSITION) {
1121 410 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1122 410 : return true;
1123 : } else {
1124 : return false;
1125 : }
1126 : }
1127 : }
1128 : return false;
1129 : }
1130 :
1131 :
1132 510256 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
1133 : Zone* zone) {
1134 : int max_register = RegExpCompiler::kNoRegister;
1135 1760075 : for (DeferredAction* action = actions_; action != nullptr;
1136 : action = action->next()) {
1137 418547 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1138 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1139 47873 : for (int i = range.from(); i <= range.to(); i++)
1140 44962 : affected_registers->Set(i, zone);
1141 2911 : if (range.to() > max_register) max_register = range.to();
1142 : } else {
1143 415636 : affected_registers->Set(action->reg(), zone);
1144 415636 : if (action->reg() > max_register) max_register = action->reg();
1145 : }
1146 : }
1147 510256 : return max_register;
1148 : }
1149 :
1150 :
1151 510256 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1152 : int max_register,
1153 : const OutSet& registers_to_pop,
1154 : const OutSet& registers_to_clear) {
1155 10466542 : for (int reg = max_register; reg >= 0; reg--) {
1156 9956286 : if (registers_to_pop.Get(reg)) {
1157 52744 : assembler->PopRegister(reg);
1158 9903542 : } else if (registers_to_clear.Get(reg)) {
1159 : int clear_to = reg;
1160 183545 : while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1161 105690 : reg--;
1162 : }
1163 77855 : assembler->ClearRegisters(reg, clear_to);
1164 : }
1165 : }
1166 510256 : }
1167 :
1168 :
1169 510256 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1170 : int max_register,
1171 : const OutSet& affected_registers,
1172 : OutSet* registers_to_pop,
1173 : OutSet* registers_to_clear,
1174 : Zone* zone) {
1175 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1176 510256 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1177 :
1178 : // Count pushes performed to force a stack limit check occasionally.
1179 : int pushes = 0;
1180 :
1181 10572232 : for (int reg = 0; reg <= max_register; reg++) {
1182 10061976 : if (!affected_registers.Get(reg)) {
1183 : continue;
1184 : }
1185 :
1186 : // The chronologically first deferred action in the trace
1187 : // is used to infer the action needed to restore a register
1188 : // to its previous state (or not, if it's safe to ignore it).
1189 : enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1190 : DeferredActionUndoType undo_action = IGNORE;
1191 :
1192 : int value = 0;
1193 : bool absolute = false;
1194 : bool clear = false;
1195 : static const int kNoStore = kMinInt;
1196 : int store_position = kNoStore;
1197 : // This is a little tricky because we are scanning the actions in reverse
1198 : // historical order (newest first).
1199 2916378 : for (DeferredAction* action = actions_; action != nullptr;
1200 : action = action->next()) {
1201 2462269 : if (action->Mentions(reg)) {
1202 460598 : switch (action->action_type()) {
1203 : case ActionNode::SET_REGISTER: {
1204 3465 : Trace::DeferredSetRegister* psr =
1205 : static_cast<Trace::DeferredSetRegister*>(action);
1206 3465 : if (!absolute) {
1207 3465 : value += psr->value();
1208 : absolute = true;
1209 : }
1210 : // SET_REGISTER is currently only used for newly introduced loop
1211 : // counters. They can have a significant previous value if they
1212 : // occur in a loop. TODO(lrn): Propagate this information, so
1213 : // we can set undo_action to IGNORE if we know there is no value to
1214 : // restore.
1215 : undo_action = RESTORE;
1216 : DCHECK_EQ(store_position, kNoStore);
1217 : DCHECK(!clear);
1218 : break;
1219 : }
1220 : case ActionNode::INCREMENT_REGISTER:
1221 3744 : if (!absolute) {
1222 3744 : value++;
1223 : }
1224 : DCHECK_EQ(store_position, kNoStore);
1225 : DCHECK(!clear);
1226 : undo_action = RESTORE;
1227 : break;
1228 : case ActionNode::STORE_POSITION: {
1229 599034 : Trace::DeferredCapture* pc =
1230 : static_cast<Trace::DeferredCapture*>(action);
1231 408427 : if (!clear && store_position == kNoStore) {
1232 : store_position = pc->cp_offset();
1233 : }
1234 :
1235 : // For captures we know that stores and clears alternate.
1236 : // Other register, are never cleared, and if the occur
1237 : // inside a loop, they might be assigned more than once.
1238 408427 : if (reg <= 1) {
1239 : // Registers zero and one, aka "capture zero", is
1240 : // always set correctly if we succeed. There is no
1241 : // need to undo a setting on backtrack, because we
1242 : // will set it again or fail.
1243 : undo_action = IGNORE;
1244 : } else {
1245 190607 : undo_action = pc->is_capture() ? CLEAR : RESTORE;
1246 : }
1247 : DCHECK(!absolute);
1248 : DCHECK_EQ(value, 0);
1249 : break;
1250 : }
1251 : case ActionNode::CLEAR_CAPTURES: {
1252 : // Since we're scanning in reverse order, if we've already
1253 : // set the position we have to ignore historically earlier
1254 : // clearing operations.
1255 44962 : if (store_position == kNoStore) {
1256 : clear = true;
1257 : }
1258 : undo_action = RESTORE;
1259 : DCHECK(!absolute);
1260 : DCHECK_EQ(value, 0);
1261 : break;
1262 : }
1263 : default:
1264 0 : UNREACHABLE();
1265 : break;
1266 : }
1267 : }
1268 : }
1269 : // Prepare for the undo-action (e.g., push if it's going to be popped).
1270 454109 : if (undo_action == RESTORE) {
1271 52744 : pushes++;
1272 : RegExpMacroAssembler::StackCheckFlag stack_check =
1273 : RegExpMacroAssembler::kNoStackLimitCheck;
1274 52744 : if (pushes == push_limit) {
1275 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
1276 : pushes = 0;
1277 : }
1278 :
1279 52744 : assembler->PushRegister(reg, stack_check);
1280 52744 : registers_to_pop->Set(reg, zone);
1281 401365 : } else if (undo_action == CLEAR) {
1282 183545 : registers_to_clear->Set(reg, zone);
1283 : }
1284 : // Perform the chronologically last action (or accumulated increment)
1285 : // for the register.
1286 454109 : if (store_position != kNoStore) {
1287 408427 : assembler->WriteCurrentPositionToRegister(reg, store_position);
1288 45682 : } else if (clear) {
1289 38473 : assembler->ClearRegisters(reg, reg);
1290 7209 : } else if (absolute) {
1291 3465 : assembler->SetRegister(reg, value);
1292 3744 : } else if (value != 0) {
1293 3744 : assembler->AdvanceRegister(reg, value);
1294 : }
1295 : }
1296 510256 : }
1297 :
1298 :
1299 : // This is called as we come into a loop choice node and some other tricky
1300 : // nodes. It normalizes the state of the code generator to ensure we can
1301 : // generate generic code.
1302 3598643 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1303 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1304 :
1305 : DCHECK(!is_trivial());
1306 :
1307 1158301 : if (actions_ == nullptr && backtrack() == nullptr) {
1308 : // Here we just have some deferred cp advances to fix and we are back to
1309 : // a normal situation. We may also have to forget some information gained
1310 : // through a quick check that was already performed.
1311 189766 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1312 : // Create a new trivial state and generate the node with that.
1313 189766 : Trace new_state;
1314 189766 : successor->Emit(compiler, &new_state);
1315 700022 : return;
1316 : }
1317 :
1318 : // Generate deferred actions here along with code to undo them again.
1319 : OutSet affected_registers;
1320 :
1321 510256 : if (backtrack() != nullptr) {
1322 : // Here we have a concrete backtrack location. These are set up by choice
1323 : // nodes and so they indicate that we have a deferred save of the current
1324 : // position which we may need to emit here.
1325 399318 : assembler->PushCurrentPosition();
1326 : }
1327 :
1328 : int max_register = FindAffectedRegisters(&affected_registers,
1329 510256 : compiler->zone());
1330 : OutSet registers_to_pop;
1331 : OutSet registers_to_clear;
1332 : PerformDeferredActions(assembler,
1333 : max_register,
1334 : affected_registers,
1335 : ®isters_to_pop,
1336 : ®isters_to_clear,
1337 510256 : compiler->zone());
1338 510256 : if (cp_offset_ != 0) {
1339 293401 : assembler->AdvanceCurrentPosition(cp_offset_);
1340 : }
1341 :
1342 : // Create a new trivial state and generate the node with that.
1343 510256 : Label undo;
1344 510256 : assembler->PushBacktrack(&undo);
1345 510256 : if (successor->KeepRecursing(compiler)) {
1346 137780 : Trace new_state;
1347 137780 : successor->Emit(compiler, &new_state);
1348 : } else {
1349 372476 : compiler->AddWork(successor);
1350 372476 : assembler->GoTo(successor->label());
1351 : }
1352 :
1353 : // On backtrack we need to restore state.
1354 510256 : assembler->Bind(&undo);
1355 : RestoreAffectedRegisters(assembler,
1356 : max_register,
1357 : registers_to_pop,
1358 510256 : registers_to_clear);
1359 510256 : if (backtrack() == nullptr) {
1360 110938 : assembler->Backtrack();
1361 : } else {
1362 399318 : assembler->PopCurrentPosition();
1363 798636 : assembler->GoTo(backtrack());
1364 : }
1365 : }
1366 :
1367 :
1368 2843 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1369 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1370 :
1371 : // Omit flushing the trace. We discard the entire stack frame anyway.
1372 :
1373 2843 : if (!label()->is_bound()) {
1374 : // We are completely independent of the trace, since we ignore it,
1375 : // so this code can be used as the generic version.
1376 2802 : assembler->Bind(label());
1377 : }
1378 :
1379 : // Throw away everything on the backtrack stack since the start
1380 : // of the negative submatch and restore the character position.
1381 2843 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
1382 2843 : assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1383 2843 : if (clear_capture_count_ > 0) {
1384 : // Clear any captures that might have been performed during the success
1385 : // of the body of the negative look-ahead.
1386 107 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1387 107 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1388 : }
1389 : // Now that we have unwound the stack we find at the top of the stack the
1390 : // backtrack that the BeginSubmatch node got.
1391 2843 : assembler->Backtrack();
1392 2843 : }
1393 :
1394 :
1395 274236 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1396 182529 : if (!trace->is_trivial()) {
1397 91117 : trace->Flush(compiler, this);
1398 91117 : return;
1399 : }
1400 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1401 91412 : if (!label()->is_bound()) {
1402 85267 : assembler->Bind(label());
1403 : }
1404 91412 : switch (action_) {
1405 : case ACCEPT:
1406 91117 : assembler->Succeed();
1407 91117 : return;
1408 : case BACKTRACK:
1409 590 : assembler->GoTo(trace->backtrack());
1410 295 : return;
1411 : case NEGATIVE_SUBMATCH_SUCCESS:
1412 : // This case is handled in a different virtual method.
1413 0 : UNREACHABLE();
1414 : }
1415 0 : UNIMPLEMENTED();
1416 : }
1417 :
1418 :
1419 903937 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1420 1807874 : if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1421 903937 : guards_->Add(guard, zone);
1422 903937 : }
1423 :
1424 :
1425 903359 : ActionNode* ActionNode::SetRegister(int reg,
1426 : int val,
1427 903359 : RegExpNode* on_success) {
1428 : ActionNode* result =
1429 : new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1430 903359 : result->data_.u_store_register.reg = reg;
1431 903359 : result->data_.u_store_register.value = val;
1432 903359 : return result;
1433 : }
1434 :
1435 :
1436 903359 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1437 : ActionNode* result =
1438 : new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1439 903359 : result->data_.u_increment_register.reg = reg;
1440 903359 : return result;
1441 : }
1442 :
1443 :
1444 226157 : ActionNode* ActionNode::StorePosition(int reg,
1445 : bool is_capture,
1446 226157 : RegExpNode* on_success) {
1447 : ActionNode* result =
1448 : new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1449 226157 : result->data_.u_position_register.reg = reg;
1450 226157 : result->data_.u_position_register.is_capture = is_capture;
1451 226157 : return result;
1452 : }
1453 :
1454 :
1455 2336 : ActionNode* ActionNode::ClearCaptures(Interval range,
1456 2336 : RegExpNode* on_success) {
1457 : ActionNode* result =
1458 : new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1459 2336 : result->data_.u_clear_captures.range_from = range.from();
1460 2336 : result->data_.u_clear_captures.range_to = range.to();
1461 2336 : return result;
1462 : }
1463 :
1464 :
1465 4462 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1466 : int position_reg,
1467 4462 : RegExpNode* on_success) {
1468 : ActionNode* result =
1469 : new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1470 4462 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1471 4462 : result->data_.u_submatch.current_position_register = position_reg;
1472 4462 : return result;
1473 : }
1474 :
1475 :
1476 1650 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1477 : int position_reg,
1478 : int clear_register_count,
1479 : int clear_register_from,
1480 1650 : RegExpNode* on_success) {
1481 : ActionNode* result =
1482 : new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1483 1650 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1484 1650 : result->data_.u_submatch.current_position_register = position_reg;
1485 1650 : result->data_.u_submatch.clear_register_count = clear_register_count;
1486 1650 : result->data_.u_submatch.clear_register_from = clear_register_from;
1487 1650 : return result;
1488 : }
1489 :
1490 :
1491 507 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1492 : int repetition_register,
1493 : int repetition_limit,
1494 507 : RegExpNode* on_success) {
1495 : ActionNode* result =
1496 : new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1497 507 : result->data_.u_empty_match_check.start_register = start_register;
1498 507 : result->data_.u_empty_match_check.repetition_register = repetition_register;
1499 507 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1500 507 : return result;
1501 : }
1502 :
1503 :
1504 : #define DEFINE_ACCEPT(Type) \
1505 : void Type##Node::Accept(NodeVisitor* visitor) { \
1506 : visitor->Visit##Type(this); \
1507 : }
1508 717159 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1509 : #undef DEFINE_ACCEPT
1510 :
1511 :
1512 138972 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1513 138972 : visitor->VisitLoopChoice(this);
1514 138972 : }
1515 :
1516 :
1517 : // -------------------------------------------------------------------
1518 : // Emit code.
1519 :
1520 :
1521 3937 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1522 7874 : Guard* guard,
1523 3937 : Trace* trace) {
1524 3937 : switch (guard->op()) {
1525 : case Guard::LT:
1526 : DCHECK(!trace->mentions_reg(guard->reg()));
1527 : macro_assembler->IfRegisterGE(guard->reg(),
1528 : guard->value(),
1529 5208 : trace->backtrack());
1530 2604 : break;
1531 : case Guard::GEQ:
1532 : DCHECK(!trace->mentions_reg(guard->reg()));
1533 : macro_assembler->IfRegisterLT(guard->reg(),
1534 : guard->value(),
1535 2666 : trace->backtrack());
1536 1333 : break;
1537 : }
1538 3937 : }
1539 :
1540 :
1541 : // Returns the number of characters in the equivalence class, omitting those
1542 : // that cannot occur in the source string because it is Latin1.
1543 21888 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1544 : bool one_byte_subject,
1545 : unibrow::uchar* letters) {
1546 : int length =
1547 21888 : isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1548 : // Unibrow returns 0 or 1 for characters where case independence is
1549 : // trivial.
1550 21888 : if (length == 0) {
1551 2765 : letters[0] = character;
1552 : length = 1;
1553 : }
1554 :
1555 21888 : if (one_byte_subject) {
1556 : int new_length = 0;
1557 31762 : for (int i = 0; i < length; i++) {
1558 31762 : if (letters[i] <= String::kMaxOneByteCharCode) {
1559 31352 : letters[new_length++] = letters[i];
1560 : }
1561 : }
1562 : length = new_length;
1563 : }
1564 :
1565 21888 : return length;
1566 : }
1567 :
1568 :
1569 549668 : static inline bool EmitSimpleCharacter(Isolate* isolate,
1570 549668 : RegExpCompiler* compiler,
1571 : uc16 c,
1572 : Label* on_failure,
1573 : int cp_offset,
1574 : bool check,
1575 : bool preloaded) {
1576 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1577 : bool bound_checked = false;
1578 549668 : if (!preloaded) {
1579 : assembler->LoadCurrentCharacter(
1580 : cp_offset,
1581 : on_failure,
1582 549668 : check);
1583 : bound_checked = true;
1584 : }
1585 549668 : assembler->CheckNotCharacter(c, on_failure);
1586 549668 : return bound_checked;
1587 : }
1588 :
1589 :
1590 : // Only emits non-letters (things that don't have case). Only used for case
1591 : // independent matches.
1592 5344 : static inline bool EmitAtomNonLetter(Isolate* isolate,
1593 5344 : RegExpCompiler* compiler,
1594 : uc16 c,
1595 : Label* on_failure,
1596 : int cp_offset,
1597 : bool check,
1598 : bool preloaded) {
1599 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1600 : bool one_byte = compiler->one_byte();
1601 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1602 5344 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1603 5344 : if (length < 1) {
1604 : // This can't match. Must be an one-byte subject and a non-one-byte
1605 : // character. We do not need to do anything since the one-byte pass
1606 : // already handled this.
1607 : return false; // Bounds not checked.
1608 : }
1609 : bool checked = false;
1610 : // We handle the length > 1 case in a later pass.
1611 5339 : if (length == 1) {
1612 356 : if (one_byte && c > String::kMaxOneByteCharCodeU) {
1613 : // Can't match - see above.
1614 : return false; // Bounds not checked.
1615 : }
1616 356 : if (!preloaded) {
1617 356 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1618 : checked = check;
1619 : }
1620 356 : macro_assembler->CheckNotCharacter(c, on_failure);
1621 : }
1622 5339 : return checked;
1623 : }
1624 :
1625 :
1626 4641 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1627 : bool one_byte, uc16 c1, uc16 c2,
1628 : Label* on_failure) {
1629 : uc16 char_mask;
1630 4641 : if (one_byte) {
1631 : char_mask = String::kMaxOneByteCharCode;
1632 : } else {
1633 : char_mask = String::kMaxUtf16CodeUnit;
1634 : }
1635 4641 : uc16 exor = c1 ^ c2;
1636 : // Check whether exor has only one bit set.
1637 4641 : if (((exor - 1) & exor) == 0) {
1638 : // If c1 and c2 differ only by one bit.
1639 : // Ecma262UnCanonicalize always gives the highest number last.
1640 : DCHECK(c2 > c1);
1641 4546 : uc16 mask = char_mask ^ exor;
1642 4546 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1643 4546 : return true;
1644 : }
1645 : DCHECK(c2 > c1);
1646 95 : uc16 diff = c2 - c1;
1647 95 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
1648 : // If the characters differ by 2^n but don't differ by one bit then
1649 : // subtract the difference from the found character, then do the or
1650 : // trick. We avoid the theoretical case where negative numbers are
1651 : // involved in order to simplify code generation.
1652 85 : uc16 mask = char_mask ^ diff;
1653 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1654 : diff,
1655 : mask,
1656 85 : on_failure);
1657 85 : return true;
1658 : }
1659 : return false;
1660 : }
1661 :
1662 :
1663 : typedef bool EmitCharacterFunction(Isolate* isolate,
1664 : RegExpCompiler* compiler,
1665 : uc16 c,
1666 : Label* on_failure,
1667 : int cp_offset,
1668 : bool check,
1669 : bool preloaded);
1670 :
1671 : // Only emits letters (things that have case). Only used for case independent
1672 : // matches.
1673 5344 : static inline bool EmitAtomLetter(Isolate* isolate,
1674 5344 : RegExpCompiler* compiler,
1675 : uc16 c,
1676 : Label* on_failure,
1677 : int cp_offset,
1678 : bool check,
1679 : bool preloaded) {
1680 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1681 : bool one_byte = compiler->one_byte();
1682 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1683 5344 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1684 5344 : if (length <= 1) return false;
1685 : // We may not need to check against the end of the input string
1686 : // if this character lies before a character that matched.
1687 4983 : if (!preloaded) {
1688 4651 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1689 : }
1690 4983 : Label ok;
1691 : DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1692 4983 : switch (length) {
1693 : case 2: {
1694 9282 : if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1695 9282 : chars[1], on_failure)) {
1696 : } else {
1697 10 : macro_assembler->CheckCharacter(chars[0], &ok);
1698 10 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
1699 10 : macro_assembler->Bind(&ok);
1700 : }
1701 : break;
1702 : }
1703 : case 4:
1704 25 : macro_assembler->CheckCharacter(chars[3], &ok);
1705 : V8_FALLTHROUGH;
1706 : case 3:
1707 342 : macro_assembler->CheckCharacter(chars[0], &ok);
1708 342 : macro_assembler->CheckCharacter(chars[1], &ok);
1709 342 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
1710 342 : macro_assembler->Bind(&ok);
1711 342 : break;
1712 : default:
1713 0 : UNREACHABLE();
1714 : break;
1715 : }
1716 : return true;
1717 : }
1718 :
1719 :
1720 8306 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1721 : int border,
1722 : Label* fall_through,
1723 : Label* above_or_equal,
1724 : Label* below) {
1725 8306 : if (below != fall_through) {
1726 7942 : masm->CheckCharacterLT(border, below);
1727 7942 : if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1728 : } else {
1729 364 : masm->CheckCharacterGT(border - 1, above_or_equal);
1730 : }
1731 8306 : }
1732 :
1733 :
1734 155280 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1735 : int first,
1736 : int last,
1737 : Label* fall_through,
1738 : Label* in_range,
1739 : Label* out_of_range) {
1740 155280 : if (in_range == fall_through) {
1741 107327 : if (first == last) {
1742 14477 : masm->CheckNotCharacter(first, out_of_range);
1743 : } else {
1744 92850 : masm->CheckCharacterNotInRange(first, last, out_of_range);
1745 : }
1746 : } else {
1747 47953 : if (first == last) {
1748 25344 : masm->CheckCharacter(first, in_range);
1749 : } else {
1750 22609 : masm->CheckCharacterInRange(first, last, in_range);
1751 : }
1752 47953 : if (out_of_range != fall_through) masm->GoTo(out_of_range);
1753 : }
1754 155280 : }
1755 :
1756 :
1757 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1758 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1759 5861 : static void EmitUseLookupTable(
1760 5861 : RegExpMacroAssembler* masm,
1761 : ZoneList<int>* ranges,
1762 : int start_index,
1763 : int end_index,
1764 : int min_char,
1765 : Label* fall_through,
1766 : Label* even_label,
1767 : Label* odd_label) {
1768 : static const int kSize = RegExpMacroAssembler::kTableSize;
1769 : static const int kMask = RegExpMacroAssembler::kTableMask;
1770 :
1771 : int base = (min_char & ~kMask);
1772 : USE(base);
1773 :
1774 : // Assert that everything is on one kTableSize page.
1775 : for (int i = start_index; i <= end_index; i++) {
1776 : DCHECK_EQ(ranges->at(i) & ~kMask, base);
1777 : }
1778 : DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1779 :
1780 : char templ[kSize];
1781 : Label* on_bit_set;
1782 : Label* on_bit_clear;
1783 : int bit;
1784 5861 : if (even_label == fall_through) {
1785 : on_bit_set = odd_label;
1786 : on_bit_clear = even_label;
1787 : bit = 1;
1788 : } else {
1789 : on_bit_set = even_label;
1790 : on_bit_clear = odd_label;
1791 : bit = 0;
1792 : }
1793 252413 : for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1794 123276 : templ[i] = bit;
1795 : }
1796 : int j = 0;
1797 5861 : bit ^= 1;
1798 95816 : for (int i = start_index; i < end_index; i++) {
1799 1204670 : for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1800 512380 : templ[j] = bit;
1801 : }
1802 89955 : bit ^= 1;
1803 : }
1804 114552 : for (int i = j; i < kSize; i++) {
1805 114552 : templ[i] = bit;
1806 : }
1807 : Factory* factory = masm->isolate()->factory();
1808 : // TODO(erikcorry): Cache these.
1809 5861 : Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1810 750208 : for (int i = 0; i < kSize; i++) {
1811 750208 : ba->set(i, templ[i]);
1812 : }
1813 5861 : masm->CheckBitInTable(ba, on_bit_set);
1814 5861 : if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1815 5861 : }
1816 :
1817 :
1818 33358 : static void CutOutRange(RegExpMacroAssembler* masm,
1819 : ZoneList<int>* ranges,
1820 : int start_index,
1821 : int end_index,
1822 : int cut_index,
1823 : Label* even_label,
1824 : Label* odd_label) {
1825 33358 : bool odd = (((cut_index - start_index) & 1) == 1);
1826 33358 : Label* in_range_label = odd ? odd_label : even_label;
1827 33358 : Label dummy;
1828 : EmitDoubleBoundaryTest(masm,
1829 : ranges->at(cut_index),
1830 33358 : ranges->at(cut_index + 1) - 1,
1831 : &dummy,
1832 : in_range_label,
1833 66716 : &dummy);
1834 : DCHECK(!dummy.is_linked());
1835 : // Cut out the single range by rewriting the array. This creates a new
1836 : // range that is a merger of the two ranges on either side of the one we
1837 : // are cutting out. The oddity of the labels is preserved.
1838 84902 : for (int j = cut_index; j > start_index; j--) {
1839 36372 : ranges->at(j) = ranges->at(j - 1);
1840 : }
1841 101600 : for (int j = cut_index + 1; j < end_index; j++) {
1842 136484 : ranges->at(j) = ranges->at(j + 1);
1843 : }
1844 33358 : }
1845 :
1846 :
1847 : // Unicode case. Split the search space into kSize spaces that are handled
1848 : // with recursion.
1849 19466 : static void SplitSearchSpace(ZoneList<int>* ranges,
1850 : int start_index,
1851 : int end_index,
1852 : int* new_start_index,
1853 : int* new_end_index,
1854 : int* border) {
1855 : static const int kSize = RegExpMacroAssembler::kTableSize;
1856 : static const int kMask = RegExpMacroAssembler::kTableMask;
1857 :
1858 19466 : int first = ranges->at(start_index);
1859 19466 : int last = ranges->at(end_index) - 1;
1860 :
1861 19466 : *new_start_index = start_index;
1862 19466 : *border = (ranges->at(start_index) & ~kMask) + kSize;
1863 166145 : while (*new_start_index < end_index) {
1864 145753 : if (ranges->at(*new_start_index) > *border) break;
1865 127213 : (*new_start_index)++;
1866 : }
1867 : // new_start_index is the index of the first edge that is beyond the
1868 : // current kSize space.
1869 :
1870 : // For very large search spaces we do a binary chop search of the non-Latin1
1871 : // space instead of just going to the end of the current kSize space. The
1872 : // heuristics are complicated a little by the fact that any 128-character
1873 : // encoding space can be quickly tested with a table lookup, so we don't
1874 : // wish to do binary chop search at a smaller granularity than that. A
1875 : // 128-character space can take up a lot of space in the ranges array if,
1876 : // for example, we only want to match every second character (eg. the lower
1877 : // case characters on some Unicode pages).
1878 19466 : int binary_chop_index = (end_index + start_index) / 2;
1879 : // The first test ensures that we get to the code that handles the Latin1
1880 : // range with a single not-taken branch, speeding up this important
1881 : // character range (even non-Latin1 charset-based text has spaces and
1882 : // punctuation).
1883 53824 : if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1884 27629 : end_index - start_index > (*new_start_index - start_index) * 2 &&
1885 55706 : last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1886 23248 : ranges->at(binary_chop_index) >= first + 2 * kSize) {
1887 : int scan_forward_for_section_border = binary_chop_index;;
1888 9782 : int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1889 :
1890 76686 : while (scan_forward_for_section_border < end_index) {
1891 65012 : if (ranges->at(scan_forward_for_section_border) > new_border) {
1892 7890 : *new_start_index = scan_forward_for_section_border;
1893 7890 : *border = new_border;
1894 7890 : break;
1895 : }
1896 57122 : scan_forward_for_section_border++;
1897 : }
1898 : }
1899 :
1900 : DCHECK(*new_start_index > start_index);
1901 19466 : *new_end_index = *new_start_index - 1;
1902 19466 : if (ranges->at(*new_end_index) == *border) {
1903 2958 : (*new_end_index)--;
1904 : }
1905 38932 : if (*border >= ranges->at(end_index)) {
1906 924 : *border = ranges->at(end_index);
1907 924 : *new_start_index = end_index; // Won't be used.
1908 924 : *new_end_index = end_index - 1;
1909 : }
1910 19466 : }
1911 :
1912 : // Gets a series of segment boundaries representing a character class. If the
1913 : // character is in the range between an even and an odd boundary (counting from
1914 : // start_index) then go to even_label, otherwise go to odd_label. We already
1915 : // know that the character is in the range of min_char to max_char inclusive.
1916 : // Either label can be nullptr indicating backtracking. Either label can also
1917 : // be equal to the fall_through label.
1918 196929 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1919 : int start_index, int end_index, uc32 min_char,
1920 : uc32 max_char, Label* fall_through,
1921 : Label* even_label, Label* odd_label) {
1922 : DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1923 : DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1924 :
1925 196929 : int first = ranges->at(start_index);
1926 196929 : int last = ranges->at(end_index) - 1;
1927 :
1928 : DCHECK_LT(min_char, first);
1929 :
1930 : // Just need to test if the character is before or on-or-after
1931 : // a particular character.
1932 196929 : if (start_index == end_index) {
1933 8306 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1934 8306 : return;
1935 : }
1936 :
1937 : // Another almost trivial case: There is one interval in the middle that is
1938 : // different from the end intervals.
1939 188623 : if (start_index + 1 == end_index) {
1940 : EmitDoubleBoundaryTest(
1941 121922 : masm, first, last, fall_through, even_label, odd_label);
1942 121922 : return;
1943 : }
1944 :
1945 : // It's not worth using table lookup if there are very few intervals in the
1946 : // character class.
1947 66701 : if (end_index - start_index <= 6) {
1948 : // It is faster to test for individual characters, so we look for those
1949 : // first, then try arbitrary ranges in the second round.
1950 : static int kNoCutIndex = -1;
1951 33358 : int cut = kNoCutIndex;
1952 137410 : for (int i = start_index; i < end_index; i++) {
1953 175858 : if (ranges->at(i) == ranges->at(i + 1) - 1) {
1954 : cut = i;
1955 : break;
1956 : }
1957 : }
1958 33358 : if (cut == kNoCutIndex) cut = start_index;
1959 : CutOutRange(
1960 33358 : masm, ranges, start_index, end_index, cut, even_label, odd_label);
1961 : DCHECK_GE(end_index - start_index, 2);
1962 : GenerateBranches(masm,
1963 : ranges,
1964 : start_index + 1,
1965 : end_index - 1,
1966 : min_char,
1967 : max_char,
1968 : fall_through,
1969 : even_label,
1970 33358 : odd_label);
1971 33358 : return;
1972 : }
1973 :
1974 : // If there are a lot of intervals in the regexp, then we will use tables to
1975 : // determine whether the character is inside or outside the character class.
1976 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1977 :
1978 33343 : if ((max_char >> kBits) == (min_char >> kBits)) {
1979 : EmitUseLookupTable(masm,
1980 : ranges,
1981 : start_index,
1982 : end_index,
1983 : min_char,
1984 : fall_through,
1985 : even_label,
1986 5861 : odd_label);
1987 5861 : return;
1988 : }
1989 :
1990 27482 : if ((min_char >> kBits) != (first >> kBits)) {
1991 8016 : masm->CheckCharacterLT(first, odd_label);
1992 : GenerateBranches(masm,
1993 : ranges,
1994 : start_index + 1,
1995 : end_index,
1996 : first,
1997 : max_char,
1998 : fall_through,
1999 : odd_label,
2000 8016 : even_label);
2001 8016 : return;
2002 : }
2003 :
2004 19466 : int new_start_index = 0;
2005 19466 : int new_end_index = 0;
2006 19466 : int border = 0;
2007 :
2008 : SplitSearchSpace(ranges,
2009 : start_index,
2010 : end_index,
2011 : &new_start_index,
2012 : &new_end_index,
2013 19466 : &border);
2014 :
2015 19466 : Label handle_rest;
2016 : Label* above = &handle_rest;
2017 19466 : if (border == last + 1) {
2018 : // We didn't find any section that started after the limit, so everything
2019 : // above the border is one of the terminal labels.
2020 924 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2021 : DCHECK(new_end_index == end_index - 1);
2022 : }
2023 :
2024 : DCHECK_LE(start_index, new_end_index);
2025 : DCHECK_LE(new_start_index, end_index);
2026 : DCHECK_LT(start_index, new_start_index);
2027 : DCHECK_LT(new_end_index, end_index);
2028 : DCHECK(new_end_index + 1 == new_start_index ||
2029 : (new_end_index + 2 == new_start_index &&
2030 : border == ranges->at(new_end_index + 1)));
2031 : DCHECK_LT(min_char, border - 1);
2032 : DCHECK_LT(border, max_char);
2033 : DCHECK_LT(ranges->at(new_end_index), border);
2034 : DCHECK(border < ranges->at(new_start_index) ||
2035 : (border == ranges->at(new_start_index) &&
2036 : new_start_index == end_index &&
2037 : new_end_index == end_index - 1 &&
2038 : border == last + 1));
2039 : DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2040 :
2041 19466 : masm->CheckCharacterGT(border - 1, above);
2042 19466 : Label dummy;
2043 : GenerateBranches(masm,
2044 : ranges,
2045 : start_index,
2046 : new_end_index,
2047 : min_char,
2048 : border - 1,
2049 : &dummy,
2050 : even_label,
2051 19466 : odd_label);
2052 19466 : if (handle_rest.is_linked()) {
2053 18542 : masm->Bind(&handle_rest);
2054 18542 : bool flip = (new_start_index & 1) != (start_index & 1);
2055 : GenerateBranches(masm,
2056 : ranges,
2057 : new_start_index,
2058 : end_index,
2059 : border,
2060 : max_char,
2061 : &dummy,
2062 : flip ? odd_label : even_label,
2063 18542 : flip ? even_label : odd_label);
2064 : }
2065 : }
2066 :
2067 :
2068 210709 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2069 : RegExpCharacterClass* cc, bool one_byte,
2070 : Label* on_failure, int cp_offset, bool check_offset,
2071 : bool preloaded, Zone* zone) {
2072 210709 : ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2073 210709 : CharacterRange::Canonicalize(ranges);
2074 :
2075 : int max_char;
2076 210709 : if (one_byte) {
2077 : max_char = String::kMaxOneByteCharCode;
2078 : } else {
2079 : max_char = String::kMaxUtf16CodeUnit;
2080 : }
2081 :
2082 : int range_count = ranges->length();
2083 :
2084 210709 : int last_valid_range = range_count - 1;
2085 602422 : while (last_valid_range >= 0) {
2086 391678 : CharacterRange& range = ranges->at(last_valid_range);
2087 391678 : if (range.from() <= max_char) {
2088 : break;
2089 : }
2090 181004 : last_valid_range--;
2091 : }
2092 :
2093 210709 : if (last_valid_range < 0) {
2094 35 : if (!cc->is_negated()) {
2095 10 : macro_assembler->GoTo(on_failure);
2096 : }
2097 35 : if (check_offset) {
2098 35 : macro_assembler->CheckPosition(cp_offset, on_failure);
2099 : }
2100 93162 : return;
2101 : }
2102 :
2103 398932 : if (last_valid_range == 0 &&
2104 : ranges->at(0).IsEverything(max_char)) {
2105 82505 : if (cc->is_negated()) {
2106 31 : macro_assembler->GoTo(on_failure);
2107 : } else {
2108 : // This is a common case hit by non-anchored expressions.
2109 82474 : if (check_offset) {
2110 53820 : macro_assembler->CheckPosition(cp_offset, on_failure);
2111 : }
2112 : }
2113 : return;
2114 : }
2115 :
2116 128169 : if (!preloaded) {
2117 116437 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2118 : }
2119 :
2120 139182 : if (cc->is_standard(zone) &&
2121 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2122 22026 : on_failure)) {
2123 : return;
2124 : }
2125 :
2126 :
2127 : // A new list with ascending entries. Each entry is a code unit
2128 : // where there is a boundary between code units that are part of
2129 : // the class and code units that are not. Normally we insert an
2130 : // entry at zero which goes to the failure label, but if there
2131 : // was already one there we fall through for success on that entry.
2132 : // Subsequent entries have alternating meaning (success/failure).
2133 117547 : ZoneList<int>* range_boundaries =
2134 117547 : new(zone) ZoneList<int>(last_valid_range, zone);
2135 :
2136 117547 : bool zeroth_entry_is_failure = !cc->is_negated();
2137 :
2138 333135 : for (int i = 0; i <= last_valid_range; i++) {
2139 431176 : CharacterRange& range = ranges->at(i);
2140 215588 : if (range.from() == 0) {
2141 : DCHECK_EQ(i, 0);
2142 1999 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
2143 : } else {
2144 213589 : range_boundaries->Add(range.from(), zone);
2145 : }
2146 215588 : range_boundaries->Add(range.to() + 1, zone);
2147 : }
2148 117547 : int end_index = range_boundaries->length() - 1;
2149 117547 : if (range_boundaries->at(end_index) > max_char) {
2150 2597 : end_index--;
2151 : }
2152 :
2153 117547 : Label fall_through;
2154 : GenerateBranches(macro_assembler,
2155 : range_boundaries,
2156 : 0, // start_index.
2157 : end_index,
2158 : 0, // min_char.
2159 : max_char,
2160 : &fall_through,
2161 : zeroth_entry_is_failure ? &fall_through : on_failure,
2162 117547 : zeroth_entry_is_failure ? on_failure : &fall_through);
2163 117547 : macro_assembler->Bind(&fall_through);
2164 : }
2165 :
2166 : RegExpNode::~RegExpNode() = default;
2167 :
2168 4448772 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2169 2174867 : Trace* trace) {
2170 : // If we are generating a greedy loop then don't stop and don't reuse code.
2171 1696694 : if (trace->stop_node() != nullptr) {
2172 : return CONTINUE;
2173 : }
2174 :
2175 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2176 1684989 : if (trace->is_trivial()) {
2177 1541044 : if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2178 : // If a generic version is already scheduled to be generated or we have
2179 : // recursed too deeply then just generate a jump to that code.
2180 220122 : macro_assembler->GoTo(&label_);
2181 : // This will queue it up for generation of a generic version if it hasn't
2182 : // already been queued.
2183 220122 : compiler->AddWork(this);
2184 220122 : return DONE;
2185 : }
2186 : // Generate generic version of the node and bind the label for later use.
2187 392601 : macro_assembler->Bind(&label_);
2188 392601 : return CONTINUE;
2189 : }
2190 :
2191 : // We are being asked to make a non-generic version. Keep track of how many
2192 : // non-generic versions we generate so as not to overdo it.
2193 1072266 : trace_count_++;
2194 2139355 : if (KeepRecursing(compiler) && compiler->optimize() &&
2195 : trace_count_ < kMaxCopiesCodeGenerated) {
2196 : return CONTINUE;
2197 : }
2198 :
2199 : // If we get here code has been generated for this node too many times or
2200 : // recursion is too deep. Time to switch to a generic version. The code for
2201 : // generic versions above can handle deep recursion properly.
2202 : bool was_limiting = compiler->limiting_recursion();
2203 : compiler->set_limiting_recursion(true);
2204 477560 : trace->Flush(compiler, this);
2205 : compiler->set_limiting_recursion(was_limiting);
2206 477560 : return DONE;
2207 : }
2208 :
2209 :
2210 3635362 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2211 3635362 : return !compiler->limiting_recursion() &&
2212 0 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2213 : }
2214 :
2215 :
2216 585284 : int ActionNode::EatsAtLeast(int still_to_find,
2217 : int budget,
2218 : bool not_at_start) {
2219 585284 : if (budget <= 0) return 0;
2220 572024 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2221 566896 : return on_success()->EatsAtLeast(still_to_find,
2222 : budget - 1,
2223 566896 : not_at_start);
2224 : }
2225 :
2226 :
2227 90547 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2228 : BoyerMooreLookahead* bm, bool not_at_start) {
2229 90547 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2230 90547 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2231 : }
2232 : SaveBMInfo(bm, not_at_start, offset);
2233 90547 : }
2234 :
2235 :
2236 10619 : int AssertionNode::EatsAtLeast(int still_to_find,
2237 : int budget,
2238 9582 : bool not_at_start) {
2239 10619 : if (budget <= 0) return 0;
2240 : // If we know we are not at the start and we are asked "how many characters
2241 : // will you match if you succeed?" then we can answer anything since false
2242 : // implies false. So lets just return the max answer (still_to_find) since
2243 : // that won't prevent us from preloading a lot of characters for the other
2244 : // branches in the node graph.
2245 9582 : if (assertion_type() == AT_START && not_at_start) return still_to_find;
2246 9360 : return on_success()->EatsAtLeast(still_to_find,
2247 : budget - 1,
2248 9360 : not_at_start);
2249 : }
2250 :
2251 :
2252 379 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2253 379 : BoyerMooreLookahead* bm, bool not_at_start) {
2254 : // Match the behaviour of EatsAtLeast on this node.
2255 758 : if (assertion_type() == AT_START && not_at_start) return;
2256 363 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2257 : SaveBMInfo(bm, not_at_start, offset);
2258 : }
2259 :
2260 :
2261 3111 : int BackReferenceNode::EatsAtLeast(int still_to_find,
2262 : int budget,
2263 3111 : bool not_at_start) {
2264 3111 : if (read_backward()) return 0;
2265 3001 : if (budget <= 0) return 0;
2266 3001 : return on_success()->EatsAtLeast(still_to_find,
2267 : budget - 1,
2268 3001 : not_at_start);
2269 : }
2270 :
2271 :
2272 5762496 : int TextNode::EatsAtLeast(int still_to_find,
2273 : int budget,
2274 5762496 : bool not_at_start) {
2275 5762496 : if (read_backward()) return 0;
2276 5760644 : int answer = Length();
2277 5760644 : if (answer >= still_to_find) return answer;
2278 3398486 : if (budget <= 0) return answer;
2279 : // We are not at start after this node so we set the last argument to 'true'.
2280 2374547 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
2281 : budget - 1,
2282 2374547 : true);
2283 : }
2284 :
2285 :
2286 9503 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2287 : bool not_at_start) {
2288 9503 : if (budget <= 0) return 0;
2289 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2290 : // afterwards.
2291 18582 : RegExpNode* node = alternatives_->at(1).node();
2292 9291 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2293 : }
2294 :
2295 :
2296 3807 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2297 : QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2298 : bool not_at_start) {
2299 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2300 : // afterwards.
2301 7614 : RegExpNode* node = alternatives_->at(1).node();
2302 3807 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2303 : }
2304 :
2305 :
2306 6946669 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2307 : int budget,
2308 : RegExpNode* ignore_this_node,
2309 : bool not_at_start) {
2310 6946669 : if (budget <= 0) return 0;
2311 : int min = 100;
2312 4820198 : int choice_count = alternatives_->length();
2313 4820198 : budget = (budget - 1) / choice_count;
2314 10722748 : for (int i = 0; i < choice_count; i++) {
2315 20802814 : RegExpNode* node = alternatives_->at(i).node();
2316 10401407 : if (node == ignore_this_node) continue;
2317 : int node_eats_at_least =
2318 10256091 : node->EatsAtLeast(still_to_find, budget, not_at_start);
2319 10256091 : if (node_eats_at_least < min) min = node_eats_at_least;
2320 10256091 : if (min == 0) return 0;
2321 : }
2322 : return min;
2323 : }
2324 :
2325 :
2326 154091 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
2327 : int budget,
2328 : bool not_at_start) {
2329 : return EatsAtLeastHelper(still_to_find,
2330 : budget - 1,
2331 : loop_node_,
2332 154091 : not_at_start);
2333 : }
2334 :
2335 :
2336 6792578 : int ChoiceNode::EatsAtLeast(int still_to_find,
2337 : int budget,
2338 : bool not_at_start) {
2339 6792578 : return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2340 : }
2341 :
2342 :
2343 : // Takes the left-most 1-bit and smears it out, setting all bits to its right.
2344 : static inline uint32_t SmearBitsRight(uint32_t v) {
2345 245325 : v |= v >> 1;
2346 245325 : v |= v >> 2;
2347 245325 : v |= v >> 4;
2348 245325 : v |= v >> 8;
2349 245325 : v |= v >> 16;
2350 : return v;
2351 : }
2352 :
2353 :
2354 269024 : bool QuickCheckDetails::Rationalize(bool asc) {
2355 : bool found_useful_op = false;
2356 : uint32_t char_mask;
2357 269024 : if (asc) {
2358 : char_mask = String::kMaxOneByteCharCode;
2359 : } else {
2360 : char_mask = String::kMaxUtf16CodeUnit;
2361 : }
2362 269024 : mask_ = 0;
2363 269024 : value_ = 0;
2364 : int char_shift = 0;
2365 735548 : for (int i = 0; i < characters_; i++) {
2366 466524 : Position* pos = &positions_[i];
2367 466524 : if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2368 : found_useful_op = true;
2369 : }
2370 466524 : mask_ |= (pos->mask & char_mask) << char_shift;
2371 466524 : value_ |= (pos->value & char_mask) << char_shift;
2372 466524 : char_shift += asc ? 8 : 16;
2373 : }
2374 269024 : return found_useful_op;
2375 : }
2376 :
2377 :
2378 1188933 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2379 62922 : Trace* bounds_check_trace,
2380 591317 : Trace* trace,
2381 : bool preload_has_checked_bounds,
2382 : Label* on_possible_success,
2383 1417531 : QuickCheckDetails* details,
2384 : bool fall_through_on_failure) {
2385 473927 : if (details->characters() == 0) return false;
2386 : GetQuickCheckDetails(
2387 538276 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2388 269138 : if (details->cannot_match()) return false;
2389 269024 : if (!details->Rationalize(compiler->one_byte())) return false;
2390 : DCHECK(details->characters() == 1 ||
2391 : compiler->macro_assembler()->CanReadUnaligned());
2392 : uint32_t mask = details->mask();
2393 : uint32_t value = details->value();
2394 :
2395 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2396 :
2397 224822 : if (trace->characters_preloaded() != details->characters()) {
2398 : DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2399 : // We are attempting to preload the minimum number of characters
2400 : // any choice would eat, so if the bounds check fails, then none of the
2401 : // choices can succeed, so we can just immediately backtrack, rather
2402 : // than go to the next choice.
2403 : assembler->LoadCurrentCharacter(trace->cp_offset(),
2404 : bounds_check_trace->backtrack(),
2405 62922 : !preload_has_checked_bounds,
2406 188766 : details->characters());
2407 : }
2408 :
2409 :
2410 : bool need_mask = true;
2411 :
2412 224822 : if (details->characters() == 1) {
2413 : // If number of characters preloaded is 1 then we used a byte or 16 bit
2414 : // load so the value is already masked down.
2415 : uint32_t char_mask;
2416 46925 : if (compiler->one_byte()) {
2417 : char_mask = String::kMaxOneByteCharCode;
2418 : } else {
2419 : char_mask = String::kMaxUtf16CodeUnit;
2420 : }
2421 46925 : if ((mask & char_mask) == char_mask) need_mask = false;
2422 : mask &= char_mask;
2423 : } else {
2424 : // For 2-character preloads in one-byte mode or 1-character preloads in
2425 : // two-byte mode we also use a 16 bit load with zero extend.
2426 : static const uint32_t kTwoByteMask = 0xFFFF;
2427 : static const uint32_t kFourByteMask = 0xFFFFFFFF;
2428 352132 : if (details->characters() == 2 && compiler->one_byte()) {
2429 159280 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2430 18617 : } else if (details->characters() == 1 && !compiler->one_byte()) {
2431 0 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2432 : } else {
2433 18617 : if (mask == kFourByteMask) need_mask = false;
2434 : }
2435 : }
2436 :
2437 224822 : if (fall_through_on_failure) {
2438 190387 : if (need_mask) {
2439 51650 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2440 : } else {
2441 138737 : assembler->CheckCharacter(value, on_possible_success);
2442 : }
2443 : } else {
2444 34435 : if (need_mask) {
2445 7696 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2446 : } else {
2447 61174 : assembler->CheckNotCharacter(value, trace->backtrack());
2448 : }
2449 : }
2450 : return true;
2451 : }
2452 :
2453 :
2454 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
2455 : // super-class in the .h file).
2456 : //
2457 : // We iterate along the text object, building up for each character a
2458 : // mask and value that can be used to test for a quick failure to match.
2459 : // The masks and values for the positions will be combined into a single
2460 : // machine word for the current character width in order to be used in
2461 : // generating a quick check.
2462 1697604 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2463 995862 : RegExpCompiler* compiler,
2464 : int characters_filled_in,
2465 1040561 : bool not_at_start) {
2466 : // Do not collect any quick check details if the text node reads backward,
2467 : // since it reads in the opposite direction than we use for quick checks.
2468 494641 : if (read_backward()) return;
2469 494641 : Isolate* isolate = compiler->macro_assembler()->isolate();
2470 : DCHECK(characters_filled_in < details->characters());
2471 : int characters = details->characters();
2472 : int char_mask;
2473 494641 : if (compiler->one_byte()) {
2474 : char_mask = String::kMaxOneByteCharCode;
2475 : } else {
2476 : char_mask = String::kMaxUtf16CodeUnit;
2477 : }
2478 1091840 : for (int k = 0; k < elements()->length(); k++) {
2479 499944 : TextElement elm = elements()->at(k);
2480 499944 : if (elm.text_type() == TextElement::ATOM) {
2481 : Vector<const uc16> quarks = elm.atom()->data();
2482 1136622 : for (int i = 0; i < characters && i < quarks.length(); i++) {
2483 : QuickCheckDetails::Position* pos =
2484 535039 : details->positions(characters_filled_in);
2485 1070078 : uc16 c = quarks[i];
2486 535039 : if (elm.atom()->ignore_case()) {
2487 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2488 : int length = GetCaseIndependentLetters(isolate, c,
2489 6580 : compiler->one_byte(), chars);
2490 6580 : if (length == 0) {
2491 : // This can happen because all case variants are non-Latin1, but we
2492 : // know the input is Latin1.
2493 : details->set_cannot_match();
2494 25 : pos->determines_perfectly = false;
2495 25 : return;
2496 : }
2497 6555 : if (length == 1) {
2498 : // This letter has no case equivalents, so it's nice and simple
2499 : // and the mask-compare will determine definitely whether we have
2500 : // a match at this character position.
2501 1249 : pos->mask = char_mask;
2502 1249 : pos->value = c;
2503 1249 : pos->determines_perfectly = true;
2504 : } else {
2505 5306 : uint32_t common_bits = char_mask;
2506 5306 : uint32_t bits = chars[0];
2507 10969 : for (int j = 1; j < length; j++) {
2508 5663 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2509 5663 : common_bits ^= differing_bits;
2510 5663 : bits &= common_bits;
2511 : }
2512 : // If length is 2 and common bits has only one zero in it then
2513 : // our mask and compare instruction will determine definitely
2514 : // whether we have a match at this character position. Otherwise
2515 : // it can only be an approximate check.
2516 5306 : uint32_t one_zero = (common_bits | ~char_mask);
2517 5306 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2518 4894 : pos->determines_perfectly = true;
2519 : }
2520 5306 : pos->mask = common_bits;
2521 5306 : pos->value = bits;
2522 : }
2523 : } else {
2524 : // Don't ignore case. Nice simple case where the mask-compare will
2525 : // determine definitely whether we have a match at this character
2526 : // position.
2527 528459 : if (c > char_mask) {
2528 : details->set_cannot_match();
2529 25 : pos->determines_perfectly = false;
2530 25 : return;
2531 : }
2532 528434 : pos->mask = char_mask;
2533 528434 : pos->value = c;
2534 528434 : pos->determines_perfectly = true;
2535 : }
2536 534989 : characters_filled_in++;
2537 : DCHECK(characters_filled_in <= details->characters());
2538 534989 : if (characters_filled_in == details->characters()) {
2539 : return;
2540 : }
2541 : }
2542 : } else {
2543 : QuickCheckDetails::Position* pos =
2544 127407 : details->positions(characters_filled_in);
2545 : RegExpCharacterClass* tree = elm.char_class();
2546 507814 : ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2547 : DCHECK(!ranges->is_empty());
2548 127407 : if (tree->is_negated()) {
2549 : // A quick check uses multi-character mask and compare. There is no
2550 : // useful way to incorporate a negative char class into this scheme
2551 : // so we just conservatively create a mask and value that will always
2552 : // succeed.
2553 3636 : pos->mask = 0;
2554 3636 : pos->value = 0;
2555 : } else {
2556 : int first_range = 0;
2557 123821 : while (ranges->at(first_range).from() > char_mask) {
2558 100 : first_range++;
2559 100 : if (first_range == ranges->length()) {
2560 : details->set_cannot_match();
2561 50 : pos->determines_perfectly = false;
2562 : return;
2563 : }
2564 : }
2565 123721 : CharacterRange range = ranges->at(first_range);
2566 123721 : uc16 from = range.from();
2567 123721 : uc16 to = range.to();
2568 123721 : if (to > char_mask) {
2569 15982 : to = char_mask;
2570 : }
2571 123721 : uint32_t differing_bits = (from ^ to);
2572 : // A mask and compare is only perfect if the differing bits form a
2573 : // number like 00011111 with one single block of trailing 1s.
2574 229146 : if ((differing_bits & (differing_bits + 1)) == 0 &&
2575 105425 : from + differing_bits == to) {
2576 95775 : pos->determines_perfectly = true;
2577 : }
2578 123721 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
2579 123721 : uint32_t bits = (from & common_bits);
2580 760614 : for (int i = first_range + 1; i < ranges->length(); i++) {
2581 256586 : CharacterRange range = ranges->at(i);
2582 256586 : uc16 from = range.from();
2583 256586 : uc16 to = range.to();
2584 256586 : if (from > char_mask) continue;
2585 121604 : if (to > char_mask) to = char_mask;
2586 : // Here we are combining more ranges into the mask and compare
2587 : // value. With each new range the mask becomes more sparse and
2588 : // so the chances of a false positive rise. A character class
2589 : // with multiple ranges is assumed never to be equivalent to a
2590 : // mask and compare operation.
2591 121604 : pos->determines_perfectly = false;
2592 121604 : uint32_t new_common_bits = (from ^ to);
2593 121604 : new_common_bits = ~SmearBitsRight(new_common_bits);
2594 121604 : common_bits &= new_common_bits;
2595 121604 : bits &= new_common_bits;
2596 121604 : uint32_t differing_bits = (from & common_bits) ^ bits;
2597 121604 : common_bits ^= differing_bits;
2598 121604 : bits &= common_bits;
2599 : }
2600 123721 : pos->mask = common_bits;
2601 123721 : pos->value = bits;
2602 : }
2603 127357 : characters_filled_in++;
2604 : DCHECK(characters_filled_in <= details->characters());
2605 127357 : if (characters_filled_in == details->characters()) {
2606 : return;
2607 : }
2608 : }
2609 : }
2610 : DCHECK(characters_filled_in != details->characters());
2611 45976 : if (!details->cannot_match()) {
2612 45976 : on_success()-> GetQuickCheckDetails(details,
2613 : compiler,
2614 : characters_filled_in,
2615 45976 : true);
2616 : }
2617 : }
2618 :
2619 :
2620 0 : void QuickCheckDetails::Clear() {
2621 375116 : for (int i = 0; i < characters_; i++) {
2622 375116 : positions_[i].mask = 0;
2623 375116 : positions_[i].value = 0;
2624 375116 : positions_[i].determines_perfectly = false;
2625 : }
2626 1090785 : characters_ = 0;
2627 0 : }
2628 :
2629 :
2630 515743 : void QuickCheckDetails::Advance(int by, bool one_byte) {
2631 515743 : if (by >= characters_ || by < 0) {
2632 : DCHECK_IMPLIES(by < 0, characters_ == 0);
2633 : Clear();
2634 515743 : return;
2635 : }
2636 : DCHECK_LE(characters_ - by, 4);
2637 : DCHECK_LE(characters_, 4);
2638 30619 : for (int i = 0; i < characters_ - by; i++) {
2639 30619 : positions_[i] = positions_[by + i];
2640 : }
2641 30669 : for (int i = characters_ - by; i < characters_; i++) {
2642 30669 : positions_[i].mask = 0;
2643 30669 : positions_[i].value = 0;
2644 30669 : positions_[i].determines_perfectly = false;
2645 : }
2646 28580 : characters_ -= by;
2647 : // We could change mask_ and value_ here but we would never advance unless
2648 : // they had already been used in a check and they won't be used again because
2649 : // it would gain us nothing. So there's no point.
2650 : }
2651 :
2652 :
2653 183926 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2654 : DCHECK(characters_ == other->characters_);
2655 183926 : if (other->cannot_match_) {
2656 : return;
2657 : }
2658 183842 : if (cannot_match_) {
2659 265 : *this = *other;
2660 265 : return;
2661 : }
2662 202826 : for (int i = from_index; i < characters_; i++) {
2663 202826 : QuickCheckDetails::Position* pos = positions(i);
2664 202826 : QuickCheckDetails::Position* other_pos = other->positions(i);
2665 240663 : if (pos->mask != other_pos->mask ||
2666 47933 : pos->value != other_pos->value ||
2667 10096 : !other_pos->determines_perfectly) {
2668 : // Our mask-compare operation will be approximate unless we have the
2669 : // exact same operation on both sides of the alternation.
2670 195819 : pos->determines_perfectly = false;
2671 : }
2672 202826 : pos->mask &= other_pos->mask;
2673 202826 : pos->value &= pos->mask;
2674 202826 : other_pos->value &= pos->mask;
2675 202826 : uc16 differing_bits = (pos->value ^ other_pos->value);
2676 202826 : pos->mask &= ~differing_bits;
2677 202826 : pos->value &= pos->mask;
2678 : }
2679 : }
2680 :
2681 :
2682 : class VisitMarker {
2683 : public:
2684 : explicit VisitMarker(NodeInfo* info) : info_(info) {
2685 : DCHECK(!info->visited);
2686 198408 : info->visited = true;
2687 : }
2688 : ~VisitMarker() {
2689 174408 : info_->visited = false;
2690 : }
2691 : private:
2692 : NodeInfo* info_;
2693 : };
2694 :
2695 99118 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2696 99118 : if (info()->replacement_calculated) return replacement();
2697 72343 : if (depth < 0) return this;
2698 : DCHECK(!info()->visited);
2699 72191 : VisitMarker marker(info());
2700 : return FilterSuccessor(depth - 1);
2701 : }
2702 :
2703 0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2704 132374 : RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2705 132374 : if (next == nullptr) return set_replacement(nullptr);
2706 131897 : on_success_ = next;
2707 131897 : return set_replacement(this);
2708 : }
2709 :
2710 : // We need to check for the following characters: 0x39C 0x3BC 0x178.
2711 1446 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2712 : // TODO(dcarney): this could be a lot more efficient.
2713 4212 : return range.Contains(0x039C) || range.Contains(0x03BC) ||
2714 1446 : range.Contains(0x0178);
2715 : }
2716 :
2717 :
2718 87 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2719 102 : for (int i = 0; i < ranges->length(); i++) {
2720 : // TODO(dcarney): this could be a lot more efficient.
2721 41 : if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2722 : }
2723 : return false;
2724 : }
2725 :
2726 190711 : RegExpNode* TextNode::FilterOneByte(int depth) {
2727 99450 : if (info()->replacement_calculated) return replacement();
2728 60658 : if (depth < 0) return this;
2729 : DCHECK(!info()->visited);
2730 60613 : VisitMarker marker(info());
2731 60613 : int element_count = elements()->length();
2732 125359 : for (int i = 0; i < element_count; i++) {
2733 65176 : TextElement elm = elements()->at(i);
2734 65176 : if (elm.text_type() == TextElement::ATOM) {
2735 : Vector<const uc16> quarks = elm.atom()->data();
2736 177276 : for (int j = 0; j < quarks.length(); j++) {
2737 116312 : uint16_t c = quarks[j];
2738 58156 : if (elm.atom()->ignore_case()) {
2739 : c = unibrow::Latin1::TryConvertToLatin1(c);
2740 : }
2741 58156 : if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2742 : // Replace quark in case we converted to Latin-1.
2743 : uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2744 57990 : writable_quarks[j] = c;
2745 : }
2746 : } else {
2747 : DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2748 : RegExpCharacterClass* cc = elm.char_class();
2749 34528 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2750 34528 : CharacterRange::Canonicalize(ranges);
2751 : // Now they are in order so we only need to look at the first.
2752 : int range_count = ranges->length();
2753 34528 : if (cc->is_negated()) {
2754 8538 : if (range_count != 0 &&
2755 8716 : ranges->at(0).from() == 0 &&
2756 178 : ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2757 : // This will be handled in a later filter.
2758 40 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2759 : continue;
2760 39 : return set_replacement(nullptr);
2761 : }
2762 : } else {
2763 60518 : if (range_count == 0 ||
2764 30259 : ranges->at(0).from() > String::kMaxOneByteCharCode) {
2765 : // This will be handled in a later filter.
2766 250 : if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2767 : continue;
2768 225 : return set_replacement(nullptr);
2769 : }
2770 : }
2771 : }
2772 : }
2773 60183 : return FilterSuccessor(depth - 1);
2774 : }
2775 :
2776 59897 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2777 59897 : if (info()->replacement_calculated) return replacement();
2778 46219 : if (depth < 0) return this;
2779 46129 : if (info()->visited) return this;
2780 : {
2781 24343 : VisitMarker marker(info());
2782 :
2783 24343 : RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2784 : // If we can't continue after the loop then there is no sense in doing the
2785 : // loop.
2786 24343 : if (continue_replacement == nullptr) return set_replacement(nullptr);
2787 : }
2788 :
2789 24000 : return ChoiceNode::FilterOneByte(depth - 1);
2790 : }
2791 :
2792 29868 : RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2793 30018 : if (info()->replacement_calculated) return replacement();
2794 27824 : if (depth < 0) return this;
2795 27729 : if (info()->visited) return this;
2796 27729 : VisitMarker marker(info());
2797 27729 : int choice_count = alternatives_->length();
2798 :
2799 86140 : for (int i = 0; i < choice_count; i++) {
2800 60773 : GuardedAlternative alternative = alternatives_->at(i);
2801 63135 : if (alternative.guards() != nullptr &&
2802 2362 : alternative.guards()->length() != 0) {
2803 2362 : set_replacement(this);
2804 : return this;
2805 : }
2806 : }
2807 :
2808 : int surviving = 0;
2809 : RegExpNode* survivor = nullptr;
2810 58054 : for (int i = 0; i < choice_count; i++) {
2811 116108 : GuardedAlternative alternative = alternatives_->at(i);
2812 58054 : RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2813 : DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2814 58054 : if (replacement != nullptr) {
2815 57908 : alternatives_->at(i).set_node(replacement);
2816 57908 : surviving++;
2817 : survivor = replacement;
2818 : }
2819 : }
2820 25433 : if (surviving < 2) return set_replacement(survivor);
2821 :
2822 25301 : set_replacement(this);
2823 25301 : if (surviving == choice_count) {
2824 : return this;
2825 : }
2826 : // Only some of the nodes survived the filtering. We need to rebuild the
2827 : // alternatives list.
2828 : ZoneList<GuardedAlternative>* new_alternatives =
2829 20 : new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2830 200 : for (int i = 0; i < choice_count; i++) {
2831 : RegExpNode* replacement =
2832 360 : alternatives_->at(i).node()->FilterOneByte(depth - 1);
2833 180 : if (replacement != nullptr) {
2834 130 : alternatives_->at(i).set_node(replacement);
2835 260 : new_alternatives->Add(alternatives_->at(i), zone());
2836 : }
2837 : }
2838 20 : alternatives_ = new_alternatives;
2839 20 : return this;
2840 : }
2841 :
2842 357 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2843 357 : if (info()->replacement_calculated) return replacement();
2844 357 : if (depth < 0) return this;
2845 357 : if (info()->visited) return this;
2846 357 : VisitMarker marker(info());
2847 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2848 : // afterwards.
2849 714 : RegExpNode* node = alternatives_->at(1).node();
2850 357 : RegExpNode* replacement = node->FilterOneByte(depth - 1);
2851 362 : if (replacement == nullptr) return set_replacement(nullptr);
2852 352 : alternatives_->at(1).set_node(replacement);
2853 :
2854 704 : RegExpNode* neg_node = alternatives_->at(0).node();
2855 352 : RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2856 : // If the negative lookahead is always going to fail then
2857 : // we don't need to check it.
2858 357 : if (neg_replacement == nullptr) return set_replacement(replacement);
2859 347 : alternatives_->at(0).set_node(neg_replacement);
2860 694 : return set_replacement(this);
2861 : }
2862 :
2863 :
2864 16630 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2865 : RegExpCompiler* compiler,
2866 : int characters_filled_in,
2867 : bool not_at_start) {
2868 16630 : if (body_can_be_zero_length_ || info()->visited) return;
2869 13175 : VisitMarker marker(info());
2870 : return ChoiceNode::GetQuickCheckDetails(details,
2871 : compiler,
2872 : characters_filled_in,
2873 13175 : not_at_start);
2874 : }
2875 :
2876 :
2877 5019 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2878 : BoyerMooreLookahead* bm, bool not_at_start) {
2879 5019 : if (body_can_be_zero_length_ || budget <= 0) {
2880 : bm->SetRest(offset);
2881 : SaveBMInfo(bm, not_at_start, offset);
2882 5019 : return;
2883 : }
2884 4830 : ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2885 : SaveBMInfo(bm, not_at_start, offset);
2886 : }
2887 :
2888 :
2889 229536 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2890 : RegExpCompiler* compiler,
2891 : int characters_filled_in,
2892 : bool not_at_start) {
2893 45610 : not_at_start = (not_at_start || not_at_start_);
2894 45610 : int choice_count = alternatives_->length();
2895 : DCHECK_LT(0, choice_count);
2896 45610 : alternatives_->at(0).node()->GetQuickCheckDetails(details,
2897 : compiler,
2898 : characters_filled_in,
2899 45610 : not_at_start);
2900 229536 : for (int i = 1; i < choice_count; i++) {
2901 : QuickCheckDetails new_details(details->characters());
2902 367852 : RegExpNode* node = alternatives_->at(i).node();
2903 : node->GetQuickCheckDetails(&new_details, compiler,
2904 : characters_filled_in,
2905 183926 : not_at_start);
2906 : // Here we merge the quick match details of the two branches.
2907 183926 : details->Merge(&new_details, characters_filled_in);
2908 : }
2909 45610 : }
2910 :
2911 :
2912 : // Check for [0-9A-Z_a-z].
2913 557 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
2914 : Label* word,
2915 : Label* non_word,
2916 : bool fall_through_on_word) {
2917 557 : if (assembler->CheckSpecialCharacterClass(
2918 : fall_through_on_word ? 'w' : 'W',
2919 557 : fall_through_on_word ? non_word : word)) {
2920 : // Optimized implementation available.
2921 557 : return;
2922 : }
2923 0 : assembler->CheckCharacterGT('z', non_word);
2924 0 : assembler->CheckCharacterLT('0', non_word);
2925 0 : assembler->CheckCharacterGT('a' - 1, word);
2926 0 : assembler->CheckCharacterLT('9' + 1, word);
2927 0 : assembler->CheckCharacterLT('A', non_word);
2928 0 : assembler->CheckCharacterLT('Z' + 1, word);
2929 0 : if (fall_through_on_word) {
2930 0 : assembler->CheckNotCharacter('_', non_word);
2931 : } else {
2932 0 : assembler->CheckCharacter('_', word);
2933 : }
2934 : }
2935 :
2936 :
2937 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2938 : // that matches newline or the start of input).
2939 129 : static void EmitHat(RegExpCompiler* compiler,
2940 : RegExpNode* on_success,
2941 : Trace* trace) {
2942 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2943 : // We will be loading the previous character into the current character
2944 : // register.
2945 129 : Trace new_trace(*trace);
2946 : new_trace.InvalidateCurrentCharacter();
2947 :
2948 129 : Label ok;
2949 129 : if (new_trace.cp_offset() == 0) {
2950 : // The start of input counts as a newline in this context, so skip to
2951 : // ok if we are at the start.
2952 119 : assembler->CheckAtStart(&ok);
2953 : }
2954 : // We already checked that we are not at the start of input so it must be
2955 : // OK to load the previous character.
2956 129 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2957 : new_trace.backtrack(),
2958 258 : false);
2959 129 : if (!assembler->CheckSpecialCharacterClass('n',
2960 129 : new_trace.backtrack())) {
2961 : // Newline means \n, \r, 0x2028 or 0x2029.
2962 0 : if (!compiler->one_byte()) {
2963 0 : assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2964 : }
2965 0 : assembler->CheckCharacter('\n', &ok);
2966 0 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2967 : }
2968 129 : assembler->Bind(&ok);
2969 129 : on_success->Emit(compiler, &new_trace);
2970 129 : }
2971 :
2972 :
2973 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
2974 811 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2975 255 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2976 : Isolate* isolate = assembler->isolate();
2977 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2978 255 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2979 150 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2980 255 : if (lookahead == nullptr) {
2981 : int eats_at_least =
2982 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2983 : kRecursionBudget,
2984 202 : not_at_start));
2985 202 : if (eats_at_least >= 1) {
2986 97 : BoyerMooreLookahead* bm =
2987 97 : new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
2988 97 : FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2989 97 : if (bm->at(0)->is_non_word())
2990 : next_is_word_character = Trace::FALSE_VALUE;
2991 97 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
2992 : }
2993 : } else {
2994 53 : if (lookahead->at(0)->is_non_word())
2995 : next_is_word_character = Trace::FALSE_VALUE;
2996 53 : if (lookahead->at(0)->is_word())
2997 : next_is_word_character = Trace::TRUE_VALUE;
2998 : }
2999 255 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3000 255 : if (next_is_word_character == Trace::UNKNOWN) {
3001 151 : Label before_non_word;
3002 151 : Label before_word;
3003 151 : if (trace->characters_preloaded() != 1) {
3004 300 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3005 : }
3006 : // Fall through on non-word.
3007 151 : EmitWordCheck(assembler, &before_word, &before_non_word, false);
3008 : // Next character is not a word character.
3009 151 : assembler->Bind(&before_non_word);
3010 151 : Label ok;
3011 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3012 151 : assembler->GoTo(&ok);
3013 :
3014 151 : assembler->Bind(&before_word);
3015 151 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3016 151 : assembler->Bind(&ok);
3017 104 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3018 79 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3019 : } else {
3020 : DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3021 25 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3022 : }
3023 255 : }
3024 :
3025 :
3026 406 : void AssertionNode::BacktrackIfPrevious(
3027 406 : RegExpCompiler* compiler,
3028 : Trace* trace,
3029 : AssertionNode::IfPrevious backtrack_if_previous) {
3030 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3031 406 : Trace new_trace(*trace);
3032 : new_trace.InvalidateCurrentCharacter();
3033 :
3034 406 : Label fall_through, dummy;
3035 :
3036 : Label* non_word = backtrack_if_previous == kIsNonWord ?
3037 194 : new_trace.backtrack() :
3038 406 : &fall_through;
3039 : Label* word = backtrack_if_previous == kIsNonWord ?
3040 : &fall_through :
3041 406 : new_trace.backtrack();
3042 :
3043 406 : if (new_trace.cp_offset() == 0) {
3044 : // The start of input counts as a non-word character, so the question is
3045 : // decided if we are at the start.
3046 169 : assembler->CheckAtStart(non_word);
3047 : }
3048 : // We already checked that we are not at the start of input so it must be
3049 : // OK to load the previous character.
3050 406 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3051 406 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3052 :
3053 406 : assembler->Bind(&fall_through);
3054 406 : on_success()->Emit(compiler, &new_trace);
3055 406 : }
3056 :
3057 :
3058 1994 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3059 : RegExpCompiler* compiler,
3060 : int filled_in,
3061 : bool not_at_start) {
3062 1994 : if (assertion_type_ == AT_START && not_at_start) {
3063 : details->set_cannot_match();
3064 : return;
3065 : }
3066 1631 : return on_success()->GetQuickCheckDetails(details,
3067 : compiler,
3068 : filled_in,
3069 1631 : not_at_start);
3070 : }
3071 :
3072 :
3073 16689 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3074 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3075 5819 : switch (assertion_type_) {
3076 : case AT_END: {
3077 2377 : Label ok;
3078 4754 : assembler->CheckPosition(trace->cp_offset(), &ok);
3079 4754 : assembler->GoTo(trace->backtrack());
3080 2377 : assembler->Bind(&ok);
3081 : break;
3082 : }
3083 : case AT_START: {
3084 3058 : if (trace->at_start() == Trace::FALSE_VALUE) {
3085 10 : assembler->GoTo(trace->backtrack());
3086 5 : return;
3087 : }
3088 3053 : if (trace->at_start() == Trace::UNKNOWN) {
3089 6106 : assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3090 3053 : Trace at_start_trace = *trace;
3091 : at_start_trace.set_at_start(Trace::TRUE_VALUE);
3092 5430 : on_success()->Emit(compiler, &at_start_trace);
3093 : return;
3094 : }
3095 : }
3096 : break;
3097 : case AFTER_NEWLINE:
3098 129 : EmitHat(compiler, on_success(), trace);
3099 129 : return;
3100 : case AT_BOUNDARY:
3101 : case AT_NON_BOUNDARY: {
3102 255 : EmitBoundaryCheck(compiler, trace);
3103 255 : return;
3104 : }
3105 : }
3106 2377 : on_success()->Emit(compiler, trace);
3107 : }
3108 :
3109 :
3110 2729942 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3111 2729942 : if (quick_check == nullptr) return false;
3112 2729942 : if (offset >= quick_check->characters()) return false;
3113 927826 : return quick_check->positions(offset)->determines_perfectly;
3114 : }
3115 :
3116 :
3117 : static void UpdateBoundsCheck(int index, int* checked_up_to) {
3118 765664 : if (index > *checked_up_to) {
3119 407817 : *checked_up_to = index;
3120 : }
3121 : }
3122 :
3123 :
3124 : // We call this repeatedly to generate code for each pass over the text node.
3125 : // The passes are in increasing order of difficulty because we hope one
3126 : // of the first passes will fail in which case we are saved the work of the
3127 : // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3128 : // we will check the '%' in the first pass, the case independent 'a' in the
3129 : // second pass and the character class in the last pass.
3130 : //
3131 : // The passes are done from right to left, so for example to test for /bar/
3132 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3133 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3134 : // bounds check most of the time. In the example we only need to check for
3135 : // end-of-input when loading the putative 'r'.
3136 : //
3137 : // A slight complication involves the fact that the first character may already
3138 : // be fetched into a register by the previous node. In this case we want to
3139 : // do the test for that character first. We do this in separate passes. The
3140 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3141 : // pass has been performed then subsequent passes will have true in
3142 : // first_element_checked to indicate that that character does not need to be
3143 : // checked again.
3144 : //
3145 : // In addition to all this we are passed a Trace, which can
3146 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3147 : // object we can see details of any quick check that was already passed in
3148 : // order to get to the code we are now generating. The quick check can involve
3149 : // loading characters, which means we do not need to recheck the bounds
3150 : // up to the limit the quick check already checked. In addition the quick
3151 : // check can have involved a mask and compare operation which may simplify
3152 : // or obviate the need for further checks at some character positions.
3153 5139420 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
3154 : TextEmitPassType pass,
3155 : bool preloaded,
3156 5375253 : Trace* trace,
3157 : bool first_element_checked,
3158 8303466 : int* checked_up_to) {
3159 2569710 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3160 : Isolate* isolate = assembler->isolate();
3161 : bool one_byte = compiler->one_byte();
3162 : Label* backtrack = trace->backtrack();
3163 2569710 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3164 2569710 : int element_count = elements()->length();
3165 2569710 : int backward_offset = read_backward() ? -Length() : 0;
3166 5375228 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3167 2805543 : TextElement elm = elements()->at(i);
3168 2805543 : int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3169 2805543 : if (elm.text_type() == TextElement::ATOM) {
3170 1662919 : if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3171 : Vector<const uc16> quarks = elm.atom()->data();
3172 4465650 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3173 2509366 : if (first_element_checked && i == 0 && j == 0) continue;
3174 4982738 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3175 : EmitCharacterFunction* emit_function = nullptr;
3176 3221906 : uc16 quark = quarks[j];
3177 1610953 : if (elm.atom()->ignore_case()) {
3178 : // Everywhere else we assume that a non-Latin-1 character cannot match
3179 : // a Latin-1 character. Avoid the cases where this is assumption is
3180 : // invalid by using the Latin1 equivalent instead.
3181 : quark = unibrow::Latin1::TryConvertToLatin1(quark);
3182 : }
3183 1610953 : switch (pass) {
3184 : case NON_LATIN1_MATCH:
3185 : DCHECK(one_byte);
3186 495585 : if (quark > String::kMaxOneByteCharCode) {
3187 25 : assembler->GoTo(backtrack);
3188 2569710 : return;
3189 : }
3190 : break;
3191 : case NON_LETTER_CHARACTER_MATCH:
3192 : emit_function = &EmitAtomNonLetter;
3193 5344 : break;
3194 : case SIMPLE_CHARACTER_MATCH:
3195 : emit_function = &EmitSimpleCharacter;
3196 549668 : break;
3197 : case CASE_CHARACTER_MATCH:
3198 : emit_function = &EmitAtomLetter;
3199 5344 : break;
3200 : default:
3201 : break;
3202 : }
3203 1610928 : if (emit_function != nullptr) {
3204 871795 : bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3205 : bool bound_checked =
3206 : emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3207 560356 : bounds_check, preloaded);
3208 560356 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3209 : }
3210 : }
3211 : } else {
3212 : DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3213 1142624 : if (pass == CHARACTER_CLASS_MATCH) {
3214 276606 : if (first_element_checked && i == 0) continue;
3215 238573 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3216 : RegExpCharacterClass* cc = elm.char_class();
3217 257773 : bool bounds_check = *checked_up_to < cp_offset || read_backward();
3218 : EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3219 210709 : bounds_check, preloaded, zone());
3220 : UpdateBoundsCheck(cp_offset, checked_up_to);
3221 : }
3222 : }
3223 : }
3224 : }
3225 :
3226 :
3227 6949071 : int TextNode::Length() {
3228 6949071 : TextElement elm = elements()->last();
3229 : DCHECK_LE(0, elm.cp_offset());
3230 6949071 : return elm.cp_offset() + elm.length();
3231 : }
3232 :
3233 0 : bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3234 1662919 : if (ignore_case) {
3235 44329 : return pass == SIMPLE_CHARACTER_MATCH;
3236 : } else {
3237 1618590 : return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3238 : }
3239 : }
3240 :
3241 7202 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3242 : ZoneList<CharacterRange>* ranges,
3243 : bool read_backward,
3244 : RegExpNode* on_success,
3245 : JSRegExp::Flags flags) {
3246 : DCHECK_NOT_NULL(ranges);
3247 7202 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3248 : elms->Add(TextElement::CharClass(
3249 21606 : new (zone) RegExpCharacterClass(zone, ranges, flags)),
3250 7202 : zone);
3251 7202 : return new (zone) TextNode(elms, read_backward, on_success);
3252 : }
3253 :
3254 27070 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3255 : CharacterRange trail,
3256 : bool read_backward,
3257 : RegExpNode* on_success,
3258 : JSRegExp::Flags flags) {
3259 27070 : ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3260 27070 : ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3261 27070 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3262 : elms->Add(TextElement::CharClass(
3263 81210 : new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3264 27070 : zone);
3265 : elms->Add(TextElement::CharClass(
3266 81210 : new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3267 27070 : zone);
3268 27070 : return new (zone) TextNode(elms, read_backward, on_success);
3269 : }
3270 :
3271 :
3272 : // This generates the code to match a text node. A text node can contain
3273 : // straight character sequences (possibly to be matched in a case-independent
3274 : // way) and character classes. For efficiency we do not do this in a single
3275 : // pass from left to right. Instead we pass over the text node several times,
3276 : // emitting code for some character positions every time. See the comment on
3277 : // TextEmitPass for details.
3278 4228090 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3279 617889 : LimitResult limit_result = LimitVersions(compiler, trace);
3280 720035 : if (limit_result == DONE) return;
3281 : DCHECK(limit_result == CONTINUE);
3282 :
3283 515743 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3284 : compiler->SetRegExpTooBig();
3285 : return;
3286 : }
3287 :
3288 515743 : if (compiler->one_byte()) {
3289 323190 : int dummy = 0;
3290 323190 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3291 : }
3292 :
3293 : bool first_elt_done = false;
3294 515743 : int bound_checked_to = trace->cp_offset() - 1;
3295 515743 : bound_checked_to += trace->bound_checked_up_to();
3296 :
3297 : // If a character is preloaded into the current character register then
3298 : // check that now.
3299 515743 : if (trace->characters_preloaded() == 1) {
3300 183548 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3301 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3302 183548 : false, &bound_checked_to);
3303 : }
3304 : first_elt_done = true;
3305 : }
3306 :
3307 2578715 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3308 : TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3309 2062972 : first_elt_done, &bound_checked_to);
3310 : }
3311 :
3312 515743 : Trace successor_trace(*trace);
3313 : // If we advance backward, we may end up at the start.
3314 : successor_trace.AdvanceCurrentPositionInTrace(
3315 515743 : read_backward() ? -Length() : Length(), compiler);
3316 : successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3317 515743 : : Trace::FALSE_VALUE);
3318 : RecursionCheck rc(compiler);
3319 515743 : on_success()->Emit(compiler, &successor_trace);
3320 : }
3321 :
3322 :
3323 0 : void Trace::InvalidateCurrentCharacter() {
3324 227976 : characters_preloaded_ = 0;
3325 0 : }
3326 :
3327 :
3328 1031486 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3329 : // We don't have an instruction for shifting the current character register
3330 : // down or for using a shifted value for anything so lets just forget that
3331 : // we preloaded any characters into it.
3332 515743 : characters_preloaded_ = 0;
3333 : // Adjust the offsets of the quick check performed information. This
3334 : // information is used to find out what we already determined about the
3335 : // characters by means of mask and compare.
3336 515743 : quick_check_performed_.Advance(by, compiler->one_byte());
3337 515743 : cp_offset_ += by;
3338 515743 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3339 : compiler->SetRegExpTooBig();
3340 0 : cp_offset_ = 0;
3341 : }
3342 1031486 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3343 515743 : }
3344 :
3345 :
3346 689147 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3347 316241 : int element_count = elements()->length();
3348 689147 : for (int i = 0; i < element_count; i++) {
3349 372906 : TextElement elm = elements()->at(i);
3350 372906 : if (elm.text_type() == TextElement::CHAR_CLASS) {
3351 : RegExpCharacterClass* cc = elm.char_class();
3352 : #ifdef V8_INTL_SUPPORT
3353 : bool case_equivalents_already_added =
3354 : NeedsUnicodeCaseEquivalents(cc->flags());
3355 : #else
3356 : bool case_equivalents_already_added = false;
3357 : #endif
3358 237557 : if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3359 : // None of the standard character classes is different in the case
3360 : // independent case and it slows us down if we don't know that.
3361 135861 : if (cc->is_standard(zone())) continue;
3362 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3363 : CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3364 133780 : is_one_byte);
3365 : }
3366 : }
3367 : }
3368 316241 : }
3369 :
3370 :
3371 135332 : int TextNode::GreedyLoopTextLength() { return Length(); }
3372 :
3373 :
3374 85827 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3375 254945 : RegExpCompiler* compiler) {
3376 85827 : if (read_backward()) return nullptr;
3377 85702 : if (elements()->length() != 1) return nullptr;
3378 85328 : TextElement elm = elements()->at(0);
3379 85328 : if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3380 : RegExpCharacterClass* node = elm.char_class();
3381 168002 : ZoneList<CharacterRange>* ranges = node->ranges(zone());
3382 84001 : CharacterRange::Canonicalize(ranges);
3383 84001 : if (node->is_negated()) {
3384 82385 : return ranges->length() == 0 ? on_success() : nullptr;
3385 : }
3386 83876 : if (ranges->length() != 1) return nullptr;
3387 : uint32_t max_char;
3388 83416 : if (compiler->one_byte()) {
3389 : max_char = String::kMaxOneByteCharCode;
3390 : } else {
3391 : max_char = String::kMaxUtf16CodeUnit;
3392 : }
3393 250248 : return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3394 : }
3395 :
3396 :
3397 : // Finds the fixed match length of a sequence of nodes that goes from
3398 : // this alternative and back to this choice node. If there are variable
3399 : // length nodes or other complications in the way then return a sentinel
3400 : // value indicating that a greedy loop cannot be constructed.
3401 225345 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
3402 225345 : GuardedAlternative* alternative) {
3403 : int length = 0;
3404 : RegExpNode* node = alternative->node();
3405 : // Later we will generate code for all these text nodes using recursion
3406 : // so we have to limit the max number.
3407 : int recursion_depth = 0;
3408 586022 : while (node != this) {
3409 337287 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3410 : return kNodeIsTooComplexForGreedyLoops;
3411 : }
3412 337287 : int node_length = node->GreedyLoopTextLength();
3413 337287 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
3414 : return kNodeIsTooComplexForGreedyLoops;
3415 : }
3416 135332 : length += node_length;
3417 135332 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3418 : node = seq_node->on_success();
3419 : }
3420 23390 : return read_backward() ? -length : length;
3421 : }
3422 :
3423 :
3424 0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3425 : DCHECK_NULL(loop_node_);
3426 : AddAlternative(alt);
3427 999400 : loop_node_ = alt.node();
3428 0 : }
3429 :
3430 :
3431 0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3432 : DCHECK_NULL(continue_node_);
3433 : AddAlternative(alt);
3434 999400 : continue_node_ = alt.node();
3435 0 : }
3436 :
3437 :
3438 344292 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3439 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3440 332597 : if (trace->stop_node() == this) {
3441 : // Back edge of greedy optimized loop node graph.
3442 : int text_length =
3443 23390 : GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3444 : DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3445 : // Update the counter-based backtracking info on the stack. This is an
3446 : // optimization for greedy loops (see below).
3447 : DCHECK(trace->cp_offset() == text_length);
3448 11695 : macro_assembler->AdvanceCurrentPosition(text_length);
3449 23390 : macro_assembler->GoTo(trace->loop_label());
3450 11695 : return;
3451 : }
3452 : DCHECK_NULL(trace->stop_node());
3453 320902 : if (!trace->is_trivial()) {
3454 119354 : trace->Flush(compiler, this);
3455 119354 : return;
3456 : }
3457 201548 : ChoiceNode::Emit(compiler, trace);
3458 : }
3459 :
3460 :
3461 588832 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3462 : int eats_at_least) {
3463 : int preload_characters = Min(4, eats_at_least);
3464 : DCHECK_LE(preload_characters, 4);
3465 213650 : if (compiler->macro_assembler()->CanReadUnaligned()) {
3466 : bool one_byte = compiler->one_byte();
3467 161532 : if (one_byte) {
3468 : // We can't preload 3 characters because there is no machine instruction
3469 : // to do that. We can't just load 4 because we could be reading
3470 : // beyond the end of the string, which could cause a memory fault.
3471 132371 : if (preload_characters == 3) preload_characters = 2;
3472 : } else {
3473 29161 : if (preload_characters > 2) preload_characters = 2;
3474 : }
3475 : } else {
3476 52118 : if (preload_characters > 1) preload_characters = 1;
3477 : }
3478 213650 : return preload_characters;
3479 : }
3480 :
3481 :
3482 : // This class is used when generating the alternatives in a choice node. It
3483 : // records the way the alternative is being code generated.
3484 : class AlternativeGeneration: public Malloced {
3485 : public:
3486 : AlternativeGeneration()
3487 : : possible_success(),
3488 : expects_preload(false),
3489 : after(),
3490 2178427 : quick_check_details() { }
3491 : Label possible_success;
3492 : bool expects_preload;
3493 : Label after;
3494 : QuickCheckDetails quick_check_details;
3495 : };
3496 :
3497 :
3498 : // Creates a list of AlternativeGenerations. If the list has a reasonable
3499 : // size then it is on the stack, otherwise the excess is on the heap.
3500 : class AlternativeGenerationList {
3501 : public:
3502 213650 : AlternativeGenerationList(int count, Zone* zone)
3503 2350150 : : alt_gens_(count, zone) {
3504 573390 : for (int i = 0; i < count && i < kAFew; i++) {
3505 573390 : alt_gens_.Add(a_few_alt_gens_ + i, zone);
3506 : }
3507 41927 : for (int i = kAFew; i < count; i++) {
3508 41927 : alt_gens_.Add(new AlternativeGeneration(), zone);
3509 : }
3510 213650 : }
3511 213650 : ~AlternativeGenerationList() {
3512 511154 : for (int i = kAFew; i < alt_gens_.length(); i++) {
3513 381358 : delete alt_gens_[i];
3514 41927 : alt_gens_[i] = nullptr;
3515 : }
3516 213650 : }
3517 :
3518 : AlternativeGeneration* at(int i) {
3519 2839545 : return alt_gens_[i];
3520 : }
3521 :
3522 : private:
3523 : static const int kAFew = 10;
3524 : ZoneList<AlternativeGeneration*> alt_gens_;
3525 : AlternativeGeneration a_few_alt_gens_[kAFew];
3526 : };
3527 :
3528 :
3529 : static const uc32 kRangeEndMarker = 0x110000;
3530 :
3531 : // The '2' variant is has inclusive from and exclusive to.
3532 : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3533 : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3534 : static const int kSpaceRanges[] = {
3535 : '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3536 : 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3537 : 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3538 : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3539 :
3540 : static const int kWordRanges[] = {
3541 : '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3542 : static const int kWordRangeCount = arraysize(kWordRanges);
3543 : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3544 : static const int kDigitRangeCount = arraysize(kDigitRanges);
3545 : static const int kSurrogateRanges[] = {
3546 : kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3547 : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3548 : static const int kLineTerminatorRanges[] = {
3549 : 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3550 : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3551 :
3552 0 : void BoyerMoorePositionInfo::Set(int character) {
3553 79612 : SetInterval(Interval(character, character));
3554 0 : }
3555 :
3556 :
3557 1203235 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3558 241330 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3559 241330 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3560 241330 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3561 : surrogate_ =
3562 241330 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3563 241330 : if (interval.to() - interval.from() >= kMapSize - 1) {
3564 13550 : if (map_count_ != kMapSize) {
3565 6301 : map_count_ = kMapSize;
3566 812829 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3567 : }
3568 : return;
3569 : }
3570 1213370 : for (int i = interval.from(); i <= interval.to(); i++) {
3571 537030 : int mod_character = (i & kMask);
3572 1074060 : if (!map_->at(mod_character)) {
3573 372167 : map_count_++;
3574 372167 : map_->at(mod_character) = true;
3575 : }
3576 537030 : if (map_count_ == kMapSize) return;
3577 : }
3578 : }
3579 :
3580 :
3581 0 : void BoyerMoorePositionInfo::SetAll() {
3582 5476 : s_ = w_ = d_ = kLatticeUnknown;
3583 5476 : if (map_count_ != kMapSize) {
3584 5046 : map_count_ = kMapSize;
3585 1291776 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3586 : }
3587 0 : }
3588 :
3589 :
3590 80670 : BoyerMooreLookahead::BoyerMooreLookahead(
3591 80670 : int length, RegExpCompiler* compiler, Zone* zone)
3592 : : length_(length),
3593 80670 : compiler_(compiler) {
3594 80670 : if (compiler->one_byte()) {
3595 10076 : max_char_ = String::kMaxOneByteCharCode;
3596 : } else {
3597 70594 : max_char_ = String::kMaxUtf16CodeUnit;
3598 : }
3599 80670 : bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3600 179910 : for (int i = 0; i < length; i++) {
3601 99240 : bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3602 : }
3603 80670 : }
3604 :
3605 :
3606 : // Find the longest range of lookahead that has the fewest number of different
3607 : // characters that can occur at a given position. Since we are optimizing two
3608 : // different parameters at once this is a tradeoff.
3609 80573 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3610 : int biggest_points = 0;
3611 : // If more than 32 characters out of 128 can occur it is unlikely that we can
3612 : // be lucky enough to step forwards much of the time.
3613 : const int kMaxMax = 32;
3614 322292 : for (int max_number_of_chars = 4;
3615 : max_number_of_chars < kMaxMax;
3616 : max_number_of_chars *= 2) {
3617 : biggest_points =
3618 241719 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
3619 : }
3620 80573 : if (biggest_points == 0) return false;
3621 5448 : return true;
3622 : }
3623 :
3624 :
3625 : // Find the highest-points range between 0 and length_ where the character
3626 : // information is not too vague. 'Too vague' means that there are more than
3627 : // max_number_of_chars that can occur at this position. Calculates the number
3628 : // of points as the product of width-of-the-range and
3629 : // probability-of-finding-one-of-the-characters, where the probability is
3630 : // calculated using the frequency distribution of the sample subject string.
3631 241719 : int BoyerMooreLookahead::FindBestInterval(
3632 540915 : int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3633 : int biggest_points = old_biggest_points;
3634 : static const int kSize = RegExpMacroAssembler::kTableSize;
3635 712361 : for (int i = 0; i < length_; ) {
3636 311335 : while (i < length_ && Count(i) > max_number_of_chars) i++;
3637 256528 : if (i == length_) break;
3638 : int remembered_from = i;
3639 : bool union_map[kSize];
3640 29302144 : for (int j = 0; j < kSize; j++) union_map[j] = false;
3641 755686 : while (i < length_ && Count(i) <= max_number_of_chars) {
3642 33277010 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3643 33021033 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3644 255977 : i++;
3645 : }
3646 : int frequency = 0;
3647 29302144 : for (int j = 0; j < kSize; j++) {
3648 29302144 : if (union_map[j]) {
3649 : // Add 1 to the frequency to give a small per-character boost for
3650 : // the cases where our sampling is not good enough and many
3651 : // characters have a frequency of zero. This means the frequency
3652 : // can theoretically be up to 2*kSize though we treat it mostly as
3653 : // a fraction of kSize.
3654 975604 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3655 : }
3656 : }
3657 : // We use the probability of skipping times the distance we are skipping to
3658 : // judge the effectiveness of this. Actually we have a cut-off: By
3659 : // dividing by 2 we switch off the skipping if the probability of skipping
3660 : // is less than 50%. This is because the multibyte mask-and-compare
3661 : // skipping in quickcheck is more likely to do well on this case.
3662 : bool in_quickcheck_range =
3663 231655 : ((i - remembered_from < 4) ||
3664 2732 : (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3665 : // Called 'probability' but it is only a rough estimate and can actually
3666 : // be outside the 0-kSize range.
3667 228923 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3668 228923 : int points = (i - remembered_from) * probability;
3669 228923 : if (points > biggest_points) {
3670 5841 : *from = remembered_from;
3671 5841 : *to = i - 1;
3672 : biggest_points = points;
3673 : }
3674 : }
3675 241719 : return biggest_points;
3676 : }
3677 :
3678 :
3679 : // Take all the characters that will not prevent a successful match if they
3680 : // occur in the subject string in the range between min_lookahead and
3681 : // max_lookahead (inclusive) measured from the current position. If the
3682 : // character at max_lookahead offset is not one of these characters, then we
3683 : // can safely skip forwards by the number of characters in the range.
3684 4441 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3685 : int max_lookahead,
3686 : Handle<ByteArray> boolean_skip_table) {
3687 : const int kSize = RegExpMacroAssembler::kTableSize;
3688 :
3689 : const int kSkipArrayEntry = 0;
3690 : const int kDontSkipArrayEntry = 1;
3691 :
3692 572889 : for (int i = 0; i < kSize; i++) {
3693 : boolean_skip_table->set(i, kSkipArrayEntry);
3694 : }
3695 4441 : int skip = max_lookahead + 1 - min_lookahead;
3696 :
3697 13939 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3698 1234740 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3699 1225242 : for (int j = 0; j < kSize; j++) {
3700 1215744 : if (map->at(j)) {
3701 : boolean_skip_table->set(j, kDontSkipArrayEntry);
3702 : }
3703 : }
3704 : }
3705 :
3706 4441 : return skip;
3707 : }
3708 :
3709 :
3710 : // See comment above on the implementation of GetSkipTable.
3711 85014 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3712 : const int kSize = RegExpMacroAssembler::kTableSize;
3713 :
3714 80573 : int min_lookahead = 0;
3715 80573 : int max_lookahead = 0;
3716 :
3717 156705 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3718 :
3719 : bool found_single_character = false;
3720 : int single_character = 0;
3721 9794 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3722 421408 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3723 17574 : if (map->map_count() > 1 ||
3724 3183 : (found_single_character && map->map_count() != 0)) {
3725 : found_single_character = false;
3726 : break;
3727 : }
3728 399564 : for (int j = 0; j < kSize; j++) {
3729 403834 : if (map->at(j)) {
3730 : found_single_character = true;
3731 : single_character = j;
3732 : break;
3733 : }
3734 : }
3735 : }
3736 :
3737 5448 : int lookahead_width = max_lookahead + 1 - min_lookahead;
3738 :
3739 5448 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3740 : // The mask-compare can probably handle this better.
3741 : return;
3742 : }
3743 :
3744 4537 : if (found_single_character) {
3745 96 : Label cont, again;
3746 96 : masm->Bind(&again);
3747 96 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3748 96 : if (max_char_ > kSize) {
3749 : masm->CheckCharacterAfterAnd(single_character,
3750 : RegExpMacroAssembler::kTableMask,
3751 96 : &cont);
3752 : } else {
3753 0 : masm->CheckCharacter(single_character, &cont);
3754 : }
3755 96 : masm->AdvanceCurrentPosition(lookahead_width);
3756 96 : masm->GoTo(&again);
3757 96 : masm->Bind(&cont);
3758 : return;
3759 : }
3760 :
3761 : Factory* factory = masm->isolate()->factory();
3762 4441 : Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3763 : int skip_distance = GetSkipTable(
3764 4441 : min_lookahead, max_lookahead, boolean_skip_table);
3765 : DCHECK_NE(0, skip_distance);
3766 :
3767 4441 : Label cont, again;
3768 4441 : masm->Bind(&again);
3769 4441 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3770 4441 : masm->CheckBitInTable(boolean_skip_table, &cont);
3771 4441 : masm->AdvanceCurrentPosition(skip_distance);
3772 4441 : masm->GoTo(&again);
3773 4441 : masm->Bind(&cont);
3774 : }
3775 :
3776 :
3777 : /* Code generation for choice nodes.
3778 : *
3779 : * We generate quick checks that do a mask and compare to eliminate a
3780 : * choice. If the quick check succeeds then it jumps to the continuation to
3781 : * do slow checks and check subsequent nodes. If it fails (the common case)
3782 : * it falls through to the next choice.
3783 : *
3784 : * Here is the desired flow graph. Nodes directly below each other imply
3785 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3786 : * 3 doesn't have a quick check so we have to call the slow check.
3787 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3788 : * regexp continuation is generated directly after the Sn node, up to the
3789 : * next GoTo if we decide to reuse some already generated code. Some
3790 : * nodes expect preload_characters to be preloaded into the current
3791 : * character register. R nodes do this preloading. Vertices are marked
3792 : * F for failures and S for success (possible success in the case of quick
3793 : * nodes). L, V, < and > are used as arrow heads.
3794 : *
3795 : * ----------> R
3796 : * |
3797 : * V
3798 : * Q1 -----> S1
3799 : * | S /
3800 : * F| /
3801 : * | F/
3802 : * | /
3803 : * | R
3804 : * | /
3805 : * V L
3806 : * Q2 -----> S2
3807 : * | S /
3808 : * F| /
3809 : * | F/
3810 : * | /
3811 : * | R
3812 : * | /
3813 : * V L
3814 : * S3
3815 : * |
3816 : * F|
3817 : * |
3818 : * R
3819 : * |
3820 : * backtrack V
3821 : * <----------Q4
3822 : * \ F |
3823 : * \ |S
3824 : * \ F V
3825 : * \-----S4
3826 : *
3827 : * For greedy loops we push the current position, then generate the code that
3828 : * eats the input specially in EmitGreedyLoop. The other choice (the
3829 : * continuation) is generated by the normal code in EmitChoices, and steps back
3830 : * in the input to the starting position when it fails to match. The loop code
3831 : * looks like this (U is the unwind code that steps back in the greedy loop).
3832 : *
3833 : * _____
3834 : * / \
3835 : * V |
3836 : * ----------> S1 |
3837 : * /| |
3838 : * / |S |
3839 : * F/ \_____/
3840 : * /
3841 : * |<-----
3842 : * | \
3843 : * V |S
3844 : * Q2 ---> U----->backtrack
3845 : * | F /
3846 : * S| /
3847 : * V F /
3848 : * S2--/
3849 : */
3850 :
3851 213650 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
3852 0 : counter_backtrack_trace_.set_backtrack(&label_);
3853 213650 : if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3854 0 : }
3855 :
3856 :
3857 0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3858 : #ifdef DEBUG
3859 : int choice_count = alternatives_->length();
3860 : for (int i = 0; i < choice_count - 1; i++) {
3861 : GuardedAlternative alternative = alternatives_->at(i);
3862 : ZoneList<Guard*>* guards = alternative.guards();
3863 : int guard_count = (guards == nullptr) ? 0 : guards->length();
3864 : for (int j = 0; j < guard_count; j++) {
3865 : DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3866 : }
3867 : }
3868 : #endif
3869 0 : }
3870 :
3871 :
3872 345194 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3873 345194 : Trace* current_trace,
3874 : PreloadState* state) {
3875 213650 : if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3876 : // Save some time by looking at most one machine word ahead.
3877 : state->eats_at_least_ =
3878 : EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3879 394632 : current_trace->at_start() == Trace::FALSE_VALUE);
3880 : }
3881 : state->preload_characters_ =
3882 213650 : CalculatePreloadCharacters(compiler, state->eats_at_least_);
3883 :
3884 : state->preload_is_current_ =
3885 213650 : (current_trace->characters_preloaded() == state->preload_characters_);
3886 213650 : state->preload_has_checked_bounds_ = state->preload_is_current_;
3887 213650 : }
3888 :
3889 :
3890 1429183 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3891 583358 : int choice_count = alternatives_->length();
3892 :
3893 584691 : if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3894 1333 : alternatives_->at(0).node()->Emit(compiler, trace);
3895 1333 : return;
3896 : }
3897 :
3898 : AssertGuardsMentionRegisters(trace);
3899 :
3900 795675 : LimitResult limit_result = LimitVersions(compiler, trace);
3901 582025 : if (limit_result == DONE) return;
3902 : DCHECK(limit_result == CONTINUE);
3903 :
3904 : // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3905 : // other choice nodes we only flush if we are out of code size budget.
3906 216570 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3907 1460 : trace->Flush(compiler, this);
3908 1460 : return;
3909 : }
3910 :
3911 : RecursionCheck rc(compiler);
3912 :
3913 : PreloadState preload;
3914 : preload.init();
3915 : GreedyLoopState greedy_loop_state(not_at_start());
3916 :
3917 427300 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3918 427300 : AlternativeGenerationList alt_gens(choice_count, zone());
3919 :
3920 213650 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3921 : trace = EmitGreedyLoop(compiler,
3922 : trace,
3923 : &alt_gens,
3924 : &preload,
3925 : &greedy_loop_state,
3926 11695 : text_length);
3927 : } else {
3928 : // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3929 : // match the traces produced pre-cleanup.
3930 201955 : Label second_choice;
3931 201955 : compiler->macro_assembler()->Bind(&second_choice);
3932 :
3933 201955 : preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3934 :
3935 : EmitChoices(compiler,
3936 : &alt_gens,
3937 : 0,
3938 : trace,
3939 201955 : &preload);
3940 : }
3941 :
3942 : // At this point we need to generate slow checks for the alternatives where
3943 : // the quick check was inlined. We can recognize these because the associated
3944 : // label was bound.
3945 213650 : int new_flush_budget = trace->flush_budget() / choice_count;
3946 828967 : for (int i = 0; i < choice_count; i++) {
3947 : AlternativeGeneration* alt_gen = alt_gens.at(i);
3948 615317 : Trace new_trace(*trace);
3949 : // If there are actions to be flushed we have to limit how many times
3950 : // they are flushed. Take the budget of the parent trace and distribute
3951 : // it fairly amongst the children.
3952 615317 : if (new_trace.actions() != nullptr) {
3953 : new_trace.set_flush_budget(new_flush_budget);
3954 : }
3955 : bool next_expects_preload =
3956 1016984 : i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3957 : EmitOutOfLineContinuation(compiler,
3958 : &new_trace,
3959 615317 : alternatives_->at(i),
3960 : alt_gen,
3961 : preload.preload_characters_,
3962 1230634 : next_expects_preload);
3963 : }
3964 : }
3965 :
3966 :
3967 11695 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3968 11695 : Trace* trace,
3969 : AlternativeGenerationList* alt_gens,
3970 : PreloadState* preload,
3971 : GreedyLoopState* greedy_loop_state,
3972 11695 : int text_length) {
3973 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3974 : // Here we have special handling for greedy loops containing only text nodes
3975 : // and other simple nodes. These are handled by pushing the current
3976 : // position on the stack and then incrementing the current position each
3977 : // time around the switch. On backtrack we decrement the current position
3978 : // and check it against the pushed value. This avoids pushing backtrack
3979 : // information for each iteration of the loop, which could take up a lot of
3980 : // space.
3981 : DCHECK(trace->stop_node() == nullptr);
3982 11695 : macro_assembler->PushCurrentPosition();
3983 11695 : Label greedy_match_failed;
3984 11695 : Trace greedy_match_trace;
3985 11695 : if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3986 : greedy_match_trace.set_backtrack(&greedy_match_failed);
3987 11695 : Label loop_label;
3988 11695 : macro_assembler->Bind(&loop_label);
3989 11695 : greedy_match_trace.set_stop_node(this);
3990 : greedy_match_trace.set_loop_label(&loop_label);
3991 23390 : alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
3992 11695 : macro_assembler->Bind(&greedy_match_failed);
3993 :
3994 11695 : Label second_choice; // For use in greedy matches.
3995 11695 : macro_assembler->Bind(&second_choice);
3996 :
3997 11695 : Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
3998 :
3999 : EmitChoices(compiler,
4000 : alt_gens,
4001 : 1,
4002 : new_trace,
4003 11695 : preload);
4004 :
4005 11695 : macro_assembler->Bind(greedy_loop_state->label());
4006 : // If we have unwound to the bottom then backtrack.
4007 23390 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4008 : // Otherwise try the second priority at an earlier position.
4009 11695 : macro_assembler->AdvanceCurrentPosition(-text_length);
4010 11695 : macro_assembler->GoTo(&second_choice);
4011 11695 : return new_trace;
4012 : }
4013 :
4014 284061 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4015 : Trace* trace) {
4016 : int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4017 201955 : if (alternatives_->length() != 2) return eats_at_least;
4018 :
4019 165831 : GuardedAlternative alt1 = alternatives_->at(1);
4020 165831 : if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4021 : return eats_at_least;
4022 : }
4023 : RegExpNode* eats_anything_node = alt1.node();
4024 244973 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4025 : return eats_at_least;
4026 : }
4027 :
4028 : // Really we should be creating a new trace when we execute this function,
4029 : // but there is no need, because the code it generates cannot backtrack, and
4030 : // we always arrive here with a trivial trace (since it's the entry to a
4031 : // loop. That also implies that there are no preloaded characters, which is
4032 : // good, because it means we won't be violating any assumptions by
4033 : // overwriting those characters with new load instructions.
4034 : DCHECK(trace->is_trivial());
4035 :
4036 82106 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4037 : Isolate* isolate = macro_assembler->isolate();
4038 : // At this point we know that we are at a non-greedy loop that will eat
4039 : // any character one at a time. Any non-anchored regexp has such a
4040 : // loop prepended to it in order to find where it starts. We look for
4041 : // a pattern of the form ...abc... where we can look 6 characters ahead
4042 : // and step forwards 3 if the character is not one of abc. Abc need
4043 : // not be atoms, they can be any reasonably limited character class or
4044 : // small alternation.
4045 : BoyerMooreLookahead* bm = bm_info(false);
4046 82106 : if (bm == nullptr) {
4047 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4048 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4049 : kRecursionBudget,
4050 82106 : false));
4051 82106 : if (eats_at_least >= 1) {
4052 : bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4053 : compiler,
4054 80573 : zone());
4055 161146 : GuardedAlternative alt0 = alternatives_->at(0);
4056 80573 : alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4057 : }
4058 : }
4059 82106 : if (bm != nullptr) {
4060 80573 : bm->EmitSkipInstructions(macro_assembler);
4061 : }
4062 82106 : return eats_at_least;
4063 : }
4064 :
4065 :
4066 817272 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4067 : AlternativeGenerationList* alt_gens,
4068 : int first_choice,
4069 213672 : Trace* trace,
4070 : PreloadState* preload) {
4071 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4072 213650 : SetUpPreLoad(compiler, trace, preload);
4073 :
4074 : // For now we just call all choices one after the other. The idea ultimately
4075 : // is to use the Dispatch table to try only the relevant ones.
4076 213650 : int choice_count = alternatives_->length();
4077 :
4078 213650 : int new_flush_budget = trace->flush_budget() / choice_count;
4079 :
4080 817272 : for (int i = first_choice; i < choice_count; i++) {
4081 603622 : bool is_last = i == choice_count - 1;
4082 603622 : bool fall_through_on_failure = !is_last;
4083 1207244 : GuardedAlternative alternative = alternatives_->at(i);
4084 : AlternativeGeneration* alt_gen = alt_gens->at(i);
4085 982422 : alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4086 3937 : ZoneList<Guard*>* guards = alternative.guards();
4087 603622 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4088 603622 : Trace new_trace(*trace);
4089 : new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4090 : preload->preload_characters_ :
4091 603622 : 0);
4092 603622 : if (preload->preload_has_checked_bounds_) {
4093 398311 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4094 : }
4095 : new_trace.quick_check_performed()->Clear();
4096 603622 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4097 603622 : if (!is_last) {
4098 389972 : new_trace.set_backtrack(&alt_gen->after);
4099 : }
4100 603622 : alt_gen->expects_preload = preload->preload_is_current_;
4101 : bool generate_full_check_inline = false;
4102 1080351 : if (compiler->optimize() &&
4103 1077549 : try_to_emit_quick_check_for_alternative(i == 0) &&
4104 : alternative.node()->EmitQuickCheck(
4105 : compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4106 : &alt_gen->possible_success, &alt_gen->quick_check_details,
4107 473927 : fall_through_on_failure)) {
4108 : // Quick check was generated for this choice.
4109 224822 : preload->preload_is_current_ = true;
4110 224822 : preload->preload_has_checked_bounds_ = true;
4111 : // If we generated the quick check to fall through on possible success,
4112 : // we now need to generate the full check inline.
4113 224822 : if (!fall_through_on_failure) {
4114 34435 : macro_assembler->Bind(&alt_gen->possible_success);
4115 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4116 34435 : new_trace.set_characters_preloaded(preload->preload_characters_);
4117 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4118 : generate_full_check_inline = true;
4119 : }
4120 378800 : } else if (alt_gen->quick_check_details.cannot_match()) {
4121 114 : if (!fall_through_on_failure) {
4122 44 : macro_assembler->GoTo(trace->backtrack());
4123 : }
4124 114 : continue;
4125 : } else {
4126 : // No quick check was generated. Put the full code here.
4127 : // If this is not the first choice then there could be slow checks from
4128 : // previous cases that go here when they fail. There's no reason to
4129 : // insist that they preload characters since the slow check we are about
4130 : // to generate probably can't use it.
4131 378686 : if (i != first_choice) {
4132 227441 : alt_gen->expects_preload = false;
4133 : new_trace.InvalidateCurrentCharacter();
4134 : }
4135 : generate_full_check_inline = true;
4136 : }
4137 603508 : if (generate_full_check_inline) {
4138 413121 : if (new_trace.actions() != nullptr) {
4139 : new_trace.set_flush_budget(new_flush_budget);
4140 : }
4141 2492 : for (int j = 0; j < guard_count; j++) {
4142 2492 : GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4143 : }
4144 413121 : alternative.node()->Emit(compiler, &new_trace);
4145 413121 : preload->preload_is_current_ = false;
4146 : }
4147 603508 : macro_assembler->Bind(&alt_gen->after);
4148 : }
4149 213650 : }
4150 :
4151 :
4152 805704 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4153 160900 : Trace* trace,
4154 : GuardedAlternative alternative,
4155 : AlternativeGeneration* alt_gen,
4156 : int preload_characters,
4157 : bool next_expects_preload) {
4158 1040247 : if (!alt_gen->possible_success.is_linked()) return;
4159 :
4160 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4161 190387 : macro_assembler->Bind(&alt_gen->possible_success);
4162 190387 : Trace out_of_line_trace(*trace);
4163 : out_of_line_trace.set_characters_preloaded(preload_characters);
4164 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4165 190387 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4166 191832 : ZoneList<Guard*>* guards = alternative.guards();
4167 190387 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4168 190387 : if (next_expects_preload) {
4169 160900 : Label reload_current_char;
4170 : out_of_line_trace.set_backtrack(&reload_current_char);
4171 162269 : for (int j = 0; j < guard_count; j++) {
4172 1369 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4173 : }
4174 160900 : alternative.node()->Emit(compiler, &out_of_line_trace);
4175 160900 : macro_assembler->Bind(&reload_current_char);
4176 : // Reload the current character, since the next quick check expects that.
4177 : // We don't need to check bounds here because we only get into this
4178 : // code through a quick check which already did the checked load.
4179 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4180 321800 : preload_characters);
4181 160900 : macro_assembler->GoTo(&(alt_gen->after));
4182 : } else {
4183 29487 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4184 29563 : for (int j = 0; j < guard_count; j++) {
4185 76 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4186 : }
4187 29487 : alternative.node()->Emit(compiler, &out_of_line_trace);
4188 : }
4189 : }
4190 :
4191 :
4192 495160 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4193 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4194 494290 : LimitResult limit_result = LimitVersions(compiler, trace);
4195 494290 : if (limit_result == DONE) return;
4196 : DCHECK(limit_result == CONTINUE);
4197 :
4198 : RecursionCheck rc(compiler);
4199 :
4200 265869 : switch (action_type_) {
4201 : case STORE_POSITION: {
4202 : Trace::DeferredCapture
4203 : new_capture(data_.u_position_register.reg,
4204 : data_.u_position_register.is_capture,
4205 241535 : trace);
4206 241535 : Trace new_trace = *trace;
4207 : new_trace.add_action(&new_capture);
4208 257515 : on_success()->Emit(compiler, &new_trace);
4209 : break;
4210 : }
4211 : case INCREMENT_REGISTER: {
4212 : Trace::DeferredIncrementRegister
4213 3744 : new_increment(data_.u_increment_register.reg);
4214 3744 : Trace new_trace = *trace;
4215 : new_trace.add_action(&new_increment);
4216 3744 : on_success()->Emit(compiler, &new_trace);
4217 : break;
4218 : }
4219 : case SET_REGISTER: {
4220 : Trace::DeferredSetRegister
4221 3465 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4222 3465 : Trace new_trace = *trace;
4223 : new_trace.add_action(&new_set);
4224 3465 : on_success()->Emit(compiler, &new_trace);
4225 : break;
4226 : }
4227 : case CLEAR_CAPTURES: {
4228 : Trace::DeferredClearCaptures
4229 : new_capture(Interval(data_.u_clear_captures.range_from,
4230 2186 : data_.u_clear_captures.range_to));
4231 2186 : Trace new_trace = *trace;
4232 : new_trace.add_action(&new_capture);
4233 2186 : on_success()->Emit(compiler, &new_trace);
4234 : break;
4235 : }
4236 : case BEGIN_SUBMATCH:
4237 9470 : if (!trace->is_trivial()) {
4238 5038 : trace->Flush(compiler, this);
4239 : } else {
4240 : assembler->WriteCurrentPositionToRegister(
4241 4432 : data_.u_submatch.current_position_register, 0);
4242 : assembler->WriteStackPointerToRegister(
4243 4432 : data_.u_submatch.stack_pointer_register);
4244 4432 : on_success()->Emit(compiler, trace);
4245 : }
4246 : break;
4247 : case EMPTY_MATCH_CHECK: {
4248 913 : int start_pos_reg = data_.u_empty_match_check.start_register;
4249 913 : int stored_pos = 0;
4250 913 : int rep_reg = data_.u_empty_match_check.repetition_register;
4251 913 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4252 913 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4253 1092 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4254 : // If we know we haven't advanced and there is no minimum we
4255 : // can just backtrack immediately.
4256 152 : assembler->GoTo(trace->backtrack());
4257 1171 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4258 : // If we know we've advanced we can generate the continuation
4259 : // immediately.
4260 247 : on_success()->Emit(compiler, trace);
4261 590 : } else if (!trace->is_trivial()) {
4262 309 : trace->Flush(compiler, this);
4263 : } else {
4264 281 : Label skip_empty_check;
4265 : // If we have a minimum number of repetitions we check the current
4266 : // number first and skip the empty check if it's not enough.
4267 281 : if (has_minimum) {
4268 206 : int limit = data_.u_empty_match_check.repetition_limit;
4269 206 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4270 : }
4271 : // If the match is empty we bail out, otherwise we fall through
4272 : // to the on-success continuation.
4273 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4274 562 : trace->backtrack());
4275 281 : assembler->Bind(&skip_empty_check);
4276 281 : on_success()->Emit(compiler, trace);
4277 : }
4278 : break;
4279 : }
4280 : case POSITIVE_SUBMATCH_SUCCESS: {
4281 4556 : if (!trace->is_trivial()) {
4282 2931 : trace->Flush(compiler, this);
4283 2931 : return;
4284 : }
4285 : assembler->ReadCurrentPositionFromRegister(
4286 1625 : data_.u_submatch.current_position_register);
4287 : assembler->ReadStackPointerFromRegister(
4288 1625 : data_.u_submatch.stack_pointer_register);
4289 1625 : int clear_register_count = data_.u_submatch.clear_register_count;
4290 1625 : if (clear_register_count == 0) {
4291 1142 : on_success()->Emit(compiler, trace);
4292 1142 : return;
4293 : }
4294 483 : int clear_registers_from = data_.u_submatch.clear_register_from;
4295 483 : Label clear_registers_backtrack;
4296 483 : Trace new_trace = *trace;
4297 : new_trace.set_backtrack(&clear_registers_backtrack);
4298 483 : on_success()->Emit(compiler, &new_trace);
4299 :
4300 483 : assembler->Bind(&clear_registers_backtrack);
4301 483 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4302 483 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4303 :
4304 : DCHECK(trace->backtrack() == nullptr);
4305 483 : assembler->Backtrack();
4306 : return;
4307 : }
4308 : default:
4309 0 : UNREACHABLE();
4310 : }
4311 : }
4312 :
4313 :
4314 11249 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4315 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4316 4743 : if (!trace->is_trivial()) {
4317 2253 : trace->Flush(compiler, this);
4318 2253 : return;
4319 : }
4320 :
4321 2490 : LimitResult limit_result = LimitVersions(compiler, trace);
4322 2490 : if (limit_result == DONE) return;
4323 : DCHECK(limit_result == CONTINUE);
4324 :
4325 : RecursionCheck rc(compiler);
4326 :
4327 : DCHECK_EQ(start_reg_ + 1, end_reg_);
4328 2290 : if (IgnoreCase(flags_)) {
4329 : assembler->CheckNotBackReferenceIgnoreCase(
4330 5043 : start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4331 : } else {
4332 : assembler->CheckNotBackReference(start_reg_, read_backward(),
4333 1218 : trace->backtrack());
4334 : }
4335 : // We are going to advance backward, so we may end up at the start.
4336 2290 : if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4337 :
4338 : // Check that the back reference does not end inside a surrogate pair.
4339 2455 : if (IsUnicode(flags_) && !compiler->one_byte()) {
4340 80 : assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4341 : }
4342 2290 : on_success()->Emit(compiler, trace);
4343 : }
4344 :
4345 :
4346 : // -------------------------------------------------------------------
4347 : // Dot/dotty output
4348 :
4349 :
4350 : #ifdef DEBUG
4351 :
4352 :
4353 : class DotPrinter: public NodeVisitor {
4354 : public:
4355 : DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4356 : : os_(os),
4357 : ignore_case_(ignore_case) {}
4358 : void PrintNode(const char* label, RegExpNode* node);
4359 : void Visit(RegExpNode* node);
4360 : void PrintAttributes(RegExpNode* from);
4361 : void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4362 : #define DECLARE_VISIT(Type) \
4363 : virtual void Visit##Type(Type##Node* that);
4364 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4365 : #undef DECLARE_VISIT
4366 : private:
4367 : std::ostream& os_;
4368 : bool ignore_case_;
4369 : };
4370 :
4371 :
4372 : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4373 : os_ << "digraph G {\n graph [label=\"";
4374 : for (int i = 0; label[i]; i++) {
4375 : switch (label[i]) {
4376 : case '\\':
4377 : os_ << "\\\\";
4378 : break;
4379 : case '"':
4380 : os_ << "\"";
4381 : break;
4382 : default:
4383 : os_ << label[i];
4384 : break;
4385 : }
4386 : }
4387 : os_ << "\"];\n";
4388 : Visit(node);
4389 : os_ << "}" << std::endl;
4390 : }
4391 :
4392 :
4393 : void DotPrinter::Visit(RegExpNode* node) {
4394 : if (node->info()->visited) return;
4395 : node->info()->visited = true;
4396 : node->Accept(this);
4397 : }
4398 :
4399 :
4400 : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4401 : os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4402 : Visit(on_failure);
4403 : }
4404 :
4405 :
4406 : class TableEntryBodyPrinter {
4407 : public:
4408 : TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4409 : : os_(os),
4410 : choice_(choice) {}
4411 : void Call(uc16 from, DispatchTable::Entry entry) {
4412 : OutSet* out_set = entry.out_set();
4413 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4414 : if (out_set->Get(i)) {
4415 : os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4416 : << choice()->alternatives()->at(i).node() << ";\n";
4417 : }
4418 : }
4419 : }
4420 : private:
4421 : ChoiceNode* choice() { return choice_; }
4422 : std::ostream& os_;
4423 : ChoiceNode* choice_;
4424 : };
4425 :
4426 :
4427 : class TableEntryHeaderPrinter {
4428 : public:
4429 : explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4430 : : first_(true),
4431 : os_(os) {}
4432 : void Call(uc16 from, DispatchTable::Entry entry) {
4433 : if (first_) {
4434 : first_ = false;
4435 : } else {
4436 : os_ << "|";
4437 : }
4438 : os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4439 : OutSet* out_set = entry.out_set();
4440 : int priority = 0;
4441 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4442 : if (out_set->Get(i)) {
4443 : if (priority > 0) os_ << "|";
4444 : os_ << "<s" << from << "o" << i << "> " << priority;
4445 : priority++;
4446 : }
4447 : }
4448 : os_ << "}}";
4449 : }
4450 :
4451 : private:
4452 : bool first_;
4453 : std::ostream& os_;
4454 : };
4455 :
4456 :
4457 : class AttributePrinter {
4458 : public:
4459 : explicit AttributePrinter(std::ostream& os) // NOLINT
4460 : : os_(os),
4461 : first_(true) {}
4462 : void PrintSeparator() {
4463 : if (first_) {
4464 : first_ = false;
4465 : } else {
4466 : os_ << "|";
4467 : }
4468 : }
4469 : void PrintBit(const char* name, bool value) {
4470 : if (!value) return;
4471 : PrintSeparator();
4472 : os_ << "{" << name << "}";
4473 : }
4474 : void PrintPositive(const char* name, int value) {
4475 : if (value < 0) return;
4476 : PrintSeparator();
4477 : os_ << "{" << name << "|" << value << "}";
4478 : }
4479 :
4480 : private:
4481 : std::ostream& os_;
4482 : bool first_;
4483 : };
4484 :
4485 :
4486 : void DotPrinter::PrintAttributes(RegExpNode* that) {
4487 : os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4488 : << "margin=0.1, fontsize=10, label=\"{";
4489 : AttributePrinter printer(os_);
4490 : NodeInfo* info = that->info();
4491 : printer.PrintBit("NI", info->follows_newline_interest);
4492 : printer.PrintBit("WI", info->follows_word_interest);
4493 : printer.PrintBit("SI", info->follows_start_interest);
4494 : Label* label = that->label();
4495 : if (label->is_bound())
4496 : printer.PrintPositive("@", label->pos());
4497 : os_ << "}\"];\n"
4498 : << " a" << that << " -> n" << that
4499 : << " [style=dashed, color=grey, arrowhead=none];\n";
4500 : }
4501 :
4502 :
4503 : static const bool kPrintDispatchTable = false;
4504 : void DotPrinter::VisitChoice(ChoiceNode* that) {
4505 : if (kPrintDispatchTable) {
4506 : os_ << " n" << that << " [shape=Mrecord, label=\"";
4507 : TableEntryHeaderPrinter header_printer(os_);
4508 : that->GetTable(ignore_case_)->ForEach(&header_printer);
4509 : os_ << "\"]\n";
4510 : PrintAttributes(that);
4511 : TableEntryBodyPrinter body_printer(os_, that);
4512 : that->GetTable(ignore_case_)->ForEach(&body_printer);
4513 : } else {
4514 : os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4515 : for (int i = 0; i < that->alternatives()->length(); i++) {
4516 : GuardedAlternative alt = that->alternatives()->at(i);
4517 : os_ << " n" << that << " -> n" << alt.node();
4518 : }
4519 : }
4520 : for (int i = 0; i < that->alternatives()->length(); i++) {
4521 : GuardedAlternative alt = that->alternatives()->at(i);
4522 : alt.node()->Accept(this);
4523 : }
4524 : }
4525 :
4526 :
4527 : void DotPrinter::VisitText(TextNode* that) {
4528 : Zone* zone = that->zone();
4529 : os_ << " n" << that << " [label=\"";
4530 : for (int i = 0; i < that->elements()->length(); i++) {
4531 : if (i > 0) os_ << " ";
4532 : TextElement elm = that->elements()->at(i);
4533 : switch (elm.text_type()) {
4534 : case TextElement::ATOM: {
4535 : Vector<const uc16> data = elm.atom()->data();
4536 : for (int i = 0; i < data.length(); i++) {
4537 : os_ << static_cast<char>(data[i]);
4538 : }
4539 : break;
4540 : }
4541 : case TextElement::CHAR_CLASS: {
4542 : RegExpCharacterClass* node = elm.char_class();
4543 : os_ << "[";
4544 : if (node->is_negated()) os_ << "^";
4545 : for (int j = 0; j < node->ranges(zone)->length(); j++) {
4546 : CharacterRange range = node->ranges(zone)->at(j);
4547 : os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4548 : }
4549 : os_ << "]";
4550 : break;
4551 : }
4552 : default:
4553 : UNREACHABLE();
4554 : }
4555 : }
4556 : os_ << "\", shape=box, peripheries=2];\n";
4557 : PrintAttributes(that);
4558 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4559 : Visit(that->on_success());
4560 : }
4561 :
4562 :
4563 : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4564 : os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4565 : << that->end_register() << "\", shape=doubleoctagon];\n";
4566 : PrintAttributes(that);
4567 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4568 : Visit(that->on_success());
4569 : }
4570 :
4571 :
4572 : void DotPrinter::VisitEnd(EndNode* that) {
4573 : os_ << " n" << that << " [style=bold, shape=point];\n";
4574 : PrintAttributes(that);
4575 : }
4576 :
4577 :
4578 : void DotPrinter::VisitAssertion(AssertionNode* that) {
4579 : os_ << " n" << that << " [";
4580 : switch (that->assertion_type()) {
4581 : case AssertionNode::AT_END:
4582 : os_ << "label=\"$\", shape=septagon";
4583 : break;
4584 : case AssertionNode::AT_START:
4585 : os_ << "label=\"^\", shape=septagon";
4586 : break;
4587 : case AssertionNode::AT_BOUNDARY:
4588 : os_ << "label=\"\\b\", shape=septagon";
4589 : break;
4590 : case AssertionNode::AT_NON_BOUNDARY:
4591 : os_ << "label=\"\\B\", shape=septagon";
4592 : break;
4593 : case AssertionNode::AFTER_NEWLINE:
4594 : os_ << "label=\"(?<=\\n)\", shape=septagon";
4595 : break;
4596 : }
4597 : os_ << "];\n";
4598 : PrintAttributes(that);
4599 : RegExpNode* successor = that->on_success();
4600 : os_ << " n" << that << " -> n" << successor << ";\n";
4601 : Visit(successor);
4602 : }
4603 :
4604 :
4605 : void DotPrinter::VisitAction(ActionNode* that) {
4606 : os_ << " n" << that << " [";
4607 : switch (that->action_type_) {
4608 : case ActionNode::SET_REGISTER:
4609 : os_ << "label=\"$" << that->data_.u_store_register.reg
4610 : << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4611 : break;
4612 : case ActionNode::INCREMENT_REGISTER:
4613 : os_ << "label=\"$" << that->data_.u_increment_register.reg
4614 : << "++\", shape=octagon";
4615 : break;
4616 : case ActionNode::STORE_POSITION:
4617 : os_ << "label=\"$" << that->data_.u_position_register.reg
4618 : << ":=$pos\", shape=octagon";
4619 : break;
4620 : case ActionNode::BEGIN_SUBMATCH:
4621 : os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4622 : << ":=$pos,begin\", shape=septagon";
4623 : break;
4624 : case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4625 : os_ << "label=\"escape\", shape=septagon";
4626 : break;
4627 : case ActionNode::EMPTY_MATCH_CHECK:
4628 : os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4629 : << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4630 : << "<" << that->data_.u_empty_match_check.repetition_limit
4631 : << "?\", shape=septagon";
4632 : break;
4633 : case ActionNode::CLEAR_CAPTURES: {
4634 : os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4635 : << " to $" << that->data_.u_clear_captures.range_to
4636 : << "\", shape=septagon";
4637 : break;
4638 : }
4639 : }
4640 : os_ << "];\n";
4641 : PrintAttributes(that);
4642 : RegExpNode* successor = that->on_success();
4643 : os_ << " n" << that << " -> n" << successor << ";\n";
4644 : Visit(successor);
4645 : }
4646 :
4647 :
4648 : class DispatchTableDumper {
4649 : public:
4650 : explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4651 : void Call(uc16 key, DispatchTable::Entry entry);
4652 : private:
4653 : std::ostream& os_;
4654 : };
4655 :
4656 :
4657 : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4658 : os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4659 : OutSet* set = entry.out_set();
4660 : bool first = true;
4661 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4662 : if (set->Get(i)) {
4663 : if (first) {
4664 : first = false;
4665 : } else {
4666 : os_ << ", ";
4667 : }
4668 : os_ << i;
4669 : }
4670 : }
4671 : os_ << "}\n";
4672 : }
4673 :
4674 :
4675 : void DispatchTable::Dump() {
4676 : OFStream os(stderr);
4677 : DispatchTableDumper dumper(os);
4678 : tree()->ForEach(&dumper);
4679 : }
4680 :
4681 :
4682 : void RegExpEngine::DotPrint(const char* label,
4683 : RegExpNode* node,
4684 : bool ignore_case) {
4685 : StdoutStream os;
4686 : DotPrinter printer(os, ignore_case);
4687 : printer.PrintNode(label, node);
4688 : }
4689 :
4690 :
4691 : #endif // DEBUG
4692 :
4693 :
4694 : // -------------------------------------------------------------------
4695 : // Tree to graph conversion
4696 :
4697 2978085 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4698 : RegExpNode* on_success) {
4699 : ZoneList<TextElement>* elms =
4700 992695 : new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4701 992695 : elms->Add(TextElement::Atom(this), compiler->zone());
4702 : return new (compiler->zone())
4703 992695 : TextNode(elms, compiler->read_backward(), on_success);
4704 : }
4705 :
4706 :
4707 34754 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4708 : RegExpNode* on_success) {
4709 : return new (compiler->zone())
4710 34754 : TextNode(elements(), compiler->read_backward(), on_success);
4711 : }
4712 :
4713 :
4714 1106480 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4715 : const int* special_class,
4716 : int length) {
4717 553240 : length--; // Remove final marker.
4718 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4719 : DCHECK_NE(0, ranges->length());
4720 : DCHECK_NE(0, length);
4721 : DCHECK_NE(0, special_class[0]);
4722 553240 : if (ranges->length() != (length >> 1) + 1) {
4723 : return false;
4724 : }
4725 10302 : CharacterRange range = ranges->at(0);
4726 10302 : if (range.from() != 0) {
4727 : return false;
4728 : }
4729 25303 : for (int i = 0; i < length; i += 2) {
4730 25858 : if (special_class[i] != (range.to() + 1)) {
4731 : return false;
4732 : }
4733 50606 : range = ranges->at((i >> 1) + 1);
4734 25303 : if (special_class[i+1] != range.from()) {
4735 : return false;
4736 : }
4737 : }
4738 7914 : if (range.to() != String::kMaxCodePoint) {
4739 : return false;
4740 : }
4741 7914 : return true;
4742 : }
4743 :
4744 :
4745 1095048 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4746 : const int* special_class,
4747 : int length) {
4748 547524 : length--; // Remove final marker.
4749 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4750 547524 : if (ranges->length() * 2 != length) {
4751 : return false;
4752 : }
4753 11214 : for (int i = 0; i < length; i += 2) {
4754 29322 : CharacterRange range = ranges->at(i >> 1);
4755 25886 : if (range.from() != special_class[i] ||
4756 11225 : range.to() != special_class[i + 1] - 1) {
4757 : return false;
4758 : }
4759 : }
4760 : return true;
4761 : }
4762 :
4763 :
4764 197140 : bool RegExpCharacterClass::is_standard(Zone* zone) {
4765 : // TODO(lrn): Remove need for this function, by not throwing away information
4766 : // along the way.
4767 197140 : if (is_negated()) {
4768 : return false;
4769 : }
4770 191409 : if (set_.is_standard()) {
4771 : return true;
4772 : }
4773 188118 : if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4774 : set_.set_standard_set_type('s');
4775 608 : return true;
4776 : }
4777 187510 : if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4778 : set_.set_standard_set_type('S');
4779 207 : return true;
4780 : }
4781 187303 : if (CompareInverseRanges(set_.ranges(zone),
4782 : kLineTerminatorRanges,
4783 187303 : kLineTerminatorRangeCount)) {
4784 : set_.set_standard_set_type('.');
4785 7595 : return true;
4786 : }
4787 179708 : if (CompareRanges(set_.ranges(zone),
4788 : kLineTerminatorRanges,
4789 179708 : kLineTerminatorRangeCount)) {
4790 : set_.set_standard_set_type('n');
4791 10 : return true;
4792 : }
4793 179698 : if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4794 : set_.set_standard_set_type('w');
4795 1271 : return true;
4796 : }
4797 178427 : if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4798 : set_.set_standard_set_type('W');
4799 112 : return true;
4800 : }
4801 : return false;
4802 : }
4803 :
4804 :
4805 2582 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4806 76845 : ZoneList<CharacterRange>* base)
4807 : : zone_(zone),
4808 : table_(zone),
4809 : bmp_(nullptr),
4810 : lead_surrogates_(nullptr),
4811 : trail_surrogates_(nullptr),
4812 5164 : non_bmp_(nullptr) {
4813 : // The unicode range splitter categorizes given character ranges into:
4814 : // - Code points from the BMP representable by one code unit.
4815 : // - Code points outside the BMP that need to be split into surrogate pairs.
4816 : // - Lone lead surrogates.
4817 : // - Lone trail surrogates.
4818 : // Lone surrogates are valid code points, even though no actual characters.
4819 : // They require special matching to make sure we do not split surrogate pairs.
4820 : // We use the dispatch table to accomplish this. The base range is split up
4821 : // by the table by the overlay ranges, and the Call callback is used to
4822 : // filter and collect ranges for each category.
4823 153690 : for (int i = 0; i < base->length(); i++) {
4824 148526 : table_.AddRange(base->at(i), kBase, zone_);
4825 : }
4826 : // Add overlay ranges.
4827 : table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4828 2582 : kBmpCodePoints, zone_);
4829 : table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4830 2582 : kLeadSurrogates, zone_);
4831 : table_.AddRange(
4832 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4833 2582 : kTrailSurrogates, zone_);
4834 : table_.AddRange(
4835 : CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4836 2582 : kBmpCodePoints, zone_);
4837 : table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4838 2582 : kNonBmpCodePoints, zone_);
4839 : table_.ForEach(this);
4840 2582 : }
4841 :
4842 :
4843 159046 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4844 159046 : OutSet* outset = entry.out_set();
4845 318092 : if (!outset->Get(kBase)) return;
4846 : ZoneList<CharacterRange>** target = nullptr;
4847 78268 : if (outset->Get(kBmpCodePoints)) {
4848 50703 : target = &bmp_;
4849 27565 : } else if (outset->Get(kLeadSurrogates)) {
4850 1175 : target = &lead_surrogates_;
4851 26390 : } else if (outset->Get(kTrailSurrogates)) {
4852 1175 : target = &trail_surrogates_;
4853 : } else {
4854 : DCHECK(outset->Get(kNonBmpCodePoints));
4855 25215 : target = &non_bmp_;
4856 : }
4857 78268 : if (*target == nullptr)
4858 12124 : *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4859 78268 : (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4860 : }
4861 :
4862 6821 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4863 2577 : RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4864 : ZoneList<CharacterRange>* bmp = splitter->bmp();
4865 3032 : if (bmp == nullptr) return;
4866 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4867 : result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4868 : compiler->zone(), bmp, compiler->read_backward(), on_success,
4869 4244 : default_flags)));
4870 : }
4871 :
4872 31227 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4873 : RegExpNode* on_success,
4874 2577 : UnicodeRangeSplitter* splitter) {
4875 26790 : ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4876 3574 : if (non_bmp == nullptr) return;
4877 : DCHECK(!compiler->one_byte());
4878 : Zone* zone = compiler->zone();
4879 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4880 1580 : CharacterRange::Canonicalize(non_bmp);
4881 53580 : for (int i = 0; i < non_bmp->length(); i++) {
4882 : // Match surrogate pair.
4883 : // E.g. [\u10005-\u11005] becomes
4884 : // \ud800[\udc05-\udfff]|
4885 : // [\ud801-\ud803][\udc00-\udfff]|
4886 : // \ud804[\udc00-\udc05]
4887 25210 : uc32 from = non_bmp->at(i).from();
4888 25210 : uc32 to = non_bmp->at(i).to();
4889 25210 : uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4890 : uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4891 25210 : uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4892 : uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4893 25210 : if (from_l == to_l) {
4894 : // The lead surrogate is the same.
4895 : result->AddAlternative(
4896 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4897 : zone, CharacterRange::Singleton(from_l),
4898 : CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4899 22930 : on_success, default_flags)));
4900 : } else {
4901 2280 : if (from_t != kTrailSurrogateStart) {
4902 : // Add [from_l][from_t-\udfff]
4903 : result->AddAlternative(
4904 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4905 : zone, CharacterRange::Singleton(from_l),
4906 : CharacterRange::Range(from_t, kTrailSurrogateEnd),
4907 1155 : compiler->read_backward(), on_success, default_flags)));
4908 1155 : from_l++;
4909 : }
4910 2280 : if (to_t != kTrailSurrogateEnd) {
4911 : // Add [to_l][\udc00-to_t]
4912 : result->AddAlternative(
4913 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4914 : zone, CharacterRange::Singleton(to_l),
4915 : CharacterRange::Range(kTrailSurrogateStart, to_t),
4916 895 : compiler->read_backward(), on_success, default_flags)));
4917 895 : to_l--;
4918 : }
4919 2280 : if (from_l <= to_l) {
4920 : // Add [from_l-to_l][\udc00-\udfff]
4921 : result->AddAlternative(
4922 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4923 : zone, CharacterRange::Range(from_l, to_l),
4924 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4925 2090 : compiler->read_backward(), on_success, default_flags)));
4926 : }
4927 : }
4928 : }
4929 : }
4930 :
4931 1175 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4932 1175 : RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4933 : ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4934 : JSRegExp::Flags flags) {
4935 : Zone* zone = compiler->zone();
4936 : RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4937 1175 : zone, match, read_backward, on_success, flags);
4938 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4939 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4940 : RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4941 1175 : position_register);
4942 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4943 1175 : zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4944 1175 : return lookaround.ForMatch(negative_match);
4945 : }
4946 :
4947 1165 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4948 1165 : RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4949 : ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4950 : bool read_backward, JSRegExp::Flags flags) {
4951 : Zone* zone = compiler->zone();
4952 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4953 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4954 : RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4955 1165 : position_register);
4956 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4957 1165 : zone, lookahead, read_backward, lookaround.on_match_success(), flags);
4958 : return TextNode::CreateForCharacterRanges(
4959 1165 : zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4960 : }
4961 :
4962 4917 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4963 : RegExpNode* on_success,
4964 2577 : UnicodeRangeSplitter* splitter) {
4965 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4966 : ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4967 3984 : if (lead_surrogates == nullptr) return;
4968 : Zone* zone = compiler->zone();
4969 : // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4970 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4971 1170 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4972 :
4973 : RegExpNode* match;
4974 1170 : if (compiler->read_backward()) {
4975 : // Reading backward. Assert that reading forward, there is no trail
4976 : // surrogate, and then backward match the lead surrogate.
4977 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
4978 : compiler, trail_surrogates, lead_surrogates, on_success, true,
4979 95 : default_flags);
4980 : } else {
4981 : // Reading forward. Forward match the lead surrogate and assert that
4982 : // no trail surrogate follows.
4983 : match = MatchAndNegativeLookaroundInReadDirection(
4984 : compiler, lead_surrogates, trail_surrogates, on_success, false,
4985 1075 : default_flags);
4986 : }
4987 : result->AddAlternative(GuardedAlternative(match));
4988 : }
4989 :
4990 4917 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4991 : RegExpNode* on_success,
4992 2577 : UnicodeRangeSplitter* splitter) {
4993 : JSRegExp::Flags default_flags = JSRegExp::Flags();
4994 : ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
4995 3984 : if (trail_surrogates == nullptr) return;
4996 : Zone* zone = compiler->zone();
4997 : // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
4998 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
4999 1170 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5000 :
5001 : RegExpNode* match;
5002 1170 : if (compiler->read_backward()) {
5003 : // Reading backward. Backward match the trail surrogate and assert that no
5004 : // lead surrogate precedes it.
5005 : match = MatchAndNegativeLookaroundInReadDirection(
5006 : compiler, trail_surrogates, lead_surrogates, on_success, true,
5007 90 : default_flags);
5008 : } else {
5009 : // Reading forward. Assert that reading backward, there is no lead
5010 : // surrogate, and then forward match the trail surrogate.
5011 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5012 : compiler, lead_surrogates, trail_surrogates, on_success, false,
5013 1080 : default_flags);
5014 : }
5015 : result->AddAlternative(GuardedAlternative(match));
5016 : }
5017 :
5018 0 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5019 : RegExpNode* on_success) {
5020 : // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5021 : DCHECK(!compiler->read_backward());
5022 : Zone* zone = compiler->zone();
5023 : // Advance any character. If the character happens to be a lead surrogate and
5024 : // we advanced into the middle of a surrogate pair, it will work out, as
5025 : // nothing will match from there. We will have to advance again, consuming
5026 : // the associated trail surrogate.
5027 : ZoneList<CharacterRange>* range = CharacterRange::List(
5028 0 : zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5029 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5030 : return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5031 0 : default_flags);
5032 : }
5033 :
5034 124972 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5035 : #ifdef V8_INTL_SUPPORT
5036 : DCHECK(CharacterRange::IsCanonical(ranges));
5037 :
5038 : // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5039 : // See also https://crbug.com/v8/6727.
5040 : // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5041 : // which we use frequently internally. But large ranges can also easily be
5042 : // created by the user. We might want to have a more general caching mechanism
5043 : // for such ranges.
5044 1718 : if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5045 :
5046 : // Use ICU to compute the case fold closure over the ranges.
5047 1184 : icu::UnicodeSet set;
5048 247576 : for (int i = 0; i < ranges->length(); i++) {
5049 122604 : set.add(ranges->at(i).from(), ranges->at(i).to());
5050 : }
5051 : ranges->Clear();
5052 1184 : set.closeOver(USET_CASE_INSENSITIVE);
5053 : // Full case mapping map single characters to multiple characters.
5054 : // Those are represented as strings in the set. Remove them so that
5055 : // we end up with only simple and common case mappings.
5056 1184 : set.removeAllStrings();
5057 19340 : for (int i = 0; i < set.getRangeCount(); i++) {
5058 18156 : ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5059 18156 : zone);
5060 : }
5061 : // No errors and everything we collected have been ranges.
5062 1184 : CharacterRange::Canonicalize(ranges);
5063 : #endif // V8_INTL_SUPPORT
5064 : }
5065 :
5066 :
5067 529175 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5068 : RegExpNode* on_success) {
5069 : set_.Canonicalize();
5070 : Zone* zone = compiler->zone();
5071 2607 : ZoneList<CharacterRange>* ranges = this->ranges(zone);
5072 175896 : if (NeedsUnicodeCaseEquivalents(flags_)) {
5073 944 : AddUnicodeCaseEquivalents(ranges, zone);
5074 : }
5075 182577 : if (IsUnicode(flags_) && !compiler->one_byte() &&
5076 : !contains_split_surrogate()) {
5077 2607 : if (is_negated()) {
5078 : ZoneList<CharacterRange>* negated =
5079 140 : new (zone) ZoneList<CharacterRange>(2, zone);
5080 140 : CharacterRange::Negate(ranges, negated, zone);
5081 : ranges = negated;
5082 : }
5083 2607 : if (ranges->length() == 0) {
5084 : JSRegExp::Flags default_flags;
5085 : RegExpCharacterClass* fail =
5086 60 : new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5087 60 : return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5088 : }
5089 2577 : if (standard_type() == '*') {
5090 0 : return UnanchoredAdvance(compiler, on_success);
5091 : } else {
5092 2577 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5093 2577 : UnicodeRangeSplitter splitter(zone, ranges);
5094 2577 : AddBmpCharacters(compiler, result, on_success, &splitter);
5095 2577 : AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5096 2577 : AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5097 2577 : AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5098 : return result;
5099 : }
5100 : } else {
5101 346578 : return new (zone) TextNode(this, compiler->read_backward(), on_success);
5102 : }
5103 : }
5104 :
5105 :
5106 146822 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5107 146822 : RegExpAtom* atom1 = (*a)->AsAtom();
5108 146822 : RegExpAtom* atom2 = (*b)->AsAtom();
5109 146822 : uc16 character1 = atom1->data().at(0);
5110 146822 : uc16 character2 = atom2->data().at(0);
5111 146822 : if (character1 < character2) return -1;
5112 129859 : if (character1 > character2) return 1;
5113 17383 : return 0;
5114 : }
5115 :
5116 :
5117 : static unibrow::uchar Canonical(
5118 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5119 : unibrow::uchar c) {
5120 : unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5121 101306 : int length = canonicalize->get(c, '\0', chars);
5122 : DCHECK_LE(length, 1);
5123 : unibrow::uchar canonical = c;
5124 101306 : if (length == 1) canonical = chars[0];
5125 : return canonical;
5126 : }
5127 :
5128 :
5129 63973 : int CompareFirstCharCaseIndependent(
5130 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5131 : RegExpTree* const* a, RegExpTree* const* b) {
5132 63973 : RegExpAtom* atom1 = (*a)->AsAtom();
5133 63973 : RegExpAtom* atom2 = (*b)->AsAtom();
5134 63973 : unibrow::uchar character1 = atom1->data().at(0);
5135 63973 : unibrow::uchar character2 = atom2->data().at(0);
5136 63973 : if (character1 == character2) return 0;
5137 46025 : if (character1 >= 'a' || character2 >= 'a') {
5138 : character1 = Canonical(canonicalize, character1);
5139 : character2 = Canonical(canonicalize, character2);
5140 : }
5141 46025 : return static_cast<int>(character1) - static_cast<int>(character2);
5142 : }
5143 :
5144 :
5145 : // We can stable sort runs of atoms, since the order does not matter if they
5146 : // start with different characters.
5147 : // Returns true if any consecutive atoms were found.
5148 9785 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5149 9311 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5150 : int length = alternatives->length();
5151 : bool found_consecutive_atoms = false;
5152 17938 : for (int i = 0; i < length; i++) {
5153 34709 : while (i < length) {
5154 33879 : RegExpTree* alternative = alternatives->at(i);
5155 33879 : if (alternative->IsAtom()) break;
5156 25252 : i++;
5157 : }
5158 : // i is length or it is the index of an atom.
5159 9457 : if (i == length) break;
5160 : int first_atom = i;
5161 8627 : JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5162 8627 : i++;
5163 73375 : while (i < length) {
5164 56362 : RegExpTree* alternative = alternatives->at(i);
5165 56362 : if (!alternative->IsAtom()) break;
5166 56121 : if (alternative->AsAtom()->flags() != flags) break;
5167 56121 : i++;
5168 : }
5169 : // Sort atoms to get ones with common prefixes together.
5170 : // This step is more tricky if we are in a case-independent regexp,
5171 : // because it would change /is|I/ to /I|is/, and order matters when
5172 : // the regexp parts don't match only disjoint starting points. To fix
5173 : // this we have a version of CompareFirstChar that uses case-
5174 : // independent character classes for comparison.
5175 : DCHECK_LT(first_atom, alternatives->length());
5176 : DCHECK_LE(i, alternatives->length());
5177 : DCHECK_LE(first_atom, i);
5178 8627 : if (IgnoreCase(flags)) {
5179 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5180 474 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5181 : auto compare_closure =
5182 : [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5183 63973 : return CompareFirstCharCaseIndependent(canonicalize, a, b);
5184 63973 : };
5185 474 : alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5186 : } else {
5187 8153 : alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5188 : }
5189 8627 : if (i - first_atom > 1) found_consecutive_atoms = true;
5190 : }
5191 9311 : return found_consecutive_atoms;
5192 : }
5193 :
5194 :
5195 : // Optimizes ab|ac|az to a(?:b|c|d).
5196 12998 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5197 : Zone* zone = compiler->zone();
5198 8370 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5199 : int length = alternatives->length();
5200 :
5201 : int write_posn = 0;
5202 : int i = 0;
5203 80602 : while (i < length) {
5204 63862 : RegExpTree* alternative = alternatives->at(i);
5205 63862 : if (!alternative->IsAtom()) {
5206 16002 : alternatives->at(write_posn++) = alternatives->at(i);
5207 8001 : i++;
5208 : continue;
5209 : }
5210 55861 : RegExpAtom* const atom = alternative->AsAtom();
5211 : JSRegExp::Flags flags = atom->flags();
5212 55861 : unibrow::uchar common_prefix = atom->data().at(0);
5213 : int first_with_prefix = i;
5214 : int prefix_length = atom->length();
5215 55861 : i++;
5216 120352 : while (i < length) {
5217 56221 : alternative = alternatives->at(i);
5218 56221 : if (!alternative->IsAtom()) break;
5219 56121 : RegExpAtom* const atom = alternative->AsAtom();
5220 56121 : if (atom->flags() != flags) break;
5221 56121 : unibrow::uchar new_prefix = atom->data().at(0);
5222 56121 : if (new_prefix != common_prefix) {
5223 47695 : if (!IgnoreCase(flags)) break;
5224 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5225 4628 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5226 : new_prefix = Canonical(canonicalize, new_prefix);
5227 : common_prefix = Canonical(canonicalize, common_prefix);
5228 4628 : if (new_prefix != common_prefix) break;
5229 : }
5230 : prefix_length = Min(prefix_length, atom->length());
5231 8630 : i++;
5232 : }
5233 55861 : if (i > first_with_prefix + 2) {
5234 : // Found worthwhile run of alternatives with common prefix of at least one
5235 : // character. The sorting function above did not sort on more than one
5236 : // character for reasons of correctness, but there may still be a longer
5237 : // common prefix if the terms were similar or presorted in the input.
5238 : // Find out how long the common prefix is.
5239 268 : int run_length = i - first_with_prefix;
5240 268 : RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
5241 505 : for (int j = 1; j < run_length && prefix_length > 1; j++) {
5242 : RegExpAtom* old_atom =
5243 474 : alternatives->at(j + first_with_prefix)->AsAtom();
5244 357 : for (int k = 1; k < prefix_length; k++) {
5245 711 : if (atom->data().at(k) != old_atom->data().at(k)) {
5246 : prefix_length = k;
5247 : break;
5248 : }
5249 : }
5250 : }
5251 : RegExpAtom* prefix = new (zone)
5252 268 : RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5253 268 : ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5254 268 : pair->Add(prefix, zone);
5255 : ZoneList<RegExpTree*>* suffixes =
5256 268 : new (zone) ZoneList<RegExpTree*>(run_length, zone);
5257 8934 : for (int j = 0; j < run_length; j++) {
5258 : RegExpAtom* old_atom =
5259 17332 : alternatives->at(j + first_with_prefix)->AsAtom();
5260 : int len = old_atom->length();
5261 8666 : if (len == prefix_length) {
5262 302 : suffixes->Add(new (zone) RegExpEmpty(), zone);
5263 : } else {
5264 : RegExpTree* suffix = new (zone) RegExpAtom(
5265 8515 : old_atom->data().SubVector(prefix_length, old_atom->length()),
5266 8515 : flags);
5267 8515 : suffixes->Add(suffix, zone);
5268 : }
5269 : }
5270 268 : pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5271 536 : alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5272 : } else {
5273 : // Just copy any non-worthwhile alternatives.
5274 55825 : for (int j = first_with_prefix; j < i; j++) {
5275 111650 : alternatives->at(write_posn++) = alternatives->at(j);
5276 : }
5277 : }
5278 : }
5279 : alternatives->Rewind(write_posn); // Trim end of array.
5280 8370 : }
5281 :
5282 :
5283 : // Optimizes b|c|z to [bcz].
5284 9311 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
5285 9311 : RegExpCompiler* compiler) {
5286 : Zone* zone = compiler->zone();
5287 9311 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5288 : int length = alternatives->length();
5289 :
5290 : int write_posn = 0;
5291 : int i = 0;
5292 92028 : while (i < length) {
5293 73406 : RegExpTree* alternative = alternatives->at(i);
5294 73406 : if (!alternative->IsAtom()) {
5295 51522 : alternatives->at(write_posn++) = alternatives->at(i);
5296 25761 : i++;
5297 25761 : continue;
5298 : }
5299 47645 : RegExpAtom* const atom = alternative->AsAtom();
5300 47645 : if (atom->length() != 1) {
5301 78592 : alternatives->at(write_posn++) = alternatives->at(i);
5302 39296 : i++;
5303 39296 : continue;
5304 : }
5305 : JSRegExp::Flags flags = atom->flags();
5306 : DCHECK_IMPLIES(IsUnicode(flags),
5307 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5308 : bool contains_trail_surrogate =
5309 8349 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5310 : int first_in_run = i;
5311 8349 : i++;
5312 : // Find a run of single-character atom alternatives that have identical
5313 : // flags (case independence and unicode-ness).
5314 25135 : while (i < length) {
5315 16454 : alternative = alternatives->at(i);
5316 16454 : if (!alternative->IsAtom()) break;
5317 16223 : RegExpAtom* const atom = alternative->AsAtom();
5318 16223 : if (atom->length() != 1) break;
5319 8437 : if (atom->flags() != flags) break;
5320 : DCHECK_IMPLIES(IsUnicode(flags),
5321 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5322 : contains_trail_surrogate |=
5323 16874 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5324 8437 : i++;
5325 : }
5326 8349 : if (i > first_in_run + 1) {
5327 : // Found non-trivial run of single-character alternatives.
5328 271 : int run_length = i - first_in_run;
5329 : ZoneList<CharacterRange>* ranges =
5330 271 : new (zone) ZoneList<CharacterRange>(2, zone);
5331 8979 : for (int j = 0; j < run_length; j++) {
5332 17416 : RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5333 : DCHECK_EQ(old_atom->length(), 1);
5334 8708 : ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5335 : }
5336 : RegExpCharacterClass::CharacterClassFlags character_class_flags;
5337 271 : if (IsUnicode(flags) && contains_trail_surrogate) {
5338 : character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5339 : }
5340 271 : alternatives->at(write_posn++) = new (zone)
5341 813 : RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5342 : } else {
5343 : // Just copy any trivial alternatives.
5344 8078 : for (int j = first_in_run; j < i; j++) {
5345 16156 : alternatives->at(write_posn++) = alternatives->at(j);
5346 : }
5347 : }
5348 : }
5349 : alternatives->Rewind(write_posn); // Trim end of array.
5350 9311 : }
5351 :
5352 :
5353 21658 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5354 10950 : RegExpNode* on_success) {
5355 30969 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5356 :
5357 10950 : if (alternatives->length() > 2) {
5358 9311 : bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5359 9311 : if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5360 9311 : FixSingleCharacterDisjunctions(compiler);
5361 9311 : if (alternatives->length() == 1) {
5362 242 : return alternatives->at(0)->ToNode(compiler, on_success);
5363 : }
5364 : }
5365 :
5366 : int length = alternatives->length();
5367 :
5368 : ChoiceNode* result =
5369 10708 : new(compiler->zone()) ChoiceNode(length, compiler->zone());
5370 87147 : for (int i = 0; i < length; i++) {
5371 : GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5372 76439 : on_success));
5373 : result->AddAlternative(alternative);
5374 : }
5375 : return result;
5376 : }
5377 :
5378 :
5379 926331 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5380 1852662 : RegExpNode* on_success) {
5381 : return ToNode(min(),
5382 : max(),
5383 : is_greedy(),
5384 : body(),
5385 : compiler,
5386 1852662 : on_success);
5387 : }
5388 :
5389 :
5390 : // Scoped object to keep track of how much we unroll quantifier loops in the
5391 : // regexp graph generator.
5392 : class RegExpExpansionLimiter {
5393 : public:
5394 : static const int kMaxExpansionFactor = 6;
5395 62407 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5396 : : compiler_(compiler),
5397 : saved_expansion_factor_(compiler->current_expansion_factor()),
5398 62407 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5399 : DCHECK_LT(0, factor);
5400 71982 : if (ok_to_expand_) {
5401 71982 : if (factor > kMaxExpansionFactor) {
5402 : // Avoid integer overflow of the current expansion factor.
5403 : ok_to_expand_ = false;
5404 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5405 : } else {
5406 71854 : int new_factor = saved_expansion_factor_ * factor;
5407 71854 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5408 : compiler->set_current_expansion_factor(new_factor);
5409 : }
5410 : }
5411 : }
5412 :
5413 : ~RegExpExpansionLimiter() {
5414 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
5415 : }
5416 :
5417 : bool ok_to_expand() { return ok_to_expand_; }
5418 :
5419 : private:
5420 : RegExpCompiler* compiler_;
5421 : int saved_expansion_factor_;
5422 : bool ok_to_expand_;
5423 :
5424 : DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5425 : };
5426 :
5427 :
5428 1013416 : RegExpNode* RegExpQuantifier::ToNode(int min,
5429 : int max,
5430 : bool is_greedy,
5431 : RegExpTree* body,
5432 3030572 : RegExpCompiler* compiler,
5433 : RegExpNode* on_success,
5434 : bool not_at_start) {
5435 : // x{f, t} becomes this:
5436 : //
5437 : // (r++)<-.
5438 : // | `
5439 : // | (x)
5440 : // v ^
5441 : // (r=0)-->(?)---/ [if r < t]
5442 : // |
5443 : // [if r >= f] \----> ...
5444 : //
5445 :
5446 : // 15.10.2.5 RepeatMatcher algorithm.
5447 : // The parser has already eliminated the case where max is 0. In the case
5448 : // where max_match is zero the parser has removed the quantifier if min was
5449 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5450 :
5451 : // If we know that we cannot match zero length then things are a little
5452 : // simpler since we don't need to make the special zero length match check
5453 : // from step 2.1. If the min and max are small we can unroll a little in
5454 : // this case.
5455 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5456 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5457 1013416 : if (max == 0) return on_success; // This can happen due to recursion.
5458 1013061 : bool body_can_be_empty = (body->min_match() == 0);
5459 : int body_start_reg = RegExpCompiler::kNoRegister;
5460 1013061 : Interval capture_registers = body->CaptureRegisters();
5461 1013061 : bool needs_capture_clearing = !capture_registers.is_empty();
5462 : Zone* zone = compiler->zone();
5463 :
5464 1013061 : if (body_can_be_empty) {
5465 : body_start_reg = compiler->AllocateRegister();
5466 1012554 : } else if (compiler->optimize() && !needs_capture_clearing) {
5467 : // Only unroll if there are no captures and the body can't be
5468 : // empty.
5469 : {
5470 : RegExpExpansionLimiter limiter(
5471 62407 : compiler, min + ((max != min) ? 1 : 0));
5472 62407 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5473 4347 : int new_max = (max == kInfinity) ? max : max - min;
5474 : // Recurse once to get the loop or optional matches after the fixed
5475 : // ones.
5476 : RegExpNode* answer = ToNode(
5477 4347 : 0, new_max, is_greedy, body, compiler, on_success, true);
5478 : // Unroll the forced matches from 0 to min. This can cause chains of
5479 : // TextNodes (which the parser does not generate). These should be
5480 : // combined if it turns out they hinder good code generation.
5481 9399 : for (int i = 0; i < min; i++) {
5482 5052 : answer = body->ToNode(compiler, answer);
5483 : }
5484 : return answer;
5485 : }
5486 : }
5487 58060 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
5488 : DCHECK_LT(0, max); // Due to the 'if' above.
5489 : RegExpExpansionLimiter limiter(compiler, max);
5490 9575 : if (limiter.ok_to_expand()) {
5491 : // Unroll the optional matches up to max.
5492 : RegExpNode* answer = on_success;
5493 9411 : for (int i = 0; i < max; i++) {
5494 9411 : ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5495 9411 : if (is_greedy) {
5496 : alternation->AddAlternative(
5497 9265 : GuardedAlternative(body->ToNode(compiler, answer)));
5498 : alternation->AddAlternative(GuardedAlternative(on_success));
5499 : } else {
5500 : alternation->AddAlternative(GuardedAlternative(on_success));
5501 : alternation->AddAlternative(
5502 146 : GuardedAlternative(body->ToNode(compiler, answer)));
5503 : }
5504 : answer = alternation;
5505 9734 : if (not_at_start && !compiler->read_backward()) {
5506 : alternation->set_not_at_start();
5507 : }
5508 : }
5509 : return answer;
5510 : }
5511 : }
5512 : }
5513 999400 : bool has_min = min > 0;
5514 999400 : bool has_max = max < RegExpTree::kInfinity;
5515 999400 : bool needs_counter = has_min || has_max;
5516 : int reg_ctr = needs_counter
5517 : ? compiler->AllocateRegister()
5518 999400 : : RegExpCompiler::kNoRegister;
5519 : LoopChoiceNode* center = new (zone)
5520 999400 : LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5521 1003301 : if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5522 : RegExpNode* loop_return = needs_counter
5523 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5524 999400 : : static_cast<RegExpNode*>(center);
5525 999400 : if (body_can_be_empty) {
5526 : // If the body can be empty we need to check if it was and then
5527 : // backtrack.
5528 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5529 : reg_ctr,
5530 : min,
5531 507 : loop_return);
5532 : }
5533 999400 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
5534 999400 : if (body_can_be_empty) {
5535 : // If the body can be empty we need to store the start position
5536 : // so we can bail out if it was empty.
5537 507 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5538 : }
5539 999400 : if (needs_capture_clearing) {
5540 : // Before entering the body of this loop we need to clear captures.
5541 2336 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5542 : }
5543 : GuardedAlternative body_alt(body_node);
5544 999400 : if (has_max) {
5545 : Guard* body_guard =
5546 : new(zone) Guard(reg_ctr, Guard::LT, max);
5547 902604 : body_alt.AddGuard(body_guard, zone);
5548 : }
5549 : GuardedAlternative rest_alt(on_success);
5550 999400 : if (has_min) {
5551 : Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5552 1333 : rest_alt.AddGuard(rest_guard, zone);
5553 : }
5554 999400 : if (is_greedy) {
5555 : center->AddLoopAlternative(body_alt);
5556 : center->AddContinueAlternative(rest_alt);
5557 : } else {
5558 : center->AddContinueAlternative(rest_alt);
5559 : center->AddLoopAlternative(body_alt);
5560 : }
5561 999400 : if (needs_counter) {
5562 903359 : return ActionNode::SetRegister(reg_ctr, 0, center);
5563 : } else {
5564 : return center;
5565 : }
5566 : }
5567 :
5568 : namespace {
5569 : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5570 : // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
5571 80 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5572 : RegExpNode* on_success,
5573 : RegExpAssertion::AssertionType type,
5574 : JSRegExp::Flags flags) {
5575 : DCHECK(NeedsUnicodeCaseEquivalents(flags));
5576 : Zone* zone = compiler->zone();
5577 : ZoneList<CharacterRange>* word_range =
5578 80 : new (zone) ZoneList<CharacterRange>(2, zone);
5579 80 : CharacterRange::AddClassEscape('w', word_range, true, zone);
5580 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5581 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5582 80 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5583 : // Add two choices. The (non-)boundary could start with a word or
5584 : // a non-word-character.
5585 240 : for (int i = 0; i < 2; i++) {
5586 160 : bool lookbehind_for_word = i == 0;
5587 : bool lookahead_for_word =
5588 160 : (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5589 : // Look to the left.
5590 : RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5591 160 : stack_register, position_register);
5592 : RegExpNode* backward = TextNode::CreateForCharacterRanges(
5593 160 : zone, word_range, true, lookbehind.on_match_success(), flags);
5594 : // Look to the right.
5595 : RegExpLookaround::Builder lookahead(lookahead_for_word,
5596 : lookbehind.ForMatch(backward),
5597 160 : stack_register, position_register);
5598 : RegExpNode* forward = TextNode::CreateForCharacterRanges(
5599 160 : zone, word_range, false, lookahead.on_match_success(), flags);
5600 160 : result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5601 : }
5602 80 : return result;
5603 : }
5604 : } // anonymous namespace
5605 :
5606 5576 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5607 5576 : RegExpNode* on_success) {
5608 : NodeInfo info;
5609 : Zone* zone = compiler->zone();
5610 :
5611 5576 : switch (assertion_type()) {
5612 : case START_OF_LINE:
5613 129 : return AssertionNode::AfterNewline(on_success);
5614 : case START_OF_INPUT:
5615 3065 : return AssertionNode::AtStart(on_success);
5616 : case BOUNDARY:
5617 : return NeedsUnicodeCaseEquivalents(flags_)
5618 : ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5619 : flags_)
5620 176 : : AssertionNode::AtBoundary(on_success);
5621 : case NON_BOUNDARY:
5622 : return NeedsUnicodeCaseEquivalents(flags_)
5623 : ? BoundaryAssertionAsLookaround(compiler, on_success,
5624 : NON_BOUNDARY, flags_)
5625 154 : : AssertionNode::AtNonBoundary(on_success);
5626 : case END_OF_INPUT:
5627 1958 : return AssertionNode::AtEnd(on_success);
5628 : case END_OF_LINE: {
5629 : // Compile $ in multiline regexps as an alternation with a positive
5630 : // lookahead in one side and an end-of-input on the other side.
5631 : // We need two registers for the lookahead.
5632 : int stack_pointer_register = compiler->AllocateRegister();
5633 : int position_register = compiler->AllocateRegister();
5634 : // The ChoiceNode to distinguish between a newline and end-of-input.
5635 94 : ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5636 : // Create a newline atom.
5637 : ZoneList<CharacterRange>* newline_ranges =
5638 94 : new(zone) ZoneList<CharacterRange>(3, zone);
5639 94 : CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5640 : JSRegExp::Flags default_flags = JSRegExp::Flags();
5641 : RegExpCharacterClass* newline_atom =
5642 : new (zone) RegExpCharacterClass('n', default_flags);
5643 : TextNode* newline_matcher = new (zone) TextNode(
5644 : newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5645 : stack_pointer_register, position_register,
5646 : 0, // No captures inside.
5647 : -1, // Ignored if no captures.
5648 188 : on_success));
5649 : // Create an end-of-input matcher.
5650 : RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5651 : stack_pointer_register,
5652 : position_register,
5653 94 : newline_matcher);
5654 : // Add the two alternatives to the ChoiceNode.
5655 : GuardedAlternative eol_alternative(end_of_line);
5656 : result->AddAlternative(eol_alternative);
5657 94 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5658 : result->AddAlternative(end_alternative);
5659 : return result;
5660 : }
5661 : default:
5662 0 : UNREACHABLE();
5663 : }
5664 : return on_success;
5665 : }
5666 :
5667 :
5668 4830 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5669 2415 : RegExpNode* on_success) {
5670 : return new (compiler->zone())
5671 : BackReferenceNode(RegExpCapture::StartRegister(index()),
5672 : RegExpCapture::EndRegister(index()), flags_,
5673 4830 : compiler->read_backward(), on_success);
5674 : }
5675 :
5676 :
5677 1068 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5678 : RegExpNode* on_success) {
5679 1068 : return on_success;
5680 : }
5681 :
5682 :
5683 4368 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5684 : int stack_pointer_register,
5685 : int position_register,
5686 : int capture_register_count,
5687 : int capture_register_start)
5688 : : is_positive_(is_positive),
5689 : on_success_(on_success),
5690 : stack_pointer_register_(stack_pointer_register),
5691 4368 : position_register_(position_register) {
5692 4368 : if (is_positive_) {
5693 : on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5694 : stack_pointer_register, position_register, capture_register_count,
5695 1556 : capture_register_start, on_success_);
5696 : } else {
5697 : Zone* zone = on_success_->zone();
5698 : on_match_success_ = new (zone) NegativeSubmatchSuccess(
5699 : stack_pointer_register, position_register, capture_register_count,
5700 2812 : capture_register_start, zone);
5701 : }
5702 4368 : }
5703 :
5704 :
5705 4368 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5706 4368 : if (is_positive_) {
5707 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5708 1556 : position_register_, match);
5709 : } else {
5710 2812 : Zone* zone = on_success_->zone();
5711 : // We use a ChoiceNode to represent the negative lookaround. The first
5712 : // alternative is the negative match. On success, the end node backtracks.
5713 : // On failure, the second alternative is tried and leads to success.
5714 : // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5715 : // first exit when calculating quick checks.
5716 : ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5717 2812 : GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5718 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5719 2812 : position_register_, choice_node);
5720 : }
5721 : }
5722 :
5723 :
5724 3336 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5725 3336 : RegExpNode* on_success) {
5726 : int stack_pointer_register = compiler->AllocateRegister();
5727 : int position_register = compiler->AllocateRegister();
5728 :
5729 : const int registers_per_capture = 2;
5730 : const int register_of_first_capture = 2;
5731 1668 : int register_count = capture_count_ * registers_per_capture;
5732 : int register_start =
5733 1668 : register_of_first_capture + capture_from_ * registers_per_capture;
5734 :
5735 : RegExpNode* result;
5736 : bool was_reading_backward = compiler->read_backward();
5737 1668 : compiler->set_read_backward(type() == LOOKBEHIND);
5738 : Builder builder(is_positive(), on_success, stack_pointer_register,
5739 1668 : position_register, register_count, register_start);
5740 1668 : RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5741 1668 : result = builder.ForMatch(match);
5742 : compiler->set_read_backward(was_reading_backward);
5743 1668 : return result;
5744 : }
5745 :
5746 :
5747 27120 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5748 27120 : RegExpNode* on_success) {
5749 27120 : return ToNode(body(), index(), compiler, on_success);
5750 : }
5751 :
5752 :
5753 112825 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5754 : int index,
5755 112825 : RegExpCompiler* compiler,
5756 : RegExpNode* on_success) {
5757 : DCHECK_NOT_NULL(body);
5758 : int start_reg = RegExpCapture::StartRegister(index);
5759 : int end_reg = RegExpCapture::EndRegister(index);
5760 112825 : if (compiler->read_backward()) std::swap(start_reg, end_reg);
5761 112825 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5762 112825 : RegExpNode* body_node = body->ToNode(compiler, store_end);
5763 112825 : return ActionNode::StorePosition(start_reg, true, body_node);
5764 : }
5765 :
5766 :
5767 42716 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5768 21358 : RegExpNode* on_success) {
5769 22148 : ZoneList<RegExpTree*>* children = nodes();
5770 : RegExpNode* current = on_success;
5771 21358 : if (compiler->read_backward()) {
5772 1905 : for (int i = 0; i < children->length(); i++) {
5773 790 : current = children->at(i)->ToNode(compiler, current);
5774 : }
5775 : } else {
5776 997660 : for (int i = children->length() - 1; i >= 0; i--) {
5777 976627 : current = children->at(i)->ToNode(compiler, current);
5778 : }
5779 : }
5780 21358 : return current;
5781 : }
5782 :
5783 :
5784 7443 : static void AddClass(const int* elmv,
5785 : int elmc,
5786 : ZoneList<CharacterRange>* ranges,
5787 : Zone* zone) {
5788 7443 : elmc--;
5789 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5790 39883 : for (int i = 0; i < elmc; i += 2) {
5791 : DCHECK(elmv[i] < elmv[i + 1]);
5792 32440 : ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5793 : }
5794 7443 : }
5795 :
5796 :
5797 19670 : static void AddClassNegated(const int *elmv,
5798 : int elmc,
5799 : ZoneList<CharacterRange>* ranges,
5800 : Zone* zone) {
5801 19670 : elmc--;
5802 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5803 : DCHECK_NE(0x0000, elmv[0]);
5804 : DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5805 : uc16 last = 0x0000;
5806 84051 : for (int i = 0; i < elmc; i += 2) {
5807 : DCHECK(last <= elmv[i] - 1);
5808 : DCHECK(elmv[i] < elmv[i + 1]);
5809 64381 : ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5810 64381 : last = elmv[i + 1];
5811 : }
5812 19670 : ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5813 19670 : }
5814 :
5815 110175 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5816 : bool add_unicode_case_equivalents,
5817 : Zone* zone) {
5818 110175 : if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5819 : // See #sec-runtime-semantics-wordcharacters-abstract-operation
5820 : // In case of unicode and ignore_case, we need to create the closure over
5821 : // case equivalent characters before negating.
5822 : ZoneList<CharacterRange>* new_ranges =
5823 240 : new (zone) ZoneList<CharacterRange>(2, zone);
5824 240 : AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5825 240 : AddUnicodeCaseEquivalents(new_ranges, zone);
5826 240 : if (type == 'W') {
5827 : ZoneList<CharacterRange>* negated =
5828 90 : new (zone) ZoneList<CharacterRange>(2, zone);
5829 90 : CharacterRange::Negate(new_ranges, negated, zone);
5830 : new_ranges = negated;
5831 : }
5832 : ranges->AddAll(*new_ranges, zone);
5833 110175 : return;
5834 : }
5835 109935 : AddClassEscape(type, ranges, zone);
5836 : }
5837 :
5838 109970 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5839 : Zone* zone) {
5840 109970 : switch (type) {
5841 : case 's':
5842 1727 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5843 1727 : break;
5844 : case 'S':
5845 800 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5846 800 : break;
5847 : case 'w':
5848 2786 : AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5849 2786 : break;
5850 : case 'W':
5851 307 : AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5852 307 : break;
5853 : case 'd':
5854 2502 : AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5855 2502 : break;
5856 : case 'D':
5857 268 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5858 268 : break;
5859 : case '.':
5860 : AddClassNegated(kLineTerminatorRanges,
5861 : kLineTerminatorRangeCount,
5862 : ranges,
5863 18295 : zone);
5864 18295 : break;
5865 : // This is not a character range as defined by the spec but a
5866 : // convenient shorthand for a character class that matches any
5867 : // character.
5868 : case '*':
5869 83097 : ranges->Add(CharacterRange::Everything(), zone);
5870 83097 : break;
5871 : // This is the set of characters matched by the $ and ^ symbols
5872 : // in multiline mode.
5873 : case 'n':
5874 : AddClass(kLineTerminatorRanges,
5875 : kLineTerminatorRangeCount,
5876 : ranges,
5877 188 : zone);
5878 188 : break;
5879 : default:
5880 0 : UNREACHABLE();
5881 : }
5882 109970 : }
5883 :
5884 :
5885 0 : Vector<const int> CharacterRange::GetWordBounds() {
5886 0 : return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5887 : }
5888 :
5889 : // static
5890 66945 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5891 66945 : ZoneList<CharacterRange>* ranges,
5892 : bool is_one_byte) {
5893 66945 : CharacterRange::Canonicalize(ranges);
5894 : int range_count = ranges->length();
5895 138888 : for (int i = 0; i < range_count; i++) {
5896 71943 : CharacterRange range = ranges->at(i);
5897 : uc32 bottom = range.from();
5898 74090 : if (bottom > String::kMaxUtf16CodeUnit) continue;
5899 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5900 : // Nothing to be done for surrogates.
5901 71943 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5902 69896 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5903 1343 : if (bottom > String::kMaxOneByteCharCode) continue;
5904 1243 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5905 : }
5906 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5907 69796 : if (top == bottom) {
5908 : // If this is a singleton we just expand the one character.
5909 4598 : int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5910 7292 : for (int i = 0; i < length; i++) {
5911 2694 : uc32 chr = chars[i];
5912 2694 : if (chr != bottom) {
5913 1377 : ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5914 : }
5915 : }
5916 : } else {
5917 : // If this is a range we expand the characters block by block, expanding
5918 : // contiguous subranges (blocks) one at a time. The approach is as
5919 : // follows. For a given start character we look up the remainder of the
5920 : // block that contains it (represented by the end point), for instance we
5921 : // find 'z' if the character is 'c'. A block is characterized by the
5922 : // property that all characters uncanonicalize in the same way, except
5923 : // that each entry in the result is incremented by the distance from the
5924 : // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5925 : // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5926 : // we've found the end point we look up its uncanonicalization and
5927 : // produce a range for each element. For instance for [c-f] we look up
5928 : // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5929 : // it is not already contained in the input, so [c-f] will be skipped but
5930 : // [C-F] will be added. If this range is not completely contained in a
5931 : // block we do this for all the blocks covered by the range (handling
5932 : // characters that is not in a block as a "singleton block").
5933 : unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5934 : int pos = bottom;
5935 7796607 : while (pos <= top) {
5936 : int length =
5937 7731409 : isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5938 : uc32 block_end;
5939 7731409 : if (length == 0) {
5940 : block_end = pos;
5941 : } else {
5942 : DCHECK_EQ(1, length);
5943 6349 : block_end = equivalents[0];
5944 : }
5945 7731409 : int end = (block_end > top) ? top : block_end;
5946 : length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5947 7731409 : equivalents);
5948 8064891 : for (int i = 0; i < length; i++) {
5949 333482 : uc32 c = equivalents[i];
5950 333482 : uc32 range_from = c - (block_end - pos);
5951 333482 : uc32 range_to = c - (block_end - end);
5952 333482 : if (!(bottom <= range_from && range_to <= top)) {
5953 6669 : ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5954 : }
5955 : }
5956 7731409 : pos = end + 1;
5957 : }
5958 : }
5959 : }
5960 66945 : }
5961 :
5962 :
5963 10 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5964 : DCHECK_NOT_NULL(ranges);
5965 : int n = ranges->length();
5966 10 : if (n <= 1) return true;
5967 10 : int max = ranges->at(0).to();
5968 300 : for (int i = 1; i < n; i++) {
5969 290 : CharacterRange next_range = ranges->at(i);
5970 290 : if (next_range.from() <= max + 1) return false;
5971 : max = next_range.to();
5972 : }
5973 : return true;
5974 : }
5975 :
5976 :
5977 1904678 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5978 1904678 : if (ranges_ == nullptr) {
5979 82981 : ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5980 82981 : CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5981 : }
5982 1904678 : return ranges_;
5983 : }
5984 :
5985 :
5986 : // Move a number of elements in a zonelist to another position
5987 : // in the same list. Handles overlapping source and target areas.
5988 92331 : static void MoveRanges(ZoneList<CharacterRange>* list,
5989 : int from,
5990 : int to,
5991 : int count) {
5992 : // Ranges are potentially overlapping.
5993 92331 : if (from < to) {
5994 10119762 : for (int i = count - 1; i >= 0; i--) {
5995 30117945 : list->at(to + i) = list->at(from + i);
5996 : }
5997 : } else {
5998 3599626 : for (int i = 0; i < count; i++) {
5999 10798878 : list->at(to + i) = list->at(from + i);
6000 : }
6001 : }
6002 92331 : }
6003 :
6004 :
6005 173313 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6006 : int count,
6007 : CharacterRange insert) {
6008 : // Inserts a range into list[0..count[, which must be sorted
6009 : // by from value and non-overlapping and non-adjacent, using at most
6010 : // list[0..count] for the result. Returns the number of resulting
6011 : // canonicalized ranges. Inserting a range may collapse existing ranges into
6012 : // fewer ranges, so the return value can be anything in the range 1..count+1.
6013 173313 : uc32 from = insert.from();
6014 173313 : uc32 to = insert.to();
6015 : int start_pos = 0;
6016 : int end_pos = count;
6017 18093696 : for (int i = count - 1; i >= 0; i--) {
6018 18010014 : CharacterRange current = list->at(i);
6019 18010014 : if (current.from() > to + 1) {
6020 : end_pos = i;
6021 142488 : } else if (current.to() + 1 < from) {
6022 89631 : start_pos = i + 1;
6023 : break;
6024 : }
6025 : }
6026 :
6027 : // Inserted range overlaps, or is adjacent to, ranges at positions
6028 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6029 : // not affected by the insertion.
6030 : // If start_pos == end_pos, the range must be inserted before start_pos.
6031 : // if start_pos < end_pos, the entire range from start_pos to end_pos
6032 : // must be merged with the insert range.
6033 :
6034 173313 : if (start_pos == end_pos) {
6035 : // Insert between existing ranges at position start_pos.
6036 132740 : if (start_pos < count) {
6037 80447 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6038 : }
6039 132740 : list->at(start_pos) = insert;
6040 132740 : return count + 1;
6041 : }
6042 40573 : if (start_pos + 1 == end_pos) {
6043 : // Replace single existing range at position start_pos.
6044 28534 : CharacterRange to_replace = list->at(start_pos);
6045 : int new_from = Min(to_replace.from(), from);
6046 : int new_to = Max(to_replace.to(), to);
6047 28534 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6048 : return count;
6049 : }
6050 : // Replace a number of existing ranges from start_pos to end_pos - 1.
6051 : // Move the remaining ranges down.
6052 :
6053 12039 : int new_from = Min(list->at(start_pos).from(), from);
6054 24078 : int new_to = Max(list->at(end_pos - 1).to(), to);
6055 12039 : if (end_pos < count) {
6056 11884 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6057 : }
6058 12039 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6059 12039 : return count - (end_pos - start_pos) + 1;
6060 : }
6061 :
6062 :
6063 20 : void CharacterSet::Canonicalize() {
6064 : // Special/default classes are always considered canonical. The result
6065 : // of calling ranges() will be sorted.
6066 175936 : if (ranges_ == nullptr) return;
6067 93178 : CharacterRange::Canonicalize(ranges_);
6068 : }
6069 :
6070 :
6071 497381 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6072 497381 : if (character_ranges->length() <= 1) return;
6073 : // Check whether ranges are already canonical (increasing, non-overlapping,
6074 : // non-adjacent).
6075 : int n = character_ranges->length();
6076 63699 : int max = character_ranges->at(0).to();
6077 : int i = 1;
6078 1404639 : while (i < n) {
6079 1286357 : CharacterRange current = character_ranges->at(i);
6080 1286357 : if (current.from() <= max + 1) {
6081 : break;
6082 : }
6083 : max = current.to();
6084 1277241 : i++;
6085 : }
6086 : // Canonical until the i'th range. If that's all of them, we are done.
6087 63699 : if (i == n) return;
6088 :
6089 : // The ranges at index i and forward are not canonicalized. Make them so by
6090 : // doing the equivalent of insertion sort (inserting each into the previous
6091 : // list, in order).
6092 : // Notice that inserting a range can reduce the number of ranges in the
6093 : // result due to combining of adjacent and overlapping ranges.
6094 : int read = i; // Range to insert.
6095 : int num_canonical = i; // Length of canonicalized part of list.
6096 173313 : do {
6097 : num_canonical = InsertRangeInCanonicalList(character_ranges,
6098 : num_canonical,
6099 173313 : character_ranges->at(read));
6100 173313 : read++;
6101 : } while (read < n);
6102 : character_ranges->Rewind(num_canonical);
6103 :
6104 : DCHECK(CharacterRange::IsCanonical(character_ranges));
6105 : }
6106 :
6107 :
6108 230 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6109 : ZoneList<CharacterRange>* negated_ranges,
6110 : Zone* zone) {
6111 : DCHECK(CharacterRange::IsCanonical(ranges));
6112 : DCHECK_EQ(0, negated_ranges->length());
6113 : int range_count = ranges->length();
6114 : uc32 from = 0;
6115 : int i = 0;
6116 460 : if (range_count > 0 && ranges->at(0).from() == 0) {
6117 40 : from = ranges->at(0).to() + 1;
6118 : i = 1;
6119 : }
6120 7480 : while (i < range_count) {
6121 7250 : CharacterRange range = ranges->at(i);
6122 7250 : negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6123 7250 : from = range.to() + 1;
6124 7250 : i++;
6125 : }
6126 230 : if (from < String::kMaxCodePoint) {
6127 : negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6128 180 : zone);
6129 : }
6130 230 : }
6131 :
6132 :
6133 : // -------------------------------------------------------------------
6134 : // Splay tree
6135 :
6136 :
6137 490535 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6138 236789 : if (Get(value))
6139 : return this;
6140 236784 : if (successors(zone) != nullptr) {
6141 194155 : for (int i = 0; i < successors(zone)->length(); i++) {
6142 413977 : OutSet* successor = successors(zone)->at(i);
6143 413977 : if (successor->Get(value))
6144 : return successor;
6145 : }
6146 : } else {
6147 5684 : successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6148 : }
6149 33924 : OutSet* result = new(zone) OutSet(first_, remaining_);
6150 16962 : result->Set(value, zone);
6151 16962 : successors(zone)->Add(result, zone);
6152 16962 : return result;
6153 : }
6154 :
6155 :
6156 713849 : void OutSet::Set(unsigned value, Zone *zone) {
6157 713849 : if (value < kFirstLimit) {
6158 389215 : first_ |= (1 << value);
6159 : } else {
6160 889320 : if (remaining_ == nullptr)
6161 84582 : remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6162 889320 : if (remaining_->is_empty() || !remaining_->Contains(value))
6163 323584 : remaining_->Add(value, zone);
6164 : }
6165 713849 : }
6166 :
6167 :
6168 31137394 : bool OutSet::Get(unsigned value) const {
6169 31137394 : if (value < kFirstLimit) {
6170 6635907 : return (first_ & (1 << value)) != 0;
6171 24501487 : } else if (remaining_ == nullptr) {
6172 : return false;
6173 : } else {
6174 16427384 : return remaining_->Contains(value);
6175 : }
6176 : }
6177 :
6178 :
6179 : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6180 :
6181 :
6182 88393 : void DispatchTable::AddRange(CharacterRange full_range, int value,
6183 : Zone* zone) {
6184 88393 : CharacterRange current = full_range;
6185 88393 : if (tree()->is_empty()) {
6186 : // If this is the first range we just insert into the table.
6187 : ZoneSplayTree<Config>::Locator loc;
6188 2642 : bool inserted = tree()->Insert(current.from(), &loc);
6189 : DCHECK(inserted);
6190 : USE(inserted);
6191 : loc.set_value(Entry(current.from(), current.to(),
6192 2642 : empty()->Extend(value, zone)));
6193 88393 : return;
6194 : }
6195 : // First see if there is a range to the left of this one that
6196 : // overlaps.
6197 : ZoneSplayTree<Config>::Locator loc;
6198 85751 : if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6199 163698 : Entry* entry = &loc.value();
6200 : // If we've found a range that overlaps with this one, and it
6201 : // starts strictly to the left of this one, we have to fix it
6202 : // because the following code only handles ranges that start on
6203 : // or after the start point of the range we're adding.
6204 162898 : if (entry->from() < current.from() && entry->to() >= current.from()) {
6205 : // Snap the overlapping range in half around the start point of
6206 : // the range we're adding.
6207 : CharacterRange left =
6208 400 : CharacterRange::Range(entry->from(), current.from() - 1);
6209 : CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6210 : // The left part of the overlapping range doesn't overlap.
6211 : // Truncate the whole entry to be just the left part.
6212 : entry->set_to(left.to());
6213 : // The right part is the one that overlaps. We add this part
6214 : // to the map and let the next step deal with merging it with
6215 : // the range we're adding.
6216 : ZoneSplayTree<Config>::Locator loc;
6217 400 : bool inserted = tree()->Insert(right.from(), &loc);
6218 : DCHECK(inserted);
6219 : USE(inserted);
6220 : loc.set_value(Entry(right.from(),
6221 : right.to(),
6222 : entry->out_set()));
6223 : }
6224 : }
6225 166624 : while (current.is_valid()) {
6226 406176 : if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6227 325985 : (loc.value().from() <= current.to()) &&
6228 80873 : (loc.value().to() >= current.from())) {
6229 320132 : Entry* entry = &loc.value();
6230 : // We have overlap. If there is space between the start point of
6231 : // the range we're adding and where the overlapping range starts
6232 : // then we have to add a range covering just that space.
6233 80873 : if (current.from() < entry->from()) {
6234 : ZoneSplayTree<Config>::Locator ins;
6235 73083 : bool inserted = tree()->Insert(current.from(), &ins);
6236 : DCHECK(inserted);
6237 : USE(inserted);
6238 : ins.set_value(Entry(current.from(),
6239 : entry->from() - 1,
6240 146166 : empty()->Extend(value, zone)));
6241 : current.set_from(entry->from());
6242 : }
6243 : DCHECK_EQ(current.from(), entry->from());
6244 : // If the overlapping range extends beyond the one we want to add
6245 : // we have to snap the right part off and add it separately.
6246 80873 : if (entry->to() > current.to()) {
6247 : ZoneSplayTree<Config>::Locator ins;
6248 4430 : bool inserted = tree()->Insert(current.to() + 1, &ins);
6249 : DCHECK(inserted);
6250 : USE(inserted);
6251 : ins.set_value(Entry(current.to() + 1,
6252 : entry->to(),
6253 : entry->out_set()));
6254 : entry->set_to(current.to());
6255 : }
6256 : DCHECK(entry->to() <= current.to());
6257 : // The overlapping range is now completely contained by the range
6258 : // we're adding so we can just update it and move the start point
6259 : // of the range we're adding just past it.
6260 : entry->AddValue(value, zone);
6261 : DCHECK(entry->to() + 1 > current.from());
6262 80873 : current.set_from(entry->to() + 1);
6263 : } else {
6264 : // There is no overlap so we can just add the range
6265 : ZoneSplayTree<Config>::Locator ins;
6266 80191 : bool inserted = tree()->Insert(current.from(), &ins);
6267 : DCHECK(inserted);
6268 : USE(inserted);
6269 : ins.set_value(Entry(current.from(),
6270 : current.to(),
6271 80191 : empty()->Extend(value, zone)));
6272 : break;
6273 : }
6274 : }
6275 : }
6276 :
6277 :
6278 55010 : OutSet* DispatchTable::Get(uc32 value) {
6279 : ZoneSplayTree<Config>::Locator loc;
6280 55010 : if (!tree()->FindGreatestLessThan(value, &loc))
6281 0 : return empty();
6282 93895 : Entry* entry = &loc.value();
6283 55010 : if (value <= entry->to())
6284 38885 : return entry->out_set();
6285 : else
6286 16125 : return empty();
6287 : }
6288 :
6289 :
6290 : // -------------------------------------------------------------------
6291 : // Analysis
6292 :
6293 :
6294 1059317 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
6295 : StackLimitCheck check(isolate());
6296 1059317 : if (check.HasOverflowed()) {
6297 : fail("Stack overflow");
6298 : return;
6299 : }
6300 1058890 : if (that->info()->been_analyzed || that->info()->being_analyzed)
6301 : return;
6302 856131 : that->info()->being_analyzed = true;
6303 856131 : that->Accept(this);
6304 856131 : that->info()->being_analyzed = false;
6305 856131 : that->info()->been_analyzed = true;
6306 : }
6307 :
6308 :
6309 88080 : void Analysis::VisitEnd(EndNode* that) {
6310 : // nothing to do
6311 88080 : }
6312 :
6313 :
6314 685933 : void TextNode::CalculateOffsets() {
6315 314634 : int element_count = elements()->length();
6316 : // Set up the offsets of the elements relative to the start. This is a fixed
6317 : // quantity since a TextNode can only contain fixed-width things.
6318 : int cp_offset = 0;
6319 685933 : for (int i = 0; i < element_count; i++) {
6320 : TextElement& elm = elements()->at(i);
6321 : elm.set_cp_offset(cp_offset);
6322 371299 : cp_offset += elm.length();
6323 : }
6324 314634 : }
6325 :
6326 :
6327 948723 : void Analysis::VisitText(TextNode* that) {
6328 632482 : that->MakeCaseIndependent(isolate(), is_one_byte_);
6329 316241 : EnsureAnalyzed(that->on_success());
6330 316241 : if (!has_failed()) {
6331 314634 : that->CalculateOffsets();
6332 : }
6333 316241 : }
6334 :
6335 :
6336 558426 : void Analysis::VisitAction(ActionNode* that) {
6337 279213 : RegExpNode* target = that->on_success();
6338 279213 : EnsureAnalyzed(target);
6339 279213 : if (!has_failed()) {
6340 : // If the next node is interested in what it follows then this node
6341 : // has to be interested too so it can pass the information on.
6342 : that->info()->AddFromFollowing(target->info());
6343 : }
6344 279213 : }
6345 :
6346 :
6347 317524 : void Analysis::VisitChoice(ChoiceNode* that) {
6348 : NodeInfo* info = that->info();
6349 317524 : for (int i = 0; i < that->alternatives()->length(); i++) {
6350 132952 : RegExpNode* node = that->alternatives()->at(i).node();
6351 132952 : EnsureAnalyzed(node);
6352 158762 : if (has_failed()) return;
6353 : // Anything the following nodes need to know has to be known by
6354 : // this node also, so it can pass it on.
6355 : info->AddFromFollowing(node->info());
6356 : }
6357 : }
6358 :
6359 :
6360 850948 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6361 : NodeInfo* info = that->info();
6362 752332 : for (int i = 0; i < that->alternatives()->length(); i++) {
6363 653913 : RegExpNode* node = that->alternatives()->at(i).node();
6364 277747 : if (node != that->loop_node()) {
6365 138972 : EnsureAnalyzed(node);
6366 277944 : if (has_failed()) return;
6367 : info->AddFromFollowing(node->info());
6368 : }
6369 : }
6370 : // Check the loop last since it may need the value of this node
6371 : // to get a correct result.
6372 98419 : EnsureAnalyzed(that->loop_node());
6373 98419 : if (!has_failed()) {
6374 : info->AddFromFollowing(that->loop_node()->info());
6375 : }
6376 : }
6377 :
6378 :
6379 2320 : void Analysis::VisitBackReference(BackReferenceNode* that) {
6380 2320 : EnsureAnalyzed(that->on_success());
6381 2320 : }
6382 :
6383 :
6384 5495 : void Analysis::VisitAssertion(AssertionNode* that) {
6385 5495 : EnsureAnalyzed(that->on_success());
6386 5495 : }
6387 :
6388 :
6389 188 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6390 : BoyerMooreLookahead* bm,
6391 : bool not_at_start) {
6392 : // Working out the set of characters that a backreference can match is too
6393 : // hard, so we just say that any character can match.
6394 : bm->SetRest(offset);
6395 : SaveBMInfo(bm, not_at_start, offset);
6396 188 : }
6397 :
6398 :
6399 : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6400 : RegExpMacroAssembler::kTableSize);
6401 :
6402 :
6403 7716 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6404 7716 : BoyerMooreLookahead* bm, bool not_at_start) {
6405 55154 : ZoneList<GuardedAlternative>* alts = alternatives();
6406 15432 : budget = (budget - 1) / alts->length();
6407 94876 : for (int i = 0; i < alts->length(); i++) {
6408 79680 : GuardedAlternative& alt = alts->at(i);
6409 39958 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6410 : bm->SetRest(offset); // Give up trying to fill in info.
6411 : SaveBMInfo(bm, not_at_start, offset);
6412 7716 : return;
6413 : }
6414 39722 : alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6415 : }
6416 : SaveBMInfo(bm, not_at_start, offset);
6417 : }
6418 :
6419 :
6420 121562 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6421 1091347 : BoyerMooreLookahead* bm, bool not_at_start) {
6422 121562 : if (initial_offset >= bm->length()) return;
6423 : int offset = initial_offset;
6424 : int max_char = bm->max_char();
6425 516254 : for (int i = 0; i < elements()->length(); i++) {
6426 158675 : if (offset >= bm->length()) {
6427 114334 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6428 : return;
6429 : }
6430 142558 : TextElement text = elements()->at(i);
6431 142558 : if (text.text_type() == TextElement::ATOM) {
6432 : RegExpAtom* atom = text.atom();
6433 195609 : for (int j = 0; j < atom->length(); j++, offset++) {
6434 81627 : if (offset >= bm->length()) {
6435 5993 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6436 : return;
6437 : }
6438 151268 : uc16 character = atom->data()[j];
6439 75634 : if (IgnoreCase(atom->flags())) {
6440 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6441 : int length = GetCaseIndependentLetters(
6442 : isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6443 4620 : chars);
6444 13218 : for (int j = 0; j < length; j++) {
6445 17196 : bm->Set(offset, chars[j]);
6446 : }
6447 : } else {
6448 142028 : if (character <= max_char) bm->Set(offset, character);
6449 : }
6450 : }
6451 : } else {
6452 : DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6453 : RegExpCharacterClass* char_class = text.char_class();
6454 382636 : ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6455 98217 : if (char_class->is_negated()) {
6456 4392 : bm->SetAll(offset);
6457 : } else {
6458 671447 : for (int k = 0; k < ranges->length(); k++) {
6459 450529 : CharacterRange& range = ranges->at(k);
6460 288811 : if (range.from() > max_char) continue;
6461 : int to = Min(max_char, static_cast<int>(range.to()));
6462 161718 : bm->SetInterval(offset, Interval(range.from(), to));
6463 : }
6464 : }
6465 98217 : offset++;
6466 : }
6467 : }
6468 99452 : if (offset >= bm->length()) {
6469 90173 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6470 : return;
6471 : }
6472 9279 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6473 9279 : true); // Not at start after a text node.
6474 9279 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6475 : }
6476 :
6477 :
6478 : // -------------------------------------------------------------------
6479 : // Dispatch table construction
6480 :
6481 :
6482 0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
6483 : AddRange(CharacterRange::Everything());
6484 0 : }
6485 :
6486 :
6487 0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6488 : node->set_being_calculated(true);
6489 0 : ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6490 0 : for (int i = 0; i < alternatives->length(); i++) {
6491 : set_choice_index(i);
6492 0 : alternatives->at(i).node()->Accept(this);
6493 : }
6494 : node->set_being_calculated(false);
6495 0 : }
6496 :
6497 :
6498 : class AddDispatchRange {
6499 : public:
6500 : explicit AddDispatchRange(DispatchTableConstructor* constructor)
6501 0 : : constructor_(constructor) { }
6502 : void Call(uc32 from, DispatchTable::Entry entry);
6503 : private:
6504 : DispatchTableConstructor* constructor_;
6505 : };
6506 :
6507 :
6508 0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6509 0 : constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6510 0 : }
6511 :
6512 :
6513 0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6514 0 : if (node->being_calculated())
6515 0 : return;
6516 0 : DispatchTable* table = node->GetTable(ignore_case_);
6517 : AddDispatchRange adder(this);
6518 : table->ForEach(&adder);
6519 : }
6520 :
6521 :
6522 0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6523 : // TODO(160): Find the node that we refer back to and propagate its start
6524 : // set back to here. For now we just accept anything.
6525 : AddRange(CharacterRange::Everything());
6526 0 : }
6527 :
6528 :
6529 0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6530 0 : RegExpNode* target = that->on_success();
6531 0 : target->Accept(this);
6532 0 : }
6533 :
6534 :
6535 7870 : static int CompareRangeByFrom(const CharacterRange* a,
6536 3935 : const CharacterRange* b) {
6537 11805 : return Compare<uc16>(a->from(), b->from());
6538 : }
6539 :
6540 :
6541 915 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6542 : ranges->Sort(CompareRangeByFrom);
6543 : uc16 last = 0;
6544 1720 : for (int i = 0; i < ranges->length(); i++) {
6545 805 : CharacterRange range = ranges->at(i);
6546 805 : if (last < range.from())
6547 525 : AddRange(CharacterRange::Range(last, range.from() - 1));
6548 805 : if (range.to() >= last) {
6549 715 : if (range.to() == String::kMaxCodePoint) {
6550 55 : return;
6551 : } else {
6552 715 : last = range.to() + 1;
6553 : }
6554 : }
6555 : }
6556 55 : AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6557 : }
6558 :
6559 :
6560 0 : void DispatchTableConstructor::VisitText(TextNode* that) {
6561 0 : TextElement elm = that->elements()->at(0);
6562 0 : switch (elm.text_type()) {
6563 : case TextElement::ATOM: {
6564 0 : uc16 c = elm.atom()->data()[0];
6565 0 : AddRange(CharacterRange::Range(c, c));
6566 : break;
6567 : }
6568 : case TextElement::CHAR_CLASS: {
6569 : RegExpCharacterClass* tree = elm.char_class();
6570 0 : ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6571 0 : if (tree->is_negated()) {
6572 0 : AddInverse(ranges);
6573 : } else {
6574 0 : for (int i = 0; i < ranges->length(); i++)
6575 : AddRange(ranges->at(i));
6576 : }
6577 : break;
6578 : }
6579 : default: {
6580 0 : UNIMPLEMENTED();
6581 : }
6582 : }
6583 0 : }
6584 :
6585 :
6586 0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
6587 0 : RegExpNode* target = that->on_success();
6588 0 : target->Accept(this);
6589 0 : }
6590 :
6591 40 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6592 : RegExpNode* on_success,
6593 : JSRegExp::Flags flags) {
6594 : // If the regexp matching starts within a surrogate pair, step back
6595 : // to the lead surrogate and start matching from there.
6596 : DCHECK(!compiler->read_backward());
6597 : Zone* zone = compiler->zone();
6598 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6599 40 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6600 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6601 40 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6602 :
6603 40 : ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6604 :
6605 : int stack_register = compiler->UnicodeLookaroundStackRegister();
6606 : int position_register = compiler->UnicodeLookaroundPositionRegister();
6607 : RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6608 40 : zone, lead_surrogates, true, on_success, flags);
6609 : RegExpLookaround::Builder builder(true, step_back, stack_register,
6610 40 : position_register);
6611 : RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6612 40 : zone, trail_surrogates, false, builder.on_match_success(), flags);
6613 :
6614 : optional_step_back->AddAlternative(
6615 40 : GuardedAlternative(builder.ForMatch(match_trail)));
6616 : optional_step_back->AddAlternative(GuardedAlternative(on_success));
6617 :
6618 40 : return optional_step_back;
6619 : }
6620 :
6621 :
6622 85714 : RegExpEngine::CompilationResult RegExpEngine::Compile(
6623 : Isolate* isolate, Zone* zone, RegExpCompileData* data,
6624 : JSRegExp::Flags flags, Handle<String> pattern,
6625 : Handle<String> sample_subject, bool is_one_byte) {
6626 85714 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6627 : return IrregexpRegExpTooBig(isolate);
6628 : }
6629 : bool is_sticky = IsSticky(flags);
6630 : bool is_global = IsGlobal(flags);
6631 : bool is_unicode = IsUnicode(flags);
6632 85705 : RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6633 :
6634 85705 : if (compiler.optimize())
6635 84553 : compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6636 :
6637 : // Sample some characters from the middle of the string.
6638 : static const int kSampleSize = 128;
6639 :
6640 85705 : sample_subject = String::Flatten(isolate, sample_subject);
6641 : int chars_sampled = 0;
6642 85705 : int half_way = (sample_subject->length() - kSampleSize) / 2;
6643 1081522 : for (int i = Max(0, half_way);
6644 540761 : i < sample_subject->length() && chars_sampled < kSampleSize;
6645 : i++, chars_sampled++) {
6646 910112 : compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6647 : }
6648 :
6649 : // Wrap the body of the regexp in capture #0.
6650 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6651 : 0,
6652 : &compiler,
6653 85705 : compiler.accept());
6654 : RegExpNode* node = captured_body;
6655 85705 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6656 85705 : bool is_start_anchored = data->tree->IsAnchoredAtStart();
6657 85705 : int max_length = data->tree->max_match();
6658 85705 : if (!is_start_anchored && !is_sticky) {
6659 : // Add a .*? at the beginning, outside the body capture, unless
6660 : // this expression is anchored at the beginning or sticky.
6661 : JSRegExp::Flags default_flags = JSRegExp::Flags();
6662 : RegExpNode* loop_node = RegExpQuantifier::ToNode(
6663 : 0, RegExpTree::kInfinity, false,
6664 : new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6665 165476 : captured_body, data->contains_anchor);
6666 :
6667 82738 : if (data->contains_anchor) {
6668 : // Unroll loop once, to take care of the case that might start
6669 : // at the start of input.
6670 149 : ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6671 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
6672 : first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6673 : new (zone) RegExpCharacterClass('*', default_flags), false,
6674 149 : loop_node)));
6675 : node = first_step_node;
6676 : } else {
6677 : node = loop_node;
6678 : }
6679 : }
6680 85705 : if (is_one_byte) {
6681 14680 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6682 : // Do it again to propagate the new nodes to places where they were not
6683 : // put because they had not been calculated yet.
6684 14680 : if (node != nullptr) {
6685 14385 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6686 : }
6687 71025 : } else if (is_unicode && (is_global || is_sticky)) {
6688 40 : node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6689 : }
6690 :
6691 85705 : if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6692 85705 : data->node = node;
6693 : Analysis analysis(isolate, is_one_byte);
6694 85705 : analysis.EnsureAnalyzed(node);
6695 85705 : if (analysis.has_failed()) {
6696 : const char* error_message = analysis.error_message();
6697 427 : return CompilationResult(isolate, error_message);
6698 : }
6699 :
6700 : // Create the correct assembler for the architecture.
6701 : #ifndef V8_INTERPRETED_REGEXP
6702 : DCHECK(!FLAG_jitless);
6703 :
6704 : // Native regexp implementation.
6705 :
6706 : NativeRegExpMacroAssembler::Mode mode =
6707 : is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6708 85278 : : NativeRegExpMacroAssembler::UC16;
6709 :
6710 : #if V8_TARGET_ARCH_IA32
6711 : RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6712 : (data->capture_count + 1) * 2);
6713 : #elif V8_TARGET_ARCH_X64
6714 : RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6715 170556 : (data->capture_count + 1) * 2);
6716 : #elif V8_TARGET_ARCH_ARM
6717 : RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6718 : (data->capture_count + 1) * 2);
6719 : #elif V8_TARGET_ARCH_ARM64
6720 : RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6721 : (data->capture_count + 1) * 2);
6722 : #elif V8_TARGET_ARCH_S390
6723 : RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6724 : (data->capture_count + 1) * 2);
6725 : #elif V8_TARGET_ARCH_PPC
6726 : RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6727 : (data->capture_count + 1) * 2);
6728 : #elif V8_TARGET_ARCH_MIPS
6729 : RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6730 : (data->capture_count + 1) * 2);
6731 : #elif V8_TARGET_ARCH_MIPS64
6732 : RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6733 : (data->capture_count + 1) * 2);
6734 : #else
6735 : #error "Unsupported architecture"
6736 : #endif
6737 :
6738 : #else // V8_INTERPRETED_REGEXP
6739 : // Interpreted regexp implementation.
6740 : EmbeddedVector<byte, 1024> codes;
6741 : RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6742 : #endif // V8_INTERPRETED_REGEXP
6743 :
6744 85278 : macro_assembler.set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6745 :
6746 : // Inserted here, instead of in Assembler, because it depends on information
6747 : // in the AST that isn't replicated in the Node structure.
6748 : static const int kMaxBacksearchLimit = 1024;
6749 85832 : if (is_end_anchored && !is_start_anchored && !is_sticky &&
6750 554 : max_length < kMaxBacksearchLimit) {
6751 210 : macro_assembler.SetCurrentPositionFromEnd(max_length);
6752 : }
6753 :
6754 85278 : if (is_global) {
6755 : RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6756 3823 : if (data->tree->min_match() > 0) {
6757 : mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6758 138 : } else if (is_unicode) {
6759 : mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6760 : }
6761 : macro_assembler.set_global_mode(mode);
6762 : }
6763 :
6764 : return compiler.Assemble(isolate, ¯o_assembler, node, data->capture_count,
6765 85278 : pattern);
6766 : }
6767 :
6768 169831 : bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6769 169831 : Heap* heap = isolate->heap();
6770 169831 : bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6771 339662 : if (heap->isolate()->total_regexp_code_generated() >
6772 298384 : RegExpImpl::kRegExpCompiledLimit &&
6773 128553 : heap->CommittedMemoryExecutable() >
6774 : RegExpImpl::kRegExpExecutableMemoryLimit) {
6775 : too_much = true;
6776 : }
6777 169831 : return too_much;
6778 : }
6779 :
6780 36879 : Object RegExpResultsCache::Lookup(Heap* heap, String key_string,
6781 : Object key_pattern,
6782 : FixedArray* last_match_cache,
6783 : ResultsCacheType type) {
6784 : FixedArray cache;
6785 36879 : if (!key_string->IsInternalizedString()) return Smi::kZero;
6786 5309 : if (type == STRING_SPLIT_SUBSTRINGS) {
6787 : DCHECK(key_pattern->IsString());
6788 5309 : if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6789 5309 : cache = heap->string_split_cache();
6790 : } else {
6791 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6792 : DCHECK(key_pattern->IsFixedArray());
6793 0 : cache = heap->regexp_multiple_cache();
6794 : }
6795 :
6796 5309 : uint32_t hash = key_string->Hash();
6797 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6798 5309 : ~(kArrayEntriesPerCacheEntry - 1));
6799 14576 : if (cache->get(index + kStringOffset) != key_string ||
6800 3958 : cache->get(index + kPatternOffset) != key_pattern) {
6801 : index =
6802 1382 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6803 2818 : if (cache->get(index + kStringOffset) != key_string ||
6804 54 : cache->get(index + kPatternOffset) != key_pattern) {
6805 1339 : return Smi::kZero;
6806 : }
6807 : }
6808 :
6809 7940 : *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6810 3970 : return cache->get(index + kArrayOffset);
6811 : }
6812 :
6813 32909 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6814 : Handle<Object> key_pattern,
6815 : Handle<FixedArray> value_array,
6816 : Handle<FixedArray> last_match_cache,
6817 : ResultsCacheType type) {
6818 : Factory* factory = isolate->factory();
6819 : Handle<FixedArray> cache;
6820 65818 : if (!key_string->IsInternalizedString()) return;
6821 1339 : if (type == STRING_SPLIT_SUBSTRINGS) {
6822 : DCHECK(key_pattern->IsString());
6823 2678 : if (!key_pattern->IsInternalizedString()) return;
6824 : cache = factory->string_split_cache();
6825 : } else {
6826 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6827 : DCHECK(key_pattern->IsFixedArray());
6828 : cache = factory->regexp_multiple_cache();
6829 : }
6830 :
6831 1339 : uint32_t hash = key_string->Hash();
6832 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6833 1339 : ~(kArrayEntriesPerCacheEntry - 1));
6834 2678 : if (cache->get(index + kStringOffset) == Smi::kZero) {
6835 2378 : cache->set(index + kStringOffset, *key_string);
6836 2378 : cache->set(index + kPatternOffset, *key_pattern);
6837 2378 : cache->set(index + kArrayOffset, *value_array);
6838 2378 : cache->set(index + kLastMatchOffset, *last_match_cache);
6839 : } else {
6840 : uint32_t index2 =
6841 150 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6842 300 : if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6843 190 : cache->set(index2 + kStringOffset, *key_string);
6844 190 : cache->set(index2 + kPatternOffset, *key_pattern);
6845 190 : cache->set(index2 + kArrayOffset, *value_array);
6846 190 : cache->set(index2 + kLastMatchOffset, *last_match_cache);
6847 : } else {
6848 55 : cache->set(index2 + kStringOffset, Smi::kZero);
6849 110 : cache->set(index2 + kPatternOffset, Smi::kZero);
6850 110 : cache->set(index2 + kArrayOffset, Smi::kZero);
6851 110 : cache->set(index2 + kLastMatchOffset, Smi::kZero);
6852 110 : cache->set(index + kStringOffset, *key_string);
6853 110 : cache->set(index + kPatternOffset, *key_pattern);
6854 110 : cache->set(index + kArrayOffset, *value_array);
6855 110 : cache->set(index + kLastMatchOffset, *last_match_cache);
6856 : }
6857 : }
6858 : // If the array is a reasonably short list of substrings, convert it into a
6859 : // list of internalized strings.
6860 2678 : if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6861 13579 : for (int i = 0; i < value_array->length(); i++) {
6862 : Handle<String> str(String::cast(value_array->get(i)), isolate);
6863 6138 : Handle<String> internalized_str = factory->InternalizeString(str);
6864 12276 : value_array->set(i, *internalized_str);
6865 : }
6866 : }
6867 : // Convert backing store to a copy-on-write array.
6868 : value_array->set_map_no_write_barrier(
6869 : ReadOnlyRoots(isolate).fixed_cow_array_map());
6870 : }
6871 :
6872 166984 : void RegExpResultsCache::Clear(FixedArray cache) {
6873 42914888 : for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6874 42747904 : cache->set(i, Smi::kZero);
6875 : }
6876 166984 : }
6877 :
6878 : } // namespace internal
6879 183867 : } // namespace v8
|