Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/jsregexp.h"
6 :
7 : #include <memory>
8 : #include <vector>
9 :
10 : #include "src/base/platform/platform.h"
11 : #include "src/compilation-cache.h"
12 : #include "src/elements.h"
13 : #include "src/execution.h"
14 : #include "src/factory.h"
15 : #include "src/isolate-inl.h"
16 : #include "src/messages.h"
17 : #include "src/ostreams.h"
18 : #include "src/regexp/interpreter-irregexp.h"
19 : #include "src/regexp/jsregexp-inl.h"
20 : #include "src/regexp/regexp-macro-assembler-irregexp.h"
21 : #include "src/regexp/regexp-macro-assembler-tracer.h"
22 : #include "src/regexp/regexp-macro-assembler.h"
23 : #include "src/regexp/regexp-parser.h"
24 : #include "src/regexp/regexp-stack.h"
25 : #include "src/runtime/runtime.h"
26 : #include "src/splay-tree-inl.h"
27 : #include "src/string-search.h"
28 : #include "src/unicode-decoder.h"
29 : #include "src/unicode-inl.h"
30 :
31 : #ifdef V8_INTL_SUPPORT
32 : #include "unicode/uniset.h"
33 : #include "unicode/utypes.h"
34 : #endif // V8_INTL_SUPPORT
35 :
36 : #ifndef V8_INTERPRETED_REGEXP
37 : #if V8_TARGET_ARCH_IA32
38 : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
39 : #elif V8_TARGET_ARCH_X64
40 : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
41 : #elif V8_TARGET_ARCH_ARM64
42 : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
43 : #elif V8_TARGET_ARCH_ARM
44 : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
45 : #elif V8_TARGET_ARCH_PPC
46 : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
47 : #elif V8_TARGET_ARCH_S390
48 : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
49 : #elif V8_TARGET_ARCH_MIPS
50 : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
51 : #elif V8_TARGET_ARCH_MIPS64
52 : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
53 : #else
54 : #error Unsupported target architecture.
55 : #endif
56 : #endif
57 :
58 :
59 : namespace v8 {
60 : namespace internal {
61 :
62 : MUST_USE_RESULT
63 2998 : static inline MaybeHandle<Object> ThrowRegExpException(
64 : Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
65 : Isolate* isolate = re->GetIsolate();
66 5996 : THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 : pattern, error_text),
68 : Object);
69 : }
70 :
71 :
72 458 : inline void ThrowRegExpException(Handle<JSRegExp> re,
73 : Handle<String> error_text) {
74 458 : USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
75 458 : }
76 :
77 :
78 1066172 : ContainedInLattice AddRange(ContainedInLattice containment,
79 : const int* ranges,
80 : int ranges_length,
81 : Interval new_range) {
82 : DCHECK_EQ(1, ranges_length & 1);
83 : DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
84 1066172 : if (containment == kLatticeUnknown) return containment;
85 : bool inside = false;
86 : int last = 0;
87 3674031 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 : // Consider the range from last to ranges[i].
89 : // We haven't got to the new range yet.
90 4576567 : if (ranges[i] <= new_range.from()) continue;
91 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 : // inclusive, but the values in ranges are not.
93 902536 : if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 1770034 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 : }
96 : return kLatticeUnknown;
97 : }
98 : return containment;
99 : }
100 :
101 : // Generic RegExp methods. Dispatches to implementation specific methods.
102 :
// Length threshold consulted by RegExpImpl::Compile: patterns (or single-atom
// parse trees) of at most this many characters are compiled as ATOM regexps.
103 : // In a 3-character pattern you can maximally step forwards 3 characters
104 : // at a time, which is not always enough to pay for the extra logic.
105 : const int kPatternTooShortForBoyerMoore = 2;
106 :
107 518367 : MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
108 : Handle<String> pattern,
109 : JSRegExp::Flags flags) {
110 : DCHECK(pattern->IsFlat());
111 :
112 1036734 : Isolate* isolate = re->GetIsolate();
113 518367 : Zone zone(isolate->allocator(), ZONE_NAME);
114 : CompilationCache* compilation_cache = isolate->compilation_cache();
115 : MaybeHandle<FixedArray> maybe_cached =
116 518367 : compilation_cache->LookupRegExp(pattern, flags);
117 : Handle<FixedArray> cached;
118 518367 : if (maybe_cached.ToHandle(&cached)) {
119 217504 : re->set_data(*cached);
120 217504 : return re;
121 : }
122 :
123 : PostponeInterruptsScope postpone(isolate);
124 : RegExpCompileData parse_result;
125 300863 : FlatStringReader reader(isolate, pattern);
126 : DCHECK(!isolate->has_pending_exception());
127 300863 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
128 300863 : &parse_result)) {
129 : // Throw an exception if we fail to parse the pattern.
130 2456 : return ThrowRegExpException(re, pattern, parse_result.error);
131 : }
132 :
133 : bool has_been_compiled = false;
134 :
135 696550 : if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
136 497211 : !(flags & JSRegExp::kSticky) &&
137 : pattern->length() <= kPatternTooShortForBoyerMoore) {
138 : // Parse-tree is a single atom that is equal to the pattern.
139 1005 : AtomCompile(re, pattern, flags, pattern);
140 : has_been_compiled = true;
141 1004068 : } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
142 501746 : !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
143 204332 : RegExpAtom* atom = parse_result.tree->AsAtom();
144 204332 : Vector<const uc16> atom_pattern = atom->data();
145 204332 : if (atom_pattern.length() <= kPatternTooShortForBoyerMoore) {
146 : Handle<String> atom_string;
147 6880 : ASSIGN_RETURN_ON_EXCEPTION(
148 : isolate, atom_string,
149 : isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
150 3440 : AtomCompile(re, pattern, flags, atom_string);
151 : has_been_compiled = true;
152 : }
153 : }
154 298407 : if (!has_been_compiled) {
155 293962 : IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
156 : }
157 : DCHECK(re->data()->IsFixedArray());
158 : // Compilation succeeded so the data is set on the regexp
159 : // and we can store it in the cache.
160 : Handle<FixedArray> data(FixedArray::cast(re->data()));
161 298407 : compilation_cache->PutRegExp(pattern, flags, data);
162 :
163 816774 : return re;
164 : }
165 :
166 224336 : MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
167 : Handle<String> subject, int index,
168 : Handle<RegExpMatchInfo> last_match_info) {
169 224336 : switch (regexp->TypeTag()) {
170 : case JSRegExp::ATOM:
171 294 : return AtomExec(regexp, subject, index, last_match_info);
172 : case JSRegExp::IRREGEXP: {
173 224042 : return IrregexpExec(regexp, subject, index, last_match_info);
174 : }
175 : default:
176 0 : UNREACHABLE();
177 : }
178 : }
179 :
180 :
181 : // RegExp Atom implementation: Simple string search using indexOf.
182 :
183 :
184 4445 : void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
185 : Handle<String> pattern,
186 : JSRegExp::Flags flags,
187 : Handle<String> match_pattern) {
188 : re->GetIsolate()->factory()->SetRegExpAtomData(re,
189 : JSRegExp::ATOM,
190 : pattern,
191 : flags,
192 4445 : match_pattern);
193 4445 : }
194 :
195 280 : static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,
196 : String* subject, int from, int to) {
197 : SealHandleScope shs(last_match_info->GetIsolate());
198 : last_match_info->SetNumberOfCaptureRegisters(2);
199 : last_match_info->SetLastSubject(subject);
200 : last_match_info->SetLastInput(subject);
201 : last_match_info->SetCapture(0, from);
202 : last_match_info->SetCapture(1, to);
203 280 : }
204 :
205 :
206 90550 : int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
207 : Handle<String> subject,
208 : int index,
209 : int32_t* output,
210 : int output_size) {
211 : Isolate* isolate = regexp->GetIsolate();
212 :
213 : DCHECK_LE(0, index);
214 : DCHECK_LE(index, subject->length());
215 :
216 90550 : subject = String::Flatten(subject);
217 : DisallowHeapAllocation no_gc; // ensure vectors stay valid
218 :
219 : String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
220 : int needle_len = needle->length();
221 : DCHECK(needle->IsFlat());
222 : DCHECK_LT(0, needle_len);
223 :
224 181100 : if (index + needle_len > subject->length()) {
225 : return RegExpImpl::RE_FAILURE;
226 : }
227 :
228 91535 : for (int i = 0; i < output_size; i += 2) {
229 181803 : String::FlatContent needle_content = needle->GetFlatContent();
230 181803 : String::FlatContent subject_content = subject->GetFlatContent();
231 : DCHECK(needle_content.IsFlat());
232 : DCHECK(subject_content.IsFlat());
233 : // dispatch on type of strings
234 : index =
235 181803 : (needle_content.IsOneByte()
236 : ? (subject_content.IsOneByte()
237 : ? SearchString(isolate, subject_content.ToOneByteVector(),
238 : needle_content.ToOneByteVector(), index)
239 : : SearchString(isolate, subject_content.ToUC16Vector(),
240 : needle_content.ToOneByteVector(), index))
241 : : (subject_content.IsOneByte()
242 : ? SearchString(isolate, subject_content.ToOneByteVector(),
243 : needle_content.ToUC16Vector(), index)
244 : : SearchString(isolate, subject_content.ToUC16Vector(),
245 363606 : needle_content.ToUC16Vector(), index)));
246 181803 : if (index == -1) {
247 90268 : return i / 2; // Return number of matches.
248 : } else {
249 91535 : output[i] = index;
250 91535 : output[i+1] = index + needle_len;
251 : index += needle_len;
252 : }
253 : }
254 280 : return output_size / 2;
255 : }
256 :
257 294 : Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
258 : int index,
259 : Handle<RegExpMatchInfo> last_match_info) {
260 : Isolate* isolate = re->GetIsolate();
261 :
262 : static const int kNumRegisters = 2;
263 : STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
264 294 : int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
265 :
266 294 : int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
267 :
268 308 : if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
269 :
270 : DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
271 : SealHandleScope shs(isolate);
272 : SetAtomLastCapture(last_match_info, *subject, output_registers[0],
273 560 : output_registers[1]);
274 280 : return last_match_info;
275 : }
276 :
277 :
278 : // Irregexp implementation.
279 :
280 : // Ensures that the regexp object contains a compiled version of the
281 : // source for either one-byte or two-byte subject strings.
282 : // If the compiled version doesn't already exist, it is compiled
283 : // from the source pattern.
284 : // If compilation fails, an exception is thrown and this function
285 : // returns false.
286 971807 : bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
287 : Handle<String> sample_subject,
288 : bool is_one_byte) {
289 : Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
290 : #ifdef V8_INTERPRETED_REGEXP
291 : if (compiled_code->IsByteArray()) return true;
292 : #else // V8_INTERPRETED_REGEXP (RegExp native code)
293 971807 : if (compiled_code->IsCode()) return true;
294 : #endif
295 94054 : return CompileIrregexp(re, sample_subject, is_one_byte);
296 : }
297 :
298 :
299 94054 : bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
300 : Handle<String> sample_subject,
301 : bool is_one_byte) {
302 : // Compile the RegExp.
303 94054 : Isolate* isolate = re->GetIsolate();
304 94054 : Zone zone(isolate->allocator(), ZONE_NAME);
305 : PostponeInterruptsScope postpone(isolate);
306 : #ifdef DEBUG
307 : Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
308 : // When arriving here entry can only be a smi representing an uncompiled
309 : // regexp.
310 : DCHECK(entry->IsSmi());
311 : int entry_value = Smi::ToInt(entry);
312 : DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
313 : #endif
314 :
315 94054 : JSRegExp::Flags flags = re->GetFlags();
316 :
317 : Handle<String> pattern(re->Pattern());
318 94054 : pattern = String::Flatten(pattern);
319 : RegExpCompileData compile_data;
320 94054 : FlatStringReader reader(isolate, pattern);
321 94054 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
322 94054 : &compile_data)) {
323 : // Throw an exception if we fail to parse the pattern.
324 : // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
325 84 : USE(ThrowRegExpException(re, pattern, compile_data.error));
326 84 : return false;
327 : }
328 : RegExpEngine::CompilationResult result =
329 : RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
330 93970 : sample_subject, is_one_byte);
331 93970 : if (result.error_message != nullptr) {
332 : // Unable to compile regexp.
333 : Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
334 916 : CStrVector(result.error_message)).ToHandleChecked();
335 458 : ThrowRegExpException(re, error_message);
336 : return false;
337 : }
338 :
339 : Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
340 187024 : data->set(JSRegExp::code_index(is_one_byte), result.code);
341 93512 : SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
342 : int register_max = IrregexpMaxRegisterCount(*data);
343 93512 : if (result.num_registers > register_max) {
344 : SetIrregexpMaxRegisterCount(*data, result.num_registers);
345 : }
346 :
347 94054 : return true;
348 : }
349 :
350 :
351 0 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
352 : return Smi::cast(
353 0 : re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
354 : }
355 :
356 :
357 0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
358 : re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
359 0 : }
360 :
361 93512 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
362 : Handle<FixedArray> value) {
363 93512 : if (value.is_null()) {
364 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
365 : } else {
366 355 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
367 : }
368 93512 : }
369 :
370 0 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
371 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
372 : }
373 :
374 :
375 0 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
376 0 : return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
377 : }
378 :
379 :
380 0 : ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
381 0 : return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
382 : }
383 :
384 :
385 0 : Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
386 0 : return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
387 : }
388 :
389 :
390 293962 : void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
391 : Handle<String> pattern,
392 : JSRegExp::Flags flags,
393 : int capture_count) {
394 : // Initialize compiled code entries to null.
395 : re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
396 : JSRegExp::IRREGEXP,
397 : pattern,
398 : flags,
399 293962 : capture_count);
400 293962 : }
401 :
402 :
403 372909 : int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
404 : Handle<String> subject) {
405 : DCHECK(subject->IsFlat());
406 :
407 : // Check representation of the underlying storage.
408 372909 : bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
409 372909 : if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
410 :
411 : #ifdef V8_INTERPRETED_REGEXP
412 : // Byte-code regexp needs space allocated for all its registers.
413 : // The result captures are copied to the start of the registers array
414 : // if the match succeeds. This way those registers are not clobbered
415 : // when we set the last match info from last successful match.
416 : return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
417 : (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
418 : #else // V8_INTERPRETED_REGEXP
419 : // Native regexp only needs room to output captures. Registers are handled
420 : // internally.
421 372367 : return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
422 : #endif // V8_INTERPRETED_REGEXP
423 : }
424 :
425 :
426 598898 : int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
427 : Handle<String> subject,
428 : int index,
429 : int32_t* output,
430 : int output_size) {
431 : Isolate* isolate = regexp->GetIsolate();
432 :
433 : Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
434 :
435 : DCHECK_LE(0, index);
436 : DCHECK_LE(index, subject->length());
437 : DCHECK(subject->IsFlat());
438 :
439 598898 : bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
440 :
441 : #ifndef V8_INTERPRETED_REGEXP
442 : DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
443 : do {
444 598898 : EnsureCompiledIrregexp(regexp, subject, is_one_byte);
445 : Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
446 : // The stack is used to allocate registers for the compiled regexp code.
447 : // This means that in case of failure, the output registers array is left
448 : // untouched and contains the capture results from the previous successful
449 : // match. We can use that to set the last match info lazily.
450 : NativeRegExpMacroAssembler::Result res =
451 : NativeRegExpMacroAssembler::Match(code,
452 : subject,
453 : output,
454 : output_size,
455 : index,
456 598898 : isolate);
457 598898 : if (res != NativeRegExpMacroAssembler::RETRY) {
458 : DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
459 : isolate->has_pending_exception());
460 : STATIC_ASSERT(
461 : static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
462 : STATIC_ASSERT(
463 : static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
464 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
465 : == RE_EXCEPTION);
466 598898 : return static_cast<IrregexpResult>(res);
467 : }
468 : // If result is RETRY, the string has changed representation, and we
469 : // must restart from scratch.
470 : // In this case, it means we must make sure we are prepared to handle
471 : // the, potentially, different subject (the string can switch between
472 : // being internal and external, and even between being Latin1 and UC16,
473 : // but the characters are always the same).
474 0 : IrregexpPrepare(regexp, subject);
475 0 : is_one_byte = subject->IsOneByteRepresentationUnderneath();
476 : } while (true);
477 0 : UNREACHABLE();
478 : #else // V8_INTERPRETED_REGEXP
479 :
480 : DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
481 : // We must have done EnsureCompiledIrregexp, so we can get the number of
482 : // registers.
483 : int number_of_capture_registers =
484 : (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
485 : int32_t* raw_output = &output[number_of_capture_registers];
486 : // We do not touch the actual capture result registers until we know there
487 : // has been a match so that we can use those capture results to set the
488 : // last match info.
489 : for (int i = number_of_capture_registers - 1; i >= 0; i--) {
490 : raw_output[i] = -1;
491 : }
492 : Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
493 : isolate);
494 :
495 : IrregexpResult result = IrregexpInterpreter::Match(isolate,
496 : byte_codes,
497 : subject,
498 : raw_output,
499 : index);
500 : if (result == RE_SUCCESS) {
501 : // Copy capture results to the start of the registers array.
502 : MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
503 : }
504 : if (result == RE_EXCEPTION) {
505 : DCHECK(!isolate->has_pending_exception());
506 : isolate->StackOverflow();
507 : }
508 : return result;
509 : #endif // V8_INTERPRETED_REGEXP
510 : }
511 :
512 224042 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
513 : Handle<JSRegExp> regexp, Handle<String> subject, int previous_index,
514 : Handle<RegExpMatchInfo> last_match_info) {
515 : Isolate* isolate = regexp->GetIsolate();
516 : DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
517 :
518 224042 : subject = String::Flatten(subject);
519 :
520 : // Prepare space for the return values.
521 : #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
522 : if (FLAG_trace_regexp_bytecodes) {
523 : String* pattern = regexp->Pattern();
524 : PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
525 : PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
526 : }
527 : #endif
528 224042 : int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
529 224042 : if (required_registers < 0) {
530 : // Compiling failed with an exception.
531 : DCHECK(isolate->has_pending_exception());
532 242 : return MaybeHandle<Object>();
533 : }
534 :
535 : int32_t* output_registers = nullptr;
536 223800 : if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
537 1802 : output_registers = NewArray<int32_t>(required_registers);
538 : }
539 : std::unique_ptr<int32_t[]> auto_release(output_registers);
540 223800 : if (output_registers == nullptr) {
541 221998 : output_registers = isolate->jsregexp_static_offsets_vector();
542 : }
543 :
544 : int res = RegExpImpl::IrregexpExecRaw(
545 223800 : regexp, subject, previous_index, output_registers, required_registers);
546 223800 : if (res == RE_SUCCESS) {
547 : int capture_count =
548 : IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
549 : return SetLastMatchInfo(
550 202088 : last_match_info, subject, capture_count, output_registers);
551 : }
552 21712 : if (res == RE_EXCEPTION) {
553 : DCHECK(isolate->has_pending_exception());
554 131 : return MaybeHandle<Object>();
555 : }
556 : DCHECK(res == RE_FAILURE);
557 21581 : return isolate->factory()->null_value();
558 : }
559 :
560 309058 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
561 : Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
562 : int capture_count, int32_t* match) {
563 : // This is the only place where match infos can grow. If, after executing the
564 : // regexp, RegExpExecStub finds that the match info is too small, it restarts
565 : // execution in RegExpImpl::Exec, which finally grows the match info right
566 : // here.
567 :
568 309058 : int capture_register_count = (capture_count + 1) * 2;
569 : Handle<RegExpMatchInfo> result =
570 309058 : RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count);
571 : result->SetNumberOfCaptureRegisters(capture_register_count);
572 :
573 309058 : if (*result != *last_match_info) {
574 : // The match info has been reallocated, update the corresponding reference
575 : // on the native context.
576 : Isolate* isolate = last_match_info->GetIsolate();
577 6880 : if (*last_match_info == *isolate->regexp_last_match_info()) {
578 4676 : isolate->native_context()->set_regexp_last_match_info(*result);
579 2204 : } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
580 2204 : isolate->native_context()->set_regexp_internal_match_info(*result);
581 : }
582 : }
583 :
584 : DisallowHeapAllocation no_allocation;
585 309058 : if (match != nullptr) {
586 1024333 : for (int i = 0; i < capture_register_count; i += 2) {
587 1024333 : result->SetCapture(i, match[i]);
588 1024333 : result->SetCapture(i + 1, match[i + 1]);
589 : }
590 : }
591 : result->SetLastSubject(*subject);
592 : result->SetLastInput(*subject);
593 309058 : return result;
594 : }
595 :
596 229574 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
597 : Handle<String> subject, Isolate* isolate)
598 : : register_array_(nullptr),
599 : register_array_size_(0),
600 : regexp_(regexp),
601 229574 : subject_(subject) {
602 : #ifdef V8_INTERPRETED_REGEXP
603 : bool interpreted = true;
604 : #else
605 : bool interpreted = false;
606 : #endif // V8_INTERPRETED_REGEXP
607 :
608 229574 : if (regexp_->TypeTag() == JSRegExp::ATOM) {
609 : static const int kAtomRegistersPerMatch = 2;
610 90256 : registers_per_match_ = kAtomRegistersPerMatch;
611 : // There is no distinction between interpreted and native for atom regexps.
612 : interpreted = false;
613 : } else {
614 139318 : registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
615 139318 : if (registers_per_match_ < 0) {
616 138 : num_matches_ = -1; // Signal exception.
617 229712 : return;
618 : }
619 : }
620 :
621 : DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
622 : if (!interpreted) {
623 : register_array_size_ =
624 458872 : Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
625 229436 : max_matches_ = register_array_size_ / registers_per_match_;
626 : } else {
627 : // Global loop in interpreted regexp is not implemented. We choose
628 : // the size of the offsets vector so that it can only store one match.
629 : register_array_size_ = registers_per_match_;
630 : max_matches_ = 1;
631 : }
632 :
633 229436 : if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
634 1209 : register_array_ = NewArray<int32_t>(register_array_size_);
635 : } else {
636 228227 : register_array_ = isolate->jsregexp_static_offsets_vector();
637 : }
638 :
639 : // Set state so that fetching the results the first time triggers a call
640 : // to the compiled regexp.
641 229436 : current_match_index_ = max_matches_ - 1;
642 229436 : num_matches_ = max_matches_;
643 : DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
644 : DCHECK_GE(register_array_size_, registers_per_match_);
645 : int32_t* last_match =
646 229436 : ®ister_array_[current_match_index_ * registers_per_match_];
647 229436 : last_match[0] = -1;
648 229436 : last_match[1] = 0;
649 : }
650 :
651 289 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
652 568 : if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
653 558 : last_index + 1 < subject_->length() &&
654 847 : unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
655 279 : unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
656 : // Advance over the surrogate pair.
657 279 : return last_index + 2;
658 : }
659 10 : return last_index + 1;
660 : }
661 :
662 : // -------------------------------------------------------------------
663 : // Implementation of the Irregexp regular expression engine.
664 : //
665 : // The Irregexp regular expression engine is intended to be a complete
666 : // implementation of ECMAScript regular expressions. It generates either
667 : // bytecodes or native code.
668 :
669 : // The Irregexp regexp engine is structured in three steps.
670 : // 1) The parser generates an abstract syntax tree. See ast.cc.
671 : // 2) From the AST a node network is created. The nodes are all
672 : // subclasses of RegExpNode. The nodes represent states when
673 : // executing a regular expression. Several optimizations are
674 : // performed on the node network.
675 : // 3) From the nodes we generate either byte codes or native code
676 : // that can actually execute the regular expression (perform
677 : // the search). The code generation step is described in more
678 : // detail below.
679 :
680 : // Code generation.
681 : //
682 : // The nodes are divided into four main categories.
683 : // * Choice nodes
684 : // These represent places where the regular expression can
685 : // match in more than one way. For example on entry to an
686 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
687 : // * Action nodes
688 : // These represent places where some action should be
689 : // performed. Examples include recording the current position
690 : // in the input string to a register (in order to implement
691 : // captures) or other actions on register for example in order
692 : // to implement the counters needed for {} repetitions.
693 : // * Matching nodes
694 : // These attempt to match some element part of the input string.
695 : // Examples of elements include character classes, plain strings
696 : // or back references.
697 : // * End nodes
698 : // These are used to implement the actions required on finding
699 : // a successful match or failing to find a match.
700 : //
701 : // The code generated (whether as byte codes or native code) maintains
702 : // some state as it runs. This consists of the following elements:
703 : //
704 : // * The capture registers. Used for string captures.
705 : // * Other registers. Used for counters etc.
706 : // * The current position.
707 : // * The stack of backtracking information. Used when a matching node
708 : // fails to find a match and needs to try an alternative.
709 : //
710 : // Conceptual regular expression execution model:
711 : //
712 : // There is a simple conceptual model of regular expression execution
713 : // which will be presented first. The actual code generated is a more
714 : // efficient simulation of the simple conceptual model:
715 : //
716 : // * Choice nodes are implemented as follows:
717 : // For each choice except the last {
718 : // push current position
719 : // push backtrack code location
720 : // <generate code to test for choice>
721 : // backtrack code location:
722 : // pop current position
723 : // }
724 : // <generate code to test for last choice>
725 : //
726 : // * Action nodes are generated as follows:
727 : // <push affected registers on backtrack stack>
728 : // <generate code to perform action>
729 : // push backtrack code location
730 : // <generate code to test for following nodes>
731 : // backtrack code location:
732 : // <pop affected registers to restore their state>
733 : // <pop backtrack location from stack and go to it>
734 : //
735 : // * Matching nodes are generated as follows:
736 : // if input string matches at current position
737 : // update current position
738 : // <generate code to test for following nodes>
739 : // else
740 : // <pop backtrack location from stack and go to it>
741 : //
742 : // Thus it can be seen that the current position is saved and restored
743 : // by the choice nodes, whereas the registers are saved and restored by
744 : // the action nodes that manipulate them.
745 : //
746 : // The other interesting aspect of this model is that nodes are generated
747 : // at the point where they are needed by a recursive call to Emit(). If
748 : // the node has already been code generated then the Emit() call will
749 : // generate a jump to the previously generated code instead. In order to
750 : // limit recursion it is possible for the Emit() function to put the node
751 : // on a work list for later generation and instead generate a jump. The
752 : // destination of the jump is resolved later when the code is generated.
753 : //
754 : // Actual regular expression code generation.
755 : //
756 : // Code generation is actually more complicated than the above. In order
757 : // to improve the efficiency of the generated code some optimizations are
758 : // performed:
759 : //
760 : // * Choice nodes have 1-character lookahead.
761 : // A choice node looks at the following character and eliminates some of
762 : // the choices immediately based on that character. This is not yet
763 : // implemented.
764 : // * Simple greedy loops store reduced backtracking information.
765 : // A quantifier like /.*foo/m will greedily match the whole input. It will
766 : // then need to backtrack to a point where it can match "foo". The naive
767 : // implementation of this would push each character position onto the
768 : // backtracking stack, then pop them off one by one. This would use space
769 : // proportional to the length of the input string. However since the "."
770 : // can only match in one way and always has a constant length (in this case
771 : // of 1) it suffices to store the current position on the top of the stack
772 : // once. Matching now becomes merely incrementing the current position and
773 : // backtracking becomes decrementing the current position and checking the
774 : // result against the stored current position. This is faster and saves
775 : // space.
776 : // * The current state is virtualized.
777 : // This is used to defer expensive operations until it is clear that they
778 : // are needed and to generate code for a node more than once, allowing
779 : // specialized and efficient versions of the code to be created. This is
780 : // explained in the section below.
781 : //
782 : // Execution state virtualization.
783 : //
784 : // Instead of emitting code, nodes that manipulate the state can record their
785 : // manipulation in an object called the Trace. The Trace object can record a
786 : // current position offset, an optional backtrack code location on the top of
787 : // the virtualized backtrack stack and some register changes. When a node is
788 : // to be emitted it can flush the Trace or update it. Flushing the Trace
789 : // will emit code to bring the actual state into line with the virtual state.
790 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
791 : // registers). Postponing work can save time when executing the regular
792 : // expression since it may be found that the work never has to be done as a
793 : // failure to match can occur. In addition it is much faster to jump to a
794 : // known backtrack code location than it is to pop an unknown backtrack
795 : // location from the stack and jump there.
796 : //
797 : // The virtual state found in the Trace affects code generation. For example
798 : // the virtual state contains the difference between the actual current
799 : // position and the virtual current position, and matching code needs to use
800 : // this offset to attempt a match in the correct location of the input
801 : // string. Therefore code generated for a non-trivial trace is specialized
802 : // to that trace. The code generator therefore has the ability to generate
803 : // code for each node several times. In order to limit the size of the
804 : // generated code there is an arbitrary limit on how many specialized sets of
805 : // code may be generated for a given node. If the limit is reached, the
806 : // trace is flushed and a generic version of the code for a node is emitted.
807 : // This is subsequently used for that node. The code emitted for non-generic
808 : // trace is not recorded in the node and so it cannot currently be reused in
809 : // the event that code generation is requested for an identical trace.
810 :
811 :
812 0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
813 0 : UNREACHABLE();
814 : }
815 :
816 :
817 7537 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
818 7537 : text->AddElement(TextElement::Atom(this), zone);
819 7537 : }
820 :
821 :
822 8987 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
823 8987 : text->AddElement(TextElement::CharClass(this), zone);
824 8987 : }
825 :
826 :
827 0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
828 0 : for (int i = 0; i < elements()->length(); i++)
829 0 : text->AddElement(elements()->at(i), zone);
830 0 : }
831 :
832 :
833 0 : TextElement TextElement::Atom(RegExpAtom* atom) {
834 0 : return TextElement(ATOM, atom);
835 : }
836 :
837 :
838 0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
839 0 : return TextElement(CHAR_CLASS, char_class);
840 : }
841 :
842 :
843 8279570 : int TextElement::length() const {
844 8279570 : switch (text_type()) {
845 : case ATOM:
846 7378597 : return atom()->length();
847 :
848 : case CHAR_CLASS:
849 : return 1;
850 : }
851 0 : UNREACHABLE();
852 : }
853 :
854 :
855 0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
856 0 : if (table_ == nullptr) {
857 0 : table_ = new(zone()) DispatchTable(zone());
858 : DispatchTableConstructor cons(table_, ignore_case, zone());
859 0 : cons.BuildTable(this);
860 : }
861 0 : return table_;
862 : }
863 :
864 :
865 : class FrequencyCollator {
866 : public:
867 12121614 : FrequencyCollator() : total_samples_(0) {
868 12027648 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
869 12027648 : frequencies_[i] = CharacterFrequency(i);
870 : }
871 : }
872 :
873 : void CountCharacter(int character) {
874 904114 : int index = (character & RegExpMacroAssembler::kTableMask);
875 904114 : frequencies_[index].Increment();
876 904114 : total_samples_++;
877 : }
878 :
879 : // Does not measure in percent, but rather per-128 (the table size from the
880 : // regexp macro assembler).
881 : int Frequency(int in_character) {
882 : DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
883 567013 : if (total_samples_ < 1) return 1; // Division by zero.
884 : int freq_in_per128 =
885 566668 : (frequencies_[in_character].counter() * 128) / total_samples_;
886 : return freq_in_per128;
887 : }
888 :
889 : private:
890 : class CharacterFrequency {
891 : public:
892 12027648 : CharacterFrequency() : counter_(0), character_(-1) { }
893 : explicit CharacterFrequency(int character)
894 : : counter_(0), character_(character) { }
895 :
896 904114 : void Increment() { counter_++; }
897 : int counter() { return counter_; }
898 : int character() { return character_; }
899 :
900 : private:
901 : int counter_;
902 : int character_;
903 : };
904 :
905 :
906 : private:
907 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
908 : int total_samples_;
909 : };
910 :
911 :
912 : class RegExpCompiler {
913 : public:
914 : RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
915 : JSRegExp::Flags flags, bool is_one_byte);
916 :
917 : int AllocateRegister() {
918 1012042 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
919 344670 : reg_exp_too_big_ = true;
920 : return next_register_;
921 : }
922 667372 : return next_register_++;
923 : }
924 :
925 : // Lookarounds to match lone surrogates for unicode character class matches
926 : // are never nested. We can therefore reuse registers.
927 : int UnicodeLookaroundStackRegister() {
928 2615 : if (unicode_lookaround_stack_register_ == kNoRegister) {
929 1124 : unicode_lookaround_stack_register_ = AllocateRegister();
930 : }
931 2615 : return unicode_lookaround_stack_register_;
932 : }
933 :
934 : int UnicodeLookaroundPositionRegister() {
935 2615 : if (unicode_lookaround_position_register_ == kNoRegister) {
936 1124 : unicode_lookaround_position_register_ = AllocateRegister();
937 : }
938 2615 : return unicode_lookaround_position_register_;
939 : }
940 :
941 : RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
942 : RegExpNode* start,
943 : int capture_count,
944 : Handle<String> pattern);
945 :
946 609095 : inline void AddWork(RegExpNode* node) {
947 609095 : if (!node->on_work_list() && !node->label()->is_bound()) {
948 : node->set_on_work_list(true);
949 215416 : work_list_->push_back(node);
950 : }
951 609095 : }
952 :
953 : static const int kImplementationOffset = 0;
954 : static const int kNumberOfRegistersOffset = 0;
955 : static const int kCodeOffset = 1;
956 :
957 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
958 : EndNode* accept() { return accept_; }
959 :
960 : static const int kMaxRecursion = 100;
961 : inline int recursion_depth() { return recursion_depth_; }
962 1127194 : inline void IncrementRecursionDepth() { recursion_depth_++; }
963 1127194 : inline void DecrementRecursionDepth() { recursion_depth_--; }
964 :
965 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
966 :
967 : inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
968 12575 : inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
969 : // Both unicode and ignore_case flags are set. We need to use ICU to find
970 : // the closure over case equivalents.
971 : inline bool needs_unicode_case_equivalents() {
972 206075 : return unicode() && ignore_case();
973 : }
974 : inline bool one_byte() { return one_byte_; }
975 : inline bool optimize() { return optimize_; }
976 92778 : inline void set_optimize(bool value) { optimize_ = value; }
977 : inline bool limiting_recursion() { return limiting_recursion_; }
978 : inline void set_limiting_recursion(bool value) {
979 965724 : limiting_recursion_ = value;
980 : }
981 : bool read_backward() { return read_backward_; }
982 3390 : void set_read_backward(bool value) { read_backward_ = value; }
983 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
984 :
985 : int current_expansion_factor() { return current_expansion_factor_; }
986 : void set_current_expansion_factor(int value) {
987 115753 : current_expansion_factor_ = value;
988 : }
989 :
990 : Isolate* isolate() const { return isolate_; }
991 : Zone* zone() const { return zone_; }
992 :
993 : static const int kNoRegister = -1;
994 :
995 : private:
996 : EndNode* accept_;
997 : int next_register_;
998 : int unicode_lookaround_stack_register_;
999 : int unicode_lookaround_position_register_;
1000 : std::vector<RegExpNode*>* work_list_;
1001 : int recursion_depth_;
1002 : RegExpMacroAssembler* macro_assembler_;
1003 : JSRegExp::Flags flags_;
1004 : bool one_byte_;
1005 : bool reg_exp_too_big_;
1006 : bool limiting_recursion_;
1007 : bool optimize_;
1008 : bool read_backward_;
1009 : int current_expansion_factor_;
1010 : FrequencyCollator frequency_collator_;
1011 : Isolate* isolate_;
1012 : Zone* zone_;
1013 : };
1014 :
1015 :
1016 : class RecursionCheck {
1017 : public:
1018 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1019 : compiler->IncrementRecursionDepth();
1020 : }
1021 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1022 : private:
1023 : RegExpCompiler* compiler_;
1024 : };
1025 :
1026 :
1027 : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1028 : return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1029 : }
1030 :
1031 :
1032 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1033 : // a fixed array or a null handle depending on whether it succeeded.
1034 93966 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1035 : JSRegExp::Flags flags, bool one_byte)
1036 93966 : : next_register_(2 * (capture_count + 1)),
1037 : unicode_lookaround_stack_register_(kNoRegister),
1038 : unicode_lookaround_position_register_(kNoRegister),
1039 : work_list_(nullptr),
1040 : recursion_depth_(0),
1041 : flags_(flags),
1042 : one_byte_(one_byte),
1043 : reg_exp_too_big_(false),
1044 : limiting_recursion_(false),
1045 : optimize_(FLAG_regexp_optimization),
1046 : read_backward_(false),
1047 : current_expansion_factor_(1),
1048 : frequency_collator_(),
1049 : isolate_(isolate),
1050 187932 : zone_(zone) {
1051 93966 : accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1052 : DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1053 93966 : }
1054 :
1055 :
1056 93518 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1057 : RegExpMacroAssembler* macro_assembler,
1058 : RegExpNode* start,
1059 : int capture_count,
1060 : Handle<String> pattern) {
1061 : Isolate* isolate = pattern->GetHeap()->isolate();
1062 :
1063 : #ifdef DEBUG
1064 : if (FLAG_trace_regexp_assembler)
1065 : macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1066 : else
1067 : #endif
1068 93518 : macro_assembler_ = macro_assembler;
1069 :
1070 : std::vector<RegExpNode*> work_list;
1071 93518 : work_list_ = &work_list;
1072 : Label fail;
1073 93518 : macro_assembler_->PushBacktrack(&fail);
1074 93518 : Trace new_trace;
1075 93518 : start->Emit(this, &new_trace);
1076 93518 : macro_assembler_->Bind(&fail);
1077 93518 : macro_assembler_->Fail();
1078 402452 : while (!work_list.empty()) {
1079 215416 : RegExpNode* node = work_list.back();
1080 : work_list.pop_back();
1081 : node->set_on_work_list(false);
1082 215416 : if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1083 : }
1084 93518 : if (reg_exp_too_big_) {
1085 0 : macro_assembler_->AbortedCodeGeneration();
1086 0 : return IrregexpRegExpTooBig(isolate_);
1087 : }
1088 :
1089 93518 : Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1090 93518 : isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1091 93518 : work_list_ = nullptr;
1092 : #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP)
1093 : if (FLAG_print_code) {
1094 : CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1095 : OFStream os(trace_scope.file());
1096 : Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1097 : }
1098 : #endif
1099 : #ifdef DEBUG
1100 : if (FLAG_trace_regexp_assembler) {
1101 : delete macro_assembler_;
1102 : }
1103 : #endif
1104 93518 : return RegExpEngine::CompilationResult(*code, next_register_);
1105 : }
1106 :
1107 :
1108 6294312 : bool Trace::DeferredAction::Mentions(int that) {
1109 3181380 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
1110 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1111 : return range.Contains(that);
1112 : } else {
1113 3112932 : return reg() == that;
1114 : }
1115 : }
1116 :
1117 :
1118 0 : bool Trace::mentions_reg(int reg) {
1119 0 : for (DeferredAction* action = actions_; action != nullptr;
1120 : action = action->next()) {
1121 0 : if (action->Mentions(reg))
1122 : return true;
1123 : }
1124 : return false;
1125 : }
1126 :
1127 :
1128 967 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1129 : DCHECK_EQ(0, *cp_offset);
1130 1904 : for (DeferredAction* action = actions_; action != nullptr;
1131 : action = action->next()) {
1132 937 : if (action->Mentions(reg)) {
1133 414 : if (action->action_type() == ActionNode::STORE_POSITION) {
1134 414 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1135 414 : return true;
1136 : } else {
1137 : return false;
1138 : }
1139 : }
1140 : }
1141 : return false;
1142 : }
1143 :
1144 :
1145 535199 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
1146 : Zone* zone) {
1147 : int max_register = RegExpCompiler::kNoRegister;
1148 2028299 : for (DeferredAction* action = actions_; action != nullptr;
1149 : action = action->next()) {
1150 501454 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1151 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1152 69111 : for (int i = range.from(); i <= range.to(); i++)
1153 63480 : affected_registers->Set(i, zone);
1154 5631 : if (range.to() > max_register) max_register = range.to();
1155 : } else {
1156 495823 : affected_registers->Set(action->reg(), zone);
1157 495823 : if (action->reg() > max_register) max_register = action->reg();
1158 : }
1159 : }
1160 535199 : return max_register;
1161 : }
1162 :
1163 :
1164 535199 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1165 : int max_register,
1166 : const OutSet& registers_to_pop,
1167 : const OutSet& registers_to_clear) {
1168 12842751 : for (int reg = max_register; reg >= 0; reg--) {
1169 12307552 : if (registers_to_pop.Get(reg)) {
1170 77019 : assembler->PopRegister(reg);
1171 12230533 : } else if (registers_to_clear.Get(reg)) {
1172 : int clear_to = reg;
1173 240259 : while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1174 138883 : reg--;
1175 : }
1176 101376 : assembler->ClearRegisters(reg, clear_to);
1177 : }
1178 : }
1179 535199 : }
1180 :
1181 :
1182 535199 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1183 : int max_register,
1184 : const OutSet& affected_registers,
1185 : OutSet* registers_to_pop,
1186 : OutSet* registers_to_clear,
1187 : Zone* zone) {
1188 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1189 535199 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1190 :
1191 : // Count pushes performed to force a stack limit check occasionally.
1192 : int pushes = 0;
1193 :
1194 12981634 : for (int reg = 0; reg <= max_register; reg++) {
1195 12446435 : if (!affected_registers.Get(reg)) {
1196 : continue;
1197 : }
1198 :
1199 : // The chronologically first deferred action in the trace
1200 : // is used to infer the action needed to restore a register
1201 : // to its previous state (or not, if it's safe to ignore it).
1202 : enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1203 : DeferredActionUndoType undo_action = IGNORE;
1204 :
1205 : int value = 0;
1206 : bool absolute = false;
1207 : bool clear = false;
1208 : static const int kNoStore = kMinInt;
1209 : int store_position = kNoStore;
1210 : // This is a little tricky because we are scanning the actions in reverse
1211 : // historical order (newest first).
1212 3727242 : for (DeferredAction* action = actions_; action != nullptr;
1213 : action = action->next()) {
1214 3180443 : if (action->Mentions(reg)) {
1215 559303 : switch (action->action_type()) {
1216 : case ActionNode::SET_REGISTER: {
1217 6013 : Trace::DeferredSetRegister* psr =
1218 : static_cast<Trace::DeferredSetRegister*>(action);
1219 6013 : if (!absolute) {
1220 6013 : value += psr->value();
1221 : absolute = true;
1222 : }
1223 : // SET_REGISTER is currently only used for newly introduced loop
1224 : // counters. They can have a significant previous value if they
1225 : // occur in a loop. TODO(lrn): Propagate this information, so
1226 : // we can set undo_action to IGNORE if we know there is no value to
1227 : // restore.
1228 : undo_action = RESTORE;
1229 : DCHECK_EQ(store_position, kNoStore);
1230 : DCHECK(!clear);
1231 : break;
1232 : }
1233 : case ActionNode::INCREMENT_REGISTER:
1234 6910 : if (!absolute) {
1235 6910 : value++;
1236 : }
1237 : DCHECK_EQ(store_position, kNoStore);
1238 : DCHECK(!clear);
1239 : undo_action = RESTORE;
1240 : break;
1241 : case ActionNode::STORE_POSITION: {
1242 736279 : Trace::DeferredCapture* pc =
1243 : static_cast<Trace::DeferredCapture*>(action);
1244 482900 : if (!clear && store_position == kNoStore) {
1245 : store_position = pc->cp_offset();
1246 : }
1247 :
1248 : // For captures we know that stores and clears alternate.
1249 : // Other register, are never cleared, and if the occur
1250 : // inside a loop, they might be assigned more than once.
1251 482900 : if (reg <= 1) {
1252 : // Registers zero and one, aka "capture zero", is
1253 : // always set correctly if we succeed. There is no
1254 : // need to undo a setting on backtrack, because we
1255 : // will set it again or fail.
1256 : undo_action = IGNORE;
1257 : } else {
1258 253379 : undo_action = pc->is_capture() ? CLEAR : RESTORE;
1259 : }
1260 : DCHECK(!absolute);
1261 : DCHECK_EQ(value, 0);
1262 : break;
1263 : }
1264 : case ActionNode::CLEAR_CAPTURES: {
1265 : // Since we're scanning in reverse order, if we've already
1266 : // set the position we have to ignore historically earlier
1267 : // clearing operations.
1268 63480 : if (store_position == kNoStore) {
1269 : clear = true;
1270 : }
1271 : undo_action = RESTORE;
1272 : DCHECK(!absolute);
1273 : DCHECK_EQ(value, 0);
1274 : break;
1275 : }
1276 : default:
1277 0 : UNREACHABLE();
1278 : break;
1279 : }
1280 : }
1281 : }
1282 : // Prepare for the undo-action (e.g., push if it's going to be popped).
1283 546799 : if (undo_action == RESTORE) {
1284 77019 : pushes++;
1285 : RegExpMacroAssembler::StackCheckFlag stack_check =
1286 : RegExpMacroAssembler::kNoStackLimitCheck;
1287 77019 : if (pushes == push_limit) {
1288 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
1289 : pushes = 0;
1290 : }
1291 :
1292 77019 : assembler->PushRegister(reg, stack_check);
1293 77019 : registers_to_pop->Set(reg, zone);
1294 469780 : } else if (undo_action == CLEAR) {
1295 240259 : registers_to_clear->Set(reg, zone);
1296 : }
1297 : // Perform the chronologically last action (or accumulated increment)
1298 : // for the register.
1299 546799 : if (store_position != kNoStore) {
1300 482900 : assembler->WriteCurrentPositionToRegister(reg, store_position);
1301 63899 : } else if (clear) {
1302 50976 : assembler->ClearRegisters(reg, reg);
1303 12923 : } else if (absolute) {
1304 6013 : assembler->SetRegister(reg, value);
1305 6910 : } else if (value != 0) {
1306 6910 : assembler->AdvanceRegister(reg, value);
1307 : }
1308 : }
1309 535199 : }
1310 :
1311 :
1312 : // This is called as we come into a loop choice node and some other tricky
1313 : // nodes. It normalizes the state of the code generator to ensure we can
1314 : // generate generic code.
1315 3771194 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1316 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1317 :
1318 : DCHECK(!is_trivial());
1319 :
1320 1210945 : if (actions_ == nullptr && backtrack() == nullptr) {
1321 : // Here we just have some deferred cp advances to fix and we are back to
1322 : // a normal situation. We may also have to forget some information gained
1323 : // through a quick check that was already performed.
1324 198336 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1325 : // Create a new trivial state and generate the node with that.
1326 198336 : Trace new_state;
1327 198336 : successor->Emit(compiler, &new_state);
1328 733535 : return;
1329 : }
1330 :
1331 : // Generate deferred actions here along with code to undo them again.
1332 : OutSet affected_registers;
1333 :
1334 535199 : if (backtrack() != nullptr) {
1335 : // Here we have a concrete backtrack location. These are set up by choice
1336 : // nodes and so they indicate that we have a deferred save of the current
1337 : // position which we may need to emit here.
1338 419453 : assembler->PushCurrentPosition();
1339 : }
1340 :
1341 : int max_register = FindAffectedRegisters(&affected_registers,
1342 535199 : compiler->zone());
1343 : OutSet registers_to_pop;
1344 : OutSet registers_to_clear;
1345 : PerformDeferredActions(assembler,
1346 : max_register,
1347 : affected_registers,
1348 : ®isters_to_pop,
1349 : ®isters_to_clear,
1350 535199 : compiler->zone());
1351 535199 : if (cp_offset_ != 0) {
1352 310091 : assembler->AdvanceCurrentPosition(cp_offset_);
1353 : }
1354 :
1355 : // Create a new trivial state and generate the node with that.
1356 : Label undo;
1357 535199 : assembler->PushBacktrack(&undo);
1358 535199 : if (successor->KeepRecursing(compiler)) {
1359 159045 : Trace new_state;
1360 159045 : successor->Emit(compiler, &new_state);
1361 : } else {
1362 376154 : compiler->AddWork(successor);
1363 376154 : assembler->GoTo(successor->label());
1364 : }
1365 :
1366 : // On backtrack we need to restore state.
1367 535199 : assembler->Bind(&undo);
1368 : RestoreAffectedRegisters(assembler,
1369 : max_register,
1370 : registers_to_pop,
1371 535199 : registers_to_clear);
1372 535199 : if (backtrack() == nullptr) {
1373 115746 : assembler->Backtrack();
1374 : } else {
1375 419453 : assembler->PopCurrentPosition();
1376 838906 : assembler->GoTo(backtrack());
1377 : }
1378 : }
1379 :
1380 :
1381 2957 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1382 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1383 :
1384 : // Omit flushing the trace. We discard the entire stack frame anyway.
1385 :
1386 2957 : if (!label()->is_bound()) {
1387 : // We are completely independent of the trace, since we ignore it,
1388 : // so this code can be used as the generic version.
1389 2916 : assembler->Bind(label());
1390 : }
1391 :
1392 : // Throw away everything on the backtrack stack since the start
1393 : // of the negative submatch and restore the character position.
1394 2957 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
1395 2957 : assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1396 2957 : if (clear_capture_count_ > 0) {
1397 : // Clear any captures that might have been performed during the success
1398 : // of the body of the negative look-ahead.
1399 115 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1400 115 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1401 : }
1402 : // Now that we have unwound the stack we find at the top of the stack the
1403 : // backtrack that the BeginSubmatch node got.
1404 2957 : assembler->Backtrack();
1405 2957 : }
1406 :
1407 :
1408 311769 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1409 207540 : if (!trace->is_trivial()) {
1410 103617 : trace->Flush(compiler, this);
1411 103617 : return;
1412 : }
1413 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1414 103923 : if (!label()->is_bound()) {
1415 93507 : assembler->Bind(label());
1416 : }
1417 103923 : switch (action_) {
1418 : case ACCEPT:
1419 103617 : assembler->Succeed();
1420 103617 : return;
1421 : case BACKTRACK:
1422 612 : assembler->GoTo(trace->backtrack());
1423 306 : return;
1424 : case NEGATIVE_SUBMATCH_SUCCESS:
1425 : // This case is handled in a different virtual method.
1426 0 : UNREACHABLE();
1427 : }
1428 0 : UNIMPLEMENTED();
1429 : }
1430 :
1431 :
1432 1007347 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1433 2014694 : if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1434 1007347 : guards_->Add(guard, zone);
1435 1007347 : }
1436 :
1437 :
1438 1005686 : ActionNode* ActionNode::SetRegister(int reg,
1439 : int val,
1440 1005686 : RegExpNode* on_success) {
1441 : ActionNode* result =
1442 : new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1443 1005686 : result->data_.u_store_register.reg = reg;
1444 1005686 : result->data_.u_store_register.value = val;
1445 1005686 : return result;
1446 : }
1447 :
1448 :
1449 1005686 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1450 : ActionNode* result =
1451 : new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1452 1005686 : result->data_.u_increment_register.reg = reg;
1453 1005686 : return result;
1454 : }
1455 :
1456 :
1457 263710 : ActionNode* ActionNode::StorePosition(int reg,
1458 : bool is_capture,
1459 263710 : RegExpNode* on_success) {
1460 : ActionNode* result =
1461 : new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1462 263710 : result->data_.u_position_register.reg = reg;
1463 263710 : result->data_.u_position_register.is_capture = is_capture;
1464 263710 : return result;
1465 : }
1466 :
1467 :
1468 3983 : ActionNode* ActionNode::ClearCaptures(Interval range,
1469 3983 : RegExpNode* on_success) {
1470 : ActionNode* result =
1471 : new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1472 3983 : result->data_.u_clear_captures.range_from = range.from();
1473 3983 : result->data_.u_clear_captures.range_to = range.to();
1474 3983 : return result;
1475 : }
1476 :
1477 :
1478 4473 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1479 : int position_reg,
1480 4473 : RegExpNode* on_success) {
1481 : ActionNode* result =
1482 : new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1483 4473 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1484 4473 : result->data_.u_submatch.current_position_register = position_reg;
1485 4473 : return result;
1486 : }
1487 :
1488 :
1489 1545 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1490 : int position_reg,
1491 : int clear_register_count,
1492 : int clear_register_from,
1493 1545 : RegExpNode* on_success) {
1494 : ActionNode* result =
1495 : new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1496 1545 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1497 1545 : result->data_.u_submatch.current_position_register = position_reg;
1498 1545 : result->data_.u_submatch.clear_register_count = clear_register_count;
1499 1545 : result->data_.u_submatch.clear_register_from = clear_register_from;
1500 1545 : return result;
1501 : }
1502 :
1503 :
1504 572 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1505 : int repetition_register,
1506 : int repetition_limit,
1507 572 : RegExpNode* on_success) {
1508 : ActionNode* result =
1509 : new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1510 572 : result->data_.u_empty_match_check.start_register = start_register;
1511 572 : result->data_.u_empty_match_check.repetition_register = repetition_register;
1512 572 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1513 572 : return result;
1514 : }
1515 :
1516 :
1517 : #define DEFINE_ACCEPT(Type) \
1518 : void Type##Node::Accept(NodeVisitor* visitor) { \
1519 : visitor->Visit##Type(this); \
1520 : }
1521 824232 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1522 : #undef DEFINE_ACCEPT
1523 :
1524 :
1525 174344 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1526 174344 : visitor->VisitLoopChoice(this);
1527 174344 : }
1528 :
1529 :
1530 : // -------------------------------------------------------------------
1531 : // Emit code.
1532 :
1533 :
1534 7347 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1535 14694 : Guard* guard,
1536 7347 : Trace* trace) {
1537 7347 : switch (guard->op()) {
1538 : case Guard::LT:
1539 : DCHECK(!trace->mentions_reg(guard->reg()));
1540 : macro_assembler->IfRegisterGE(guard->reg(),
1541 : guard->value(),
1542 8820 : trace->backtrack());
1543 4410 : break;
1544 : case Guard::GEQ:
1545 : DCHECK(!trace->mentions_reg(guard->reg()));
1546 : macro_assembler->IfRegisterLT(guard->reg(),
1547 : guard->value(),
1548 5874 : trace->backtrack());
1549 2937 : break;
1550 : }
1551 7347 : }
1552 :
1553 :
1554 : // Returns the number of characters in the equivalence class, omitting those
1555 : // that cannot occur in the source string because it is Latin1.
1556 55859 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1557 : bool one_byte_subject,
1558 : unibrow::uchar* letters) {
1559 : int length =
1560 55859 : isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1561 : // Unibrow returns 0 or 1 for characters where case independence is
1562 : // trivial.
1563 55859 : if (length == 0) {
1564 7235 : letters[0] = character;
1565 : length = 1;
1566 : }
1567 :
1568 55859 : if (one_byte_subject) {
1569 : int new_length = 0;
1570 95108 : for (int i = 0; i < length; i++) {
1571 95108 : if (letters[i] <= String::kMaxOneByteCharCode) {
1572 94784 : letters[new_length++] = letters[i];
1573 : }
1574 : }
1575 : length = new_length;
1576 : }
1577 :
1578 55859 : return length;
1579 : }
1580 :
1581 :
1582 586829 : static inline bool EmitSimpleCharacter(Isolate* isolate,
1583 586829 : RegExpCompiler* compiler,
1584 : uc16 c,
1585 : Label* on_failure,
1586 : int cp_offset,
1587 : bool check,
1588 : bool preloaded) {
1589 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1590 : bool bound_checked = false;
1591 586829 : if (!preloaded) {
1592 : assembler->LoadCurrentCharacter(
1593 : cp_offset,
1594 : on_failure,
1595 586829 : check);
1596 : bound_checked = true;
1597 : }
1598 586829 : assembler->CheckNotCharacter(c, on_failure);
1599 586829 : return bound_checked;
1600 : }
1601 :
1602 :
1603 : // Only emits non-letters (things that don't have case). Only used for case
1604 : // independent matches.
// Code is emitted only when GetCaseIndependentLetters reports exactly one
// variant for |c|; characters with several variants are left for a later pass
// and characters that cannot match produce no code at all.
// Returns |check| if a LoadCurrentCharacter was emitted here, false in every
// other case.
1605 13714 : static inline bool EmitAtomNonLetter(Isolate* isolate,
1606 13714 : RegExpCompiler* compiler,
1607 : uc16 c,
1608 : Label* on_failure,
1609 : int cp_offset,
1610 : bool check,
1611 : bool preloaded) {
1612 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1613 : bool one_byte = compiler->one_byte();
1614 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1615 13714 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1616 13714 : if (length < 1) {
1617 : // This can't match. Must be a one-byte subject and a non-one-byte
1618 : // character. We do not need to do anything since the one-byte pass
1619 : // already handled this.
1620 : return false; // Bounds not checked.
1621 : }
1622 : bool checked = false;
1623 : // We handle the length > 1 case in a later pass.
1624 13708 : if (length == 1) {
1625 1487 : if (one_byte && c > String::kMaxOneByteCharCodeU) {
1626 : // Can't match - see above.
1627 : return false; // Bounds not checked.
1628 : }
1629 1487 : if (!preloaded) {
1630 1487 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
// Report a bounds check only if the load was asked to perform one.
1631 : checked = check;
1632 : }
1633 1487 : macro_assembler->CheckNotCharacter(c, on_failure);
1634 : }
1635 13708 : return checked;
1636 : }
1637 :
1638 :
// Tries to test "the current character is |c1| or |c2|" with a single masked
// comparison instead of two separate ones. Callers are expected to pass
// c1 < c2 (DCHECKed below).
// Returns true if a check branching to |on_failure| was emitted. Returns false
// if neither trick applies; nothing is emitted then and the caller has to
// compare against both characters itself.
1639 11894 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1640 : bool one_byte, uc16 c1, uc16 c2,
1641 : Label* on_failure) {
// All bits that are significant for a character of the subject's width.
1642 : uc16 char_mask;
1643 11894 : if (one_byte) {
1644 : char_mask = String::kMaxOneByteCharCode;
1645 : } else {
1646 : char_mask = String::kMaxUtf16CodeUnit;
1647 : }
1648 11894 : uc16 exor = c1 ^ c2;
1649 : // Check whether exor has only one bit set.
1650 11894 : if (((exor - 1) & exor) == 0) {
1651 : // If c1 and c2 differ only by one bit.
1652 : // Ecma262UnCanonicalize always gives the highest number last.
1653 : DCHECK(c2 > c1);
// Drop the differing bit from the mask so that both variants compare equal
// to c1 (which, being the smaller one, has that bit clear).
1654 11795 : uc16 mask = char_mask ^ exor;
1655 11795 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1656 11795 : return true;
1657 : }
1658 : DCHECK(c2 > c1);
1659 99 : uc16 diff = c2 - c1;
// Second trick: diff is a power of two and c1 - diff does not go negative.
1660 99 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
1661 : // If the characters differ by 2^n but don't differ by one bit then
1662 : // subtract the difference from the found character, then do the or
1663 : // trick. We avoid the theoretical case where negative numbers are
1664 : // involved in order to simplify code generation.
1665 87 : uc16 mask = char_mask ^ diff;
1666 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1667 : diff,
1668 : mask,
1669 87 : on_failure);
1670 87 : return true;
1671 : }
1672 : return false;
1673 : }
1674 :
1675 :
// Common signature of the per-character emitters in this file
// (EmitSimpleCharacter, EmitAtomNonLetter and EmitAtomLetter). The bool result
// is the emitter's "bounds were checked" indication. Presumably this lets a
// caller select one of them as a function pointer; the use site is outside
// this section.
1676 : typedef bool EmitCharacterFunction(Isolate* isolate,
1677 : RegExpCompiler* compiler,
1678 : uc16 c,
1679 : Label* on_failure,
1680 : int cp_offset,
1681 : bool check,
1682 : bool preloaded);
1683 :
1684 : // Only emits letters (things that have case). Only used for case independent
1685 : // matches.
// Emits nothing and returns false when GetCaseIndependentLetters reports at
// most one variant for |c|. Otherwise emits a test that branches to
// |on_failure| unless the current character equals one of the 2 to 4 variants,
// and returns true (independently of |check| and |preloaded|).
1686 13714 : static inline bool EmitAtomLetter(Isolate* isolate,
1687 13714 : RegExpCompiler* compiler,
1688 : uc16 c,
1689 : Label* on_failure,
1690 : int cp_offset,
1691 : bool check,
1692 : bool preloaded) {
1693 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1694 : bool one_byte = compiler->one_byte();
1695 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1696 13714 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1697 13714 : if (length <= 1) return false;
1698 : // We may not need to check against the end of the input string
1699 : // if this character lies before a character that matched.
1700 12221 : if (!preloaded) {
1701 11891 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1702 : }
// Target for "one of the variants matched"; bound at the end of each case.
1703 : Label ok;
1704 : DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1705 12221 : switch (length) {
1706 : case 2: {
// Prefer a single masked comparison; fall back to two explicit ones.
1707 23788 : if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1708 23788 : chars[1], on_failure)) {
1709 : } else {
1710 12 : macro_assembler->CheckCharacter(chars[0], &ok);
1711 12 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
1712 12 : macro_assembler->Bind(&ok);
1713 : }
1714 : break;
1715 : }
1716 : case 4:
// The fourth variant is tested first, then the three-variant code is reused.
1717 25 : macro_assembler->CheckCharacter(chars[3], &ok);
1718 : // Fall through!
1719 : case 3:
1720 327 : macro_assembler->CheckCharacter(chars[0], &ok);
1721 327 : macro_assembler->CheckCharacter(chars[1], &ok);
1722 327 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
1723 327 : macro_assembler->Bind(&ok);
1724 327 : break;
1725 : default:
1726 0 : UNREACHABLE();
1727 : break;
1728 : }
1729 : return true;
1730 : }
1731 :
1732 :
// Emits a two-way dispatch on the current character around |border|:
// characters below |border| are meant to reach |below|, all others
// |above_or_equal| (going by the CheckCharacterLT/GT method names). The label
// that equals |fall_through| is reached by not branching, so at most one
// conditional branch plus one GoTo is emitted.
1733 8837 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1734 : int border,
1735 : Label* fall_through,
1736 : Label* above_or_equal,
1737 : Label* below) {
1738 8837 : if (below != fall_through) {
1739 8461 : masm->CheckCharacterLT(border, below);
1740 8461 : if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1741 : } else {
// |below| is the fall-through: branch away only for characters >= border.
1742 376 : masm->CheckCharacterGT(border - 1, above_or_equal);
1743 : }
1744 8837 : }
1745 :
1746 :
// Emits a test of the current character against the inclusive interval
// [first, last]; a plain single-character comparison is used when
// first == last. Characters inside are meant to reach |in_range|, the others
// |out_of_range|. The label that equals |fall_through| is reached by falling
// through instead of by a jump.
1747 166104 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1748 : int first,
1749 : int last,
1750 : Label* fall_through,
1751 : Label* in_range,
1752 : Label* out_of_range) {
1753 166104 : if (in_range == fall_through) {
// Success falls through, so only the out-of-range case needs a branch.
1754 115335 : if (first == last) {
1755 12819 : masm->CheckNotCharacter(first, out_of_range);
1756 : } else {
1757 102516 : masm->CheckCharacterNotInRange(first, last, out_of_range);
1758 : }
1759 : } else {
1760 50769 : if (first == last) {
1761 25419 : masm->CheckCharacter(first, in_range);
1762 : } else {
1763 25350 : masm->CheckCharacterInRange(first, last, in_range);
1764 : }
1765 50769 : if (out_of_range != fall_through) masm->GoTo(out_of_range);
1766 : }
1767 166104 : }
1768 :
1769 :
1770 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1771 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
// Builds a kSize-entry table of 0/1 values, indexed by (character & kMask),
// for the kSize-aligned page that contains |min_char|, stores it in a new
// ByteArray and emits a CheckBitInTable on it. All boundaries
// ranges[start_index .. end_index] must lie on that page (DCHECKed below).
1772 5701 : static void EmitUseLookupTable(
1773 5701 : RegExpMacroAssembler* masm,
1774 : ZoneList<int>* ranges,
1775 : int start_index,
1776 : int end_index,
1777 : int min_char,
1778 : Label* fall_through,
1779 : Label* even_label,
1780 : Label* odd_label) {
1781 : static const int kSize = RegExpMacroAssembler::kTableSize;
1782 : static const int kMask = RegExpMacroAssembler::kTableMask;
1783 :
// |base| is only read by the DCHECKs; USE() marks it as used for builds in
// which those compile to nothing.
1784 : int base = (min_char & ~kMask);
1785 : USE(base);
1786 :
1787 : // Assert that everything is on one kTableSize page.
1788 : for (int i = start_index; i <= end_index; i++) {
1789 : DCHECK_EQ(ranges->at(i) & ~kMask, base);
1790 : }
1791 : DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1792 :
1793 : char templ[kSize];
1794 : Label* on_bit_set;
1795 : Label* on_bit_clear;
1796 : int bit;
// If even_label is the fall-through, set bits branch to odd_label; otherwise
// set bits branch to even_label. In both cases the initial |bit| is the value
// that sends characters below ranges[start_index] to odd_label.
1797 5701 : if (even_label == fall_through) {
1798 : on_bit_set = odd_label;
1799 : on_bit_clear = even_label;
1800 : bit = 1;
1801 : } else {
1802 : on_bit_set = even_label;
1803 : on_bit_clear = odd_label;
1804 : bit = 0;
1805 : }
// Entries below the first boundary.
1806 229535 : for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1807 111917 : templ[i] = bit;
1808 : }
1809 : int j = 0;
1810 5701 : bit ^= 1;
// One run of entries per interval, flipping the value at every boundary.
1811 94364 : for (int i = start_index; i < end_index; i++) {
1812 1187892 : for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1813 505283 : templ[j] = bit;
1814 : }
1815 88663 : bit ^= 1;
1816 : }
// Remaining entries, from where the previous loop stopped to the page end.
1817 112528 : for (int i = j; i < kSize; i++) {
1818 112528 : templ[i] = bit;
1819 : }
1820 : Factory* factory = masm->isolate()->factory();
1821 : // TODO(erikcorry): Cache these.
1822 5701 : Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1823 729728 : for (int i = 0; i < kSize; i++) {
1824 729728 : ba->set(i, templ[i]);
1825 : }
1826 5701 : masm->CheckBitInTable(ba, on_bit_set);
1827 5701 : if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1828 5701 : }
1829 :
1830 :
// Emits a direct test for the single interval
// [ranges[cut_index], ranges[cut_index + 1] - 1], jumping to the label that
// interval belongs to (odd_label if cut_index - start_index is odd, otherwise
// even_label), and then removes the interval's two boundaries from |ranges|.
// After the call the remaining boundaries occupy
// ranges[start_index + 1 .. end_index - 1].
1831 36410 : static void CutOutRange(RegExpMacroAssembler* masm,
1832 : ZoneList<int>* ranges,
1833 : int start_index,
1834 : int end_index,
1835 : int cut_index,
1836 : Label* even_label,
1837 : Label* odd_label) {
1838 36410 : bool odd = (((cut_index - start_index) & 1) == 1);
1839 36410 : Label* in_range_label = odd ? odd_label : even_label;
// |dummy| serves as both fall_through and out_of_range, so only the jump to
// |in_range_label| is emitted and the out-of-range case falls through.
1840 : Label dummy;
1841 : EmitDoubleBoundaryTest(masm,
1842 : ranges->at(cut_index),
1843 36410 : ranges->at(cut_index + 1) - 1,
1844 : &dummy,
1845 : in_range_label,
1846 72820 : &dummy);
1847 : DCHECK(!dummy.is_linked());
1848 : // Cut out the single range by rewriting the array. This creates a new
1849 : // range that is a merger of the two ranges on either side of the one we
1850 : // are cutting out. The oddity of the labels is preserved.
// Shift the boundaries below the cut up by one slot ...
1851 92023 : for (int j = cut_index; j > start_index; j--) {
1852 38406 : ranges->at(j) = ranges->at(j - 1);
1853 : }
// ... and the boundaries above the cut down by one slot.
1854 109730 : for (int j = cut_index + 1; j < end_index; j++) {
1855 146640 : ranges->at(j) = ranges->at(j + 1);
1856 : }
1857 36410 : }
1858 :
1859 :
1860 : // Unicode case. Split the search space into kSize spaces that are handled
1861 : // with recursion.
// Chooses a character |*border| at which the boundaries
// ranges[start_index .. end_index] are split into a lower and an upper part.
// Outputs:
//   *border          - the split character.
//   *new_start_index - index of the first boundary of the upper part.
//   *new_end_index   - index of the last boundary of the lower part.
// If the chosen border reaches ranges[end_index] there is no upper part:
// *border is clamped to ranges[end_index], *new_end_index becomes
// end_index - 1 and *new_start_index is set to end_index (unused).
1862 19443 : static void SplitSearchSpace(ZoneList<int>* ranges,
1863 : int start_index,
1864 : int end_index,
1865 : int* new_start_index,
1866 : int* new_end_index,
1867 : int* border) {
1868 : static const int kSize = RegExpMacroAssembler::kTableSize;
1869 : static const int kMask = RegExpMacroAssembler::kTableMask;
1870 :
1871 19443 : int first = ranges->at(start_index);
1872 19443 : int last = ranges->at(end_index) - 1;
1873 :
// Default split: the end of the kSize page containing the first boundary.
1874 19443 : *new_start_index = start_index;
1875 19443 : *border = (ranges->at(start_index) & ~kMask) + kSize;
1876 167634 : while (*new_start_index < end_index) {
1877 146986 : if (ranges->at(*new_start_index) > *border) break;
1878 128748 : (*new_start_index)++;
1879 : }
1880 : // new_start_index is the index of the first edge that is beyond the
1881 : // current kSize space.
1882 :
1883 : // For very large search spaces we do a binary chop search of the non-Latin1
1884 : // space instead of just going to the end of the current kSize space. The
1885 : // heuristics are complicated a little by the fact that any 128-character
1886 : // encoding space can be quickly tested with a table lookup, so we don't
1887 : // wish to do binary chop search at a smaller granularity than that. A
1888 : // 128-character space can take up a lot of space in the ranges array if,
1889 : // for example, we only want to match every second character (eg. the lower
1890 : // case characters on some Unicode pages).
1891 19443 : int binary_chop_index = (end_index + start_index) / 2;
1892 : // The first test ensures that we get to the code that handles the Latin1
1893 : // range with a single not-taken branch, speeding up this important
1894 : // character range (even non-Latin1 charset-based text has spaces and
1895 : // punctuation).
1896 53869 : if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1897 27682 : end_index - start_index > (*new_start_index - start_index) * 2 &&
1898 55484 : last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1899 23112 : ranges->at(binary_chop_index) >= first + 2 * kSize) {
// NOTE(review): the second ';' on the next line is an empty statement.
1900 : int scan_forward_for_section_border = binary_chop_index;;
// Candidate border: the end of the page containing the middle boundary.
1901 9570 : int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1902 :
// Adopt the candidate only if some boundary lies beyond it.
1903 72129 : while (scan_forward_for_section_border < end_index) {
1904 60893 : if (ranges->at(scan_forward_for_section_border) > new_border) {
1905 7904 : *new_start_index = scan_forward_for_section_border;
1906 7904 : *border = new_border;
1907 7904 : break;
1908 : }
1909 52989 : scan_forward_for_section_border++;
1910 : }
1911 : }
1912 :
1913 : DCHECK(*new_start_index > start_index);
1914 19443 : *new_end_index = *new_start_index - 1;
// A boundary that coincides with the border is excluded from the lower part.
1915 19443 : if (ranges->at(*new_end_index) == *border) {
1916 2965 : (*new_end_index)--;
1917 : }
1918 38886 : if (*border >= ranges->at(end_index)) {
1919 1203 : *border = ranges->at(end_index);
1920 1203 : *new_start_index = end_index; // Won't be used.
1921 1203 : *new_end_index = end_index - 1;
1922 : }
1923 19443 : }
1924 :
1925 : // Gets a series of segment boundaries representing a character class. If the
1926 : // character is in the range between an even and an odd boundary (counting from
1927 : // start_index) then go to even_label, otherwise go to odd_label. We already
1928 : // know that the character is in the range of min_char to max_char inclusive.
1929 : // Either label can be nullptr indicating backtracking. Either label can also
1930 : // be equal to the fall_through label.
// The function recurses on sub-ranges of |ranges|; the small-class path
// rewrites |ranges| in place through CutOutRange.
1931 208492 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1932 : int start_index, int end_index, uc32 min_char,
1933 : uc32 max_char, Label* fall_through,
1934 : Label* even_label, Label* odd_label) {
1935 : DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1936 : DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1937 :
1938 208492 : int first = ranges->at(start_index);
1939 208492 : int last = ranges->at(end_index) - 1;
1940 :
1941 : DCHECK_LT(min_char, first);
1942 :
1943 : // Just need to test if the character is before or on-or-after
1944 : // a particular character.
1945 208492 : if (start_index == end_index) {
1946 8837 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1947 8837 : return;
1948 : }
1949 :
1950 : // Another almost trivial case: There is one interval in the middle that is
1951 : // different from the end intervals.
1952 199655 : if (start_index + 1 == end_index) {
1953 : EmitDoubleBoundaryTest(
1954 129694 : masm, first, last, fall_through, even_label, odd_label);
1955 129694 : return;
1956 : }
1957 :
1958 : // It's not worth using table lookup if there are very few intervals in the
1959 : // character class.
1960 69961 : if (end_index - start_index <= 6) {
1961 : // It is faster to test for individual characters, so we look for those
1962 : // first, then try arbitrary ranges in the second round.
// NOTE(review): kNoCutIndex is never modified; it could be declared const.
1963 : static int kNoCutIndex = -1;
1964 36410 : int cut = kNoCutIndex;
1965 151135 : for (int i = start_index; i < end_index; i++) {
1966 191624 : if (ranges->at(i) == ranges->at(i + 1) - 1) {
1967 : cut = i;
1968 : break;
1969 : }
1970 : }
// No single-character interval found: cut out the first interval instead.
1971 36410 : if (cut == kNoCutIndex) cut = start_index;
1972 : CutOutRange(
1973 36410 : masm, ranges, start_index, end_index, cut, even_label, odd_label);
1974 : DCHECK_GE(end_index - start_index, 2);
// CutOutRange left the remaining boundaries in
// ranges[start_index + 1 .. end_index - 1], hence the narrowed indices.
1975 : GenerateBranches(masm,
1976 : ranges,
1977 : start_index + 1,
1978 : end_index - 1,
1979 : min_char,
1980 : max_char,
1981 : fall_through,
1982 : even_label,
1983 36410 : odd_label);
1984 36410 : return;
1985 : }
1986 :
1987 : // If there are a lot of intervals in the regexp, then we will use tables to
1988 : // determine whether the character is inside or outside the character class.
1989 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1990 :
// The whole [min_char, max_char] span lies on a single table page.
1991 33551 : if ((max_char >> kBits) == (min_char >> kBits)) {
1992 : EmitUseLookupTable(masm,
1993 : ranges,
1994 : start_index,
1995 : end_index,
1996 : min_char,
1997 : fall_through,
1998 : even_label,
1999 5701 : odd_label);
2000 5701 : return;
2001 : }
2002 :
// The first boundary is on a later page than min_char: send everything below
// it to odd_label and continue from the next boundary. The labels are swapped
// in the recursive call because start_index advances by one.
2003 27850 : if ((min_char >> kBits) != (first >> kBits)) {
2004 8407 : masm->CheckCharacterLT(first, odd_label);
2005 : GenerateBranches(masm,
2006 : ranges,
2007 : start_index + 1,
2008 : end_index,
2009 : first,
2010 : max_char,
2011 : fall_through,
2012 : odd_label,
2013 8407 : even_label);
2014 8407 : return;
2015 : }
2016 :
2017 19443 : int new_start_index = 0;
2018 19443 : int new_end_index = 0;
2019 19443 : int border = 0;
2020 :
2021 : SplitSearchSpace(ranges,
2022 : start_index,
2023 : end_index,
2024 : &new_start_index,
2025 : &new_end_index,
2026 19443 : &border);
2027 :
// |handle_rest| is where the code for the upper part will be bound, unless
// the upper part turns out to be a single terminal label.
2028 : Label handle_rest;
2029 : Label* above = &handle_rest;
2030 19443 : if (border == last + 1) {
2031 : // We didn't find any section that started after the limit, so everything
2032 : // above the border is one of the terminal labels.
2033 1203 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2034 : DCHECK(new_end_index == end_index - 1);
2035 : }
2036 :
2037 : DCHECK_LE(start_index, new_end_index);
2038 : DCHECK_LE(new_start_index, end_index);
2039 : DCHECK_LT(start_index, new_start_index);
2040 : DCHECK_LT(new_end_index, end_index);
2041 : DCHECK(new_end_index + 1 == new_start_index ||
2042 : (new_end_index + 2 == new_start_index &&
2043 : border == ranges->at(new_end_index + 1)));
2044 : DCHECK_LT(min_char, border - 1);
2045 : DCHECK_LT(border, max_char);
2046 : DCHECK_LT(ranges->at(new_end_index), border);
2047 : DCHECK(border < ranges->at(new_start_index) ||
2048 : (border == ranges->at(new_start_index) &&
2049 : new_start_index == end_index &&
2050 : new_end_index == end_index - 1 &&
2051 : border == last + 1));
2052 : DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2053 :
// Characters at or above the border go to |above|; the code for the lower
// part follows inline.
2054 19443 : masm->CheckCharacterGT(border - 1, above);
// A fresh label is passed as fall_through to both sub-calls, so neither
// even_label nor odd_label is treated as the fall-through there.
2055 : Label dummy;
2056 : GenerateBranches(masm,
2057 : ranges,
2058 : start_index,
2059 : new_end_index,
2060 : min_char,
2061 : border - 1,
2062 : &dummy,
2063 : even_label,
2064 19443 : odd_label);
2065 19443 : if (handle_rest.is_linked()) {
2066 18240 : masm->Bind(&handle_rest);
// Keep label parity relative to the original start_index.
2067 18240 : bool flip = (new_start_index & 1) != (start_index & 1);
2068 : GenerateBranches(masm,
2069 : ranges,
2070 : new_start_index,
2071 : end_index,
2072 : border,
2073 : max_char,
2074 : &dummy,
2075 : flip ? odd_label : even_label,
2076 18240 : flip ? even_label : odd_label);
2077 : }
2078 : }
2079 :
2080 :
// Emits code that tests the character at |cp_offset| against the character
// class |cc|: execution falls through when the character is accepted and
// branches to |on_failure| otherwise. |one_byte| selects the largest code unit
// that can occur in the subject; ranges starting above it are ignored.
// |check_offset| is forwarded to LoadCurrentCharacter / CheckPosition, and
// |preloaded| suppresses the load of the current character.
2081 231341 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2082 : RegExpCharacterClass* cc, bool one_byte,
2083 : Label* on_failure, int cp_offset, bool check_offset,
2084 : bool preloaded, Zone* zone) {
2085 231341 : ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2086 231341 : CharacterRange::Canonicalize(ranges);
2087 :
2088 : int max_char;
2089 231341 : if (one_byte) {
2090 : max_char = String::kMaxOneByteCharCode;
2091 : } else {
2092 : max_char = String::kMaxUtf16CodeUnit;
2093 : }
2094 :
2095 : int range_count = ranges->length();
2096 :
// Find the last range that starts at or below max_char.
2097 231341 : int last_valid_range = range_count - 1;
2098 652520 : while (last_valid_range >= 0) {
2099 421141 : CharacterRange& range = ranges->at(last_valid_range);
2100 421141 : if (range.from() <= max_char) {
2101 : break;
2102 : }
2103 189838 : last_valid_range--;
2104 : }
2105 :
// No range can apply to this subject: a non-negated class never matches, a
// negated one matches any character, so only the position needs checking.
2106 231341 : if (last_valid_range < 0) {
2107 38 : if (!cc->is_negated()) {
2108 12 : macro_assembler->GoTo(on_failure);
2109 : }
// NOTE(review): this is emitted after the GoTo for non-negated classes too;
// presumably it only has an effect for negated ones - confirm.
2110 38 : if (check_offset) {
2111 38 : macro_assembler->CheckPosition(cp_offset, on_failure);
2112 : }
2113 105349 : return;
2114 : }
2115 :
// A single range covering every possible character.
2116 432391 : if (last_valid_range == 0 &&
2117 : ranges->at(0).IsEverything(max_char)) {
2118 89459 : if (cc->is_negated()) {
2119 18 : macro_assembler->GoTo(on_failure);
2120 : } else {
2121 : // This is a common case hit by non-anchored expressions.
2122 89441 : if (check_offset) {
2123 53890 : macro_assembler->CheckPosition(cp_offset, on_failure);
2124 : }
2125 : }
2126 : return;
2127 : }
2128 :
2129 141844 : if (!preloaded) {
2130 128356 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2131 : }
2132 :
// Let the macro assembler handle standard classes itself when it reports
// (by returning true) that it did so.
2133 158020 : if (cc->is_standard(zone) &&
2134 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2135 32352 : on_failure)) {
2136 : return;
2137 : }
2138 :
2139 :
2140 : // A new list with ascending entries. Each entry is a code unit
2141 : // where there is a boundary between code units that are part of
2142 : // the class and code units that are not. Normally we insert an
2143 : // entry at zero which goes to the failure label, but if there
2144 : // was already one there we fall through for success on that entry.
2145 : // Subsequent entries have alternating meaning (success/failure).
2146 125992 : ZoneList<int>* range_boundaries =
2147 125992 : new(zone) ZoneList<int>(last_valid_range, zone);
2148 :
2149 125992 : bool zeroth_entry_is_failure = !cc->is_negated();
2150 :
2151 352299 : for (int i = 0; i <= last_valid_range; i++) {
2152 452614 : CharacterRange& range = ranges->at(i);
2153 226307 : if (range.from() == 0) {
2154 : DCHECK_EQ(i, 0);
// A range starting at 0 contributes no lower boundary; it inverts the
// meaning of the region below the first boundary instead.
2155 2019 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
2156 : } else {
2157 224288 : range_boundaries->Add(range.from(), zone);
2158 : }
2159 226307 : range_boundaries->Add(range.to() + 1, zone);
2160 : }
// Drop a final boundary that lies beyond the largest possible character.
2161 125992 : int end_index = range_boundaries->length() - 1;
2162 125992 : if (range_boundaries->at(end_index) > max_char) {
2163 2611 : end_index--;
2164 : }
2165 :
// Accepted characters reach |fall_through|, which is bound right after the
// generated branches; rejected ones go to |on_failure|.
2166 : Label fall_through;
2167 : GenerateBranches(macro_assembler,
2168 : range_boundaries,
2169 : 0, // start_index.
2170 : end_index,
2171 : 0, // min_char.
2172 : max_char,
2173 : &fall_through,
2174 : zeroth_entry_is_failure ? &fall_through : on_failure,
2175 125992 : zeroth_entry_is_failure ? on_failure : &fall_through);
2176 125992 : macro_assembler->Bind(&fall_through);
2177 : }
2178 :
2179 :
// Out-of-line destructor with an empty body.
2180 0 : RegExpNode::~RegExpNode() {
2181 0 : }
2182 :
2183 :
// Decides whether code for this node should be generated for |trace| right
// now. Returns CONTINUE if the caller should emit the node's code, or DONE if
// this function has already emitted what is needed (a jump to the generic
// version, or a flush of the trace) and the caller must not emit more.
2184 4852679 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2185 2346931 : Trace* trace) {
2186 : // If we are generating a greedy loop then don't stop and don't reuse code.
2187 1844840 : if (trace->stop_node() != nullptr) {
2188 : return CONTINUE;
2189 : }
2190 :
2191 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2192 1830021 : if (trace->is_trivial()) {
2193 1619992 : if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2194 : // If a generic version is already scheduled to be generated or we have
2195 : // recursed too deeply then just generate a jump to that code.
2196 232941 : macro_assembler->GoTo(&label_);
2197 : // This will queue it up for generation of a generic version if it hasn't
2198 : // already been queued.
2199 232941 : compiler->AddWork(this);
2200 232941 : return DONE;
2201 : }
2202 : // Generate generic version of the node and bind the label for later use.
2203 413600 : macro_assembler->Bind(&label_);
2204 413600 : return CONTINUE;
2205 : }
2206 :
2207 : // We are being asked to make a non-generic version. Keep track of how many
2208 : // non-generic versions we generate so as not to overdo it.
2209 1183480 : trace_count_++;
2210 2361298 : if (KeepRecursing(compiler) && compiler->optimize() &&
2211 : trace_count_ < kMaxCopiesCodeGenerated) {
2212 : return CONTINUE;
2213 : }
2214 :
2215 : // If we get here code has been generated for this node too many times or
2216 : // recursion is too deep. Time to switch to a generic version. The code for
2217 : // generic versions above can handle deep recursion properly.
// The previous limiting_recursion setting is restored after the flush.
2218 : bool was_limiting = compiler->limiting_recursion();
2219 : compiler->set_limiting_recursion(true);
2220 482862 : trace->Flush(compiler, this);
2221 : compiler->set_limiting_recursion(was_limiting);
2222 482862 : return DONE;
2223 : }
2224 :
2225 :
// Returns true while the compiler is not in limiting-recursion mode and its
// recursion depth does not exceed RegExpCompiler::kMaxRecursion.
2226 3946217 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2227 3946217 : return !compiler->limiting_recursion() &&
2228 0 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2229 : }
2230 :
2231 :
// An action node contributes no characters itself: the answer is that of the
// successor, queried with one unit less of |budget|. Returns 0 when the budget
// is exhausted or when the action is POSITIVE_SUBMATCH_SUCCESS.
2232 701508 : int ActionNode::EatsAtLeast(int still_to_find,
2233 : int budget,
2234 : bool not_at_start) {
2235 701508 : if (budget <= 0) return 0;
2236 685384 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2237 680617 : return on_success()->EatsAtLeast(still_to_find,
2238 : budget - 1,
2239 680617 : not_at_start);
2240 : }
2241 :
2242 :
// Forwards the request to the successor with one unit less of |budget|,
// except for POSITIVE_SUBMATCH_SUCCESS actions where the successor is not
// consulted, then records the result with SaveBMInfo. Unlike EatsAtLeast,
// |budget| is not checked here before recursing.
2243 101777 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2244 : BoyerMooreLookahead* bm, bool not_at_start) {
2245 101777 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2246 101777 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2247 : }
2248 : SaveBMInfo(bm, not_at_start, offset);
2249 101777 : }
2250 :
2251 :
// An assertion consumes no characters itself, so the answer normally comes
// from the successor (with one unit less of |budget|). Returns 0 when the
// budget is exhausted.
2252 16817 : int AssertionNode::EatsAtLeast(int still_to_find,
2253 : int budget,
2254 15956 : bool not_at_start) {
2255 16817 : if (budget <= 0) return 0;
2256 : // If we know we are not at the start and we are asked "how many characters
2257 : // will you match if you succeed?" then we can answer anything since false
2258 : // implies false. So let's just return the max answer (still_to_find) since
2259 : // that won't prevent us from preloading a lot of characters for the other
2260 : // branches in the node graph.
2261 15956 : if (assertion_type() == AT_START && not_at_start) return still_to_find;
2262 15718 : return on_success()->EatsAtLeast(still_to_find,
2263 : budget - 1,
2264 15718 : not_at_start);
2265 : }
2266 :
2267 :
// Forwards the request to the successor with one unit less of |budget| and
// records the result with SaveBMInfo. For an AT_START assertion that is known
// not to be at the start, nothing is filled in or saved.
2268 1072 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2269 1072 : BoyerMooreLookahead* bm, bool not_at_start) {
2270 : // Match the behaviour of EatsAtLeast on this node.
2271 2144 : if (assertion_type() == AT_START && not_at_start) return;
2272 1055 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2273 : SaveBMInfo(bm, not_at_start, offset);
2274 : }
2275 :
2276 :
// The back reference itself is counted as contributing nothing; the answer is
// that of the successor, queried with one unit less of |budget|. Returns 0 for
// backward-reading nodes and when the budget is exhausted.
2277 3390 : int BackReferenceNode::EatsAtLeast(int still_to_find,
2278 : int budget,
2279 3390 : bool not_at_start) {
2280 3390 : if (read_backward()) return 0;
2281 3280 : if (budget <= 0) return 0;
2282 3280 : return on_success()->EatsAtLeast(still_to_find,
2283 : budget - 1,
2284 3280 : not_at_start);
2285 : }
2286 :
2287 :
// Returns Length() of this text node, plus what the successor reports if that
// is still less than |still_to_find| and budget remains. Backward-reading
// nodes report 0.
2288 6621837 : int TextNode::EatsAtLeast(int still_to_find,
2289 : int budget,
2290 6621837 : bool not_at_start) {
2291 6621837 : if (read_backward()) return 0;
2292 6620329 : int answer = Length();
// Already enough, or no budget left to look further.
2293 6620329 : if (answer >= still_to_find) return answer;
2294 4012424 : if (budget <= 0) return answer;
2295 : // We are not at start after this node so we set the last argument to 'true'.
2296 2806268 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
2297 : budget - 1,
2298 2806268 : true);
2299 : }
2300 :
2301 :
// Only the continuation (alternative 1) is consulted, with one unit less of
// |budget|; alternative 0 is not taken into account. Returns 0 when the budget
// is exhausted.
2302 9517 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2303 : bool not_at_start) {
2304 9517 : if (budget <= 0) return 0;
2305 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2306 : // afterwards.
2307 18830 : RegExpNode* node = alternatives_->at(1).node();
2308 9415 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2309 : }
2310 :
2311 :
// Delegates entirely to the continuation (alternative 1), passing all
// arguments through unchanged.
2312 3766 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2313 : QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2314 : bool not_at_start) {
2315 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2316 : // afterwards.
2317 7532 : RegExpNode* node = alternatives_->at(1).node();
2318 3766 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2319 : }
2320 :
2321 :
// Returns the minimum of EatsAtLeast over all alternatives except
// |ignore_this_node| (pass nullptr to consider every alternative). The
// remaining budget, after charging one unit for this node, is divided evenly
// among the alternatives. Returns 0 when the budget is exhausted or as soon as
// one alternative reports 0. The result is capped at the initial value 100,
// which is also what is returned if every alternative is skipped.
2322 8298475 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2323 : int budget,
2324 : RegExpNode* ignore_this_node,
2325 : bool not_at_start) {
2326 8298475 : if (budget <= 0) return 0;
2327 : int min = 100;
2328 5747350 : int choice_count = alternatives_->length();
2329 5747350 : budget = (budget - 1) / choice_count;
2330 12537192 : for (int i = 0; i < choice_count; i++) {
2331 24402192 : RegExpNode* node = alternatives_->at(i).node();
2332 12201096 : if (node == ignore_this_node) continue;
2333 : int node_eats_at_least =
2334 12020932 : node->EatsAtLeast(still_to_find, budget, not_at_start);
2335 12020932 : if (node_eats_at_least < min) min = node_eats_at_least;
// Nothing can lower the minimum further.
2336 12020932 : if (min == 0) return 0;
2337 : }
2338 : return min;
2339 : }
2340 :
2341 :
// Same as ChoiceNode::EatsAtLeast, except that the alternative equal to
// loop_node_ is left out of the minimum and one extra unit of |budget| is
// charged before delegating to EatsAtLeastHelper.
2342 189573 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
2343 : int budget,
2344 : bool not_at_start) {
2345 : return EatsAtLeastHelper(still_to_find,
2346 : budget - 1,
2347 : loop_node_,
2348 189573 : not_at_start);
2349 : }
2350 :
2351 :
// Minimum of EatsAtLeast over all alternatives; no alternative is excluded.
2352 8108902 : int ChoiceNode::EatsAtLeast(int still_to_find,
2353 : int budget,
2354 : bool not_at_start) {
2355 8108902 : return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2356 : }
2357 :
2358 :
2359 : // Takes the left-most 1-bit and smears it out, setting all bits to its right.
// For example 0x00010000 becomes 0x0001FFFF; 0 stays 0. The five shifts
// (1, 2, 4, 8, 16) together cover all 32 bit positions.
2360 : static inline uint32_t SmearBitsRight(uint32_t v) {
2361 301434 : v |= v >> 1;
2362 301434 : v |= v >> 2;
2363 301434 : v |= v >> 4;
2364 301434 : v |= v >> 8;
2365 301434 : v |= v >> 16;
2366 : return v;
2367 : }
2368 :
2369 :
// Combines the per-position masks and values of the first characters_
// positions into the single words mask_ and value_. Position i is placed at
// bit offset 8 * i when |asc| is true (one-byte subject) and 16 * i otherwise.
// Returns true if at least one position's mask has a bit in common with
// String::kMaxOneByteCharCode, i.e. if the combined check tests anything in
// the low bits of some character.
2370 276914 : bool QuickCheckDetails::Rationalize(bool asc) {
2371 : bool found_useful_op = false;
2372 : uint32_t char_mask;
2373 276914 : if (asc) {
2374 : char_mask = String::kMaxOneByteCharCode;
2375 : } else {
2376 : char_mask = String::kMaxUtf16CodeUnit;
2377 : }
2378 276914 : mask_ = 0;
2379 276914 : value_ = 0;
2380 : int char_shift = 0;
2381 788605 : for (int i = 0; i < characters_; i++) {
2382 511691 : Position* pos = &positions_[i];
// Note: this test uses the one-byte mask regardless of |asc|.
2383 511691 : if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2384 : found_useful_op = true;
2385 : }
2386 511691 : mask_ |= (pos->mask & char_mask) << char_shift;
2387 511691 : value_ |= (pos->value & char_mask) << char_shift;
2388 511691 : char_shift += asc ? 8 : 16;
2389 : }
2390 276914 : return found_useful_op;
2391 : }
2392 :
2393 :
// Tries to emit a quick mask-and-compare test over the next
// details->characters() characters, using the information gathered by
// GetQuickCheckDetails.
// Returns false, without emitting anything, if no characters are requested,
// if the details say the node cannot match, or if Rationalize finds no useful
// test. Otherwise returns true after emitting:
//  - a (multi-character) load of the current position, unless |trace| already
//    has exactly that many characters preloaded; a failed bounds check in that
//    load goes to |bounds_check_trace|'s backtrack label;
//  - if |fall_through_on_failure|: a branch to |on_possible_success| when the
//    quick check passes, falling through otherwise;
//  - else: a branch to |trace|'s backtrack label when the quick check fails.
2394 1235811 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2395 73751 : Trace* bounds_check_trace,
2396 614266 : Trace* trace,
2397 : bool preload_has_checked_bounds,
2398 : Label* on_possible_success,
2399 1474676 : QuickCheckDetails* details,
2400 : bool fall_through_on_failure) {
2401 523071 : if (details->characters() == 0) return false;
2402 : GetQuickCheckDetails(
2403 554062 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2404 277031 : if (details->cannot_match()) return false;
2405 276914 : if (!details->Rationalize(compiler->one_byte())) return false;
2406 : DCHECK(details->characters() == 1 ||
2407 : compiler->macro_assembler()->CanReadUnaligned());
2408 : uint32_t mask = details->mask();
2409 : uint32_t value = details->value();
2410 :
2411 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2412 :
2413 224858 : if (trace->characters_preloaded() != details->characters()) {
2414 : DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2415 : // We are attempting to preload the minimum number of characters
2416 : // any choice would eat, so if the bounds check fails, then none of the
2417 : // choices can succeed, so we can just immediately backtrack, rather
2418 : // than go to the next choice.
2419 : assembler->LoadCurrentCharacter(trace->cp_offset(),
2420 : bounds_check_trace->backtrack(),
2421 73751 : !preload_has_checked_bounds,
2422 221253 : details->characters());
2423 : }
2424 :
2425 :
// The AND with |mask| can be skipped when the mask covers every bit that the
// load produced.
2426 : bool need_mask = true;
2427 :
2428 224858 : if (details->characters() == 1) {
2429 : // If number of characters preloaded is 1 then we used a byte or 16 bit
2430 : // load so the value is already masked down.
2431 : uint32_t char_mask;
2432 46723 : if (compiler->one_byte()) {
2433 : char_mask = String::kMaxOneByteCharCode;
2434 : } else {
2435 : char_mask = String::kMaxUtf16CodeUnit;
2436 : }
2437 46723 : if ((mask & char_mask) == char_mask) need_mask = false;
2438 : mask &= char_mask;
2439 : } else {
2440 : // For 2-character preloads in one-byte mode or 1-character preloads in
2441 : // two-byte mode we also use a 16 bit load with zero extend.
2442 : static const uint32_t kTwoByteMask = 0xffff;
2443 : static const uint32_t kFourByteMask = 0xffffffff;
2444 342380 : if (details->characters() == 2 && compiler->one_byte()) {
2445 162515 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
// NOTE(review): this looks unreachable - we are in the else-branch of
// characters() == 1, and the listing shows zero hits for its body.
2446 15620 : } else if (details->characters() == 1 && !compiler->one_byte()) {
2447 0 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2448 : } else {
2449 15620 : if (mask == kFourByteMask) need_mask = false;
2450 : }
2451 : }
2452 :
2453 224858 : if (fall_through_on_failure) {
2454 186232 : if (need_mask) {
2455 57017 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2456 : } else {
2457 129215 : assembler->CheckCharacter(value, on_possible_success);
2458 : }
2459 : } else {
2460 38626 : if (need_mask) {
2461 13470 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2462 : } else {
2463 63782 : assembler->CheckNotCharacter(value, trace->backtrack());
2464 : }
2465 : }
2466 : return true;
2467 : }
2468 :
2469 :
2470 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
2471 : // super-class in the .h file).
2472 : //
2473 : // We iterate along the text object, building up for each character a
2474 : // mask and value that can be used to test for a quick failure to match.
2475 : // The masks and values for the positions will be combined into a single
2476 : // machine word for the current character width in order to be used in
2477 : // generating a quick check.
// Fills in details->positions(i) (mask, value, determines_perfectly) for
// successive character positions, starting at index characters_filled_in,
// from the atoms and character classes held by this text node.
//   details              - receives the per-position data; its characters()
//                          is the number of positions that should be filled.
//   compiler             - supplies one_byte() and ignore_case().
//   characters_filled_in - number of positions already filled in by the caller.
//   not_at_start         - not read by this function; the successor node is
//                          always called with true.
// Returns early after details->set_cannot_match() when an element cannot match
// in the current character width. If positions are still unfilled once every
// element has been processed, the request is forwarded to on_success().
2478 1776513 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2479 1039771 : RegExpCompiler* compiler,
2480 : int characters_filled_in,
2481 1091448 : bool not_at_start) {
2482 : // Do not collect any quick check details if the text node reads backward,
2483 : // since it reads in the opposite direction than we use for quick checks.
2484 508078 : if (read_backward()) return;
2485 508078 : Isolate* isolate = compiler->macro_assembler()->isolate();
2486 : DCHECK(characters_filled_in < details->characters());
2487 : int characters = details->characters();
// char_mask is the largest code unit value for the subject width in use
// (one-byte or UTF-16); it doubles as the "all bits significant" mask.
2488 : int char_mask;
2489 508078 : if (compiler->one_byte()) {
2490 : char_mask = String::kMaxOneByteCharCode;
2491 : } else {
2492 : char_mask = String::kMaxUtf16CodeUnit;
2493 : }
2494 1166740 : for (int k = 0; k < elements()->length(); k++) {
2495 513358 : TextElement elm = elements()->at(k);
2496 513358 : if (elm.text_type() == TextElement::ATOM) {
2497 : Vector<const uc16> quarks = elm.atom()->data();
2498 1162556 : for (int i = 0; i < characters && i < quarks.length(); i++) {
2499 : QuickCheckDetails::Position* pos =
2500 542953 : details->positions(characters_filled_in);
2501 1085906 : uc16 c = quarks[i];
2502 542953 : if (compiler->ignore_case()) {
2503 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2504 : int length = GetCaseIndependentLetters(isolate, c,
2505 23615 : compiler->one_byte(), chars);
2506 23615 : if (length == 0) {
2507 : // This can happen because all case variants are non-Latin1, but we
2508 : // know the input is Latin1.
2509 : details->set_cannot_match();
2510 30 : pos->determines_perfectly = false;
2511 30 : return;
2512 : }
2513 23585 : if (length == 1) {
2514 : // This letter has no case equivalents, so it's nice and simple
2515 : // and the mask-compare will determine definitely whether we have
2516 : // a match at this character position.
2517 3288 : pos->mask = char_mask;
2518 3288 : pos->value = c;
2519 3288 : pos->determines_perfectly = true;
2520 : } else {
// Accumulate over the case variants: any bit on which a variant disagrees
// with the bits gathered so far is removed from common_bits, and bits is
// reduced to the bits that are still common.
2521 20297 : uint32_t common_bits = char_mask;
2522 20297 : uint32_t bits = chars[0];
2523 40937 : for (int j = 1; j < length; j++) {
2524 20640 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2525 20640 : common_bits ^= differing_bits;
2526 20640 : bits &= common_bits;
2527 : }
2528 : // If length is 2 and common bits has only one zero in it then
2529 : // our mask and compare instruction will determine definitely
2530 : // whether we have a match at this character position. Otherwise
2531 : // it can only be an approximate check.
// ~one_zero is the set of bits inside char_mask that are not common;
// x & (x - 1) == 0 holds exactly when x has at most one bit set.
2532 20297 : uint32_t one_zero = (common_bits | ~char_mask);
2533 20297 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2534 19895 : pos->determines_perfectly = true;
2535 : }
2536 20297 : pos->mask = common_bits;
2537 20297 : pos->value = bits;
2538 : }
2539 : } else {
2540 : // Don't ignore case. Nice simple case where the mask-compare will
2541 : // determine definitely whether we have a match at this character
2542 : // position.
2543 519338 : if (c > char_mask) {
2544 : details->set_cannot_match();
2545 30 : pos->determines_perfectly = false;
2546 30 : return;
2547 : }
2548 519308 : pos->mask = char_mask;
2549 519308 : pos->value = c;
2550 519308 : pos->determines_perfectly = true;
2551 : }
2552 542893 : characters_filled_in++;
2553 : DCHECK(characters_filled_in <= details->characters());
2554 542893 : if (characters_filled_in == details->characters()) {
2555 : return;
2556 : }
2557 : }
2558 : } else {
2559 : QuickCheckDetails::Position* pos =
2560 147512 : details->positions(characters_filled_in);
2561 : RegExpCharacterClass* tree = elm.char_class();
2562 604194 : ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2563 147512 : if (tree->is_negated()) {
2564 : // A quick check uses multi-character mask and compare. There is no
2565 : // useful way to incorporate a negative char class into this scheme
2566 : // so we just conservatively create a mask and value that will always
2567 : // succeed.
2568 3656 : pos->mask = 0;
2569 3656 : pos->value = 0;
2570 : } else {
// Skip leading ranges that start above char_mask. If every range starts
// above it, the class cannot match in this character width.
2571 : int first_range = 0;
2572 143900 : while (ranges->at(first_range).from() > char_mask) {
2573 104 : first_range++;
2574 104 : if (first_range == ranges->length()) {
2575 : details->set_cannot_match();
2576 60 : pos->determines_perfectly = false;
2577 : return;
2578 : }
2579 : }
2580 143796 : CharacterRange range = ranges->at(first_range);
2581 143796 : uc16 from = range.from();
2582 143796 : uc16 to = range.to();
// Clamp the upper end of the range to the representable maximum.
2583 143796 : if (to > char_mask) {
2584 28640 : to = char_mask;
2585 : }
2586 143796 : uint32_t differing_bits = (from ^ to);
2587 : // A mask and compare is only perfect if the differing bits form a
2588 : // number like 00011111 with one single block of trailing 1s.
2589 255069 : if ((differing_bits & (differing_bits + 1)) == 0 &&
2590 111273 : from + differing_bits == to) {
2591 101671 : pos->determines_perfectly = true;
2592 : }
2593 143796 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
2594 143796 : uint32_t bits = (from & common_bits);
// Note: range, from, to, differing_bits and i declared inside this loop
// shadow the outer variables of the same names.
2595 913156 : for (int i = first_range + 1; i < ranges->length(); i++) {
2596 312782 : CharacterRange range = ranges->at(i);
2597 312782 : uc16 from = range.from();
2598 312782 : uc16 to = range.to();
2599 312782 : if (from > char_mask) continue;
2600 157638 : if (to > char_mask) to = char_mask;
2601 : // Here we are combining more ranges into the mask and compare
2602 : // value. With each new range the mask becomes more sparse and
2603 : // so the chances of a false positive rise. A character class
2604 : // with multiple ranges is assumed never to be equivalent to a
2605 : // mask and compare operation.
2606 157638 : pos->determines_perfectly = false;
2607 157638 : uint32_t new_common_bits = (from ^ to);
2608 157638 : new_common_bits = ~SmearBitsRight(new_common_bits);
2609 157638 : common_bits &= new_common_bits;
2610 157638 : bits &= new_common_bits;
2611 157638 : uint32_t differing_bits = (from & common_bits) ^ bits;
2612 157638 : common_bits ^= differing_bits;
2613 157638 : bits &= common_bits;
2614 : }
2615 143796 : pos->mask = common_bits;
2616 143796 : pos->value = bits;
2617 : }
2618 147452 : characters_filled_in++;
2619 : DCHECK(characters_filled_in <= details->characters());
2620 147452 : if (characters_filled_in == details->characters()) {
2621 : return;
2622 : }
2623 : }
2624 : }
// Positions remain unfilled after all elements of this node: let the
// successor node fill them in, unless a match was already ruled out.
2625 : DCHECK(characters_filled_in != details->characters());
2626 70012 : if (!details->cannot_match()) {
2627 70012 : on_success()-> GetQuickCheckDetails(details,
2628 : compiler,
2629 : characters_filled_in,
2630 70012 : true);
2631 : }
2632 : }
2633 :
2634 :
// Resets every position currently in use (indices below characters_) to
// mask 0, value 0 and determines_perfectly false, then sets characters_ to 0.
2635 0 : void QuickCheckDetails::Clear() {
2636 368215 : for (int i = 0; i < characters_; i++) {
2637 368215 : positions_[i].mask = 0;
2638 368215 : positions_[i].value = 0;
2639 368215 : positions_[i].determines_perfectly = false;
2640 : }
2641 1172690 : characters_ = 0;
2642 0 : }
2643 :
2644 :
// Discards the first `by` positions and shifts the remaining ones down so
// that they start at index 0.
//   by       - number of leading positions to drop. If by >= characters_ or
//              by is negative, all positions are cleared instead.
//   one_byte - not read by this function.
2645 554752 : void QuickCheckDetails::Advance(int by, bool one_byte) {
2646 554752 : if (by >= characters_ || by < 0) {
2647 : DCHECK_IMPLIES(by < 0, characters_ == 0);
2648 : Clear();
2649 554752 : return;
2650 : }
2651 : DCHECK_LE(characters_ - by, 4);
2652 : DCHECK_LE(characters_, 4);
// Move the surviving positions to the front.
2653 44247 : for (int i = 0; i < characters_ - by; i++) {
2654 44247 : positions_[i] = positions_[by + i];
2655 : }
// Reset the slots vacated at the tail, [characters_ - by, characters_).
2656 37645 : for (int i = characters_ - by; i < characters_; i++) {
2657 37645 : positions_[i].mask = 0;
2658 37645 : positions_[i].value = 0;
2659 37645 : positions_[i].determines_perfectly = false;
2660 : }
2661 34940 : characters_ -= by;
2662 : // We could change mask_ and value_ here but we would never advance unless
2663 : // they had already been used in a check and they won't be used again because
2664 : // it would gain us nothing. So there's no point.
2665 : }
2666 :
2667 :
// Merges the details of `other` into this object for positions from_index and
// upwards.
//   - If other is marked cannot-match, this object is left unchanged.
//   - If this object is marked cannot-match, it becomes a copy of other.
//   - Otherwise, per position, the resulting mask keeps only the bits that are
//     set in both masks and on which the two (masked) values agree, and the
//     value is reduced to those bits.
// Note: other's position values are masked in place as a side effect.
2668 173623 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2669 : DCHECK(characters_ == other->characters_);
2670 173623 : if (other->cannot_match_) {
2671 : return;
2672 : }
2673 173535 : if (cannot_match_) {
2674 286 : *this = *other;
2675 286 : return;
2676 : }
2677 200331 : for (int i = from_index; i < characters_; i++) {
2678 200331 : QuickCheckDetails::Position* pos = positions(i);
2679 200331 : QuickCheckDetails::Position* other_pos = other->positions(i);
2680 235767 : if (pos->mask != other_pos->mask ||
2681 41770 : pos->value != other_pos->value ||
2682 6334 : !other_pos->determines_perfectly) {
2683 : // Our mask-compare operation will be approximate unless we have the
2684 : // exact same operation on both sides of the alternation.
2685 198131 : pos->determines_perfectly = false;
2686 : }
// Restrict both values to the shared mask, then drop from the mask every
// bit on which the two values still differ.
2687 200331 : pos->mask &= other_pos->mask;
2688 200331 : pos->value &= pos->mask;
2689 200331 : other_pos->value &= pos->mask;
2690 200331 : uc16 differing_bits = (pos->value ^ other_pos->value);
2691 200331 : pos->mask &= ~differing_bits;
2692 200331 : pos->value &= pos->mask;
2693 : }
2694 : }
2695 :
2696 :
// Scoped guard around a NodeInfo: the constructor sets info->visited to true
// (asserting in debug builds that it was false) and the destructor sets it
// back to false.
2697 : class VisitMarker {
2698 : public:
2699 : explicit VisitMarker(NodeInfo* info) : info_(info) {
2700 : DCHECK(!info->visited);
2701 300319 : info->visited = true;
2702 : }
2703 : ~VisitMarker() {
2704 265465 : info_->visited = false;
2705 : }
2706 : private:
// The NodeInfo whose visited flag is reset on destruction.
2707 : NodeInfo* info_;
2708 : };
2709 :
2710 :
// Returns the previously computed replacement if one exists, or this node
// unchanged when depth is negative. Otherwise marks the node as visited for
// the duration of the call and returns the result of filtering the successor.
//   depth       - remaining recursion budget.
//   ignore_case - passed through to the successor.
2711 148351 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2712 148351 : if (info()->replacement_calculated) return replacement();
2713 106729 : if (depth < 0) return this;
2714 : DCHECK(!info()->visited);
2715 106540 : VisitMarker marker(info());
// Note: FilterSuccessor itself passes depth - 1 on to the successor, so
// the budget drops by two per sequential node along this path.
2716 : return FilterSuccessor(depth - 1, ignore_case);
2717 : }
2718 :
2719 :
// Filters on_success_ with depth - 1. If the successor is filtered away
// (nullptr), records nullptr as this node's replacement. Otherwise adopts the
// filtered successor as on_success_ and records this node as its own
// replacement. Returns whatever set_replacement() returns in either case.
2720 0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2721 200810 : RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2722 200810 : if (next == nullptr) return set_replacement(nullptr);
2723 200286 : on_success_ = next;
2724 200286 : return set_replacement(this);
2725 : }
2726 :
2727 :
2728 : // We need to check for the following characters: 0x39c 0x3bc 0x178.
// Returns true if `range` contains any of U+039C (GREEK CAPITAL LETTER MU),
// U+03BC (GREEK SMALL LETTER MU) or U+0178 (LATIN CAPITAL LETTER Y WITH
// DIAERESIS).
// NOTE(review): presumably these are singled out because their case variants
// U+00B5 and U+00FF lie inside Latin-1 -- confirm against the case tables.
2729 9927 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2730 : // TODO(dcarney): this could be a lot more efficient.
2731 9839 : return range.Contains(0x39c) ||
2732 19741 : range.Contains(0x3bc) || range.Contains(0x178);
2733 : }
2734 :
2735 :
// Returns true if at least one range in `ranges` satisfies
// RangeContainsLatin1Equivalents; returns false for an empty list.
2736 67 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2737 78 : for (int i = 0; i < ranges->length(); i++) {
2738 : // TODO(dcarney): this could be a lot more efficient.
2739 34 : if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2740 : }
2741 : return false;
2742 : }
2743 :
2744 :
// Filters this text node for use against one-byte subjects.
// Returns the cached replacement if already calculated, or this node when
// depth is negative. Records a nullptr replacement as soon as an element is
// found that cannot match a one-byte subject:
//   - an atom character above String::kMaxOneByteCharCode, unless ignore_case
//     is set and ConvertNonLatin1ToLatin1 yields a non-zero Latin-1 character
//     (the atom is then rewritten in place with that character);
//   - a negated class whose first canonical range starts at 0 and reaches at
//     least kMaxOneByteCharCode;
//   - a non-negated class that is empty or whose first canonical range starts
//     above kMaxOneByteCharCode.
// The two character-class cases are let through when ignore_case is set and
// the ranges contain one of the special Latin-1 equivalents.
// If every element survives, the successor is filtered via FilterSuccessor.
2745 295499 : RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2746 154749 : if (info()->replacement_calculated) return replacement();
2747 94780 : if (depth < 0) return this;
2748 : DCHECK(!info()->visited);
2749 94726 : VisitMarker marker(info());
2750 94726 : int element_count = elements()->length();
2751 194206 : for (int i = 0; i < element_count; i++) {
2752 99936 : TextElement elm = elements()->at(i);
2753 99936 : if (elm.text_type() == TextElement::ATOM) {
2754 : Vector<const uc16> quarks = elm.atom()->data();
2755 448222 : for (int j = 0; j < quarks.length(); j++) {
2756 356560 : uint16_t c = quarks[j];
2757 178280 : if (c <= String::kMaxOneByteCharCode) continue;
2758 348 : if (!ignore_case) return set_replacement(nullptr);
2759 : // Here, we need to check for characters whose upper and lower cases
2760 : // are outside the Latin-1 range.
2761 : uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2762 : // Character is outside Latin-1 completely
2763 196 : if (converted == 0) return set_replacement(nullptr);
2764 : // Convert quark to Latin-1 in place.
// Note: this writes through a const_cast, so the atom's own character
// data is modified, not a copy.
2765 : uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2766 40 : copy[j] = converted;
2767 : }
2768 : } else {
2769 : DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2770 : RegExpCharacterClass* cc = elm.char_class();
2771 53912 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2772 53912 : CharacterRange::Canonicalize(ranges);
2773 : // Now they are in order so we only need to look at the first.
2774 : int range_count = ranges->length();
2775 53912 : if (cc->is_negated()) {
// The negated class excludes everything from 0 up to at least the
// one-byte maximum.
2776 8304 : if (range_count != 0 &&
2777 8484 : ranges->at(0).from() == 0 &&
2778 180 : ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2779 : // This will be handled in a later filter.
2780 38 : if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2781 37 : return set_replacement(nullptr);
2782 : }
2783 : } else {
// The class is empty or starts above the one-byte maximum.
2784 99515 : if (range_count == 0 ||
2785 49755 : ranges->at(0).from() > String::kMaxOneByteCharCode) {
2786 : // This will be handled in a later filter.
2787 248 : if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2788 226 : return set_replacement(nullptr);
2789 : }
2790 : }
2791 : }
2792 : }
2793 94270 : return FilterSuccessor(depth - 1, ignore_case);
2794 : }
2795 :
2796 :
// Filters a loop node for one-byte subjects.
// Returns the cached replacement if already calculated, or this node when
// depth is negative or the node is currently being visited. Otherwise filters
// continue_node_ first: if that yields nullptr the whole loop is replaced by
// nullptr; if not, the alternatives are filtered by ChoiceNode::FilterOneByte.
2797 87666 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2798 87666 : if (info()->replacement_calculated) return replacement();
2799 66676 : if (depth < 0) return this;
2800 66569 : if (info()->visited) return this;
// The marker is confined to this inner scope so that visited is false again
// before ChoiceNode::FilterOneByte runs its own visited check below.
2801 : {
2802 35218 : VisitMarker marker(info());
2803 :
// Note: the filtered continue node is only tested against nullptr here; it
// is not stored back into continue_node_ by this function.
2804 : RegExpNode* continue_replacement =
2805 35218 : continue_node_->FilterOneByte(depth - 1, ignore_case);
2806 : // If we can't continue after the loop then there is no sense in doing the
2807 : // loop.
2808 35218 : if (continue_replacement == nullptr) return set_replacement(nullptr);
2809 : }
2810 :
2811 34854 : return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2812 : }
2813 :
2814 :
// Filters the alternatives of this choice node for one-byte subjects.
// Returns the cached replacement if already calculated, or this node when
// depth is negative or the node is currently being visited. Result:
//   - this node unchanged, if any alternative has a non-empty guard list;
//   - nullptr, if no alternative survives filtering;
//   - the single surviving alternative's replacement, if exactly one survives;
//   - this node, with alternatives_ updated (and rebuilt when some were
//     dropped), if two or more survive.
2815 43357 : RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2816 43513 : if (info()->replacement_calculated) return replacement();
2817 41271 : if (depth < 0) return this;
2818 41169 : if (info()->visited) return this;
2819 41169 : VisitMarker marker(info());
2820 41169 : int choice_count = alternatives_->length();
2821 :
// Pass 1: leave the node untouched if any alternative carries guards.
2822 128276 : for (int i = 0; i < choice_count; i++) {
2823 90828 : GuardedAlternative alternative = alternatives_->at(i);
2824 94549 : if (alternative.guards() != nullptr &&
2825 3721 : alternative.guards()->length() != 0) {
2826 3721 : set_replacement(this);
2827 : return this;
2828 : }
2829 : }
2830 :
// Pass 2: filter every alternative, counting the survivors and remembering
// the last one.
2831 : int surviving = 0;
2832 : RegExpNode* survivor = nullptr;
2833 86407 : for (int i = 0; i < choice_count; i++) {
2834 172814 : GuardedAlternative alternative = alternatives_->at(i);
2835 : RegExpNode* replacement =
2836 86407 : alternative.node()->FilterOneByte(depth - 1, ignore_case);
2837 : DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2838 86407 : if (replacement != nullptr) {
2839 86244 : alternatives_->at(i).set_node(replacement);
2840 86244 : surviving++;
2841 : survivor = replacement;
2842 : }
2843 : }
// With zero or one survivor the choice node itself is not needed:
// survivor is nullptr or the only remaining alternative.
2844 37523 : if (surviving < 2) return set_replacement(survivor);
2845 :
2846 37373 : set_replacement(this);
2847 37373 : if (surviving == choice_count) {
2848 : return this;
2849 : }
2850 : // Only some of the nodes survived the filtering. We need to rebuild the
2851 : // alternatives list.
// NOTE(review): FilterOneByte is invoked a second time on each alternative
// here; presumably the nodes answer from the replacement cached during the
// pass above -- confirm.
2852 : ZoneList<GuardedAlternative>* new_alternatives =
2853 22 : new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2854 208 : for (int i = 0; i < choice_count; i++) {
2855 : RegExpNode* replacement =
2856 372 : alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2857 186 : if (replacement != nullptr) {
2858 134 : alternatives_->at(i).set_node(replacement);
2859 268 : new_alternatives->Add(alternatives_->at(i), zone());
2860 : }
2861 : }
2862 22 : alternatives_ = new_alternatives;
2863 22 : return this;
2864 : }
2865 :
2866 :
// Filters a negative-lookaround choice node for one-byte subjects.
// Returns the cached replacement if already calculated, or this node when
// depth is negative or the node is currently being visited. Otherwise:
//   - if the continuation (alternative 1) is filtered away, the replacement
//     is nullptr;
//   - if the lookaround body (alternative 0) is filtered away, the
//     replacement is the filtered continuation alone;
//   - otherwise both alternatives are updated and this node is kept.
2867 331 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2868 : bool ignore_case) {
2869 331 : if (info()->replacement_calculated) return replacement();
2870 331 : if (depth < 0) return this;
2871 331 : if (info()->visited) return this;
2872 331 : VisitMarker marker(info());
2873 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2874 : // afterwards.
2875 662 : RegExpNode* node = alternatives_->at(1).node();
2876 331 : RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2877 337 : if (replacement == nullptr) return set_replacement(nullptr);
2878 325 : alternatives_->at(1).set_node(replacement);
2879 :
2880 650 : RegExpNode* neg_node = alternatives_->at(0).node();
2881 325 : RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2882 : // If the negative lookahead is always going to fail then
2883 : // we don't need to check it.
2884 331 : if (neg_replacement == nullptr) return set_replacement(replacement);
2885 319 : alternatives_->at(0).set_node(neg_replacement);
2886 638 : return set_replacement(this);
2887 : }
2888 :
2889 :
// Quick-check details for a loop node. Contributes nothing (details is left
// as passed in) when the loop body can be zero length or when this node is
// already being visited; otherwise marks the node visited for the duration
// of the call and delegates to ChoiceNode::GetQuickCheckDetails with the same
// arguments.
2890 33937 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2891 : RegExpCompiler* compiler,
2892 : int characters_filled_in,
2893 : bool not_at_start) {
2894 33937 : if (body_can_be_zero_length_ || info()->visited) return;
2895 22335 : VisitMarker marker(info());
2896 : return ChoiceNode::GetQuickCheckDetails(details,
2897 : compiler,
2898 : characters_filled_in,
2899 22335 : not_at_start);
2900 : }
2901 :
2902 :
// Fills in Boyer-Moore lookahead information for a loop node.
// When the body can be zero length or the budget is used up (budget <= 0),
// calls bm->SetRest(offset) instead of descending. Otherwise delegates to
// ChoiceNode::FillInBMInfo with the budget reduced by one. In both cases the
// result is recorded with SaveBMInfo(bm, not_at_start, offset).
2903 14030 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2904 : BoyerMooreLookahead* bm, bool not_at_start) {
2905 14030 : if (body_can_be_zero_length_ || budget <= 0) {
2906 : bm->SetRest(offset);
2907 : SaveBMInfo(bm, not_at_start, offset);
2908 14030 : return;
2909 : }
2910 13818 : ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2911 : SaveBMInfo(bm, not_at_start, offset);
2912 : }
2913 :
2914 :
// Quick-check details for a choice node. The first alternative fills
// `details` directly; each further alternative fills a fresh
// QuickCheckDetails of the same width, which is then merged into `details`
// from position characters_filled_in onwards.
// not_at_start is treated as true if either the argument or the node's own
// not_at_start_ flag is set.
2915 229425 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2916 : RegExpCompiler* compiler,
2917 : int characters_filled_in,
2918 : bool not_at_start) {
2919 55802 : not_at_start = (not_at_start || not_at_start_);
2920 55802 : int choice_count = alternatives_->length();
2921 : DCHECK_LT(0, choice_count);
2922 55802 : alternatives_->at(0).node()->GetQuickCheckDetails(details,
2923 : compiler,
2924 : characters_filled_in,
2925 55802 : not_at_start);
2926 229425 : for (int i = 1; i < choice_count; i++) {
2927 : QuickCheckDetails new_details(details->characters());
2928 347246 : RegExpNode* node = alternatives_->at(i).node();
2929 : node->GetQuickCheckDetails(&new_details, compiler,
2930 : characters_filled_in,
2931 173623 : not_at_start);
2932 : // Here we merge the quick match details of the two branches.
2933 173623 : details->Merge(&new_details, characters_filled_in);
2934 : }
2935 55802 : }
2936 :
2937 :
2938 : // Check for [0-9A-Z_a-z].
// Emits code that classifies the current character as word / non-word.
//   assembler            - macro assembler to emit into.
//   word, non_word       - labels for the two outcomes.
//   fall_through_on_word - selects which outcome falls through: when true the
//                          'w' class check is requested with non_word as its
//                          label, otherwise the 'W' check with word as label.
// If the assembler reports that it handled the special class itself, nothing
// else is emitted; otherwise an explicit comparison sequence is generated.
2939 584 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
2940 : Label* word,
2941 : Label* non_word,
2942 : bool fall_through_on_word) {
2943 584 : if (assembler->CheckSpecialCharacterClass(
2944 : fall_through_on_word ? 'w' : 'W',
2945 584 : fall_through_on_word ? non_word : word)) {
2946 : // Optimized implementation available.
2947 584 : return;
2948 : }
// Fallback comparison ladder, read top to bottom:
//   above 'z' or below '0'      -> non_word
//   'a'..'z'                    -> word
//   '0'..'9'                    -> word
//   between '9' and 'A'         -> non_word
//   'A'..'Z'                    -> word
//   remaining candidates        -> word only if '_'
2949 0 : assembler->CheckCharacterGT('z', non_word);
2950 0 : assembler->CheckCharacterLT('0', non_word);
2951 0 : assembler->CheckCharacterGT('a' - 1, word);
2952 0 : assembler->CheckCharacterLT('9' + 1, word);
2953 0 : assembler->CheckCharacterLT('A', non_word);
2954 0 : assembler->CheckCharacterLT('Z' + 1, word);
2955 0 : if (fall_through_on_word) {
2956 0 : assembler->CheckNotCharacter('_', non_word);
2957 : } else {
2958 0 : assembler->CheckCharacter('_', word);
2959 : }
2960 : }
2961 :
2962 :
2963 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2964 : // that matches newline or the start of input).
// Emits the multiline '^' check and then the code for on_success.
//   compiler   - supplies the macro assembler and one_byte().
//   on_success - node to emit once the check has passed.
//   trace      - current trace; a copy with the current character invalidated
//                is used for everything emitted here.
// Failure of the check jumps to the copied trace's backtrack label.
2965 782 : static void EmitHat(RegExpCompiler* compiler,
2966 : RegExpNode* on_success,
2967 : Trace* trace) {
2968 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2969 : // We will be loading the previous character into the current character
2970 : // register.
2971 782 : Trace new_trace(*trace);
2972 : new_trace.InvalidateCurrentCharacter();
2973 :
2974 : Label ok;
2975 782 : if (new_trace.cp_offset() == 0) {
2976 : // The start of input counts as a newline in this context, so skip to
2977 : // ok if we are at the start.
2978 777 : assembler->CheckAtStart(&ok);
2979 : }
2980 : // We already checked that we are not at the start of input so it must be
2981 : // OK to load the previous character.
2982 782 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2983 : new_trace.backtrack(),
2984 1564 : false);
// Prefer the assembler's built-in 'n' class check; the explicit comparisons
// below are emitted only when it reports that it did not handle the class.
2985 782 : if (!assembler->CheckSpecialCharacterClass('n',
2986 782 : new_trace.backtrack())) {
2987 : // Newline means \n, \r, 0x2028 or 0x2029.
2988 0 : if (!compiler->one_byte()) {
// Masking with 0xfffe maps both 0x2028 and 0x2029 onto 0x2028, so one
// masked compare covers the two separators.
2989 0 : assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
2990 : }
2991 0 : assembler->CheckCharacter('\n', &ok);
2992 0 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
2993 : }
2994 782 : assembler->Bind(&ok);
2995 782 : on_success->Emit(compiler, &new_trace);
2996 782 : }
2997 :
2998 : // More makes code generation slower, less makes V8 benchmark score lower.
// Upper bound on the number of lookahead characters; in the code below it
// caps the EatsAtLeast result in AssertionNode::EmitBoundaryCheck.
2999 : const int kMaxLookaheadForBoyerMoore = 8;
3000 :
3001 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
// Emits the check for a word-boundary (AT_BOUNDARY) or non-word-boundary
// assertion.
// First tries to decide at compile time whether the next character is a word
// character, using lookahead info that is either already cached (bm_info) or
// built here. If that is undecided, a runtime test of the next character is
// emitted. In every case BacktrackIfPrevious emits the test of the previous
// character and the continuation.
3002 857 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3003 274 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3004 : Isolate* isolate = assembler->isolate();
3005 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3006 274 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3007 167 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3008 274 : if (lookahead == nullptr) {
// No cached lookahead: build one, capped at kMaxLookaheadForBoyerMoore
// characters, provided at least one character is guaranteed to follow.
3009 : int eats_at_least =
3010 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3011 : kRecursionBudget,
3012 215 : not_at_start));
3013 215 : if (eats_at_least >= 1) {
3014 108 : BoyerMooreLookahead* bm =
3015 108 : new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3016 108 : FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3017 108 : if (bm->at(0)->is_non_word())
3018 : next_is_word_character = Trace::FALSE_VALUE;
3019 108 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3020 : }
3021 : } else {
3022 59 : if (lookahead->at(0)->is_non_word())
3023 : next_is_word_character = Trace::FALSE_VALUE;
3024 59 : if (lookahead->at(0)->is_word())
3025 : next_is_word_character = Trace::TRUE_VALUE;
3026 : }
3027 274 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3028 274 : if (next_is_word_character == Trace::UNKNOWN) {
// Undecided at compile time: test the next character at run time and emit
// one previous-character check for each outcome.
3029 : Label before_non_word;
3030 : Label before_word;
3031 155 : if (trace->characters_preloaded() != 1) {
3032 308 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3033 : }
3034 : // Fall through on non-word.
3035 155 : EmitWordCheck(assembler, &before_word, &before_non_word, false);
3036 : // Next character is not a word character.
3037 155 : assembler->Bind(&before_non_word);
3038 : Label ok;
// Next is non-word: a boundary requires the previous character to be a
// word character, so for AT_BOUNDARY backtrack if it is non-word (and the
// reverse for the non-boundary assertion).
3039 155 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3040 155 : assembler->GoTo(&ok);
3041 :
3042 155 : assembler->Bind(&before_word);
// Next is a word character: the conditions are mirrored.
3043 155 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3044 155 : assembler->Bind(&ok);
3045 119 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3046 89 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3047 : } else {
3048 : DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3049 30 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3050 : }
3051 274 : }
3052 :
3053 :
// Emits code that inspects the character before the current position and
// backtracks if it is of the kind named by backtrack_if_previous (kIsWord or
// kIsNonWord); otherwise execution falls through to the code emitted for
// on_success().
//   compiler              - supplies the macro assembler.
//   trace                 - current trace; a copy with the current character
//                           invalidated is used for the emitted code.
//   backtrack_if_previous - which kind of previous character causes a
//                           backtrack.
3054 429 : void AssertionNode::BacktrackIfPrevious(
3055 429 : RegExpCompiler* compiler,
3056 : Trace* trace,
3057 : AssertionNode::IfPrevious backtrack_if_previous) {
3058 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3059 429 : Trace new_trace(*trace);
3060 : new_trace.InvalidateCurrentCharacter();
3061 :
3062 : Label fall_through, dummy;
3063 :
// Exactly one of non_word / word is the backtrack label; the other one is
// the local fall_through label bound just before the continuation.
3064 : Label* non_word = backtrack_if_previous == kIsNonWord ?
3065 204 : new_trace.backtrack() :
3066 429 : &fall_through;
3067 : Label* word = backtrack_if_previous == kIsNonWord ?
3068 : &fall_through :
3069 429 : new_trace.backtrack();
3070 :
3071 429 : if (new_trace.cp_offset() == 0) {
3072 : // The start of input counts as a non-word character, so the question is
3073 : // decided if we are at the start.
3074 163 : assembler->CheckAtStart(non_word);
3075 : }
3076 : // We already checked that we are not at the start of input so it must be
3077 : // OK to load the previous character.
// NOTE(review): dummy is never bound; presumably the trailing `false`
// disables the bounds check so the label is never targeted -- confirm.
3078 429 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3079 429 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3080 :
3081 429 : assembler->Bind(&fall_through);
3082 429 : on_success()->Emit(compiler, &new_trace);
3083 429 : }
3084 :
3085 :
// Quick-check details for an assertion node. An AT_START assertion reached
// with not_at_start set is marked as cannot-match; every other case forwards
// the request unchanged to on_success().
3086 3353 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3087 : RegExpCompiler* compiler,
3088 : int filled_in,
3089 : bool not_at_start) {
3090 3353 : if (assertion_type_ == AT_START && not_at_start) {
3091 : details->set_cannot_match();
3092 : return;
3093 : }
3094 2982 : return on_success()->GetQuickCheckDetails(details,
3095 : compiler,
3096 : filled_in,
3097 2982 : not_at_start);
3098 : }
3099 :
3100 :
// Emits code for this assertion according to assertion_type_.
// AFTER_NEWLINE and the two boundary types hand off to helpers that also emit
// the continuation and then return. AT_END, and AT_START when the trace
// already knows it is at the start, break out of the switch and emit
// on_success() with the unchanged trace at the bottom.
3101 35052 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3102 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3103 12388 : switch (assertion_type_) {
3104 : case AT_END: {
// The ok label is reached only through the CheckPosition branch; if that
// branch is not taken, the emitted code backtracks.
3105 : Label ok;
3106 14100 : assembler->CheckPosition(trace->cp_offset(), &ok);
3107 14100 : assembler->GoTo(trace->backtrack());
3108 7050 : assembler->Bind(&ok);
3109 : break;
3110 : }
3111 : case AT_START: {
// Known not to be at the start: the assertion always fails.
3112 4282 : if (trace->at_start() == Trace::FALSE_VALUE) {
3113 34 : assembler->GoTo(trace->backtrack());
3114 17 : return;
3115 : }
// Unknown: emit a runtime check, then continue with a copy of the trace
// that records at_start as true.
3116 4265 : if (trace->at_start() == Trace::UNKNOWN) {
3117 8530 : assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3118 4265 : Trace at_start_trace = *trace;
3119 : at_start_trace.set_at_start(Trace::TRUE_VALUE);
3120 11315 : on_success()->Emit(compiler, &at_start_trace);
3121 : return;
3122 : }
3123 : }
3124 : break;
3125 : case AFTER_NEWLINE:
3126 782 : EmitHat(compiler, on_success(), trace);
3127 782 : return;
3128 : case AT_BOUNDARY:
3129 : case AT_NON_BOUNDARY: {
3130 274 : EmitBoundaryCheck(compiler, trace);
3131 274 : return;
3132 : }
3133 : }
3134 7050 : on_success()->Emit(compiler, trace);
3135 : }
3136 :
3137 :
// Returns true only if quick_check is non-null, offset is below
// quick_check->characters(), and the position at that offset is flagged
// determines_perfectly. Returns false in every other case.
3138 3015895 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3139 3015895 : if (quick_check == nullptr) return false;
3140 3015895 : if (offset >= quick_check->characters()) return false;
3141 999718 : return quick_check->positions(offset)->determines_perfectly;
3142 : }
3143 :
3144 :
// Raises *checked_up_to to index when index is larger; otherwise leaves it
// unchanged.
3145 : static void UpdateBoundsCheck(int index, int* checked_up_to) {
3146 831846 : if (index > *checked_up_to) {
3147 432996 : *checked_up_to = index;
3148 : }
3149 : }
3150 :
3151 :
3152 : // We call this repeatedly to generate code for each pass over the text node.
3153 : // The passes are in increasing order of difficulty because we hope one
3154 : // of the first passes will fail in which case we are saved the work of the
3155 : // later passes. For example, for the case independent regexp /%[asdfghjkl]a/
3156 : // we will check the '%' in the first pass, the case independent 'a' in the
3157 : // second pass and the character class in the last pass.
3158 : //
3159 : // The passes are done from right to left, so for example to test for /bar/
3160 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3161 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3162 : // bounds check most of the time. In the example we only need to check for
3163 : // end-of-input when loading the putative 'r'.
3164 : //
3165 : // A slight complication involves the fact that the first character may already
3166 : // be fetched into a register by the previous node. In this case we want to
3167 : // do the test for that character first. We do this in separate passes. The
3168 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3169 : // pass has been performed then subsequent passes will have true in
3170 : // first_element_checked to indicate that that character does not need to be
3171 : // checked again.
3172 : //
3173 : // In addition to all this we are passed a Trace, which can
3174 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3175 : // object we can see details of any quick check that was already passed in
3176 : // order to get to the code we are now generating. The quick check can involve
3177 : // loading characters, which means we do not need to recheck the bounds
3178 : // up to the limit the quick check already checked. In addition the quick
3179 : // check can have involved a mask and compare operation which may simplify
3180 : // or obviate the need for further checks at some character positions.
// Emits the checks belonging to one pass over this text node.
//   compiler              - supplies the macro assembler and one_byte().
//   pass                  - selects which kind of check is emitted this pass.
//   preloaded             - when true both loops start at index 0 and count
//                           down, so only the first character of the first
//                           element is visited.
//   trace                 - supplies the backtrack label, cp_offset() and the
//                           quick check that was already performed.
//   first_element_checked - when true, the first character of the first
//                           element is skipped.
//   checked_up_to         - in/out: highest character offset already covered
//                           by a bounds check; raised via UpdateBoundsCheck.
3181 3517190 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
3182 : TextEmitPassType pass,
3183 : bool preloaded,
3184 3584943 : Trace* trace,
3185 : bool first_element_checked,
3186 5744355 : int* checked_up_to) {
3187 1758595 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3188 : Isolate* isolate = assembler->isolate();
3189 : bool one_byte = compiler->one_byte();
3190 : Label* backtrack = trace->backtrack();
3191 1758595 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3192 1758595 : int element_count = elements()->length();
// When reading backward, all offsets are shifted back by the node's length.
3193 1758595 : int backward_offset = read_backward() ? -Length() : 0;
3194 3584913 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3195 1826348 : TextElement elm = elements()->at(i);
3196 1826348 : int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3197 1826348 : if (elm.text_type() == TextElement::ATOM) {
3198 : Vector<const uc16> quarks = elm.atom()->data();
3199 4790892 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
// Skip positions already handled by a preloaded pass or decided exactly
// by the quick check.
3200 2781227 : if (first_element_checked && i == 0 && j == 0) continue;
3201 5521052 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3202 : EmitCharacterFunction* emit_function = nullptr;
3203 1811275 : switch (pass) {
3204 : case NON_LATIN1_MATCH:
3205 : DCHECK(one_byte);
// A character above the one-byte range can never match here: emit an
// unconditional backtrack and stop generating code for this pass.
3206 1192950 : if (quarks[j] > String::kMaxOneByteCharCode) {
3207 30 : assembler->GoTo(backtrack);
3208 1758595 : return;
3209 : }
3210 : break;
3211 : case NON_LETTER_CHARACTER_MATCH:
3212 : emit_function = &EmitAtomNonLetter;
3213 13714 : break;
3214 : case SIMPLE_CHARACTER_MATCH:
3215 : emit_function = &EmitSimpleCharacter;
3216 586829 : break;
3217 : case CASE_CHARACTER_MATCH:
3218 : emit_function = &EmitAtomLetter;
3219 13714 : break;
3220 : default:
3221 : break;
3222 : }
3223 1811245 : if (emit_function != nullptr) {
// A bounds check is requested when this offset lies beyond what has been
// checked so far, and always when reading backward.
3224 959058 : bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3225 : bool bound_checked =
3226 614257 : emit_function(isolate, compiler, quarks[j], backtrack,
3227 1228514 : cp_offset + j, bounds_check, preloaded);
3228 614257 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3229 : }
3230 : }
3231 : } else {
3232 : DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
// Character classes are emitted only in the CHARACTER_CLASS_MATCH pass.
3233 811150 : if (pass == CHARACTER_CLASS_MATCH) {
3234 291796 : if (first_element_checked && i == 0) continue;
3235 255369 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3236 : RegExpCharacterClass* cc = elm.char_class();
3237 287357 : bool bounds_check = *checked_up_to < cp_offset || read_backward();
3238 : EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3239 231341 : bounds_check, preloaded, zone());
3240 : UpdateBoundsCheck(cp_offset, checked_up_to);
3241 : }
3242 : }
3243 : }
3244 : }
3245 :
3246 :
// Returns the total length of this text node, computed from its last
// element as that element's cp_offset plus that element's own length.
3247 7897091 : int TextNode::Length() {
3248 7897091 : TextElement elm = elements()->last();
3249 : DCHECK_LE(0, elm.cp_offset());
3250 7897091 : return elm.cp_offset() + elm.length();
3251 : }
3252 :
3253 :
// Reports whether a text-emit pass should be skipped for the current mode.
// |int_pass| is a TextEmitPassType value passed as a plain int.
//  - ignore_case == true:  only SIMPLE_CHARACTER_MATCH is skipped.
//  - ignore_case == false: NON_LETTER_CHARACTER_MATCH and
//    CASE_CHARACTER_MATCH are skipped.
3254 0 : bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3255 2400964 : TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3256 2400964 : if (ignore_case) {
3257 711180 : return pass == SIMPLE_CHARACTER_MATCH;
3258 : } else {
3259 1689784 : return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3260 : }
3261 : }
3262 :
3263 :
// Builds a zone-allocated TextNode with exactly one element: a character
// class wrapping |ranges|. |ranges| must be non-null (DCHECKed). The new node
// reads in the direction given by |read_backward| and continues to
// |on_success|.
3264 9300 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3265 : ZoneList<CharacterRange>* ranges,
3266 : bool read_backward,
3267 : RegExpNode* on_success) {
3268 : DCHECK_NOT_NULL(ranges);
3269 9300 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3270 : elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
3271 9300 : zone);
3272 9300 : return new (zone) TextNode(elms, read_backward, on_success);
3273 : }
3274 :
3275 :
// Builds a zone-allocated TextNode with two character-class elements, added
// in this order: first a class containing only the |lead| range, then a class
// containing only the |trail| range. The new node reads in the direction
// given by |read_backward| and continues to |on_success|.
3276 22751 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3277 : CharacterRange trail,
3278 : bool read_backward,
3279 : RegExpNode* on_success) {
// Wrap each single range in its own one-entry range list.
3280 22751 : ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3281 22751 : ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3282 22751 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3283 : elms->Add(
3284 : TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
3285 22751 : zone);
3286 : elms->Add(
3287 : TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
3288 22751 : zone);
3289 22751 : return new (zone) TextNode(elms, read_backward, on_success);
3290 : }
3291 :
3292 :
3293 : // This generates the code to match a text node. A text node can contain
3294 : // straight character sequences (possibly to be matched in a case-independent
3295 : // way) and character classes. For efficiency we do not do this in a single
3296 : // pass from left to right. Instead we pass over the text node several times,
3297 : // emitting code for some character positions every time. See the comment on
3298 : // TextEmitPass for details.
// Outline of the steps below:
//  1. LimitVersions() may report DONE, in which case nothing is emitted here.
//  2. Bail out with SetRegExpTooBig() if the node would end beyond
//     kMaxCPOffset.
//  3. In one-byte mode, run the NON_LATIN1_MATCH pass.
//  4. If exactly one character is preloaded, run every non-skipped pass once
//     for it, then run every non-skipped pass again for the rest of the node.
//  5. Emit the successor with a copy of the trace advanced by the node length.
3299 4541693 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3300 658429 : LimitResult limit_result = LimitVersions(compiler, trace);
3301 762106 : if (limit_result == DONE) return;
3302 : DCHECK(limit_result == CONTINUE);
3303 :
3304 554752 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3305 : compiler->SetRegExpTooBig();
3306 : return;
3307 : }
3308 :
// This pass gets a throwaway bounds-check cursor (dummy), so it does not
// affect bound_checked_to used by the real passes below.
3309 554752 : if (compiler->one_byte()) {
3310 380318 : int dummy = 0;
3311 380318 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3312 : }
3313 :
3314 : bool first_elt_done = false;
// Starting point for bounds-check tracking: one before the current trace
// offset, plus whatever the trace records as already bounds-checked.
3315 554752 : int bound_checked_to = trace->cp_offset() - 1;
3316 554752 : bound_checked_to += trace->bound_checked_up_to();
3317 :
3318 : // If a character is preloaded into the current character register then
3319 : // check that now.
3320 554752 : if (trace->characters_preloaded() == 1) {
3321 181956 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3322 181956 : if (!SkipPass(pass, compiler->ignore_case())) {
// Third argument true, fifth false: presumably "preloaded" and
// "first element not yet checked" (TextEmitPass signature is not visible
// here; confirm).
3323 : TextEmitPass(compiler,
3324 : static_cast<TextEmitPassType>(pass),
3325 : true,
3326 : trace,
3327 : false,
3328 115056 : &bound_checked_to);
3329 : }
3330 : }
3331 : first_elt_done = true;
3332 : }
3333 :
// Main round: same passes, now with first_elt_done telling the pass whether
// the preloaded round above already ran.
3334 2773760 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3335 2219008 : if (!SkipPass(pass, compiler->ignore_case())) {
3336 : TextEmitPass(compiler,
3337 : static_cast<TextEmitPassType>(pass),
3338 : false,
3339 : trace,
3340 : first_elt_done,
3341 1263221 : &bound_checked_to);
3342 : }
3343 : }
3344 :
// The successor works on a copy of the trace, advanced by +Length() when
// reading forward and -Length() when reading backward.
3345 554752 : Trace successor_trace(*trace);
3346 : // If we advance backward, we may end up at the start.
3347 : successor_trace.AdvanceCurrentPositionInTrace(
3348 554752 : read_backward() ? -Length() : Length(), compiler);
3349 : successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3350 554752 : : Trace::FALSE_VALUE);
3351 : RecursionCheck rc(compiler);
3352 554752 : on_success()->Emit(compiler, &successor_trace);
3353 : }
3354 :
3355 :
// Records that no characters are preloaded any more by resetting
// characters_preloaded_ to 0.
3356 0 : void Trace::InvalidateCurrentCharacter() {
3357 256818 : characters_preloaded_ = 0;
3358 0 : }
3359 :
3360 :
// Moves the trace's notion of the current position by |by| characters.
// It resets the preload count, shifts the quick-check bookkeeping, adds |by|
// to cp_offset_, and reduces bound_checked_up_to_ by |by| (floored at 0).
// If the new cp_offset_ exceeds kMaxCPOffset the regexp is flagged as too big
// and cp_offset_ is reset to 0.
// NOTE(review): |by| can be negative (TextNode::Emit passes -Length() for
// backward reads); only the upper bound of cp_offset_ is checked here.
// Confirm the lower bound is handled elsewhere.
3361 1109504 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3362 : // We don't have an instruction for shifting the current character register
3363 : // down or for using a shifted value for anything so lets just forget that
3364 : // we preloaded any characters into it.
3365 554752 : characters_preloaded_ = 0;
3366 : // Adjust the offsets of the quick check performed information. This
3367 : // information is used to find out what we already determined about the
3368 : // characters by means of mask and compare.
3369 554752 : quick_check_performed_.Advance(by, compiler->one_byte());
3370 554752 : cp_offset_ += by;
3371 554752 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3372 : compiler->SetRegExpTooBig();
3373 0 : cp_offset_ = 0;
3374 : }
3375 1109504 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3376 554752 : }
3377 :
3378 :
// For every character-class element of this node that is not a standard
// class, calls CharacterRange::AddCaseEquivalents on the class's range list.
// Atom elements are left untouched here.
3379 309929 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3380 153221 : int element_count = elements()->length();
3381 309929 : for (int i = 0; i < element_count; i++) {
3382 156708 : TextElement elm = elements()->at(i);
3383 156708 : if (elm.text_type() == TextElement::CHAR_CLASS) {
3384 : RegExpCharacterClass* cc = elm.char_class();
3385 : // None of the standard character classes is different in the case
3386 : // independent case and it slows us down if we don't know that.
3387 218617 : if (cc->is_standard(zone())) continue;
3388 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3389 147410 : CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3390 : }
3391 : }
3392 153221 : }
3393 :
3394 :
// A text node's contribution to a greedy loop's length is simply Length().
3395 161022 : int TextNode::GreedyLoopTextLength() { return Length(); }
3396 :
3397 :
// Returns on_success() if this node matches any single character, otherwise
// nullptr. The node qualifies only when all of the following hold:
//  - it reads forward;
//  - it has exactly one element, and that element is a character class;
//  - after canonicalization the class is either negated with an empty range
//    list, or non-negated with a single range for which
//    IsEverything(max_char) is true, where max_char is kMaxOneByteCharCode in
//    one-byte mode and kMaxUtf16CodeUnit otherwise.
3398 94144 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3399 279568 : RegExpCompiler* compiler) {
3400 94144 : if (read_backward()) return nullptr;
3401 94024 : if (elements()->length() != 1) return nullptr;
3402 93632 : TextElement elm = elements()->at(0);
3403 93632 : if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3404 : RegExpCharacterClass* node = elm.char_class();
3405 184438 : ZoneList<CharacterRange>* ranges = node->ranges(zone());
3406 92219 : CharacterRange::Canonicalize(ranges);
3407 92219 : if (node->is_negated()) {
3408 89470 : return ranges->length() == 0 ? on_success() : nullptr;
3409 : }
3410 92087 : if (ranges->length() != 1) return nullptr;
3411 : uint32_t max_char;
3412 91400 : if (compiler->one_byte()) {
3413 : max_char = String::kMaxOneByteCharCode;
3414 : } else {
3415 : max_char = String::kMaxUtf16CodeUnit;
3416 : }
3417 274200 : return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3418 : }
3419 :
3420 :
3421 : // Finds the fixed match length of a sequence of nodes that goes from
3422 : // this alternative and back to this choice node. If there are variable
3423 : // length nodes or other complications in the way then return a sentinel
3424 : // value indicating that a greedy loop cannot be constructed.
// Walks the on_success() chain from |alternative|'s node until it reaches
// this choice node, summing each node's GreedyLoopTextLength().
// Returns kNodeIsTooComplexForGreedyLoops if any node reports that sentinel
// or if the walk exceeds RegExpCompiler::kMaxRecursion steps. Otherwise
// returns the sum, negated when this node reads backward.
3425 260591 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
3426 260591 : GuardedAlternative* alternative) {
3427 : int length = 0;
3428 : RegExpNode* node = alternative->node();
3429 : // Later we will generate code for all these text nodes using recursion
3430 : // so we have to limit the max number.
3431 : int recursion_depth = 0;
3432 682204 : while (node != this) {
3433 391995 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3434 : return kNodeIsTooComplexForGreedyLoops;
3435 : }
3436 391995 : int node_length = node->GreedyLoopTextLength();
3437 391995 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
3438 : return kNodeIsTooComplexForGreedyLoops;
3439 : }
3440 161022 : length += node_length;
// NOTE(review): unchecked downcast. This presumably relies on every node
// that reports a real length being a SeqRegExpNode; confirm.
3441 161022 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3442 : node = seq_node->on_success();
3443 : }
3444 29618 : return read_backward() ? -length : length;
3445 : }
3446 :
3447 :
// Appends |alt| to the alternatives and remembers its node as loop_node_.
// May only be called while loop_node_ is still null (DCHECKed).
3448 0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3449 : DCHECK_NULL(loop_node_);
3450 : AddAlternative(alt);
3451 1112399 : loop_node_ = alt.node();
3452 0 : }
3453 :
3454 :
// Appends |alt| to the alternatives and remembers its node as continue_node_.
// May only be called while continue_node_ is still null (DCHECKed).
3455 0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3456 : DCHECK_NULL(continue_node_);
3457 : AddAlternative(alt);
3458 1112399 : continue_node_ = alt.node();
3459 0 : }
3460 :
3461 :
// Emits code for a loop choice node. Three cases:
//  - The trace's stop node is this node: we have come back round the loop,
//    so advance the current position by the loop body's text length and jump
//    to the trace's loop label.
//  - The trace is not trivial: flush it and return.
//  - Otherwise: defer to ChoiceNode::Emit.
3462 388585 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3463 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3464 373776 : if (trace->stop_node() == this) {
3465 : // Back edge of greedy optimized loop node graph.
3466 : int text_length =
3467 29618 : GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3468 : DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3469 : // Update the counter-based backtracking info on the stack. This is an
3470 : // optimization for greedy loops (see below).
3471 : DCHECK(trace->cp_offset() == text_length);
3472 14809 : macro_assembler->AdvanceCurrentPosition(text_length);
3473 29618 : macro_assembler->GoTo(trace->loop_label());
3474 14809 : return;
3475 : }
3476 : DCHECK_NULL(trace->stop_node());
3477 358967 : if (!trace->is_trivial()) {
3478 134755 : trace->Flush(compiler, this);
3479 134755 : return;
3480 : }
3481 224212 : ChoiceNode::Emit(compiler, trace);
3482 : }
3483 :
3484 :
// Decides how many characters to preload, starting from
// min(4, eats_at_least) and then capping by platform and string width:
//  - unaligned reads available, one-byte subject: at most 4, and 3 becomes 2;
//  - unaligned reads available, two-byte subject: at most 2;
//  - no unaligned reads: at most 1.
3485 685238 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3486 : int eats_at_least) {
3487 : int preload_characters = Min(4, eats_at_least);
3488 245782 : if (compiler->macro_assembler()->CanReadUnaligned()) {
3489 : bool one_byte = compiler->one_byte();
3490 193674 : if (one_byte) {
// This cap is already guaranteed by the Min(4, ...) above.
3491 162783 : if (preload_characters > 4) preload_characters = 4;
3492 : // We can't preload 3 characters because there is no machine instruction
3493 : // to do that. We can't just load 4 because we could be reading
3494 : // beyond the end of the string, which could cause a memory fault.
3495 162783 : if (preload_characters == 3) preload_characters = 2;
3496 : } else {
3497 30891 : if (preload_characters > 2) preload_characters = 2;
3498 : }
3499 : } else {
3500 52108 : if (preload_characters > 1) preload_characters = 1;
3501 : }
3502 245782 : return preload_characters;
3503 : }
3504 :
3505 :
3506 : // This class is used when generating the alternatives in a choice node. It
3507 : // records the way the alternative is being code generated.
// Plain per-alternative record. Fields (all default-initialized, with
// expects_preload starting out false):
//  - possible_success:    label associated with this alternative.
//  - expects_preload:     flag read by ChoiceNode::Emit for the following
//                         alternative.
//  - after:               second label associated with this alternative.
//  - quick_check_details: quick-check information for this alternative.
// NOTE(review): the exact binding sites of the two labels are outside this
// view; the descriptions above are deliberately minimal.
3508 : class AlternativeGeneration: public Malloced {
3509 : public:
3510 : AlternativeGeneration()
3511 : : possible_success(),
3512 : expects_preload(false),
3513 : after(),
3514 2481218 : quick_check_details() { }
3515 : Label possible_success;
3516 : bool expects_preload;
3517 : Label after;
3518 : QuickCheckDetails quick_check_details;
3519 : };
3520 :
3521 :
3522 : // Creates a list of AlternativeGenerations. If the list has a reasonable
3523 : // size then it is on the stack, otherwise the excess is on the heap.
// Holds |count| AlternativeGeneration pointers in alt_gens_.
// The first min(count, kAFew) entries point into the inline array
// a_few_alt_gens_; any entries from index kAFew upward are allocated with
// new and released again in the destructor.
3524 : class AlternativeGenerationList {
3525 : public:
3526 245782 : AlternativeGenerationList(int count, Zone* zone)
3527 2703602 : : alt_gens_(count, zone) {
// Entries backed by the inline array.
3528 644289 : for (int i = 0; i < count && i < kAFew; i++) {
3529 644289 : alt_gens_.Add(a_few_alt_gens_ + i, zone);
3530 : }
// Overflow entries, individually heap-allocated.
3531 23398 : for (int i = kAFew; i < count; i++) {
3532 23398 : alt_gens_.Add(new AlternativeGeneration(), zone);
3533 : }
3534 245782 : }
3535 245782 : ~AlternativeGenerationList() {
// Only indices >= kAFew were allocated with new; the rest live in
// a_few_alt_gens_ and must not be deleted.
3536 538360 : for (int i = kAFew; i < alt_gens_.length(); i++) {
3537 339374 : delete alt_gens_[i];
3538 23398 : alt_gens_[i] = nullptr;
3539 : }
3540 245782 : }
3541 :
// Returns the i-th record; no bounds check is performed here.
3542 : AlternativeGeneration* at(int i) {
3543 3063035 : return alt_gens_[i];
3544 : }
3545 :
3546 : private:
// Number of records stored inline in this object.
3547 : static const int kAFew = 10;
3548 : ZoneList<AlternativeGeneration*> alt_gens_;
3549 : AlternativeGeneration a_few_alt_gens_[kAFew];
3550 : };
3551 :
3552 :
// Terminator value stored as the last entry of each range table below.
// 0x110000 is one past the largest Unicode code point (0x10FFFF).
3553 : static const uc32 kRangeEndMarker = 0x110000;
3554 :
// Each table below is a flat list of (from, to) pairs followed by
// kRangeEndMarker; the matching *Count constant is the array length
// including that terminator.
3555 : // The '2' variant has inclusive from and exclusive to.
3556 : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3557 : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3558 : static const int kSpaceRanges[] = {
3559 : '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3560 : 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3561 : 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3562 : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3563 :
// Digits, upper-case ASCII letters, underscore, lower-case ASCII letters.
3564 : static const int kWordRanges[] = {
3565 : '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3566 : static const int kWordRangeCount = arraysize(kWordRanges);
// ASCII digits only.
3567 : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3568 : static const int kDigitRangeCount = arraysize(kDigitRanges);
// NOTE(review): as written this pair covers only the single value
// kLeadSurrogateStart; confirm that is intended.
3569 : static const int kSurrogateRanges[] = {
3570 : kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3571 : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
// U+000A, U+000D, U+2028 and U+2029.
3572 : static const int kLineTerminatorRanges[] = {
3573 : 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3574 : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3575 :
// Records a single character by delegating to SetInterval with the
// one-element interval [character, character].
3576 0 : void BoyerMoorePositionInfo::Set(int character) {
3577 65072 : SetInterval(Interval(character, character));
3578 0 : }
3579 :
3580 :
// Records that any character in |interval| (both ends inclusive, per the
// loop below) may occur at this position. It updates the s_/w_/d_/surrogate_
// summaries via AddRange and sets the matching entries of the character map.
3581 1546247 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3582 266543 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3583 266543 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3584 266543 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3585 : surrogate_ =
3586 266543 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
// An interval spanning at least kMapSize consecutive values hits every map
// slot (slots are indexed by i & kMask below), so mark the whole map and
// skip the per-character loop.
3587 266543 : if (interval.to() - interval.from() >= kMapSize - 1) {
3588 14172 : if (map_count_ != kMapSize) {
3589 6371 : map_count_ = kMapSize;
3590 821859 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3591 : }
3592 : return;
3593 : }
3594 1773951 : for (int i = interval.from(); i <= interval.to(); i++) {
3595 817418 : int mod_character = (i & kMask);
// map_count_ counts distinct slots that are set, so only bump it on a
// false -> true transition.
3596 1634836 : if (!map_->at(mod_character)) {
3597 586685 : map_count_++;
3598 586685 : map_->at(mod_character) = true;
3599 : }
// Every slot is set; nothing further can change.
3600 817418 : if (map_count_ == kMapSize) return;
3601 : }
3602 : }
3603 :
3604 :
// Marks this position as able to hold any character: s_, w_ and d_ become
// kLatticeUnknown and every map slot is set.
// NOTE(review): surrogate_ is updated in SetInterval but is not touched
// here; confirm that is intentional.
3605 0 : void BoyerMoorePositionInfo::SetAll() {
3606 5675 : s_ = w_ = d_ = kLatticeUnknown;
3607 5675 : if (map_count_ != kMapSize) {
3608 5238 : map_count_ = kMapSize;
3609 1340928 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3610 : }
3611 0 : }
3612 :
3613 :
// Creates lookahead state for |length| positions. max_char_ is set to
// kMaxOneByteCharCode in one-byte mode and kMaxUtf16CodeUnit otherwise.
// bitmaps_ is filled with one zone-allocated BoyerMoorePositionInfo per
// position.
3614 87816 : BoyerMooreLookahead::BoyerMooreLookahead(
3615 87816 : int length, RegExpCompiler* compiler, Zone* zone)
3616 : : length_(length),
3617 87816 : compiler_(compiler) {
3618 87816 : if (compiler->one_byte()) {
3619 16860 : max_char_ = String::kMaxOneByteCharCode;
3620 : } else {
3621 70956 : max_char_ = String::kMaxUtf16CodeUnit;
3622 : }
3623 87816 : bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3624 218311 : for (int i = 0; i < length; i++) {
3625 130495 : bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3626 : }
3627 87816 : }
3628 :
3629 :
3630 : // Find the longest range of lookahead that has the fewest number of different
3631 : // characters that can occur at a given position. Since we are optimizing two
3632 : // different parameters at once this is a tradeoff.
// Tries FindBestInterval with per-position character limits of 4, 8 and 16
// (doubling while below kMaxMax), carrying the best score forward.
// Returns true if any call produced a score above 0. *from and *to are
// written by FindBestInterval only when it finds an improvement, so they are
// meaningful only when this returns true.
3633 87708 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3634 : int biggest_points = 0;
3635 : // If more than 32 characters out of 128 can occur it is unlikely that we can
3636 : // be lucky enough to step forwards much of the time.
3637 : const int kMaxMax = 32;
3638 350832 : for (int max_number_of_chars = 4;
3639 : max_number_of_chars < kMaxMax;
3640 : max_number_of_chars *= 2) {
3641 : biggest_points =
3642 263124 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
3643 : }
3644 87708 : if (biggest_points == 0) return false;
3645 10971 : return true;
3646 : }
3647 :
3648 :
3649 : // Find the highest-points range between 0 and length_ where the character
3650 : // information is not too vague. 'Too vague' means that there are more than
3651 : // max_number_of_chars that can occur at this position. Calculates the number
3652 : // of points as the product of width-of-the-range and
3653 : // probability-of-finding-one-of-the-characters, where the probability is
3654 : // calculated using the frequency distribution of the sample subject string.
// Scans positions [0, length_) for maximal runs in which every position has
// Count(i) <= max_number_of_chars, and scores each run. If a run scores
// strictly higher than the best so far, *from/*to are set to the run's first
// and last index (inclusive). Returns the best score, which is never lower
// than |old_biggest_points|.
3655 263124 : int BoyerMooreLookahead::FindBestInterval(
3656 653831 : int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3657 : int biggest_points = old_biggest_points;
3658 : static const int kSize = RegExpMacroAssembler::kTableSize;
3659 773890 : for (int i = 0; i < length_; ) {
// Skip positions that allow too many characters.
3660 360426 : while (i < length_ && Count(i) > max_number_of_chars) i++;
3661 278434 : if (i == length_) break;
3662 : int remembered_from = i;
// union_map[j] becomes true if slot j is set at any position in the run.
3663 : bool union_map[kSize];
3664 31698176 : for (int j = 0; j < kSize; j++) union_map[j] = false;
3665 931926 : while (i < length_ && Count(i) <= max_number_of_chars) {
3666 43483310 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3667 43148823 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3668 334487 : i++;
3669 : }
// The run is [remembered_from, i), i.e. i is one past its last position.
3670 : int frequency = 0;
3671 31698176 : for (int j = 0; j < kSize; j++) {
3672 31698176 : if (union_map[j]) {
3673 : // Add 1 to the frequency to give a small per-character boost for
3674 : // the cases where our sampling is not good enough and many
3675 : // characters have a frequency of zero. This means the frequency
3676 : // can theoretically be up to 2*kSize though we treat it mostly as
3677 : // a fraction of kSize.
3678 1150819 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3679 : }
3680 : }
3681 : // We use the probability of skipping times the distance we are skipping to
3682 : // judge the effectiveness of this. Actually we have a cut-off: By
3683 : // dividing by 2 we switch off the skipping if the probability of skipping
3684 : // is less than 50%. This is because the multibyte mask-and-compare
3685 : // skipping in quickcheck is more likely to do well on this case.
// The run counts as "in quick-check range" when it is shorter than 4
// positions, or when it starts at offset <= 4 (one-byte) / <= 2 (two-byte).
3686 : bool in_quickcheck_range =
3687 264435 : ((i - remembered_from < 4) ||
3688 16793 : (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3689 : // Called 'probability' but it is only a rough estimate and can actually
3690 : // be outside the 0-kSize range.
3691 247642 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
// Score = run width * estimated skip probability.
3692 247642 : int points = (i - remembered_from) * probability;
3693 247642 : if (points > biggest_points) {
3694 11333 : *from = remembered_from;
3695 11333 : *to = i - 1;
3696 : biggest_points = points;
3697 : }
3698 : }
3699 263124 : return biggest_points;
3700 : }
3701 :
3702 :
3703 : // Take all the characters that will not prevent a successful match if they
3704 : // occur in the subject string in the range between min_lookahead and
3705 : // max_lookahead (inclusive) measured from the current position. If the
3706 : // character at max_lookahead offset is not one of these characters, then we
3707 : // can safely skip forwards by the number of characters in the range.
// Fills |boolean_skip_table| (kSize entries) and returns the skip distance.
// Every entry starts as kSkipArrayEntry (0). An entry j is set to
// kDontSkipArrayEntry (1) if slot j is set in the position map of any
// position in [min_lookahead, max_lookahead].
// The returned distance is the width of that range:
// max_lookahead + 1 - min_lookahead.
3708 9035 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3709 : int max_lookahead,
3710 : Handle<ByteArray> boolean_skip_table) {
3711 : const int kSize = RegExpMacroAssembler::kTableSize;
3712 :
3713 : const int kSkipArrayEntry = 0;
3714 : const int kDontSkipArrayEntry = 1;
3715 :
3716 1165515 : for (int i = 0; i < kSize; i++) {
3717 : boolean_skip_table->set(i, kSkipArrayEntry);
3718 : }
3719 9035 : int skip = max_lookahead + 1 - min_lookahead;
3720 :
3721 39517 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3722 3962660 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3723 3932178 : for (int j = 0; j < kSize; j++) {
3724 3901696 : if (map->at(j)) {
3725 : boolean_skip_table->set(j, kDontSkipArrayEntry);
3726 : }
3727 : }
3728 : }
3729 :
3730 9035 : return skip;
3731 : }
3732 :
3733 :
3734 : // See comment above on the implementation of GetSkipTable.
// Emits a skip loop at the current assembler position, or nothing at all.
//  - No worthwhile interval: emit nothing.
//  - The interval allows only one map slot overall: emit a loop that compares
//    the character at max_lookahead against that single value. This is not
//    done for a width-1 interval at offset < 3, where nothing is emitted.
//  - Otherwise: build a kSize-entry ByteArray via GetSkipTable and emit a
//    loop that tests the character at max_lookahead against it.
// Both loops have the shape: load character, exit to `cont` on a possible
// match, otherwise advance the current position and repeat.
3735 96743 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3736 : const int kSize = RegExpMacroAssembler::kTableSize;
3737 :
3738 87708 : int min_lookahead = 0;
3739 87708 : int max_lookahead = 0;
3740 :
3741 166381 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3742 :
// Determine whether the whole interval admits exactly one map slot. The
// scan gives up as soon as a position has more than one slot set, or a
// second position with any slot set is seen after a single character was
// already found.
3743 : bool found_single_character = false;
3744 : int single_character = 0;
3745 19884 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3746 773491 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3747 35896 : if (map->map_count() > 1 ||
3748 6852 : (found_single_character && map->map_count() != 0)) {
3749 : found_single_character = false;
3750 : break;
3751 : }
3752 728758 : for (int j = 0; j < kSize; j++) {
3753 737595 : if (map->at(j)) {
3754 : found_single_character = true;
3755 : single_character = j;
3756 : break;
3757 : }
3758 : }
3759 : }
3760 :
3761 10971 : int lookahead_width = max_lookahead + 1 - min_lookahead;
3762 :
3763 10971 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3764 : // The mask-compare can probably handle this better.
3765 : return;
3766 : }
3767 :
3768 9149 : if (found_single_character) {
3769 : Label cont, again;
3770 114 : masm->Bind(&again);
3771 114 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
// When characters can exceed the table size, compare only the bits selected
// by kTableMask; single_character is a map slot index, not a full
// character code.
3772 114 : if (max_char_ > kSize) {
3773 : masm->CheckCharacterAfterAnd(single_character,
3774 : RegExpMacroAssembler::kTableMask,
3775 114 : &cont);
3776 : } else {
3777 0 : masm->CheckCharacter(single_character, &cont);
3778 : }
3779 114 : masm->AdvanceCurrentPosition(lookahead_width);
3780 114 : masm->GoTo(&again);
3781 114 : masm->Bind(&cont);
3782 : return;
3783 : }
3784 :
// General case: table-driven skip loop.
3785 : Factory* factory = masm->isolate()->factory();
3786 9035 : Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3787 : int skip_distance = GetSkipTable(
3788 9035 : min_lookahead, max_lookahead, boolean_skip_table);
3789 : DCHECK_NE(0, skip_distance);
3790 :
3791 : Label cont, again;
3792 9035 : masm->Bind(&again);
3793 9035 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3794 9035 : masm->CheckBitInTable(boolean_skip_table, &cont);
3795 9035 : masm->AdvanceCurrentPosition(skip_distance);
3796 9035 : masm->GoTo(&again);
3797 9035 : masm->Bind(&cont);
3798 : }
3799 :
3800 :
3801 : /* Code generation for choice nodes.
3802 : *
3803 : * We generate quick checks that do a mask and compare to eliminate a
3804 : * choice. If the quick check succeeds then it jumps to the continuation to
3805 : * do slow checks and check subsequent nodes. If it fails (the common case)
3806 : * it falls through to the next choice.
3807 : *
3808 : * Here is the desired flow graph. Nodes directly below each other imply
3809 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3810 : * 3 doesn't have a quick check so we have to call the slow check.
3811 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3812 : * regexp continuation is generated directly after the Sn node, up to the
3813 : * next GoTo if we decide to reuse some already generated code. Some
3814 : * nodes expect preload_characters to be preloaded into the current
3815 : * character register. R nodes do this preloading. Vertices are marked
3816 : * F for failures and S for success (possible success in the case of quick
3817 : * nodes). L, V, < and > are used as arrow heads.
3818 : *
3819 : * ----------> R
3820 : * |
3821 : * V
3822 : * Q1 -----> S1
3823 : * | S /
3824 : * F| /
3825 : * | F/
3826 : * | /
3827 : * | R
3828 : * | /
3829 : * V L
3830 : * Q2 -----> S2
3831 : * | S /
3832 : * F| /
3833 : * | F/
3834 : * | /
3835 : * | R
3836 : * | /
3837 : * V L
3838 : * S3
3839 : * |
3840 : * F|
3841 : * |
3842 : * R
3843 : * |
3844 : * backtrack V
3845 : * <----------Q4
3846 : * \ F |
3847 : * \ |S
3848 : * \ F V
3849 : * \-----S4
3850 : *
3851 : * For greedy loops we push the current position, then generate the code that
3852 : * eats the input specially in EmitGreedyLoop. The other choice (the
3853 : * continuation) is generated by the normal code in EmitChoices, and steps back
3854 : * in the input to the starting position when it fails to match. The loop code
3855 : * looks like this (U is the unwind code that steps back in the greedy loop).
3856 : *
3857 : * _____
3858 : * / \
3859 : * V |
3860 : * ----------> S1 |
3861 : * /| |
3862 : * / |S |
3863 : * F/ \_____/
3864 : * /
3865 : * |<-----
3866 : * | \
3867 : * V |S
3868 : * Q2 ---> U----->backtrack
3869 : * | F /
3870 : * S| /
3871 : * V F /
3872 : * S2--/
3873 : */
3874 :
// Points counter_backtrack_trace_'s backtrack target at this object's
// label_. If |not_at_start| is set, also marks that trace as not being at
// the start of input.
3875 245782 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
3876 0 : counter_backtrack_trace_.set_backtrack(&label_);
3877 245782 : if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3878 0 : }
3879 :
3880 :
// Debug-only consistency check; compiles to an empty body without DEBUG.
// For every alternative except the last, DCHECKs that |trace| does not
// mention the register of any of that alternative's guards.
3881 0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3882 : #ifdef DEBUG
3883 : int choice_count = alternatives_->length();
3884 : for (int i = 0; i < choice_count - 1; i++) {
3885 : GuardedAlternative alternative = alternatives_->at(i);
3886 : ZoneList<Guard*>* guards = alternative.guards();
// A null guard list is treated the same as an empty one.
3887 : int guard_count = (guards == nullptr) ? 0 : guards->length();
3888 : for (int j = 0; j < guard_count; j++) {
3889 : DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3890 : }
3891 : }
3892 : #endif
3893 0 : }
3894 :
3895 :
// Fills in |state| for this choice node:
//  - eats_at_least_ is computed via EatsAtLeast only if it still holds
//    kEatsAtLeastNotYetInitialized; a value set earlier by the caller is
//    kept.
//  - preload_characters_ is derived from it by CalculatePreloadCharacters.
//  - preload_is_current_ is true when |current_trace| already has exactly
//    that many characters preloaded; preload_has_checked_bounds_ starts out
//    equal to it.
3896 402396 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3897 402396 : Trace* current_trace,
3898 : PreloadState* state) {
3899 245782 : if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3900 : // Save some time by looking at most one machine word ahead.
3901 : state->eats_at_least_ =
3902 : EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3903 469842 : current_trace->at_start() == Trace::FALSE_VALUE);
3904 : }
3905 : state->preload_characters_ =
3906 245782 : CalculatePreloadCharacters(compiler, state->eats_at_least_);
3907 :
3908 : state->preload_is_current_ =
3909 245782 : (current_trace->characters_preloaded() == state->preload_characters_);
3910 245782 : state->preload_has_checked_bounds_ = state->preload_is_current_;
3911 245782 : }
3912 :
3913 :
// Emits code for a choice between alternatives. Outline:
//  1. A single unguarded alternative is emitted directly.
//  2. LimitVersions() may report DONE, in which case nothing more is emitted.
//  3. If the flush budget is exhausted and the trace has pending actions,
//     flush the trace and return.
//  4. If there are at least two alternatives and the first has a fixed text
//     length, use EmitGreedyLoop. Otherwise use
//     EmitOptimizedUnanchoredSearch followed by EmitChoices.
//  5. Call EmitOutOfLineContinuation for every alternative.
3914 1609496 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3915 637491 : int choice_count = alternatives_->length();
3916 :
3917 638387 : if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3918 896 : alternatives_->at(0).node()->Emit(compiler, trace);
3919 896 : return;
3920 : }
3921 :
3922 : AssertGuardsMentionRegisters(trace);
3923 :
3924 882377 : LimitResult limit_result = LimitVersions(compiler, trace);
3925 636595 : if (limit_result == DONE) return;
3926 : DCHECK(limit_result == CONTINUE);
3927 :
3928 : // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3929 : // other choice nodes we only flush if we are out of code size budget.
3930 249468 : if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3931 1843 : trace->Flush(compiler, this);
3932 1843 : return;
3933 : }
3934 :
3935 : RecursionCheck rc(compiler);
3936 :
3937 : PreloadState preload;
3938 : preload.init();
3939 : GreedyLoopState greedy_loop_state(not_at_start());
3940 :
3941 491564 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3942 491564 : AlternativeGenerationList alt_gens(choice_count, zone());
3943 :
3944 245782 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
// From here on |trace| is the trace returned by EmitGreedyLoop, not the
// caller's trace.
3945 : trace = EmitGreedyLoop(compiler,
3946 : trace,
3947 : &alt_gens,
3948 : &preload,
3949 : &greedy_loop_state,
3950 14809 : text_length);
3951 : } else {
3952 : // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3953 : // match the traces produced pre-cleanup.
3954 : Label second_choice;
3955 230973 : compiler->macro_assembler()->Bind(&second_choice);
3956 :
3957 230973 : preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3958 :
3959 : EmitChoices(compiler,
3960 : &alt_gens,
3961 : 0,
3962 : trace,
3963 230973 : &preload);
3964 : }
3965 :
3966 : // At this point we need to generate slow checks for the alternatives where
3967 : // the quick check was inlined. We can recognize these because the associated
3968 : // label was bound.
3969 245782 : int new_flush_budget = trace->flush_budget() / choice_count;
3970 913469 : for (int i = 0; i < choice_count; i++) {
3971 : AlternativeGeneration* alt_gen = alt_gens.at(i);
3972 667687 : Trace new_trace(*trace);
3973 : // If there are actions to be flushed we have to limit how many times
3974 : // they are flushed. Take the budget of the parent trace and distribute
3975 : // it fairly amongst the children.
3976 667687 : if (new_trace.actions() != nullptr) {
3977 : new_trace.set_flush_budget(new_flush_budget);
3978 : }
// The last alternative has no successor, so nothing after it can expect a
// preload.
3979 : bool next_expects_preload =
3980 1089592 : i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3981 : EmitOutOfLineContinuation(compiler,
3982 : &new_trace,
3983 667687 : alternatives_->at(i),
3984 : alt_gen,
3985 : preload.preload_characters_,
3986 1335374 : next_expects_preload);
3987 : }
3988 : }
3989 :
3990 :
// Emits the greedy-loop form of this choice node and returns the trace that
// the caller should use afterwards (greedy_loop_state's counter backtrack
// trace). Emission order:
//  1. Push the current position.
//  2. Bind loop_label and emit alternative 0 with a fresh trace whose stop
//     node is this node and whose backtrack target is greedy_match_failed.
//  3. Bind greedy_match_failed and second_choice, then emit the remaining
//     alternatives (from index 1) via EmitChoices using the returned trace.
//  4. Bind greedy_loop_state's label: CheckGreedyLoop against the original
//     trace's backtrack target, then step back by |text_length| and jump to
//     second_choice.
3991 14809 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3992 14809 : Trace* trace,
3993 : AlternativeGenerationList* alt_gens,
3994 : PreloadState* preload,
3995 : GreedyLoopState* greedy_loop_state,
3996 14809 : int text_length) {
3997 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3998 : // Here we have special handling for greedy loops containing only text nodes
3999 : // and other simple nodes. These are handled by pushing the current
4000 : // position on the stack and then incrementing the current position each
4001 : // time around the switch. On backtrack we decrement the current position
4002 : // and check it against the pushed value. This avoids pushing backtrack
4003 : // information for each iteration of the loop, which could take up a lot of
4004 : // space.
4005 : DCHECK(trace->stop_node() == nullptr);
4006 14809 : macro_assembler->PushCurrentPosition();
4007 : Label greedy_match_failed;
4008 14809 : Trace greedy_match_trace;
4009 14809 : if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4010 : greedy_match_trace.set_backtrack(&greedy_match_failed);
4011 : Label loop_label;
4012 14809 : macro_assembler->Bind(&loop_label);
// Setting the stop node makes LoopChoiceNode::Emit take its back-edge branch
// (advance and jump to loop_label) when the loop body reaches this node
// again.
4013 14809 : greedy_match_trace.set_stop_node(this);
4014 : greedy_match_trace.set_loop_label(&loop_label);
4015 29618 : alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4016 14809 : macro_assembler->Bind(&greedy_match_failed);
4017 :
4018 : Label second_choice; // For use in greedy matches.
4019 14809 : macro_assembler->Bind(&second_choice);
4020 :
4021 14809 : Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4022 :
// Start at alternative index 1; alternative 0 is the loop body emitted above.
4023 : EmitChoices(compiler,
4024 : alt_gens,
4025 : 1,
4026 : new_trace,
4027 14809 : preload);
4028 :
4029 14809 : macro_assembler->Bind(greedy_loop_state->label());
4030 : // If we have unwound to the bottom then backtrack.
4031 29618 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4032 : // Otherwise try the second priority at an earlier position.
4033 14809 : macro_assembler->AdvanceCurrentPosition(-text_length);
4034 14809 : macro_assembler->GoTo(&second_choice);
4035 14809 : return new_trace;
4036 : }
4037 :
// Tries to emit skip instructions, driven by a BoyerMooreLookahead, at the
// entry of the loop that eats one character at a time while searching for the
// place where a non-anchored regexp starts to match.
//
// Returns PreloadState::kEatsAtLeastNotYetInitialized without emitting
// anything unless this node has exactly two alternatives, the second one has
// no guards, and GetSuccessorOfOmnivorousTextNode() on the second
// alternative's node yields this node.
// Otherwise returns eats_at_least: the EatsAtLeast() result clamped to
// kMaxLookaheadForBoyerMoore when the lookahead had to be computed here, or
// the unchanged "not yet initialized" value when bm_info(false) was already
// available.
4038 320141 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4039 : Trace* trace) {
4040 : int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4041 230973 : if (alternatives_->length() != 2) return eats_at_least;
4042 :
4043 192794 : GuardedAlternative alt1 = alternatives_->at(1);
4044 192794 : if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4045 : return eats_at_least;
4046 : }
4047 : RegExpNode* eats_anything_node = alt1.node();
4048 277467 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4049 : return eats_at_least;
4050 : }
4051 :
4052 : // Really we should be creating a new trace when we execute this function,
4053 : // but there is no need, because the code it generates cannot backtrack, and
4054 : // we always arrive here with a trivial trace (since it's the entry to a
4055 : // loop). That also implies that there are no preloaded characters, which is
4056 : // good, because it means we won't be violating any assumptions by
4057 : // overwriting those characters with new load instructions.
4058 : DCHECK(trace->is_trivial());
4059 :
4060 89168 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4061 : Isolate* isolate = macro_assembler->isolate();
4062 : // At this point we know that we are at a non-greedy loop that will eat
4063 : // any character one at a time. Any non-anchored regexp has such a
4064 : // loop prepended to it in order to find where it starts. We look for
4065 : // a pattern of the form ...abc... where we can look 6 characters ahead
4066 : // and step forwards 3 if the character is not one of abc. Abc need
4067 : // not be atoms, they can be any reasonably limited character class or
4068 : // small alternation.
4069 : BoyerMooreLookahead* bm = bm_info(false);
// No cached lookahead: build one from the first alternative, but only if at
// least one character is guaranteed to be consumed.
4070 89168 : if (bm == nullptr) {
4071 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4072 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4073 : kRecursionBudget,
4074 89168 : false));
4075 89168 : if (eats_at_least >= 1) {
4076 : bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4077 : compiler,
4078 87708 : zone());
4079 175416 : GuardedAlternative alt0 = alternatives_->at(0);
4080 87708 : alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4081 : }
4082 : }
// bm can still be null here (eats_at_least < 1); then nothing is emitted.
4083 89168 : if (bm != nullptr) {
4084 87708 : bm->EmitSkipInstructions(macro_assembler);
4085 : }
4086 89168 : return eats_at_least;
4087 : }
4088 :
4089 :
// Emits code for the alternatives with index |first_choice| and above, in
// order. Every alternative works on its own copy of |trace| that is set up
// from the current |preload| state; for all but the last alternative the
// copy backtracks to that alternative's |after| label, which is bound at the
// end of its iteration so that failure continues with the next alternative.
//
// For each alternative exactly one of the following happens:
//  - a quick check is emitted (the full check then follows inline only for
//    the last alternative, which does not fall through on failure),
//  - the alternative is skipped because its quick check details report that
//    it cannot match, or
//  - no quick check is emitted and the full check is generated inline.
// |preload| is updated along the way to reflect whether the preloaded
// characters are still current after the code just emitted.
4090 898660 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4091 : AlternativeGenerationList* alt_gens,
4092 : int first_choice,
4093 245804 : Trace* trace,
4094 : PreloadState* preload) {
4095 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4096 245782 : SetUpPreLoad(compiler, trace, preload);
4097 :
4098 : // For now we just call all choices one after the other. The idea ultimately
4099 : // is to use the Dispatch table to try only the relevant ones.
4100 245782 : int choice_count = alternatives_->length();
4101 :
// The incoming flush budget is divided evenly among the alternatives.
4102 245782 : int new_flush_budget = trace->flush_budget() / choice_count;
4103 :
4104 898660 : for (int i = first_choice; i < choice_count; i++) {
4105 652878 : bool is_last = i == choice_count - 1;
4106 652878 : bool fall_through_on_failure = !is_last;
4107 1305756 : GuardedAlternative alternative = alternatives_->at(i);
4108 : AlternativeGeneration* alt_gen = alt_gens->at(i);
4109 1080898 : alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4110 7347 : ZoneList<Guard*>* guards = alternative.guards();
4111 652878 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4112 652878 : Trace new_trace(*trace);
4113 : new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4114 : preload->preload_characters_ :
4115 652878 : 0);
4116 652878 : if (preload->preload_has_checked_bounds_) {
4117 435808 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4118 : }
4119 : new_trace.quick_check_performed()->Clear();
4120 652878 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
// The last alternative keeps the backtrack target copied from |trace|.
4121 652878 : if (!is_last) {
4122 407096 : new_trace.set_backtrack(&alt_gen->after);
4123 : }
4124 652878 : alt_gen->expects_preload = preload->preload_is_current_;
4125 : bool generate_full_check_inline = false;
4126 1178865 : if (compiler->optimize() &&
4127 1175949 : try_to_emit_quick_check_for_alternative(i == 0) &&
4128 : alternative.node()->EmitQuickCheck(
4129 : compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4130 : &alt_gen->possible_success, &alt_gen->quick_check_details,
4131 523071 : fall_through_on_failure)) {
4132 : // Quick check was generated for this choice.
4133 224858 : preload->preload_is_current_ = true;
4134 224858 : preload->preload_has_checked_bounds_ = true;
4135 : // If we generated the quick check to fall through on possible success,
4136 : // we now need to generate the full check inline.
4137 224858 : if (!fall_through_on_failure) {
4138 38626 : macro_assembler->Bind(&alt_gen->possible_success);
4139 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4140 38626 : new_trace.set_characters_preloaded(preload->preload_characters_);
4141 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4142 : generate_full_check_inline = true;
4143 : }
4144 428020 : } else if (alt_gen->quick_check_details.cannot_match()) {
// This alternative is known never to match: emit no check for it. If it is
// the last one there is no next alternative to fall into, so backtrack.
// Note that the `continue` skips binding alt_gen->after for this alternative.
4145 117 : if (!fall_through_on_failure) {
4146 44 : macro_assembler->GoTo(trace->backtrack());
4147 : }
4148 117 : continue;
4149 : } else {
4150 : // No quick check was generated. Put the full code here.
4151 : // If this is not the first choice then there could be slow checks from
4152 : // previous cases that go here when they fail. There's no reason to
4153 : // insist that they preload characters since the slow check we are about
4154 : // to generate probably can't use it.
4155 427903 : if (i != first_choice) {
4156 255607 : alt_gen->expects_preload = false;
4157 : new_trace.InvalidateCurrentCharacter();
4158 : }
4159 : generate_full_check_inline = true;
4160 : }
4161 652761 : if (generate_full_check_inline) {
4162 466529 : if (new_trace.actions() != nullptr) {
4163 : new_trace.set_flush_budget(new_flush_budget);
4164 : }
4165 6166 : for (int j = 0; j < guard_count; j++) {
4166 6166 : GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4167 : }
4168 466529 : alternative.node()->Emit(compiler, &new_trace);
// After the inline full check the preloaded characters are no longer
// treated as current.
4169 466529 : preload->preload_is_current_ = false;
4170 : }
4171 652761 : macro_assembler->Bind(&alt_gen->after);
4172 : }
4173 245782 : }
4174 :
4175 :
// Emits the full check for |alternative| at the |possible_success| label of
// |alt_gen|, i.e. the code reached when a quick check reported a possible
// match. Does nothing if that label is not linked.
//
// The full check runs on a copy of |trace| that records |preload_characters|
// preloaded characters and the quick check already performed.
// If |next_expects_preload| is true, a failing full check first reloads the
// current character(s) and then jumps to alt_gen->after; otherwise it
// backtracks directly to alt_gen->after.
4176 853919 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4177 149797 : Trace* trace,
4178 : GuardedAlternative alternative,
4179 : AlternativeGeneration* alt_gen,
4180 : int preload_characters,
4181 : bool next_expects_preload) {
4182 1149142 : if (!alt_gen->possible_success.is_linked()) return;
4183 :
4184 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4185 186232 : macro_assembler->Bind(&alt_gen->possible_success);
4186 186232 : Trace out_of_line_trace(*trace);
4187 : out_of_line_trace.set_characters_preloaded(preload_characters);
4188 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4189 186232 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4190 187413 : ZoneList<Guard*>* guards = alternative.guards();
4191 186232 : int guard_count = (guards == nullptr) ? 0 : guards->length();
4192 186232 : if (next_expects_preload) {
// Failure is routed through a local label so that the reload below is
// emitted before control reaches alt_gen->after.
4193 : Label reload_current_char;
4194 : out_of_line_trace.set_backtrack(&reload_current_char);
4195 150896 : for (int j = 0; j < guard_count; j++) {
4196 1099 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4197 : }
4198 149797 : alternative.node()->Emit(compiler, &out_of_line_trace);
4199 149797 : macro_assembler->Bind(&reload_current_char);
4200 : // Reload the current character, since the next quick check expects that.
4201 : // We don't need to check bounds here because we only get into this
4202 : // code through a quick check which already did the checked load.
4203 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4204 299594 : preload_characters);
4205 149797 : macro_assembler->GoTo(&(alt_gen->after));
4206 : } else {
4207 36435 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4208 36517 : for (int j = 0; j < guard_count; j++) {
4209 82 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4210 : }
4211 36435 : alternative.node()->Emit(compiler, &out_of_line_trace);
4212 : }
4213 : }
4214 :
4215 :
// Emits code for this action node, dispatching on action_type_.
//
// STORE_POSITION, INCREMENT_REGISTER, SET_REGISTER and CLEAR_CAPTURES emit
// no instructions here: each adds a deferred action object to a copy of
// |trace| and continues with on_success() on that copy. The deferred action
// object is a local, so it is only valid for the duration of that nested
// Emit() call.
// BEGIN_SUBMATCH and POSITIVE_SUBMATCH_SUCCESS flush a non-trivial trace and
// otherwise emit register reads/writes directly. EMPTY_MATCH_CHECK first
// tries to decide the check from the trace and only then falls back to
// flushing or emitting a runtime comparison.
4216 547969 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4217 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4218 547060 : LimitResult limit_result = LimitVersions(compiler, trace);
4219 547060 : if (limit_result == DONE) return;
4220 : DCHECK(limit_result == CONTINUE);
4221 :
4222 : RecursionCheck rc(compiler);
4223 :
4224 324111 : switch (action_type_) {
4225 : case STORE_POSITION: {
4226 : Trace::DeferredCapture
4227 : new_capture(data_.u_position_register.reg,
4228 : data_.u_position_register.is_capture,
4229 292851 : trace);
4230 292851 : Trace new_trace = *trace;
4231 : new_trace.add_action(&new_capture);
4232 316099 : on_success()->Emit(compiler, &new_trace);
4233 : break;
4234 : }
4235 : case INCREMENT_REGISTER: {
4236 : Trace::DeferredIncrementRegister
4237 6910 : new_increment(data_.u_increment_register.reg);
4238 6910 : Trace new_trace = *trace;
4239 : new_trace.add_action(&new_increment);
4240 6910 : on_success()->Emit(compiler, &new_trace);
4241 : break;
4242 : }
4243 : case SET_REGISTER: {
4244 : Trace::DeferredSetRegister
4245 6013 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4246 6013 : Trace new_trace = *trace;
4247 : new_trace.add_action(&new_set);
4248 6013 : on_success()->Emit(compiler, &new_trace);
4249 : break;
4250 : }
4251 : case CLEAR_CAPTURES: {
4252 : Trace::DeferredClearCaptures
4253 : new_capture(Interval(data_.u_clear_captures.range_from,
4254 3803 : data_.u_clear_captures.range_to));
4255 3803 : Trace new_trace = *trace;
4256 : new_trace.add_action(&new_capture);
4257 3803 : on_success()->Emit(compiler, &new_trace);
4258 : break;
4259 : }
4260 : case BEGIN_SUBMATCH:
// With a trivial trace, save the current position and the stack pointer in
// the registers reserved for this submatch, then continue.
4261 9434 : if (!trace->is_trivial()) {
4262 4995 : trace->Flush(compiler, this);
4263 : } else {
4264 : assembler->WriteCurrentPositionToRegister(
4265 4439 : data_.u_submatch.current_position_register, 0);
4266 : assembler->WriteStackPointerToRegister(
4267 4439 : data_.u_submatch.stack_pointer_register);
4268 4439 : on_success()->Emit(compiler, trace);
4269 : }
4270 : break;
4271 : case EMPTY_MATCH_CHECK: {
4272 967 : int start_pos_reg = data_.u_empty_match_check.start_register;
4273 967 : int stored_pos = 0;
4274 967 : int rep_reg = data_.u_empty_match_check.repetition_register;
4275 967 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
// know_dist is true when the trace itself can tell which position was
// stored in the start register; stored_pos then holds that position.
4276 967 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4277 1146 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4278 : // If we know we haven't advanced and there is no minimum we
4279 : // can just backtrack immediately.
4280 152 : assembler->GoTo(trace->backtrack());
4281 1229 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4282 : // If we know we've advanced we can generate the continuation
4283 : // immediately.
4284 250 : on_success()->Emit(compiler, trace);
4285 641 : } else if (!trace->is_trivial()) {
4286 325 : trace->Flush(compiler, this);
4287 : } else {
4288 : Label skip_empty_check;
4289 : // If we have a minimum number of repetitions we check the current
4290 : // number first and skip the empty check if it's not enough.
4291 316 : if (has_minimum) {
4292 211 : int limit = data_.u_empty_match_check.repetition_limit;
4293 211 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4294 : }
4295 : // If the match is empty we bail out, otherwise we fall through
4296 : // to the on-success continuation.
4297 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4298 632 : trace->backtrack());
4299 316 : assembler->Bind(&skip_empty_check);
4300 316 : on_success()->Emit(compiler, trace);
4301 : }
4302 : break;
4303 : }
4304 : case POSITIVE_SUBMATCH_SUCCESS: {
4305 4133 : if (!trace->is_trivial()) {
4306 2616 : trace->Flush(compiler, this);
4307 2616 : return;
4308 : }
// Restore the position and stack pointer saved in the submatch registers.
4309 : assembler->ReadCurrentPositionFromRegister(
4310 1517 : data_.u_submatch.current_position_register);
4311 : assembler->ReadStackPointerFromRegister(
4312 1517 : data_.u_submatch.stack_pointer_register);
4313 1517 : int clear_register_count = data_.u_submatch.clear_register_count;
4314 1517 : if (clear_register_count == 0) {
4315 1013 : on_success()->Emit(compiler, trace);
4316 1013 : return;
4317 : }
// There are registers to clear: the continuation backtracks to a local
// label where the register range is cleared before backtracking further.
4318 504 : int clear_registers_from = data_.u_submatch.clear_register_from;
4319 : Label clear_registers_backtrack;
4320 504 : Trace new_trace = *trace;
4321 : new_trace.set_backtrack(&clear_registers_backtrack);
4322 504 : on_success()->Emit(compiler, &new_trace);
4323 :
4324 504 : assembler->Bind(&clear_registers_backtrack);
4325 504 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4326 504 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4327 :
4328 : DCHECK(trace->backtrack() == nullptr);
4329 504 : assembler->Backtrack();
4330 : return;
4331 : }
4332 : default:
4333 0 : UNREACHABLE();
4334 : }
4335 : }
4336 :
4337 :
// Emits code that matches the text captured in registers start_reg_ and
// end_reg_ (which must be consecutive) at the current position.
// A non-trivial trace is flushed first. Otherwise the case-sensitive or
// case-insensitive back reference check is emitted, depending on
// compiler->ignore_case(), with failure going to the trace's backtrack
// target, and emission continues with on_success() on the same trace.
4338 12526 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4339 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4340 5278 : if (!trace->is_trivial()) {
4341 2522 : trace->Flush(compiler, this);
4342 2522 : return;
4343 : }
4344 :
4345 2756 : LimitResult limit_result = LimitVersions(compiler, trace);
4346 2756 : if (limit_result == DONE) return;
4347 : DCHECK(limit_result == CONTINUE);
4348 :
4349 : RecursionCheck rc(compiler);
4350 :
4351 : DCHECK_EQ(start_reg_ + 1, end_reg_);
4352 2549 : if (compiler->ignore_case()) {
4353 : assembler->CheckNotBackReferenceIgnoreCase(
4354 5775 : start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4355 : } else {
4356 : assembler->CheckNotBackReference(start_reg_, read_backward(),
4357 1248 : trace->backtrack());
4358 : }
4359 : // We are going to advance backward, so we may end up at the start.
// Note: this updates the caller's |trace| in place rather than a copy.
4360 2549 : if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4361 :
4362 : // Check that the back reference does not end inside a surrogate pair.
4363 2694 : if (compiler->unicode() && !compiler->one_byte()) {
4364 80 : assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4365 : }
4366 2549 : on_success()->Emit(compiler, trace);
4367 : }
4368 :
4369 :
4370 : // -------------------------------------------------------------------
4371 : // Dot/dotty output
4372 :
4373 :
4374 : #ifdef DEBUG
4375 :
4376 :
// Debug-only NodeVisitor that writes a regexp node graph to an output stream
// in Graphviz "dot" syntax. Nodes are named "n<pointer>" and their attribute
// boxes "a<pointer>".
4377 : class DotPrinter: public NodeVisitor {
4378 : public:
// |os| is held by reference and must outlive the printer. |ignore_case| is
// passed on to ChoiceNode::GetTable() when dispatch tables are printed.
4379 : DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4380 : : os_(os),
4381 : ignore_case_(ignore_case) {}
// Prints a complete "digraph" for the graph reachable from |node|.
4382 : void PrintNode(const char* label, RegExpNode* node);
// Visits |node| unless it has already been marked as visited.
4383 : void Visit(RegExpNode* node);
4384 : void PrintAttributes(RegExpNode* from);
4385 : void PrintOnFailure(RegExpNode* from, RegExpNode* to);
// One Visit<Type>(<Type>Node*) override per node type.
4386 : #define DECLARE_VISIT(Type) \
4387 : virtual void Visit##Type(Type##Node* that);
4388 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4389 : #undef DECLARE_VISIT
4390 : private:
4391 : std::ostream& os_;
4392 : bool ignore_case_;
4393 : };
4394 :
4395 :
// Prints a complete Graphviz "digraph" for the graph reachable from |node|,
// using |label| as the graph label.
// |label| is written inside a double-quoted dot string, so backslashes and
// double quotes in it are escaped with a backslash; an unescaped quote would
// end the label string early and make the output unparsable.
4396 : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4397 : os_ << "digraph G {\n graph [label=\"";
4398 : for (int i = 0; label[i]; i++) {
4399 : switch (label[i]) {
4400 : case '\\':
4401 : os_ << "\\\\";
4402 : break;
4403 : case '"':
4404 : os_ << "\\\"";
4405 : break;
4406 : default:
4407 : os_ << label[i];
4408 : break;
4409 : }
4410 : }
4411 : os_ << "\"];\n";
4412 : Visit(node);
4413 : os_ << "}" << std::endl;
4414 : }
4415 :
4416 :
// Dispatches to the Visit<Type> method for |node| unless the node's info is
// already marked as visited. The mark is set before dispatching, so a node
// that is reached again through this function is not printed twice.
4417 : void DotPrinter::Visit(RegExpNode* node) {
4418 : if (node->info()->visited) return;
4419 : node->info()->visited = true;
4420 : node->Accept(this);
4421 : }
4422 :
4423 :
// Prints a dotted edge from |from| to |on_failure| and then visits
// |on_failure|.
4424 : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4425 : os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4426 : Visit(on_failure);
4427 : }
4428 :
4429 :
// Callback object for DispatchTable::ForEach(). For every dispatch table
// entry it prints one edge per alternative index contained in the entry's
// out set, going from the port "s<from>o<index>" of the choice node to the
// node of that alternative.
4430 : class TableEntryBodyPrinter {
4431 : public:
4432 : TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4433 : : os_(os),
4434 : choice_(choice) {}
// |from| is the key of the entry; only indices below OutSet::kFirstLimit
// are examined.
4435 : void Call(uc16 from, DispatchTable::Entry entry) {
4436 : OutSet* out_set = entry.out_set();
4437 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4438 : if (out_set->Get(i)) {
4439 : os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4440 : << choice()->alternatives()->at(i).node() << ";\n";
4441 : }
4442 : }
4443 : }
4444 : private:
4445 : ChoiceNode* choice() { return choice_; }
4446 : std::ostream& os_;
4447 : ChoiceNode* choice_;
4448 : };
4449 :
4450 :
// Callback object for DispatchTable::ForEach(). Prints one record field per
// dispatch table entry, separated by "|". Each field shows the character
// range of the entry followed by one "<s<from>o<index>>" port per
// alternative index in the entry's out set, labelled with a running
// priority number starting at 0.
4451 : class TableEntryHeaderPrinter {
4452 : public:
4453 : explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4454 : : first_(true),
4455 : os_(os) {}
4456 : void Call(uc16 from, DispatchTable::Entry entry) {
// No separator before the very first entry.
4457 : if (first_) {
4458 : first_ = false;
4459 : } else {
4460 : os_ << "|";
4461 : }
4462 : os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4463 : OutSet* out_set = entry.out_set();
4464 : int priority = 0;
4465 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4466 : if (out_set->Get(i)) {
4467 : if (priority > 0) os_ << "|";
4468 : os_ << "<s" << from << "o" << i << "> " << priority;
4469 : priority++;
4470 : }
4471 : }
4472 : os_ << "}}";
4473 : }
4474 :
4475 : private:
4476 : bool first_;
4477 : std::ostream& os_;
4478 : };
4479 :
4480 :
// Helper that writes "|"-separated "{...}" fields to a stream, for use as
// the contents of a record label.
4481 : class AttributePrinter {
4482 : public:
4483 : explicit AttributePrinter(std::ostream& os) // NOLINT
4484 : : os_(os),
4485 : first_(true) {}
// Writes "|" before every field except the first one printed.
4486 : void PrintSeparator() {
4487 : if (first_) {
4488 : first_ = false;
4489 : } else {
4490 : os_ << "|";
4491 : }
4492 : }
// Prints "{name}" if |value| is true, nothing otherwise.
4493 : void PrintBit(const char* name, bool value) {
4494 : if (!value) return;
4495 : PrintSeparator();
4496 : os_ << "{" << name << "}";
4497 : }
// Prints "{name|value}" unless |value| is negative. Despite the name, a
// value of zero is printed as well.
4498 : void PrintPositive(const char* name, int value) {
4499 : if (value < 0) return;
4500 : PrintSeparator();
4501 : os_ << "{" << name << "|" << value << "}";
4502 : }
4503 :
4504 : private:
4505 : std::ostream& os_;
4506 : bool first_;
4507 : };
4508 :
4509 :
// Prints a grey attribute box "a<pointer>" for |that| and connects it to
// the node "n<pointer>" with a dashed grey edge without arrowhead.
// The box lists the NI/WI/SI bits that are set in the node's info
// (follows_newline_interest, follows_word_interest, follows_start_interest)
// and, if the node's label is bound, its position as "@".
4510 : void DotPrinter::PrintAttributes(RegExpNode* that) {
4511 : os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4512 : << "margin=0.1, fontsize=10, label=\"{";
4513 : AttributePrinter printer(os_);
4514 : NodeInfo* info = that->info();
4515 : printer.PrintBit("NI", info->follows_newline_interest);
4516 : printer.PrintBit("WI", info->follows_word_interest);
4517 : printer.PrintBit("SI", info->follows_start_interest);
4518 : Label* label = that->label();
4519 : if (label->is_bound())
4520 : printer.PrintPositive("@", label->pos());
4521 : os_ << "}\"];\n"
4522 : << " a" << that << " -> n" << that
4523 : << " [style=dashed, color=grey, arrowhead=none];\n";
4524 : }
4525 :
4526 :
// Compile-time switch: when true, choice nodes are printed as records built
// from their dispatch table instead of as a plain "?" node.
4527 : static const bool kPrintDispatchTable = false;
// Prints the choice node |that| and then every alternative's node.
// With kPrintDispatchTable the node is a record with one field per dispatch
// table entry plus edges from those fields; otherwise it is a "?" node with
// one edge per alternative. Each edge statement is terminated with ";\n",
// like all other edges written by this printer, so that every statement
// ends up on its own line.
// Note that the alternatives are dispatched with Accept() directly, not
// through Visit(), so the visited mark is neither checked nor set for them
// here.
4528 : void DotPrinter::VisitChoice(ChoiceNode* that) {
4529 : if (kPrintDispatchTable) {
4530 : os_ << " n" << that << " [shape=Mrecord, label=\"";
4531 : TableEntryHeaderPrinter header_printer(os_);
4532 : that->GetTable(ignore_case_)->ForEach(&header_printer);
4533 : os_ << "\"]\n";
4534 : PrintAttributes(that);
4535 : TableEntryBodyPrinter body_printer(os_, that);
4536 : that->GetTable(ignore_case_)->ForEach(&body_printer);
4537 : } else {
4538 : os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4539 : for (int i = 0; i < that->alternatives()->length(); i++) {
4540 : GuardedAlternative alt = that->alternatives()->at(i);
4541 : os_ << " n" << that << " -> n" << alt.node() << ";\n";
4542 : }
4543 : }
4544 : for (int i = 0; i < that->alternatives()->length(); i++) {
4545 : GuardedAlternative alt = that->alternatives()->at(i);
4546 : alt.node()->Accept(this);
4547 : }
4548 : }
4549 :
4550 :
// Prints the text node |that| as a box whose label lists its elements
// separated by spaces, followed by its attribute box and an edge to its
// successor, which is then visited.
// Atoms are printed character by character, each code unit narrowed to
// char; character classes are printed as "[from-to...]" with a leading "^"
// when negated.
4551 : void DotPrinter::VisitText(TextNode* that) {
4552 : Zone* zone = that->zone();
4553 : os_ << " n" << that << " [label=\"";
4554 : for (int i = 0; i < that->elements()->length(); i++) {
4555 : if (i > 0) os_ << " ";
4556 : TextElement elm = that->elements()->at(i);
4557 : switch (elm.text_type()) {
4558 : case TextElement::ATOM: {
4559 : Vector<const uc16> data = elm.atom()->data();
// Uses its own index so that the element index |i| is not shadowed.
4560 : for (int k = 0; k < data.length(); k++) {
4561 : os_ << static_cast<char>(data[k]);
4562 : }
4563 : break;
4564 : }
4565 : case TextElement::CHAR_CLASS: {
4566 : RegExpCharacterClass* node = elm.char_class();
4567 : os_ << "[";
4568 : if (node->is_negated()) os_ << "^";
4569 : for (int j = 0; j < node->ranges(zone)->length(); j++) {
4570 : CharacterRange range = node->ranges(zone)->at(j);
4571 : os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4572 : }
4573 : os_ << "]";
4574 : break;
4575 : }
4576 : default:
4577 : UNREACHABLE();
4578 : }
4579 : }
4580 : os_ << "\", shape=box, peripheries=2];\n";
4581 : PrintAttributes(that);
4582 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4583 : Visit(that->on_success());
4584 : }
4585 :
4586 :
// Prints the back reference node |that| as a double octagon labelled
// "$<start register>..$<end register>", followed by its attribute box and
// an edge to its successor, which is then visited.
4587 : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4588 : os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4589 : << that->end_register() << "\", shape=doubleoctagon];\n";
4590 : PrintAttributes(that);
4591 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4592 : Visit(that->on_success());
4593 : }
4594 :
4595 :
// Prints the end node |that| as a bold point together with its attribute
// box. No outgoing edge is printed for it.
4596 : void DotPrinter::VisitEnd(EndNode* that) {
4597 : os_ << " n" << that << " [style=bold, shape=point];\n";
4598 : PrintAttributes(that);
4599 : }
4600 :
4601 :
// Prints the assertion node |that| as a septagon whose label is the regexp
// syntax for its assertion type ($, ^, \b, \B or (?<=\n)), followed by its
// attribute box and an edge to its successor, which is then visited.
4602 : void DotPrinter::VisitAssertion(AssertionNode* that) {
4603 : os_ << " n" << that << " [";
4604 : switch (that->assertion_type()) {
4605 : case AssertionNode::AT_END:
4606 : os_ << "label=\"$\", shape=septagon";
4607 : break;
4608 : case AssertionNode::AT_START:
4609 : os_ << "label=\"^\", shape=septagon";
4610 : break;
4611 : case AssertionNode::AT_BOUNDARY:
4612 : os_ << "label=\"\\b\", shape=septagon";
4613 : break;
4614 : case AssertionNode::AT_NON_BOUNDARY:
4615 : os_ << "label=\"\\B\", shape=septagon";
4616 : break;
4617 : case AssertionNode::AFTER_NEWLINE:
4618 : os_ << "label=\"(?<=\\n)\", shape=septagon";
4619 : break;
4620 : }
4621 : os_ << "];\n";
4622 : PrintAttributes(that);
4623 : RegExpNode* successor = that->on_success();
4624 : os_ << " n" << that << " -> n" << successor << ";\n";
4625 : Visit(successor);
4626 : }
4627 :
4628 :
// Prints the action node |that| with a label that summarizes the action,
// built from the registers and values stored in the node's data_ union for
// its action type. Register-updating actions are drawn as octagons, the
// others as septagons. The attribute box and an edge to the successor
// follow, and the successor is then visited.
4629 : void DotPrinter::VisitAction(ActionNode* that) {
4630 : os_ << " n" << that << " [";
4631 : switch (that->action_type_) {
4632 : case ActionNode::SET_REGISTER:
4633 : os_ << "label=\"$" << that->data_.u_store_register.reg
4634 : << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4635 : break;
4636 : case ActionNode::INCREMENT_REGISTER:
4637 : os_ << "label=\"$" << that->data_.u_increment_register.reg
4638 : << "++\", shape=octagon";
4639 : break;
4640 : case ActionNode::STORE_POSITION:
4641 : os_ << "label=\"$" << that->data_.u_position_register.reg
4642 : << ":=$pos\", shape=octagon";
4643 : break;
4644 : case ActionNode::BEGIN_SUBMATCH:
4645 : os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4646 : << ":=$pos,begin\", shape=septagon";
4647 : break;
4648 : case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4649 : os_ << "label=\"escape\", shape=septagon";
4650 : break;
4651 : case ActionNode::EMPTY_MATCH_CHECK:
4652 : os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4653 : << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4654 : << "<" << that->data_.u_empty_match_check.repetition_limit
4655 : << "?\", shape=septagon";
4656 : break;
4657 : case ActionNode::CLEAR_CAPTURES: {
4658 : os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4659 : << " to $" << that->data_.u_clear_captures.range_to
4660 : << "\", shape=septagon";
4661 : break;
4662 : }
4663 : }
4664 : os_ << "];\n";
4665 : PrintAttributes(that);
4666 : RegExpNode* successor = that->on_success();
4667 : os_ << " n" << that << " -> n" << successor << ";\n";
4668 : Visit(successor);
4669 : }
4670 :
4671 :
// Callback object for ForEach() over a dispatch table's tree; writes one
// text line per entry to the given stream. The stream is held by reference
// and must outlive the dumper.
4672 : class DispatchTableDumper {
4673 : public:
4674 : explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4675 : void Call(uc16 key, DispatchTable::Entry entry);
4676 : private:
4677 : std::ostream& os_;
4678 : };
4679 :
4680 :
// Writes one line of the form "[<key>-<to>]: {i, j, ...}" where the braces
// list, comma-separated, every index below OutSet::kFirstLimit that is
// contained in the entry's out set.
4681 : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4682 : os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4683 : OutSet* set = entry.out_set();
4684 : bool first = true;
4685 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4686 : if (set->Get(i)) {
4687 : if (first) {
4688 : first = false;
4689 : } else {
4690 : os_ << ", ";
4691 : }
4692 : os_ << i;
4693 : }
4694 : }
4695 : os_ << "}\n";
4696 : }
4697 :
4698 :
// Debug helper: writes every entry of this table's tree to stderr, one line
// per entry, using DispatchTableDumper.
4699 : void DispatchTable::Dump() {
4700 : OFStream os(stderr);
4701 : DispatchTableDumper dumper(os);
4702 : tree()->ForEach(&dumper);
4703 : }
4704 :
4705 :
// Debug helper: writes the graph reachable from |node| to stdout in
// Graphviz dot syntax, with |label| as the graph label. |ignore_case| is
// handed to the DotPrinter.
4706 : void RegExpEngine::DotPrint(const char* label,
4707 : RegExpNode* node,
4708 : bool ignore_case) {
4709 : OFStream os(stdout);
4710 : DotPrinter printer(os, ignore_case);
4711 : printer.PrintNode(label, node);
4712 : }
4713 :
4714 :
4715 : #endif // DEBUG
4716 :
4717 :
4718 : // -------------------------------------------------------------------
4719 : // Tree to graph conversion
4720 :
// Converts this atom into a TextNode that holds a single atom text element
// and continues with |on_success|. The element list and the node are both
// allocated in the compiler's zone; the node takes its reading direction
// from compiler->read_backward().
4721 3337143 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4722 : RegExpNode* on_success) {
4723 : ZoneList<TextElement>* elms =
4724 1112381 : new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4725 1112381 : elms->Add(TextElement::Atom(this), compiler->zone());
4726 : return new (compiler->zone())
4727 1112381 : TextNode(elms, compiler->read_backward(), on_success);
4728 : }
4729 :
4730 :
// Converts this text into a zone-allocated TextNode over its existing
// element list (elements()), reading in the compiler's current direction and
// continuing with |on_success|.
4731 4968 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4732 : RegExpNode* on_success) {
4733 : return new (compiler->zone())
4734 4968 : TextNode(elements(), compiler->read_backward(), on_success);
4735 : }
4736 :
4737 :
// Returns true if |ranges| is exactly the complement of |special_class|
// within [0, String::kMaxCodePoint].
// |special_class| is a flat array of boundary pairs followed by a
// kRangeEndMarker entry; |length| counts the marker. The check requires that
// the first range starts at 0, that every gap between two consecutive ranges
// coincides with one boundary pair (range.to() + 1 == special_class[i] and
// the next range's from() == special_class[i + 1]), and that the last range
// ends at String::kMaxCodePoint.
// The DCHECKs require |ranges| to be non-empty and |special_class| to hold
// at least one pair that does not start at 0.
4738 1210188 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4739 : const int* special_class,
4740 : int length) {
4741 605094 : length--; // Remove final marker.
4742 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
4743 : DCHECK_NE(0, ranges->length());
4744 : DCHECK_NE(0, length);
4745 : DCHECK_NE(0, special_class[0]);
// The complement of n pairs that do not touch 0 consists of n + 1 ranges.
4746 605094 : if (ranges->length() != (length >> 1) + 1) {
4747 : return false;
4748 : }
4749 14795 : CharacterRange range = ranges->at(0);
4750 14795 : if (range.from() != 0) {
4751 : return false;
4752 : }
4753 26812 : for (int i = 0; i < length; i += 2) {
4754 27388 : if (special_class[i] != (range.to() + 1)) {
4755 : return false;
4756 : }
4757 53624 : range = ranges->at((i >> 1) + 1);
4758 26812 : if (special_class[i+1] != range.from()) {
4759 : return false;
4760 : }
4761 : }
4762 8504 : if (range.to() != String::kMaxCodePoint) {
4763 : return false;
4764 : }
4765 8504 : return true;
4766 : }
4767 :
4768 :
// Returns true if |ranges| matches |special_class| range for range.
// |special_class| is a flat array of boundary pairs followed by a
// kRangeEndMarker entry; |length| counts the marker. In each pair the first
// value is the inclusive start and the second is one past the inclusive end
// of the range, which is why range.to() is compared against
// special_class[i + 1] - 1.
4769 1205306 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4770 : const int* special_class,
4771 : int length) {
4772 602653 : length--; // Remove final marker.
4773 : DCHECK_EQ(kRangeEndMarker, special_class[length]);
// Two table entries per range are expected.
4774 602653 : if (ranges->length() * 2 != length) {
4775 : return false;
4776 : }
4777 34696 : for (int i = 0; i < length; i += 2) {
4778 77942 : CharacterRange range = ranges->at(i >> 1);
4779 73678 : if (range.from() != special_class[i] ||
4780 34707 : range.to() != special_class[i + 1] - 1) {
4781 : return false;
4782 : }
4783 : }
4784 : return true;
4785 : }
4786 :
4787 :
// Returns true if this character class is one of the standard sets, and
// records which one in set_ when it had to be detected here.
// A negated class is never reported as standard. If set_ is not already
// marked as standard, its ranges are compared, in this order, against:
//   kSpaceRanges           -> 's', complement -> 'S'
//   kLineTerminatorRanges  -> complement '.', then the ranges themselves 'n'
//   kWordRanges            -> 'w', complement -> 'W'
// The first match sets the standard set type on set_ and returns true.
4788 286756 : bool RegExpCharacterClass::is_standard(Zone* zone) {
4789 : // TODO(lrn): Remove need for this function, by not throwing away information
4790 : // along the way.
4791 286756 : if (is_negated()) {
4792 : return false;
4793 : }
4794 280853 : if (set_.is_standard()) {
4795 : return true;
4796 : }
4797 207761 : if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4798 : set_.set_standard_set_type('s');
4799 1923 : return true;
4800 : }
4801 205838 : if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4802 : set_.set_standard_set_type('S');
4803 169 : return true;
4804 : }
4805 205669 : if (CompareInverseRanges(set_.ranges(zone),
4806 : kLineTerminatorRanges,
4807 205669 : kLineTerminatorRangeCount)) {
4808 : set_.set_standard_set_type('.');
4809 8218 : return true;
4810 : }
4811 197451 : if (CompareRanges(set_.ranges(zone),
4812 : kLineTerminatorRanges,
4813 197451 : kLineTerminatorRangeCount)) {
4814 : set_.set_standard_set_type('n');
4815 10 : return true;
4816 : }
4817 197441 : if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4818 : set_.set_standard_set_type('w');
4819 3854 : return true;
4820 : }
4821 193587 : if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4822 : set_.set_standard_set_type('W');
4823 117 : return true;
4824 : }
4825 : return false;
4826 : }
4827 :
4828 :
// Splits the ranges in |base| into the four categories described below.
// All work happens in the constructor: the base ranges and five fixed
// overlay ranges are added to a dispatch table, and the final ForEach() call
// invokes Call() on this object for every resulting table entry, which fills
// in the per-category lists. The lists start out null and stay null for
// categories that receive no range.
4829 2632 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4830 73317 : ZoneList<CharacterRange>* base)
4831 : : zone_(zone),
4832 : table_(zone),
4833 : bmp_(nullptr),
4834 : lead_surrogates_(nullptr),
4835 : trail_surrogates_(nullptr),
4836 5264 : non_bmp_(nullptr) {
4837 : // The unicode range splitter categorizes given character ranges into:
4838 : // - Code points from the BMP representable by one code unit.
4839 : // - Code points outside the BMP that need to be split into surrogate pairs.
4840 : // - Lone lead surrogates.
4841 : // - Lone trail surrogates.
4842 : // Lone surrogates are valid code points, even though no actual characters.
4843 : // They require special matching to make sure we do not split surrogate pairs.
4844 : // We use the dispatch table to accomplish this. The base range is split up
4845 : // by the table by the overlay ranges, and the Call callback is used to
4846 : // filter and collect ranges for each category.
4847 146634 : for (int i = 0; i < base->length(); i++) {
4848 141370 : table_.AddRange(base->at(i), kBase, zone_);
4849 : }
4850 : // Add overlay ranges.
// The BMP overlay is added in two pieces, below and above the surrogates.
4851 : table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4852 2632 : kBmpCodePoints, zone_);
4853 : table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4854 2632 : kLeadSurrogates, zone_);
4855 : table_.AddRange(
4856 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4857 2632 : kTrailSurrogates, zone_);
4858 : table_.AddRange(
4859 : CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4860 2632 : kBmpCodePoints, zone_);
4861 : table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4862 2632 : kNonBmpCodePoints, zone_);
4863 : table_.ForEach(this);
4864 2632 : }
4865 :
4866 :
// Dispatch table callback. Entries that are not covered by a base range are
// ignored. Any other entry is appended, as the range
// [entry.from(), entry.to()], to the list of the category whose overlay
// covers it: BMP, lead surrogates, trail surrogates, or otherwise non-BMP.
// The target list is allocated in the zone on first use.
// The |from| parameter is not used; the range start is taken from |entry|.
4867 151926 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4868 151926 : OutSet* outset = entry.out_set();
4869 303852 : if (!outset->Get(kBase)) return;
4870 : ZoneList<CharacterRange>** target = nullptr;
4871 75093 : if (outset->Get(kBmpCodePoints)) {
4872 51687 : target = &bmp_;
4873 23406 : } else if (outset->Get(kLeadSurrogates)) {
4874 1277 : target = &lead_surrogates_;
4875 22129 : } else if (outset->Get(kTrailSurrogates)) {
4876 1277 : target = &trail_surrogates_;
4877 : } else {
4878 : DCHECK(outset->Get(kNonBmpCodePoints));
4879 20852 : target = &non_bmp_;
4880 : }
4881 75093 : if (*target == nullptr)
4882 12996 : *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4883 75093 : (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4884 : }
4885 :
4886 :
4887 7008 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4888 2626 : RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4889 : ZoneList<CharacterRange>* bmp = splitter->bmp();
4890 5252 : if (bmp == nullptr) return;
4891 : result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4892 4382 : compiler->zone(), bmp, compiler->read_backward(), on_success)));
4893 : }
4894 :
4895 :
4896 27118 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4897 : RegExpNode* on_success,
4898 2626 : UnicodeRangeSplitter* splitter) {
4899 22587 : ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4900 5252 : if (non_bmp == nullptr) return;
4901 : DCHECK(compiler->unicode());
4902 : DCHECK(!compiler->one_byte());
4903 : Zone* zone = compiler->zone();
4904 1741 : CharacterRange::Canonicalize(non_bmp);
4905 45174 : for (int i = 0; i < non_bmp->length(); i++) {
4906 : // Match surrogate pair.
4907 : // E.g. [\u10005-\u11005] becomes
4908 : // \ud800[\udc05-\udfff]|
4909 : // [\ud801-\ud803][\udc00-\udfff]|
4910 : // \ud804[\udc00-\udc05]
4911 20846 : uc32 from = non_bmp->at(i).from();
4912 20846 : uc32 to = non_bmp->at(i).to();
4913 20846 : uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4914 : uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4915 20846 : uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4916 : uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4917 20846 : if (from_l == to_l) {
4918 : // The lead surrogate is the same.
4919 : result->AddAlternative(
4920 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4921 : zone, CharacterRange::Singleton(from_l),
4922 : CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4923 18475 : on_success)));
4924 : } else {
4925 2371 : if (from_t != kTrailSurrogateStart) {
4926 : // Add [from_l][from_t-\udfff]
4927 : result->AddAlternative(
4928 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4929 : zone, CharacterRange::Singleton(from_l),
4930 : CharacterRange::Range(from_t, kTrailSurrogateEnd),
4931 1175 : compiler->read_backward(), on_success)));
4932 1175 : from_l++;
4933 : }
4934 2371 : if (to_t != kTrailSurrogateEnd) {
4935 : // Add [to_l][\udc00-to_t]
4936 : result->AddAlternative(
4937 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4938 : zone, CharacterRange::Singleton(to_l),
4939 : CharacterRange::Range(kTrailSurrogateStart, to_t),
4940 955 : compiler->read_backward(), on_success)));
4941 955 : to_l--;
4942 : }
4943 2371 : if (from_l <= to_l) {
4944 : // Add [from_l-to_l][\udc00-\udfff]
4945 : result->AddAlternative(
4946 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4947 : zone, CharacterRange::Range(from_l, to_l),
4948 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4949 2146 : compiler->read_backward(), on_success)));
4950 : }
4951 : }
4952 : }
4953 : }
4954 :
4955 :
4956 1276 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4957 1276 : RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4958 : ZoneList<CharacterRange>* match, RegExpNode* on_success,
4959 : bool read_backward) {
4960 : Zone* zone = compiler->zone();
4961 : RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4962 1276 : zone, match, read_backward, on_success);
4963 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4964 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4965 : RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4966 1276 : position_register);
4967 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4968 1276 : zone, lookbehind, !read_backward, lookaround.on_match_success());
4969 1276 : return lookaround.ForMatch(negative_match);
4970 : }
4971 :
4972 :
4973 1266 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4974 1266 : RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4975 : ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4976 : bool read_backward) {
4977 : Zone* zone = compiler->zone();
4978 : int stack_register = compiler->UnicodeLookaroundStackRegister();
4979 : int position_register = compiler->UnicodeLookaroundPositionRegister();
4980 : RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4981 1266 : position_register);
4982 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4983 1266 : zone, lookahead, read_backward, lookaround.on_match_success());
4984 : return TextNode::CreateForCharacterRanges(
4985 1266 : zone, match, read_backward, lookaround.ForMatch(negative_match));
4986 : }
4987 :
4988 :
4989 5168 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4990 : RegExpNode* on_success,
4991 2626 : UnicodeRangeSplitter* splitter) {
4992 : ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4993 5252 : if (lead_surrogates == nullptr) return;
4994 : Zone* zone = compiler->zone();
4995 : // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4996 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4997 1271 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4998 :
4999 : RegExpNode* match;
5000 1271 : if (compiler->read_backward()) {
5001 : // Reading backward. Assert that reading forward, there is no trail
5002 : // surrogate, and then backward match the lead surrogate.
5003 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5004 95 : compiler, trail_surrogates, lead_surrogates, on_success, true);
5005 : } else {
5006 : // Reading forward. Forward match the lead surrogate and assert that
5007 : // no trail surrogate follows.
5008 : match = MatchAndNegativeLookaroundInReadDirection(
5009 1176 : compiler, lead_surrogates, trail_surrogates, on_success, false);
5010 : }
5011 : result->AddAlternative(GuardedAlternative(match));
5012 : }
5013 :
5014 :
5015 5168 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5016 : RegExpNode* on_success,
5017 2626 : UnicodeRangeSplitter* splitter) {
5018 : ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5019 5252 : if (trail_surrogates == nullptr) return;
5020 : Zone* zone = compiler->zone();
5021 : // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5022 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5023 1271 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5024 :
5025 : RegExpNode* match;
5026 1271 : if (compiler->read_backward()) {
5027 : // Reading backward. Backward match the trail surrogate and assert that no
5028 : // lead surrogate precedes it.
5029 : match = MatchAndNegativeLookaroundInReadDirection(
5030 90 : compiler, trail_surrogates, lead_surrogates, on_success, true);
5031 : } else {
5032 : // Reading forward. Assert that reading backward, there is no lead
5033 : // surrogate, and then forward match the trail surrogate.
5034 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5035 1181 : compiler, lead_surrogates, trail_surrogates, on_success, false);
5036 : }
5037 : result->AddAlternative(GuardedAlternative(match));
5038 : }
5039 :
5040 1819 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5041 : RegExpNode* on_success) {
5042 : // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5043 : DCHECK(!compiler->read_backward());
5044 : Zone* zone = compiler->zone();
5045 : // Advance any character. If the character happens to be a lead surrogate and
5046 : // we advanced into the middle of a surrogate pair, it will work out, as
5047 : // nothing will match from there. We will have to advance again, consuming
5048 : // the associated trail surrogate.
5049 : ZoneList<CharacterRange>* range = CharacterRange::List(
5050 1819 : zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5051 1819 : return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5052 : }
5053 :
5054 39829 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5055 : #ifdef V8_INTL_SUPPORT
5056 : DCHECK(CharacterRange::IsCanonical(ranges));
5057 :
5058 : // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5059 : // See also https://crbug.com/v8/6727.
5060 : // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5061 : // which we use frequently internally. But large ranges can also easily be
5062 : // created by the user. We might want to have a more general caching mechanism
5063 : // for such ranges.
5064 1255 : if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5065 :
5066 : // Use ICU to compute the case fold closure over the ranges.
5067 399 : icu::UnicodeSet set;
5068 78396 : for (int i = 0; i < ranges->length(); i++) {
5069 38799 : set.add(ranges->at(i).from(), ranges->at(i).to());
5070 : }
5071 : ranges->Clear();
5072 399 : set.closeOver(USET_CASE_INSENSITIVE);
5073 : // Full case mapping map single characters to multiple characters.
5074 : // Those are represented as strings in the set. Remove them so that
5075 : // we end up with only simple and common case mappings.
5076 399 : set.removeAllStrings();
5077 7296 : for (int i = 0; i < set.getRangeCount(); i++) {
5078 6897 : ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5079 6897 : zone);
5080 : }
5081 : // No errors and everything we collected have been ranges.
5082 399 : CharacterRange::Canonicalize(ranges);
5083 : #endif // V8_INTL_SUPPORT
5084 : }
5085 :
5086 :
5087 599451 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5088 : RegExpNode* on_success) {
5089 : set_.Canonicalize();
5090 : Zone* zone = compiler->zone();
5091 4462 : ZoneList<CharacterRange>* ranges = this->ranges(zone);
5092 199081 : if (compiler->needs_unicode_case_equivalents()) {
5093 539 : AddUnicodeCaseEquivalents(ranges, zone);
5094 : }
5095 210208 : if (compiler->unicode() && !compiler->one_byte() &&
5096 : !contains_split_surrogate()) {
5097 4462 : if (is_negated()) {
5098 : ZoneList<CharacterRange>* negated =
5099 128 : new (zone) ZoneList<CharacterRange>(2, zone);
5100 128 : CharacterRange::Negate(ranges, negated, zone);
5101 : ranges = negated;
5102 : }
5103 4462 : if (ranges->length() == 0) {
5104 17 : ranges->Add(CharacterRange::Everything(), zone);
5105 : RegExpCharacterClass* fail =
5106 : new (zone) RegExpCharacterClass(ranges, NEGATED);
5107 34 : return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5108 : }
5109 4445 : if (standard_type() == '*') {
5110 1819 : return UnanchoredAdvance(compiler, on_success);
5111 : } else {
5112 2626 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5113 2626 : UnicodeRangeSplitter splitter(zone, ranges);
5114 2626 : AddBmpCharacters(compiler, result, on_success, &splitter);
5115 2626 : AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5116 2626 : AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5117 2626 : AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5118 : return result;
5119 : }
5120 : } else {
5121 389238 : return new (zone) TextNode(this, compiler->read_backward(), on_success);
5122 : }
5123 : }
5124 :
5125 :
5126 151449 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5127 151449 : RegExpAtom* atom1 = (*a)->AsAtom();
5128 151449 : RegExpAtom* atom2 = (*b)->AsAtom();
5129 151449 : uc16 character1 = atom1->data().at(0);
5130 151449 : uc16 character2 = atom2->data().at(0);
5131 151449 : if (character1 < character2) return -1;
5132 132811 : if (character1 > character2) return 1;
5133 17400 : return 0;
5134 : }
5135 :
5136 :
5137 : static unibrow::uchar Canonical(
5138 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5139 : unibrow::uchar c) {
5140 : unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5141 117602 : int length = canonicalize->get(c, '\0', chars);
5142 : DCHECK_LE(length, 1);
5143 : unibrow::uchar canonical = c;
5144 117602 : if (length == 1) canonical = chars[0];
5145 : return canonical;
5146 : }
5147 :
5148 :
5149 74886 : int CompareFirstCharCaseIndependent(
5150 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5151 : RegExpTree* const* a, RegExpTree* const* b) {
5152 74886 : RegExpAtom* atom1 = (*a)->AsAtom();
5153 74886 : RegExpAtom* atom2 = (*b)->AsAtom();
5154 74886 : unibrow::uchar character1 = atom1->data().at(0);
5155 74886 : unibrow::uchar character2 = atom2->data().at(0);
5156 74886 : if (character1 == character2) return 0;
5157 52210 : if (character1 >= 'a' || character2 >= 'a') {
5158 : character1 = Canonical(canonicalize, character1);
5159 : character2 = Canonical(canonicalize, character2);
5160 : }
5161 52210 : return static_cast<int>(character1) - static_cast<int>(character2);
5162 : }
5163 :
5164 :
5165 : // We can stable sort runs of atoms, since the order does not matter if they
5166 : // start with different characters.
5167 : // Returns true if any consecutive atoms were found.
5168 11791 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5169 10650 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5170 : int length = alternatives->length();
5171 : bool found_consecutive_atoms = false;
5172 20345 : for (int i = 0; i < length; i++) {
5173 23007 : while (i < length) {
5174 21904 : RegExpTree* alternative = alternatives->at(i);
5175 21904 : if (alternative->IsAtom()) break;
5176 12209 : i++;
5177 : }
5178 : // i is length or it is the index of an atom.
5179 10798 : if (i == length) break;
5180 : int first_atom = i;
5181 9695 : i++;
5182 81646 : while (i < length) {
5183 62502 : RegExpTree* alternative = alternatives->at(i);
5184 62502 : if (!alternative->IsAtom()) break;
5185 62256 : i++;
5186 : }
5187 : // Sort atoms to get ones with common prefixes together.
5188 : // This step is more tricky if we are in a case-independent regexp,
5189 : // because it would change /is|I/ to /I|is/, and order matters when
5190 : // the regexp parts don't match only disjoint starting points. To fix
5191 : // this we have a version of CompareFirstChar that uses case-
5192 : // independent character classes for comparison.
5193 : DCHECK_LT(first_atom, alternatives->length());
5194 : DCHECK_LE(i, alternatives->length());
5195 : DCHECK_LE(first_atom, i);
5196 9695 : if (compiler->ignore_case()) {
5197 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5198 1141 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5199 : auto compare_closure =
5200 : [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5201 74886 : return CompareFirstCharCaseIndependent(canonicalize, a, b);
5202 74886 : };
5203 1141 : alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5204 : } else {
5205 8554 : alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5206 : }
5207 9695 : if (i - first_atom > 1) found_consecutive_atoms = true;
5208 : }
5209 10650 : return found_consecutive_atoms;
5210 : }
5211 :
5212 :
5213 : // Optimizes ab|ac|az to a(?:b|c|d).
5214 16105 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5215 : Zone* zone = compiler->zone();
5216 9414 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5217 : int length = alternatives->length();
5218 :
5219 : int write_posn = 0;
5220 : int i = 0;
5221 79700 : while (i < length) {
5222 60872 : RegExpTree* alternative = alternatives->at(i);
5223 60872 : if (!alternative->IsAtom()) {
5224 598 : alternatives->at(write_posn++) = alternatives->at(i);
5225 299 : i++;
5226 299 : continue;
5227 : }
5228 60573 : RegExpAtom* atom = alternative->AsAtom();
5229 60573 : unibrow::uchar common_prefix = atom->data().at(0);
5230 : int first_with_prefix = i;
5231 : int prefix_length = atom->length();
5232 60573 : i++;
5233 132243 : while (i < length) {
5234 62349 : alternative = alternatives->at(i);
5235 62349 : if (!alternative->IsAtom()) break;
5236 62256 : atom = alternative->AsAtom();
5237 62256 : unibrow::uchar new_prefix = atom->data().at(0);
5238 62256 : if (new_prefix != common_prefix) {
5239 51403 : if (!compiler->ignore_case()) break;
5240 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5241 6691 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5242 : new_prefix = Canonical(canonicalize, new_prefix);
5243 : common_prefix = Canonical(canonicalize, common_prefix);
5244 6691 : if (new_prefix != common_prefix) break;
5245 : }
5246 : prefix_length = Min(prefix_length, atom->length());
5247 11097 : i++;
5248 : }
5249 60573 : if (i > first_with_prefix + 2) {
5250 : // Found worthwhile run of alternatives with common prefix of at least one
5251 : // character. The sorting function above did not sort on more than one
5252 : // character for reasons of correctness, but there may still be a longer
5253 : // common prefix if the terms were similar or presorted in the input.
5254 : // Find out how long the common prefix is.
5255 694 : int run_length = i - first_with_prefix;
5256 694 : atom = alternatives->at(first_with_prefix)->AsAtom();
5257 2935 : for (int j = 1; j < run_length && prefix_length > 1; j++) {
5258 : RegExpAtom* old_atom =
5259 4482 : alternatives->at(j + first_with_prefix)->AsAtom();
5260 5361 : for (int k = 1; k < prefix_length; k++) {
5261 11523 : if (atom->data().at(k) != old_atom->data().at(k)) {
5262 : prefix_length = k;
5263 : break;
5264 : }
5265 : }
5266 : }
5267 : RegExpAtom* prefix =
5268 694 : new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5269 694 : ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5270 694 : pair->Add(prefix, zone);
5271 : ZoneList<RegExpTree*>* suffixes =
5272 694 : new (zone) ZoneList<RegExpTree*>(run_length, zone);
5273 11838 : for (int j = 0; j < run_length; j++) {
5274 : RegExpAtom* old_atom =
5275 22288 : alternatives->at(j + first_with_prefix)->AsAtom();
5276 : int len = old_atom->length();
5277 11144 : if (len == prefix_length) {
5278 173 : suffixes->Add(new (zone) RegExpEmpty(), zone);
5279 : } else {
5280 : RegExpTree* suffix = new (zone) RegExpAtom(
5281 21942 : old_atom->data().SubVector(prefix_length, old_atom->length()));
5282 10971 : suffixes->Add(suffix, zone);
5283 : }
5284 : }
5285 694 : pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5286 1388 : alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5287 : } else {
5288 : // Just copy any non-worthwhile alternatives.
5289 60526 : for (int j = first_with_prefix; j < i; j++) {
5290 121052 : alternatives->at(write_posn++) = alternatives->at(j);
5291 : }
5292 : }
5293 : }
5294 : alternatives->Rewind(write_posn); // Trim end of array.
5295 9414 : }
5296 :
5297 :
5298 : // Optimizes b|c|z to [bcz].
5299 10650 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
5300 10650 : RegExpCompiler* compiler) {
5301 : Zone* zone = compiler->zone();
5302 10650 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5303 : int length = alternatives->length();
5304 : const bool unicode = compiler->unicode();
5305 :
5306 : int write_posn = 0;
5307 : int i = 0;
5308 86755 : while (i < length) {
5309 65455 : RegExpTree* alternative = alternatives->at(i);
5310 65455 : if (!alternative->IsAtom()) {
5311 26298 : alternatives->at(write_posn++) = alternatives->at(i);
5312 13149 : i++;
5313 13149 : continue;
5314 : }
5315 52306 : RegExpAtom* atom = alternative->AsAtom();
5316 52306 : if (atom->length() != 1) {
5317 87816 : alternatives->at(write_posn++) = alternatives->at(i);
5318 43908 : i++;
5319 43908 : continue;
5320 : }
5321 : DCHECK_IMPLIES(unicode,
5322 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5323 : bool contains_trail_surrogate =
5324 8398 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5325 : int first_in_run = i;
5326 8398 : i++;
5327 25297 : while (i < length) {
5328 16547 : alternative = alternatives->at(i);
5329 16547 : if (!alternative->IsAtom()) break;
5330 16300 : atom = alternative->AsAtom();
5331 16300 : if (atom->length() != 1) break;
5332 : DCHECK_IMPLIES(unicode,
5333 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5334 : contains_trail_surrogate |=
5335 17002 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5336 8501 : i++;
5337 : }
5338 8398 : if (i > first_in_run + 1) {
5339 : // Found non-trivial run of single-character alternatives.
5340 282 : int run_length = i - first_in_run;
5341 : ZoneList<CharacterRange>* ranges =
5342 282 : new (zone) ZoneList<CharacterRange>(2, zone);
5343 9065 : for (int j = 0; j < run_length; j++) {
5344 17566 : RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5345 : DCHECK_EQ(old_atom->length(), 1);
5346 8783 : ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5347 : }
5348 : RegExpCharacterClass::Flags flags;
5349 282 : if (unicode && contains_trail_surrogate) {
5350 : flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5351 : }
5352 282 : alternatives->at(write_posn++) =
5353 282 : new (zone) RegExpCharacterClass(ranges, flags);
5354 : } else {
5355 : // Just copy any trivial alternatives.
5356 8116 : for (int j = first_in_run; j < i; j++) {
5357 16232 : alternatives->at(write_posn++) = alternatives->at(j);
5358 : }
5359 : }
5360 : }
5361 : alternatives->Rewind(write_posn); // Trim end of array.
5362 10650 : }
5363 :
5364 :
5365 27154 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5366 13712 : RegExpNode* on_success) {
5367 37804 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5368 :
5369 13712 : if (alternatives->length() > 2) {
5370 10650 : bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5371 10650 : if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5372 10650 : FixSingleCharacterDisjunctions(compiler);
5373 10650 : if (alternatives->length() == 1) {
5374 270 : return alternatives->at(0)->ToNode(compiler, on_success);
5375 : }
5376 : }
5377 :
5378 : int length = alternatives->length();
5379 :
5380 : ChoiceNode* result =
5381 13442 : new(compiler->zone()) ChoiceNode(length, compiler->zone());
5382 84748 : for (int i = 0; i < length; i++) {
5383 : GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5384 71306 : on_success));
5385 : result->AddAlternative(alternative);
5386 : }
5387 : return result;
5388 : }
5389 :
5390 :
5391 1036455 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5392 2072910 : RegExpNode* on_success) {
5393 : return ToNode(min(),
5394 : max(),
5395 : is_greedy(),
5396 : body(),
5397 : compiler,
5398 2072910 : on_success);
5399 : }
5400 :
5401 :
5402 : // Scoped object to keep track of how much we unroll quantifier loops in the
5403 : // regexp graph generator.
5404 : class RegExpExpansionLimiter {
5405 : public:
5406 : static const int kMaxExpansionFactor = 6;
5407 81598 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5408 : : compiler_(compiler),
5409 : saved_expansion_factor_(compiler->current_expansion_factor()),
5410 81598 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5411 : DCHECK_LT(0, factor);
5412 94200 : if (ok_to_expand_) {
5413 94200 : if (factor > kMaxExpansionFactor) {
5414 : // Avoid integer overflow of the current expansion factor.
5415 : ok_to_expand_ = false;
5416 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5417 : } else {
5418 94064 : int new_factor = saved_expansion_factor_ * factor;
5419 94064 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5420 : compiler->set_current_expansion_factor(new_factor);
5421 : }
5422 : }
5423 : }
5424 :
5425 : ~RegExpExpansionLimiter() {
5426 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
5427 : }
5428 :
5429 : bool ok_to_expand() { return ok_to_expand_; }
5430 :
5431 : private:
5432 : RegExpCompiler* compiler_;
5433 : int saved_expansion_factor_;
5434 : bool ok_to_expand_;
5435 :
5436 : DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5437 : };
5438 :
5439 :
5440 1135675 : RegExpNode* RegExpQuantifier::ToNode(int min,
5441 : int max,
5442 : bool is_greedy,
5443 : RegExpTree* body,
5444 3390589 : RegExpCompiler* compiler,
5445 : RegExpNode* on_success,
5446 : bool not_at_start) {
5447 : // x{f, t} becomes this:
5448 : //
5449 : // (r++)<-.
5450 : // | `
5451 : // | (x)
5452 : // v ^
5453 : // (r=0)-->(?)---/ [if r < t]
5454 : // |
5455 : // [if r >= f] \----> ...
5456 : //
5457 :
5458 : // 15.10.2.5 RepeatMatcher algorithm.
5459 : // The parser has already eliminated the case where max is 0. In the case
5460 : // where max_match is zero the parser has removed the quantifier if min was
5461 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5462 :
5463 : // If we know that we cannot match zero length then things are a little
5464 : // simpler since we don't need to make the special zero length match check
5465 : // from step 2.1. If the min and max are small we can unroll a little in
5466 : // this case.
5467 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5468 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5469 1135675 : if (max == 0) return on_success; // This can happen due to recursion.
5470 1133952 : bool body_can_be_empty = (body->min_match() == 0);
5471 : int body_start_reg = RegExpCompiler::kNoRegister;
5472 1133952 : Interval capture_registers = body->CaptureRegisters();
5473 1133952 : bool needs_capture_clearing = !capture_registers.is_empty();
5474 : Zone* zone = compiler->zone();
5475 :
5476 1133952 : if (body_can_be_empty) {
5477 : body_start_reg = compiler->AllocateRegister();
5478 1133380 : } else if (compiler->optimize() && !needs_capture_clearing) {
5479 : // Only unroll if there are no captures and the body can't be
5480 : // empty.
5481 : {
5482 : RegExpExpansionLimiter limiter(
5483 81598 : compiler, min + ((max != min) ? 1 : 0));
5484 81598 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5485 9395 : int new_max = (max == kInfinity) ? max : max - min;
5486 : // Recurse once to get the loop or optional matches after the fixed
5487 : // ones.
5488 : RegExpNode* answer = ToNode(
5489 9395 : 0, new_max, is_greedy, body, compiler, on_success, true);
5490 : // Unroll the forced matches from 0 to min. This can cause chains of
5491 : // TextNodes (which the parser does not generate). These should be
5492 : // combined if it turns out they hinder good code generation.
5493 22492 : for (int i = 0; i < min; i++) {
5494 13097 : answer = body->ToNode(compiler, answer);
5495 : }
5496 : return answer;
5497 : }
5498 : }
5499 72203 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
5500 : DCHECK_LT(0, max); // Due to the 'if' above.
5501 : RegExpExpansionLimiter limiter(compiler, max);
5502 12602 : if (limiter.ok_to_expand()) {
5503 : // Unroll the optional matches up to max.
5504 : RegExpNode* answer = on_success;
5505 12258 : for (int i = 0; i < max; i++) {
5506 12258 : ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5507 12258 : if (is_greedy) {
5508 : alternation->AddAlternative(
5509 12108 : GuardedAlternative(body->ToNode(compiler, answer)));
5510 : alternation->AddAlternative(GuardedAlternative(on_success));
5511 : } else {
5512 : alternation->AddAlternative(GuardedAlternative(on_success));
5513 : alternation->AddAlternative(
5514 150 : GuardedAlternative(body->ToNode(compiler, answer)));
5515 : }
5516 : answer = alternation;
5517 13687 : if (not_at_start && !compiler->read_backward()) {
5518 : alternation->set_not_at_start();
5519 : }
5520 : }
5521 : return answer;
5522 : }
5523 : }
5524 : }
5525 1112399 : bool has_min = min > 0;
5526 1112399 : bool has_max = max < RegExpTree::kInfinity;
5527 1112399 : bool needs_counter = has_min || has_max;
5528 : int reg_ctr = needs_counter
5529 : ? compiler->AllocateRegister()
5530 1112399 : : RegExpCompiler::kNoRegister;
5531 : LoopChoiceNode* center = new (zone)
5532 1112399 : LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5533 1118891 : if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5534 : RegExpNode* loop_return = needs_counter
5535 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5536 1112399 : : static_cast<RegExpNode*>(center);
5537 1112399 : if (body_can_be_empty) {
5538 : // If the body can be empty we need to check if it was and then
5539 : // backtrack.
5540 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5541 : reg_ctr,
5542 : min,
5543 572 : loop_return);
5544 : }
5545 1112399 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
5546 1112399 : if (body_can_be_empty) {
5547 : // If the body can be empty we need to store the start position
5548 : // so we can bail out if it was empty.
5549 572 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5550 : }
5551 1112399 : if (needs_capture_clearing) {
5552 : // Before entering the body of this loop we need to clear captures.
5553 3983 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5554 : }
5555 : GuardedAlternative body_alt(body_node);
5556 1112399 : if (has_max) {
5557 : Guard* body_guard =
5558 : new(zone) Guard(reg_ctr, Guard::LT, max);
5559 1004410 : body_alt.AddGuard(body_guard, zone);
5560 : }
5561 : GuardedAlternative rest_alt(on_success);
5562 1112399 : if (has_min) {
5563 : Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5564 2937 : rest_alt.AddGuard(rest_guard, zone);
5565 : }
5566 1112399 : if (is_greedy) {
5567 : center->AddLoopAlternative(body_alt);
5568 : center->AddContinueAlternative(rest_alt);
5569 : } else {
5570 : center->AddContinueAlternative(rest_alt);
5571 : center->AddLoopAlternative(body_alt);
5572 : }
5573 1112399 : if (needs_counter) {
5574 1005686 : return ActionNode::SetRegister(reg_ctr, 0, center);
5575 : } else {
5576 : return center;
5577 : }
5578 : }
5579 :
5580 : namespace {
5581 : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5582 : // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
5583 30 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5584 : RegExpNode* on_success,
5585 : RegExpAssertion::AssertionType type) {
5586 : DCHECK(compiler->needs_unicode_case_equivalents());
5587 : Zone* zone = compiler->zone();
5588 : ZoneList<CharacterRange>* word_range =
5589 30 : new (zone) ZoneList<CharacterRange>(2, zone);
5590 30 : CharacterRange::AddClassEscape('w', word_range, true, zone);
5591 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5592 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5593 30 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5594 : // Add two choices. The (non-)boundary could start with a word or
5595 : // a non-word-character.
5596 90 : for (int i = 0; i < 2; i++) {
5597 60 : bool lookbehind_for_word = i == 0;
5598 : bool lookahead_for_word =
5599 60 : (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5600 : // Look to the left.
5601 : RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5602 60 : stack_register, position_register);
5603 : RegExpNode* backward = TextNode::CreateForCharacterRanges(
5604 60 : zone, word_range, true, lookbehind.on_match_success());
5605 : // Look to the right.
5606 : RegExpLookaround::Builder lookahead(lookahead_for_word,
5607 : lookbehind.ForMatch(backward),
5608 60 : stack_register, position_register);
5609 : RegExpNode* forward = TextNode::CreateForCharacterRanges(
5610 60 : zone, word_range, false, lookahead.on_match_success());
5611 60 : result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5612 : }
5613 30 : return result;
5614 : }
5615 : } // anonymous namespace
5616 :
5617 8832 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5618 8832 : RegExpNode* on_success) {
5619 : NodeInfo info;
5620 : Zone* zone = compiler->zone();
5621 :
5622 8832 : switch (assertion_type()) {
5623 : case START_OF_LINE:
5624 782 : return AssertionNode::AfterNewline(on_success);
5625 : case START_OF_INPUT:
5626 4277 : return AssertionNode::AtStart(on_success);
5627 : case BOUNDARY:
5628 : return compiler->needs_unicode_case_equivalents()
5629 : ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
5630 160 : : AssertionNode::AtBoundary(on_success);
5631 : case NON_BOUNDARY:
5632 : return compiler->needs_unicode_case_equivalents()
5633 : ? BoundaryAssertionAsLookaround(compiler, on_success,
5634 : NON_BOUNDARY)
5635 139 : : AssertionNode::AtNonBoundary(on_success);
5636 : case END_OF_INPUT:
5637 3401 : return AssertionNode::AtEnd(on_success);
5638 : case END_OF_LINE: {
5639 : // Compile $ in multiline regexps as an alternation with a positive
5640 : // lookahead in one side and an end-of-input on the other side.
5641 : // We need two registers for the lookahead.
5642 : int stack_pointer_register = compiler->AllocateRegister();
5643 : int position_register = compiler->AllocateRegister();
5644 : // The ChoiceNode to distinguish between a newline and end-of-input.
5645 73 : ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5646 : // Create a newline atom.
5647 : ZoneList<CharacterRange>* newline_ranges =
5648 73 : new(zone) ZoneList<CharacterRange>(3, zone);
5649 73 : CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5650 : RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5651 : TextNode* newline_matcher = new (zone) TextNode(
5652 : newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5653 : stack_pointer_register, position_register,
5654 : 0, // No captures inside.
5655 : -1, // Ignored if no captures.
5656 146 : on_success));
5657 : // Create an end-of-input matcher.
5658 : RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5659 : stack_pointer_register,
5660 : position_register,
5661 73 : newline_matcher);
5662 : // Add the two alternatives to the ChoiceNode.
5663 : GuardedAlternative eol_alternative(end_of_line);
5664 : result->AddAlternative(eol_alternative);
5665 73 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5666 : result->AddAlternative(end_alternative);
5667 : return result;
5668 : }
5669 : default:
5670 0 : UNREACHABLE();
5671 : }
5672 : return on_success;
5673 : }
5674 :
5675 :
5676 5338 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5677 2669 : RegExpNode* on_success) {
5678 : return new (compiler->zone())
5679 : BackReferenceNode(RegExpCapture::StartRegister(index()),
5680 : RegExpCapture::EndRegister(index()),
5681 2669 : compiler->read_backward(), on_success);
5682 : }
5683 :
5684 :
5685 1116 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5686 : RegExpNode* on_success) {
5687 1116 : return on_success;
5688 : }
5689 :
5690 :
5691 4400 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5692 : int stack_pointer_register,
5693 : int position_register,
5694 : int capture_register_count,
5695 : int capture_register_start)
5696 : : is_positive_(is_positive),
5697 : on_success_(on_success),
5698 : stack_pointer_register_(stack_pointer_register),
5699 4400 : position_register_(position_register) {
5700 4400 : if (is_positive_) {
5701 : on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5702 : stack_pointer_register, position_register, capture_register_count,
5703 1472 : capture_register_start, on_success_);
5704 : } else {
5705 : Zone* zone = on_success_->zone();
5706 : on_match_success_ = new (zone) NegativeSubmatchSuccess(
5707 : stack_pointer_register, position_register, capture_register_count,
5708 2928 : capture_register_start, zone);
5709 : }
5710 4400 : }
5711 :
5712 :
5713 4400 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5714 4400 : if (is_positive_) {
5715 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5716 1472 : position_register_, match);
5717 : } else {
5718 2928 : Zone* zone = on_success_->zone();
5719 : // We use a ChoiceNode to represent the negative lookaround. The first
5720 : // alternative is the negative match. On success, the end node backtracks.
5721 : // On failure, the second alternative is tried and leads to success.
5722 : // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5723 : // first exit when calculating quick checks.
5724 : ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5725 2928 : GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5726 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5727 2928 : position_register_, choice_node);
5728 : }
5729 : }
5730 :
5731 :
5732 3390 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5733 3390 : RegExpNode* on_success) {
5734 : int stack_pointer_register = compiler->AllocateRegister();
5735 : int position_register = compiler->AllocateRegister();
5736 :
5737 : const int registers_per_capture = 2;
5738 : const int register_of_first_capture = 2;
5739 1695 : int register_count = capture_count_ * registers_per_capture;
5740 : int register_start =
5741 1695 : register_of_first_capture + capture_from_ * registers_per_capture;
5742 :
5743 : RegExpNode* result;
5744 : bool was_reading_backward = compiler->read_backward();
5745 1695 : compiler->set_read_backward(type() == LOOKBEHIND);
5746 : Builder builder(is_positive(), on_success, stack_pointer_register,
5747 1695 : position_register, register_count, register_start);
5748 1695 : RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5749 1695 : result = builder.ForMatch(match);
5750 : compiler->set_read_backward(was_reading_backward);
5751 1695 : return result;
5752 : }
5753 :
5754 :
5755 37603 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5756 37603 : RegExpNode* on_success) {
5757 37603 : return ToNode(body(), index(), compiler, on_success);
5758 : }
5759 :
5760 :
5761 131569 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5762 : int index,
5763 131569 : RegExpCompiler* compiler,
5764 : RegExpNode* on_success) {
5765 : DCHECK_NOT_NULL(body);
5766 : int start_reg = RegExpCapture::StartRegister(index);
5767 : int end_reg = RegExpCapture::EndRegister(index);
5768 131569 : if (compiler->read_backward()) std::swap(start_reg, end_reg);
5769 131569 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5770 131569 : RegExpNode* body_node = body->ToNode(compiler, store_end);
5771 131569 : return ActionNode::StorePosition(start_reg, true, body_node);
5772 : }
5773 :
5774 :
5775 56580 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5776 28290 : RegExpNode* on_success) {
5777 29080 : ZoneList<RegExpTree*>* children = nodes();
5778 : RegExpNode* current = on_success;
5779 28290 : if (compiler->read_backward()) {
5780 1905 : for (int i = 0; i < children->length(); i++) {
5781 790 : current = children->at(i)->ToNode(compiler, current);
5782 : }
5783 : } else {
5784 1128899 : for (int i = children->length() - 1; i >= 0; i--) {
5785 1100934 : current = children->at(i)->ToNode(compiler, current);
5786 : }
5787 : }
5788 28290 : return current;
5789 : }
5790 :
5791 :
5792 21899 : static void AddClass(const int* elmv,
5793 : int elmc,
5794 : ZoneList<CharacterRange>* ranges,
5795 : Zone* zone) {
5796 21899 : elmc--;
5797 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5798 142823 : for (int i = 0; i < elmc; i += 2) {
5799 : DCHECK(elmv[i] < elmv[i + 1]);
5800 120924 : ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5801 : }
5802 21899 : }
5803 :
5804 :
5805 25619 : static void AddClassNegated(const int *elmv,
5806 : int elmc,
5807 : ZoneList<CharacterRange>* ranges,
5808 : Zone* zone) {
5809 25619 : elmc--;
5810 : DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5811 : DCHECK_NE(0x0000, elmv[0]);
5812 : DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5813 : uc16 last = 0x0000;
5814 107558 : for (int i = 0; i < elmc; i += 2) {
5815 : DCHECK(last <= elmv[i] - 1);
5816 : DCHECK(elmv[i] < elmv[i + 1]);
5817 81939 : ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5818 81939 : last = elmv[i + 1];
5819 : }
5820 25619 : ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5821 25619 : }
5822 :
5823 137506 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5824 : bool add_unicode_case_equivalents,
5825 : Zone* zone) {
5826 137506 : if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5827 : // See #sec-runtime-semantics-wordcharacters-abstract-operation
5828 : // In case of unicode and ignore_case, we need to create the closure over
5829 : // case equivalent characters before negating.
5830 : ZoneList<CharacterRange>* new_ranges =
5831 92 : new (zone) ZoneList<CharacterRange>(2, zone);
5832 92 : AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5833 92 : AddUnicodeCaseEquivalents(new_ranges, zone);
5834 92 : if (type == 'W') {
5835 : ZoneList<CharacterRange>* negated =
5836 30 : new (zone) ZoneList<CharacterRange>(2, zone);
5837 30 : CharacterRange::Negate(new_ranges, negated, zone);
5838 : new_ranges = negated;
5839 : }
5840 : ranges->AddAll(*new_ranges, zone);
5841 137506 : return;
5842 : }
5843 137414 : AddClassEscape(type, ranges, zone);
5844 : }
5845 :
5846 137456 : void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5847 : Zone* zone) {
5848 137456 : switch (type) {
5849 : case 's':
5850 8317 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5851 8317 : break;
5852 : case 'S':
5853 759 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5854 759 : break;
5855 : case 'w':
5856 7868 : AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5857 7868 : break;
5858 : case 'W':
5859 325 : AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5860 325 : break;
5861 : case 'd':
5862 5476 : AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5863 5476 : break;
5864 : case 'D':
5865 278 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5866 278 : break;
5867 : case '.':
5868 : AddClassNegated(kLineTerminatorRanges,
5869 : kLineTerminatorRangeCount,
5870 : ranges,
5871 24257 : zone);
5872 24257 : break;
5873 : // This is not a character range as defined by the spec but a
5874 : // convenient shorthand for a character class that matches any
5875 : // character.
5876 : case '*':
5877 90030 : ranges->Add(CharacterRange::Everything(), zone);
5878 90030 : break;
5879 : // This is the set of characters matched by the $ and ^ symbols
5880 : // in multiline mode.
5881 : case 'n':
5882 : AddClass(kLineTerminatorRanges,
5883 : kLineTerminatorRangeCount,
5884 : ranges,
5885 146 : zone);
5886 146 : break;
5887 : default:
5888 0 : UNREACHABLE();
5889 : }
5890 137456 : }
5891 :
5892 :
5893 0 : Vector<const int> CharacterRange::GetWordBounds() {
5894 0 : return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5895 : }
5896 :
// Extends |ranges| with the case equivalents of the characters it contains.
// The list is canonicalized first; for each of the resulting ranges, ranges
// or singletons covering case-equivalent characters that are not already
// inside that range are appended. The appended entries are not merged or
// sorted here, so the list is generally not canonical on return.
// Only the part of a range up to String::kMaxUtf16CodeUnit is considered.
// When |is_one_byte| is set, a range that contains no Latin1 equivalents is
// additionally limited to String::kMaxOneByteCharCode.
5897 : // static
5898 73771 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5899 73771 : ZoneList<CharacterRange>* ranges,
5900 : bool is_one_byte) {
5901 73771 : CharacterRange::Canonicalize(ranges);
// The length is captured up front: ranges appended below are not revisited.
5902 : int range_count = ranges->length();
5903 160963 : for (int i = 0; i < range_count; i++) {
// Copied by value, since the list is appended to while this range is in use.
5904 87192 : CharacterRange range = ranges->at(i);
5905 : uc32 bottom = range.from();
5906 94630 : if (bottom > String::kMaxUtf16CodeUnit) continue;
5907 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5908 : // Nothing to be done for surrogates.
5909 86222 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5910 81559 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5911 9748 : if (bottom > String::kMaxOneByteCharCode) continue;
5912 7943 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5913 : }
5914 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5915 79754 : if (top == bottom) {
5916 : // If this is a singleton we just expand the one character.
5917 5731 : int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
// NOTE: this inner |i| shadows the index of the enclosing loop.
5918 9674 : for (int i = 0; i < length; i++) {
5919 3943 : uc32 chr = chars[i];
5920 3943 : if (chr != bottom) {
5921 2038 : ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5922 : }
5923 : }
5924 : } else {
5925 : // If this is a range we expand the characters block by block, expanding
5926 : // contiguous subranges (blocks) one at a time. The approach is as
5927 : // follows. For a given start character we look up the remainder of the
5928 : // block that contains it (represented by the end point), for instance we
5929 : // find 'z' if the character is 'c'. A block is characterized by the
5930 : // property that all characters uncanonicalize in the same way, except
5931 : // that each entry in the result is incremented by the distance from the
5932 : // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5933 : // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5934 : // we've found the end point we look up its uncanonicalization and
5935 : // produce a range for each element. For instance for [c-f] we look up
5936 : // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5937 : // it is not already contained in the input, so [c-f] will be skipped but
5938 : // [C-F] will be added. If this range is not completely contained in a
5939 : // block we do this for all the blocks covered by the range (handling
5940 : // characters that are not in a block as a "singleton block").
5941 : unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5942 : int pos = bottom;
5943 23122276 : while (pos <= top) {
5944 : int length =
5945 23048253 : isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5946 : uc32 block_end;
5947 23048253 : if (length == 0) {
// No block information for |pos|: treat it as a block of its own.
5948 : block_end = pos;
5949 : } else {
5950 : DCHECK_EQ(1, length);
5951 22829 : block_end = equivalents[0];
5952 : }
// Last position of this block that still lies inside [bottom, top].
5953 23048253 : int end = (block_end > top) ? top : block_end;
5954 : length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5955 23048253 : equivalents);
// NOTE: this inner |i| shadows the index of the enclosing loop.
5956 24073984 : for (int i = 0; i < length; i++) {
5957 1025731 : uc32 c = equivalents[i];
// Shift the equivalent of the block end back by the distances of |pos|
// and |end| from the block end to get the equivalents of [pos, end].
5958 1025731 : uc32 range_from = c - (block_end - pos);
5959 1025731 : uc32 range_to = c - (block_end - end);
// Skip equivalents that lie entirely within the range being expanded.
5960 1025731 : if (!(bottom <= range_from && range_to <= top)) {
5961 20817 : ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5962 : }
5963 : }
5964 23048253 : pos = end + 1;
5965 : }
5966 : }
5967 : }
5968 73771 : }
5969 :
5970 :
5971 12 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5972 : DCHECK_NOT_NULL(ranges);
5973 : int n = ranges->length();
5974 12 : if (n <= 1) return true;
5975 12 : int max = ranges->at(0).to();
5976 360 : for (int i = 1; i < n; i++) {
5977 348 : CharacterRange next_range = ranges->at(i);
5978 348 : if (next_range.from() <= max + 1) return false;
5979 : max = next_range.to();
5980 : }
5981 : return true;
5982 : }
5983 :
5984 :
5985 2126011 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5986 2126011 : if (ranges_ == nullptr) {
5987 90063 : ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5988 90063 : CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5989 : }
5990 2126011 : return ranges_;
5991 : }
5992 :
5993 :
5994 : // Move a number of elements in a zonelist to another position
5995 : // in the same list. Handles overlapping source and target areas.
5996 90320 : static void MoveRanges(ZoneList<CharacterRange>* list,
5997 : int from,
5998 : int to,
5999 : int count) {
6000 : // Ranges are potentially overlapping.
6001 90320 : if (from < to) {
6002 9644083 : for (int i = count - 1; i >= 0; i--) {
6003 28697052 : list->at(to + i) = list->at(from + i);
6004 : }
6005 : } else {
6006 3563893 : for (int i = 0; i < count; i++) {
6007 10691679 : list->at(to + i) = list->at(from + i);
6008 : }
6009 : }
6010 90320 : }
6011 :
6012 :
6013 158229 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6014 : int count,
6015 : CharacterRange insert) {
6016 : // Inserts a range into list[0..count[, which must be sorted
6017 : // by from value and non-overlapping and non-adjacent, using at most
6018 : // list[0..count] for the result. Returns the number of resulting
6019 : // canonicalized ranges. Inserting a range may collapse existing ranges into
6020 : // fewer ranges, so the return value can be anything in the range 1..count+1.
6021 158229 : uc32 from = insert.from();
6022 158229 : uc32 to = insert.to();
6023 : int start_pos = 0;
6024 : int end_pos = count;
6025 17930548 : for (int i = count - 1; i >= 0; i--) {
6026 17846257 : CharacterRange current = list->at(i);
6027 17846257 : if (current.from() > to + 1) {
6028 : end_pos = i;
6029 140474 : } else if (current.to() + 1 < from) {
6030 73938 : start_pos = i + 1;
6031 : break;
6032 : }
6033 : }
6034 :
6035 : // Inserted range overlaps, or is adjacent to, ranges at positions
6036 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6037 : // not affected by the insertion.
6038 : // If start_pos == end_pos, the range must be inserted before start_pos.
6039 : // if start_pos < end_pos, the entire range from start_pos to end_pos
6040 : // must be merged with the insert range.
6041 :
6042 158229 : if (start_pos == end_pos) {
6043 : // Insert between existing ranges at position start_pos.
6044 104034 : if (start_pos < count) {
6045 78399 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6046 : }
6047 104034 : list->at(start_pos) = insert;
6048 104034 : return count + 1;
6049 : }
6050 54195 : if (start_pos + 1 == end_pos) {
6051 : // Replace single existing range at position start_pos.
6052 42109 : CharacterRange to_replace = list->at(start_pos);
6053 : int new_from = Min(to_replace.from(), from);
6054 : int new_to = Max(to_replace.to(), to);
6055 42109 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6056 : return count;
6057 : }
6058 : // Replace a number of existing ranges from start_pos to end_pos - 1.
6059 : // Move the remaining ranges down.
6060 :
6061 12086 : int new_from = Min(list->at(start_pos).from(), from);
6062 24172 : int new_to = Max(list->at(end_pos - 1).to(), to);
6063 12086 : if (end_pos < count) {
6064 11921 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6065 : }
6066 12086 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6067 12086 : return count - (end_pos - start_pos) + 1;
6068 : }
6069 :
6070 :
6071 24 : void CharacterSet::Canonicalize() {
6072 : // Special/default classes are always considered canonical. The result
6073 : // of calling ranges() will be sorted.
6074 199129 : if (ranges_ == nullptr) return;
6075 109280 : CharacterRange::Canonicalize(ranges_);
6076 : }
6077 :
6078 :
6079 568651 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6080 568651 : if (character_ranges->length() <= 1) return;
6081 : // Check whether ranges are already canonical (increasing, non-overlapping,
6082 : // non-adjacent).
6083 : int n = character_ranges->length();
6084 89016 : int max = character_ranges->at(0).to();
6085 : int i = 1;
6086 1468591 : while (i < n) {
6087 1303907 : CharacterRange current = character_ranges->at(i);
6088 1303907 : if (current.from() <= max + 1) {
6089 : break;
6090 : }
6091 : max = current.to();
6092 1290559 : i++;
6093 : }
6094 : // Canonical until the i'th range. If that's all of them, we are done.
6095 89016 : if (i == n) return;
6096 :
6097 : // The ranges at index i and forward are not canonicalized. Make them so by
6098 : // doing the equivalent of insertion sort (inserting each into the previous
6099 : // list, in order).
6100 : // Notice that inserting a range can reduce the number of ranges in the
6101 : // result due to combining of adjacent and overlapping ranges.
6102 : int read = i; // Range to insert.
6103 : int num_canonical = i; // Length of canonicalized part of list.
6104 158229 : do {
6105 : num_canonical = InsertRangeInCanonicalList(character_ranges,
6106 : num_canonical,
6107 158229 : character_ranges->at(read));
6108 158229 : read++;
6109 : } while (read < n);
6110 : character_ranges->Rewind(num_canonical);
6111 :
6112 : DCHECK(CharacterRange::IsCanonical(character_ranges));
6113 : }
6114 :
6115 :
6116 158 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6117 : ZoneList<CharacterRange>* negated_ranges,
6118 : Zone* zone) {
6119 : DCHECK(CharacterRange::IsCanonical(ranges));
6120 : DCHECK_EQ(0, negated_ranges->length());
6121 : int range_count = ranges->length();
6122 : uc32 from = 0;
6123 : int i = 0;
6124 316 : if (range_count > 0 && ranges->at(0).from() == 0) {
6125 27 : from = ranges->at(0).to() + 1;
6126 : i = 1;
6127 : }
6128 6850 : while (i < range_count) {
6129 6692 : CharacterRange range = ranges->at(i);
6130 6692 : negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6131 6692 : from = range.to() + 1;
6132 6692 : i++;
6133 : }
6134 158 : if (from < String::kMaxCodePoint) {
6135 : negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6136 121 : zone);
6137 : }
6138 158 : }
6139 :
6140 :
6141 : // -------------------------------------------------------------------
6142 : // Splay tree
6143 :
6144 :
6145 471133 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6146 226787 : if (Get(value))
6147 : return this;
6148 226781 : if (successors(zone) != nullptr) {
6149 172902 : for (int i = 0; i < successors(zone)->length(); i++) {
6150 382118 : OutSet* successor = successors(zone)->at(i);
6151 382118 : if (successor->Get(value))
6152 : return successor;
6153 : }
6154 : } else {
6155 5888 : successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6156 : }
6157 35130 : OutSet* result = new(zone) OutSet(first_, remaining_);
6158 17565 : result->Set(value, zone);
6159 17565 : successors(zone)->Add(result, zone);
6160 17565 : return result;
6161 : }
6162 :
6163 :
6164 894146 : void OutSet::Set(unsigned value, Zone *zone) {
6165 894146 : if (value < kFirstLimit) {
6166 469292 : first_ |= (1 << value);
6167 : } else {
6168 1158582 : if (remaining_ == nullptr)
6169 115980 : remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6170 1158582 : if (remaining_->is_empty() || !remaining_->Contains(value))
6171 421294 : remaining_->Add(value, zone);
6172 : }
6173 894146 : }
6174 :
6175 :
6176 38214250 : bool OutSet::Get(unsigned value) const {
6177 38214250 : if (value < kFirstLimit) {
6178 8234030 : return (first_ & (1 << value)) != 0;
6179 29980220 : } else if (remaining_ == nullptr) {
6180 : return false;
6181 : } else {
6182 20124872 : return remaining_->Contains(value);
6183 : }
6184 : }
6185 :
6186 :
// Out-of-class definition of DispatchTable::Config::kNoKey, initialized to
// unibrow::Utf8::kBadChar.
// NOTE(review): presumably the key value the splay tree configuration uses
// for "no key" -- confirm against the Config declaration.
6187 : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6188 :
6189 :
// Registers |value| for every character in |full_range|.
// The table is a splay tree of entries keyed by their start point. Existing
// entries that partially overlap the new range are split at the boundaries
// of the new range, |value| is added to the out set of every entry that then
// lies inside the new range, and parts of the new range not covered by any
// entry get fresh entries whose out set is the empty set extended by
// |value|.
6190 85309 : void DispatchTable::AddRange(CharacterRange full_range, int value,
6191 : Zone* zone) {
// |current| is the part of the new range that still has to be processed.
6192 85309 : CharacterRange current = full_range;
6193 85309 : if (tree()->is_empty()) {
6194 : // If this is the first range we just insert into the table.
6195 : ZoneSplayTree<Config>::Locator loc;
6196 2704 : bool inserted = tree()->Insert(current.from(), &loc);
6197 : DCHECK(inserted);
6198 : USE(inserted);
6199 : loc.set_value(Entry(current.from(), current.to(),
6200 2704 : empty()->Extend(value, zone)));
6201 85309 : return;
6202 : }
6203 : // First see if there is a range to the left of this one that
6204 : // overlaps.
6205 : ZoneSplayTree<Config>::Locator loc;
6206 82605 : if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6207 157136 : Entry* entry = &loc.value();
6208 : // If we've found a range that overlaps with this one, and it
6209 : // starts strictly to the left of this one, we have to fix it
6210 : // because the following code only handles ranges that start on
6211 : // or after the start point of the range we're adding.
6212 156176 : if (entry->from() < current.from() && entry->to() >= current.from()) {
6213 : // Snap the overlapping range in half around the start point of
6214 : // the range we're adding.
6215 : CharacterRange left =
6216 480 : CharacterRange::Range(entry->from(), current.from() - 1);
6217 : CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6218 : // The left part of the overlapping range doesn't overlap.
6219 : // Truncate the whole entry to be just the left part.
6220 : entry->set_to(left.to());
6221 : // The right part is the one that overlaps. We add this part
6222 : // to the map and let the next step deal with merging it with
6223 : // the range we're adding.
// NOTE: this |loc| shadows the locator declared in the enclosing scope.
6224 : ZoneSplayTree<Config>::Locator loc;
6225 480 : bool inserted = tree()->Insert(right.from(), &loc);
6226 : DCHECK(inserted);
6227 : USE(inserted);
6228 : loc.set_value(Entry(right.from(),
6229 : right.to(),
6230 : entry->out_set()));
6231 : }
6232 : }
// Walk through the new range from left to right. Each iteration either
// handles the next overlapping entry and advances the start of |current|
// past it, or inserts the remainder as one new entry and stops.
6233 160824 : while (current.is_valid()) {
6234 391063 : if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6235 314626 : (loc.value().from() <= current.to()) &&
6236 78219 : (loc.value().to() >= current.from())) {
6237 309002 : Entry* entry = &loc.value();
6238 : // We have overlap. If there is space between the start point of
6239 : // the range we're adding and where the overlapping range starts
6240 : // then we have to add a range covering just that space.
6241 78219 : if (current.from() < entry->from()) {
6242 : ZoneSplayTree<Config>::Locator ins;
6243 69427 : bool inserted = tree()->Insert(current.from(), &ins);
6244 : DCHECK(inserted);
6245 : USE(inserted);
6246 : ins.set_value(Entry(current.from(),
6247 : entry->from() - 1,
6248 138854 : empty()->Extend(value, zone)));
6249 : current.set_from(entry->from());
6250 : }
6251 : DCHECK_EQ(current.from(), entry->from());
6252 : // If the overlapping range extends beyond the one we want to add
6253 : // we have to snap the right part off and add it separately.
6254 78219 : if (entry->to() > current.to()) {
6255 : ZoneSplayTree<Config>::Locator ins;
6256 4918 : bool inserted = tree()->Insert(current.to() + 1, &ins);
6257 : DCHECK(inserted);
6258 : USE(inserted);
// The snapped-off right part keeps the out set of the original entry.
6259 : ins.set_value(Entry(current.to() + 1,
6260 : entry->to(),
6261 : entry->out_set()));
6262 : entry->set_to(current.to());
6263 : }
6264 : DCHECK(entry->to() <= current.to());
6265 : // The overlapping range is now completely contained by the range
6266 : // we're adding so we can just update it and move the start point
6267 : // of the range we're adding just past it.
6268 : entry->AddValue(value, zone);
6269 : DCHECK(entry->to() + 1 > current.from());
6270 78219 : current.set_from(entry->to() + 1);
6271 : } else {
6272 : // There is no overlap so we can just add the range
6273 : ZoneSplayTree<Config>::Locator ins;
6274 76437 : bool inserted = tree()->Insert(current.from(), &ins);
6275 : DCHECK(inserted);
6276 : USE(inserted);
6277 : ins.set_value(Entry(current.from(),
6278 : current.to(),
6279 76437 : empty()->Extend(value, zone)));
6280 : break;
6281 : }
6282 : }
6283 : }
6284 :
6285 :
6286 66012 : OutSet* DispatchTable::Get(uc32 value) {
6287 : ZoneSplayTree<Config>::Locator loc;
6288 66012 : if (!tree()->FindGreatestLessThan(value, &loc))
6289 0 : return empty();
6290 112674 : Entry* entry = &loc.value();
6291 66012 : if (value <= entry->to())
6292 46662 : return entry->out_set();
6293 : else
6294 19350 : return empty();
6295 : }
6296 :
6297 :
6298 : // -------------------------------------------------------------------
6299 : // Analysis
6300 :
6301 :
6302 1205521 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
6303 : StackLimitCheck check(isolate());
6304 1205521 : if (check.HasOverflowed()) {
6305 : fail("Stack overflow");
6306 : return;
6307 : }
6308 1205073 : if (that->info()->been_analyzed || that->info()->being_analyzed)
6309 : return;
6310 998576 : that->info()->being_analyzed = true;
6311 998576 : that->Accept(this);
6312 998576 : that->info()->being_analyzed = false;
6313 998576 : that->info()->been_analyzed = true;
6314 : }
6315 :
6316 :
6317 96434 : void Analysis::VisitEnd(EndNode* that) {
6318 : // nothing to do
6319 96434 : }
6320 :
6321 :
6322 703714 : void TextNode::CalculateOffsets() {
6323 337759 : int element_count = elements()->length();
6324 : // Set up the offsets of the elements relative to the start. This is a fixed
6325 : // quantity since a TextNode can only contain fixed-width things.
6326 : int cp_offset = 0;
6327 703714 : for (int i = 0; i < element_count; i++) {
6328 : TextElement& elm = elements()->at(i);
6329 : elm.set_cp_offset(cp_offset);
6330 365955 : cp_offset += elm.length();
6331 : }
6332 337759 : }
6333 :
6334 :
6335 832991 : void Analysis::VisitText(TextNode* that) {
6336 339885 : if (ignore_case()) {
6337 306442 : that->MakeCaseIndependent(isolate(), is_one_byte_);
6338 : }
6339 339885 : EnsureAnalyzed(that->on_success());
6340 339885 : if (!has_failed()) {
6341 337759 : that->CalculateOffsets();
6342 : }
6343 339885 : }
6344 :
6345 :
6346 690102 : void Analysis::VisitAction(ActionNode* that) {
6347 345051 : RegExpNode* target = that->on_success();
6348 345051 : EnsureAnalyzed(target);
6349 345051 : if (!has_failed()) {
6350 : // If the next node is interested in what it follows then this node
6351 : // has to be interested too so it can pass the information on.
6352 : that->info()->AddFromFollowing(target->info());
6353 : }
6354 345051 : }
6355 :
6356 :
6357 322122 : void Analysis::VisitChoice(ChoiceNode* that) {
6358 : NodeInfo* info = that->info();
6359 322122 : for (int i = 0; i < that->alternatives()->length(); i++) {
6360 129567 : RegExpNode* node = that->alternatives()->at(i).node();
6361 129567 : EnsureAnalyzed(node);
6362 161061 : if (has_failed()) return;
6363 : // Anything the following nodes need to know has to be known by
6364 : // this node also, so it can pass it on.
6365 : info->AddFromFollowing(node->info());
6366 : }
6367 : }
6368 :
6369 :
6370 1031212 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6371 : NodeInfo* info = that->info();
6372 919688 : for (int i = 0; i < that->alternatives()->length(); i++) {
6373 808348 : RegExpNode* node = that->alternatives()->at(i).node();
6374 348504 : if (node != that->loop_node()) {
6375 174344 : EnsureAnalyzed(node);
6376 348688 : if (has_failed()) return;
6377 : info->AddFromFollowing(node->info());
6378 : }
6379 : }
6380 : // Check the loop last since it may need the value of this node
6381 : // to get a correct result.
6382 111340 : EnsureAnalyzed(that->loop_node());
6383 111340 : if (!has_failed()) {
6384 : info->AddFromFollowing(that->loop_node()->info());
6385 : }
6386 : }
6387 :
6388 :
6389 2567 : void Analysis::VisitBackReference(BackReferenceNode* that) {
6390 2567 : EnsureAnalyzed(that->on_success());
6391 2567 : }
6392 :
6393 :
6394 8801 : void Analysis::VisitAssertion(AssertionNode* that) {
6395 8801 : EnsureAnalyzed(that->on_success());
6396 8801 : }
6397 :
6398 :
6399 176 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6400 : BoyerMooreLookahead* bm,
6401 : bool not_at_start) {
6402 : // Working out the set of characters that a backreference can match is too
6403 : // hard, so we just say that any character can match.
6404 : bm->SetRest(offset);
6405 : SaveBMInfo(bm, not_at_start, offset);
6406 176 : }
6407 :
6408 :
// Compile-time check that the map size of BoyerMoorePositionInfo equals the
// table size of RegExpMacroAssembler.
6409 : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6410 : RegExpMacroAssembler::kTableSize);
6411 :
6412 :
6413 16584 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6414 16584 : BoyerMooreLookahead* bm, bool not_at_start) {
6415 78054 : ZoneList<GuardedAlternative>* alts = alternatives();
6416 33168 : budget = (budget - 1) / alts->length();
6417 122940 : for (int i = 0; i < alts->length(); i++) {
6418 90013 : GuardedAlternative& alt = alts->at(i);
6419 45127 : if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6420 : bm->SetRest(offset); // Give up trying to fill in info.
6421 : SaveBMInfo(bm, not_at_start, offset);
6422 16584 : return;
6423 : }
6424 44886 : alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6425 : }
6426 : SaveBMInfo(bm, not_at_start, offset);
6427 : }
6428 :
6429 :
6430 139941 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6431 1242743 : BoyerMooreLookahead* bm, bool not_at_start) {
6432 139941 : if (initial_offset >= bm->length()) return;
6433 : int offset = initial_offset;
6434 : int max_char = bm->max_char();
6435 561670 : for (int i = 0; i < elements()->length(); i++) {
6436 156274 : if (offset >= bm->length()) {
6437 124250 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6438 : return;
6439 : }
6440 144166 : TextElement text = elements()->at(i);
6441 144166 : if (text.text_type() == TextElement::ATOM) {
6442 : RegExpAtom* atom = text.atom();
6443 154054 : for (int j = 0; j < atom->length(); j++, offset++) {
6444 64287 : if (offset >= bm->length()) {
6445 3272 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6446 : return;
6447 : }
6448 122030 : uc16 character = atom->data()[j];
6449 61015 : if (bm->compiler()->ignore_case()) {
6450 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6451 : int length = GetCaseIndependentLetters(
6452 : isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6453 4816 : chars);
6454 13689 : for (int j = 0; j < length; j++) {
6455 17746 : bm->Set(offset, chars[j]);
6456 : }
6457 : } else {
6458 112398 : if (character <= max_char) bm->Set(offset, character);
6459 : }
6460 : }
6461 : } else {
6462 : DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6463 : RegExpCharacterClass* char_class = text.char_class();
6464 456456 : ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6465 112142 : if (char_class->is_negated()) {
6466 4530 : bm->SetAll(offset);
6467 : } else {
6468 805300 : for (int k = 0; k < ranges->length(); k++) {
6469 550315 : CharacterRange& range = ranges->at(k);
6470 348844 : if (range.from() > max_char) continue;
6471 : int to = Min(max_char, static_cast<int>(range.to()));
6472 201471 : bm->SetInterval(offset, Interval(range.from(), to));
6473 : }
6474 : }
6475 112142 : offset++;
6476 : }
6477 : }
6478 124561 : if (offset >= bm->length()) {
6479 100333 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6480 : return;
6481 : }
6482 24228 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6483 24228 : true); // Not at start after a text node.
6484 24228 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6485 : }
6486 :
6487 :
6488 : // -------------------------------------------------------------------
6489 : // Dispatch table construction
6490 :
6491 :
6492 0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
6493 : AddRange(CharacterRange::Everything());
6494 0 : }
6495 :
6496 :
6497 0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6498 : node->set_being_calculated(true);
6499 0 : ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6500 0 : for (int i = 0; i < alternatives->length(); i++) {
6501 : set_choice_index(i);
6502 0 : alternatives->at(i).node()->Accept(this);
6503 : }
6504 : node->set_being_calculated(false);
6505 0 : }
6506 :
6507 :
6508 : class AddDispatchRange {
6509 : public:
6510 : explicit AddDispatchRange(DispatchTableConstructor* constructor)
6511 0 : : constructor_(constructor) { }
6512 : void Call(uc32 from, DispatchTable::Entry entry);
6513 : private:
6514 : DispatchTableConstructor* constructor_;
6515 : };
6516 :
6517 :
6518 0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6519 0 : constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6520 0 : }
6521 :
6522 :
6523 0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6524 0 : if (node->being_calculated())
6525 0 : return;
6526 0 : DispatchTable* table = node->GetTable(ignore_case_);
6527 : AddDispatchRange adder(this);
6528 : table->ForEach(&adder);
6529 : }
6530 :
6531 :
6532 0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6533 : // TODO(160): Find the node that we refer back to and propagate its start
6534 : // set back to here. For now we just accept anything.
6535 : AddRange(CharacterRange::Everything());
6536 0 : }
6537 :
6538 :
6539 0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6540 0 : RegExpNode* target = that->on_success();
6541 0 : target->Accept(this);
6542 0 : }
6543 :
6544 :
6545 9444 : static int CompareRangeByFrom(const CharacterRange* a,
6546 4722 : const CharacterRange* b) {
6547 14166 : return Compare<uc16>(a->from(), b->from());
6548 : }
6549 :
6550 :
6551 1098 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6552 : ranges->Sort(CompareRangeByFrom);
6553 : uc16 last = 0;
6554 2064 : for (int i = 0; i < ranges->length(); i++) {
6555 966 : CharacterRange range = ranges->at(i);
6556 966 : if (last < range.from())
6557 630 : AddRange(CharacterRange::Range(last, range.from() - 1));
6558 966 : if (range.to() >= last) {
6559 858 : if (range.to() == String::kMaxCodePoint) {
6560 66 : return;
6561 : } else {
6562 858 : last = range.to() + 1;
6563 : }
6564 : }
6565 : }
6566 66 : AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6567 : }
6568 :
6569 :
6570 0 : void DispatchTableConstructor::VisitText(TextNode* that) {
6571 0 : TextElement elm = that->elements()->at(0);
6572 0 : switch (elm.text_type()) {
6573 : case TextElement::ATOM: {
6574 0 : uc16 c = elm.atom()->data()[0];
6575 0 : AddRange(CharacterRange::Range(c, c));
6576 : break;
6577 : }
6578 : case TextElement::CHAR_CLASS: {
6579 : RegExpCharacterClass* tree = elm.char_class();
6580 0 : ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6581 0 : if (tree->is_negated()) {
6582 0 : AddInverse(ranges);
6583 : } else {
6584 0 : for (int i = 0; i < ranges->length(); i++)
6585 : AddRange(ranges->at(i));
6586 : }
6587 : break;
6588 : }
6589 : default: {
6590 0 : UNIMPLEMENTED();
6591 : }
6592 : }
6593 0 : }
6594 :
6595 :
6596 0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
6597 0 : RegExpNode* target = that->on_success();
6598 0 : target->Accept(this);
6599 0 : }
6600 :
6601 :
6602 43 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6603 : RegExpNode* on_success) {
6604 : // If the regexp matching starts within a surrogate pair, step back
6605 : // to the lead surrogate and start matching from there.
6606 : DCHECK(!compiler->read_backward());
6607 : Zone* zone = compiler->zone();
6608 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6609 43 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6610 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6611 43 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6612 :
6613 43 : ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6614 :
6615 : int stack_register = compiler->UnicodeLookaroundStackRegister();
6616 : int position_register = compiler->UnicodeLookaroundPositionRegister();
6617 : RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6618 43 : zone, lead_surrogates, true, on_success);
6619 : RegExpLookaround::Builder builder(true, step_back, stack_register,
6620 43 : position_register);
6621 : RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6622 43 : zone, trail_surrogates, false, builder.on_match_success());
6623 :
6624 : optional_step_back->AddAlternative(
6625 43 : GuardedAlternative(builder.ForMatch(match_trail)));
6626 : optional_step_back->AddAlternative(GuardedAlternative(on_success));
6627 :
6628 43 : return optional_step_back;
6629 : }
6630 :
6631 :
6632 93976 : RegExpEngine::CompilationResult RegExpEngine::Compile(
6633 : Isolate* isolate, Zone* zone, RegExpCompileData* data,
6634 : JSRegExp::Flags flags, Handle<String> pattern,
6635 : Handle<String> sample_subject, bool is_one_byte) {
6636 93976 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6637 : return IrregexpRegExpTooBig(isolate);
6638 : }
6639 93966 : bool ignore_case = flags & JSRegExp::kIgnoreCase;
6640 93966 : bool is_sticky = flags & JSRegExp::kSticky;
6641 93966 : bool is_global = flags & JSRegExp::kGlobal;
6642 : bool is_unicode = flags & JSRegExp::kUnicode;
6643 : RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6644 93966 : is_one_byte);
6645 :
6646 93966 : if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6647 :
6648 : // Sample some characters from the middle of the string.
6649 : static const int kSampleSize = 128;
6650 :
6651 93966 : sample_subject = String::Flatten(sample_subject);
6652 : int chars_sampled = 0;
6653 93966 : int half_way = (sample_subject->length() - kSampleSize) / 2;
6654 1996160 : for (int i = Max(0, half_way);
6655 998080 : i < sample_subject->length() && chars_sampled < kSampleSize;
6656 : i++, chars_sampled++) {
6657 : compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6658 : }
6659 :
6660 : // Wrap the body of the regexp in capture #0.
6661 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6662 : 0,
6663 : &compiler,
6664 93966 : compiler.accept());
6665 : RegExpNode* node = captured_body;
6666 93966 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6667 93966 : bool is_start_anchored = data->tree->IsAnchoredAtStart();
6668 93966 : int max_length = data->tree->max_match();
6669 93966 : if (!is_start_anchored && !is_sticky) {
6670 : // Add a .*? at the beginning, outside the body capture, unless
6671 : // this expression is anchored at the beginning or sticky.
6672 : RegExpNode* loop_node = RegExpQuantifier::ToNode(
6673 : 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6674 179650 : &compiler, captured_body, data->contains_anchor);
6675 :
6676 89825 : if (data->contains_anchor) {
6677 : // Unroll loop once, to take care of the case that might start
6678 : // at the start of input.
6679 165 : ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6680 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
6681 : first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6682 165 : new (zone) RegExpCharacterClass('*'), false, loop_node)));
6683 : node = first_step_node;
6684 : } else {
6685 : node = loop_node;
6686 : }
6687 : }
6688 93966 : if (is_one_byte) {
6689 22513 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6690 : // Do it again to propagate the new nodes to places where they were not
6691 : // put because they had not been calculated yet.
6692 22513 : if (node != nullptr) {
6693 22207 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6694 : }
6695 71453 : } else if (compiler.unicode() && (is_global || is_sticky)) {
6696 43 : node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6697 : }
6698 :
6699 93966 : if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6700 93966 : data->node = node;
6701 : Analysis analysis(isolate, flags, is_one_byte);
6702 93966 : analysis.EnsureAnalyzed(node);
6703 93966 : if (analysis.has_failed()) {
6704 : const char* error_message = analysis.error_message();
6705 : return CompilationResult(isolate, error_message);
6706 : }
6707 :
6708 : // Create the correct assembler for the architecture.
6709 : #ifndef V8_INTERPRETED_REGEXP
6710 : // Native regexp implementation.
6711 :
6712 : NativeRegExpMacroAssembler::Mode mode =
6713 : is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6714 93518 : : NativeRegExpMacroAssembler::UC16;
6715 :
6716 : #if V8_TARGET_ARCH_IA32
6717 : RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6718 : (data->capture_count + 1) * 2);
6719 : #elif V8_TARGET_ARCH_X64
6720 : RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6721 187036 : (data->capture_count + 1) * 2);
6722 : #elif V8_TARGET_ARCH_ARM
6723 : RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6724 : (data->capture_count + 1) * 2);
6725 : #elif V8_TARGET_ARCH_ARM64
6726 : RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6727 : (data->capture_count + 1) * 2);
6728 : #elif V8_TARGET_ARCH_S390
6729 : RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6730 : (data->capture_count + 1) * 2);
6731 : #elif V8_TARGET_ARCH_PPC
6732 : RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6733 : (data->capture_count + 1) * 2);
6734 : #elif V8_TARGET_ARCH_MIPS
6735 : RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6736 : (data->capture_count + 1) * 2);
6737 : #elif V8_TARGET_ARCH_MIPS64
6738 : RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6739 : (data->capture_count + 1) * 2);
6740 : #else
6741 : #error "Unsupported architecture"
6742 : #endif
6743 :
6744 : #else // V8_INTERPRETED_REGEXP
6745 : // Interpreted regexp implementation.
6746 : EmbeddedVector<byte, 1024> codes;
6747 : RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6748 : #endif // V8_INTERPRETED_REGEXP
6749 :
6750 93518 : macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6751 :
6752 : // Inserted here, instead of in Assembler, because it depends on information
6753 : // in the AST that isn't replicated in the Node structure.
6754 : static const int kMaxBacksearchLimit = 1024;
6755 94072 : if (is_end_anchored && !is_start_anchored && !is_sticky &&
6756 554 : max_length < kMaxBacksearchLimit) {
6757 230 : macro_assembler.SetCurrentPositionFromEnd(max_length);
6758 : }
6759 :
6760 93518 : if (is_global) {
6761 : RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6762 7156 : if (data->tree->min_match() > 0) {
6763 : mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6764 141 : } else if (is_unicode) {
6765 : mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6766 : }
6767 : macro_assembler.set_global_mode(mode);
6768 : }
6769 :
6770 : return compiler.Assemble(¯o_assembler,
6771 : node,
6772 : data->capture_count,
6773 93518 : pattern);
6774 : }
6775 :
6776 :
6777 186296 : bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6778 : Heap* heap = pattern->GetHeap();
6779 186296 : bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6780 372592 : if (heap->isolate()->total_regexp_code_generated() >
6781 314390 : RegExpImpl::kRegExpCompiledLimit &&
6782 128094 : heap->CommittedMemoryExecutable() >
6783 : RegExpImpl::kRegExpExecutableMemoryLimit) {
6784 : too_much = true;
6785 : }
6786 186296 : return too_much;
6787 : }
6788 :
6789 :
6790 62376 : Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6791 : Object* key_pattern,
6792 : FixedArray** last_match_cache,
6793 : ResultsCacheType type) {
6794 : FixedArray* cache;
6795 49837 : if (!key_string->IsInternalizedString()) return Smi::kZero;
6796 12539 : if (type == STRING_SPLIT_SUBSTRINGS) {
6797 : DCHECK(key_pattern->IsString());
6798 12539 : if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6799 : cache = heap->string_split_cache();
6800 : } else {
6801 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6802 : DCHECK(key_pattern->IsFixedArray());
6803 : cache = heap->regexp_multiple_cache();
6804 : }
6805 :
6806 : uint32_t hash = key_string->Hash();
6807 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6808 12539 : ~(kArrayEntriesPerCacheEntry - 1));
6809 36655 : if (cache->get(index + kStringOffset) != key_string ||
6810 11577 : cache->get(index + kPatternOffset) != key_pattern) {
6811 : index =
6812 1240 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6813 2647 : if (cache->get(index + kStringOffset) != key_string ||
6814 167 : cache->get(index + kPatternOffset) != key_pattern) {
6815 : return Smi::kZero;
6816 : }
6817 : }
6818 :
6819 22900 : *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6820 22900 : return cache->get(index + kArrayOffset);
6821 : }
6822 :
6823 :
6824 38387 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6825 : Handle<Object> key_pattern,
6826 : Handle<FixedArray> value_array,
6827 : Handle<FixedArray> last_match_cache,
6828 : ResultsCacheType type) {
6829 : Factory* factory = isolate->factory();
6830 : Handle<FixedArray> cache;
6831 38387 : if (!key_string->IsInternalizedString()) return;
6832 1089 : if (type == STRING_SPLIT_SUBSTRINGS) {
6833 : DCHECK(key_pattern->IsString());
6834 1089 : if (!key_pattern->IsInternalizedString()) return;
6835 : cache = factory->string_split_cache();
6836 : } else {
6837 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6838 : DCHECK(key_pattern->IsFixedArray());
6839 : cache = factory->regexp_multiple_cache();
6840 : }
6841 :
6842 : uint32_t hash = key_string->Hash();
6843 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6844 1089 : ~(kArrayEntriesPerCacheEntry - 1));
6845 2178 : if (cache->get(index + kStringOffset) == Smi::kZero) {
6846 960 : cache->set(index + kStringOffset, *key_string);
6847 1920 : cache->set(index + kPatternOffset, *key_pattern);
6848 1920 : cache->set(index + kArrayOffset, *value_array);
6849 1920 : cache->set(index + kLastMatchOffset, *last_match_cache);
6850 : } else {
6851 : uint32_t index2 =
6852 129 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6853 258 : if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6854 106 : cache->set(index2 + kStringOffset, *key_string);
6855 212 : cache->set(index2 + kPatternOffset, *key_pattern);
6856 212 : cache->set(index2 + kArrayOffset, *value_array);
6857 212 : cache->set(index2 + kLastMatchOffset, *last_match_cache);
6858 : } else {
6859 : cache->set(index2 + kStringOffset, Smi::kZero);
6860 23 : cache->set(index2 + kPatternOffset, Smi::kZero);
6861 23 : cache->set(index2 + kArrayOffset, Smi::kZero);
6862 23 : cache->set(index2 + kLastMatchOffset, Smi::kZero);
6863 23 : cache->set(index + kStringOffset, *key_string);
6864 46 : cache->set(index + kPatternOffset, *key_pattern);
6865 46 : cache->set(index + kArrayOffset, *value_array);
6866 46 : cache->set(index + kLastMatchOffset, *last_match_cache);
6867 : }
6868 : }
6869 : // If the array is a reasonably short list of substrings, convert it into a
6870 : // list of internalized strings.
6871 2178 : if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6872 7507 : for (int i = 0; i < value_array->length(); i++) {
6873 : Handle<String> str(String::cast(value_array->get(i)), isolate);
6874 3212 : Handle<String> internalized_str = factory->InternalizeString(str);
6875 3212 : value_array->set(i, *internalized_str);
6876 : }
6877 : }
6878 : // Convert backing store to a copy-on-write array.
6879 1089 : value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map());
6880 : }
6881 :
6882 :
6883 113600 : void RegExpResultsCache::Clear(FixedArray* cache) {
6884 29195200 : for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6885 : cache->set(i, Smi::kZero);
6886 : }
6887 113600 : }
6888 :
6889 : } // namespace internal
6890 : } // namespace v8
|