Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/regexp/jsregexp.h"
6 :
7 : #include <memory>
8 :
9 : #include "src/base/platform/platform.h"
10 : #include "src/compilation-cache.h"
11 : #include "src/elements.h"
12 : #include "src/execution.h"
13 : #include "src/factory.h"
14 : #include "src/isolate-inl.h"
15 : #include "src/messages.h"
16 : #include "src/ostreams.h"
17 : #include "src/regexp/interpreter-irregexp.h"
18 : #include "src/regexp/jsregexp-inl.h"
19 : #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 : #include "src/regexp/regexp-macro-assembler-tracer.h"
21 : #include "src/regexp/regexp-macro-assembler.h"
22 : #include "src/regexp/regexp-parser.h"
23 : #include "src/regexp/regexp-stack.h"
24 : #include "src/runtime/runtime.h"
25 : #include "src/splay-tree-inl.h"
26 : #include "src/string-search.h"
27 : #include "src/unicode-decoder.h"
28 :
29 : #ifdef V8_INTL_SUPPORT
30 : #include "unicode/uniset.h"
31 : #include "unicode/utypes.h"
32 : #endif // V8_INTL_SUPPORT
33 :
34 : #ifndef V8_INTERPRETED_REGEXP
35 : #if V8_TARGET_ARCH_IA32
36 : #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
37 : #elif V8_TARGET_ARCH_X64
38 : #include "src/regexp/x64/regexp-macro-assembler-x64.h"
39 : #elif V8_TARGET_ARCH_ARM64
40 : #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
41 : #elif V8_TARGET_ARCH_ARM
42 : #include "src/regexp/arm/regexp-macro-assembler-arm.h"
43 : #elif V8_TARGET_ARCH_PPC
44 : #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
45 : #elif V8_TARGET_ARCH_S390
46 : #include "src/regexp/s390/regexp-macro-assembler-s390.h"
47 : #elif V8_TARGET_ARCH_MIPS
48 : #include "src/regexp/mips/regexp-macro-assembler-mips.h"
49 : #elif V8_TARGET_ARCH_MIPS64
50 : #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
51 : #elif V8_TARGET_ARCH_X87
52 : #include "src/regexp/x87/regexp-macro-assembler-x87.h"
53 : #else
54 : #error Unsupported target architecture.
55 : #endif
56 : #endif
57 :
58 :
59 : namespace v8 {
60 : namespace internal {
61 :
62 : MUST_USE_RESULT
63 4076 : static inline MaybeHandle<Object> ThrowRegExpException(
64 : Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
65 : Isolate* isolate = re->GetIsolate();
66 8152 : THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 : pattern, error_text),
68 : Object);
69 : }
70 :
71 :
72 456 : inline void ThrowRegExpException(Handle<JSRegExp> re,
73 : Handle<String> error_text) {
74 456 : USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
75 456 : }
76 :
77 :
78 1088516 : ContainedInLattice AddRange(ContainedInLattice containment,
79 : const int* ranges,
80 : int ranges_length,
81 : Interval new_range) {
82 : DCHECK((ranges_length & 1) == 1);
83 : DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
84 1088516 : if (containment == kLatticeUnknown) return containment;
85 : bool inside = false;
86 : int last = 0;
87 3700949 : for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 : // Consider the range from last to ranges[i].
89 : // We haven't got to the new range yet.
90 4584555 : if (ranges[i] <= new_range.from()) continue;
91 : // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 : // inclusive, but the values in ranges are not.
93 883606 : if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 1723778 : return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 : }
96 : return kLatticeUnknown;
97 : }
98 : return containment;
99 : }
100 :
101 :
102 : // Tuning trade-off: a larger value makes code generation slower, a smaller value lowers the V8 benchmark score.
103 : const int kMaxLookaheadForBoyerMoore = 8;
104 : // Patterns this short gain too little from Boyer-Moore: even a 3-character pattern can step forwards at most
105 : // 3 characters at a time, which is not always enough to pay for the extra logic.
106 : const int kPatternTooShortForBoyerMoore = 2;
107 :
108 :
109 : // Identifies the sort of regexps where the regexp engine is faster
110 : // than the code used for atom matches.
111 262562 : static bool HasFewDifferentCharacters(Handle<String> pattern) {
112 : int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
113 262562 : if (length <= kPatternTooShortForBoyerMoore) return false;
114 : const int kMod = 128;
115 : bool character_found[kMod];
116 : int different = 0;
117 : memset(&character_found[0], 0, sizeof(character_found));
118 768996 : for (int i = 0; i < length; i++) {
119 768884 : int ch = (pattern->Get(i) & (kMod - 1));
120 768884 : if (!character_found[ch]) {
121 768428 : character_found[ch] = true;
122 768428 : different++;
123 : // We declare a regexp low-alphabet if it has at least 3 times as many
124 : // characters as it has different characters.
125 768428 : if (different * 3 > length) return false;
126 : }
127 : }
128 : return true;
129 : }
130 :
131 :
132 : // Generic RegExp methods. Dispatches to implementation specific methods.
133 :
134 :
135 719988 : MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
136 : Handle<String> pattern,
137 : JSRegExp::Flags flags) {
138 : DCHECK(pattern->IsFlat());
139 :
140 1439976 : Isolate* isolate = re->GetIsolate();
141 719988 : Zone zone(isolate->allocator(), ZONE_NAME);
142 : CompilationCache* compilation_cache = isolate->compilation_cache();
143 : MaybeHandle<FixedArray> maybe_cached =
144 719988 : compilation_cache->LookupRegExp(pattern, flags);
145 : Handle<FixedArray> cached;
146 719988 : if (maybe_cached.ToHandle(&cached)) {
147 356415 : re->set_data(*cached);
148 : return re;
149 : }
150 :
151 : PostponeInterruptsScope postpone(isolate);
152 : RegExpCompileData parse_result;
153 363573 : FlatStringReader reader(isolate, pattern);
154 363573 : if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
155 363573 : &parse_result)) {
156 : // Throw an exception if we fail to parse the pattern.
157 3562 : return ThrowRegExpException(re, pattern, parse_result.error);
158 : }
159 :
160 : bool has_been_compiled = false;
161 :
162 870650 : if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
163 615020 : !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
164 : // Parse-tree is a single atom that is equal to the pattern.
165 254957 : AtomCompile(re, pattern, flags, pattern);
166 : has_been_compiled = true;
167 225918 : } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
168 112621 : !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
169 7553 : RegExpAtom* atom = parse_result.tree->AsAtom();
170 7553 : Vector<const uc16> atom_pattern = atom->data();
171 : Handle<String> atom_string;
172 15106 : ASSIGN_RETURN_ON_EXCEPTION(
173 : isolate, atom_string,
174 : isolate->factory()->NewStringFromTwoByte(atom_pattern),
175 : Object);
176 7553 : if (!HasFewDifferentCharacters(atom_string)) {
177 7493 : AtomCompile(re, pattern, flags, atom_string);
178 : has_been_compiled = true;
179 : }
180 : }
181 360011 : if (!has_been_compiled) {
182 97561 : IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
183 : }
184 : DCHECK(re->data()->IsFixedArray());
185 : // Compilation succeeded so the data is set on the regexp
186 : // and we can store it in the cache.
187 : Handle<FixedArray> data(FixedArray::cast(re->data()));
188 360011 : compilation_cache->PutRegExp(pattern, flags, data);
189 :
190 719988 : return re;
191 : }
192 :
193 611183 : MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
194 : Handle<String> subject, int index,
195 : Handle<RegExpMatchInfo> last_match_info) {
196 611183 : switch (regexp->TypeTag()) {
197 : case JSRegExp::ATOM:
198 486154 : return AtomExec(regexp, subject, index, last_match_info);
199 : case JSRegExp::IRREGEXP: {
200 125029 : return IrregexpExec(regexp, subject, index, last_match_info);
201 : }
202 : default:
203 0 : UNREACHABLE();
204 : return MaybeHandle<Object>();
205 : }
206 : }
207 :
208 :
209 : // RegExp Atom implementation: Simple string search using indexOf.
210 :
211 :
212 262450 : void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
213 : Handle<String> pattern,
214 : JSRegExp::Flags flags,
215 : Handle<String> match_pattern) {
216 : re->GetIsolate()->factory()->SetRegExpAtomData(re,
217 : JSRegExp::ATOM,
218 : pattern,
219 : flags,
220 262450 : match_pattern);
221 262450 : }
222 :
223 330120 : static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,
224 : String* subject, int from, int to) {
225 : SealHandleScope shs(last_match_info->GetIsolate());
226 : last_match_info->SetNumberOfCaptureRegisters(2);
227 : last_match_info->SetLastSubject(subject);
228 : last_match_info->SetLastInput(subject);
229 : last_match_info->SetCapture(0, from);
230 : last_match_info->SetCapture(1, to);
231 330120 : }
232 :
233 :
234 626544 : int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
235 : Handle<String> subject,
236 : int index,
237 : int32_t* output,
238 : int output_size) {
239 : Isolate* isolate = regexp->GetIsolate();
240 :
241 : DCHECK(0 <= index);
242 : DCHECK(index <= subject->length());
243 :
244 626544 : subject = String::Flatten(subject);
245 : DisallowHeapAllocation no_gc; // ensure vectors stay valid
246 :
247 : String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
248 : int needle_len = needle->length();
249 : DCHECK(needle->IsFlat());
250 : DCHECK_LT(0, needle_len);
251 :
252 1253088 : if (index + needle_len > subject->length()) {
253 : return RegExpImpl::RE_FAILURE;
254 : }
255 :
256 471738 : for (int i = 0; i < output_size; i += 2) {
257 767514 : String::FlatContent needle_content = needle->GetFlatContent();
258 767514 : String::FlatContent subject_content = subject->GetFlatContent();
259 : DCHECK(needle_content.IsFlat());
260 : DCHECK(subject_content.IsFlat());
261 : // dispatch on type of strings
262 : index =
263 767514 : (needle_content.IsOneByte()
264 : ? (subject_content.IsOneByte()
265 : ? SearchString(isolate, subject_content.ToOneByteVector(),
266 : needle_content.ToOneByteVector(), index)
267 : : SearchString(isolate, subject_content.ToUC16Vector(),
268 : needle_content.ToOneByteVector(), index))
269 : : (subject_content.IsOneByte()
270 : ? SearchString(isolate, subject_content.ToOneByteVector(),
271 : needle_content.ToUC16Vector(), index)
272 : : SearchString(isolate, subject_content.ToUC16Vector(),
273 767514 : needle_content.ToUC16Vector(), index)));
274 767514 : if (index == -1) {
275 295776 : return i / 2; // Return number of matches.
276 : } else {
277 471738 : output[i] = index;
278 471738 : output[i+1] = index + needle_len;
279 : index += needle_len;
280 : }
281 : }
282 330120 : return output_size / 2;
283 : }
284 :
285 486154 : Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
286 : int index,
287 : Handle<RegExpMatchInfo> last_match_info) {
288 : Isolate* isolate = re->GetIsolate();
289 :
290 : static const int kNumRegisters = 2;
291 : STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
292 486154 : int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
293 :
294 486154 : int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
295 :
296 642188 : if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
297 :
298 : DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
299 : SealHandleScope shs(isolate);
300 : SetAtomLastCapture(last_match_info, *subject, output_registers[0],
301 660240 : output_registers[1]);
302 330120 : return last_match_info;
303 : }
304 :
305 :
306 : // Irregexp implementation.
307 :
308 : // Ensures that the regexp object contains a compiled version of the
309 : // source for either one-byte or two-byte subject strings.
310 : // If the compiled version doesn't already exist, it is compiled
311 : // from the source pattern.
312 : // If compilation fails, an exception is thrown and this function
313 : // returns false.
314 1057858 : bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
315 : Handle<String> sample_subject,
316 : bool is_one_byte) {
317 : Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
318 : #ifdef V8_INTERPRETED_REGEXP
319 : if (compiled_code->IsByteArray()) return true;
320 : #else // V8_INTERPRETED_REGEXP (RegExp native code)
321 1057858 : if (compiled_code->IsCode()) return true;
322 : #endif
323 : // We could potentially have marked this as flushable, but have kept
324 : // a saved version if we did not flush it yet.
325 : Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
326 93313 : if (saved_code->IsCode()) {
327 : // Reinstate the code in the original place.
328 : re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
329 : DCHECK(compiled_code->IsSmi());
330 625 : return true;
331 : }
332 92688 : return CompileIrregexp(re, sample_subject, is_one_byte);
333 : }
334 :
335 :
336 92688 : bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
337 : Handle<String> sample_subject,
338 : bool is_one_byte) {
339 : // Compile the RegExp.
340 92688 : Isolate* isolate = re->GetIsolate();
341 92688 : Zone zone(isolate->allocator(), ZONE_NAME);
342 : PostponeInterruptsScope postpone(isolate);
343 : // If we had a compilation error the last time this is saved at the
344 : // saved code index.
345 : Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
346 : // When arriving here entry can only be a smi, either representing an
347 : // uncompiled regexp, a previous compilation error, or code that has
348 : // been flushed.
349 : DCHECK(entry->IsSmi());
350 : int entry_value = Smi::cast(entry)->value();
351 : DCHECK(entry_value == JSRegExp::kUninitializedValue ||
352 : entry_value == JSRegExp::kCompilationErrorValue ||
353 : (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
354 :
355 92688 : if (entry_value == JSRegExp::kCompilationErrorValue) {
356 : // A previous compilation failed and threw an error which we store in
357 : // the saved code index (we store the error message, not the actual
358 : // error). Recreate the error object and throw it.
359 : Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
360 : DCHECK(error_string->IsString());
361 : Handle<String> error_message(String::cast(error_string));
362 0 : ThrowRegExpException(re, error_message);
363 : return false;
364 : }
365 :
366 92688 : JSRegExp::Flags flags = re->GetFlags();
367 :
368 : Handle<String> pattern(re->Pattern());
369 92688 : pattern = String::Flatten(pattern);
370 : RegExpCompileData compile_data;
371 92688 : FlatStringReader reader(isolate, pattern);
372 92688 : if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
373 92688 : &compile_data)) {
374 : // Throw an exception if we fail to parse the pattern.
375 : // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
376 58 : USE(ThrowRegExpException(re, pattern, compile_data.error));
377 58 : return false;
378 : }
379 : RegExpEngine::CompilationResult result =
380 : RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
381 92630 : sample_subject, is_one_byte);
382 92630 : if (result.error_message != NULL) {
383 : // Unable to compile regexp.
384 : Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
385 912 : CStrVector(result.error_message)).ToHandleChecked();
386 456 : ThrowRegExpException(re, error_message);
387 : return false;
388 : }
389 :
390 : Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
391 184348 : data->set(JSRegExp::code_index(is_one_byte), result.code);
392 92174 : SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
393 : int register_max = IrregexpMaxRegisterCount(*data);
394 92174 : if (result.num_registers > register_max) {
395 : SetIrregexpMaxRegisterCount(*data, result.num_registers);
396 : }
397 :
398 92688 : return true;
399 : }
400 :
401 :
402 0 : int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
403 : return Smi::cast(
404 0 : re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
405 : }
406 :
407 :
408 0 : void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
409 : re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
410 0 : }
411 :
412 92174 : void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
413 : Handle<FixedArray> value) {
414 92174 : if (value.is_null()) {
415 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
416 : } else {
417 438 : re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
418 : }
419 92174 : }
420 :
421 0 : int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
422 0 : return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
423 : }
424 :
425 :
426 0 : int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
427 0 : return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
428 : }
429 :
430 :
431 0 : ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
432 0 : return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
433 : }
434 :
435 :
436 0 : Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
437 0 : return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
438 : }
439 :
440 :
441 97561 : void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
442 : Handle<String> pattern,
443 : JSRegExp::Flags flags,
444 : int capture_count) {
445 : // Initialize compiled code entries to null.
446 : re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
447 : JSRegExp::IRREGEXP,
448 : pattern,
449 : flags,
450 97561 : capture_count);
451 97561 : }
452 :
453 :
454 357478 : int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
455 : Handle<String> subject) {
456 : DCHECK(subject->IsFlat());
457 :
458 : // Check representation of the underlying storage.
459 357478 : bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
460 357478 : if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
461 :
462 : #ifdef V8_INTERPRETED_REGEXP
463 : // Byte-code regexp needs space allocated for all its registers.
464 : // The result captures are copied to the start of the registers array
465 : // if the match succeeds. This way those registers are not clobbered
466 : // when we set the last match info from last successful match.
467 : return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
468 : (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
469 : #else // V8_INTERPRETED_REGEXP
470 : // Native regexp only needs room to output captures. Registers are handled
471 : // internally.
472 356964 : return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
473 : #endif // V8_INTERPRETED_REGEXP
474 : }
475 :
476 :
477 700380 : int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
478 : Handle<String> subject,
479 : int index,
480 : int32_t* output,
481 : int output_size) {
482 : Isolate* isolate = regexp->GetIsolate();
483 :
484 : Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
485 :
486 : DCHECK(index >= 0);
487 : DCHECK(index <= subject->length());
488 : DCHECK(subject->IsFlat());
489 :
490 700380 : bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
491 :
492 : #ifndef V8_INTERPRETED_REGEXP
493 : DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
494 : do {
495 700380 : EnsureCompiledIrregexp(regexp, subject, is_one_byte);
496 : Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
497 : // The stack is used to allocate registers for the compiled regexp code.
498 : // This means that in case of failure, the output registers array is left
499 : // untouched and contains the capture results from the previous successful
500 : // match. We can use that to set the last match info lazily.
501 : NativeRegExpMacroAssembler::Result res =
502 : NativeRegExpMacroAssembler::Match(code,
503 : subject,
504 : output,
505 : output_size,
506 : index,
507 700380 : isolate);
508 700380 : if (res != NativeRegExpMacroAssembler::RETRY) {
509 : DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
510 : isolate->has_pending_exception());
511 : STATIC_ASSERT(
512 : static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
513 : STATIC_ASSERT(
514 : static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
515 : STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
516 : == RE_EXCEPTION);
517 700380 : return static_cast<IrregexpResult>(res);
518 : }
519 : // If result is RETRY, the string has changed representation, and we
520 : // must restart from scratch.
521 : // In this case, it means we must make sure we are prepared to handle
522 : // the, potentially, different subject (the string can switch between
523 : // being internal and external, and even between being Latin1 and UC16,
524 : // but the characters are always the same).
525 0 : IrregexpPrepare(regexp, subject);
526 0 : is_one_byte = subject->IsOneByteRepresentationUnderneath();
527 : } while (true);
528 : UNREACHABLE();
529 0 : return RE_EXCEPTION;
530 : #else // V8_INTERPRETED_REGEXP
531 :
532 : DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
533 : // We must have done EnsureCompiledIrregexp, so we can get the number of
534 : // registers.
535 : int number_of_capture_registers =
536 : (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
537 : int32_t* raw_output = &output[number_of_capture_registers];
538 : // We do not touch the actual capture result registers until we know there
539 : // has been a match so that we can use those capture results to set the
540 : // last match info.
541 : for (int i = number_of_capture_registers - 1; i >= 0; i--) {
542 : raw_output[i] = -1;
543 : }
544 : Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
545 : isolate);
546 :
547 : IrregexpResult result = IrregexpInterpreter::Match(isolate,
548 : byte_codes,
549 : subject,
550 : raw_output,
551 : index);
552 : if (result == RE_SUCCESS) {
553 : // Copy capture results to the start of the registers array.
554 : MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
555 : }
556 : if (result == RE_EXCEPTION) {
557 : DCHECK(!isolate->has_pending_exception());
558 : isolate->StackOverflow();
559 : }
560 : return result;
561 : #endif // V8_INTERPRETED_REGEXP
562 : }
563 :
564 125029 : MaybeHandle<Object> RegExpImpl::IrregexpExec(
565 : Handle<JSRegExp> regexp, Handle<String> subject, int previous_index,
566 : Handle<RegExpMatchInfo> last_match_info) {
567 : Isolate* isolate = regexp->GetIsolate();
568 : DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
569 :
570 125029 : subject = String::Flatten(subject);
571 :
572 : // Prepare space for the return values.
573 : #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
574 : if (FLAG_trace_regexp_bytecodes) {
575 : String* pattern = regexp->Pattern();
576 : PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
577 : PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
578 : }
579 : #endif
580 125029 : int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
581 125029 : if (required_registers < 0) {
582 : // Compiling failed with an exception.
583 : DCHECK(isolate->has_pending_exception());
584 : return MaybeHandle<Object>();
585 : }
586 :
587 : int32_t* output_registers = NULL;
588 124760 : if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
589 2422 : output_registers = NewArray<int32_t>(required_registers);
590 : }
591 : std::unique_ptr<int32_t[]> auto_release(output_registers);
592 124760 : if (output_registers == NULL) {
593 122338 : output_registers = isolate->jsregexp_static_offsets_vector();
594 : }
595 :
596 : int res = RegExpImpl::IrregexpExecRaw(
597 124760 : regexp, subject, previous_index, output_registers, required_registers);
598 124760 : if (res == RE_SUCCESS) {
599 : int capture_count =
600 : IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
601 : return SetLastMatchInfo(
602 97638 : last_match_info, subject, capture_count, output_registers);
603 : }
604 27122 : if (res == RE_EXCEPTION) {
605 : DCHECK(isolate->has_pending_exception());
606 : return MaybeHandle<Object>();
607 : }
608 : DCHECK(res == RE_FAILURE);
609 : return isolate->factory()->null_value();
610 : }
611 :
612 259838 : Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
613 : Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
614 : int capture_count, int32_t* match) {
615 : // This is the only place where match infos can grow. If, after executing the
616 : // regexp, RegExpExecStub finds that the match info is too small, it restarts
617 : // execution in RegExpImpl::Exec, which finally grows the match info right
618 : // here.
619 :
620 259838 : int capture_register_count = (capture_count + 1) * 2;
621 : Handle<RegExpMatchInfo> result =
622 259838 : RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count);
623 : result->SetNumberOfCaptureRegisters(capture_register_count);
624 :
625 259838 : if (*result != *last_match_info) {
626 : // The match info has been reallocated, update the corresponding reference
627 : // on the native context.
628 : Isolate* isolate = last_match_info->GetIsolate();
629 9720 : if (*last_match_info == *isolate->regexp_last_match_info()) {
630 6188 : isolate->native_context()->set_regexp_last_match_info(*result);
631 3532 : } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
632 3532 : isolate->native_context()->set_regexp_internal_match_info(*result);
633 : }
634 : }
635 :
636 : DisallowHeapAllocation no_allocation;
637 259838 : if (match != NULL) {
638 1176454 : for (int i = 0; i < capture_register_count; i += 2) {
639 1176454 : result->SetCapture(i, match[i]);
640 1176454 : result->SetCapture(i + 1, match[i + 1]);
641 : }
642 : }
643 : result->SetLastSubject(*subject);
644 : result->SetLastInput(*subject);
645 259838 : return result;
646 : }
647 :
648 :
649 361004 : RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
650 : Handle<String> subject,
651 : Isolate* isolate)
652 : : register_array_(NULL),
653 : register_array_size_(0),
654 : regexp_(regexp),
655 361004 : subject_(subject) {
656 : #ifdef V8_INTERPRETED_REGEXP
657 : bool interpreted = true;
658 : #else
659 : bool interpreted = false;
660 : #endif // V8_INTERPRETED_REGEXP
661 :
662 361004 : if (regexp_->TypeTag() == JSRegExp::ATOM) {
663 : static const int kAtomRegistersPerMatch = 2;
664 140390 : registers_per_match_ = kAtomRegistersPerMatch;
665 : // There is no distinction between interpreted and native for atom regexps.
666 : interpreted = false;
667 : } else {
668 220614 : registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
669 220614 : if (registers_per_match_ < 0) {
670 117 : num_matches_ = -1; // Signal exception.
671 361121 : return;
672 : }
673 : }
674 :
675 : DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
676 : if (!interpreted) {
677 : register_array_size_ =
678 721774 : Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
679 360887 : max_matches_ = register_array_size_ / registers_per_match_;
680 : } else {
681 : // Global loop in interpreted regexp is not implemented. We choose
682 : // the size of the offsets vector so that it can only store one match.
683 : register_array_size_ = registers_per_match_;
684 : max_matches_ = 1;
685 : }
686 :
687 360887 : if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
688 1764 : register_array_ = NewArray<int32_t>(register_array_size_);
689 : } else {
690 359123 : register_array_ = isolate->jsregexp_static_offsets_vector();
691 : }
692 :
693 : // Set state so that fetching the results the first time triggers a call
694 : // to the compiled regexp.
695 360887 : current_match_index_ = max_matches_ - 1;
696 360887 : num_matches_ = max_matches_;
697 : DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
698 : DCHECK_GE(register_array_size_, registers_per_match_);
699 : int32_t* last_match =
700 360887 : ®ister_array_[current_match_index_ * registers_per_match_];
701 360887 : last_match[0] = -1;
702 360887 : last_match[1] = 0;
703 : }
704 :
705 449 : int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
706 883 : if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
707 868 : last_index + 1 < subject_->length() &&
708 1317 : unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
709 434 : unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
710 : // Advance over the surrogate pair.
711 434 : return last_index + 2;
712 : }
713 15 : return last_index + 1;
714 : }
715 :
716 : // -------------------------------------------------------------------
717 : // Implementation of the Irregexp regular expression engine.
718 : //
719 : // The Irregexp regular expression engine is intended to be a complete
720 : // implementation of ECMAScript regular expressions. It generates either
721 : // bytecodes or native code.
722 :
723 : // The Irregexp regexp engine is structured in three steps.
724 : // 1) The parser generates an abstract syntax tree. See ast.cc.
725 : // 2) From the AST a node network is created. The nodes are all
726 : // subclasses of RegExpNode. The nodes represent states when
727 : // executing a regular expression. Several optimizations are
728 : // performed on the node network.
729 : // 3) From the nodes we generate either byte codes or native code
730 : // that can actually execute the regular expression (perform
731 : // the search). The code generation step is described in more
732 : // detail below.
733 :
734 : // Code generation.
735 : //
736 : // The nodes are divided into four main categories.
737 : // * Choice nodes
738 : // These represent places where the regular expression can
739 : // match in more than one way. For example on entry to an
740 : // alternation (foo|bar) or a repetition (*, +, ? or {}).
741 : // * Action nodes
742 : // These represent places where some action should be
743 : // performed. Examples include recording the current position
744 : // in the input string to a register (in order to implement
745 : // captures) or other actions on register for example in order
746 : // to implement the counters needed for {} repetitions.
747 : // * Matching nodes
748 : // These attempt to match some element part of the input string.
749 : // Examples of elements include character classes, plain strings
750 : // or back references.
751 : // * End nodes
752 : // These are used to implement the actions required on finding
753 : // a successful match or failing to find a match.
754 : //
755 : // The code generated (whether as byte codes or native code) maintains
756 : // some state as it runs. This consists of the following elements:
757 : //
758 : // * The capture registers. Used for string captures.
759 : // * Other registers. Used for counters etc.
760 : // * The current position.
761 : // * The stack of backtracking information. Used when a matching node
762 : // fails to find a match and needs to try an alternative.
763 : //
764 : // Conceptual regular expression execution model:
765 : //
766 : // There is a simple conceptual model of regular expression execution
767 : // which will be presented first. The actual code generated is a more
768 : // efficient simulation of the simple conceptual model:
769 : //
770 : // * Choice nodes are implemented as follows:
771 : // For each choice except the last {
772 : // push current position
773 : // push backtrack code location
774 : // <generate code to test for choice>
775 : // backtrack code location:
776 : // pop current position
777 : // }
778 : // <generate code to test for last choice>
779 : //
780 : // * Action nodes are generated as follows:
781 : // <push affected registers on backtrack stack>
782 : // <generate code to perform action>
783 : // push backtrack code location
784 : // <generate code to test for following nodes>
785 : // backtrack code location:
786 : // <pop affected registers to restore their state>
787 : // <pop backtrack location from stack and go to it>
788 : //
789 : // * Matching nodes are generated as follows:
790 : // if input string matches at current position
791 : // update current position
792 : // <generate code to test for following nodes>
793 : // else
794 : // <pop backtrack location from stack and go to it>
795 : //
796 : // Thus it can be seen that the current position is saved and restored
797 : // by the choice nodes, whereas the registers are saved and restored by
798 : // the action nodes that manipulate them.
799 : //
800 : // The other interesting aspect of this model is that nodes are generated
801 : // at the point where they are needed by a recursive call to Emit(). If
802 : // the node has already been code generated then the Emit() call will
803 : // generate a jump to the previously generated code instead. In order to
804 : // limit recursion it is possible for the Emit() function to put the node
805 : // on a work list for later generation and instead generate a jump. The
806 : // destination of the jump is resolved later when the code is generated.
807 : //
808 : // Actual regular expression code generation.
809 : //
810 : // Code generation is actually more complicated than the above. In order
811 : // to improve the efficiency of the generated code some optimizations are
812 : // performed:
813 : //
814 : // * Choice nodes have 1-character lookahead.
815 : // A choice node looks at the following character and eliminates some of
816 : // the choices immediately based on that character. This is not yet
817 : // implemented.
818 : // * Simple greedy loops store reduced backtracking information.
819 : // A quantifier like /.*foo/m will greedily match the whole input. It will
820 : // then need to backtrack to a point where it can match "foo". The naive
821 : // implementation of this would push each character position onto the
822 : // backtracking stack, then pop them off one by one. This would use space
823 : // proportional to the length of the input string. However since the "."
824 : // can only match in one way and always has a constant length (in this case
825 : // of 1) it suffices to store the current position on the top of the stack
826 : // once. Matching now becomes merely incrementing the current position and
827 : // backtracking becomes decrementing the current position and checking the
828 : // result against the stored current position. This is faster and saves
829 : // space.
830 : // * The current state is virtualized.
831 : // This is used to defer expensive operations until it is clear that they
832 : // are needed and to generate code for a node more than once, allowing
833 : // specialized and efficient versions of the code to be created. This is
834 : // explained in the section below.
835 : //
836 : // Execution state virtualization.
837 : //
838 : // Instead of emitting code, nodes that manipulate the state can record their
839 : // manipulation in an object called the Trace. The Trace object can record a
840 : // current position offset, an optional backtrack code location on the top of
841 : // the virtualized backtrack stack and some register changes. When a node is
842 : // to be emitted it can flush the Trace or update it. Flushing the Trace
843 : // will emit code to bring the actual state into line with the virtual state.
844 : // Avoiding flushing the state can postpone some work (e.g. updates of capture
845 : // registers). Postponing work can save time when executing the regular
846 : // expression since it may be found that the work never has to be done as a
847 : // failure to match can occur. In addition it is much faster to jump to a
848 : // known backtrack code location than it is to pop an unknown backtrack
849 : // location from the stack and jump there.
850 : //
851 : // The virtual state found in the Trace affects code generation. For example
852 : // the virtual state contains the difference between the actual current
853 : // position and the virtual current position, and matching code needs to use
854 : // this offset to attempt a match in the correct location of the input
855 : // string. Therefore code generated for a non-trivial trace is specialized
856 : // to that trace. The code generator therefore has the ability to generate
857 : // code for each node several times. In order to limit the size of the
858 : // generated code there is an arbitrary limit on how many specialized sets of
859 : // code may be generated for a given node. If the limit is reached, the
860 : // trace is flushed and a generic version of the code for a node is emitted.
861 : // This is subsequently used for that node. The code emitted for a non-generic
862 : // trace is not recorded in the node and so it cannot currently be reused in
863 : // the event that code generation is requested for an identical trace.
864 :
865 :
866 0 : void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
867 0 : UNREACHABLE();
868 : }
869 :
870 :
871 9319 : void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
872 9319 : text->AddElement(TextElement::Atom(this), zone);
873 9319 : }
874 :
875 :
876 11004 : void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
877 11004 : text->AddElement(TextElement::CharClass(this), zone);
878 11004 : }
879 :
880 :
881 0 : void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
882 0 : for (int i = 0; i < elements()->length(); i++)
883 0 : text->AddElement(elements()->at(i), zone);
884 0 : }
885 :
886 :
887 0 : TextElement TextElement::Atom(RegExpAtom* atom) {
888 0 : return TextElement(ATOM, atom);
889 : }
890 :
891 :
892 0 : TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
893 0 : return TextElement(CHAR_CLASS, char_class);
894 : }
895 :
896 :
897 9570053 : int TextElement::length() const {
898 9570053 : switch (text_type()) {
899 : case ATOM:
900 8627839 : return atom()->length();
901 :
902 : case CHAR_CLASS:
903 : return 1;
904 : }
905 0 : UNREACHABLE();
906 : return 0;
907 : }
908 :
909 :
910 0 : DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
911 0 : if (table_ == NULL) {
912 0 : table_ = new(zone()) DispatchTable(zone());
913 : DispatchTableConstructor cons(table_, ignore_case, zone());
914 0 : cons.BuildTable(this);
915 : }
916 0 : return table_;
917 : }
918 :
919 :
920 : class FrequencyCollator {
921 : public:
922 11948238 : FrequencyCollator() : total_samples_(0) {
923 11855616 : for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
924 11855616 : frequencies_[i] = CharacterFrequency(i);
925 : }
926 : }
927 :
928 : void CountCharacter(int character) {
929 640498 : int index = (character & RegExpMacroAssembler::kTableMask);
930 640498 : frequencies_[index].Increment();
931 640498 : total_samples_++;
932 : }
933 :
934 : // Does not measure in percent, but rather per-128 (the table size from the
935 : // regexp macro assembler).
936 : int Frequency(int in_character) {
937 : DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
938 514944 : if (total_samples_ < 1) return 1; // Division by zero.
939 : int freq_in_per128 =
940 514604 : (frequencies_[in_character].counter() * 128) / total_samples_;
941 : return freq_in_per128;
942 : }
943 :
944 : private:
945 : class CharacterFrequency {
946 : public:
947 11855616 : CharacterFrequency() : counter_(0), character_(-1) { }
948 : explicit CharacterFrequency(int character)
949 : : counter_(0), character_(character) { }
950 :
951 640498 : void Increment() { counter_++; }
952 : int counter() { return counter_; }
953 : int character() { return character_; }
954 :
955 : private:
956 : int counter_;
957 : int character_;
958 : };
959 :
960 :
961 : private:
962 : CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
963 : int total_samples_;
964 : };
965 :
966 :
967 : class RegExpCompiler {
968 : public:
969 : RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
970 : JSRegExp::Flags flags, bool is_one_byte);
971 :
972 : int AllocateRegister() {
973 1513575 : if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
974 517005 : reg_exp_too_big_ = true;
975 : return next_register_;
976 : }
977 996570 : return next_register_++;
978 : }
979 :
980 : // Lookarounds to match lone surrogates for unicode character class matches
981 : // are never nested. We can therefore reuse registers.
982 : int UnicodeLookaroundStackRegister() {
983 3148 : if (unicode_lookaround_stack_register_ == kNoRegister) {
984 1353 : unicode_lookaround_stack_register_ = AllocateRegister();
985 : }
986 3148 : return unicode_lookaround_stack_register_;
987 : }
988 :
989 : int UnicodeLookaroundPositionRegister() {
990 3148 : if (unicode_lookaround_position_register_ == kNoRegister) {
991 1353 : unicode_lookaround_position_register_ = AllocateRegister();
992 : }
993 3148 : return unicode_lookaround_position_register_;
994 : }
995 :
996 : RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
997 : RegExpNode* start,
998 : int capture_count,
999 : Handle<String> pattern);
1000 :
1001 679160 : inline void AddWork(RegExpNode* node) {
1002 679160 : if (!node->on_work_list() && !node->label()->is_bound()) {
1003 : node->set_on_work_list(true);
1004 232894 : work_list_->Add(node);
1005 : }
1006 679160 : }
1007 :
1008 : static const int kImplementationOffset = 0;
1009 : static const int kNumberOfRegistersOffset = 0;
1010 : static const int kCodeOffset = 1;
1011 :
1012 : RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
1013 : EndNode* accept() { return accept_; }
1014 :
1015 : static const int kMaxRecursion = 100;
1016 : inline int recursion_depth() { return recursion_depth_; }
1017 1222149 : inline void IncrementRecursionDepth() { recursion_depth_++; }
1018 1222149 : inline void DecrementRecursionDepth() { recursion_depth_--; }
1019 :
1020 0 : void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1021 :
1022 : inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1023 14753 : inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
1024 : // Both unicode and ignore_case flags are set. We need to use ICU to find
1025 : // the closure over case equivalents.
1026 : inline bool needs_unicode_case_equivalents() {
1027 209589 : return unicode() && ignore_case();
1028 : }
1029 : inline bool one_byte() { return one_byte_; }
1030 : inline bool optimize() { return optimize_; }
1031 91379 : inline void set_optimize(bool value) { optimize_ = value; }
1032 : inline bool limiting_recursion() { return limiting_recursion_; }
1033 : inline void set_limiting_recursion(bool value) {
1034 1097654 : limiting_recursion_ = value;
1035 : }
1036 : bool read_backward() { return read_backward_; }
1037 4024 : void set_read_backward(bool value) { read_backward_ = value; }
1038 : FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1039 :
1040 : int current_expansion_factor() { return current_expansion_factor_; }
1041 : void set_current_expansion_factor(int value) {
1042 120061 : current_expansion_factor_ = value;
1043 : }
1044 :
1045 : Isolate* isolate() const { return isolate_; }
1046 : Zone* zone() const { return zone_; }
1047 :
1048 : static const int kNoRegister = -1;
1049 :
1050 : private:
1051 : EndNode* accept_;
1052 : int next_register_;
1053 : int unicode_lookaround_stack_register_;
1054 : int unicode_lookaround_position_register_;
1055 : List<RegExpNode*>* work_list_;
1056 : int recursion_depth_;
1057 : RegExpMacroAssembler* macro_assembler_;
1058 : JSRegExp::Flags flags_;
1059 : bool one_byte_;
1060 : bool reg_exp_too_big_;
1061 : bool limiting_recursion_;
1062 : bool optimize_;
1063 : bool read_backward_;
1064 : int current_expansion_factor_;
1065 : FrequencyCollator frequency_collator_;
1066 : Isolate* isolate_;
1067 : Zone* zone_;
1068 : };
1069 :
1070 :
1071 : class RecursionCheck {
1072 : public:
1073 : explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1074 : compiler->IncrementRecursionDepth();
1075 : }
1076 : ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1077 : private:
1078 : RegExpCompiler* compiler_;
1079 : };
1080 :
1081 :
1082 : static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1083 : return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1084 : }
1085 :
1086 :
1087 : // Attempts to compile the regexp using an Irregexp code generator. Returns
1088 : // a fixed array or a null handle depending on whether it succeeded.
1089 92622 : RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1090 : JSRegExp::Flags flags, bool one_byte)
1091 92622 : : next_register_(2 * (capture_count + 1)),
1092 : unicode_lookaround_stack_register_(kNoRegister),
1093 : unicode_lookaround_position_register_(kNoRegister),
1094 : work_list_(NULL),
1095 : recursion_depth_(0),
1096 : flags_(flags),
1097 : one_byte_(one_byte),
1098 : reg_exp_too_big_(false),
1099 : limiting_recursion_(false),
1100 : optimize_(FLAG_regexp_optimization),
1101 : read_backward_(false),
1102 : current_expansion_factor_(1),
1103 : frequency_collator_(),
1104 : isolate_(isolate),
1105 185244 : zone_(zone) {
1106 92622 : accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1107 : DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1108 92622 : }
1109 :
1110 :
1111 92181 : RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1112 : RegExpMacroAssembler* macro_assembler,
1113 : RegExpNode* start,
1114 : int capture_count,
1115 : Handle<String> pattern) {
1116 : Isolate* isolate = pattern->GetHeap()->isolate();
1117 :
1118 : #ifdef DEBUG
1119 : if (FLAG_trace_regexp_assembler)
1120 : macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1121 : else
1122 : #endif
1123 92181 : macro_assembler_ = macro_assembler;
1124 :
1125 : List <RegExpNode*> work_list(0);
1126 92181 : work_list_ = &work_list;
1127 : Label fail;
1128 92181 : macro_assembler_->PushBacktrack(&fail);
1129 92181 : Trace new_trace;
1130 92181 : start->Emit(this, &new_trace);
1131 92181 : macro_assembler_->Bind(&fail);
1132 92181 : macro_assembler_->Fail();
1133 417256 : while (!work_list.is_empty()) {
1134 : RegExpNode* node = work_list.RemoveLast();
1135 : node->set_on_work_list(false);
1136 232894 : if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1137 : }
1138 92181 : if (reg_exp_too_big_) {
1139 0 : macro_assembler_->AbortedCodeGeneration();
1140 0 : return IrregexpRegExpTooBig(isolate_);
1141 : }
1142 :
1143 92181 : Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1144 92181 : isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1145 92181 : work_list_ = NULL;
1146 : #ifdef ENABLE_DISASSEMBLER
1147 : if (FLAG_print_code) {
1148 : CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1149 : OFStream os(trace_scope.file());
1150 : Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1151 : }
1152 : #endif
1153 : #ifdef DEBUG
1154 : if (FLAG_trace_regexp_assembler) {
1155 : delete macro_assembler_;
1156 : }
1157 : #endif
1158 92181 : return RegExpEngine::CompilationResult(*code, next_register_);
1159 : }
1160 :
1161 :
1162 7083898 : bool Trace::DeferredAction::Mentions(int that) {
1163 3580420 : if (action_type() == ActionNode::CLEAR_CAPTURES) {
1164 : Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1165 : return range.Contains(that);
1166 : } else {
1167 3503478 : return reg() == that;
1168 : }
1169 : }
1170 :
1171 :
1172 0 : bool Trace::mentions_reg(int reg) {
1173 0 : for (DeferredAction* action = actions_;
1174 : action != NULL;
1175 : action = action->next()) {
1176 0 : if (action->Mentions(reg))
1177 : return true;
1178 : }
1179 : return false;
1180 : }
1181 :
1182 :
1183 1147 : bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1184 : DCHECK_EQ(0, *cp_offset);
1185 2263 : for (DeferredAction* action = actions_;
1186 : action != NULL;
1187 : action = action->next()) {
1188 1116 : if (action->Mentions(reg)) {
1189 494 : if (action->action_type() == ActionNode::STORE_POSITION) {
1190 494 : *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1191 494 : return true;
1192 : } else {
1193 : return false;
1194 : }
1195 : }
1196 : }
1197 : return false;
1198 : }
1199 :
1200 :
1201 595306 : int Trace::FindAffectedRegisters(OutSet* affected_registers,
1202 : Zone* zone) {
1203 : int max_register = RegExpCompiler::kNoRegister;
1204 2196117 : for (DeferredAction* action = actions_;
1205 : action != NULL;
1206 : action = action->next()) {
1207 537517 : if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1208 : Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1209 77690 : for (int i = range.from(); i <= range.to(); i++)
1210 71820 : affected_registers->Set(i, zone);
1211 5870 : if (range.to() > max_register) max_register = range.to();
1212 : } else {
1213 531647 : affected_registers->Set(action->reg(), zone);
1214 531647 : if (action->reg() > max_register) max_register = action->reg();
1215 : }
1216 : }
1217 595306 : return max_register;
1218 : }
1219 :
1220 :
1221 595306 : void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1222 : int max_register,
1223 : const OutSet& registers_to_pop,
1224 : const OutSet& registers_to_clear) {
1225 14804544 : for (int reg = max_register; reg >= 0; reg--) {
1226 14209238 : if (registers_to_pop.Get(reg)) {
1227 86082 : assembler->PopRegister(reg);
1228 14123156 : } else if (registers_to_clear.Get(reg)) {
1229 : int clear_to = reg;
1230 272634 : while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1231 158448 : reg--;
1232 : }
1233 114186 : assembler->ClearRegisters(reg, clear_to);
1234 : }
1235 : }
1236 595306 : }
1237 :
1238 :
1239 595306 : void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1240 : int max_register,
1241 : const OutSet& affected_registers,
1242 : OutSet* registers_to_pop,
1243 : OutSet* registers_to_clear,
1244 : Zone* zone) {
1245 : // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1246 595306 : const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1247 :
1248 : // Count pushes performed to force a stack limit check occasionally.
1249 : int pushes = 0;
1250 :
1251 14962992 : for (int reg = 0; reg <= max_register; reg++) {
1252 14367686 : if (!affected_registers.Get(reg)) {
1253 : continue;
1254 : }
1255 :
1256 : // The chronologically first deferred action in the trace
1257 : // is used to infer the action needed to restore a register
1258 : // to its previous state (or not, if it's safe to ignore it).
1259 : enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1260 : DeferredActionUndoType undo_action = IGNORE;
1261 :
1262 : int value = 0;
1263 : bool absolute = false;
1264 : bool clear = false;
1265 : static const int kNoStore = kMinInt;
1266 : int store_position = kNoStore;
1267 : // This is a little tricky because we are scanning the actions in reverse
1268 : // historical order (newest first).
1269 4169636 : for (DeferredAction* action = actions_;
1270 : action != NULL;
1271 : action = action->next()) {
1272 3579304 : if (action->Mentions(reg)) {
1273 603467 : switch (action->action_type()) {
1274 : case ActionNode::SET_REGISTER: {
1275 6226 : Trace::DeferredSetRegister* psr =
1276 : static_cast<Trace::DeferredSetRegister*>(action);
1277 6226 : if (!absolute) {
1278 6226 : value += psr->value();
1279 : absolute = true;
1280 : }
1281 : // SET_REGISTER is currently only used for newly introduced loop
1282 : // counters. They can have a significant previous value if they
1283 : // occur in a loop. TODO(lrn): Propagate this information, so
1284 : // we can set undo_action to IGNORE if we know there is no value to
1285 : // restore.
1286 : undo_action = RESTORE;
1287 : DCHECK_EQ(store_position, kNoStore);
1288 : DCHECK(!clear);
1289 : break;
1290 : }
1291 : case ActionNode::INCREMENT_REGISTER:
1292 7305 : if (!absolute) {
1293 7305 : value++;
1294 : }
1295 : DCHECK_EQ(store_position, kNoStore);
1296 : DCHECK(!clear);
1297 : undo_action = RESTORE;
1298 : break;
1299 : case ActionNode::STORE_POSITION: {
1300 804616 : Trace::DeferredCapture* pc =
1301 : static_cast<Trace::DeferredCapture*>(action);
1302 518116 : if (!clear && store_position == kNoStore) {
1303 : store_position = pc->cp_offset();
1304 : }
1305 :
1306 : // For captures we know that stores and clears alternate.
1307 : // Other registers are never cleared, and if they occur
1308 : // inside a loop, they might be assigned more than once.
1309 518116 : if (reg <= 1) {
1310 : // Registers zero and one, aka "capture zero", are
1311 : // always set correctly if we succeed. There is no
1312 : // need to undo a setting on backtrack, because we
1313 : // will set it again or fail.
1314 : undo_action = IGNORE;
1315 : } else {
1316 286500 : undo_action = pc->is_capture() ? CLEAR : RESTORE;
1317 : }
1318 : DCHECK(!absolute);
1319 : DCHECK_EQ(value, 0);
1320 : break;
1321 : }
1322 : case ActionNode::CLEAR_CAPTURES: {
1323 : // Since we're scanning in reverse order, if we've already
1324 : // set the position we have to ignore historically earlier
1325 : // clearing operations.
1326 71820 : if (store_position == kNoStore) {
1327 : clear = true;
1328 : }
1329 : undo_action = RESTORE;
1330 : DCHECK(!absolute);
1331 : DCHECK_EQ(value, 0);
1332 : break;
1333 : }
1334 : default:
1335 0 : UNREACHABLE();
1336 : break;
1337 : }
1338 : }
1339 : }
1340 : // Prepare for the undo-action (e.g., push if it's going to be popped).
1341 590332 : if (undo_action == RESTORE) {
1342 86082 : pushes++;
1343 : RegExpMacroAssembler::StackCheckFlag stack_check =
1344 : RegExpMacroAssembler::kNoStackLimitCheck;
1345 86082 : if (pushes == push_limit) {
1346 : stack_check = RegExpMacroAssembler::kCheckStackLimit;
1347 : pushes = 0;
1348 : }
1349 :
1350 86082 : assembler->PushRegister(reg, stack_check);
1351 86082 : registers_to_pop->Set(reg, zone);
1352 504250 : } else if (undo_action == CLEAR) {
1353 272634 : registers_to_clear->Set(reg, zone);
1354 : }
1355 : // Perform the chronologically last action (or accumulated increment)
1356 : // for the register.
1357 590332 : if (store_position != kNoStore) {
1358 518116 : assembler->WriteCurrentPositionToRegister(reg, store_position);
1359 72216 : } else if (clear) {
1360 58685 : assembler->ClearRegisters(reg, reg);
1361 13531 : } else if (absolute) {
1362 6226 : assembler->SetRegister(reg, value);
1363 7305 : } else if (value != 0) {
1364 7305 : assembler->AdvanceRegister(reg, value);
1365 : }
1366 : }
1367 595306 : }
1368 :
1369 :
1370 : // This is called as we come into a loop choice node and some other tricky
1371 : // nodes. It normalizes the state of the code generator to ensure we can
1372 : // generate generic code.
1373 4193998 : void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1374 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1375 :
1376 : DCHECK(!is_trivial());
1377 :
1378 1338330 : if (actions_ == NULL && backtrack() == NULL) {
1379 : // Here we just have some deferred cp advances to fix and we are back to
1380 : // a normal situation. We may also have to forget some information gained
1381 : // through a quick check that was already performed.
1382 209740 : if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1383 : // Create a new trivial state and generate the node with that.
1384 209740 : Trace new_state;
1385 209740 : successor->Emit(compiler, &new_state);
1386 805046 : return;
1387 : }
1388 :
1389 : // Generate deferred actions here along with code to undo them again.
1390 : OutSet affected_registers;
1391 :
1392 595306 : if (backtrack() != NULL) {
1393 : // Here we have a concrete backtrack location. These are set up by choice
1394 : // nodes and so they indicate that we have a deferred save of the current
1395 : // position which we may need to emit here.
1396 474444 : assembler->PushCurrentPosition();
1397 : }
1398 :
1399 : int max_register = FindAffectedRegisters(&affected_registers,
1400 595306 : compiler->zone());
1401 : OutSet registers_to_pop;
1402 : OutSet registers_to_clear;
1403 : PerformDeferredActions(assembler,
1404 : max_register,
1405 : affected_registers,
1406 : ®isters_to_pop,
1407 : ®isters_to_clear,
1408 595306 : compiler->zone());
1409 595306 : if (cp_offset_ != 0) {
1410 352299 : assembler->AdvanceCurrentPosition(cp_offset_);
1411 : }
1412 :
1413 : // Create a new trivial state and generate the node with that.
1414 : Label undo;
1415 595306 : assembler->PushBacktrack(&undo);
1416 595306 : if (successor->KeepRecursing(compiler)) {
1417 165861 : Trace new_state;
1418 165861 : successor->Emit(compiler, &new_state);
1419 : } else {
1420 429445 : compiler->AddWork(successor);
1421 429445 : assembler->GoTo(successor->label());
1422 : }
1423 :
1424 : // On backtrack we need to restore state.
1425 595306 : assembler->Bind(&undo);
1426 : RestoreAffectedRegisters(assembler,
1427 : max_register,
1428 : registers_to_pop,
1429 595306 : registers_to_clear);
1430 595306 : if (backtrack() == NULL) {
1431 120862 : assembler->Backtrack();
1432 : } else {
1433 474444 : assembler->PopCurrentPosition();
1434 948888 : assembler->GoTo(backtrack());
1435 : }
1436 : }
1437 :
1438 :
1439 3550 : void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1440 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1441 :
1442 : // Omit flushing the trace. We discard the entire stack frame anyway.
1443 :
1444 3550 : if (!label()->is_bound()) {
1445 : // We are completely independent of the trace, since we ignore it,
1446 : // so this code can be used as the generic version.
1447 3501 : assembler->Bind(label());
1448 : }
1449 :
1450 : // Throw away everything on the backtrack stack since the start
1451 : // of the negative submatch and restore the character position.
1452 3550 : assembler->ReadCurrentPositionFromRegister(current_position_register_);
1453 3550 : assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1454 3550 : if (clear_capture_count_ > 0) {
1455 : // Clear any captures that might have been performed during the success
1456 : // of the body of the negative look-ahead.
1457 136 : int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1458 136 : assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1459 : }
1460 : // Now that we have unwound the stack we find at the top of the stack the
1461 : // backtrack that the BeginSubmatch node got.
1462 3550 : assembler->Backtrack();
1463 3550 : }
1464 :
1465 :
1466 310962 : void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1467 206952 : if (!trace->is_trivial()) {
1468 103298 : trace->Flush(compiler, this);
1469 103298 : return;
1470 : }
1471 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1472 103654 : if (!label()->is_bound()) {
1473 92168 : assembler->Bind(label());
1474 : }
1475 103654 : switch (action_) {
1476 : case ACCEPT:
1477 103298 : assembler->Succeed();
1478 103298 : return;
1479 : case BACKTRACK:
1480 712 : assembler->GoTo(trace->backtrack());
1481 356 : return;
1482 : case NEGATIVE_SUBMATCH_SUCCESS:
1483 : // This case is handled in a different virtual method.
1484 0 : UNREACHABLE();
1485 : }
1486 0 : UNIMPLEMENTED();
1487 : }
1488 :
1489 :
1490 1507750 : void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1491 1507750 : if (guards_ == NULL)
1492 1507750 : guards_ = new(zone) ZoneList<Guard*>(1, zone);
1493 1507750 : guards_->Add(guard, zone);
1494 1507750 : }
1495 :
1496 :
1497 1506002 : ActionNode* ActionNode::SetRegister(int reg,
1498 : int val,
1499 1506002 : RegExpNode* on_success) {
1500 : ActionNode* result =
1501 : new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1502 1506002 : result->data_.u_store_register.reg = reg;
1503 1506002 : result->data_.u_store_register.value = val;
1504 1506002 : return result;
1505 : }
1506 :
1507 :
1508 1506002 : ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1509 : ActionNode* result =
1510 : new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1511 1506002 : result->data_.u_increment_register.reg = reg;
1512 1506002 : return result;
1513 : }
1514 :
1515 :
1516 271147 : ActionNode* ActionNode::StorePosition(int reg,
1517 : bool is_capture,
1518 271147 : RegExpNode* on_success) {
1519 : ActionNode* result =
1520 : new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1521 271147 : result->data_.u_position_register.reg = reg;
1522 271147 : result->data_.u_position_register.is_capture = is_capture;
1523 271147 : return result;
1524 : }
1525 :
1526 :
1527 4107 : ActionNode* ActionNode::ClearCaptures(Interval range,
1528 4107 : RegExpNode* on_success) {
1529 : ActionNode* result =
1530 : new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1531 4107 : result->data_.u_clear_captures.range_from = range.from();
1532 4107 : result->data_.u_clear_captures.range_to = range.to();
1533 4107 : return result;
1534 : }
1535 :
1536 :
1537 5352 : ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1538 : int position_reg,
1539 5352 : RegExpNode* on_success) {
1540 : ActionNode* result =
1541 : new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1542 5352 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1543 5352 : result->data_.u_submatch.current_position_register = position_reg;
1544 5352 : return result;
1545 : }
1546 :
1547 :
1548 1837 : ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1549 : int position_reg,
1550 : int clear_register_count,
1551 : int clear_register_from,
1552 1837 : RegExpNode* on_success) {
1553 : ActionNode* result =
1554 : new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1555 1837 : result->data_.u_submatch.stack_pointer_register = stack_reg;
1556 1837 : result->data_.u_submatch.current_position_register = position_reg;
1557 1837 : result->data_.u_submatch.clear_register_count = clear_register_count;
1558 1837 : result->data_.u_submatch.clear_register_from = clear_register_from;
1559 1837 : return result;
1560 : }
1561 :
1562 :
1563 675 : ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1564 : int repetition_register,
1565 : int repetition_limit,
1566 675 : RegExpNode* on_success) {
1567 : ActionNode* result =
1568 : new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1569 675 : result->data_.u_empty_match_check.start_register = start_register;
1570 675 : result->data_.u_empty_match_check.repetition_register = repetition_register;
1571 675 : result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1572 675 : return result;
1573 : }
1574 :
1575 :
1576 : #define DEFINE_ACCEPT(Type) \
1577 : void Type##Node::Accept(NodeVisitor* visitor) { \
1578 : visitor->Visit##Type(this); \
1579 : }
1580 900514 : FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1581 : #undef DEFINE_ACCEPT
1582 :
1583 :
1584 216296 : void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1585 216296 : visitor->VisitLoopChoice(this);
1586 216296 : }
1587 :
1588 :
1589 : // -------------------------------------------------------------------
1590 : // Emit code.
1591 :
1592 :
1593 7750 : void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1594 15500 : Guard* guard,
1595 7750 : Trace* trace) {
1596 7750 : switch (guard->op()) {
1597 : case Guard::LT:
1598 : DCHECK(!trace->mentions_reg(guard->reg()));
1599 : macro_assembler->IfRegisterGE(guard->reg(),
1600 : guard->value(),
1601 9166 : trace->backtrack());
1602 4583 : break;
1603 : case Guard::GEQ:
1604 : DCHECK(!trace->mentions_reg(guard->reg()));
1605 : macro_assembler->IfRegisterLT(guard->reg(),
1606 : guard->value(),
1607 6334 : trace->backtrack());
1608 3167 : break;
1609 : }
1610 7750 : }
1611 :
1612 :
1613 : // Returns the number of characters in the equivalence class, omitting those
1614 : // that cannot occur in the source string because it is Latin1.
1615 60974 : static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1616 : bool one_byte_subject,
1617 : unibrow::uchar* letters) {
1618 : int length =
1619 60974 : isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1620 : // Unibrow returns 0 or 1 for characters where case independence is
1621 : // trivial.
1622 60974 : if (length == 0) {
1623 7751 : letters[0] = character;
1624 : length = 1;
1625 : }
1626 :
1627 60974 : if (one_byte_subject) {
1628 : int new_length = 0;
1629 103107 : for (int i = 0; i < length; i++) {
1630 103107 : if (letters[i] <= String::kMaxOneByteCharCode) {
1631 102727 : letters[new_length++] = letters[i];
1632 : }
1633 : }
1634 : length = new_length;
1635 : }
1636 :
1637 60974 : return length;
1638 : }
1639 :
1640 :
1641 620149 : static inline bool EmitSimpleCharacter(Isolate* isolate,
1642 620149 : RegExpCompiler* compiler,
1643 : uc16 c,
1644 : Label* on_failure,
1645 : int cp_offset,
1646 : bool check,
1647 : bool preloaded) {
1648 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
1649 : bool bound_checked = false;
1650 620149 : if (!preloaded) {
1651 : assembler->LoadCurrentCharacter(
1652 : cp_offset,
1653 : on_failure,
1654 620149 : check);
1655 : bound_checked = true;
1656 : }
1657 620149 : assembler->CheckNotCharacter(c, on_failure);
1658 620149 : return bound_checked;
1659 : }
1660 :
1661 :
1662 : // Only emits non-letters (things that don't have case). Only used for case
1663 : // independent matches.
1664 14932 : static inline bool EmitAtomNonLetter(Isolate* isolate,
1665 14932 : RegExpCompiler* compiler,
1666 : uc16 c,
1667 : Label* on_failure,
1668 : int cp_offset,
1669 : bool check,
1670 : bool preloaded) {
1671 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1672 : bool one_byte = compiler->one_byte();
1673 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1674 14932 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1675 14932 : if (length < 1) {
1676 : // This can't match. Must be an one-byte subject and a non-one-byte
1677 : // character. We do not need to do anything since the one-byte pass
1678 : // already handled this.
1679 : return false; // Bounds not checked.
1680 : }
1681 : bool checked = false;
1682 : // We handle the length > 1 case in a later pass.
1683 14925 : if (length == 1) {
1684 1543 : if (one_byte && c > String::kMaxOneByteCharCodeU) {
1685 : // Can't match - see above.
1686 : return false; // Bounds not checked.
1687 : }
1688 1543 : if (!preloaded) {
1689 1543 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1690 : checked = check;
1691 : }
1692 1543 : macro_assembler->CheckNotCharacter(c, on_failure);
1693 : }
1694 14925 : return checked;
1695 : }
1696 :
1697 :
1698 12992 : static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1699 : bool one_byte, uc16 c1, uc16 c2,
1700 : Label* on_failure) {
1701 : uc16 char_mask;
1702 12992 : if (one_byte) {
1703 : char_mask = String::kMaxOneByteCharCode;
1704 : } else {
1705 : char_mask = String::kMaxUtf16CodeUnit;
1706 : }
1707 12992 : uc16 exor = c1 ^ c2;
1708 : // Check whether exor has only one bit set.
1709 12992 : if (((exor - 1) & exor) == 0) {
1710 : // If c1 and c2 differ only by one bit.
1711 : // Ecma262UnCanonicalize always gives the highest number last.
1712 : DCHECK(c2 > c1);
1713 12874 : uc16 mask = char_mask ^ exor;
1714 12874 : macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1715 12874 : return true;
1716 : }
1717 : DCHECK(c2 > c1);
1718 118 : uc16 diff = c2 - c1;
1719 118 : if (((diff - 1) & diff) == 0 && c1 >= diff) {
1720 : // If the characters differ by 2^n but don't differ by one bit then
1721 : // subtract the difference from the found character, then do the or
1722 : // trick. We avoid the theoretical case where negative numbers are
1723 : // involved in order to simplify code generation.
1724 104 : uc16 mask = char_mask ^ diff;
1725 : macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1726 : diff,
1727 : mask,
1728 104 : on_failure);
1729 104 : return true;
1730 : }
1731 : return false;
1732 : }
1733 :
1734 :
1735 : typedef bool EmitCharacterFunction(Isolate* isolate,
1736 : RegExpCompiler* compiler,
1737 : uc16 c,
1738 : Label* on_failure,
1739 : int cp_offset,
1740 : bool check,
1741 : bool preloaded);
1742 :
1743 : // Only emits letters (things that have case). Only used for case independent
1744 : // matches.
1745 14932 : static inline bool EmitAtomLetter(Isolate* isolate,
1746 14932 : RegExpCompiler* compiler,
1747 : uc16 c,
1748 : Label* on_failure,
1749 : int cp_offset,
1750 : bool check,
1751 : bool preloaded) {
1752 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1753 : bool one_byte = compiler->one_byte();
1754 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1755 14932 : int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1756 14932 : if (length <= 1) return false;
1757 : // We may not need to check against the end of the input string
1758 : // if this character lies before a character that matched.
1759 13382 : if (!preloaded) {
1760 12989 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1761 : }
1762 : Label ok;
1763 : DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1764 13382 : switch (length) {
1765 : case 2: {
1766 25984 : if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1767 25984 : chars[1], on_failure)) {
1768 : } else {
1769 14 : macro_assembler->CheckCharacter(chars[0], &ok);
1770 14 : macro_assembler->CheckNotCharacter(chars[1], on_failure);
1771 14 : macro_assembler->Bind(&ok);
1772 : }
1773 : break;
1774 : }
1775 : case 4:
1776 30 : macro_assembler->CheckCharacter(chars[3], &ok);
1777 : // Fall through!
1778 : case 3:
1779 390 : macro_assembler->CheckCharacter(chars[0], &ok);
1780 390 : macro_assembler->CheckCharacter(chars[1], &ok);
1781 390 : macro_assembler->CheckNotCharacter(chars[2], on_failure);
1782 390 : macro_assembler->Bind(&ok);
1783 390 : break;
1784 : default:
1785 0 : UNREACHABLE();
1786 : break;
1787 : }
1788 : return true;
1789 : }
1790 :
1791 :
1792 9788 : static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1793 : int border,
1794 : Label* fall_through,
1795 : Label* above_or_equal,
1796 : Label* below) {
1797 9788 : if (below != fall_through) {
1798 9377 : masm->CheckCharacterLT(border, below);
1799 9377 : if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1800 : } else {
1801 411 : masm->CheckCharacterGT(border - 1, above_or_equal);
1802 : }
1803 9788 : }
1804 :
1805 :
1806 178894 : static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1807 : int first,
1808 : int last,
1809 : Label* fall_through,
1810 : Label* in_range,
1811 : Label* out_of_range) {
1812 178894 : if (in_range == fall_through) {
1813 123270 : if (first == last) {
1814 15327 : masm->CheckNotCharacter(first, out_of_range);
1815 : } else {
1816 107943 : masm->CheckCharacterNotInRange(first, last, out_of_range);
1817 : }
1818 : } else {
1819 55624 : if (first == last) {
1820 28254 : masm->CheckCharacter(first, in_range);
1821 : } else {
1822 27370 : masm->CheckCharacterInRange(first, last, in_range);
1823 : }
1824 55624 : if (out_of_range != fall_through) masm->GoTo(out_of_range);
1825 : }
1826 178894 : }
1827 :
1828 :
1829 : // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1830 : // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
1831 6432 : static void EmitUseLookupTable(
1832 6432 : RegExpMacroAssembler* masm,
1833 : ZoneList<int>* ranges,
1834 : int start_index,
1835 : int end_index,
1836 : int min_char,
1837 : Label* fall_through,
1838 : Label* even_label,
1839 : Label* odd_label) {
1840 : static const int kSize = RegExpMacroAssembler::kTableSize;
1841 : static const int kMask = RegExpMacroAssembler::kTableMask;
1842 :
1843 : int base = (min_char & ~kMask);
1844 : USE(base);
1845 :
1846 : // Assert that everything is on one kTableSize page.
1847 : for (int i = start_index; i <= end_index; i++) {
1848 : DCHECK_EQ(ranges->at(i) & ~kMask, base);
1849 : }
1850 : DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1851 :
1852 : char templ[kSize];
1853 : Label* on_bit_set;
1854 : Label* on_bit_clear;
1855 : int bit;
1856 6432 : if (even_label == fall_through) {
1857 : on_bit_set = odd_label;
1858 : on_bit_clear = even_label;
1859 : bit = 1;
1860 : } else {
1861 : on_bit_set = even_label;
1862 : on_bit_clear = odd_label;
1863 : bit = 0;
1864 : }
1865 262224 : for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1866 127896 : templ[i] = bit;
1867 : }
1868 : int j = 0;
1869 6432 : bit ^= 1;
1870 108651 : for (int i = start_index; i < end_index; i++) {
1871 1338774 : for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1872 567168 : templ[j] = bit;
1873 : }
1874 102219 : bit ^= 1;
1875 : }
1876 128232 : for (int i = j; i < kSize; i++) {
1877 128232 : templ[i] = bit;
1878 : }
1879 : Factory* factory = masm->isolate()->factory();
1880 : // TODO(erikcorry): Cache these.
1881 6432 : Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1882 823296 : for (int i = 0; i < kSize; i++) {
1883 823296 : ba->set(i, templ[i]);
1884 : }
1885 6432 : masm->CheckBitInTable(ba, on_bit_set);
1886 6432 : if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1887 6432 : }
1888 :
1889 :
1890 39249 : static void CutOutRange(RegExpMacroAssembler* masm,
1891 : ZoneList<int>* ranges,
1892 : int start_index,
1893 : int end_index,
1894 : int cut_index,
1895 : Label* even_label,
1896 : Label* odd_label) {
1897 39249 : bool odd = (((cut_index - start_index) & 1) == 1);
1898 39249 : Label* in_range_label = odd ? odd_label : even_label;
1899 : Label dummy;
1900 : EmitDoubleBoundaryTest(masm,
1901 : ranges->at(cut_index),
1902 39249 : ranges->at(cut_index + 1) - 1,
1903 : &dummy,
1904 : in_range_label,
1905 78498 : &dummy);
1906 : DCHECK(!dummy.is_linked());
1907 : // Cut out the single range by rewriting the array. This creates a new
1908 : // range that is a merger of the two ranges on either side of the one we
1909 : // are cutting out. The oddity of the labels is preserved.
1910 99371 : for (int j = cut_index; j > start_index; j--) {
1911 41746 : ranges->at(j) = ranges->at(j - 1);
1912 : }
1913 117857 : for (int j = cut_index + 1; j < end_index; j++) {
1914 157216 : ranges->at(j) = ranges->at(j + 1);
1915 : }
1916 39249 : }
1917 :
1918 :
1919 : // Unicode case. Split the search space into kSize spaces that are handled
1920 : // with recursion.
1921 21679 : static void SplitSearchSpace(ZoneList<int>* ranges,
1922 : int start_index,
1923 : int end_index,
1924 : int* new_start_index,
1925 : int* new_end_index,
1926 : int* border) {
1927 : static const int kSize = RegExpMacroAssembler::kTableSize;
1928 : static const int kMask = RegExpMacroAssembler::kTableMask;
1929 :
1930 21679 : int first = ranges->at(start_index);
1931 21679 : int last = ranges->at(end_index) - 1;
1932 :
1933 21679 : *new_start_index = start_index;
1934 21679 : *border = (ranges->at(start_index) & ~kMask) + kSize;
1935 190400 : while (*new_start_index < end_index) {
1936 167441 : if (ranges->at(*new_start_index) > *border) break;
1937 147042 : (*new_start_index)++;
1938 : }
1939 : // new_start_index is the index of the first edge that is beyond the
1940 : // current kSize space.
1941 :
1942 : // For very large search spaces we do a binary chop search of the non-Latin1
1943 : // space instead of just going to the end of the current kSize space. The
1944 : // heuristics are complicated a little by the fact that any 128-character
1945 : // encoding space can be quickly tested with a table lookup, so we don't
1946 : // wish to do binary chop search at a smaller granularity than that. A
1947 : // 128-character space can take up a lot of space in the ranges array if,
1948 : // for example, we only want to match every second character (eg. the lower
1949 : // case characters on some Unicode pages).
1950 21679 : int binary_chop_index = (end_index + start_index) / 2;
1951 : // The first test ensures that we get to the code that handles the Latin1
1952 : // range with a single not-taken branch, speeding up this important
1953 : // character range (even non-Latin1 charset-based text has spaces and
1954 : // punctuation).
1955 60175 : if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1956 31024 : end_index - start_index > (*new_start_index - start_index) * 2 &&
1957 61872 : last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1958 25728 : ranges->at(binary_chop_index) >= first + 2 * kSize) {
1959 : int scan_forward_for_section_border = binary_chop_index;;
1960 10614 : int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1961 :
1962 82386 : while (scan_forward_for_section_border < end_index) {
1963 69843 : if (ranges->at(scan_forward_for_section_border) > new_border) {
1964 8685 : *new_start_index = scan_forward_for_section_border;
1965 8685 : *border = new_border;
1966 8685 : break;
1967 : }
1968 61158 : scan_forward_for_section_border++;
1969 : }
1970 : }
1971 :
1972 : DCHECK(*new_start_index > start_index);
1973 21679 : *new_end_index = *new_start_index - 1;
1974 21679 : if (ranges->at(*new_end_index) == *border) {
1975 3467 : (*new_end_index)--;
1976 : }
1977 43358 : if (*border >= ranges->at(end_index)) {
1978 1278 : *border = ranges->at(end_index);
1979 1278 : *new_start_index = end_index; // Won't be used.
1980 1278 : *new_end_index = end_index - 1;
1981 : }
1982 21679 : }
1983 :
1984 :
1985 : // Gets a series of segment boundaries representing a character class. If the
1986 : // character is in the range between an even and an odd boundary (counting from
1987 : // start_index) then go to even_label, otherwise go to odd_label. We already
1988 : // know that the character is in the range of min_char to max_char inclusive.
1989 : // Either label can be NULL indicating backtracking. Either label can also be
1990 : // equal to the fall_through label.
1991 225968 : static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1992 : int start_index, int end_index, uc32 min_char,
1993 : uc32 max_char, Label* fall_through,
1994 : Label* even_label, Label* odd_label) {
1995 : DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1996 : DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1997 :
1998 225968 : int first = ranges->at(start_index);
1999 225968 : int last = ranges->at(end_index) - 1;
2000 :
2001 : DCHECK_LT(min_char, first);
2002 :
2003 : // Just need to test if the character is before or on-or-after
2004 : // a particular character.
2005 225968 : if (start_index == end_index) {
2006 9788 : EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
2007 9788 : return;
2008 : }
2009 :
2010 : // Another almost trivial case: There is one interval in the middle that is
2011 : // different from the end intervals.
2012 216180 : if (start_index + 1 == end_index) {
2013 : EmitDoubleBoundaryTest(
2014 139645 : masm, first, last, fall_through, even_label, odd_label);
2015 139645 : return;
2016 : }
2017 :
2018 : // It's not worth using table lookup if there are very few intervals in the
2019 : // character class.
2020 76535 : if (end_index - start_index <= 6) {
2021 : // It is faster to test for individual characters, so we look for those
2022 : // first, then try arbitrary ranges in the second round.
2023 : static int kNoCutIndex = -1;
2024 39249 : int cut = kNoCutIndex;
2025 161546 : for (int i = start_index; i < end_index; i++) {
2026 204512 : if (ranges->at(i) == ranges->at(i + 1) - 1) {
2027 : cut = i;
2028 : break;
2029 : }
2030 : }
2031 39249 : if (cut == kNoCutIndex) cut = start_index;
2032 : CutOutRange(
2033 39249 : masm, ranges, start_index, end_index, cut, even_label, odd_label);
2034 : DCHECK_GE(end_index - start_index, 2);
2035 : GenerateBranches(masm,
2036 : ranges,
2037 : start_index + 1,
2038 : end_index - 1,
2039 : min_char,
2040 : max_char,
2041 : fall_through,
2042 : even_label,
2043 39249 : odd_label);
2044 39249 : return;
2045 : }
2046 :
2047 : // If there are a lot of intervals in the regexp, then we will use tables to
2048 : // determine whether the character is inside or outside the character class.
2049 : static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2050 :
2051 37286 : if ((max_char >> kBits) == (min_char >> kBits)) {
2052 : EmitUseLookupTable(masm,
2053 : ranges,
2054 : start_index,
2055 : end_index,
2056 : min_char,
2057 : fall_through,
2058 : even_label,
2059 6432 : odd_label);
2060 6432 : return;
2061 : }
2062 :
2063 30854 : if ((min_char >> kBits) != (first >> kBits)) {
2064 9175 : masm->CheckCharacterLT(first, odd_label);
2065 : GenerateBranches(masm,
2066 : ranges,
2067 : start_index + 1,
2068 : end_index,
2069 : first,
2070 : max_char,
2071 : fall_through,
2072 : odd_label,
2073 9175 : even_label);
2074 9175 : return;
2075 : }
2076 :
2077 21679 : int new_start_index = 0;
2078 21679 : int new_end_index = 0;
2079 21679 : int border = 0;
2080 :
2081 : SplitSearchSpace(ranges,
2082 : start_index,
2083 : end_index,
2084 : &new_start_index,
2085 : &new_end_index,
2086 21679 : &border);
2087 :
2088 : Label handle_rest;
2089 : Label* above = &handle_rest;
2090 21679 : if (border == last + 1) {
2091 : // We didn't find any section that started after the limit, so everything
2092 : // above the border is one of the terminal labels.
2093 1278 : above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2094 : DCHECK(new_end_index == end_index - 1);
2095 : }
2096 :
2097 : DCHECK_LE(start_index, new_end_index);
2098 : DCHECK_LE(new_start_index, end_index);
2099 : DCHECK_LT(start_index, new_start_index);
2100 : DCHECK_LT(new_end_index, end_index);
2101 : DCHECK(new_end_index + 1 == new_start_index ||
2102 : (new_end_index + 2 == new_start_index &&
2103 : border == ranges->at(new_end_index + 1)));
2104 : DCHECK_LT(min_char, border - 1);
2105 : DCHECK_LT(border, max_char);
2106 : DCHECK_LT(ranges->at(new_end_index), border);
2107 : DCHECK(border < ranges->at(new_start_index) ||
2108 : (border == ranges->at(new_start_index) &&
2109 : new_start_index == end_index &&
2110 : new_end_index == end_index - 1 &&
2111 : border == last + 1));
2112 : DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2113 :
2114 21679 : masm->CheckCharacterGT(border - 1, above);
2115 : Label dummy;
2116 : GenerateBranches(masm,
2117 : ranges,
2118 : start_index,
2119 : new_end_index,
2120 : min_char,
2121 : border - 1,
2122 : &dummy,
2123 : even_label,
2124 21679 : odd_label);
2125 21679 : if (handle_rest.is_linked()) {
2126 20401 : masm->Bind(&handle_rest);
2127 20401 : bool flip = (new_start_index & 1) != (start_index & 1);
2128 : GenerateBranches(masm,
2129 : ranges,
2130 : new_start_index,
2131 : end_index,
2132 : border,
2133 : max_char,
2134 : &dummy,
2135 : flip ? odd_label : even_label,
2136 20401 : flip ? even_label : odd_label);
2137 : }
2138 : }
2139 :
2140 :
2141 241451 : static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2142 : RegExpCharacterClass* cc, bool one_byte,
2143 : Label* on_failure, int cp_offset, bool check_offset,
2144 : bool preloaded, Zone* zone) {
2145 : ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2146 241451 : CharacterRange::Canonicalize(ranges);
2147 :
2148 : int max_char;
2149 241451 : if (one_byte) {
2150 : max_char = String::kMaxOneByteCharCode;
2151 : } else {
2152 : max_char = String::kMaxUtf16CodeUnit;
2153 : }
2154 :
2155 241451 : int range_count = ranges->length();
2156 :
2157 241451 : int last_valid_range = range_count - 1;
2158 700980 : while (last_valid_range >= 0) {
2159 459484 : CharacterRange& range = ranges->at(last_valid_range);
2160 459484 : if (range.from() <= max_char) {
2161 : break;
2162 : }
2163 218078 : last_valid_range--;
2164 : }
2165 :
2166 241451 : if (last_valid_range < 0) {
2167 45 : if (!cc->is_negated()) {
2168 14 : macro_assembler->GoTo(on_failure);
2169 : }
2170 45 : if (check_offset) {
2171 45 : macro_assembler->CheckPosition(cp_offset, on_failure);
2172 : }
2173 105987 : return;
2174 : }
2175 :
2176 449397 : if (last_valid_range == 0 &&
2177 207991 : ranges->at(0).IsEverything(max_char)) {
2178 87799 : if (cc->is_negated()) {
2179 21 : macro_assembler->GoTo(on_failure);
2180 : } else {
2181 : // This is a common case hit by non-anchored expressions.
2182 87778 : if (check_offset) {
2183 56209 : macro_assembler->CheckPosition(cp_offset, on_failure);
2184 : }
2185 : }
2186 : return;
2187 : }
2188 :
2189 153607 : if (!preloaded) {
2190 140426 : macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2191 : }
2192 :
2193 172129 : if (cc->is_standard(zone) &&
2194 : macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2195 37044 : on_failure)) {
2196 : return;
2197 : }
2198 :
2199 :
2200 : // A new list with ascending entries. Each entry is a code unit
2201 : // where there is a boundary between code units that are part of
2202 : // the class and code units that are not. Normally we insert an
2203 : // entry at zero which goes to the failure label, but if there
2204 : // was already one there we fall through for success on that entry.
2205 : // Subsequent entries have alternating meaning (success/failure).
2206 : ZoneList<int>* range_boundaries =
2207 135464 : new(zone) ZoneList<int>(last_valid_range, zone);
2208 :
2209 135464 : bool zeroth_entry_is_failure = !cc->is_negated();
2210 :
2211 383249 : for (int i = 0; i <= last_valid_range; i++) {
2212 495570 : CharacterRange& range = ranges->at(i);
2213 247785 : if (range.from() == 0) {
2214 : DCHECK_EQ(i, 0);
2215 2398 : zeroth_entry_is_failure = !zeroth_entry_is_failure;
2216 : } else {
2217 490774 : range_boundaries->Add(range.from(), zone);
2218 : }
2219 495570 : range_boundaries->Add(range.to() + 1, zone);
2220 : }
2221 135464 : int end_index = range_boundaries->length() - 1;
2222 135464 : if (range_boundaries->at(end_index) > max_char) {
2223 3025 : end_index--;
2224 : }
2225 :
2226 : Label fall_through;
2227 : GenerateBranches(macro_assembler,
2228 : range_boundaries,
2229 : 0, // start_index.
2230 : end_index,
2231 : 0, // min_char.
2232 : max_char,
2233 : &fall_through,
2234 : zeroth_entry_is_failure ? &fall_through : on_failure,
2235 135464 : zeroth_entry_is_failure ? on_failure : &fall_through);
2236 135464 : macro_assembler->Bind(&fall_through);
2237 : }
2238 :
2239 :
2240 0 : RegExpNode::~RegExpNode() {
2241 0 : }
2242 :
2243 :
2244 5341068 : RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2245 2558996 : Trace* trace) {
2246 : // If we are generating a greedy loop then don't stop and don't reuse code.
2247 2022827 : if (trace->stop_node() != NULL) {
2248 : return CONTINUE;
2249 : }
2250 :
2251 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2252 2006775 : if (trace->is_trivial()) {
2253 1724621 : if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2254 : // If a generic version is already scheduled to be generated or we have
2255 : // recursed too deeply then just generate a jump to that code.
2256 249715 : macro_assembler->GoTo(&label_);
2257 : // This will queue it up for generation of a generic version if it hasn't
2258 : // already been queued.
2259 249715 : compiler->AddWork(this);
2260 249715 : return DONE;
2261 : }
2262 : // Generate generic version of the node and bind the label for later use.
2263 438904 : macro_assembler->Bind(&label_);
2264 438904 : return CONTINUE;
2265 : }
2266 :
2267 : // We are being asked to make a non-generic version. Keep track of how many
2268 : // non-generic versions we generate so as not to overdo it.
2269 1318156 : trace_count_++;
2270 2629622 : if (KeepRecursing(compiler) && compiler->optimize() &&
2271 : trace_count_ < kMaxCopiesCodeGenerated) {
2272 : return CONTINUE;
2273 : }
2274 :
2275 : // If we get here code has been generated for this node too many times or
2276 : // recursion is too deep. Time to switch to a generic version. The code for
2277 : // generic versions above can handle deep recursion properly.
2278 : bool was_limiting = compiler->limiting_recursion();
2279 : compiler->set_limiting_recursion(true);
2280 548827 : trace->Flush(compiler, this);
2281 : compiler->set_limiting_recursion(was_limiting);
2282 548827 : return DONE;
2283 : }
2284 :
2285 :
2286 4336278 : bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2287 4336278 : return !compiler->limiting_recursion() &&
2288 0 : compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2289 : }
2290 :
2291 :
2292 779046 : int ActionNode::EatsAtLeast(int still_to_find,
2293 : int budget,
2294 : bool not_at_start) {
2295 779046 : if (budget <= 0) return 0;
2296 760101 : if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2297 754400 : return on_success()->EatsAtLeast(still_to_find,
2298 : budget - 1,
2299 754400 : not_at_start);
2300 : }
2301 :
2302 :
2303 101768 : void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2304 : BoyerMooreLookahead* bm, bool not_at_start) {
2305 101768 : if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2306 101768 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2307 : }
2308 : SaveBMInfo(bm, not_at_start, offset);
2309 101768 : }
2310 :
2311 :
2312 18518 : int AssertionNode::EatsAtLeast(int still_to_find,
2313 : int budget,
2314 17645 : bool not_at_start) {
2315 18518 : if (budget <= 0) return 0;
2316 : // If we know we are not at the start and we are asked "how many characters
2317 : // will you match if you succeed?" then we can answer anything since false
2318 : // implies false. So lets just return the max answer (still_to_find) since
2319 : // that won't prevent us from preloading a lot of characters for the other
2320 : // branches in the node graph.
2321 17645 : if (assertion_type() == AT_START && not_at_start) return still_to_find;
2322 17363 : return on_success()->EatsAtLeast(still_to_find,
2323 : budget - 1,
2324 17363 : not_at_start);
2325 : }
2326 :
2327 :
2328 1134 : void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2329 1134 : BoyerMooreLookahead* bm, bool not_at_start) {
2330 : // Match the behaviour of EatsAtLeast on this node.
2331 2268 : if (assertion_type() == AT_START && not_at_start) return;
2332 1114 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2333 : SaveBMInfo(bm, not_at_start, offset);
2334 : }
2335 :
2336 :
2337 3988 : int BackReferenceNode::EatsAtLeast(int still_to_find,
2338 : int budget,
2339 3988 : bool not_at_start) {
2340 3988 : if (read_backward()) return 0;
2341 3856 : if (budget <= 0) return 0;
2342 3856 : return on_success()->EatsAtLeast(still_to_find,
2343 : budget - 1,
2344 3856 : not_at_start);
2345 : }
2346 :
2347 :
2348 7741859 : int TextNode::EatsAtLeast(int still_to_find,
2349 : int budget,
2350 7741859 : bool not_at_start) {
2351 7741859 : if (read_backward()) return 0;
2352 7740050 : int answer = Length();
2353 7740050 : if (answer >= still_to_find) return answer;
2354 4665853 : if (budget <= 0) return answer;
2355 : // We are not at start after this node so we set the last argument to 'true'.
2356 3257497 : return answer + on_success()->EatsAtLeast(still_to_find - answer,
2357 : budget - 1,
2358 3257497 : true);
2359 : }
2360 :
2361 :
2362 11380 : int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2363 : bool not_at_start) {
2364 11380 : if (budget <= 0) return 0;
2365 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2366 : // afterwards.
2367 22522 : RegExpNode* node = alternatives_->at(1).node();
2368 11261 : return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2369 : }
2370 :
2371 :
2372 4508 : void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2373 : QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2374 : bool not_at_start) {
2375 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2376 : // afterwards.
2377 9016 : RegExpNode* node = alternatives_->at(1).node();
2378 4508 : return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2379 : }
2380 :
2381 :
2382 9665326 : int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2383 : int budget,
2384 : RegExpNode* ignore_this_node,
2385 : bool not_at_start) {
2386 9665326 : if (budget <= 0) return 0;
2387 : int min = 100;
2388 6689076 : int choice_count = alternatives_->length();
2389 6689076 : budget = (budget - 1) / choice_count;
2390 14608442 : for (int i = 0; i < choice_count; i++) {
2391 28458628 : RegExpNode* node = alternatives_->at(i).node();
2392 14229314 : if (node == ignore_this_node) continue;
2393 : int node_eats_at_least =
2394 14041857 : node->EatsAtLeast(still_to_find, budget, not_at_start);
2395 14041857 : if (node_eats_at_least < min) min = node_eats_at_least;
2396 14041857 : if (min == 0) return 0;
2397 : }
2398 : return min;
2399 : }
2400 :
2401 :
2402 198628 : int LoopChoiceNode::EatsAtLeast(int still_to_find,
2403 : int budget,
2404 : bool not_at_start) {
2405 : return EatsAtLeastHelper(still_to_find,
2406 : budget - 1,
2407 : loop_node_,
2408 198628 : not_at_start);
2409 : }
2410 :
2411 :
2412 9466698 : int ChoiceNode::EatsAtLeast(int still_to_find,
2413 : int budget,
2414 : bool not_at_start) {
2415 : return EatsAtLeastHelper(still_to_find,
2416 : budget,
2417 : NULL,
2418 9466698 : not_at_start);
2419 : }
2420 :
2421 :
// Takes the left-most 1-bit and smears it out, setting all bits to its right.
// Doubling the shift each round (1, 2, 4, 8, 16) covers all 32 bits.
static inline uint32_t SmearBitsRight(uint32_t v) {
  for (int shift = 1; shift <= 16; shift <<= 1) {
    v |= v >> shift;
  }
  return v;
}
2431 :
2432 :
2433 304720 : bool QuickCheckDetails::Rationalize(bool asc) {
2434 : bool found_useful_op = false;
2435 : uint32_t char_mask;
2436 304720 : if (asc) {
2437 : char_mask = String::kMaxOneByteCharCode;
2438 : } else {
2439 : char_mask = String::kMaxUtf16CodeUnit;
2440 : }
2441 304720 : mask_ = 0;
2442 304720 : value_ = 0;
2443 : int char_shift = 0;
2444 849012 : for (int i = 0; i < characters_; i++) {
2445 544292 : Position* pos = &positions_[i];
2446 544292 : if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2447 : found_useful_op = true;
2448 : }
2449 544292 : mask_ |= (pos->mask & char_mask) << char_shift;
2450 544292 : value_ |= (pos->value & char_mask) << char_shift;
2451 544292 : char_shift += asc ? 8 : 16;
2452 : }
2453 304720 : return found_useful_op;
2454 : }
2455 :
2456 :
2457 1394488 : bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2458 75702 : Trace* bounds_check_trace,
2459 679039 : Trace* trace,
2460 : bool preload_has_checked_bounds,
2461 : Label* on_possible_success,
2462 1657499 : QuickCheckDetails* details,
2463 : bool fall_through_on_failure) {
2464 590803 : if (details->characters() == 0) return false;
2465 : GetQuickCheckDetails(
2466 609716 : details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2467 304858 : if (details->cannot_match()) return false;
2468 304720 : if (!details->Rationalize(compiler->one_byte())) return false;
2469 : DCHECK(details->characters() == 1 ||
2470 : compiler->macro_assembler()->CanReadUnaligned());
2471 : uint32_t mask = details->mask();
2472 : uint32_t value = details->value();
2473 :
2474 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
2475 :
2476 253946 : if (trace->characters_preloaded() != details->characters()) {
2477 : DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2478 : // We are attempting to preload the minimum number of characters
2479 : // any choice would eat, so if the bounds check fails, then none of the
2480 : // choices can succeed, so we can just immediately backtrack, rather
2481 : // than go to the next choice.
2482 : assembler->LoadCurrentCharacter(trace->cp_offset(),
2483 : bounds_check_trace->backtrack(),
2484 75702 : !preload_has_checked_bounds,
2485 227106 : details->characters());
2486 : }
2487 :
2488 :
2489 : bool need_mask = true;
2490 :
2491 253946 : if (details->characters() == 1) {
2492 : // If number of characters preloaded is 1 then we used a byte or 16 bit
2493 : // load so the value is already masked down.
2494 : uint32_t char_mask;
2495 50248 : if (compiler->one_byte()) {
2496 : char_mask = String::kMaxOneByteCharCode;
2497 : } else {
2498 : char_mask = String::kMaxUtf16CodeUnit;
2499 : }
2500 50248 : if ((mask & char_mask) == char_mask) need_mask = false;
2501 : mask &= char_mask;
2502 : } else {
2503 : // For 2-character preloads in one-byte mode or 1-character preloads in
2504 : // two-byte mode we also use a 16 bit load with zero extend.
2505 : static const uint32_t kTwoByteMask = 0xffff;
2506 : static const uint32_t kFourByteMask = 0xffffffff;
2507 398469 : if (details->characters() == 2 && compiler->one_byte()) {
2508 192853 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2509 10845 : } else if (details->characters() == 1 && !compiler->one_byte()) {
2510 0 : if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2511 : } else {
2512 10845 : if (mask == kFourByteMask) need_mask = false;
2513 : }
2514 : }
2515 :
2516 253946 : if (fall_through_on_failure) {
2517 209413 : if (need_mask) {
2518 61784 : assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2519 : } else {
2520 147629 : assembler->CheckCharacter(value, on_possible_success);
2521 : }
2522 : } else {
2523 44533 : if (need_mask) {
2524 12706 : assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2525 : } else {
2526 76360 : assembler->CheckNotCharacter(value, trace->backtrack());
2527 : }
2528 : }
2529 : return true;
2530 : }
2531 :
2532 :
2533 : // Here is the meat of GetQuickCheckDetails (see also the comment on the
2534 : // super-class in the .h file).
2535 : //
2536 : // We iterate along the text object, building up for each character a
2537 : // mask and value that can be used to test for a quick failure to match.
2538 : // The masks and values for the positions will be combined into a single
2539 : // machine word for the current character width in order to be used in
2540 : // generating a quick check.
2541 1935795 : void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2542 1147124 : RegExpCompiler* compiler,
2543 : int characters_filled_in,
2544 1194150 : bool not_at_start) {
2545 : // Do not collect any quick check details if the text node reads backward,
2546 : // since it reads in the opposite direction than we use for quick checks.
2547 560994 : if (read_backward()) return;
2548 560994 : Isolate* isolate = compiler->macro_assembler()->isolate();
2549 : DCHECK(characters_filled_in < details->characters());
2550 : int characters = details->characters();
2551 : int char_mask;
2552 560994 : if (compiler->one_byte()) {
2553 : char_mask = String::kMaxOneByteCharCode;
2554 : } else {
2555 : char_mask = String::kMaxUtf16CodeUnit;
2556 : }
2557 1266312 : for (int k = 0; k < elements()->length(); k++) {
2558 567367 : TextElement elm = elements()->at(k);
2559 567367 : if (elm.text_type() == TextElement::ATOM) {
2560 : Vector<const uc16> quarks = elm.atom()->data();
2561 641361 : for (int i = 0; i < characters && i < quarks.length(); i++) {
2562 : QuickCheckDetails::Position* pos =
2563 599759 : details->positions(characters_filled_in);
2564 599759 : uc16 c = quarks[i];
2565 599759 : if (compiler->ignore_case()) {
2566 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2567 : int length = GetCaseIndependentLetters(isolate, c,
2568 25136 : compiler->one_byte(), chars);
2569 25136 : if (length == 0) {
2570 : // This can happen because all case variants are non-Latin1, but we
2571 : // know the input is Latin1.
2572 : details->set_cannot_match();
2573 35 : pos->determines_perfectly = false;
2574 35 : return;
2575 : }
2576 25101 : if (length == 1) {
2577 : // This letter has no case equivalents, so it's nice and simple
2578 : // and the mask-compare will determine definitely whether we have
2579 : // a match at this character position.
2580 3533 : pos->mask = char_mask;
2581 3533 : pos->value = c;
2582 3533 : pos->determines_perfectly = true;
2583 : } else {
2584 21568 : uint32_t common_bits = char_mask;
2585 21568 : uint32_t bits = chars[0];
2586 43545 : for (int j = 1; j < length; j++) {
2587 21977 : uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2588 21977 : common_bits ^= differing_bits;
2589 21977 : bits &= common_bits;
2590 : }
2591 : // If length is 2 and common bits has only one zero in it then
2592 : // our mask and compare instruction will determine definitely
2593 : // whether we have a match at this character position. Otherwise
2594 : // it can only be an approximate check.
2595 21568 : uint32_t one_zero = (common_bits | ~char_mask);
2596 21568 : if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2597 21089 : pos->determines_perfectly = true;
2598 : }
2599 21568 : pos->mask = common_bits;
2600 21568 : pos->value = bits;
2601 : }
2602 : } else {
2603 : // Don't ignore case. Nice simple case where the mask-compare will
2604 : // determine definitely whether we have a match at this character
2605 : // position.
2606 574623 : if (c > char_mask) {
2607 : details->set_cannot_match();
2608 35 : pos->determines_perfectly = false;
2609 35 : return;
2610 : }
2611 574588 : pos->mask = char_mask;
2612 574588 : pos->value = c;
2613 574588 : pos->determines_perfectly = true;
2614 : }
2615 599689 : characters_filled_in++;
2616 : DCHECK(characters_filled_in <= details->characters());
2617 599689 : if (characters_filled_in == details->characters()) {
2618 : return;
2619 : }
2620 : }
2621 : } else {
2622 : QuickCheckDetails::Position* pos =
2623 148399 : details->positions(characters_filled_in);
2624 : RegExpCharacterClass* tree = elm.char_class();
2625 148399 : ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2626 148399 : if (tree->is_negated()) {
2627 : // A quick check uses multi-character mask and compare. There is no
2628 : // useful way to incorporate a negative char class into this scheme
2629 : // so we just conservatively create a mask and value that will always
2630 : // succeed.
2631 3848 : pos->mask = 0;
2632 3848 : pos->value = 0;
2633 : } else {
2634 : int first_range = 0;
2635 144589 : while (ranges->at(first_range).from() > char_mask) {
2636 108 : first_range++;
2637 503602 : if (first_range == ranges->length()) {
2638 : details->set_cannot_match();
2639 70 : pos->determines_perfectly = false;
2640 : return;
2641 : }
2642 : }
2643 144481 : CharacterRange range = ranges->at(first_range);
2644 144481 : uc16 from = range.from();
2645 144481 : uc16 to = range.to();
2646 144481 : if (to > char_mask) {
2647 22720 : to = char_mask;
2648 : }
2649 144481 : uint32_t differing_bits = (from ^ to);
2650 : // A mask and compare is only perfect if the differing bits form a
2651 : // number like 00011111 with one single block of trailing 1s.
2652 255417 : if ((differing_bits & (differing_bits + 1)) == 0 &&
2653 110936 : from + differing_bits == to) {
2654 101979 : pos->determines_perfectly = true;
2655 : }
2656 144481 : uint32_t common_bits = ~SmearBitsRight(differing_bits);
2657 144481 : uint32_t bits = (from & common_bits);
2658 1006988 : for (int i = first_range + 1; i < ranges->length(); i++) {
2659 359013 : CharacterRange range = ranges->at(i);
2660 359013 : uc16 from = range.from();
2661 359013 : uc16 to = range.to();
2662 359013 : if (from > char_mask) continue;
2663 180431 : if (to > char_mask) to = char_mask;
2664 : // Here we are combining more ranges into the mask and compare
2665 : // value. With each new range the mask becomes more sparse and
2666 : // so the chances of a false positive rise. A character class
2667 : // with multiple ranges is assumed never to be equivalent to a
2668 : // mask and compare operation.
2669 180431 : pos->determines_perfectly = false;
2670 180431 : uint32_t new_common_bits = (from ^ to);
2671 180431 : new_common_bits = ~SmearBitsRight(new_common_bits);
2672 180431 : common_bits &= new_common_bits;
2673 180431 : bits &= new_common_bits;
2674 180431 : uint32_t differing_bits = (from & common_bits) ^ bits;
2675 180431 : common_bits ^= differing_bits;
2676 180431 : bits &= common_bits;
2677 : }
2678 144481 : pos->mask = common_bits;
2679 144481 : pos->value = bits;
2680 : }
2681 148329 : characters_filled_in++;
2682 : DCHECK(characters_filled_in <= details->characters());
2683 148329 : if (characters_filled_in == details->characters()) {
2684 : return;
2685 : }
2686 : }
2687 : }
2688 : DCHECK(characters_filled_in != details->characters());
2689 65789 : if (!details->cannot_match()) {
2690 65789 : on_success()-> GetQuickCheckDetails(details,
2691 : compiler,
2692 : characters_filled_in,
2693 65789 : true);
2694 : }
2695 : }
2696 :
2697 :
2698 0 : void QuickCheckDetails::Clear() {
2699 407849 : for (int i = 0; i < characters_; i++) {
2700 407849 : positions_[i].mask = 0;
2701 407849 : positions_[i].value = 0;
2702 407849 : positions_[i].determines_perfectly = false;
2703 : }
2704 1304652 : characters_ = 0;
2705 0 : }
2706 :
2707 :
2708 612652 : void QuickCheckDetails::Advance(int by, bool one_byte) {
2709 612652 : if (by >= characters_ || by < 0) {
2710 : DCHECK_IMPLIES(by < 0, characters_ == 0);
2711 : Clear();
2712 612652 : return;
2713 : }
2714 : DCHECK_LE(characters_ - by, 4);
2715 : DCHECK_LE(characters_, 4);
2716 47062 : for (int i = 0; i < characters_ - by; i++) {
2717 47062 : positions_[i] = positions_[by + i];
2718 : }
2719 39020 : for (int i = characters_ - by; i < characters_; i++) {
2720 39020 : positions_[i].mask = 0;
2721 39020 : positions_[i].value = 0;
2722 39020 : positions_[i].determines_perfectly = false;
2723 : }
2724 38201 : characters_ -= by;
2725 : // We could change mask_ and value_ here but we would never advance unless
2726 : // they had already been used in a check and they won't be used again because
2727 : // it would gain us nothing. So there's no point.
2728 : }
2729 :
2730 :
2731 199109 : void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2732 : DCHECK(characters_ == other->characters_);
2733 199109 : if (other->cannot_match_) {
2734 : return;
2735 : }
2736 199005 : if (cannot_match_) {
2737 336 : *this = *other;
2738 336 : return;
2739 : }
2740 217714 : for (int i = from_index; i < characters_; i++) {
2741 217714 : QuickCheckDetails::Position* pos = positions(i);
2742 217714 : QuickCheckDetails::Position* other_pos = other->positions(i);
2743 258620 : if (pos->mask != other_pos->mask ||
2744 47526 : pos->value != other_pos->value ||
2745 6620 : !other_pos->determines_perfectly) {
2746 : // Our mask-compare operation will be approximate unless we have the
2747 : // exact same operation on both sides of the alternation.
2748 215416 : pos->determines_perfectly = false;
2749 : }
2750 217714 : pos->mask &= other_pos->mask;
2751 217714 : pos->value &= pos->mask;
2752 217714 : other_pos->value &= pos->mask;
2753 217714 : uc16 differing_bits = (pos->value ^ other_pos->value);
2754 217714 : pos->mask &= ~differing_bits;
2755 217714 : pos->value &= pos->mask;
2756 : }
2757 : }
2758 :
2759 :
2760 : class VisitMarker {
2761 : public:
2762 : explicit VisitMarker(NodeInfo* info) : info_(info) {
2763 : DCHECK(!info->visited);
2764 298699 : info->visited = true;
2765 : }
2766 : ~VisitMarker() {
2767 265237 : info_->visited = false;
2768 : }
2769 : private:
2770 : NodeInfo* info_;
2771 : };
2772 :
2773 :
2774 150800 : RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2775 150800 : if (info()->replacement_calculated) return replacement();
2776 110286 : if (depth < 0) return this;
2777 : DCHECK(!info()->visited);
2778 110040 : VisitMarker marker(info());
2779 : return FilterSuccessor(depth - 1, ignore_case);
2780 : }
2781 :
2782 :
2783 0 : RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2784 204225 : RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2785 204225 : if (next == NULL) return set_replacement(NULL);
2786 203615 : on_success_ = next;
2787 203615 : return set_replacement(this);
2788 : }
2789 :
2790 :
2791 : // We need to check for the following characters: 0x39c 0x3bc 0x178.
2792 8359 : static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2793 : // TODO(dcarney): this could be a lot more efficient.
2794 8273 : return range.Contains(0x39c) ||
2795 16620 : range.Contains(0x3bc) || range.Contains(0x178);
2796 : }
2797 :
2798 :
2799 26 : static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2800 64 : for (int i = 0; i < ranges->length(); i++) {
2801 : // TODO(dcarney): this could be a lot more efficient.
2802 58 : if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2803 : }
2804 : return false;
2805 : }
2806 :
2807 :
2808 297565 : RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2809 156395 : if (info()->replacement_calculated) return replacement();
2810 94780 : if (depth < 0) return this;
2811 : DCHECK(!info()->visited);
2812 94717 : VisitMarker marker(info());
2813 94717 : int element_count = elements()->length();
2814 195614 : for (int i = 0; i < element_count; i++) {
2815 101429 : TextElement elm = elements()->at(i);
2816 101429 : if (elm.text_type() == TextElement::ATOM) {
2817 : Vector<const uc16> quarks = elm.atom()->data();
2818 154576 : for (int j = 0; j < quarks.length(); j++) {
2819 108342 : uint16_t c = quarks[j];
2820 108342 : if (c <= String::kMaxOneByteCharCode) continue;
2821 393 : if (!ignore_case) return set_replacement(NULL);
2822 : // Here, we need to check for characters whose upper and lower cases
2823 : // are outside the Latin-1 range.
2824 : uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2825 : // Character is outside Latin-1 completely
2826 231 : if (converted == 0) return set_replacement(NULL);
2827 : // Convert quark to Latin-1 in place.
2828 : uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2829 47 : copy[j] = converted;
2830 : }
2831 : } else {
2832 : DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2833 : RegExpCharacterClass* cc = elm.char_class();
2834 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2835 54976 : CharacterRange::Canonicalize(ranges);
2836 : // Now they are in order so we only need to look at the first.
2837 54976 : int range_count = ranges->length();
2838 54976 : if (cc->is_negated()) {
2839 9674 : if (range_count != 0 &&
2840 9871 : ranges->at(0).from() == 0 &&
2841 197 : ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2842 : // This will be handled in a later filter.
2843 44 : if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2844 43 : return set_replacement(NULL);
2845 : }
2846 : } else {
2847 100272 : if (range_count == 0 ||
2848 50133 : ranges->at(0).from() > String::kMaxOneByteCharCode) {
2849 : // This will be handled in a later filter.
2850 289 : if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2851 270 : return set_replacement(NULL);
2852 : }
2853 : }
2854 : }
2855 : }
2856 94185 : return FilterSuccessor(depth - 1, ignore_case);
2857 : }
2858 :
2859 :
2860 82711 : RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2861 82711 : if (info()->replacement_calculated) return replacement();
2862 63751 : if (depth < 0) return this;
2863 63623 : if (info()->visited) return this;
2864 : {
2865 33886 : VisitMarker marker(info());
2866 :
2867 : RegExpNode* continue_replacement =
2868 33886 : continue_node_->FilterOneByte(depth - 1, ignore_case);
2869 : // If we can't continue after the loop then there is no sense in doing the
2870 : // loop.
2871 33886 : if (continue_replacement == NULL) return set_replacement(NULL);
2872 : }
2873 :
2874 33462 : return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2875 : }
2876 :
2877 :
2878 42999 : RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2879 43185 : if (info()->replacement_calculated) return replacement();
2880 40528 : if (depth < 0) return this;
2881 40407 : if (info()->visited) return this;
2882 40407 : VisitMarker marker(info());
2883 40407 : int choice_count = alternatives_->length();
2884 :
2885 126590 : for (int i = 0; i < choice_count; i++) {
2886 90114 : GuardedAlternative alternative = alternatives_->at(i);
2887 90114 : if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2888 3931 : set_replacement(this);
2889 : return this;
2890 : }
2891 : }
2892 :
2893 : int surviving = 0;
2894 : RegExpNode* survivor = NULL;
2895 85419 : for (int i = 0; i < choice_count; i++) {
2896 170838 : GuardedAlternative alternative = alternatives_->at(i);
2897 : RegExpNode* replacement =
2898 85419 : alternative.node()->FilterOneByte(depth - 1, ignore_case);
2899 : DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2900 85419 : if (replacement != NULL) {
2901 85228 : alternatives_->at(i).set_node(replacement);
2902 85228 : surviving++;
2903 : survivor = replacement;
2904 : }
2905 : }
2906 36563 : if (surviving < 2) return set_replacement(survivor);
2907 :
2908 36389 : set_replacement(this);
2909 36389 : if (surviving == choice_count) {
2910 : return this;
2911 : }
2912 : // Only some of the nodes survived the filtering. We need to rebuild the
2913 : // alternatives list.
2914 : ZoneList<GuardedAlternative>* new_alternatives =
2915 26 : new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2916 248 : for (int i = 0; i < choice_count; i++) {
2917 : RegExpNode* replacement =
2918 444 : alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2919 222 : if (replacement != NULL) {
2920 160 : alternatives_->at(i).set_node(replacement);
2921 160 : new_alternatives->Add(alternatives_->at(i), zone());
2922 : }
2923 : }
2924 26 : alternatives_ = new_alternatives;
2925 26 : return this;
2926 : }
2927 :
2928 :
2929 387 : RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2930 : bool ignore_case) {
2931 387 : if (info()->replacement_calculated) return replacement();
2932 387 : if (depth < 0) return this;
2933 387 : if (info()->visited) return this;
2934 387 : VisitMarker marker(info());
2935 : // Alternative 0 is the negative lookahead, alternative 1 is what comes
2936 : // afterwards.
2937 774 : RegExpNode* node = alternatives_->at(1).node();
2938 387 : RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2939 394 : if (replacement == NULL) return set_replacement(NULL);
2940 380 : alternatives_->at(1).set_node(replacement);
2941 :
2942 760 : RegExpNode* neg_node = alternatives_->at(0).node();
2943 380 : RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2944 : // If the negative lookahead is always going to fail then
2945 : // we don't need to check it.
2946 387 : if (neg_replacement == NULL) return set_replacement(replacement);
2947 373 : alternatives_->at(0).set_node(neg_replacement);
2948 746 : return set_replacement(this);
2949 : }
2950 :
2951 :
2952 26860 : void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2953 : RegExpCompiler* compiler,
2954 : int characters_filled_in,
2955 : bool not_at_start) {
2956 26860 : if (body_can_be_zero_length_ || info()->visited) return;
2957 19262 : VisitMarker marker(info());
2958 : return ChoiceNode::GetQuickCheckDetails(details,
2959 : compiler,
2960 : characters_filled_in,
2961 19262 : not_at_start);
2962 : }
2963 :
2964 :
2965 16894 : void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2966 : BoyerMooreLookahead* bm, bool not_at_start) {
2967 16894 : if (body_can_be_zero_length_ || budget <= 0) {
2968 : bm->SetRest(offset);
2969 : SaveBMInfo(bm, not_at_start, offset);
2970 16894 : return;
2971 : }
2972 16375 : ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2973 : SaveBMInfo(bm, not_at_start, offset);
2974 : }
2975 :
2976 :
2977 257628 : void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2978 : RegExpCompiler* compiler,
2979 : int characters_filled_in,
2980 : bool not_at_start) {
2981 58519 : not_at_start = (not_at_start || not_at_start_);
2982 58519 : int choice_count = alternatives_->length();
2983 : DCHECK(choice_count > 0);
2984 58519 : alternatives_->at(0).node()->GetQuickCheckDetails(details,
2985 : compiler,
2986 : characters_filled_in,
2987 58519 : not_at_start);
2988 257628 : for (int i = 1; i < choice_count; i++) {
2989 : QuickCheckDetails new_details(details->characters());
2990 398218 : RegExpNode* node = alternatives_->at(i).node();
2991 : node->GetQuickCheckDetails(&new_details, compiler,
2992 : characters_filled_in,
2993 199109 : not_at_start);
2994 : // Here we merge the quick match details of the two branches.
2995 199109 : details->Merge(&new_details, characters_filled_in);
2996 : }
2997 58519 : }
2998 :
2999 :
3000 : // Check for [0-9A-Z_a-z].
3001 674 : static void EmitWordCheck(RegExpMacroAssembler* assembler,
3002 : Label* word,
3003 : Label* non_word,
3004 : bool fall_through_on_word) {
3005 674 : if (assembler->CheckSpecialCharacterClass(
3006 : fall_through_on_word ? 'w' : 'W',
3007 674 : fall_through_on_word ? non_word : word)) {
3008 : // Optimized implementation available.
3009 674 : return;
3010 : }
3011 0 : assembler->CheckCharacterGT('z', non_word);
3012 0 : assembler->CheckCharacterLT('0', non_word);
3013 0 : assembler->CheckCharacterGT('a' - 1, word);
3014 0 : assembler->CheckCharacterLT('9' + 1, word);
3015 0 : assembler->CheckCharacterLT('A', non_word);
3016 0 : assembler->CheckCharacterLT('Z' + 1, word);
3017 0 : if (fall_through_on_word) {
3018 0 : assembler->CheckNotCharacter('_', non_word);
3019 : } else {
3020 0 : assembler->CheckCharacter('_', word);
3021 : }
3022 : }
3023 :
3024 :
3025 : // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3026 : // that matches newline or the start of input).
3027 797 : static void EmitHat(RegExpCompiler* compiler,
3028 : RegExpNode* on_success,
3029 : Trace* trace) {
3030 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3031 : // We will be loading the previous character into the current character
3032 : // register.
3033 797 : Trace new_trace(*trace);
3034 : new_trace.InvalidateCurrentCharacter();
3035 :
3036 : Label ok;
3037 797 : if (new_trace.cp_offset() == 0) {
3038 : // The start of input counts as a newline in this context, so skip to
3039 : // ok if we are at the start.
3040 791 : assembler->CheckAtStart(&ok);
3041 : }
3042 : // We already checked that we are not at the start of input so it must be
3043 : // OK to load the previous character.
3044 797 : assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3045 : new_trace.backtrack(),
3046 1594 : false);
3047 797 : if (!assembler->CheckSpecialCharacterClass('n',
3048 797 : new_trace.backtrack())) {
3049 : // Newline means \n, \r, 0x2028 or 0x2029.
3050 0 : if (!compiler->one_byte()) {
3051 0 : assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3052 : }
3053 0 : assembler->CheckCharacter('\n', &ok);
3054 0 : assembler->CheckNotCharacter('\r', new_trace.backtrack());
3055 : }
3056 797 : assembler->Bind(&ok);
3057 797 : on_success->Emit(compiler, &new_trace);
3058 797 : }
3059 :
3060 :
3061 : // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
3062 989 : void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3063 316 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3064 : Isolate* isolate = assembler->isolate();
3065 : Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3066 316 : bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3067 193 : BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3068 316 : if (lookahead == NULL) {
3069 : int eats_at_least =
3070 : Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3071 : kRecursionBudget,
3072 249 : not_at_start));
3073 249 : if (eats_at_least >= 1) {
3074 126 : BoyerMooreLookahead* bm =
3075 126 : new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3076 126 : FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3077 126 : if (bm->at(0)->is_non_word())
3078 : next_is_word_character = Trace::FALSE_VALUE;
3079 126 : if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3080 : }
3081 : } else {
3082 67 : if (lookahead->at(0)->is_non_word())
3083 : next_is_word_character = Trace::FALSE_VALUE;
3084 67 : if (lookahead->at(0)->is_word())
3085 : next_is_word_character = Trace::TRUE_VALUE;
3086 : }
3087 316 : bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3088 316 : if (next_is_word_character == Trace::UNKNOWN) {
3089 : Label before_non_word;
3090 : Label before_word;
3091 179 : if (trace->characters_preloaded() != 1) {
3092 356 : assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3093 : }
3094 : // Fall through on non-word.
3095 179 : EmitWordCheck(assembler, &before_word, &before_non_word, false);
3096 : // Next character is not a word character.
3097 179 : assembler->Bind(&before_non_word);
3098 : Label ok;
3099 179 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3100 179 : assembler->GoTo(&ok);
3101 :
3102 179 : assembler->Bind(&before_word);
3103 179 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3104 179 : assembler->Bind(&ok);
3105 137 : } else if (next_is_word_character == Trace::TRUE_VALUE) {
3106 102 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3107 : } else {
3108 : DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3109 35 : BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3110 : }
3111 316 : }
3112 :
3113 :
3114 495 : void AssertionNode::BacktrackIfPrevious(
3115 495 : RegExpCompiler* compiler,
3116 : Trace* trace,
3117 : AssertionNode::IfPrevious backtrack_if_previous) {
3118 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3119 495 : Trace new_trace(*trace);
3120 : new_trace.InvalidateCurrentCharacter();
3121 :
3122 : Label fall_through, dummy;
3123 :
3124 : Label* non_word = backtrack_if_previous == kIsNonWord ?
3125 235 : new_trace.backtrack() :
3126 495 : &fall_through;
3127 : Label* word = backtrack_if_previous == kIsNonWord ?
3128 : &fall_through :
3129 495 : new_trace.backtrack();
3130 :
3131 495 : if (new_trace.cp_offset() == 0) {
3132 : // The start of input counts as a non-word character, so the question is
3133 : // decided if we are at the start.
3134 188 : assembler->CheckAtStart(non_word);
3135 : }
3136 : // We already checked that we are not at the start of input so it must be
3137 : // OK to load the previous character.
3138 495 : assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3139 495 : EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3140 :
3141 495 : assembler->Bind(&fall_through);
3142 495 : on_success()->Emit(compiler, &new_trace);
3143 495 : }
3144 :
3145 :
3146 3509 : void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3147 : RegExpCompiler* compiler,
3148 : int filled_in,
3149 : bool not_at_start) {
3150 3509 : if (assertion_type_ == AT_START && not_at_start) {
3151 : details->set_cannot_match();
3152 : return;
3153 : }
3154 3071 : return on_success()->GetQuickCheckDetails(details,
3155 : compiler,
3156 : filled_in,
3157 3071 : not_at_start);
3158 : }
3159 :
3160 :
3161 36963 : void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3162 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3163 13063 : switch (assertion_type_) {
3164 : case AT_END: {
3165 : Label ok;
3166 14540 : assembler->CheckPosition(trace->cp_offset(), &ok);
3167 14540 : assembler->GoTo(trace->backtrack());
3168 7270 : assembler->Bind(&ok);
3169 : break;
3170 : }
3171 : case AT_START: {
3172 4680 : if (trace->at_start() == Trace::FALSE_VALUE) {
3173 42 : assembler->GoTo(trace->backtrack());
3174 21 : return;
3175 : }
3176 4659 : if (trace->at_start() == Trace::UNKNOWN) {
3177 9318 : assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3178 4659 : Trace at_start_trace = *trace;
3179 : at_start_trace.set_at_start(Trace::TRUE_VALUE);
3180 11929 : on_success()->Emit(compiler, &at_start_trace);
3181 : return;
3182 : }
3183 : }
3184 : break;
3185 : case AFTER_NEWLINE:
3186 797 : EmitHat(compiler, on_success(), trace);
3187 797 : return;
3188 : case AT_BOUNDARY:
3189 : case AT_NON_BOUNDARY: {
3190 316 : EmitBoundaryCheck(compiler, trace);
3191 316 : return;
3192 : }
3193 : }
3194 7270 : on_success()->Emit(compiler, trace);
3195 : }
3196 :
3197 :
3198 3232696 : static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3199 3232696 : if (quick_check == NULL) return false;
3200 3232696 : if (offset >= quick_check->characters()) return false;
3201 1100849 : return quick_check->positions(offset)->determines_perfectly;
3202 : }
3203 :
3204 :
// Raises *checked_up_to to |index| if |index| is larger; otherwise leaves it
// untouched.
static void UpdateBoundsCheck(int index, int* checked_up_to) {
  if (index <= *checked_up_to) return;
  *checked_up_to = index;
}
3210 :
3211 :
3212 : // We call this repeatedly to generate code for each pass over the text node.
3213 : // The passes are in increasing order of difficulty because we hope one
3214 : // of the first passes will fail in which case we are saved the work of the
3215 : // later passes. For example for the case independent regexp /%[asdfghjkl]a/
3216 : // we will check the '%' in the first pass, the case independent 'a' in the
3217 : // second pass and the character class in the last pass.
3218 : //
3219 : // The passes are done from right to left, so for example to test for /bar/
3220 : // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3221 : // and then a 'b' with offset 0. This means we can avoid the end-of-input
3222 : // bounds check most of the time. In the example we only need to check for
3223 : // end-of-input when loading the putative 'r'.
3224 : //
3225 : // A slight complication involves the fact that the first character may already
3226 : // be fetched into a register by the previous node. In this case we want to
3227 : // do the test for that character first. We do this in separate passes. The
3228 : // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3229 : // pass has been performed then subsequent passes will have true in
3230 : // first_element_checked to indicate that that character does not need to be
3231 : // checked again.
3232 : //
3233 : // In addition to all this we are passed a Trace, which can
3234 : // contain an AlternativeGeneration object. In this AlternativeGeneration
3235 : // object we can see details of any quick check that was already passed in
3236 : // order to get to the code we are now generating. The quick check can involve
3237 : // loading characters, which means we do not need to recheck the bounds
3238 : // up to the limit the quick check already checked. In addition the quick
3239 : // check can have involved a mask and compare operation which may simplify
3240 : // or obviate the need for further checks at some character positions.
3241 3865778 : void TextNode::TextEmitPass(RegExpCompiler* compiler,
3242 : TextEmitPassType pass,
3243 : bool preloaded,
3244 3948837 : Trace* trace,
3245 : bool first_element_checked,
3246 6271590 : int* checked_up_to) {
3247 1932889 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
3248 : Isolate* isolate = assembler->isolate();
3249 : bool one_byte = compiler->one_byte();
3250 : Label* backtrack = trace->backtrack();
3251 1932889 : QuickCheckDetails* quick_check = trace->quick_check_performed();
3252 1932889 : int element_count = elements()->length();
3253 1932889 : int backward_offset = read_backward() ? -Length() : 0;
3254 3948802 : for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3255 2015948 : TextElement elm = elements()->at(i);
3256 2015948 : int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3257 2015948 : if (elm.text_type() == TextElement::ATOM) {
3258 : Vector<const uc16> quarks = elm.atom()->data();
3259 4163716 : for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3260 2989043 : if (first_element_checked && i == 0 && j == 0) continue;
3261 5929752 : if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3262 : EmitCharacterFunction* emit_function = NULL;
3263 1917021 : switch (pass) {
3264 : case NON_LATIN1_MATCH:
3265 : DCHECK(one_byte);
3266 631927 : if (quarks[j] > String::kMaxOneByteCharCode) {
3267 35 : assembler->GoTo(backtrack);
3268 1932889 : return;
3269 : }
3270 : break;
3271 : case NON_LETTER_CHARACTER_MATCH:
3272 : emit_function = &EmitAtomNonLetter;
3273 14932 : break;
3274 : case SIMPLE_CHARACTER_MATCH:
3275 : emit_function = &EmitSimpleCharacter;
3276 620149 : break;
3277 : case CASE_CHARACTER_MATCH:
3278 : emit_function = &EmitAtomLetter;
3279 14932 : break;
3280 : default:
3281 : break;
3282 : }
3283 1916986 : if (emit_function != NULL) {
3284 985226 : bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3285 : bool bound_checked =
3286 : emit_function(isolate, compiler, quarks[j], backtrack,
3287 1300026 : cp_offset + j, bounds_check, preloaded);
3288 650013 : if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3289 : }
3290 : }
3291 : } else {
3292 : DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3293 841240 : if (pass == CHARACTER_CLASS_MATCH) {
3294 306067 : if (first_element_checked && i == 0) continue;
3295 267820 : if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3296 : RegExpCharacterClass* cc = elm.char_class();
3297 296102 : bool bounds_check = *checked_up_to < cp_offset || read_backward();
3298 : EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3299 241451 : bounds_check, preloaded, zone());
3300 : UpdateBoundsCheck(cp_offset, checked_up_to);
3301 : }
3302 : }
3303 : }
3304 : }
3305 :
3306 :
3307 9158607 : int TextNode::Length() {
3308 9158607 : TextElement elm = elements()->last();
3309 : DCHECK(elm.cp_offset() >= 0);
3310 9158607 : return elm.cp_offset() + elm.length();
3311 : }
3312 :
3313 :
3314 0 : bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3315 2645892 : TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3316 2645892 : if (ignore_case) {
3317 718740 : return pass == SIMPLE_CHARACTER_MATCH;
3318 : } else {
3319 1927152 : return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3320 : }
3321 : }
3322 :
3323 :
3324 11183 : TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3325 : ZoneList<CharacterRange>* ranges,
3326 : bool read_backward,
3327 : RegExpNode* on_success) {
3328 : DCHECK_NOT_NULL(ranges);
3329 11183 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3330 : elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
3331 22366 : zone);
3332 11183 : return new (zone) TextNode(elms, read_backward, on_success);
3333 : }
3334 :
3335 :
3336 27313 : TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3337 : CharacterRange trail,
3338 : bool read_backward,
3339 : RegExpNode* on_success) {
3340 27313 : ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3341 27313 : ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3342 27313 : ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3343 : elms->Add(
3344 : TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
3345 54626 : zone);
3346 : elms->Add(
3347 : TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
3348 54626 : zone);
3349 27313 : return new (zone) TextNode(elms, read_backward, on_success);
3350 : }
3351 :
3352 :
3353 : // This generates the code to match a text node. A text node can contain
3354 : // straight character sequences (possibly to be matched in a case-independent
3355 : // way) and character classes. For efficiency we do not do this in a single
3356 : // pass from left to right. Instead we pass over the text node several times,
3357 : // emitting code for some character positions every time. See the comment on
3358 : // TextEmitPass for details.
3359 5016893 : void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3360 728329 : LimitResult limit_result = LimitVersions(compiler, trace);
3361 844006 : if (limit_result == DONE) return;
3362 : DCHECK(limit_result == CONTINUE);
3363 :
3364 612652 : if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3365 : compiler->SetRegExpTooBig();
3366 : return;
3367 : }
3368 :
3369 612652 : if (compiler->one_byte()) {
3370 430258 : int dummy = 0;
3371 430258 : TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3372 : }
3373 :
3374 : bool first_elt_done = false;
3375 612652 : int bound_checked_to = trace->cp_offset() - 1;
3376 612652 : bound_checked_to += trace->bound_checked_up_to();
3377 :
3378 : // If a character is preloaded into the current character register then
3379 : // check that now.
3380 612652 : if (trace->characters_preloaded() == 1) {
3381 195284 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3382 195284 : if (!SkipPass(pass, compiler->ignore_case())) {
3383 : TextEmitPass(compiler,
3384 : static_cast<TextEmitPassType>(pass),
3385 : true,
3386 : trace,
3387 : false,
3388 121021 : &bound_checked_to);
3389 : }
3390 : }
3391 : first_elt_done = true;
3392 : }
3393 :
3394 3063260 : for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3395 2450608 : if (!SkipPass(pass, compiler->ignore_case())) {
3396 : TextEmitPass(compiler,
3397 : static_cast<TextEmitPassType>(pass),
3398 : false,
3399 : trace,
3400 : first_elt_done,
3401 1381610 : &bound_checked_to);
3402 : }
3403 : }
3404 :
3405 612652 : Trace successor_trace(*trace);
3406 : // If we advance backward, we may end up at the start.
3407 : successor_trace.AdvanceCurrentPositionInTrace(
3408 612652 : read_backward() ? -Length() : Length(), compiler);
3409 : successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3410 612652 : : Trace::FALSE_VALUE);
3411 : RecursionCheck rc(compiler);
3412 612652 : on_success()->Emit(compiler, &successor_trace);
3413 : }
3414 :
3415 :
3416 0 : void Trace::InvalidateCurrentCharacter() {
3417 283868 : characters_preloaded_ = 0;
3418 0 : }
3419 :
3420 :
3421 1225304 : void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3422 : // We don't have an instruction for shifting the current character register
3423 : // down or for using a shifted value for anything so lets just forget that
3424 : // we preloaded any characters into it.
3425 612652 : characters_preloaded_ = 0;
3426 : // Adjust the offsets of the quick check performed information. This
3427 : // information is used to find out what we already determined about the
3428 : // characters by means of mask and compare.
3429 612652 : quick_check_performed_.Advance(by, compiler->one_byte());
3430 612652 : cp_offset_ += by;
3431 612652 : if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3432 : compiler->SetRegExpTooBig();
3433 0 : cp_offset_ = 0;
3434 : }
3435 1225304 : bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3436 612652 : }
3437 :
3438 :
3439 316425 : void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3440 155772 : int element_count = elements()->length();
3441 316425 : for (int i = 0; i < element_count; i++) {
3442 160653 : TextElement elm = elements()->at(i);
3443 160653 : if (elm.text_type() == TextElement::CHAR_CLASS) {
3444 : RegExpCharacterClass* cc = elm.char_class();
3445 : // None of the standard character classes is different in the case
3446 : // independent case and it slows us down if we don't know that.
3447 221619 : if (cc->is_standard(zone())) continue;
3448 : ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3449 148974 : CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3450 : }
3451 : }
3452 155772 : }
3453 :
3454 :
3455 185764 : int TextNode::GreedyLoopTextLength() { return Length(); }
3456 :
3457 :
3458 93110 : RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3459 276056 : RegExpCompiler* compiler) {
3460 93110 : if (read_backward()) return NULL;
3461 92966 : if (elements()->length() != 1) return NULL;
3462 92526 : TextElement elm = elements()->at(0);
3463 92526 : if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3464 : RegExpCharacterClass* node = elm.char_class();
3465 90929 : ZoneList<CharacterRange>* ranges = node->ranges(zone());
3466 90929 : CharacterRange::Canonicalize(ranges);
3467 90929 : if (node->is_negated()) {
3468 87833 : return ranges->length() == 0 ? on_success() : NULL;
3469 : }
3470 90752 : if (ranges->length() != 1) return NULL;
3471 : uint32_t max_char;
3472 89980 : if (compiler->one_byte()) {
3473 : max_char = String::kMaxOneByteCharCode;
3474 : } else {
3475 : max_char = String::kMaxUtf16CodeUnit;
3476 : }
3477 179960 : return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3478 : }
3479 :
3480 :
3481 : // Finds the fixed match length of a sequence of nodes that goes from
3482 : // this alternative and back to this choice node. If there are variable
3483 : // length nodes or other complications in the way then return a sentinel
3484 : // value indicating that a greedy loop cannot be constructed.
3485 284881 : int ChoiceNode::GreedyLoopTextLengthForAlternative(
3486 284881 : GuardedAlternative* alternative) {
3487 : int length = 0;
3488 : RegExpNode* node = alternative->node();
3489 : // Later we will generate code for all these text nodes using recursion
3490 : // so we have to limit the max number.
3491 : int recursion_depth = 0;
3492 755526 : while (node != this) {
3493 438565 : if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3494 : return kNodeIsTooComplexForGreedyLoops;
3495 : }
3496 438565 : int node_length = node->GreedyLoopTextLength();
3497 438565 : if (node_length == kNodeIsTooComplexForGreedyLoops) {
3498 : return kNodeIsTooComplexForGreedyLoops;
3499 : }
3500 185764 : length += node_length;
3501 185764 : SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3502 : node = seq_node->on_success();
3503 : }
3504 32080 : return read_backward() ? -length : length;
3505 : }
3506 :
3507 :
3508 0 : void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3509 : DCHECK_NULL(loop_node_);
3510 1612556 : AddAlternative(alt);
3511 1612556 : loop_node_ = alt.node();
3512 0 : }
3513 :
3514 :
3515 0 : void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3516 : DCHECK_NULL(continue_node_);
3517 1612556 : AddAlternative(alt);
3518 1612556 : continue_node_ = alt.node();
3519 0 : }
3520 :
3521 :
3522 396509 : void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3523 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3524 380469 : if (trace->stop_node() == this) {
3525 : // Back edge of greedy optimized loop node graph.
3526 : int text_length =
3527 32080 : GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3528 : DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3529 : // Update the counter-based backtracking info on the stack. This is an
3530 : // optimization for greedy loops (see below).
3531 : DCHECK(trace->cp_offset() == text_length);
3532 16040 : macro_assembler->AdvanceCurrentPosition(text_length);
3533 32080 : macro_assembler->GoTo(trace->loop_label());
3534 16040 : return;
3535 : }
3536 : DCHECK_NULL(trace->stop_node());
3537 364429 : if (!trace->is_trivial()) {
3538 138345 : trace->Flush(compiler, this);
3539 138345 : return;
3540 : }
3541 226084 : ChoiceNode::Emit(compiler, trace);
3542 : }
3543 :
3544 :
3545 751971 : int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3546 : int eats_at_least) {
3547 : int preload_characters = Min(4, eats_at_least);
3548 268841 : if (compiler->macro_assembler()->CanReadUnaligned()) {
3549 : bool one_byte = compiler->one_byte();
3550 214289 : if (one_byte) {
3551 182578 : if (preload_characters > 4) preload_characters = 4;
3552 : // We can't preload 3 characters because there is no machine instruction
3553 : // to do that. We can't just load 4 because we could be reading
3554 : // beyond the end of the string, which could cause a memory fault.
3555 182578 : if (preload_characters == 3) preload_characters = 2;
3556 : } else {
3557 31711 : if (preload_characters > 2) preload_characters = 2;
3558 : }
3559 : } else {
3560 54552 : if (preload_characters > 1) preload_characters = 1;
3561 : }
3562 268841 : return preload_characters;
3563 : }
3564 :
3565 :
3566 : // This class is used when generating the alternatives in a choice node. It
3567 : // records the way the alternative is being code generated.
3568 : class AlternativeGeneration: public Malloced {
3569 : public:
3570 : AlternativeGeneration()
3571 : : possible_success(),
3572 : expects_preload(false),
3573 : after(),
3574 2716265 : quick_check_details() { }
3575 : Label possible_success;
3576 : bool expects_preload;
3577 : Label after;
3578 : QuickCheckDetails quick_check_details;
3579 : };
3580 :
3581 :
3582 : // Creates a list of AlternativeGenerations. If the list has a reasonable
3583 : // size then it is on the stack, otherwise the excess is on the heap.
3584 : class AlternativeGenerationList {
3585 : public:
3586 268841 : AlternativeGenerationList(int count, Zone* zone)
3587 2957251 : : alt_gens_(count, zone) {
3588 718386 : for (int i = 0; i < count && i < kAFew; i++) {
3589 718386 : alt_gens_.Add(a_few_alt_gens_ + i, zone);
3590 : }
3591 27855 : for (int i = kAFew; i < count; i++) {
3592 : alt_gens_.Add(new AlternativeGeneration(), zone);
3593 : }
3594 268841 : }
3595 268841 : ~AlternativeGenerationList() {
3596 593392 : for (int i = kAFew; i < alt_gens_.length(); i++) {
3597 380261 : delete alt_gens_[i];
3598 27855 : alt_gens_[i] = NULL;
3599 : }
3600 268841 : }
3601 :
3602 : AlternativeGeneration* at(int i) {
3603 3430284 : return alt_gens_[i];
3604 : }
3605 :
3606 : private:
3607 : static const int kAFew = 10;
3608 : ZoneList<AlternativeGeneration*> alt_gens_;
3609 : AlternativeGeneration a_few_alt_gens_[kAFew];
3610 : };
3611 :
3612 :
3613 : static const uc32 kRangeEndMarker = 0x110000;
3614 :
3615 : // The '2' variant is has inclusive from and exclusive to.
3616 : // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3617 : // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3618 : static const int kSpaceRanges[] = {
3619 : '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3620 : 0x1681, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3621 : 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3622 : static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3623 :
3624 : static const int kWordRanges[] = {
3625 : '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3626 : static const int kWordRangeCount = arraysize(kWordRanges);
3627 : static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3628 : static const int kDigitRangeCount = arraysize(kDigitRanges);
3629 : static const int kSurrogateRanges[] = {
3630 : kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3631 : static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3632 : static const int kLineTerminatorRanges[] = {
3633 : 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3634 : static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3635 :
3636 0 : void BoyerMoorePositionInfo::Set(int character) {
3637 50775 : SetInterval(Interval(character, character));
3638 0 : }
3639 :
3640 :
3641 544258 : void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3642 272129 : s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3643 272129 : w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3644 272129 : d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3645 : surrogate_ =
3646 272129 : AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3647 272129 : if (interval.to() - interval.from() >= kMapSize - 1) {
3648 16786 : if (map_count_ != kMapSize) {
3649 7757 : map_count_ = kMapSize;
3650 1000653 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3651 : }
3652 : return;
3653 : }
3654 783207 : for (int i = interval.from(); i <= interval.to(); i++) {
3655 856946 : int mod_character = (i & kMask);
3656 1713892 : if (!map_->at(mod_character)) {
3657 617231 : map_count_++;
3658 617231 : map_->at(mod_character) = true;
3659 : }
3660 856946 : if (map_count_ == kMapSize) return;
3661 : }
3662 : }
3663 :
3664 :
3665 0 : void BoyerMoorePositionInfo::SetAll() {
3666 6750 : s_ = w_ = d_ = kLatticeUnknown;
3667 6750 : if (map_count_ != kMapSize) {
3668 6255 : map_count_ = kMapSize;
3669 1601280 : for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3670 : }
3671 0 : }
3672 :
3673 :
3674 85830 : BoyerMooreLookahead::BoyerMooreLookahead(
3675 85830 : int length, RegExpCompiler* compiler, Zone* zone)
3676 : : length_(length),
3677 85830 : compiler_(compiler) {
3678 85830 : if (compiler->one_byte()) {
3679 14257 : max_char_ = String::kMaxOneByteCharCode;
3680 : } else {
3681 71573 : max_char_ = String::kMaxUtf16CodeUnit;
3682 : }
3683 85830 : bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3684 199264 : for (int i = 0; i < length; i++) {
3685 113434 : bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3686 : }
3687 85830 : }
3688 :
3689 :
3690 : // Find the longest range of lookahead that has the fewest number of different
3691 : // characters that can occur at a given position. Since we are optimizing two
3692 : // different parameters at once this is a tradeoff.
3693 85704 : bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3694 : int biggest_points = 0;
3695 : // If more than 32 characters out of 128 can occur it is unlikely that we can
3696 : // be lucky enough to step forwards much of the time.
3697 : const int kMaxMax = 32;
3698 342816 : for (int max_number_of_chars = 4;
3699 : max_number_of_chars < kMaxMax;
3700 : max_number_of_chars *= 2) {
3701 : biggest_points =
3702 257112 : FindBestInterval(max_number_of_chars, biggest_points, from, to);
3703 : }
3704 85704 : if (biggest_points == 0) return false;
3705 8516 : return true;
3706 : }
3707 :
3708 :
3709 : // Find the highest-points range between 0 and length_ where the character
3710 : // information is not too vague. 'Too vague' means that there are more than
3711 : // max_number_of_chars that can occur at this position. Calculates the number
3712 : // of points as the product of width-of-the-range and
3713 : // probability-of-finding-one-of-the-characters, where the probability is
3714 : // calculated using the frequency distribution of the sample subject string.
3715 257112 : int BoyerMooreLookahead::FindBestInterval(
3716 599221 : int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3717 : int biggest_points = old_biggest_points;
3718 : static const int kSize = RegExpMacroAssembler::kTableSize;
3719 755070 : for (int i = 0; i < length_; ) {
3720 369422 : while (i < length_ && Count(i) > max_number_of_chars) i++;
3721 275893 : if (i == length_) break;
3722 : int remembered_from = i;
3723 : bool union_map[kSize];
3724 30828288 : for (int j = 0; j < kSize; j++) union_map[j] = false;
3725 810239 : while (i < length_ && Count(i) <= max_number_of_chars) {
3726 35789780 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3727 35514474 : for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3728 275306 : i++;
3729 : }
3730 : int frequency = 0;
3731 30828288 : for (int j = 0; j < kSize; j++) {
3732 30828288 : if (union_map[j]) {
3733 : // Add 1 to the frequency to give a small per-character boost for
3734 : // the cases where our sampling is not good enough and many
3735 : // characters have a frequency of zero. This means the frequency
3736 : // can theoretically be up to 2*kSize though we treat it mostly as
3737 : // a fraction of kSize.
3738 1033927 : frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3739 : }
3740 : }
3741 : // We use the probability of skipping times the distance we are skipping to
3742 : // judge the effectiveness of this. Actually we have a cut-off: By
3743 : // dividing by 2 we switch off the skipping if the probability of skipping
3744 : // is less than 50%. This is because the multibyte mask-and-compare
3745 : // skipping in quickcheck is more likely to do well on this case.
3746 : bool in_quickcheck_range =
3747 244885 : ((i - remembered_from < 4) ||
3748 4039 : (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3749 : // Called 'probability' but it is only a rough estimate and can actually
3750 : // be outside the 0-kSize range.
3751 240846 : int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3752 240846 : int points = (i - remembered_from) * probability;
3753 240846 : if (points > biggest_points) {
3754 9049 : *from = remembered_from;
3755 9049 : *to = i - 1;
3756 : biggest_points = points;
3757 : }
3758 : }
3759 257112 : return biggest_points;
3760 : }
3761 :
3762 :
3763 : // Take all the characters that will not prevent a successful match if they
3764 : // occur in the subject string in the range between min_lookahead and
3765 : // max_lookahead (inclusive) measured from the current position. If the
3766 : // character at max_lookahead offset is not one of these characters, then we
3767 : // can safely skip forwards by the number of characters in the range.
3768 6394 : int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3769 : int max_lookahead,
3770 : Handle<ByteArray> boolean_skip_table) {
3771 : const int kSize = RegExpMacroAssembler::kTableSize;
3772 :
3773 : const int kSkipArrayEntry = 0;
3774 : const int kDontSkipArrayEntry = 1;
3775 :
3776 824826 : for (int i = 0; i < kSize; i++) {
3777 : boolean_skip_table->set(i, kSkipArrayEntry);
3778 : }
3779 6394 : int skip = max_lookahead + 1 - min_lookahead;
3780 :
3781 22253 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3782 2061670 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3783 2045811 : for (int j = 0; j < kSize; j++) {
3784 2029952 : if (map->at(j)) {
3785 : boolean_skip_table->set(j, kDontSkipArrayEntry);
3786 : }
3787 : }
3788 : }
3789 :
3790 6394 : return skip;
3791 : }
3792 :
3793 :
3794 : // See comment above on the implementation of GetSkipTable.
3795 92098 : void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3796 : const int kSize = RegExpMacroAssembler::kTableSize;
3797 :
3798 85704 : int min_lookahead = 0;
3799 85704 : int max_lookahead = 0;
3800 :
3801 165014 : if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3802 :
3803 : bool found_single_character = false;
3804 : int single_character = 0;
3805 14776 : for (int i = max_lookahead; i >= min_lookahead; i--) {
3806 597277 : BoyerMoorePositionInfo* map = bitmaps_->at(i);
3807 25308 : if (map->map_count() > 1 ||
3808 3995 : (found_single_character && map->map_count() != 0)) {
3809 : found_single_character = false;
3810 : break;
3811 : }
3812 565799 : for (int j = 0; j < kSize; j++) {
3813 571969 : if (map->at(j)) {
3814 : found_single_character = true;
3815 : single_character = j;
3816 : break;
3817 : }
3818 : }
3819 : }
3820 :
3821 8516 : int lookahead_width = max_lookahead + 1 - min_lookahead;
3822 :
3823 8516 : if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3824 : // The mask-compare can probably handle this better.
3825 : return;
3826 : }
3827 :
3828 6510 : if (found_single_character) {
3829 : Label cont, again;
3830 116 : masm->Bind(&again);
3831 116 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3832 116 : if (max_char_ > kSize) {
3833 : masm->CheckCharacterAfterAnd(single_character,
3834 : RegExpMacroAssembler::kTableMask,
3835 116 : &cont);
3836 : } else {
3837 0 : masm->CheckCharacter(single_character, &cont);
3838 : }
3839 116 : masm->AdvanceCurrentPosition(lookahead_width);
3840 116 : masm->GoTo(&again);
3841 116 : masm->Bind(&cont);
3842 : return;
3843 : }
3844 :
3845 : Factory* factory = masm->isolate()->factory();
3846 6394 : Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3847 : int skip_distance = GetSkipTable(
3848 6394 : min_lookahead, max_lookahead, boolean_skip_table);
3849 : DCHECK(skip_distance != 0);
3850 :
3851 : Label cont, again;
3852 6394 : masm->Bind(&again);
3853 6394 : masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3854 6394 : masm->CheckBitInTable(boolean_skip_table, &cont);
3855 6394 : masm->AdvanceCurrentPosition(skip_distance);
3856 6394 : masm->GoTo(&again);
3857 6394 : masm->Bind(&cont);
3858 : }
3859 :
3860 :
3861 : /* Code generation for choice nodes.
3862 : *
3863 : * We generate quick checks that do a mask and compare to eliminate a
3864 : * choice. If the quick check succeeds then it jumps to the continuation to
3865 : * do slow checks and check subsequent nodes. If it fails (the common case)
3866 : * it falls through to the next choice.
3867 : *
3868 : * Here is the desired flow graph. Nodes directly below each other imply
3869 : * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3870 : * 3 doesn't have a quick check so we have to call the slow check.
3871 : * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3872 : * regexp continuation is generated directly after the Sn node, up to the
3873 : * next GoTo if we decide to reuse some already generated code. Some
3874 : * nodes expect preload_characters to be preloaded into the current
3875 : * character register. R nodes do this preloading. Vertices are marked
3876 : * F for failures and S for success (possible success in the case of quick
3877 : * nodes). L, V, < and > are used as arrow heads.
3878 : *
3879 : * ----------> R
3880 : * |
3881 : * V
3882 : * Q1 -----> S1
3883 : * | S /
3884 : * F| /
3885 : * | F/
3886 : * | /
3887 : * | R
3888 : * | /
3889 : * V L
3890 : * Q2 -----> S2
3891 : * | S /
3892 : * F| /
3893 : * | F/
3894 : * | /
3895 : * | R
3896 : * | /
3897 : * V L
3898 : * S3
3899 : * |
3900 : * F|
3901 : * |
3902 : * R
3903 : * |
3904 : * backtrack V
3905 : * <----------Q4
3906 : * \ F |
3907 : * \ |S
3908 : * \ F V
3909 : * \-----S4
3910 : *
3911 : * For greedy loops we push the current position, then generate the code that
3912 : * eats the input specially in EmitGreedyLoop. The other choice (the
3913 : * continuation) is generated by the normal code in EmitChoices, and steps back
3914 : * in the input to the starting position when it fails to match. The loop code
3915 : * looks like this (U is the unwind code that steps back in the greedy loop).
3916 : *
3917 : * _____
3918 : * / \
3919 : * V |
3920 : * ----------> S1 |
3921 : * /| |
3922 : * / |S |
3923 : * F/ \_____/
3924 : * /
3925 : * |<-----
3926 : * | \
3927 : * V |S
3928 : * Q2 ---> U----->backtrack
3929 : * | F /
3930 : * S| /
3931 : * V F /
3932 : * S2--/
3933 : */
3934 :
3935 268841 : GreedyLoopState::GreedyLoopState(bool not_at_start) {
3936 0 : counter_backtrack_trace_.set_backtrack(&label_);
3937 268841 : if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3938 0 : }
3939 :
3940 :
3941 0 : void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3942 : #ifdef DEBUG
3943 : int choice_count = alternatives_->length();
3944 : for (int i = 0; i < choice_count - 1; i++) {
3945 : GuardedAlternative alternative = alternatives_->at(i);
3946 : ZoneList<Guard*>* guards = alternative.guards();
3947 : int guard_count = (guards == NULL) ? 0 : guards->length();
3948 : for (int j = 0; j < guard_count; j++) {
3949 : DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3950 : }
3951 : }
3952 : #endif
3953 0 : }
3954 :
3955 :
3956 450228 : void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3957 450228 : Trace* current_trace,
3958 : PreloadState* state) {
3959 268841 : if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3960 : // Save some time by looking at most one machine word ahead.
3961 : state->eats_at_least_ =
3962 : EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3963 544161 : current_trace->at_start() == Trace::FALSE_VALUE);
3964 : }
3965 : state->preload_characters_ =
3966 268841 : CalculatePreloadCharacters(compiler, state->eats_at_least_);
3967 :
3968 : state->preload_is_current_ =
3969 268841 : (current_trace->characters_preloaded() == state->preload_characters_);
3970 268841 : state->preload_has_checked_bounds_ = state->preload_is_current_;
3971 268841 : }
3972 :
3973 :
3974 1779184 : void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3975 715588 : int choice_count = alternatives_->length();
3976 :
3977 716662 : if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3978 1074 : alternatives_->at(0).node()->Emit(compiler, trace);
3979 1074 : return;
3980 : }
3981 :
3982 : AssertGuardsMentionRegisters(trace);
3983 :
3984 983355 : LimitResult limit_result = LimitVersions(compiler, trace);
3985 714514 : if (limit_result == DONE) return;
3986 : DCHECK(limit_result == CONTINUE);
3987 :
3988 : // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3989 : // other choice nodes we only flush if we are out of code size budget.
3990 273113 : if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3991 2136 : trace->Flush(compiler, this);
3992 2136 : return;
3993 : }
3994 :
3995 : RecursionCheck rc(compiler);
3996 :
3997 : PreloadState preload;
3998 : preload.init();
3999 : GreedyLoopState greedy_loop_state(not_at_start());
4000 :
4001 537682 : int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
4002 537682 : AlternativeGenerationList alt_gens(choice_count, zone());
4003 :
4004 268841 : if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4005 : trace = EmitGreedyLoop(compiler,
4006 : trace,
4007 : &alt_gens,
4008 : &preload,
4009 : &greedy_loop_state,
4010 16040 : text_length);
4011 : } else {
4012 : // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4013 : // match the traces produced pre-cleanup.
4014 : Label second_choice;
4015 252801 : compiler->macro_assembler()->Bind(&second_choice);
4016 :
4017 252801 : preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4018 :
4019 : EmitChoices(compiler,
4020 : &alt_gens,
4021 : 0,
4022 : trace,
4023 252801 : &preload);
4024 : }
4025 :
4026 : // At this point we need to generate slow checks for the alternatives where
4027 : // the quick check was inlined. We can recognize these because the associated
4028 : // label was bound.
4029 268841 : int new_flush_budget = trace->flush_budget() / choice_count;
4030 1015082 : for (int i = 0; i < choice_count; i++) {
4031 : AlternativeGeneration* alt_gen = alt_gens.at(i);
4032 746241 : Trace new_trace(*trace);
4033 : // If there are actions to be flushed we have to limit how many times
4034 : // they are flushed. Take the budget of the parent trace and distribute
4035 : // it fairly amongst the children.
4036 746241 : if (new_trace.actions() != NULL) {
4037 : new_trace.set_flush_budget(new_flush_budget);
4038 : }
4039 : bool next_expects_preload =
4040 1223641 : i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4041 : EmitOutOfLineContinuation(compiler,
4042 : &new_trace,
4043 746241 : alternatives_->at(i),
4044 : alt_gen,
4045 : preload.preload_characters_,
4046 1492482 : next_expects_preload);
4047 : }
4048 : }
4049 :
4050 :
4051 16040 : Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4052 16040 : Trace* trace,
4053 : AlternativeGenerationList* alt_gens,
4054 : PreloadState* preload,
4055 : GreedyLoopState* greedy_loop_state,
4056 16040 : int text_length) {
4057 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4058 : // Here we have special handling for greedy loops containing only text nodes
4059 : // and other simple nodes. These are handled by pushing the current
4060 : // position on the stack and then incrementing the current position each
4061 : // time around the switch. On backtrack we decrement the current position
4062 : // and check it against the pushed value. This avoids pushing backtrack
4063 : // information for each iteration of the loop, which could take up a lot of
4064 : // space.
4065 : DCHECK(trace->stop_node() == NULL);
4066 16040 : macro_assembler->PushCurrentPosition();
4067 : Label greedy_match_failed;
4068 16040 : Trace greedy_match_trace;
4069 16040 : if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4070 : greedy_match_trace.set_backtrack(&greedy_match_failed);
4071 : Label loop_label;
4072 16040 : macro_assembler->Bind(&loop_label);
4073 16040 : greedy_match_trace.set_stop_node(this);
4074 : greedy_match_trace.set_loop_label(&loop_label);
4075 32080 : alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4076 16040 : macro_assembler->Bind(&greedy_match_failed);
4077 :
4078 : Label second_choice; // For use in greedy matches.
4079 16040 : macro_assembler->Bind(&second_choice);
4080 :
4081 16040 : Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4082 :
4083 : EmitChoices(compiler,
4084 : alt_gens,
4085 : 1,
4086 : new_trace,
4087 16040 : preload);
4088 :
4089 16040 : macro_assembler->Bind(greedy_loop_state->label());
4090 : // If we have unwound to the bottom then backtrack.
4091 32080 : macro_assembler->CheckGreedyLoop(trace->backtrack());
4092 : // Otherwise try the second priority at an earlier position.
4093 16040 : macro_assembler->AdvanceCurrentPosition(-text_length);
4094 16040 : macro_assembler->GoTo(&second_choice);
4095 16040 : return new_trace;
4096 : }
4097 :
4098 340255 : int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4099 : Trace* trace) {
4100 : int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4101 252801 : if (alternatives_->length() != 2) return eats_at_least;
4102 :
4103 207522 : GuardedAlternative alt1 = alternatives_->at(1);
4104 207522 : if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4105 : return eats_at_least;
4106 : }
4107 : RegExpNode* eats_anything_node = alt1.node();
4108 289943 : if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4109 : return eats_at_least;
4110 : }
4111 :
4112 : // Really we should be creating a new trace when we execute this function,
4113 : // but there is no need, because the code it generates cannot backtrack, and
4114 : // we always arrive here with a trivial trace (since it's the entry to a
4115 : // loop. That also implies that there are no preloaded characters, which is
4116 : // good, because it means we won't be violating any assumptions by
4117 : // overwriting those characters with new load instructions.
4118 : DCHECK(trace->is_trivial());
4119 :
4120 87454 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4121 : Isolate* isolate = macro_assembler->isolate();
4122 : // At this point we know that we are at a non-greedy loop that will eat
4123 : // any character one at a time. Any non-anchored regexp has such a
4124 : // loop prepended to it in order to find where it starts. We look for
4125 : // a pattern of the form ...abc... where we can look 6 characters ahead
4126 : // and step forwards 3 if the character is not one of abc. Abc need
4127 : // not be atoms, they can be any reasonably limited character class or
4128 : // small alternation.
4129 : BoyerMooreLookahead* bm = bm_info(false);
4130 87454 : if (bm == NULL) {
4131 : eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4132 : EatsAtLeast(kMaxLookaheadForBoyerMoore,
4133 : kRecursionBudget,
4134 87454 : false));
4135 87454 : if (eats_at_least >= 1) {
4136 : bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4137 : compiler,
4138 85704 : zone());
4139 171408 : GuardedAlternative alt0 = alternatives_->at(0);
4140 85704 : alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4141 : }
4142 : }
4143 87454 : if (bm != NULL) {
4144 85704 : bm->EmitSkipInstructions(macro_assembler);
4145 : }
4146 87454 : return eats_at_least;
4147 : }
4148 :
4149 :
4150 999042 : void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4151 : AlternativeGenerationList* alt_gens,
4152 : int first_choice,
4153 268867 : Trace* trace,
4154 : PreloadState* preload) {
4155 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4156 268841 : SetUpPreLoad(compiler, trace, preload);
4157 :
4158 : // For now we just call all choices one after the other. The idea ultimately
4159 : // is to use the Dispatch table to try only the relevant ones.
4160 268841 : int choice_count = alternatives_->length();
4161 :
4162 268841 : int new_flush_budget = trace->flush_budget() / choice_count;
4163 :
4164 999042 : for (int i = first_choice; i < choice_count; i++) {
4165 730201 : bool is_last = i == choice_count - 1;
4166 730201 : bool fall_through_on_failure = !is_last;
4167 1460402 : GuardedAlternative alternative = alternatives_->at(i);
4168 : AlternativeGeneration* alt_gen = alt_gens->at(i);
4169 1206456 : alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4170 : ZoneList<Guard*>* guards = alternative.guards();
4171 730201 : int guard_count = (guards == NULL) ? 0 : guards->length();
4172 730201 : Trace new_trace(*trace);
4173 : new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4174 : preload->preload_characters_ :
4175 730201 : 0);
4176 730201 : if (preload->preload_has_checked_bounds_) {
4177 499468 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4178 : }
4179 : new_trace.quick_check_performed()->Clear();
4180 730201 : if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4181 730201 : if (!is_last) {
4182 461360 : new_trace.set_backtrack(&alt_gen->after);
4183 : }
4184 730201 : alt_gen->expects_preload = preload->preload_is_current_;
4185 : bool generate_full_check_inline = false;
4186 1324505 : if (compiler->optimize() &&
4187 1321004 : try_to_emit_quick_check_for_alternative(i == 0) &&
4188 : alternative.node()->EmitQuickCheck(
4189 : compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4190 : &alt_gen->possible_success, &alt_gen->quick_check_details,
4191 590803 : fall_through_on_failure)) {
4192 : // Quick check was generated for this choice.
4193 253946 : preload->preload_is_current_ = true;
4194 253946 : preload->preload_has_checked_bounds_ = true;
4195 : // If we generated the quick check to fall through on possible success,
4196 : // we now need to generate the full check inline.
4197 253946 : if (!fall_through_on_failure) {
4198 44533 : macro_assembler->Bind(&alt_gen->possible_success);
4199 : new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4200 44533 : new_trace.set_characters_preloaded(preload->preload_characters_);
4201 : new_trace.set_bound_checked_up_to(preload->preload_characters_);
4202 : generate_full_check_inline = true;
4203 : }
4204 476255 : } else if (alt_gen->quick_check_details.cannot_match()) {
4205 138 : if (!fall_through_on_failure) {
4206 52 : macro_assembler->GoTo(trace->backtrack());
4207 : }
4208 138 : continue;
4209 : } else {
4210 : // No quick check was generated. Put the full code here.
4211 : // If this is not the first choice then there could be slow checks from
4212 : // previous cases that go here when they fail. There's no reason to
4213 : // insist that they preload characters since the slow check we are about
4214 : // to generate probably can't use it.
4215 476117 : if (i != first_choice) {
4216 282576 : alt_gen->expects_preload = false;
4217 : new_trace.InvalidateCurrentCharacter();
4218 : }
4219 : generate_full_check_inline = true;
4220 : }
4221 730063 : if (generate_full_check_inline) {
4222 520650 : if (new_trace.actions() != NULL) {
4223 : new_trace.set_flush_budget(new_flush_budget);
4224 : }
4225 6615 : for (int j = 0; j < guard_count; j++) {
4226 6615 : GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4227 : }
4228 520650 : alternative.node()->Emit(compiler, &new_trace);
4229 520650 : preload->preload_is_current_ = false;
4230 : }
4231 730063 : macro_assembler->Bind(&alt_gen->after);
4232 : }
4233 268841 : }
4234 :
4235 :
4236 955654 : void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4237 176806 : Trace* trace,
4238 : GuardedAlternative alternative,
4239 : AlternativeGeneration* alt_gen,
4240 : int preload_characters,
4241 : bool next_expects_preload) {
4242 1283069 : if (!alt_gen->possible_success.is_linked()) return;
4243 :
4244 : RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4245 209413 : macro_assembler->Bind(&alt_gen->possible_success);
4246 209413 : Trace out_of_line_trace(*trace);
4247 : out_of_line_trace.set_characters_preloaded(preload_characters);
4248 : out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4249 209413 : if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4250 209413 : ZoneList<Guard*>* guards = alternative.guards();
4251 209413 : int guard_count = (guards == NULL) ? 0 : guards->length();
4252 209413 : if (next_expects_preload) {
4253 : Label reload_current_char;
4254 : out_of_line_trace.set_backtrack(&reload_current_char);
4255 177845 : for (int j = 0; j < guard_count; j++) {
4256 1039 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4257 : }
4258 176806 : alternative.node()->Emit(compiler, &out_of_line_trace);
4259 176806 : macro_assembler->Bind(&reload_current_char);
4260 : // Reload the current character, since the next quick check expects that.
4261 : // We don't need to check bounds here because we only get into this
4262 : // code through a quick check which already did the checked load.
4263 : macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4264 : NULL,
4265 : false,
4266 353612 : preload_characters);
4267 176806 : macro_assembler->GoTo(&(alt_gen->after));
4268 : } else {
4269 32607 : out_of_line_trace.set_backtrack(&(alt_gen->after));
4270 32703 : for (int j = 0; j < guard_count; j++) {
4271 96 : GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4272 : }
4273 32607 : alternative.node()->Emit(compiler, &out_of_line_trace);
4274 : }
4275 : }
4276 :
4277 :
4278 577834 : void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4279 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4280 576752 : LimitResult limit_result = LimitVersions(compiler, trace);
4281 576752 : if (limit_result == DONE) return;
4282 : DCHECK(limit_result == CONTINUE);
4283 :
4284 : RecursionCheck rc(compiler);
4285 :
4286 337669 : switch (action_type_) {
4287 : case STORE_POSITION: {
4288 : Trace::DeferredCapture
4289 : new_capture(data_.u_position_register.reg,
4290 : data_.u_position_register.is_capture,
4291 302875 : trace);
4292 302875 : Trace new_trace = *trace;
4293 : new_trace.add_action(&new_capture);
4294 328091 : on_success()->Emit(compiler, &new_trace);
4295 : break;
4296 : }
4297 : case INCREMENT_REGISTER: {
4298 : Trace::DeferredIncrementRegister
4299 7305 : new_increment(data_.u_increment_register.reg);
4300 7305 : Trace new_trace = *trace;
4301 : new_trace.add_action(&new_increment);
4302 7305 : on_success()->Emit(compiler, &new_trace);
4303 : break;
4304 : }
4305 : case SET_REGISTER: {
4306 : Trace::DeferredSetRegister
4307 6226 : new_set(data_.u_store_register.reg, data_.u_store_register.value);
4308 6226 : Trace new_trace = *trace;
4309 : new_trace.add_action(&new_set);
4310 6226 : on_success()->Emit(compiler, &new_trace);
4311 : break;
4312 : }
4313 : case CLEAR_CAPTURES: {
4314 : Trace::DeferredClearCaptures
4315 : new_capture(Interval(data_.u_clear_captures.range_from,
4316 3897 : data_.u_clear_captures.range_to));
4317 3897 : Trace new_trace = *trace;
4318 : new_trace.add_action(&new_capture);
4319 3897 : on_success()->Emit(compiler, &new_trace);
4320 : break;
4321 : }
4322 : case BEGIN_SUBMATCH:
4323 11291 : if (!trace->is_trivial()) {
4324 5979 : trace->Flush(compiler, this);
4325 : } else {
4326 : assembler->WriteCurrentPositionToRegister(
4327 5312 : data_.u_submatch.current_position_register, 0);
4328 : assembler->WriteStackPointerToRegister(
4329 5312 : data_.u_submatch.stack_pointer_register);
4330 5312 : on_success()->Emit(compiler, trace);
4331 : }
4332 : break;
4333 : case EMPTY_MATCH_CHECK: {
4334 1147 : int start_pos_reg = data_.u_empty_match_check.start_register;
4335 1147 : int stored_pos = 0;
4336 1147 : int rep_reg = data_.u_empty_match_check.repetition_register;
4337 1147 : bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4338 1147 : bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4339 1361 : if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4340 : // If we know we haven't advanced and there is no minimum we
4341 : // can just backtrack immediately.
4342 182 : assembler->GoTo(trace->backtrack());
4343 1459 : } else if (know_dist && stored_pos < trace->cp_offset()) {
4344 : // If we know we've advanced we can generate the continuation
4345 : // immediately.
4346 298 : on_success()->Emit(compiler, trace);
4347 758 : } else if (!trace->is_trivial()) {
4348 384 : trace->Flush(compiler, this);
4349 : } else {
4350 : Label skip_empty_check;
4351 : // If we have a minimum number of repetitions we check the current
4352 : // number first and skip the empty check if it's not enough.
4353 374 : if (has_minimum) {
4354 251 : int limit = data_.u_empty_match_check.repetition_limit;
4355 251 : assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4356 : }
4357 : // If the match is empty we bail out, otherwise we fall through
4358 : // to the on-success continuation.
4359 : assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4360 748 : trace->backtrack());
4361 374 : assembler->Bind(&skip_empty_check);
4362 374 : on_success()->Emit(compiler, trace);
4363 : }
4364 : break;
4365 : }
4366 : case POSITIVE_SUBMATCH_SUCCESS: {
4367 4928 : if (!trace->is_trivial()) {
4368 3124 : trace->Flush(compiler, this);
4369 3124 : return;
4370 : }
4371 : assembler->ReadCurrentPositionFromRegister(
4372 1804 : data_.u_submatch.current_position_register);
4373 : assembler->ReadStackPointerFromRegister(
4374 1804 : data_.u_submatch.stack_pointer_register);
4375 1804 : int clear_register_count = data_.u_submatch.clear_register_count;
4376 1804 : if (clear_register_count == 0) {
4377 1206 : on_success()->Emit(compiler, trace);
4378 1206 : return;
4379 : }
4380 598 : int clear_registers_from = data_.u_submatch.clear_register_from;
4381 : Label clear_registers_backtrack;
4382 598 : Trace new_trace = *trace;
4383 : new_trace.set_backtrack(&clear_registers_backtrack);
4384 598 : on_success()->Emit(compiler, &new_trace);
4385 :
4386 598 : assembler->Bind(&clear_registers_backtrack);
4387 598 : int clear_registers_to = clear_registers_from + clear_register_count - 1;
4388 598 : assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4389 :
4390 : DCHECK(trace->backtrack() == NULL);
4391 598 : assembler->Backtrack();
4392 : return;
4393 : }
4394 : default:
4395 0 : UNREACHABLE();
4396 : }
4397 : }
4398 :
4399 :
4400 14675 : void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4401 : RegExpMacroAssembler* assembler = compiler->macro_assembler();
4402 6185 : if (!trace->is_trivial()) {
4403 2953 : trace->Flush(compiler, this);
4404 2953 : return;
4405 : }
4406 :
4407 3232 : LimitResult limit_result = LimitVersions(compiler, trace);
4408 3232 : if (limit_result == DONE) return;
4409 : DCHECK(limit_result == CONTINUE);
4410 :
4411 : RecursionCheck rc(compiler);
4412 :
4413 : DCHECK_EQ(start_reg_ + 1, end_reg_);
4414 2987 : if (compiler->ignore_case()) {
4415 : assembler->CheckNotBackReferenceIgnoreCase(
4416 6738 : start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4417 : } else {
4418 : assembler->CheckNotBackReference(start_reg_, read_backward(),
4419 1482 : trace->backtrack());
4420 : }
4421 : // We are going to advance backward, so we may end up at the start.
4422 2987 : if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4423 :
4424 : // Check that the back reference does not end inside a surrogate pair.
4425 3161 : if (compiler->unicode() && !compiler->one_byte()) {
4426 96 : assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4427 : }
4428 2987 : on_success()->Emit(compiler, trace);
4429 : }
4430 :
4431 :
4432 : // -------------------------------------------------------------------
4433 : // Dot/dotty output
4434 :
4435 :
4436 : #ifdef DEBUG
4437 :
4438 :
4439 : class DotPrinter: public NodeVisitor {
4440 : public:
4441 : DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4442 : : os_(os),
4443 : ignore_case_(ignore_case) {}
4444 : void PrintNode(const char* label, RegExpNode* node);
4445 : void Visit(RegExpNode* node);
4446 : void PrintAttributes(RegExpNode* from);
4447 : void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4448 : #define DECLARE_VISIT(Type) \
4449 : virtual void Visit##Type(Type##Node* that);
4450 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4451 : #undef DECLARE_VISIT
4452 : private:
4453 : std::ostream& os_;
4454 : bool ignore_case_;
4455 : };
4456 :
4457 :
4458 : void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4459 : os_ << "digraph G {\n graph [label=\"";
4460 : for (int i = 0; label[i]; i++) {
4461 : switch (label[i]) {
4462 : case '\\':
4463 : os_ << "\\\\";
4464 : break;
4465 : case '"':
4466 : os_ << "\"";
4467 : break;
4468 : default:
4469 : os_ << label[i];
4470 : break;
4471 : }
4472 : }
4473 : os_ << "\"];\n";
4474 : Visit(node);
4475 : os_ << "}" << std::endl;
4476 : }
4477 :
4478 :
4479 : void DotPrinter::Visit(RegExpNode* node) {
4480 : if (node->info()->visited) return;
4481 : node->info()->visited = true;
4482 : node->Accept(this);
4483 : }
4484 :
4485 :
4486 : void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4487 : os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4488 : Visit(on_failure);
4489 : }
4490 :
4491 :
4492 : class TableEntryBodyPrinter {
4493 : public:
4494 : TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4495 : : os_(os),
4496 : choice_(choice) {}
4497 : void Call(uc16 from, DispatchTable::Entry entry) {
4498 : OutSet* out_set = entry.out_set();
4499 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4500 : if (out_set->Get(i)) {
4501 : os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4502 : << choice()->alternatives()->at(i).node() << ";\n";
4503 : }
4504 : }
4505 : }
4506 : private:
4507 : ChoiceNode* choice() { return choice_; }
4508 : std::ostream& os_;
4509 : ChoiceNode* choice_;
4510 : };
4511 :
4512 :
4513 : class TableEntryHeaderPrinter {
4514 : public:
4515 : explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4516 : : first_(true),
4517 : os_(os) {}
4518 : void Call(uc16 from, DispatchTable::Entry entry) {
4519 : if (first_) {
4520 : first_ = false;
4521 : } else {
4522 : os_ << "|";
4523 : }
4524 : os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4525 : OutSet* out_set = entry.out_set();
4526 : int priority = 0;
4527 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4528 : if (out_set->Get(i)) {
4529 : if (priority > 0) os_ << "|";
4530 : os_ << "<s" << from << "o" << i << "> " << priority;
4531 : priority++;
4532 : }
4533 : }
4534 : os_ << "}}";
4535 : }
4536 :
4537 : private:
4538 : bool first_;
4539 : std::ostream& os_;
4540 : };
4541 :
4542 :
// Helper that prints "|"-separated "{...}" record fields to a stream.
class AttributePrinter {
 public:
  explicit AttributePrinter(std::ostream& os)  // NOLINT
      : os_(os), first_(true) {}

  // Prints "|" before every field except the first one.
  void PrintSeparator() {
    if (!first_) os_ << "|";
    first_ = false;
  }

  // Prints "{name}" if |value| is set; prints nothing otherwise.
  void PrintBit(const char* name, bool value) {
    if (value) {
      PrintSeparator();
      os_ << "{" << name << "}";
    }
  }

  // Prints "{name|value}" if |value| is non-negative; nothing otherwise.
  void PrintPositive(const char* name, int value) {
    if (value >= 0) {
      PrintSeparator();
      os_ << "{" << name << "|" << value << "}";
    }
  }

 private:
  std::ostream& os_;
  bool first_;
};
4570 :
4571 :
4572 : void DotPrinter::PrintAttributes(RegExpNode* that) {
4573 : os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4574 : << "margin=0.1, fontsize=10, label=\"{";
4575 : AttributePrinter printer(os_);
4576 : NodeInfo* info = that->info();
4577 : printer.PrintBit("NI", info->follows_newline_interest);
4578 : printer.PrintBit("WI", info->follows_word_interest);
4579 : printer.PrintBit("SI", info->follows_start_interest);
4580 : Label* label = that->label();
4581 : if (label->is_bound())
4582 : printer.PrintPositive("@", label->pos());
4583 : os_ << "}\"];\n"
4584 : << " a" << that << " -> n" << that
4585 : << " [style=dashed, color=grey, arrowhead=none];\n";
4586 : }
4587 :
4588 :
4589 : static const bool kPrintDispatchTable = false;
4590 : void DotPrinter::VisitChoice(ChoiceNode* that) {
4591 : if (kPrintDispatchTable) {
4592 : os_ << " n" << that << " [shape=Mrecord, label=\"";
4593 : TableEntryHeaderPrinter header_printer(os_);
4594 : that->GetTable(ignore_case_)->ForEach(&header_printer);
4595 : os_ << "\"]\n";
4596 : PrintAttributes(that);
4597 : TableEntryBodyPrinter body_printer(os_, that);
4598 : that->GetTable(ignore_case_)->ForEach(&body_printer);
4599 : } else {
4600 : os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4601 : for (int i = 0; i < that->alternatives()->length(); i++) {
4602 : GuardedAlternative alt = that->alternatives()->at(i);
4603 : os_ << " n" << that << " -> n" << alt.node();
4604 : }
4605 : }
4606 : for (int i = 0; i < that->alternatives()->length(); i++) {
4607 : GuardedAlternative alt = that->alternatives()->at(i);
4608 : alt.node()->Accept(this);
4609 : }
4610 : }
4611 :
4612 :
4613 : void DotPrinter::VisitText(TextNode* that) {
4614 : Zone* zone = that->zone();
4615 : os_ << " n" << that << " [label=\"";
4616 : for (int i = 0; i < that->elements()->length(); i++) {
4617 : if (i > 0) os_ << " ";
4618 : TextElement elm = that->elements()->at(i);
4619 : switch (elm.text_type()) {
4620 : case TextElement::ATOM: {
4621 : Vector<const uc16> data = elm.atom()->data();
4622 : for (int i = 0; i < data.length(); i++) {
4623 : os_ << static_cast<char>(data[i]);
4624 : }
4625 : break;
4626 : }
4627 : case TextElement::CHAR_CLASS: {
4628 : RegExpCharacterClass* node = elm.char_class();
4629 : os_ << "[";
4630 : if (node->is_negated()) os_ << "^";
4631 : for (int j = 0; j < node->ranges(zone)->length(); j++) {
4632 : CharacterRange range = node->ranges(zone)->at(j);
4633 : os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4634 : }
4635 : os_ << "]";
4636 : break;
4637 : }
4638 : default:
4639 : UNREACHABLE();
4640 : }
4641 : }
4642 : os_ << "\", shape=box, peripheries=2];\n";
4643 : PrintAttributes(that);
4644 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4645 : Visit(that->on_success());
4646 : }
4647 :
4648 :
4649 : void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4650 : os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4651 : << that->end_register() << "\", shape=doubleoctagon];\n";
4652 : PrintAttributes(that);
4653 : os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4654 : Visit(that->on_success());
4655 : }
4656 :
4657 :
4658 : void DotPrinter::VisitEnd(EndNode* that) {
4659 : os_ << " n" << that << " [style=bold, shape=point];\n";
4660 : PrintAttributes(that);
4661 : }
4662 :
4663 :
4664 : void DotPrinter::VisitAssertion(AssertionNode* that) {
4665 : os_ << " n" << that << " [";
4666 : switch (that->assertion_type()) {
4667 : case AssertionNode::AT_END:
4668 : os_ << "label=\"$\", shape=septagon";
4669 : break;
4670 : case AssertionNode::AT_START:
4671 : os_ << "label=\"^\", shape=septagon";
4672 : break;
4673 : case AssertionNode::AT_BOUNDARY:
4674 : os_ << "label=\"\\b\", shape=septagon";
4675 : break;
4676 : case AssertionNode::AT_NON_BOUNDARY:
4677 : os_ << "label=\"\\B\", shape=septagon";
4678 : break;
4679 : case AssertionNode::AFTER_NEWLINE:
4680 : os_ << "label=\"(?<=\\n)\", shape=septagon";
4681 : break;
4682 : }
4683 : os_ << "];\n";
4684 : PrintAttributes(that);
4685 : RegExpNode* successor = that->on_success();
4686 : os_ << " n" << that << " -> n" << successor << ";\n";
4687 : Visit(successor);
4688 : }
4689 :
4690 :
4691 : void DotPrinter::VisitAction(ActionNode* that) {
4692 : os_ << " n" << that << " [";
4693 : switch (that->action_type_) {
4694 : case ActionNode::SET_REGISTER:
4695 : os_ << "label=\"$" << that->data_.u_store_register.reg
4696 : << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4697 : break;
4698 : case ActionNode::INCREMENT_REGISTER:
4699 : os_ << "label=\"$" << that->data_.u_increment_register.reg
4700 : << "++\", shape=octagon";
4701 : break;
4702 : case ActionNode::STORE_POSITION:
4703 : os_ << "label=\"$" << that->data_.u_position_register.reg
4704 : << ":=$pos\", shape=octagon";
4705 : break;
4706 : case ActionNode::BEGIN_SUBMATCH:
4707 : os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4708 : << ":=$pos,begin\", shape=septagon";
4709 : break;
4710 : case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4711 : os_ << "label=\"escape\", shape=septagon";
4712 : break;
4713 : case ActionNode::EMPTY_MATCH_CHECK:
4714 : os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4715 : << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4716 : << "<" << that->data_.u_empty_match_check.repetition_limit
4717 : << "?\", shape=septagon";
4718 : break;
4719 : case ActionNode::CLEAR_CAPTURES: {
4720 : os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4721 : << " to $" << that->data_.u_clear_captures.range_to
4722 : << "\", shape=septagon";
4723 : break;
4724 : }
4725 : }
4726 : os_ << "];\n";
4727 : PrintAttributes(that);
4728 : RegExpNode* successor = that->on_success();
4729 : os_ << " n" << that << " -> n" << successor << ";\n";
4730 : Visit(successor);
4731 : }
4732 :
4733 :
4734 : class DispatchTableDumper {
4735 : public:
4736 : explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4737 : void Call(uc16 key, DispatchTable::Entry entry);
4738 : private:
4739 : std::ostream& os_;
4740 : };
4741 :
4742 :
4743 : void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4744 : os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4745 : OutSet* set = entry.out_set();
4746 : bool first = true;
4747 : for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4748 : if (set->Get(i)) {
4749 : if (first) {
4750 : first = false;
4751 : } else {
4752 : os_ << ", ";
4753 : }
4754 : os_ << i;
4755 : }
4756 : }
4757 : os_ << "}\n";
4758 : }
4759 :
4760 :
4761 : void DispatchTable::Dump() {
4762 : OFStream os(stderr);
4763 : DispatchTableDumper dumper(os);
4764 : tree()->ForEach(&dumper);
4765 : }
4766 :
4767 :
4768 : void RegExpEngine::DotPrint(const char* label,
4769 : RegExpNode* node,
4770 : bool ignore_case) {
4771 : OFStream os(stdout);
4772 : DotPrinter printer(os, ignore_case);
4773 : printer.PrintNode(label, node);
4774 : }
4775 :
4776 :
4777 : #endif // DEBUG
4778 :
4779 :
4780 : // -------------------------------------------------------------------
4781 : // Tree to graph conversion
4782 :
4783 4873791 : RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4784 : RegExpNode* on_success) {
4785 : ZoneList<TextElement>* elms =
4786 1624597 : new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4787 3249194 : elms->Add(TextElement::Atom(this), compiler->zone());
4788 : return new (compiler->zone())
4789 1624597 : TextNode(elms, compiler->read_backward(), on_success);
4790 : }
4791 :
4792 :
4793 5678 : RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4794 : RegExpNode* on_success) {
4795 : return new (compiler->zone())
4796 5678 : TextNode(elements(), compiler->read_backward(), on_success);
4797 : }
4798 :
4799 :
4800 637490 : static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4801 : const int* special_class,
4802 : int length) {
4803 637490 : length--; // Remove final marker.
4804 : DCHECK(special_class[length] == kRangeEndMarker);
4805 : DCHECK(ranges->length() != 0);
4806 : DCHECK(length != 0);
4807 : DCHECK(special_class[0] != 0);
4808 637490 : if (ranges->length() != (length >> 1) + 1) {
4809 : return false;
4810 : }
4811 17224 : CharacterRange range = ranges->at(0);
4812 17224 : if (range.from() != 0) {
4813 : return false;
4814 : }
4815 32390 : for (int i = 0; i < length; i += 2) {
4816 33087 : if (special_class[i] != (range.to() + 1)) {
4817 : return false;
4818 : }
4819 64780 : range = ranges->at((i >> 1) + 1);
4820 32390 : if (special_class[i+1] != range.from()) {
4821 : return false;
4822 : }
4823 : }
4824 10289 : if (range.to() != String::kMaxCodePoint) {
4825 : return false;
4826 : }
4827 10289 : return true;
4828 : }
4829 :
4830 :
4831 633818 : static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4832 : const int* special_class,
4833 : int length) {
4834 633818 : length--; // Remove final marker.
4835 : DCHECK(special_class[length] == kRangeEndMarker);
4836 633818 : if (ranges->length() * 2 != length) {
4837 : return false;
4838 : }
4839 37008 : for (int i = 0; i < length; i += 2) {
4840 82576 : CharacterRange range = ranges->at(i >> 1);
4841 78309 : if (range.from() != special_class[i] ||
4842 37021 : range.to() != special_class[i + 1] - 1) {
4843 : return false;
4844 : }
4845 : }
4846 : return true;
4847 : }
4848 :
4849 :
4850 300739 : bool RegExpCharacterClass::is_standard(Zone* zone) {
4851 : // TODO(lrn): Remove need for this function, by not throwing away information
4852 : // along the way.
4853 300739 : if (is_negated()) {
4854 : return false;
4855 : }
4856 293942 : if (set_.is_standard()) {
4857 : return true;
4858 : }
4859 219358 : if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4860 : set_.set_standard_set_type('s');
4861 1970 : return true;
4862 : }
4863 217388 : if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4864 : set_.set_standard_set_type('S');
4865 198 : return true;
4866 : }
4867 217190 : if (CompareInverseRanges(set_.ranges(zone),
4868 : kLineTerminatorRanges,
4869 217190 : kLineTerminatorRangeCount)) {
4870 : set_.set_standard_set_type('.');
4871 9954 : return true;
4872 : }
4873 207236 : if (CompareRanges(set_.ranges(zone),
4874 : kLineTerminatorRanges,
4875 207236 : kLineTerminatorRangeCount)) {
4876 : set_.set_standard_set_type('n');
4877 12 : return true;
4878 : }
4879 207224 : if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4880 : set_.set_standard_set_type('w');
4881 4312 : return true;
4882 : }
4883 202912 : if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4884 : set_.set_standard_set_type('W');
4885 137 : return true;
4886 : }
4887 : return false;
4888 : }
4889 :
4890 :
// Splits the ranges in |base| into the four category lists held by this
// object (bmp_, lead_surrogates_, trail_surrogates_, non_bmp_). All four
// start out as nullptr and are only populated through Call(), which is
// driven by table_.ForEach(this) at the end of the constructor.
//   zone: passed to table_ and used for every AddRange call below.
//   base: the character ranges to categorize; each is added tagged kBase.
4891 3163 : UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4892 : ZoneList<CharacterRange>* base)
4893 : : zone_(zone),
4894 : table_(zone),
4895 : bmp_(nullptr),
4896 : lead_surrogates_(nullptr),
4897 : trail_surrogates_(nullptr),
4898 6326 : non_bmp_(nullptr) {
4899 : // The unicode range splitter categorizes given character ranges into:
4900 : // - Code points from the BMP representable by one code unit.
4901 : // - Code points outside the BMP that need to be split into surrogate pairs.
4902 : // - Lone lead surrogates.
4903 : // - Lone trail surrogates.
4904 : // Lone surrogates are valid code points, even though no actual characters.
4905 : // They require special matching to make sure we do not split surrogate pairs.
4906 : // We use the dispatch table to accomplish this. The base range is split up
4907 : // by the table by the overlay ranges, and the Call callback is used to
4908 : // filter and collect ranges for each category.
4909 176024 : for (int i = 0; i < base->length(); i++) {
4910 257710 : table_.AddRange(base->at(i), kBase, zone_);
4911 : }
4912 : // Add overlay ranges.
// kBmpCodePoints is added twice: once below and once above the surrogate
// block, so the surrogate ranges in between get their own tags.
4913 : table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4914 3163 : kBmpCodePoints, zone_);
4915 : table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4916 3163 : kLeadSurrogates, zone_);
4917 : table_.AddRange(
4918 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4919 3163 : kTrailSurrogates, zone_);
4920 : table_.AddRange(
4921 : CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4922 3163 : kBmpCodePoints, zone_);
4923 : table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4924 3163 : kNonBmpCodePoints, zone_);
4925 : table_.ForEach(this);
4926 3163 : }
4927 :
4928 :
// Handles one dispatch table entry. Entries whose out set lacks kBase are
// skipped; the constructor tags only the caller-supplied ranges with kBase.
// Otherwise the entry's [from(), to()] range is appended to the list that
// matches its overlay tag, creating that list on first use.
// Note: the |from| parameter is not read here; entry.from() is used instead.
4929 182377 : void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4930 182377 : OutSet* outset = entry.out_set();
4931 364754 : if (!outset->Get(kBase)) return;
4932 : ZoneList<CharacterRange>** target = NULL;
4933 90161 : if (outset->Get(kBmpCodePoints)) {
4934 62051 : target = &bmp_;
4935 28110 : } else if (outset->Get(kLeadSurrogates)) {
4936 1538 : target = &lead_surrogates_;
4937 26572 : } else if (outset->Get(kTrailSurrogates)) {
4938 1538 : target = &trail_surrogates_;
4939 : } else {
// Only the non-BMP tag remains; this is checked in debug builds only.
4940 : DCHECK(outset->Get(kNonBmpCodePoints));
4941 25034 : target = &non_bmp_;
4942 : }
// Allocate the category list lazily so unused categories stay null.
4943 97980 : if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4944 180322 : (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4945 : }
4946 :
4947 :
// Adds one alternative to |result| that matches the splitter's BMP ranges
// and continues at |on_success|, read in the compiler's current direction.
// Does nothing when the splitter collected no BMP ranges (bmp() is null).
4948 8424 : void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4949 3156 : RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4950 : ZoneList<CharacterRange>* bmp = splitter->bmp();
4951 6312 : if (bmp == nullptr) return;
4952 : result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4953 5268 : compiler->zone(), bmp, compiler->read_backward(), on_success)));
4954 : }
4955 :
4956 :
// Adds alternatives to |result| that match the splitter's non-BMP ranges as
// lead/trail surrogate pairs, each continuing at |on_success|.
// Does nothing when the splitter collected no non-BMP ranges. The list is
// canonicalized in place first. Each range yields one alternative when both
// ends share a lead surrogate, and up to three otherwise (partial first
// lead, partial last lead, and the full-trail leads in between).
4957 32564 : void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4958 : RegExpNode* on_success,
4959 3156 : UnicodeRangeSplitter* splitter) {
4960 : ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4961 6312 : if (non_bmp == nullptr) return;
4962 : DCHECK(compiler->unicode());
4963 : DCHECK(!compiler->one_byte());
4964 : Zone* zone = compiler->zone();
4965 2095 : CharacterRange::Canonicalize(non_bmp);
4966 54244 : for (int i = 0; i < non_bmp->length(); i++) {
4967 : // Match surrogate pair.
4968 : // E.g. [\u10005-\u11005] becomes
4969 : // \ud800[\udc05-\udfff]|
4970 : // [\ud801-\ud803][\udc00-\udfff]|
4971 : // \ud804[\udc00-\udc05]
4972 52149 : uc32 from = non_bmp->at(i).from();
4973 25027 : uc32 to = non_bmp->at(i).to();
// Lead (_l) and trail (_t) surrogates of both ends of the range.
4974 25027 : uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4975 : uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4976 25027 : uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4977 : uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4978 25027 : if (from_l == to_l) {
4979 : // The lead surrogate is the same.
4980 : result->AddAlternative(
4981 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4982 : zone, CharacterRange::Singleton(from_l),
4983 : CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4984 22176 : on_success)));
4985 : } else {
4986 2851 : if (from_t != kTrailSurrogateStart) {
4987 : // Add [from_l][from_t-\udfff]
4988 : result->AddAlternative(
4989 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4990 : zone, CharacterRange::Singleton(from_l),
4991 : CharacterRange::Range(from_t, kTrailSurrogateEnd),
4992 1410 : compiler->read_backward(), on_success)));
// The first lead is now covered; drop it from the middle range below.
4993 1410 : from_l++;
4994 : }
4995 2851 : if (to_t != kTrailSurrogateEnd) {
4996 : // Add [to_l][\udc00-to_t]
4997 : result->AddAlternative(
4998 : GuardedAlternative(TextNode::CreateForSurrogatePair(
4999 : zone, CharacterRange::Singleton(to_l),
5000 : CharacterRange::Range(kTrailSurrogateStart, to_t),
5001 1146 : compiler->read_backward(), on_success)));
// The last lead is now covered; drop it from the middle range below.
5002 1146 : to_l--;
5003 : }
// Whatever leads remain pair with every trail surrogate.
5004 2851 : if (from_l <= to_l) {
5005 : // Add [from_l-to_l][\udc00-\udfff]
5006 : result->AddAlternative(
5007 : GuardedAlternative(TextNode::CreateForSurrogatePair(
5008 : zone, CharacterRange::Range(from_l, to_l),
5009 : CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5010 2581 : compiler->read_backward(), on_success)));
5011 : }
5012 : }
5013 : }
5014 : }
5015 :
5016 :
// Builds a lookaround over |lookbehind|, read in the direction opposite to
// |read_backward|, whose success path matches |match| in the read direction
// and then continues at |on_success|.
// The lookaround uses the compiler's dedicated unicode lookaround stack and
// position registers.
// NOTE(review): the `false` passed to the Builder presumably selects a
// negative lookaround, as the function name suggests; confirm against
// RegExpLookaround::Builder.
5017 1537 : RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5018 1537 : RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5019 : ZoneList<CharacterRange>* match, RegExpNode* on_success,
5020 : bool read_backward) {
5021 : Zone* zone = compiler->zone();
// The node that consumes |match| is built first; it is what the lookaround
// continues with.
5022 : RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5023 1537 : zone, match, read_backward, on_success);
5024 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5025 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5026 : RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5027 1537 : position_register);
// The lookaround body reads against the main direction (!read_backward).
5028 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5029 1537 : zone, lookbehind, !read_backward, lookaround.on_match_success());
5030 1537 : return lookaround.ForMatch(negative_match);
5031 : }
5032 :
5033 :
// Builds a node that first matches |match| in the read direction and then
// runs a lookaround over |lookahead| in that same direction before
// continuing at |on_success|.
// The lookaround uses the compiler's dedicated unicode lookaround stack and
// position registers.
// NOTE(review): the `false` passed to the Builder presumably selects a
// negative lookaround, as the function name suggests; confirm against
// RegExpLookaround::Builder.
5034 1525 : RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5035 1525 : RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5036 : ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5037 : bool read_backward) {
5038 : Zone* zone = compiler->zone();
5039 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5040 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5041 : RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5042 1525 : position_register);
5043 : RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5044 1525 : zone, lookahead, read_backward, lookaround.on_match_success());
// The returned node consumes |match| and then enters the lookaround.
5045 : return TextNode::CreateForCharacterRanges(
5046 1525 : zone, match, read_backward, lookaround.ForMatch(negative_match));
5047 : }
5048 :
5049 :
// Adds one alternative to |result| that matches the splitter's lead
// surrogate ranges only where no trail surrogate sits next to them, so a
// surrogate pair is never matched halfway. Does nothing when the splitter
// collected no lead surrogates. Which helper builds the node depends on
// compiler->read_backward().
5050 6218 : void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5051 : RegExpNode* on_success,
5052 3156 : UnicodeRangeSplitter* splitter) {
5053 : ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5054 6312 : if (lead_surrogates == nullptr) return;
5055 : Zone* zone = compiler->zone();
5056 : // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5057 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5058 1531 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5059 :
5060 : RegExpNode* match;
5061 1531 : if (compiler->read_backward()) {
5062 : // Reading backward. Assert that reading forward, there is no trail
5063 : // surrogate, and then backward match the lead surrogate.
5064 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5065 114 : compiler, trail_surrogates, lead_surrogates, on_success, true);
5066 : } else {
5067 : // Reading forward. Forward match the lead surrogate and assert that
5068 : // no trail surrogate follows.
5069 : match = MatchAndNegativeLookaroundInReadDirection(
5070 1417 : compiler, lead_surrogates, trail_surrogates, on_success, false);
5071 : }
5072 : result->AddAlternative(GuardedAlternative(match));
5073 : }
5074 :
5075 :
// Adds one alternative to |result| that matches the splitter's trail
// surrogate ranges only where no lead surrogate precedes them, so a
// surrogate pair is never matched halfway. Does nothing when the splitter
// collected no trail surrogates. This mirrors AddLoneLeadSurrogates with the
// two helper functions swapped between the read directions.
5076 6218 : void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5077 : RegExpNode* on_success,
5078 3156 : UnicodeRangeSplitter* splitter) {
5079 : ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5080 6312 : if (trail_surrogates == nullptr) return;
5081 : Zone* zone = compiler->zone();
5082 : // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5083 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5084 1531 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5085 :
5086 : RegExpNode* match;
5087 1531 : if (compiler->read_backward()) {
5088 : // Reading backward. Backward match the trail surrogate and assert that no
5089 : // lead surrogate precedes it.
5090 : match = MatchAndNegativeLookaroundInReadDirection(
5091 108 : compiler, trail_surrogates, lead_surrogates, on_success, true);
5092 : } else {
5093 : // Reading forward. Assert that reading backward, there is no lead
5094 : // surrogate, and then forward match the trail surrogate.
5095 : match = NegativeLookaroundAgainstReadDirectionAndMatch(
5096 1423 : compiler, lead_surrogates, trail_surrogates, on_success, false);
5097 : }
5098 : result->AddAlternative(GuardedAlternative(match));
5099 : }
5100 :
// Returns a forward-reading node that matches any single code unit in
// [0, String::kMaxUtf16CodeUnit] and then continues at |on_success|.
// Must not be used while compiling backward (checked by DCHECK).
5101 2181 : RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5102 : RegExpNode* on_success) {
5103 : // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5104 : DCHECK(!compiler->read_backward());
5105 : Zone* zone = compiler->zone();
5106 : // Advance any character. If the character happens to be a lead surrogate and
5107 : // we advanced into the middle of a surrogate pair, it will work out, as
5108 : // nothing will match from there. We will have to advance again, consuming
5109 : // the associated trail surrogate.
5110 : ZoneList<CharacterRange>* range = CharacterRange::List(
5111 2181 : zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5112 2181 : return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5113 : }
5114 :
// Replaces the contents of |ranges| with its case-insensitive closure.
// With V8_INTL_SUPPORT the ranges are copied into an ICU UnicodeSet, closed
// over USET_CASE_INSENSITIVE, stripped of multi-character strings, written
// back into |ranges| (allocating from |zone|) and canonicalized.
// Without V8_INTL_SUPPORT the body is compiled out and |ranges| is untouched.
5115 741 : void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5116 : #ifdef V8_INTL_SUPPORT
5117 : // Use ICU to compute the case fold closure over the ranges.
5118 741 : icu::UnicodeSet set;
5119 95062 : for (int i = 0; i < ranges->length(); i++) {
5120 94321 : set.add(ranges->at(i).from(), ranges->at(i).to());
5121 : }
// The input has been copied into |set|; |ranges| is rebuilt from it below.
5122 : ranges->Clear();
5123 741 : set.closeOver(USET_CASE_INSENSITIVE);
5124 : // Full case mappings map single characters to multiple characters.
5125 : // Those are represented as strings in the set. Remove them so that
5126 : // we end up with only simple and common case mappings.
5127 741 : set.removeAllStrings();
5128 9233 : for (int i = 0; i < set.getRangeCount(); i++) {
5129 8492 : ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5130 16984 : zone);
5131 : }
5132 : // No errors and everything we collected have been ranges.
5133 741 : CharacterRange::Canonicalize(ranges);
5134 : #endif // V8_INTL_SUPPORT
5135 741 : }
5136 :
5137 :
// Converts this character class into a regexp node that continues at
// |on_success|.
//
// The set is canonicalized first and, if the compiler asks for unicode case
// equivalents, extended with them. There are then two paths:
//  - Unicode mode on a non-one-byte subject, with no split surrogate in the
//    class: the (possibly negated) ranges are split by UnicodeRangeSplitter
//    and a ChoiceNode is assembled from the four Add* helpers. Two special
//    cases return early: an empty range list, and standard type '*'.
//  - Everything else: a single TextNode wrapping this class.
5138 606304 : RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5139 : RegExpNode* on_success) {
5140 : set_.Canonicalize();
5141 : Zone* zone = compiler->zone();
5142 : ZoneList<CharacterRange>* ranges = this->ranges(zone);
5143 201224 : if (compiler->needs_unicode_case_equivalents()) {
5144 631 : AddUnicodeCaseEquivalents(ranges, zone);
5145 : }
5146 214564 : if (compiler->unicode() && !compiler->one_byte() &&
5147 : !contains_split_surrogate()) {
5148 5357 : if (is_negated()) {
// Negation is resolved here into an explicit complementary range list.
5149 : ZoneList<CharacterRange>* negated =
5150 153 : new (zone) ZoneList<CharacterRange>(2, zone);
5151 153 : CharacterRange::Negate(ranges, negated, zone);
5152 : ranges = negated;
5153 : }
5154 5357 : if (ranges->length() == 0) {
// Nothing can match: express that as a negated "everything" class.
5155 40 : ranges->Add(CharacterRange::Everything(), zone);
5156 : RegExpCharacterClass* fail =
5157 : new (zone) RegExpCharacterClass(ranges, NEGATED);
5158 40 : return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5159 : }
5160 5337 : if (standard_type() == '*') {
5161 2181 : return UnanchoredAdvance(compiler, on_success);
5162 : } else {
// One alternative per non-empty category produced by the splitter.
5163 3156 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5164 3156 : UnicodeRangeSplitter splitter(zone, ranges);
5165 3156 : AddBmpCharacters(compiler, result, on_success, &splitter);
5166 3156 : AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5167 3156 : AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5168 3156 : AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5169 : return result;
5170 : }
5171 : } else {
5172 391734 : return new (zone) TextNode(this, compiler->read_backward(), on_success);
5173 : }
5174 : }
5175 :
5176 :
// Three-way comparator on the first code unit of two atoms.
// Returns -1, 1 or 0 when *a's first character is respectively smaller than,
// greater than or equal to *b's. Both trees are converted with AsAtom() and
// their data read at index 0 without any check, so callers must pass
// non-empty atoms.
5177 166326 : int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5178 166326 : RegExpAtom* atom1 = (*a)->AsAtom();
5179 166326 : RegExpAtom* atom2 = (*b)->AsAtom();
5180 166326 : uc16 character1 = atom1->data().at(0);
5181 166326 : uc16 character2 = atom2->data().at(0);
5182 166326 : if (character1 < character2) return -1;
5183 148127 : if (character1 > character2) return 1;
5184 17321 : return 0;
5185 : }
5186 :
5187 :
// Returns the canonical form of |c| according to |canonicalize|.
// If the mapping yields exactly one character, that character is returned;
// otherwise |c| is returned unchanged. Mappings longer than one character
// are not expected (DCHECK).
5188 : static unibrow::uchar Canonical(
5189 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5190 : unibrow::uchar c) {
5191 : unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5192 118700 : int length = canonicalize->get(c, '\0', chars);
5193 : DCHECK_LE(length, 1);
5194 : unibrow::uchar canonical = c;
5195 118700 : if (length == 1) canonical = chars[0];
5196 : return canonical;
5197 : }
5198 :
5199 :
// Case-independent counterpart of CompareFirstChar.
// Returns 0 when the first characters are identical; otherwise returns the
// signed difference of the two characters after canonicalization, so the
// result is negative, zero or positive. Both trees must be non-empty atoms.
5200 75482 : int CompareFirstCharCaseIndependent(
5201 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5202 : RegExpTree* const* a, RegExpTree* const* b) {
5203 75482 : RegExpAtom* atom1 = (*a)->AsAtom();
5204 75482 : RegExpAtom* atom2 = (*b)->AsAtom();
5205 75482 : unibrow::uchar character1 = atom1->data().at(0);
5206 75482 : unibrow::uchar character2 = atom2->data().at(0);
5207 75482 : if (character1 == character2) return 0;
// Canonicalization is skipped when both characters are below 'a'.
5208 52611 : if (character1 >= 'a' || character2 >= 'a') {
5209 : character1 = Canonical(canonicalize, character1);
5210 : character2 = Canonical(canonicalize, character2);
5211 : }
5212 52611 : return static_cast<int>(character1) - static_cast<int>(character2);
5213 : }
5214 :
5215 :
5216 : // We can stable sort runs of atoms, since the order does not matter if they
5217 : // start with different characters.
5218 : // Returns true if any consecutive atoms were found.
// Each maximal run of adjacent atom alternatives is sorted in place by first
// character; non-atom alternatives keep their positions. The comparator is
// CompareFirstCharCaseIndependent when compiler->ignore_case() is set and
// CompareFirstChar otherwise.
5219 13742 : bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5220 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5221 12507 : int length = alternatives->length();
5222 : bool found_consecutive_atoms = false;
5223 23914 : for (int i = 0; i < length; i++) {
// Skip ahead to the next atom, if there is one.
5224 26893 : while (i < length) {
5225 25618 : RegExpTree* alternative = alternatives->at(i);
5226 25618 : if (alternative->IsAtom()) break;
5227 14211 : i++;
5228 : }
5229 : // i is length or it is the index of an atom.
5230 12682 : if (i == length) break;
5231 : int first_atom = i;
5232 11407 : i++;
// Extend the run across all directly following atoms; the run is
// [first_atom, i).
5233 92628 : while (i < length) {
5234 70106 : RegExpTree* alternative = alternatives->at(i);
5235 70106 : if (!alternative->IsAtom()) break;
5236 69814 : i++;
5237 : }
5238 : // Sort atoms to get ones with common prefixes together.
5239 : // This step is more tricky if we are in a case-independent regexp,
5240 : // because it would change /is|I/ to /I|is/, and order matters when
5241 : // the regexp parts don't match only disjoint starting points. To fix
5242 : // this we have a version of CompareFirstChar that uses case-
5243 : // independent character classes for comparison.
5244 : DCHECK_LT(first_atom, alternatives->length());
5245 : DCHECK_LE(i, alternatives->length());
5246 : DCHECK_LE(first_atom, i);
5247 11407 : if (compiler->ignore_case()) {
5248 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5249 1235 : compiler->isolate()->regexp_macro_assembler_canonicalize();
// The closure binds the canonicalize mapping so the sort can call a
// two-argument comparator.
5250 : auto compare_closure =
5251 : [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5252 75482 : return CompareFirstCharCaseIndependent(canonicalize, a, b);
5253 75482 : };
5254 1235 : alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5255 : } else {
5256 10172 : alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5257 : }
// A run of length one does not count as "consecutive atoms".
5258 11407 : if (i - first_atom > 1) found_consecutive_atoms = true;
5259 : }
5260 12507 : return found_consecutive_atoms;
5261 : }
5262 :
5263 :
5264 : // Optimizes ab|ac|az to a(?:b|c|z).
// Walks the alternatives, grouping adjacent atoms that start with the same
// character (compared via Canonical() when compiler->ignore_case() is set).
// A group of at least three atoms is replaced by a single RegExpAlternative
// made of the common prefix followed by a disjunction of the suffixes.
// Smaller groups and non-atoms are copied through unchanged. The list is
// compacted in place and trimmed to the new length.
5265 17915 : void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5266 : Zone* zone = compiler->zone();
5267 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5268 11075 : int length = alternatives->length();
5269 :
// write_posn is the next output slot; it never gets ahead of the read
// index i.
5270 : int write_posn = 0;
5271 : int i = 0;
5272 92226 : while (i < length) {
5273 70076 : RegExpTree* alternative = alternatives->at(i);
5274 70076 : if (!alternative->IsAtom()) {
5275 710 : alternatives->at(write_posn++) = alternatives->at(i);
5276 355 : i++;
5277 355 : continue;
5278 : }
5279 69721 : RegExpAtom* atom = alternative->AsAtom();
5280 69721 : unibrow::uchar common_prefix = atom->data().at(0);
5281 : int first_with_prefix = i;
// Upper bound for the shared prefix: the length of the shortest atom in
// the group.
5282 : int prefix_length = atom->length();
5283 69721 : i++;
5284 150610 : while (i < length) {
5285 69925 : alternative = alternatives->at(i);
5286 69925 : if (!alternative->IsAtom()) break;
5287 69814 : atom = alternative->AsAtom();
5288 69814 : unibrow::uchar new_prefix = atom->data().at(0);
5289 69814 : if (new_prefix != common_prefix) {
5290 58930 : if (!compiler->ignore_case()) break;
5291 : unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5292 6840 : compiler->isolate()->regexp_macro_assembler_canonicalize();
5293 : new_prefix = Canonical(canonicalize, new_prefix);
5294 : common_prefix = Canonical(canonicalize, common_prefix);
5295 6840 : if (new_prefix != common_prefix) break;
5296 : }
5297 : prefix_length = Min(prefix_length, atom->length());
5298 11168 : i++;
5299 : }
// Only groups of three or more atoms are rewritten.
5300 69721 : if (i > first_with_prefix + 2) {
5301 : // Found worthwhile run of alternatives with common prefix of at least one
5302 : // character. The sorting function above did not sort on more than one
5303 : // character for reasons of correctness, but there may still be a longer
5304 : // common prefix if the terms were similar or presorted in the input.
5305 : // Find out how long the common prefix is.
5306 727 : int run_length = i - first_with_prefix;
5307 727 : atom = alternatives->at(first_with_prefix)->AsAtom();
// Shrink prefix_length to the longest prefix, beyond the first character,
// that every atom in the group shares with the first atom.
5308 3011 : for (int j = 1; j < run_length && prefix_length > 1; j++) {
5309 : RegExpAtom* old_atom =
5310 4568 : alternatives->at(j + first_with_prefix)->AsAtom();
5311 5458 : for (int k = 1; k < prefix_length; k++) {
5312 7800 : if (atom->data().at(k) != old_atom->data().at(k)) {
5313 : prefix_length = k;
5314 : break;
5315 : }
5316 : }
5317 : }
5318 : RegExpAtom* prefix =
5319 : new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
// pair = [prefix, disjunction of suffixes].
5320 727 : ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5321 : pair->Add(prefix, zone);
5322 : ZoneList<RegExpTree*>* suffixes =
5323 727 : new (zone) ZoneList<RegExpTree*>(run_length, zone);
5324 11998 : for (int j = 0; j < run_length; j++) {
5325 : RegExpAtom* old_atom =
5326 22542 : alternatives->at(j + first_with_prefix)->AsAtom();
5327 : int len = old_atom->length();
5328 11271 : if (len == prefix_length) {
// The atom is exactly the prefix, so its suffix is empty.
5329 : suffixes->Add(new (zone) RegExpEmpty(), zone);
5330 : } else {
5331 : RegExpTree* suffix = new (zone) RegExpAtom(
5332 : old_atom->data().SubVector(prefix_length, old_atom->length()));
5333 : suffixes->Add(suffix, zone);
5334 : }
5335 : }
5336 727 : pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5337 1454 : alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5338 : } else {
5339 : // Just copy any non-worthwhile alternatives.
5340 69618 : for (int j = first_with_prefix; j < i; j++) {
5341 139236 : alternatives->at(write_posn++) = alternatives->at(j);
5342 : }
5343 : }
5344 : }
5345 : alternatives->Rewind(write_posn); // Trim end of array.
5346 11075 : }
5347 :
5348 :
5349 : // Optimizes b|c|z to [bcz].
// Walks the alternatives and replaces every run of two or more adjacent
// single-character atoms with one RegExpCharacterClass holding a singleton
// range per character. Non-atoms, longer atoms and isolated
// single-character atoms are copied through unchanged. The list is
// compacted in place and trimmed to the new length.
5350 12507 : void RegExpDisjunction::FixSingleCharacterDisjunctions(
5351 12507 : RegExpCompiler* compiler) {
5352 : Zone* zone = compiler->zone();
5353 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5354 12507 : int length = alternatives->length();
5355 : const bool unicode = compiler->unicode();
5356 :
5357 : int write_posn = 0;
5358 : int i = 0;
5359 101592 : while (i < length) {
5360 76578 : RegExpTree* alternative = alternatives->at(i);
5361 76578 : if (!alternative->IsAtom()) {
5362 30460 : alternatives->at(write_posn++) = alternatives->at(i);
5363 15230 : i++;
5364 15230 : continue;
5365 : }
5366 61348 : RegExpAtom* atom = alternative->AsAtom();
5367 61348 : if (atom->length() != 1) {
5368 102590 : alternatives->at(write_posn++) = alternatives->at(i);
5369 51295 : i++;
5370 51295 : continue;
5371 : }
// In unicode mode a single-character atom is expected never to be a lead
// surrogate (debug-only check).
5372 : DCHECK_IMPLIES(unicode,
5373 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
// Track whether any character in the run is a trail surrogate; this
// decides the flags of the resulting class below.
5374 : bool contains_trail_surrogate =
5375 10053 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5376 : int first_in_run = i;
5377 10053 : i++;
5378 28708 : while (i < length) {
5379 18251 : alternative = alternatives->at(i);
5380 18251 : if (!alternative->IsAtom()) break;
5381 17958 : atom = alternative->AsAtom();
5382 17958 : if (atom->length() != 1) break;
5383 : DCHECK_IMPLIES(unicode,
5384 : !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5385 : contains_trail_surrogate |=
5386 17204 : unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5387 8602 : i++;
5388 : }
5389 10053 : if (i > first_in_run + 1) {
5390 : // Found non-trivial run of single-character alternatives.
5391 322 : int run_length = i - first_in_run;
5392 : ZoneList<CharacterRange>* ranges =
5393 322 : new (zone) ZoneList<CharacterRange>(2, zone);
5394 9246 : for (int j = 0; j < run_length; j++) {
5395 17848 : RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5396 : DCHECK_EQ(old_atom->length(), 1);
5397 17848 : ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5398 : }
// Flags stay default-constructed unless a trail surrogate was seen in
// unicode mode.
5399 : RegExpCharacterClass::Flags flags;
5400 322 : if (unicode && contains_trail_surrogate) {
5401 : flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5402 : }
5403 322 : alternatives->at(write_posn++) =
5404 322 : new (zone) RegExpCharacterClass(ranges, flags);
5405 : } else {
5406 : // Just copy any trivial alternatives.
5407 9731 : for (int j = first_in_run; j < i; j++) {
5408 19462 : alternatives->at(write_posn++) = alternatives->at(j);
5409 : }
5410 : }
5411 : }
5412 : alternatives->Rewind(write_posn); // Trim end of array.
5413 12507 : }
5414 :
5415 :
// Converts this disjunction into a node that continues at |on_success|.
// With more than two alternatives the list is first optimized in place:
// atom runs are sorted, common prefixes are factored out (only if the sort
// found consecutive atoms), and single-character runs are merged into
// character classes. If that leaves exactly one alternative, its node is
// returned directly. Otherwise the result is a ChoiceNode with one
// alternative per remaining entry.
5416 31412 : RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5417 15858 : RegExpNode* on_success) {
5418 : ZoneList<RegExpTree*>* alternatives = this->alternatives();
5419 :
5420 43919 : if (alternatives->length() > 2) {
5421 12507 : bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5422 12507 : if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5423 12507 : FixSingleCharacterDisjunctions(compiler);
5424 12507 : if (alternatives->length() == 1) {
5425 304 : return alternatives->at(0)->ToNode(compiler, on_success);
5426 : }
5427 : }
5428 :
// Re-read the length: the optimizations above may have shortened the list.
5429 : int length = alternatives->length();
5430 :
5431 : ChoiceNode* result =
5432 15554 : new(compiler->zone()) ChoiceNode(length, compiler->zone());
5433 98527 : for (int i = 0; i < length; i++) {
5434 : GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5435 82973 : on_success));
5436 : result->AddAlternative(alternative);
5437 : }
5438 : return result;
5439 : }
5440 :
5441 :
// Converts this quantifier into a node that continues at |on_success| by
// forwarding its own min(), max(), is_greedy() and body() to the
// multi-argument ToNode overload. The trailing not_at_start argument is not
// passed here, so the overload's declared default applies.
5442 1540393 : RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5443 3080786 : RegExpNode* on_success) {
5444 : return ToNode(min(),
5445 : max(),
5446 : is_greedy(),
5447 : body(),
5448 : compiler,
5449 3080786 : on_success);
5450 : }
5451 :
5452 :
5453 : // Scoped object to keep track of how much we unroll quantifier loops in the
5454 : // regexp graph generator.
// On construction the compiler's current expansion factor is saved and
// multiplied by |factor|; the destructor restores the saved value.
// ok_to_expand() reports whether the new product stays within
// kMaxExpansionFactor.
5455 : class RegExpExpansionLimiter {
5456 : public:
5457 : static const int kMaxExpansionFactor = 6;
// compiler: the compiler whose current expansion factor is read and updated.
// factor:   multiplier for the nested expansion; must be positive (DCHECK).
5458 82020 : RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5459 : : compiler_(compiler),
5460 : saved_expansion_factor_(compiler->current_expansion_factor()),
5461 82020 : ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5462 : DCHECK(factor > 0);
// If the limit was already exceeded on entry, the compiler's factor is
// left untouched.
5463 96603 : if (ok_to_expand_) {
5464 96603 : if (factor > kMaxExpansionFactor) {
5465 : // Avoid integer overflow of the current expansion factor.
5466 : ok_to_expand_ = false;
5467 : compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5468 : } else {
5469 96447 : int new_factor = saved_expansion_factor_ * factor;
5470 96447 : ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5471 : compiler->set_current_expansion_factor(new_factor);
5472 : }
5473 : }
5474 : }
5475 :
// Restores the expansion factor that was current at construction time.
5476 : ~RegExpExpansionLimiter() {
5477 : compiler_->set_current_expansion_factor(saved_expansion_factor_);
5478 : }
5479 :
// True if expanding by |factor| keeps the total within kMaxExpansionFactor.
5480 : bool ok_to_expand() { return ok_to_expand_; }
5481 :
5482 : private:
5483 : RegExpCompiler* compiler_;
5484 : int saved_expansion_factor_;
5485 : bool ok_to_expand_;
5486 :
5487 : DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5488 : };
5489 :
5490 :
// Builds the node graph that matches |body| between |min| and |max| times
// and then continues at |on_success|.
//   min, max:     repetition bounds; max may be kInfinity. max == 0 returns
//                 |on_success| unchanged.
//   is_greedy:    decides whether the body or the continuation is tried
//                 first.
//   body:         the tree being repeated.
//   not_at_start: propagated to the created choice nodes, but only when the
//                 compiler is not reading backward.
//
// Strategy:
//  - If the body can match the empty string, a register is allocated to
//    detect empty iterations, and no unrolling is attempted.
//  - Otherwise, if compiler->optimize() is set and the body has no capture
//    registers, small bounds are unrolled, subject to
//    RegExpExpansionLimiter.
//  - In all remaining cases a LoopChoiceNode is built, with a counter
//    register when min > 0 or max is finite.
5491 1637877 : RegExpNode* RegExpQuantifier::ToNode(int min,
5492 : int max,
5493 : bool is_greedy,
5494 : RegExpTree* body,
5495 4894874 : RegExpCompiler* compiler,
5496 : RegExpNode* on_success,
5497 : bool not_at_start) {
5498 : // x{f, t} becomes this:
5499 : //
5500 : // (r++)<-.
5501 : // | `
5502 : // | (x)
5503 : // v ^
5504 : // (r=0)-->(?)---/ [if r < t]
5505 : // |
5506 : // [if r >= f] \----> ...
5507 : //
5508 :
5509 : // 15.10.2.5 RepeatMatcher algorithm.
5510 : // The parser has already eliminated the case where max is 0. In the case
5511 : // where max_match is zero the parser has removed the quantifier if min was
5512 : // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5513 :
5514 : // If we know that we cannot match zero length then things are a little
5515 : // simpler since we don't need to make the special zero length match check
5516 : // from step 2.1. If the min and max are small we can unroll a little in
5517 : // this case.
5518 : static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5519 : static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5520 1637877 : if (max == 0) return on_success; // This can happen due to recursion.
5521 1636014 : bool body_can_be_empty = (body->min_match() == 0);
5522 : int body_start_reg = RegExpCompiler::kNoRegister;
5523 1636014 : Interval capture_registers = body->CaptureRegisters();
5524 1636014 : bool needs_capture_clearing = !capture_registers.is_empty();
5525 : Zone* zone = compiler->zone();
5526 :
5527 1636014 : if (body_can_be_empty) {
5528 : body_start_reg = compiler->AllocateRegister();
5529 1635339 : } else if (compiler->optimize() && !needs_capture_clearing) {
5530 : // Only unroll if there are no captures and the body can't be
5531 : // empty.
5532 : {
// Expansion factor: |min| forced copies, plus one more if an optional or
// looping tail follows them.
5533 : RegExpExpansionLimiter limiter(
5534 82020 : compiler, min + ((max != min) ? 1 : 0));
5535 82020 : if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5536 9369 : int new_max = (max == kInfinity) ? max : max - min;
5537 : // Recurse once to get the loop or optional matches after the fixed
5538 : // ones.
5539 : RegExpNode* answer = ToNode(
5540 9369 : 0, new_max, is_greedy, body, compiler, on_success, true);
5541 : // Unroll the forced matches from 0 to min. This can cause chains of
5542 : // TextNodes (which the parser does not generate). These should be
5543 : // combined if it turns out they hinder good code generation.
5544 22743 : for (int i = 0; i < min; i++) {
5545 13374 : answer = body->ToNode(compiler, answer);
5546 : }
5547 : return answer;
5548 : }
5549 : }
5550 72651 : if (max <= kMaxUnrolledMaxMatches && min == 0) {
5551 : DCHECK(max > 0); // Due to the 'if' above.
5552 : RegExpExpansionLimiter limiter(compiler, max);
5553 14583 : if (limiter.ok_to_expand()) {
5554 : // Unroll the optional matches up to max.
5555 : RegExpNode* answer = on_success;
5556 14205 : for (int i = 0; i < max; i++) {
// Each level is a two-way choice between another body match and
// leaving; their order is set by greediness.
5557 14205 : ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5558 14205 : if (is_greedy) {
5559 : alternation->AddAlternative(
5560 14030 : GuardedAlternative(body->ToNode(compiler, answer)));
5561 : alternation->AddAlternative(GuardedAlternative(on_success));
5562 : } else {
5563 : alternation->AddAlternative(GuardedAlternative(on_success));
5564 : alternation->AddAlternative(
5565 175 : GuardedAlternative(body->ToNode(compiler, answer)));
5566 : }
5567 : answer = alternation;
5568 15773 : if (not_at_start && !compiler->read_backward()) {
5569 : alternation->set_not_at_start();
5570 : }
5571 : }
5572 : return answer;
5573 : }
5574 : }
5575 : }
// General case: build an explicit loop around a LoopChoiceNode.
5576 1612556 : bool has_min = min > 0;
5577 1612556 : bool has_max = max < RegExpTree::kInfinity;
5578 1612556 : bool needs_counter = has_min || has_max;
5579 : int reg_ctr = needs_counter
5580 : ? compiler->AllocateRegister()
5581 1612556 : : RegExpCompiler::kNoRegister;
5582 : LoopChoiceNode* center = new (zone)
5583 1612556 : LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5584 1618786 : if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
// After each body match control returns to |center|, going through a
// counter increment first when a counter is in use.
5585 : RegExpNode* loop_return = needs_counter
5586 : ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5587 1612556 : : static_cast<RegExpNode*>(center);
5588 1612556 : if (body_can_be_empty) {
5589 : // If the body can be empty we need to check if it was and then
5590 : // backtrack.
5591 : loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5592 : reg_ctr,
5593 : min,
5594 675 : loop_return);
5595 : }
5596 1612556 : RegExpNode* body_node = body->ToNode(compiler, loop_return);
5597 1612556 : if (body_can_be_empty) {
5598 : // If the body can be empty we need to store the start position
5599 : // so we can bail out if it was empty.
5600 675 : body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5601 : }
5602 1612556 : if (needs_capture_clearing) {
5603 : // Before entering the body of this loop we need to clear captures.
5604 4107 : body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5605 : }
// The body alternative is guarded by counter < max when max is finite.
5606 : GuardedAlternative body_alt(body_node);
5607 1612556 : if (has_max) {
5608 : Guard* body_guard =
5609 : new(zone) Guard(reg_ctr, Guard::LT, max);
5610 1504583 : body_alt.AddGuard(body_guard, zone);
5611 : }
// The exit alternative is guarded by counter >= min when min is positive.
5612 : GuardedAlternative rest_alt(on_success);
5613 1612556 : if (has_min) {
5614 : Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5615 3167 : rest_alt.AddGuard(rest_guard, zone);
5616 : }
// Greedy loops add the body alternative first; lazy loops add the exit
// first.
5617 1612556 : if (is_greedy) {
5618 : center->AddLoopAlternative(body_alt);
5619 : center->AddContinueAlternative(rest_alt);
5620 : } else {
5621 : center->AddContinueAlternative(rest_alt);
5622 : center->AddLoopAlternative(body_alt);
5623 : }
// With a counter, the loop is entered through a node that resets it to 0.
5624 1612556 : if (needs_counter) {
5625 1506002 : return ActionNode::SetRegister(reg_ctr, 0, center);
5626 : } else {
5627 : return center;
5628 : }
5629 : }
5630 :
5631 : namespace {
5632 : // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5633 : // \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
// Returns a two-alternative ChoiceNode; each alternative chains a lookahead
// and a lookbehind over the \w ranges before continuing at |on_success|.
//   type: BOUNDARY makes the two lookarounds of an alternative disagree on
//         word-ness; any other value makes them agree.
// Only valid when the compiler needs unicode case equivalents (DCHECK).
// NOTE(review): the `true` passed to AddClassEscape presumably requests the
// case-equivalent closure of \w; confirm against
// CharacterRange::AddClassEscape.
5634 36 : RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5635 : RegExpNode* on_success,
5636 : RegExpAssertion::AssertionType type) {
5637 : DCHECK(compiler->needs_unicode_case_equivalents());
5638 : Zone* zone = compiler->zone();
5639 : ZoneList<CharacterRange>* word_range =
5640 36 : new (zone) ZoneList<CharacterRange>(2, zone);
5641 36 : CharacterRange::AddClassEscape('w', word_range, true, zone);
5642 : int stack_register = compiler->UnicodeLookaroundStackRegister();
5643 : int position_register = compiler->UnicodeLookaroundPositionRegister();
5644 36 : ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5645 : // Add two choices. The (non-)boundary could start with a word or
5646 : // a non-word-character.
5647 108 : for (int i = 0; i < 2; i++) {
5648 72 : bool lookbehind_for_word = i == 0;
// For BOUNDARY the two sides must differ; otherwise they must be equal.
5649 : bool lookahead_for_word =
5650 72 : (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5651 : // Look to the left.
5652 : RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5653 72 : stack_register, position_register);
5654 : RegExpNode* backward = TextNode::CreateForCharacterRanges(
5655 72 : zone, word_range, true, lookbehind.on_match_success());
5656 : // Look to the right.
// The lookahead continues into the lookbehind built above.
5657 : RegExpLookaround::Builder lookahead(lookahead_for_word,
5658 : lookbehind.ForMatch(backward),
5659 72 : stack_register, position_register);
5660 : RegExpNode* forward = TextNode::CreateForCharacterRanges(
5661 72 : zone, word_range, false, lookahead.on_match_success());
5662 72 : result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5663 : }
5664 36 : return result;
5665 : }
5666 : } // anonymous namespace
5667 :
5668 9625 : RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5669 9625 : RegExpNode* on_success) {
5670 : NodeInfo info;
5671 : Zone* zone = compiler->zone();
5672 :
5673 9625 : switch (assertion_type()) {
5674 : case START_OF_LINE:
5675 797 : return AssertionNode::AfterNewline(on_success);
5676 : case START_OF_INPUT:
5677 4673 : return AssertionNode::AtStart(on_success);
5678 : case BOUNDARY:
5679 : return compiler->needs_unicode_case_equivalents()
5680 : ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
5681 185 : : AssertionNode::AtBoundary(on_success);
5682 : case NON_BOUNDARY:
5683 : return compiler->needs_unicode_case_equivalents()
5684 : ? BoundaryAssertionAsLookaround(compiler, on_success,
5685 : NON_BOUNDARY)
5686 161 : : AssertionNode::AtNonBoundary(on_success);
5687 : case END_OF_INPUT:
5688 3725 : return AssertionNode::AtEnd(on_success);
5689 : case END_OF_LINE: {
5690 : // Compile $ in multiline regexps as an alternation with a positive
5691 : // lookahead in one side and an end-of-input on the other side.
5692 : // We need two registers for the lookahead.
5693 : int stack_pointer_register = compiler->AllocateRegister();
5694 : int position_register = compiler->AllocateRegister();
5695 : // The ChoiceNode to distinguish between a newline and end-of-input.
5696 84 : ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5697 : // Create a newline atom.
5698 : ZoneList<CharacterRange>* newline_ranges =
5699 84 : new(zone) ZoneList<CharacterRange>(3, zone);
5700 84 : CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5701 : RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5702 : TextNode* newline_matcher = new (zone) TextNode(
5703 : newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5704 : stack_pointer_register, position_register,
5705 : 0, // No captures inside.
5706 : -1, // Ignored if no captures.
5707 168 : on_success));
5708 : // Create an end-of-input matcher.
5709 : RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5710 : stack_pointer_register,
5711 : position_register,
5712 84 : newline_matcher);
5713 : // Add the two alternatives to the ChoiceNode.
5714 : GuardedAlternative eol_alternative(end_of_line);
5715 : result->AddAlternative(eol_alternative);
5716 84 : GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5717 : result->AddAlternative(end_alternative);
5718 : return result;
5719 : }
5720 : default:
5721 0 : UNREACHABLE();
5722 : }
5723 : return on_success;
5724 : }
5725 :
5726 :
5727 6180 : RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5728 3090 : RegExpNode* on_success) {
5729 : return new (compiler->zone())
5730 : BackReferenceNode(RegExpCapture::StartRegister(index()),
5731 : RegExpCapture::EndRegister(index()),
5732 3090 : compiler->read_backward(), on_success);
5733 : }
5734 :
5735 :
5736 1364 : RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5737 : RegExpNode* on_success) {
5738 1364 : return on_success;
5739 : }
5740 :
5741 :
5742 5268 : RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5743 : int stack_pointer_register,
5744 : int position_register,
5745 : int capture_register_count,
5746 : int capture_register_start)
5747 : : is_positive_(is_positive),
5748 : on_success_(on_success),
5749 : stack_pointer_register_(stack_pointer_register),
5750 5268 : position_register_(position_register) {
5751 5268 : if (is_positive_) {
5752 : on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5753 : stack_pointer_register, position_register, capture_register_count,
5754 1753 : capture_register_start, on_success_);
5755 : } else {
5756 : Zone* zone = on_success_->zone();
5757 : on_match_success_ = new (zone) NegativeSubmatchSuccess(
5758 : stack_pointer_register, position_register, capture_register_count,
5759 3515 : capture_register_start, zone);
5760 : }
5761 5268 : }
5762 :
5763 :
5764 5268 : RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5765 5268 : if (is_positive_) {
5766 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5767 1753 : position_register_, match);
5768 : } else {
5769 3515 : Zone* zone = on_success_->zone();
5770 : // We use a ChoiceNode to represent the negative lookaround. The first
5771 : // alternative is the negative match. On success, the end node backtracks.
5772 : // On failure, the second alternative is tried and leads to success.
5773 : // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5774 : // first exit when calculating quick checks.
5775 : ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5776 3515 : GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5777 : return ActionNode::BeginSubmatch(stack_pointer_register_,
5778 3515 : position_register_, choice_node);
5779 : }
5780 : }
5781 :
5782 :
5783 4024 : RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5784 4024 : RegExpNode* on_success) {
5785 : int stack_pointer_register = compiler->AllocateRegister();
5786 : int position_register = compiler->AllocateRegister();
5787 :
5788 : const int registers_per_capture = 2;
5789 : const int register_of_first_capture = 2;
5790 2012 : int register_count = capture_count_ * registers_per_capture;
5791 : int register_start =
5792 2012 : register_of_first_capture + capture_from_ * registers_per_capture;
5793 :
5794 : RegExpNode* result;
5795 : bool was_reading_backward = compiler->read_backward();
5796 2012 : compiler->set_read_backward(type() == LOOKBEHIND);
5797 : Builder builder(is_positive(), on_success, stack_pointer_register,
5798 2012 : position_register, register_count, register_start);
5799 2012 : RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5800 2012 : result = builder.ForMatch(match);
5801 : compiler->set_read_backward(was_reading_backward);
5802 2012 : return result;
5803 : }
5804 :
5805 :
5806 42614 : RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5807 42614 : RegExpNode* on_success) {
5808 42614 : return ToNode(body(), index(), compiler, on_success);
5809 : }
5810 :
5811 :
5812 135236 : RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5813 : int index,
5814 135236 : RegExpCompiler* compiler,
5815 : RegExpNode* on_success) {
5816 : DCHECK_NOT_NULL(body);
5817 : int start_reg = RegExpCapture::StartRegister(index);
5818 : int end_reg = RegExpCapture::EndRegister(index);
5819 135236 : if (compiler->read_backward()) std::swap(start_reg, end_reg);
5820 135236 : RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5821 135236 : RegExpNode* body_node = body->ToNode(compiler, store_end);
5822 135236 : return ActionNode::StorePosition(start_reg, true, body_node);
5823 : }
5824 :
5825 :
5826 63292 : RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5827 31646 : RegExpNode* on_success) {
5828 : ZoneList<RegExpTree*>* children = nodes();
5829 : RegExpNode* current = on_success;
5830 31646 : if (compiler->read_backward()) {
5831 2286 : for (int i = 0; i < children->length(); i++) {
5832 33542 : current = children->at(i)->ToNode(compiler, current);
5833 : }
5834 : } else {
5835 1644910 : for (int i = children->length() - 1; i >= 0; i--) {
5836 1613654 : current = children->at(i)->ToNode(compiler, current);
5837 : }
5838 : }
5839 31646 : return current;
5840 : }
5841 :
5842 :
5843 23640 : static void AddClass(const int* elmv,
5844 : int elmc,
5845 : ZoneList<CharacterRange>* ranges,
5846 : Zone* zone) {
5847 23640 : elmc--;
5848 : DCHECK(elmv[elmc] == kRangeEndMarker);
5849 164283 : for (int i = 0; i < elmc; i += 2) {
5850 : DCHECK(elmv[i] < elmv[i + 1]);
5851 281286 : ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5852 : }
5853 23640 : }
5854 :
5855 :
5856 32490 : static void AddClassNegated(const int *elmv,
5857 : int elmc,
5858 : ZoneList<CharacterRange>* ranges,
5859 : Zone* zone) {
5860 32490 : elmc--;
5861 : DCHECK(elmv[elmc] == kRangeEndMarker);
5862 : DCHECK(elmv[0] != 0x0000);
5863 : DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
5864 : uc16 last = 0x0000;
5865 136230 : for (int i = 0; i < elmc; i += 2) {
5866 : DCHECK(last <= elmv[i] - 1);
5867 : DCHECK(elmv[i] < elmv[i + 1]);
5868 207480 : ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5869 103740 : last = elmv[i + 1];
5870 : }
5871 64980 : ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5872 32490 : }
5873 :
5874 144440 : void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
5875 : bool add_unicode_case_equivalents,
5876 : Zone* zone) {
5877 144440 : if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5878 : // See #sec-runtime-semantics-wordcharacters-abstract-operation
5879 : // In case of unicode and ignore_case, we need to create the closure over
5880 : // case equivalent characters before negating.
5881 : ZoneList<CharacterRange>* new_ranges =
5882 110 : new (zone) ZoneList<CharacterRange>(2, zone);
5883 110 : AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5884 110 : AddUnicodeCaseEquivalents(new_ranges, zone);
5885 110 : if (type == 'W') {
5886 : ZoneList<CharacterRange>* negated =
5887 36 : new (zone) ZoneList<CharacterRange>(2, zone);
5888 36 : CharacterRange::Negate(new_ranges, negated, zone);
5889 : new_ranges = negated;
5890 : }
5891 : ranges->AddAll(*new_ranges, zone);
5892 144440 : return;
5893 : }
5894 144330 : AddClassEscape(type, ranges, zone);
5895 : }
5896 :
5897 144379 : void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
5898 : Zone* zone) {
5899 144379 : switch (type) {
5900 : case 's':
5901 9997 : AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5902 9997 : break;
5903 : case 'S':
5904 934 : AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5905 934 : break;
5906 : case 'w':
5907 8788 : AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5908 8788 : break;
5909 : case 'W':
5910 378 : AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5911 378 : break;
5912 : case 'd':
5913 4577 : AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5914 4577 : break;
5915 : case 'D':
5916 323 : AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5917 323 : break;
5918 : case '.':
5919 : AddClassNegated(kLineTerminatorRanges,
5920 : kLineTerminatorRangeCount,
5921 : ranges,
5922 30855 : zone);
5923 30855 : break;
5924 : // This is not a character range as defined by the spec but a
5925 : // convenient shorthand for a character class that matches any
5926 : // character.
5927 : case '*':
5928 176718 : ranges->Add(CharacterRange::Everything(), zone);
5929 88359 : break;
5930 : // This is the set of characters matched by the $ and ^ symbols
5931 : // in multiline mode.
5932 : case 'n':
5933 : AddClass(kLineTerminatorRanges,
5934 : kLineTerminatorRangeCount,
5935 : ranges,
5936 168 : zone);
5937 168 : break;
5938 : default:
5939 0 : UNREACHABLE();
5940 : }
5941 144379 : }
5942 :
5943 :
5944 0 : Vector<const int> CharacterRange::GetWordBounds() {
5945 0 : return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5946 : }
5947 :
5948 :
5949 74564 : void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5950 : ZoneList<CharacterRange>* ranges,
5951 : bool is_one_byte) {
5952 74564 : CharacterRange::Canonicalize(ranges);
5953 74564 : int range_count = ranges->length();
5954 310576 : for (int i = 0; i < range_count; i++) {
5955 86069 : CharacterRange range = ranges->at(i);
5956 : uc32 bottom = range.from();
5957 91414 : if (bottom > String::kMaxUtf16CodeUnit) return;
5958 : uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5959 : // Nothing to be done for surrogates.
5960 86069 : if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5961 80884 : if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5962 8237 : if (bottom > String::kMaxOneByteCharCode) return;
5963 8077 : if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5964 : }
5965 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5966 80724 : if (top == bottom) {
5967 : // If this is a singleton we just expand the one character.
5968 5976 : int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5969 10138 : for (int i = 0; i < length; i++) {
5970 4162 : uc32 chr = chars[i];
5971 4162 : if (chr != bottom) {
5972 4312 : ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5973 : }
5974 : }
5975 : } else {
5976 : // If this is a range we expand the characters block by block, expanding
5977 : // contiguous subranges (blocks) one at a time. The approach is as
5978 : // follows. For a given start character we look up the remainder of the
5979 : // block that contains it (represented by the end point), for instance we
5980 : // find 'z' if the character is 'c'. A block is characterized by the
5981 : // property that all characters uncanonicalize in the same way, except
5982 : // that each entry in the result is incremented by the distance from the
5983 : // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5984 : // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5985 : // we've found the end point we look up its uncanonicalization and
5986 : // produce a range for each element. For instance for [c-f] we look up
5987 : // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5988 : // it is not already contained in the input, so [c-f] will be skipped but
5989 : // [C-F] will be added. If this range is not completely contained in a
5990 : // block we do this for all the blocks covered by the range (handling
5991 : // characters that is not in a block as a "singleton block").
5992 : unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5993 : int pos = bottom;
5994 27635281 : while (pos <= top) {
5995 : int length =
5996 27560533 : isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5997 : uc32 block_end;
5998 27560533 : if (length == 0) {
5999 : block_end = pos;
6000 : } else {
6001 : DCHECK_EQ(1, length);
6002 26159 : block_end = equivalents[0];
6003 : }
6004 27560533 : int end = (block_end > top) ? top : block_end;
6005 : length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
6006 27560533 : equivalents);
6007 28780360 : for (int i = 0; i < length; i++) {
6008 1219827 : uc32 c = equivalents[i];
6009 1219827 : uc32 range_from = c - (block_end - pos);
6010 1219827 : uc32 range_to = c - (block_end - end);
6011 1219827 : if (!(bottom <= range_from && range_to <= top)) {
6012 44648 : ranges->Add(CharacterRange::Range(range_from, range_to), zone);
6013 : }
6014 : }
6015 27560533 : pos = end + 1;
6016 : }
6017 : }
6018 : }
6019 : }
6020 :
6021 :
6022 14 : bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
6023 : DCHECK_NOT_NULL(ranges);
6024 14 : int n = ranges->length();
6025 14 : if (n <= 1) return true;
6026 14 : int max = ranges->at(0).to();
6027 420 : for (int i = 1; i < n; i++) {
6028 406 : CharacterRange next_range = ranges->at(i);
6029 406 : if (next_range.from() <= max + 1) return false;
6030 : max = next_range.to();
6031 : }
6032 : return true;
6033 : }
6034 :
6035 :
6036 2212529 : ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
6037 2212529 : if (ranges_ == NULL) {
6038 88395 : ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
6039 88395 : CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
6040 : }
6041 2212529 : return ranges_;
6042 : }
6043 :
6044 :
6045 : // Move a number of elements in a zonelist to another position
6046 : // in the same list. Handles overlapping source and target areas.
6047 113987 : static void MoveRanges(ZoneList<CharacterRange>* list,
6048 : int from,
6049 : int to,
6050 : int count) {
6051 : // Ranges are potentially overlapping.
6052 113987 : if (from < to) {
6053 11651852 : for (int i = count - 1; i >= 0; i--) {
6054 34676229 : list->at(to + i) = list->at(from + i);
6055 : }
6056 : } else {
6057 6394988 : for (int i = 0; i < count; i++) {
6058 19184964 : list->at(to + i) = list->at(from + i);
6059 : }
6060 : }
6061 113987 : }
6062 :
6063 :
6064 188829 : static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6065 : int count,
6066 : CharacterRange insert) {
6067 : // Inserts a range into list[0..count[, which must be sorted
6068 : // by from value and non-overlapping and non-adjacent, using at most
6069 : // list[0..count] for the result. Returns the number of resulting
6070 : // canonicalized ranges. Inserting a range may collapse existing ranges into
6071 : // fewer ranges, so the return value can be anything in the range 1..count+1.
6072 188829 : uc32 from = insert.from();
6073 188829 : uc32 to = insert.to();
6074 : int start_pos = 0;
6075 : int end_pos = count;
6076 23727636 : for (int i = count - 1; i >= 0; i--) {
6077 23628799 : CharacterRange current = list->at(i);
6078 23628799 : if (current.from() > to + 1) {
6079 : end_pos = i;
6080 179830 : } else if (current.to() + 1 < from) {
6081 89992 : start_pos = i + 1;
6082 : break;
6083 : }
6084 : }
6085 :
6086 : // Inserted range overlaps, or is adjacent to, ranges at positions
6087 : // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6088 : // not affected by the insertion.
6089 : // If start_pos == end_pos, the range must be inserted before start_pos.
6090 : // if start_pos < end_pos, the entire range from start_pos to end_pos
6091 : // must be merged with the insert range.
6092 :
6093 188829 : if (start_pos == end_pos) {
6094 : // Insert between existing ranges at position start_pos.
6095 120391 : if (start_pos < count) {
6096 93109 : MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6097 : }
6098 120391 : list->at(start_pos) = insert;
6099 120391 : return count + 1;
6100 : }
6101 68438 : if (start_pos + 1 == end_pos) {
6102 : // Replace single existing range at position start_pos.
6103 47344 : CharacterRange to_replace = list->at(start_pos);
6104 : int new_from = Min(to_replace.from(), from);
6105 : int new_to = Max(to_replace.to(), to);
6106 47344 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6107 : return count;
6108 : }
6109 : // Replace a number of existing ranges from start_pos to end_pos - 1.
6110 : // Move the remaining ranges down.
6111 :
6112 21094 : int new_from = Min(list->at(start_pos).from(), from);
6113 42188 : int new_to = Max(list->at(end_pos - 1).to(), to);
6114 21094 : if (end_pos < count) {
6115 20878 : MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6116 : }
6117 21094 : list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6118 21094 : return count - (end_pos - start_pos) + 1;
6119 : }
6120 :
6121 :
6122 28 : void CharacterSet::Canonicalize() {
6123 : // Special/default classes are always considered canonical. The result
6124 : // of calling ranges() will be sorted.
6125 201280 : if (ranges_ == NULL) return;
6126 113137 : CharacterRange::Canonicalize(ranges_);
6127 : }
6128 :
6129 :
6130 585106 : void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6131 585106 : if (character_ranges->length() <= 1) return;
6132 : // Check whether ranges are already canonical (increasing, non-overlapping,
6133 : // non-adjacent).
6134 : int n = character_ranges->length();
6135 100285 : int max = character_ranges->at(0).to();
6136 : int i = 1;
6137 1704636 : while (i < n) {
6138 1518326 : CharacterRange current = character_ranges->at(i);
6139 1518326 : if (current.from() <= max + 1) {
6140 : break;
6141 : }
6142 : max = current.to();
6143 1504066 : i++;
6144 : }
6145 : // Canonical until the i'th range. If that's all of them, we are done.
6146 100285 : if (i == n) return;
6147 :
6148 : // The ranges at index i and forward are not canonicalized. Make them so by
6149 : // doing the equivalent of insertion sort (inserting each into the previous
6150 : // list, in order).
6151 : // Notice that inserting a range can reduce the number of ranges in the
6152 : // result due to combining of adjacent and overlapping ranges.
6153 : int read = i; // Range to insert.
6154 : int num_canonical = i; // Length of canonicalized part of list.
6155 188829 : do {
6156 : num_canonical = InsertRangeInCanonicalList(character_ranges,
6157 : num_canonical,
6158 188829 : character_ranges->at(read));
6159 188829 : read++;
6160 : } while (read < n);
6161 : character_ranges->Rewind(num_canonical);
6162 :
6163 : DCHECK(CharacterRange::IsCanonical(character_ranges));
6164 : }
6165 :
6166 :
6167 189 : void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6168 : ZoneList<CharacterRange>* negated_ranges,
6169 : Zone* zone) {
6170 : DCHECK(CharacterRange::IsCanonical(ranges));
6171 : DCHECK_EQ(0, negated_ranges->length());
6172 189 : int range_count = ranges->length();
6173 : uc32 from = 0;
6174 : int i = 0;
6175 378 : if (range_count > 0 && ranges->at(0).from() == 0) {
6176 32 : from = ranges->at(0).to() + 1;
6177 : i = 1;
6178 : }
6179 8219 : while (i < range_count) {
6180 8030 : CharacterRange range = ranges->at(i);
6181 16060 : negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6182 8030 : from = range.to() + 1;
6183 8030 : i++;
6184 : }
6185 189 : if (from < String::kMaxCodePoint) {
6186 : negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6187 290 : zone);
6188 : }
6189 189 : }
6190 :
6191 :
6192 : // -------------------------------------------------------------------
6193 : // Splay tree
6194 :
6195 :
6196 565258 : OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6197 272098 : if (Get(value))
6198 : return this;
6199 272091 : if (successors(zone) != NULL) {
6200 207466 : for (int i = 0; i < successors(zone)->length(); i++) {
6201 458488 : OutSet* successor = successors(zone)->at(i);
6202 458488 : if (successor->Get(value))
6203 : return successor;
6204 : }
6205 : } else {
6206 7054 : successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6207 : }
6208 21069 : OutSet* result = new(zone) OutSet(first_, remaining_);
6209 21069 : result->Set(value, zone);
6210 : successors(zone)->Add(result, zone);
6211 21069 : return result;
6212 : }
6213 :
6214 :
6215 983252 : void OutSet::Set(unsigned value, Zone *zone) {
6216 983252 : if (value < kFirstLimit) {
6217 493969 : first_ |= (1 << value);
6218 : } else {
6219 489283 : if (remaining_ == NULL)
6220 132833 : remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6221 845733 : if (remaining_->is_empty() || !remaining_->Contains(value))
6222 : remaining_->Add(value, zone);
6223 : }
6224 983252 : }
6225 :
6226 :
6227 44156534 : bool OutSet::Get(unsigned value) const {
6228 44156534 : if (value < kFirstLimit) {
6229 9352136 : return (first_ & (1 << value)) != 0;
6230 34804398 : } else if (remaining_ == NULL) {
6231 : return false;
6232 : } else {
6233 46719360 : return remaining_->Contains(value);
6234 : }
6235 : }
6236 :
6237 :
// Out-of-class definition of the dispatch table config's kNoKey constant,
// set to unibrow::Utf8::kBadChar.
// NOTE(review): presumably the "no key" sentinel required by the splay tree
// config -- confirm against the ZoneSplayTree Config contract.
6238 : const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6239 :
6240 :
6241 102372 : void DispatchTable::AddRange(CharacterRange full_range, int value,
6242 : Zone* zone) {
6243 102372 : CharacterRange current = full_range;
6244 102372 : if (tree()->is_empty()) {
6245 : // If this is the first range we just insert into the table.
6246 : ZoneSplayTree<Config>::Locator loc;
6247 3247 : bool inserted = tree()->Insert(current.from(), &loc);
6248 : DCHECK(inserted);
6249 : USE(inserted);
6250 : loc.set_value(Entry(current.from(), current.to(),
6251 3247 : empty()->Extend(value, zone)));
6252 102372 : return;
6253 : }
6254 : // First see if there is a range to the left of this one that
6255 : // overlaps.
6256 : ZoneSplayTree<Config>::Locator loc;
6257 99125 : if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6258 188508 : Entry* entry = &loc.value();
6259 : // If we've found a range that overlaps with this one, and it
6260 : // starts strictly to the left of this one, we have to fix it
6261 : // because the following code only handles ranges that start on
6262 : // or after the start point of the range we're adding.
6263 187388 : if (entry->from() < current.from() && entry->to() >= current.from()) {
6264 : // Snap the overlapping range in half around the start point of
6265 : // the range we're adding.
6266 : CharacterRange left =
6267 560 : CharacterRange::Range(entry->from(), current.from() - 1);
6268 : CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6269 : // The left part of the overlapping range doesn't overlap.
6270 : // Truncate the whole entry to be just the left part.
6271 : entry->set_to(left.to());
6272 : // The right part is the one that overlaps. We add this part
6273 : // to the map and let the next step deal with merging it with
6274 : // the range we're adding.
6275 : ZoneSplayTree<Config>::Locator loc;
6276 560 : bool inserted = tree()->Insert(right.from(), &loc);
6277 : DCHECK(inserted);
6278 : USE(inserted);
6279 : loc.set_value(Entry(right.from(),
6280 : right.to(),
6281 : entry->out_set()));
6282 : }
6283 : }
6284 192933 : while (current.is_valid()) {
6285 469090 : if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6286 377375 : (loc.value().from() <= current.to()) &&
6287 93808 : (loc.value().to() >= current.from())) {
6288 370659 : Entry* entry = &loc.value();
6289 : // We have overlap. If there is space between the start point of
6290 : // the range we're adding and where the overlapping range starts
6291 : // then we have to add a range covering just that space.
6292 93808 : if (current.from() < entry->from()) {
6293 : ZoneSplayTree<Config>::Locator ins;
6294 83328 : bool inserted = tree()->Insert(current.from(), &ins);
6295 : DCHECK(inserted);
6296 : USE(inserted);
6297 : ins.set_value(Entry(current.from(),
6298 : entry->from() - 1,
6299 166656 : empty()->Extend(value, zone)));
6300 : current.set_from(entry->from());
6301 : }
6302 : DCHECK_EQ(current.from(), entry->from());
6303 : // If the overlapping range extends beyond the one we want to add
6304 : // we have to snap the right part off and add it separately.
6305 93808 : if (entry->to() > current.to()) {
6306 : ZoneSplayTree<Config>::Locator ins;
6307 5907 : bool inserted = tree()->Insert(current.to() + 1, &ins);
6308 : DCHECK(inserted);
6309 : USE(inserted);
6310 : ins.set_value(Entry(current.to() + 1,
6311 : entry->to(),
6312 : entry->out_set()));
6313 : entry->set_to(current.to());
6314 : }
6315 : DCHECK(entry->to() <= current.to());
6316 : // The overlapping range is now completely contained by the range
6317 : // we're adding so we can just update it and move the start point
6318 : // of the range we're adding just past it.
6319 : entry->AddValue(value, zone);
6320 : DCHECK(entry->to() + 1 > current.from());
6321 93808 : current.set_from(entry->to() + 1);
6322 : } else {
6323 : // There is no overlap so we can just add the range
6324 : ZoneSplayTree<Config>::Locator ins;
6325 91715 : bool inserted = tree()->Insert(current.from(), &ins);
6326 : DCHECK(inserted);
6327 : USE(inserted);
6328 : ins.set_value(Entry(current.from(),
6329 : current.to(),
6330 91715 : empty()->Extend(value, zone)));
6331 : break;
6332 : }
6333 : }
6334 : }
6335 :
6336 :
6337 77014 : OutSet* DispatchTable::Get(uc32 value) {
6338 : ZoneSplayTree<Config>::Locator loc;
6339 77014 : if (!tree()->FindGreatestLessThan(value, &loc))
6340 0 : return empty();
6341 131453 : Entry* entry = &loc.value();
6342 77014 : if (value <= entry->to())
6343 54439 : return entry->out_set();
6344 : else
6345 22575 : return empty();
6346 : }
6347 :
6348 :
6349 : // -------------------------------------------------------------------
6350 : // Analysis
6351 :
6352 :
6353 1340339 : void Analysis::EnsureAnalyzed(RegExpNode* that) {
6354 : StackLimitCheck check(isolate());
6355 1340339 : if (check.HasOverflowed()) {
6356 : fail("Stack overflow");
6357 : return;
6358 : }
6359 1339898 : if (that->info()->been_analyzed || that->info()->being_analyzed)
6360 : return;
6361 1116810 : that->info()->being_analyzed = true;
6362 1116810 : that->Accept(this);
6363 1116810 : that->info()->being_analyzed = false;
6364 1116810 : that->info()->been_analyzed = true;
6365 : }
6366 :
6367 :
6368 95682 : void Analysis::VisitEnd(EndNode* that) {
6369 : // nothing to do
6370 95682 : }
6371 :
6372 :
6373 747987 : void TextNode::CalculateOffsets() {
6374 356864 : int element_count = elements()->length();
6375 : // Set up the offsets of the elements relative to the start. This is a fixed
6376 : // quantity since a TextNode can only contain fixed-width things.
6377 : int cp_offset = 0;
6378 747987 : for (int i = 0; i < element_count; i++) {
6379 : TextElement& elm = elements()->at(i);
6380 : elm.set_cp_offset(cp_offset);
6381 391123 : cp_offset += elm.length();
6382 : }
6383 356864 : }
6384 :
6385 :
6386 875050 : void Analysis::VisitText(TextNode* that) {
6387 359639 : if (ignore_case()) {
6388 311544 : that->MakeCaseIndependent(isolate(), is_one_byte_);
6389 : }
6390 359639 : EnsureAnalyzed(that->on_success());
6391 359639 : if (!has_failed()) {
6392 356864 : that->CalculateOffsets();
6393 : }
6394 359639 : }
6395 :
6396 :
6397 791766 : void Analysis::VisitAction(ActionNode* that) {
6398 395883 : RegExpNode* target = that->on_success();
6399 395883 : EnsureAnalyzed(target);
6400 395883 : if (!has_failed()) {
6401 : // If the next node is interested in what it follows then this node
6402 : // has to be interested too so it can pass the information on.
6403 : that->info()->AddFromFollowing(target->info());
6404 : }
6405 395883 : }
6406 :
6407 :
6408 377230 : void Analysis::VisitChoice(ChoiceNode* that) {
6409 : NodeInfo* info = that->info();
6410 377230 : for (int i = 0; i < that->alternatives()->length(); i++) {
6411 151901 : RegExpNode* node = that->alternatives()->at(i).node();
6412 151901 : EnsureAnalyzed(node);
6413 188615 : if (has_failed()) return;
6414 : // Anything the following nodes need to know has to be known by
6415 : // this node also, so it can pass it on.
6416 : info->AddFromFollowing(node->info());
6417 : }
6418 : }
6419 :
6420 :
// Analysis step for a loop choice node. All alternatives other than the loop
// body are analyzed first (bailing out on the first failure); the loop body
// is analyzed last. Node info from each analyzed alternative is merged into
// this node's info.
6421 1199181 : void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6422 : NodeInfo* info = that->info();
6423 1087570 : for (int i = 0; i < that->alternatives()->length(); i++) {
6424 976168 : RegExpNode* node = that->alternatives()->at(i).node();
// The loop body is deliberately skipped here; it is handled after the loop.
6425 432383 : if (node != that->loop_node()) {
6426 216296 : EnsureAnalyzed(node);
6427 432592 : if (has_failed()) return;
6428 : info->AddFromFollowing(node->info());
6429 : }
6430 : }
6431 : // Check the loop last since it may need the value of this node
6432 : // to get a correct result.
6433 111402 : EnsureAnalyzed(that->loop_node());
6434 111402 : if (!has_failed()) {
6435 : info->AddFromFollowing(that->loop_node()->info());
6436 : }
6437 : }
6438 :
6439 :
// Analysis step for a back reference: nothing is done for the node itself,
// only its successor is analyzed.
6440 3008 : void Analysis::VisitBackReference(BackReferenceNode* that) {
6441 3008 : EnsureAnalyzed(that->on_success());
6442 3008 : }
6443 :
6444 :
// Analysis step for an assertion: nothing is done for the node itself, only
// its successor is analyzed.
6445 9588 : void Analysis::VisitAssertion(AssertionNode* that) {
6446 9588 : EnsureAnalyzed(that->on_success());
6447 9588 : }
6448 :
6449 :
// Boyer-Moore lookahead info for a back reference. The node makes no attempt
// to narrow down the possible characters: it calls bm->SetRest(offset) and
// then hands the result to SaveBMInfo. The isolate and budget parameters are
// not used by this override.
6450 207 : void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6451 : BoyerMooreLookahead* bm,
6452 : bool not_at_start) {
6453 : // Working out the set of characters that a backreference can match is too
6454 : // hard, so we just say that any character can match.
6455 : bm->SetRest(offset);
6456 : SaveBMInfo(bm, not_at_start, offset);
6457 207 : }
6458 :
6459 :
// Compile-time check that the Boyer-Moore position map and the macro
// assembler table are declared with the same size.
6460 : STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6461 : RegExpMacroAssembler::kTableSize);
6462 :
6463 :
// Boyer-Moore lookahead info for a choice node. One unit of budget is charged
// for this node and the remainder is split evenly between the alternatives,
// each of which fills in its own info at the same offset. As soon as an
// alternative with a non-empty guard list is seen, the node gives up: it
// calls bm->SetRest(offset), saves the info and returns without visiting the
// remaining alternatives.
6464 19651 : void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6465 19651 : BoyerMooreLookahead* bm, bool not_at_start) {
6466 : ZoneList<GuardedAlternative>* alts = alternatives();
// NOTE(review): divides by alts->length(); assumes a choice node always has
// at least one alternative - confirm against the node constructors.
6467 92671 : budget = (budget - 1) / alts->length();
6468 146040 : for (int i = 0; i < alts->length(); i++) {
6469 107019 : GuardedAlternative& alt = alts->at(i);
6470 53650 : if (alt.guards() != NULL && alt.guards()->length() != 0) {
6471 : bm->SetRest(offset); // Give up trying to fill in info.
6472 : SaveBMInfo(bm, not_at_start, offset);
6473 19651 : return;
6474 : }
6475 53369 : alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6476 : }
6477 : SaveBMInfo(bm, not_at_start, offset);
6478 : }
6479 :
6480 :
// Boyer-Moore lookahead info for a text node, starting at lookahead position
// initial_offset. The elements are walked in order and, for each character
// position, the characters (for atoms) or character ranges (for character
// classes) that may occur there are recorded in bm. The walk stops as soon as
// the position reaches bm->length(). If the elements are exhausted before
// that, the successor continues filling in info with budget - 1 and
// not_at_start == true. On every exit path except the very first check, the
// info is stored on this node via set_bm_info, but only when
// initial_offset == 0.
6481 146765 : void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6482 1261095 : BoyerMooreLookahead* bm, bool not_at_start) {
6483 146765 : if (initial_offset >= bm->length()) return;
6484 : int offset = initial_offset;
6485 : int max_char = bm->max_char();
6486 587778 : for (int i = 0; i < elements()->length(); i++) {
6487 166700 : if (offset >= bm->length()) {
6488 134180 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6489 : return;
6490 : }
6491 152304 : TextElement text = elements()->at(i);
6492 152304 : if (text.text_type() == TextElement::ATOM) {
6493 : RegExpAtom* atom = text.atom();
// An atom contributes one lookahead position per character.
6494 123870 : for (int j = 0; j < atom->length(); j++, offset++) {
6495 50855 : if (offset >= bm->length()) {
6496 5180 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6497 : return;
6498 : }
6499 45675 : uc16 character = atom->data()[j];
6500 45675 : if (bm->compiler()->ignore_case()) {
// Case-insensitive: record every case variant returned by
// GetCaseIndependentLetters at this position.
6501 : unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6502 : int length = GetCaseIndependentLetters(
6503 : isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6504 5974 : chars);
// NOTE(review): this inner `j` shadows the atom index `j` of the enclosing
// loop. It is harmless because the outer `j` is not read inside this block,
// but a distinct name would be clearer.
6505 17048 : for (int j = 0; j < length; j++) {
6506 22148 : bm->Set(offset, chars[j]);
6507 : }
6508 : } else {
// Characters above max_char are not recorded.
6509 79402 : if (character <= max_char) bm->Set(offset, character);
6510 : }
6511 : }
6512 : } else {
6513 : DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6514 : RegExpCharacterClass* char_class = text.char_class();
6515 : ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6516 119784 : if (char_class->is_negated()) {
// No attempt is made to invert the ranges of a negated class; SetAll is
// called for the position instead.
6517 5154 : bm->SetAll(offset);
6518 : } else {
6519 899268 : for (int k = 0; k < ranges->length(); k++) {
6520 1120622 : CharacterRange& range = ranges->at(k);
// Ranges entirely above max_char are skipped; others are clamped to it.
6521 392319 : if (range.from() > max_char) continue;
6522 : int to = Min(max_char, static_cast<int>(range.to()));
6523 221354 : bm->SetInterval(offset, Interval(range.from(), to));
6524 : }
6525 : }
// A character class always occupies exactly one lookahead position here.
6526 119784 : offset++;
6527 : }
6528 : }
6529 127189 : if (offset >= bm->length()) {
6530 99226 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6531 : return;
6532 : }
6533 27963 : on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6534 27963 : true); // Not at start after a text node.
6535 27963 : if (initial_offset == 0) set_bm_info(not_at_start, bm);
6536 : }
6537 :
6538 :
6539 : // -------------------------------------------------------------------
6540 : // Dispatch table construction
6541 :
6542 :
// An end node contributes CharacterRange::Everything() to the table under
// construction.
6543 0 : void DispatchTableConstructor::VisitEnd(EndNode* that) {
6544 : AddRange(CharacterRange::Everything());
6545 0 : }
6546 :
6547 :
// Visits every alternative of the given choice node with this constructor,
// recording the alternative's index through set_choice_index before each
// visit. The node is flagged as "being calculated" for the duration of the
// walk; DispatchTableConstructor::VisitChoice checks that flag and returns
// early for such a node.
6548 0 : void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6549 : node->set_being_calculated(true);
6550 : ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6551 0 : for (int i = 0; i < alternatives->length(); i++) {
6552 : set_choice_index(i);
6553 0 : alternatives->at(i).node()->Accept(this);
6554 : }
6555 : node->set_being_calculated(false);
6556 0 : }
6557 :
6558 :
// Callback object that holds a DispatchTableConstructor and exposes a
// Call(from, entry) method (defined out of line). It is passed to
// DispatchTable::ForEach in DispatchTableConstructor::VisitChoice.
6559 : class AddDispatchRange {
6560 : public:
6561 : explicit AddDispatchRange(DispatchTableConstructor* constructor)
6562 0 : : constructor_(constructor) { }
6563 : void Call(uc32 from, DispatchTable::Entry entry);
6564 : private:
// Constructor that receives the ranges; stored as a raw pointer.
6565 : DispatchTableConstructor* constructor_;
6566 : };
6567 :
6568 :
// Adds the range running from `from` to entry.to() to the wrapped
// constructor.
6569 0 : void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6570 0 : constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6571 0 : }
6572 :
6573 :
// For a nested choice node: does nothing if that node's table is currently
// being calculated (see BuildTable); otherwise obtains the node's dispatch
// table and adds the range of every entry in it to this constructor.
6574 0 : void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6575 0 : if (node->being_calculated())
6576 0 : return;
6577 0 : DispatchTable* table = node->GetTable(ignore_case_);
6578 : AddDispatchRange adder(this);
6579 : table->ForEach(&adder);
6580 : }
6581 :
6582 :
// A back reference contributes CharacterRange::Everything(); the node itself
// is not inspected.
6583 0 : void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6584 : // TODO(160): Find the node that we refer back to and propagate its start
6585 : // set back to here. For now we just accept anything.
6586 : AddRange(CharacterRange::Everything());
6587 0 : }
6588 :
6589 :
// An assertion adds nothing itself; the visit is forwarded to its successor.
6590 0 : void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6591 0 : RegExpNode* target = that->on_success();
6592 0 : target->Accept(this);
6593 0 : }
6594 :
6595 :
// Three-way comparison of two character ranges by their lower bound; used as
// the ordering passed to ZoneList<CharacterRange>::Sort in
// DispatchTableConstructor::AddInverse.
//
// The bounds are compared as uc32 rather than uc16: the ranges handled in
// this file are compared against String::kMaxCodePoint, so a bound may lie
// above 0xFFFF and a 16-bit comparison would truncate it and mis-order the
// ranges.
6596 11018 : static int CompareRangeByFrom(const CharacterRange* a,
6597 5509 : const CharacterRange* b) {
6598 16527 : return Compare<uc32>(a->from(), b->from());
6599 : }
6600 :
6601 :
// Adds the complement of `ranges` to the table under construction.
//
// The list is sorted in place by lower bound and then swept once. `last` is
// the smallest character not yet covered by any range seen so far; every gap
// between `last` and the start of the next range is added, and after the
// sweep the tail [last, String::kMaxCodePoint] is added. If a range reaches
// String::kMaxCodePoint there is no tail and the function returns early.
//
// `last` is a uc32: with a 16-bit variable, `range.to() + 1` wraps to 0 when
// a range ends at 0xFFFF (and is truncated for larger bounds), which would
// make the final AddRange cover characters that the ranges already contain.
6602 77 : void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6603 1204 : ranges->Sort(CompareRangeByFrom);
6604 : uc32 last = 0;
6605 2408 : for (int i = 0; i < ranges->length(); i++) {
6606 1127 : CharacterRange range = ranges->at(i);
// Gap before this range: it belongs to the complement.
6607 1127 : if (last < range.from())
6608 735 : AddRange(CharacterRange::Range(last, range.from() - 1));
// Only advance `last`; a range nested inside an earlier one must not move
// it backwards.
6609 1127 : if (range.to() >= last) {
6610 1001 : if (range.to() == String::kMaxCodePoint) {
6611 77 : return;
6612 : } else {
6613 1001 : last = range.to() + 1;
6614 : }
6615 : }
6616 : }
6617 77 : AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6618 : }
6619 :
6620 :
// Adds the characters that the first element of a text node can match. Only
// element 0 is looked at: for an atom, the single-character range of its
// first character; for a character class, its ranges, or their complement
// (via AddInverse) when the class is negated. Any other element type hits
// UNIMPLEMENTED().
6621 0 : void DispatchTableConstructor::VisitText(TextNode* that) {
6622 0 : TextElement elm = that->elements()->at(0);
6623 0 : switch (elm.text_type()) {
6624 : case TextElement::ATOM: {
6625 0 : uc16 c = elm.atom()->data()[0];
6626 0 : AddRange(CharacterRange::Range(c, c));
6627 : break;
6628 : }
6629 : case TextElement::CHAR_CLASS: {
6630 : RegExpCharacterClass* tree = elm.char_class();
6631 0 : ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6632 0 : if (tree->is_negated()) {
6633 0 : AddInverse(ranges);
6634 : } else {
6635 0 : for (int i = 0; i < ranges->length(); i++)
6636 0 : AddRange(ranges->at(i));
6637 : }
6638 : break;
6639 : }
6640 : default: {
6641 0 : UNIMPLEMENTED();
6642 : }
6643 : }
6644 0 : }
6645 :
6646 :
// An action node adds nothing itself; the visit is forwarded to its
// successor.
6647 0 : void DispatchTableConstructor::VisitAction(ActionNode* that) {
6648 0 : RegExpNode* target = that->on_success();
6649 0 : target->Accept(this);
6650 0 : }
6651 :
6652 :
// Wraps `on_success` in a two-way choice node and returns that node.
//   - First alternative: a lookaround (built with RegExpLookaround::Builder
//     using the compiler's unicode lookaround stack/position registers) whose
//     match is a text node for the trail-surrogate range and whose success
//     continuation is a text node for the lead-surrogate range leading on to
//     `on_success`.
//   - Second alternative: `on_success` directly.
// Must only be used when the compiler is not reading backward (DCHECKed).
// NOTE(review): the meaning of the boolean arguments passed to
// CreateForCharacterRanges and to the Builder constructor is not visible in
// this part of the file - confirm against their declarations.
6653 50 : RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6654 : RegExpNode* on_success) {
6655 : // If the regexp matching starts within a surrogate pair, step back
6656 : // to the lead surrogate and start matching from there.
6657 : DCHECK(!compiler->read_backward());
6658 : Zone* zone = compiler->zone();
6659 : ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6660 50 : zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6661 : ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6662 50 : zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6663 :
6664 50 : ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6665 :
6666 : int stack_register = compiler->UnicodeLookaroundStackRegister();
6667 : int position_register = compiler->UnicodeLookaroundPositionRegister();
6668 : RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6669 50 : zone, lead_surrogates, true, on_success);
6670 : RegExpLookaround::Builder builder(true, step_back, stack_register,
6671 50 : position_register);
6672 : RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6673 50 : zone, trail_surrogates, false, builder.on_match_success());
6674 :
6675 : optional_step_back->AddAlternative(
6676 50 : GuardedAlternative(builder.ForMatch(match_trail)));
6677 : optional_step_back->AddAlternative(GuardedAlternative(on_success));
6678 :
6679 50 : return optional_step_back;
6680 : }
6681 :
6682 :
// Compiles the parsed regular expression in `data` and returns the result of
// RegExpCompiler::Assemble.
//
// Steps, in order:
//   1. Reject patterns whose capture registers would exceed
//      RegExpMacroAssembler::kMaxRegister (returns IrregexpRegExpTooBig).
//   2. Turn optimization off when TooMuchRegExpCode(pattern) holds.
//   3. Feed up to kSampleSize characters from the middle of the flattened
//      sample_subject to the compiler's frequency collator.
//   4. Build the node graph: wrap the body in capture 0; unless the pattern
//      is anchored at the start or sticky, prepend a non-greedy
//      any-character loop; for one-byte subjects run FilterOneByte (twice);
//      otherwise, for unicode patterns that are global or sticky, add the
//      optional step back to a lead surrogate. A NULL graph is replaced by a
//      backtracking end node.
//   5. Run the analysis pass; on failure return a CompilationResult carrying
//      the analysis error message.
//   6. Create the macro assembler (native for the target architecture, or the
//      bytecode assembler when V8_INTERPRETED_REGEXP is defined), configure
//      it and assemble.
//
// The graph root is also stored in data->node.
6683 92637 : RegExpEngine::CompilationResult RegExpEngine::Compile(
6684 : Isolate* isolate, Zone* zone, RegExpCompileData* data,
6685 : JSRegExp::Flags flags, Handle<String> pattern,
6686 : Handle<String> sample_subject, bool is_one_byte) {
// Each capture needs two registers (capture 0 included), numbered from 0.
6687 92637 : if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6688 : return IrregexpRegExpTooBig(isolate);
6689 : }
6690 92622 : bool ignore_case = flags & JSRegExp::kIgnoreCase;
6691 92622 : bool is_sticky = flags & JSRegExp::kSticky;
6692 92622 : bool is_global = flags & JSRegExp::kGlobal;
6693 : bool is_unicode = flags & JSRegExp::kUnicode;
6694 : RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6695 92622 : is_one_byte);
6696 :
6697 92622 : if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6698 :
6699 : // Sample some characters from the middle of the string.
6700 : static const int kSampleSize = 128;
6701 :
6702 92622 : sample_subject = String::Flatten(sample_subject);
6703 : int chars_sampled = 0;
// half_way is negative for subjects shorter than kSampleSize; the Max(0, ...)
// below then starts sampling at the beginning.
6704 92622 : int half_way = (sample_subject->length() - kSampleSize) / 2;
6705 1466240 : for (int i = Max(0, half_way);
6706 733120 : i < sample_subject->length() && chars_sampled < kSampleSize;
6707 : i++, chars_sampled++) {
6708 : compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6709 : }
6710 :
6711 : // Wrap the body of the regexp in capture #0.
6712 : RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6713 : 0,
6714 : &compiler,
6715 92622 : compiler.accept());
6716 : RegExpNode* node = captured_body;
6717 92622 : bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6718 92622 : bool is_start_anchored = data->tree->IsAnchoredAtStart();
6719 92622 : int max_length = data->tree->max_match();
6720 92622 : if (!is_start_anchored && !is_sticky) {
6721 : // Add a .*? at the beginning, outside the body capture, unless
6722 : // this expression is anchored at the beginning or sticky.
6723 : RegExpNode* loop_node = RegExpQuantifier::ToNode(
6724 : 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6725 176230 : &compiler, captured_body, data->contains_anchor);
6726 :
6727 88115 : if (data->contains_anchor) {
6728 : // Unroll loop once, to take care of the case that might start
6729 : // at the start of input.
6730 196 : ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6731 : first_step_node->AddAlternative(GuardedAlternative(captured_body));
6732 : first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6733 196 : new (zone) RegExpCharacterClass('*'), false, loop_node)));
6734 : node = first_step_node;
6735 : } else {
6736 : node = loop_node;
6737 : }
6738 : }
6739 92622 : if (is_one_byte) {
6740 20427 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6741 : // Do it again to propagate the new nodes to places where they were not
6742 : // put because they had not been calculated yet.
6743 20427 : if (node != NULL) {
6744 20071 : node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6745 : }
6746 72195 : } else if (compiler.unicode() && (is_global || is_sticky)) {
6747 50 : node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6748 : }
6749 :
// FilterOneByte may have returned NULL; substitute a node that always
// backtracks so the rest of the pipeline has a graph to work on.
6750 92622 : if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6751 92622 : data->node = node;
6752 : Analysis analysis(isolate, flags, is_one_byte);
6753 92622 : analysis.EnsureAnalyzed(node);
6754 92622 : if (analysis.has_failed()) {
6755 : const char* error_message = analysis.error_message();
6756 : return CompilationResult(isolate, error_message);
6757 : }
6758 :
6759 : // Create the correct assembler for the architecture.
6760 : #ifndef V8_INTERPRETED_REGEXP
6761 : // Native regexp implementation.
6762 :
6763 : NativeRegExpMacroAssembler::Mode mode =
6764 : is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6765 92181 : : NativeRegExpMacroAssembler::UC16;
6766 :
6767 : #if V8_TARGET_ARCH_IA32
6768 : RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6769 : (data->capture_count + 1) * 2);
6770 : #elif V8_TARGET_ARCH_X64
6771 : RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6772 184362 : (data->capture_count + 1) * 2);
6773 : #elif V8_TARGET_ARCH_ARM
6774 : RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6775 : (data->capture_count + 1) * 2);
6776 : #elif V8_TARGET_ARCH_ARM64
6777 : RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6778 : (data->capture_count + 1) * 2);
6779 : #elif V8_TARGET_ARCH_S390
6780 : RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6781 : (data->capture_count + 1) * 2);
6782 : #elif V8_TARGET_ARCH_PPC
6783 : RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6784 : (data->capture_count + 1) * 2);
6785 : #elif V8_TARGET_ARCH_MIPS
6786 : RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6787 : (data->capture_count + 1) * 2);
6788 : #elif V8_TARGET_ARCH_MIPS64
6789 : RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6790 : (data->capture_count + 1) * 2);
6791 : #elif V8_TARGET_ARCH_X87
6792 : RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6793 : (data->capture_count + 1) * 2);
6794 : #else
6795 : #error "Unsupported architecture"
6796 : #endif
6797 :
6798 : #else // V8_INTERPRETED_REGEXP
6799 : // Interpreted regexp implementation.
6800 : EmbeddedVector<byte, 1024> codes;
6801 : RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6802 : #endif // V8_INTERPRETED_REGEXP
6803 :
6804 92181 : macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6805 :
6806 : // Inserted here, instead of in Assembler, because it depends on information
6807 : // in the AST that isn't replicated in the Node structure.
6808 : static const int kMaxBacksearchLimit = 1024;
// For patterns anchored only at the end whose maximal match length is below
// kMaxBacksearchLimit, the start position is set relative to the end of the
// input via SetCurrentPositionFromEnd(max_length).
6809 92701 : if (is_end_anchored && !is_start_anchored && !is_sticky &&
6810 520 : max_length < kMaxBacksearchLimit) {
6811 243 : macro_assembler.SetCurrentPositionFromEnd(max_length);
6812 : }
6813 :
6814 92181 : if (is_global) {
// Patterns with min_match() > 0 use the mode without the zero-length check;
// otherwise unicode patterns get their own global mode.
6815 : RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6816 5093 : if (data->tree->min_match() > 0) {
6817 : mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6818 165 : } else if (is_unicode) {
6819 : mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6820 : }
6821 : macro_assembler.set_global_mode(mode);
6822 : }
6823 :
6824 : return compiler.Assemble(&macro_assembler,
6825 : node,
6826 : data->capture_count,
6827 92181 : pattern);
6828 : }
6829 :
6830 :
// Returns true when either
//   - the pattern is longer than RegExpImpl::kRegExpTooLargeToOptimize, or
//   - the isolate's total generated regexp code exceeds
//     RegExpImpl::kRegExpCompiledLimit AND the heap's committed executable
//     memory exceeds RegExpImpl::kRegExpExecutableMemoryLimit.
// The heap and isolate are reached through the pattern string itself.
6831 183560 : bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6832 : Heap* heap = pattern->GetHeap();
6833 183560 : bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6834 367120 : if (heap->isolate()->total_regexp_code_generated() >
6835 311799 : RegExpImpl::kRegExpCompiledLimit &&
6836 128239 : heap->CommittedMemoryExecutable() >
6837 : RegExpImpl::kRegExpExecutableMemoryLimit) {
6838 : too_much = true;
6839 : }
6840 183560 : return too_much;
6841 : }
6842 :
6843 :
// Looks up a cached result array for (key_string, key_pattern).
//
// Returns Smi::kZero on a miss. A miss is reported immediately when
// key_string is not internalized, or, for STRING_SPLIT_SUBSTRINGS, when
// key_pattern is not internalized. The cache consulted depends on `type`:
// the heap's string split cache or its regexp multiple cache.
//
// Two entries are probed: the one selected by the string hash (masked to the
// cache size and aligned to an entry boundary) and the entry immediately
// after it (wrapping around). Keys are compared by pointer identity. On a hit
// the entry's last-match array is stored in *last_match_cache and the cached
// result array is returned; on a miss *last_match_cache is left untouched.
6844 36699 : Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6845 : Object* key_pattern,
6846 : FixedArray** last_match_cache,
6847 : ResultsCacheType type) {
6848 : FixedArray* cache;
6849 19287 : if (!key_string->IsInternalizedString()) return Smi::kZero;
6850 17412 : if (type == STRING_SPLIT_SUBSTRINGS) {
6851 : DCHECK(key_pattern->IsString());
6852 17412 : if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6853 : cache = heap->string_split_cache();
6854 : } else {
6855 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6856 : DCHECK(key_pattern->IsFixedArray());
6857 : cache = heap->regexp_multiple_cache();
6858 : }
6859 :
6860 : uint32_t hash = key_string->Hash();
// Primary entry: hash reduced to the cache size, rounded down to the start
// of an entry.
6861 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6862 17412 : ~(kArrayEntriesPerCacheEntry - 1));
6863 51749 : if (cache->get(index + kStringOffset) != key_string ||
6864 16925 : cache->get(index + kPatternOffset) != key_pattern) {
// Secondary entry: the next entry, wrapping at the end of the cache.
6865 : index =
6866 753 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6867 1710 : if (cache->get(index + kStringOffset) != key_string ||
6868 204 : cache->get(index + kPatternOffset) != key_pattern) {
6869 : return Smi::kZero;
6870 : }
6871 : }
6872 :
6873 33698 : *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6874 33698 : return cache->get(index + kArrayOffset);
6875 : }
6876 :
6877 :
// Stores value_array (and last_match_cache) in the results cache under
// (key_string, key_pattern).
//
// Nothing is stored when key_string is not internalized, or, for
// STRING_SPLIT_SUBSTRINGS, when key_pattern is not internalized. The cache
// written to depends on `type`, mirroring Lookup.
//
// Placement policy over the two candidate entries (primary entry from the
// string hash, secondary entry right after it, wrapping around):
//   - primary entry free   -> use the primary entry;
//   - secondary entry free -> use the secondary entry;
//   - both occupied        -> clear the secondary entry and overwrite the
//                             primary one.
//
// Afterwards, for STRING_SPLIT_SUBSTRINGS results with fewer than 100
// elements, every element of value_array is replaced by its internalized
// string. Finally value_array's map is switched to the fixed copy-on-write
// array map.
6878 2438 : void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6879 : Handle<Object> key_pattern,
6880 : Handle<FixedArray> value_array,
6881 : Handle<FixedArray> last_match_cache,
6882 : ResultsCacheType type) {
6883 : Factory* factory = isolate->factory();
6884 : Handle<FixedArray> cache;
6885 2438 : if (!key_string->IsInternalizedString()) return;
6886 563 : if (type == STRING_SPLIT_SUBSTRINGS) {
6887 : DCHECK(key_pattern->IsString());
6888 563 : if (!key_pattern->IsInternalizedString()) return;
6889 : cache = factory->string_split_cache();
6890 : } else {
6891 : DCHECK(type == REGEXP_MULTIPLE_INDICES);
6892 : DCHECK(key_pattern->IsFixedArray());
6893 : cache = factory->regexp_multiple_cache();
6894 : }
6895 :
6896 : uint32_t hash = key_string->Hash();
// Same index computation as in Lookup.
6897 : uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6898 563 : ~(kArrayEntriesPerCacheEntry - 1));
// An entry counts as free when its string slot holds Smi::kZero.
6899 1126 : if (cache->get(index + kStringOffset) == Smi::kZero) {
6900 437 : cache->set(index + kStringOffset, *key_string);
6901 874 : cache->set(index + kPatternOffset, *key_pattern);
6902 874 : cache->set(index + kArrayOffset, *value_array);
6903 874 : cache->set(index + kLastMatchOffset, *last_match_cache);
6904 : } else {
6905 : uint32_t index2 =
6906 126 : ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6907 252 : if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6908 90 : cache->set(index2 + kStringOffset, *key_string);
6909 180 : cache->set(index2 + kPatternOffset, *key_pattern);
6910 180 : cache->set(index2 + kArrayOffset, *value_array);
6911 180 : cache->set(index2 + kLastMatchOffset, *last_match_cache);
6912 : } else {
// Both entries are taken: evict the secondary one and reuse the primary.
6913 : cache->set(index2 + kStringOffset, Smi::kZero);
6914 36 : cache->set(index2 + kPatternOffset, Smi::kZero);
6915 36 : cache->set(index2 + kArrayOffset, Smi::kZero);
6916 36 : cache->set(index2 + kLastMatchOffset, Smi::kZero);
6917 36 : cache->set(index + kStringOffset, *key_string);
6918 72 : cache->set(index + kPatternOffset, *key_pattern);
6919 72 : cache->set(index + kArrayOffset, *value_array);
6920 72 : cache->set(index + kLastMatchOffset, *last_match_cache);
6921 : }
6922 : }
6923 : // If the array is a reasonably short list of substrings, convert it into a
6924 : // list of internalized strings.
6925 1126 : if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6926 2427 : for (int i = 0; i < value_array->length(); i++) {
6927 : Handle<String> str(String::cast(value_array->get(i)), isolate);
6928 932 : Handle<String> internalized_str = factory->InternalizeString(str);
6929 932 : value_array->set(i, *internalized_str);
6930 : }
6931 : }
6932 : // Convert backing store to a copy-on-write array.
6933 563 : value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map());
6934 : }
6935 :
6936 :
// Resets the first kRegExpResultsCacheSize slots of `cache` to Smi::kZero,
// which is the value Enter tests for to recognise a free entry.
6937 106692 : void RegExpResultsCache::Clear(FixedArray* cache) {
6938 27419844 : for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6939 : cache->set(i, Smi::kZero);
6940 : }
6941 106692 : }
6942 :
6943 : } // namespace internal
6944 : } // namespace v8
|