Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_JSREGEXP_H_
6 : #define V8_REGEXP_JSREGEXP_H_
7 :
8 : #include "src/allocation.h"
9 : #include "src/isolate.h"
10 : #include "src/objects/js-regexp.h"
11 : #include "src/regexp/regexp-ast.h"
12 : #include "src/regexp/regexp-macro-assembler.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : class NodeVisitor;
18 : class RegExpCompiler;
19 : class RegExpMacroAssembler;
20 : class RegExpNode;
21 : class RegExpTree;
22 : class BoyerMooreLookahead;
23 :
24 : inline bool IgnoreCase(JSRegExp::Flags flags) {
25 : return (flags & JSRegExp::kIgnoreCase) != 0;
26 : }
27 :
28 : inline bool IsUnicode(JSRegExp::Flags flags) {
29 87386 : return (flags & JSRegExp::kUnicode) != 0;
30 : }
31 :
32 : inline bool IsSticky(JSRegExp::Flags flags) {
33 85705 : return (flags & JSRegExp::kSticky) != 0;
34 : }
35 :
36 : inline bool IsGlobal(JSRegExp::Flags flags) {
37 85705 : return (flags & JSRegExp::kGlobal) != 0;
38 : }
39 :
40 : inline bool DotAll(JSRegExp::Flags flags) {
41 : return (flags & JSRegExp::kDotAll) != 0;
42 : }
43 :
44 : inline bool Multiline(JSRegExp::Flags flags) {
45 : return (flags & JSRegExp::kMultiline) != 0;
46 : }
47 :
48 : inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
49 : // Both unicode and ignore_case flags are set. We need to use ICU to find
50 : // the closure over case equivalents.
51 419756 : return IsUnicode(flags) && IgnoreCase(flags);
52 : }
53 :
54 : class RegExpImpl {
55 : public:
56 : // Whether V8 is compiled with native regexp support or not.
57 : static bool UsesNativeRegExp() {
58 : #ifdef V8_INTERPRETED_REGEXP
59 : return false;
60 : #else
61 : return true;
62 : #endif
63 : }
64 :
65 : // Returns a string representation of a regular expression.
66 : // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
67 : // This function calls the garbage collector if necessary.
68 : static Handle<String> ToString(Handle<Object> value);
69 :
70 : // Parses the RegExp pattern and prepares the JSRegExp object with
71 : // generic data and choice of implementation - as well as what
72 : // the implementation wants to store in the data field.
73 : // Returns false if compilation fails.
74 : V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
75 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
76 : JSRegExp::Flags flags);
77 :
78 : // See ECMA-262 section 15.10.6.2.
79 : // This function calls the garbage collector if necessary.
80 : V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
81 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
82 : int index, Handle<RegExpMatchInfo> last_match_info);
83 :
84 : // Prepares a JSRegExp object with Irregexp-specific data.
85 : static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
86 : Handle<String> pattern, JSRegExp::Flags flags,
87 : int capture_register_count);
88 :
89 : static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
90 : Handle<String> pattern, JSRegExp::Flags flags,
91 : Handle<String> match_pattern);
92 :
93 : static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
94 : Handle<String> subject, int index, int32_t* output,
95 : int output_size);
96 :
97 : static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
98 : Handle<String> subject, int index,
99 : Handle<RegExpMatchInfo> last_match_info);
100 :
101 : enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
102 :
103 : // Prepare a RegExp for being executed one or more times (using
104 : // IrregexpExecOnce) on the subject.
105 : // This ensures that the regexp is compiled for the subject, and that
106 : // the subject is flat.
107 : // Returns the number of integer spaces required by IrregexpExecOnce
108 : // as its "registers" argument. If the regexp cannot be compiled,
109 : // an exception is set as pending, and this function returns negative.
110 : static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
111 : Handle<String> subject);
112 :
113 : // Execute a regular expression on the subject, starting from index.
114 : // If matching succeeds, return the number of matches. This can be larger
115 : // than one in the case of global regular expressions.
116 : // The captures and subcaptures are stored into the registers vector.
117 : // If matching fails, returns RE_FAILURE.
118 : // If execution fails, sets a pending exception and returns RE_EXCEPTION.
119 : static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
120 : Handle<String> subject, int index, int32_t* output,
121 : int output_size);
122 :
123 : // Execute an Irregexp bytecode pattern.
124 : // On a successful match, the result is a JSArray containing
125 : // captured positions. On a failure, the result is the null value.
126 : // Returns an empty handle in case of an exception.
127 : V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
128 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
129 : int index, Handle<RegExpMatchInfo> last_match_info);
130 :
131 : // Set last match info. If match is nullptr, then setting captures is
132 : // omitted.
133 : static Handle<RegExpMatchInfo> SetLastMatchInfo(
134 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
135 : Handle<String> subject, int capture_count, int32_t* match);
136 :
137 : class GlobalCache {
138 : public:
139 : GlobalCache(Handle<JSRegExp> regexp,
140 : Handle<String> subject,
141 : Isolate* isolate);
142 :
143 : V8_INLINE ~GlobalCache();
144 :
145 : // Fetch the next entry in the cache for global regexp match results.
146 : // This does not set the last match info. Upon failure, nullptr is
147 : // returned. The cause can be checked with Result(). The previous result is
148 : // still in available in memory when a failure happens.
149 : V8_INLINE int32_t* FetchNext();
150 :
151 : V8_INLINE int32_t* LastSuccessfulMatch();
152 :
153 : V8_INLINE bool HasException() { return num_matches_ < 0; }
154 :
155 : private:
156 : int AdvanceZeroLength(int last_index);
157 :
158 : int num_matches_;
159 : int max_matches_;
160 : int current_match_index_;
161 : int registers_per_match_;
162 : // Pointer to the last set of captures.
163 : int32_t* register_array_;
164 : int register_array_size_;
165 : Handle<JSRegExp> regexp_;
166 : Handle<String> subject_;
167 : Isolate* isolate_;
168 : };
169 :
170 : // For acting on the JSRegExp data FixedArray.
171 : static int IrregexpMaxRegisterCount(FixedArray re);
172 : static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
173 : static void SetIrregexpCaptureNameMap(FixedArray re,
174 : Handle<FixedArray> value);
175 : static int IrregexpNumberOfCaptures(FixedArray re);
176 : static int IrregexpNumberOfRegisters(FixedArray re);
177 : static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
178 : static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
179 :
180 : // Limit the space regexps take up on the heap. In order to limit this we
181 : // would like to keep track of the amount of regexp code on the heap. This
182 : // is not tracked, however. As a conservative approximation we track the
183 : // total regexp code compiled including code that has subsequently been freed
184 : // and the total executable memory at any point.
185 : static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
186 : static const size_t kRegExpCompiledLimit = 1 * MB;
187 : static const int kRegExpTooLargeToOptimize = 20 * KB;
188 :
189 : private:
190 : static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
191 : Handle<String> sample_subject, bool is_one_byte);
192 : static inline bool EnsureCompiledIrregexp(Isolate* isolate,
193 : Handle<JSRegExp> re,
194 : Handle<String> sample_subject,
195 : bool is_one_byte);
196 : };
197 :
198 :
199 : // Represents the location of one element relative to the intersection of
200 : // two sets. Corresponds to the four areas of a Venn diagram.
201 : enum ElementInSetsRelation {
202 : kInsideNone = 0,
203 : kInsideFirst = 1,
204 : kInsideSecond = 2,
205 : kInsideBoth = 3
206 : };
207 :
208 :
209 : // A set of unsigned integers that behaves especially well on small
210 : // integers (< 32). May do zone-allocation.
211 : class OutSet: public ZoneObject {
212 : public:
213 1023154 : OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
214 : OutSet* Extend(unsigned value, Zone* zone);
215 : bool Get(unsigned value) const;
216 : static const unsigned kFirstLimit = 32;
217 :
218 : private:
219 : // Destructively set a value in this set. In most cases you want
220 : // to use Extend instead to ensure that only one instance exists
221 : // that contains the same values.
222 : void Set(unsigned value, Zone* zone);
223 :
224 : // The successors are a list of sets that contain the same values
225 : // as this set and the one more value that is not present in this
226 : // set.
227 : ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
228 :
229 : OutSet(uint32_t first, ZoneList<unsigned>* remaining)
230 16962 : : first_(first), remaining_(remaining), successors_(nullptr) {}
231 : uint32_t first_;
232 : ZoneList<unsigned>* remaining_;
233 : ZoneList<OutSet*>* successors_;
234 : friend class Trace;
235 : };
236 :
237 :
238 : // A mapping from integers, specified as ranges, to a set of integers.
239 : // Used for mapping character ranges to choices.
240 : class DispatchTable : public ZoneObject {
241 : public:
242 : explicit DispatchTable(Zone* zone) : tree_(zone) { }
243 :
244 : class Entry {
245 : public:
246 : Entry() : from_(0), to_(0), out_set_(nullptr) {}
247 : Entry(uc32 from, uc32 to, OutSet* out_set)
248 : : from_(from), to_(to), out_set_(out_set) {
249 : DCHECK(from <= to);
250 : }
251 : uc32 from() { return from_; }
252 : uc32 to() { return to_; }
253 4830 : void set_to(uc32 value) { to_ = value; }
254 : void AddValue(int value, Zone* zone) {
255 80873 : out_set_ = out_set_->Extend(value, zone);
256 : }
257 : OutSet* out_set() { return out_set_; }
258 : private:
259 : uc32 from_;
260 : uc32 to_;
261 : OutSet* out_set_;
262 : };
263 :
264 : class Config {
265 : public:
266 : typedef uc32 Key;
267 : typedef Entry Value;
268 : static const uc32 kNoKey;
269 : static const Entry NoValue() { return Value(); }
270 : static inline int Compare(uc32 a, uc32 b) {
271 1570085 : if (a == b)
272 : return 0;
273 1537295 : else if (a < b)
274 : return -1;
275 : else
276 : return 1;
277 : }
278 : };
279 :
280 : void AddRange(CharacterRange range, int value, Zone* zone);
281 : OutSet* Get(uc32 value);
282 : void Dump();
283 :
284 : template <typename Callback>
285 : void ForEach(Callback* callback) {
286 2582 : return tree()->ForEach(callback);
287 : }
288 :
289 : private:
290 : // There can't be a static empty set since it allocates its
291 : // successors in a zone and caches them.
292 : OutSet* empty() { return &empty_; }
293 : OutSet empty_;
294 : ZoneSplayTree<Config>* tree() { return &tree_; }
295 : ZoneSplayTree<Config> tree_;
296 : };
297 :
298 :
299 : // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
300 : class UnicodeRangeSplitter {
301 : public:
302 : UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
303 : void Call(uc32 from, DispatchTable::Entry entry);
304 :
305 : ZoneList<CharacterRange>* bmp() { return bmp_; }
306 : ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
307 : ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
308 : ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
309 :
310 : private:
311 : static const int kBase = 0;
312 : // Separate ranges into
313 : static const int kBmpCodePoints = 1;
314 : static const int kLeadSurrogates = 2;
315 : static const int kTrailSurrogates = 3;
316 : static const int kNonBmpCodePoints = 4;
317 :
318 : Zone* zone_;
319 : DispatchTable table_;
320 : ZoneList<CharacterRange>* bmp_;
321 : ZoneList<CharacterRange>* lead_surrogates_;
322 : ZoneList<CharacterRange>* trail_surrogates_;
323 : ZoneList<CharacterRange>* non_bmp_;
324 : };
325 :
326 :
327 : #define FOR_EACH_NODE_TYPE(VISIT) \
328 : VISIT(End) \
329 : VISIT(Action) \
330 : VISIT(Choice) \
331 : VISIT(BackReference) \
332 : VISIT(Assertion) \
333 : VISIT(Text)
334 :
335 :
336 : class Trace;
337 : struct PreloadState;
338 : class GreedyLoopState;
339 : class AlternativeGenerationList;
340 :
341 : struct NodeInfo {
342 : NodeInfo()
343 : : being_analyzed(false),
344 : been_analyzed(false),
345 : follows_word_interest(false),
346 : follows_newline_interest(false),
347 : follows_start_interest(false),
348 : at_end(false),
349 : visited(false),
350 4381730 : replacement_calculated(false) { }
351 :
352 : // Returns true if the interests and assumptions of this node
353 : // matches the given one.
354 : bool Matches(NodeInfo* that) {
355 : return (at_end == that->at_end) &&
356 : (follows_word_interest == that->follows_word_interest) &&
357 : (follows_newline_interest == that->follows_newline_interest) &&
358 : (follows_start_interest == that->follows_start_interest);
359 : }
360 :
361 : // Updates the interests of this node given the interests of the
362 : // node preceding it.
363 : void AddFromPreceding(NodeInfo* that) {
364 : at_end |= that->at_end;
365 : follows_word_interest |= that->follows_word_interest;
366 : follows_newline_interest |= that->follows_newline_interest;
367 : follows_start_interest |= that->follows_start_interest;
368 : }
369 :
370 : bool HasLookbehind() {
371 : return follows_word_interest ||
372 : follows_newline_interest ||
373 : follows_start_interest;
374 : }
375 :
376 : // Sets the interests of this node to include the interests of the
377 : // following node.
378 : void AddFromFollowing(NodeInfo* that) {
379 566759 : follows_word_interest |= that->follows_word_interest;
380 566759 : follows_newline_interest |= that->follows_newline_interest;
381 566759 : follows_start_interest |= that->follows_start_interest;
382 : }
383 :
384 : void ResetCompilationState() {
385 : being_analyzed = false;
386 : been_analyzed = false;
387 : }
388 :
389 : bool being_analyzed: 1;
390 : bool been_analyzed: 1;
391 :
392 : // These bits are set of this node has to know what the preceding
393 : // character was.
394 : bool follows_word_interest: 1;
395 : bool follows_newline_interest: 1;
396 : bool follows_start_interest: 1;
397 :
398 : bool at_end: 1;
399 : bool visited: 1;
400 : bool replacement_calculated: 1;
401 : };
402 :
403 :
404 : // Details of a quick mask-compare check that can look ahead in the
405 : // input stream.
406 : class QuickCheckDetails {
407 : public:
408 : QuickCheckDetails()
409 : : characters_(0),
410 : mask_(0),
411 : value_(0),
412 14082980 : cannot_match_(false) { }
413 : explicit QuickCheckDetails(int characters)
414 : : characters_(characters),
415 : mask_(0),
416 : value_(0),
417 919630 : cannot_match_(false) { }
418 : bool Rationalize(bool one_byte);
419 : // Merge in the information from another branch of an alternation.
420 : void Merge(QuickCheckDetails* other, int from_index);
421 : // Advance the current position by some amount.
422 : void Advance(int by, bool one_byte);
423 : void Clear();
424 : bool cannot_match() { return cannot_match_; }
425 463 : void set_cannot_match() { cannot_match_ = true; }
426 : struct Position {
427 12002088 : Position() : mask(0), value(0), determines_perfectly(false) { }
428 : uc16 mask;
429 : uc16 value;
430 : bool determines_perfectly;
431 : };
432 : int characters() { return characters_; }
433 603622 : void set_characters(int characters) { characters_ = characters; }
434 : Position* positions(int index) {
435 : DCHECK_LE(0, index);
436 : DCHECK_GT(characters_, index);
437 1793098 : return positions_ + index;
438 : }
439 : uint32_t mask() { return mask_; }
440 : uint32_t value() { return value_; }
441 :
442 : private:
443 : // How many characters do we have quick check information from. This is
444 : // the same for all branches of a choice node.
445 : int characters_;
446 : Position positions_[4];
447 : // These values are the condensate of the above array after Rationalize().
448 : uint32_t mask_;
449 : uint32_t value_;
450 : // If set to true, there is no way this quick check can match at all.
451 : // E.g., if it requires to be at the start of the input, and isn't.
452 : bool cannot_match_;
453 : };
454 :
455 :
456 : extern int kUninitializedRegExpNodePlaceHolder;
457 :
458 :
459 0 : class RegExpNode: public ZoneObject {
460 : public:
461 : explicit RegExpNode(Zone* zone)
462 : : replacement_(nullptr),
463 : on_work_list_(false),
464 : trace_count_(0),
465 8763460 : zone_(zone) {
466 4381730 : bm_info_[0] = bm_info_[1] = nullptr;
467 : }
468 : virtual ~RegExpNode();
469 : virtual void Accept(NodeVisitor* visitor) = 0;
470 : // Generates a goto to this node or actually generates the code at this point.
471 : virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
472 : // How many characters must this node consume at a minimum in order to
473 : // succeed. If we have found at least 'still_to_find' characters that
474 : // must be consumed there is no need to ask any following nodes whether
475 : // they are sure to eat any more characters. The not_at_start argument is
476 : // used to indicate that we know we are not at the start of the input. In
477 : // this case anchored branches will always fail and can be ignored when
478 : // determining how many characters are consumed on success.
479 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
480 : // Emits some quick code that checks whether the preloaded characters match.
481 : // Falls through on certain failure, jumps to the label on possible success.
482 : // If the node cannot make a quick check it does nothing and returns false.
483 : bool EmitQuickCheck(RegExpCompiler* compiler,
484 : Trace* bounds_check_trace,
485 : Trace* trace,
486 : bool preload_has_checked_bounds,
487 : Label* on_possible_success,
488 : QuickCheckDetails* details_return,
489 : bool fall_through_on_failure);
490 : // For a given number of characters this returns a mask and a value. The
491 : // next n characters are anded with the mask and compared with the value.
492 : // A comparison failure indicates the node cannot match the next n characters.
493 : // A comparison success indicates the node may match.
494 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
495 : RegExpCompiler* compiler,
496 : int characters_filled_in,
497 : bool not_at_start) = 0;
498 : static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
499 87188 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
500 : // Only returns the successor for a text node of length 1 that matches any
501 : // character and that has no guards on it.
502 78573 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
503 : RegExpCompiler* compiler) {
504 78573 : return nullptr;
505 : }
506 :
507 : // Collects information on the possible code units (mod 128) that can match if
508 : // we look forward. This is used for a Boyer-Moore-like string searching
509 : // implementation. TODO(erikcorry): This should share more code with
510 : // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
511 : // the number of nodes we are willing to look at in order to create this data.
512 : static const int kRecursionBudget = 200;
513 : bool KeepRecursing(RegExpCompiler* compiler);
514 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
515 : BoyerMooreLookahead* bm, bool not_at_start) {
516 0 : UNREACHABLE();
517 : }
518 :
519 : // If we know that the input is one-byte then there are some nodes that can
520 : // never match. This method returns a node that can be substituted for
521 : // itself, or nullptr if the node can never match.
522 14563 : virtual RegExpNode* FilterOneByte(int depth) { return this; }
523 : // Helper for FilterOneByte.
524 : RegExpNode* replacement() {
525 : DCHECK(info()->replacement_calculated);
526 : return replacement_;
527 : }
528 : RegExpNode* set_replacement(RegExpNode* replacement) {
529 161233 : info()->replacement_calculated = true;
530 161233 : replacement_ = replacement;
531 : return replacement; // For convenience.
532 : }
533 :
534 : // We want to avoid recalculating the lookahead info, so we store it on the
535 : // node. Only info that is for this node is stored. We can tell that the
536 : // info is for this node when offset == 0, so the information is calculated
537 : // relative to this node.
538 : void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
539 103833 : if (offset == 0) set_bm_info(not_at_start, bm);
540 : }
541 :
542 : Label* label() { return &label_; }
543 : // If non-generic code is generated for a node (i.e. the node is not at the
544 : // start of the trace) then it cannot be reused. This variable sets a limit
545 : // on how often we allow that to happen before we insist on starting a new
546 : // trace and generating generic code for a node that can be reused by flushing
547 : // the deferred actions in the current trace and generating a goto.
548 : static const int kMaxCopiesCodeGenerated = 10;
549 :
550 : bool on_work_list() { return on_work_list_; }
551 423294 : void set_on_work_list(bool value) { on_work_list_ = value; }
552 :
553 : NodeInfo* info() { return &info_; }
554 :
555 : BoyerMooreLookahead* bm_info(bool not_at_start) {
556 82361 : return bm_info_[not_at_start ? 1 : 0];
557 : }
558 :
559 : Zone* zone() const { return zone_; }
560 :
561 : protected:
562 : enum LimitResult { DONE, CONTINUE };
563 : RegExpNode* replacement_;
564 :
565 : LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
566 :
567 : void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
568 196092 : bm_info_[not_at_start ? 1 : 0] = bm;
569 : }
570 :
571 : private:
572 : static const int kFirstCharBudget = 10;
573 : Label label_;
574 : bool on_work_list_;
575 : NodeInfo info_;
576 : // This variable keeps track of how many times code has been generated for
577 : // this node (in different traces). We don't keep track of where the
578 : // generated code is located unless the code is generated at the start of
579 : // a trace, in which case it is generic and can be reused by flushing the
580 : // deferred operations in the current trace and generating a goto.
581 : int trace_count_;
582 : BoyerMooreLookahead* bm_info_[2];
583 :
584 : Zone* zone_;
585 : };
586 :
587 :
588 0 : class SeqRegExpNode: public RegExpNode {
589 : public:
590 3267647 : explicit SeqRegExpNode(RegExpNode* on_success)
591 3267647 : : RegExpNode(on_success->zone()), on_success_(on_success) { }
592 : RegExpNode* on_success() { return on_success_; }
593 : void set_on_success(RegExpNode* node) { on_success_ = node; }
594 : RegExpNode* FilterOneByte(int depth) override;
595 0 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
596 : BoyerMooreLookahead* bm, bool not_at_start) override {
597 0 : on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
598 0 : if (offset == 0) set_bm_info(not_at_start, bm);
599 0 : }
600 :
601 : protected:
602 : RegExpNode* FilterSuccessor(int depth);
603 :
604 : private:
605 : RegExpNode* on_success_;
606 : };
607 :
608 :
609 0 : class ActionNode: public SeqRegExpNode {
610 : public:
611 : enum ActionType {
612 : SET_REGISTER,
613 : INCREMENT_REGISTER,
614 : STORE_POSITION,
615 : BEGIN_SUBMATCH,
616 : POSITIVE_SUBMATCH_SUCCESS,
617 : EMPTY_MATCH_CHECK,
618 : CLEAR_CAPTURES
619 : };
620 : static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
621 : static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
622 : static ActionNode* StorePosition(int reg,
623 : bool is_capture,
624 : RegExpNode* on_success);
625 : static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
626 : static ActionNode* BeginSubmatch(int stack_pointer_reg,
627 : int position_reg,
628 : RegExpNode* on_success);
629 : static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
630 : int restore_reg,
631 : int clear_capture_count,
632 : int clear_capture_from,
633 : RegExpNode* on_success);
634 : static ActionNode* EmptyMatchCheck(int start_register,
635 : int repetition_register,
636 : int repetition_limit,
637 : RegExpNode* on_success);
638 : void Accept(NodeVisitor* visitor) override;
639 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
640 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
641 67287 : void GetQuickCheckDetails(QuickCheckDetails* details,
642 : RegExpCompiler* compiler, int filled_in,
643 : bool not_at_start) override {
644 67287 : return on_success()->GetQuickCheckDetails(
645 67287 : details, compiler, filled_in, not_at_start);
646 : }
647 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
648 : BoyerMooreLookahead* bm, bool not_at_start) override;
649 : ActionType action_type() { return action_type_; }
650 : // TODO(erikcorry): We should allow some action nodes in greedy loops.
651 114767 : int GreedyLoopTextLength() override {
652 114767 : return kNodeIsTooComplexForGreedyLoops;
653 : }
654 :
655 : private:
656 : union {
657 : struct {
658 : int reg;
659 : int value;
660 : } u_store_register;
661 : struct {
662 : int reg;
663 : } u_increment_register;
664 : struct {
665 : int reg;
666 : bool is_capture;
667 : } u_position_register;
668 : struct {
669 : int stack_pointer_register;
670 : int current_position_register;
671 : int clear_register_count;
672 : int clear_register_from;
673 : } u_submatch;
674 : struct {
675 : int start_register;
676 : int repetition_register;
677 : int repetition_limit;
678 : } u_empty_match_check;
679 : struct {
680 : int range_from;
681 : int range_to;
682 : } u_clear_captures;
683 : } data_;
684 : ActionNode(ActionType action_type, RegExpNode* on_success)
685 : : SeqRegExpNode(on_success),
686 2041830 : action_type_(action_type) { }
687 : ActionType action_type_;
688 : friend class DotPrinter;
689 : };
690 :
691 :
692 0 : class TextNode: public SeqRegExpNode {
693 : public:
694 : TextNode(ZoneList<TextElement>* elms, bool read_backward,
695 : RegExpNode* on_success)
696 1044344 : : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
697 173562 : TextNode(RegExpCharacterClass* that, bool read_backward,
698 : RegExpNode* on_success)
699 : : SeqRegExpNode(on_success),
700 347124 : elms_(new (zone()) ZoneList<TextElement>(1, zone())),
701 347124 : read_backward_(read_backward) {
702 173562 : elms_->Add(TextElement::CharClass(that), zone());
703 173562 : }
704 : // Create TextNode for a single character class for the given ranges.
705 : static TextNode* CreateForCharacterRanges(Zone* zone,
706 : ZoneList<CharacterRange>* ranges,
707 : bool read_backward,
708 : RegExpNode* on_success,
709 : JSRegExp::Flags flags);
710 : // Create TextNode for a surrogate pair with a range given for the
711 : // lead and the trail surrogate each.
712 : static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
713 : CharacterRange trail,
714 : bool read_backward,
715 : RegExpNode* on_success,
716 : JSRegExp::Flags flags);
717 : void Accept(NodeVisitor* visitor) override;
718 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
719 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
720 : void GetQuickCheckDetails(QuickCheckDetails* details,
721 : RegExpCompiler* compiler, int characters_filled_in,
722 : bool not_at_start) override;
723 : ZoneList<TextElement>* elements() { return elms_; }
724 : bool read_backward() { return read_backward_; }
725 : void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
726 : int GreedyLoopTextLength() override;
727 : RegExpNode* GetSuccessorOfOmnivorousTextNode(
728 : RegExpCompiler* compiler) override;
729 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
730 : BoyerMooreLookahead* bm, bool not_at_start) override;
731 : void CalculateOffsets();
732 : RegExpNode* FilterOneByte(int depth) override;
733 :
734 : private:
735 : enum TextEmitPassType {
736 : NON_LATIN1_MATCH, // Check for characters that can't match.
737 : SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
738 : NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
739 : CASE_CHARACTER_MATCH, // Case-independent single character check.
740 : CHARACTER_CLASS_MATCH // Character class.
741 : };
742 : static bool SkipPass(TextEmitPassType pass, bool ignore_case);
743 : static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
744 : static const int kLastPass = CHARACTER_CLASS_MATCH;
745 : void TextEmitPass(RegExpCompiler* compiler,
746 : TextEmitPassType pass,
747 : bool preloaded,
748 : Trace* trace,
749 : bool first_element_checked,
750 : int* checked_up_to);
751 : int Length();
752 : ZoneList<TextElement>* elms_;
753 : bool read_backward_;
754 : };
755 :
756 :
757 0 : class AssertionNode: public SeqRegExpNode {
758 : public:
759 : enum AssertionType {
760 : AT_END,
761 : AT_START,
762 : AT_BOUNDARY,
763 : AT_NON_BOUNDARY,
764 : AFTER_NEWLINE
765 : };
766 2052 : static AssertionNode* AtEnd(RegExpNode* on_success) {
767 2052 : return new(on_success->zone()) AssertionNode(AT_END, on_success);
768 : }
769 3065 : static AssertionNode* AtStart(RegExpNode* on_success) {
770 3065 : return new(on_success->zone()) AssertionNode(AT_START, on_success);
771 : }
772 136 : static AssertionNode* AtBoundary(RegExpNode* on_success) {
773 136 : return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
774 : }
775 114 : static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
776 114 : return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
777 : }
778 129 : static AssertionNode* AfterNewline(RegExpNode* on_success) {
779 129 : return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
780 : }
781 : void Accept(NodeVisitor* visitor) override;
782 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
783 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
784 : void GetQuickCheckDetails(QuickCheckDetails* details,
785 : RegExpCompiler* compiler, int filled_in,
786 : bool not_at_start) override;
787 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
788 : BoyerMooreLookahead* bm, bool not_at_start) override;
789 : AssertionType assertion_type() { return assertion_type_; }
790 :
791 : private:
792 : void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
793 : enum IfPrevious { kIsNonWord, kIsWord };
794 : void BacktrackIfPrevious(RegExpCompiler* compiler,
795 : Trace* trace,
796 : IfPrevious backtrack_if_previous);
797 : AssertionNode(AssertionType t, RegExpNode* on_success)
798 5496 : : SeqRegExpNode(on_success), assertion_type_(t) { }
799 : AssertionType assertion_type_;
800 : };
801 :
802 :
803 0 : class BackReferenceNode: public SeqRegExpNode {
804 : public:
805 : BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
806 : bool read_backward, RegExpNode* on_success)
807 : : SeqRegExpNode(on_success),
808 : start_reg_(start_reg),
809 : end_reg_(end_reg),
810 : flags_(flags),
811 2415 : read_backward_(read_backward) {}
812 : void Accept(NodeVisitor* visitor) override;
813 : int start_register() { return start_reg_; }
814 : int end_register() { return end_reg_; }
815 : bool read_backward() { return read_backward_; }
816 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
817 : int EatsAtLeast(int still_to_find, int recursion_depth,
818 : bool not_at_start) override;
819 581 : void GetQuickCheckDetails(QuickCheckDetails* details,
820 : RegExpCompiler* compiler, int characters_filled_in,
821 : bool not_at_start) override {
822 581 : return;
823 : }
824 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
825 : BoyerMooreLookahead* bm, bool not_at_start) override;
826 :
827 : private:
828 : int start_reg_;
829 : int end_reg_;
830 : JSRegExp::Flags flags_;
831 : bool read_backward_;
832 : };
833 :
834 :
835 0 : class EndNode: public RegExpNode {
836 : public:
837 : enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
838 88812 : EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
839 : void Accept(NodeVisitor* visitor) override;
840 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
841 115356 : int EatsAtLeast(int still_to_find, int recursion_depth,
842 : bool not_at_start) override {
843 115356 : return 0;
844 : }
845 0 : void GetQuickCheckDetails(QuickCheckDetails* details,
846 : RegExpCompiler* compiler, int characters_filled_in,
847 : bool not_at_start) override {
848 : // Returning 0 from EatsAtLeast should ensure we never get here.
849 0 : UNREACHABLE();
850 : }
851 0 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
852 : BoyerMooreLookahead* bm, bool not_at_start) override {
853 : // Returning 0 from EatsAtLeast should ensure we never get here.
854 0 : UNREACHABLE();
855 : }
856 :
857 : private:
858 : Action action_;
859 : };
860 :
861 :
862 0 : class NegativeSubmatchSuccess: public EndNode {
863 : public:
864 : NegativeSubmatchSuccess(int stack_pointer_reg,
865 : int position_reg,
866 : int clear_capture_count,
867 : int clear_capture_start,
868 : Zone* zone)
869 : : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
870 : stack_pointer_register_(stack_pointer_reg),
871 : current_position_register_(position_reg),
872 : clear_capture_count_(clear_capture_count),
873 2812 : clear_capture_start_(clear_capture_start) { }
874 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
875 :
876 : private:
877 : int stack_pointer_register_;
878 : int current_position_register_;
879 : int clear_capture_count_;
880 : int clear_capture_start_;
881 : };
882 :
883 :
884 : class Guard: public ZoneObject {
885 : public:
886 : enum Relation { LT, GEQ };
887 : Guard(int reg, Relation op, int value)
888 : : reg_(reg),
889 : op_(op),
890 903937 : value_(value) { }
891 : int reg() { return reg_; }
892 : Relation op() { return op_; }
893 : int value() { return value_; }
894 :
895 : private:
896 : int reg_;
897 : Relation op_;
898 : int value_;
899 : };
900 :
901 :
902 : class GuardedAlternative {
903 : public:
904 : explicit GuardedAlternative(RegExpNode* node)
905 2001612 : : node_(node), guards_(nullptr) {}
906 : void AddGuard(Guard* guard, Zone* zone);
907 : RegExpNode* node() { return node_; }
908 58737 : void set_node(RegExpNode* node) { node_ = node; }
909 : ZoneList<Guard*>* guards() { return guards_; }
910 :
911 : private:
912 : RegExpNode* node_;
913 : ZoneList<Guard*>* guards_;
914 : };
915 :
916 :
917 : class AlternativeGeneration;
918 :
919 :
920 0 : class ChoiceNode: public RegExpNode {
921 : public:
922 1025271 : explicit ChoiceNode(int expected_size, Zone* zone)
923 : : RegExpNode(zone),
924 : alternatives_(new (zone)
925 1025271 : ZoneList<GuardedAlternative>(expected_size, zone)),
926 : table_(nullptr),
927 : not_at_start_(false),
928 2050542 : being_calculated_(false) {}
929 : void Accept(NodeVisitor* visitor) override;
930 2131943 : void AddAlternative(GuardedAlternative node) {
931 4251623 : alternatives()->Add(node, zone());
932 : }
933 : ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
934 : DispatchTable* GetTable(bool ignore_case);
935 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
936 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
937 : int EatsAtLeastHelper(int still_to_find,
938 : int budget,
939 : RegExpNode* ignore_this_node,
940 : bool not_at_start);
941 : void GetQuickCheckDetails(QuickCheckDetails* details,
942 : RegExpCompiler* compiler, int characters_filled_in,
943 : bool not_at_start) override;
944 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
945 : BoyerMooreLookahead* bm, bool not_at_start) override;
946 :
947 : bool being_calculated() { return being_calculated_; }
948 : bool not_at_start() { return not_at_start_; }
949 4134 : void set_not_at_start() { not_at_start_ = true; }
950 0 : void set_being_calculated(bool b) { being_calculated_ = b; }
951 471125 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
952 471125 : return true;
953 : }
954 : RegExpNode* FilterOneByte(int depth) override;
955 0 : virtual bool read_backward() { return false; }
956 :
957 : protected:
958 : int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
959 : ZoneList<GuardedAlternative>* alternatives_;
960 :
961 : private:
962 : friend class DispatchTableConstructor;
963 : friend class Analysis;
964 : void GenerateGuard(RegExpMacroAssembler* macro_assembler,
965 : Guard* guard,
966 : Trace* trace);
967 : int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
968 : void EmitOutOfLineContinuation(RegExpCompiler* compiler,
969 : Trace* trace,
970 : GuardedAlternative alternative,
971 : AlternativeGeneration* alt_gen,
972 : int preload_characters,
973 : bool next_expects_preload);
974 : void SetUpPreLoad(RegExpCompiler* compiler,
975 : Trace* current_trace,
976 : PreloadState* preloads);
977 : void AssertGuardsMentionRegisters(Trace* trace);
978 : int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
979 : Trace* EmitGreedyLoop(RegExpCompiler* compiler,
980 : Trace* trace,
981 : AlternativeGenerationList* alt_gens,
982 : PreloadState* preloads,
983 : GreedyLoopState* greedy_loop_state,
984 : int text_length);
985 : void EmitChoices(RegExpCompiler* compiler,
986 : AlternativeGenerationList* alt_gens,
987 : int first_choice,
988 : Trace* trace,
989 : PreloadState* preloads);
990 : DispatchTable* table_;
991 : // If true, this node is never checked at the start of the input.
992 : // Allows a new trace to start with at_start() set to false.
993 : bool not_at_start_;
994 : bool being_calculated_;
995 : };
996 :
997 :
998 0 : class NegativeLookaroundChoiceNode : public ChoiceNode {
999 : public:
1000 2812 : explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
1001 : GuardedAlternative then_do_this,
1002 : Zone* zone)
1003 2812 : : ChoiceNode(2, zone) {
1004 : AddAlternative(this_must_fail);
1005 : AddAlternative(then_do_this);
1006 2812 : }
1007 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1008 : void GetQuickCheckDetails(QuickCheckDetails* details,
1009 : RegExpCompiler* compiler, int characters_filled_in,
1010 : bool not_at_start) override;
1011 1105 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
1012 : BoyerMooreLookahead* bm, bool not_at_start) override {
1013 2210 : alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
1014 1105 : not_at_start);
1015 1105 : if (offset == 0) set_bm_info(not_at_start, bm);
1016 1105 : }
1017 : // For a negative lookahead we don't emit the quick check for the
1018 : // alternative that is expected to fail. This is because quick check code
1019 : // starts by loading enough characters for the alternative that takes fewest
1020 : // characters, but on a negative lookahead the negative branch did not take
1021 : // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1022 5604 : bool try_to_emit_quick_check_for_alternative(bool is_first) override {
1023 5604 : return !is_first;
1024 : }
1025 : RegExpNode* FilterOneByte(int depth) override;
1026 : };
1027 :
1028 :
1029 0 : class LoopChoiceNode: public ChoiceNode {
1030 : public:
1031 : LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1032 : : ChoiceNode(2, zone),
1033 : loop_node_(nullptr),
1034 : continue_node_(nullptr),
1035 : body_can_be_zero_length_(body_can_be_zero_length),
1036 999400 : read_backward_(read_backward) {}
1037 : void AddLoopAlternative(GuardedAlternative alt);
1038 : void AddContinueAlternative(GuardedAlternative alt);
1039 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
1040 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1041 : void GetQuickCheckDetails(QuickCheckDetails* details,
1042 : RegExpCompiler* compiler, int characters_filled_in,
1043 : bool not_at_start) override;
1044 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
1045 : BoyerMooreLookahead* bm, bool not_at_start) override;
1046 : RegExpNode* loop_node() { return loop_node_; }
1047 : RegExpNode* continue_node() { return continue_node_; }
1048 : bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1049 23390 : bool read_backward() override { return read_backward_; }
1050 : void Accept(NodeVisitor* visitor) override;
1051 : RegExpNode* FilterOneByte(int depth) override;
1052 :
1053 : private:
1054 : // AddAlternative is made private for loop nodes because alternatives
1055 : // should not be added freely, we need to keep track of which node
1056 : // goes back to the node itself.
1057 : void AddAlternative(GuardedAlternative node) {
1058 : ChoiceNode::AddAlternative(node);
1059 : }
1060 :
1061 : RegExpNode* loop_node_;
1062 : RegExpNode* continue_node_;
1063 : bool body_can_be_zero_length_;
1064 : bool read_backward_;
1065 : };
1066 :
1067 :
1068 : // Improve the speed that we scan for an initial point where a non-anchored
1069 : // regexp can match by using a Boyer-Moore-like table. This is done by
1070 : // identifying non-greedy non-capturing loops in the nodes that eat any
1071 : // character one at a time. For example in the middle of the regexp
1072 : // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1073 : // inserted at the start of any non-anchored regexp.
1074 : //
1075 : // When we have found such a loop we look ahead in the nodes to find the set of
1076 : // characters that can come at given distances. For example for the regexp
1077 : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1078 : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1079 : // the lookahead info where the set of characters is reasonably constrained. In
1080 : // our example this is from index 1 to 2 (0 is not constrained). We can now
1081 : // look 3 characters ahead and if we don't find one of [f, o] (the union of
1082 : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1083 : //
1084 : // For Unicode input strings we do the same, but modulo 128.
1085 : //
1086 : // We also look at the first string fed to the regexp and use that to get a hint
1087 : // of the character frequencies in the inputs. This affects the assessment of
1088 : // whether the set of characters is 'reasonably constrained'.
1089 : //
1090 : // We also have another lookahead mechanism (called quick check in the code),
1091 : // which uses a wide load of multiple characters followed by a mask and compare
1092 : // to determine whether a match is possible at this point.
1093 : enum ContainedInLattice {
1094 : kNotYet = 0,
1095 : kLatticeIn = 1,
1096 : kLatticeOut = 2,
1097 : kLatticeUnknown = 3 // Can also mean both in and out.
1098 : };
1099 :
1100 :
1101 : inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1102 838058 : return static_cast<ContainedInLattice>(a | b);
1103 : }
1104 :
1105 :
1106 : ContainedInLattice AddRange(ContainedInLattice a,
1107 : const int* ranges,
1108 : int ranges_size,
1109 : Interval new_range);
1110 :
1111 :
1112 : class BoyerMoorePositionInfo : public ZoneObject {
1113 : public:
1114 99240 : explicit BoyerMoorePositionInfo(Zone* zone)
1115 99240 : : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1116 : map_count_(0),
1117 : w_(kNotYet),
1118 : s_(kNotYet),
1119 : d_(kNotYet),
1120 99240 : surrogate_(kNotYet) {
1121 12801960 : for (int i = 0; i < kMapSize; i++) {
1122 12702720 : map_->Add(false, zone);
1123 : }
1124 99240 : }
1125 :
1126 : bool& at(int i) { return map_->at(i); }
1127 :
1128 : static const int kMapSize = 128;
1129 : static const int kMask = kMapSize - 1;
1130 :
1131 : int map_count() const { return map_count_; }
1132 :
1133 : void Set(int character);
1134 : void SetInterval(const Interval& interval);
1135 : void SetAll();
1136 : bool is_non_word() { return w_ == kLatticeOut; }
1137 : bool is_word() { return w_ == kLatticeIn; }
1138 :
1139 : private:
1140 : ZoneList<bool>* map_;
1141 : int map_count_; // Number of set bits in the map.
1142 : ContainedInLattice w_; // The \w character class.
1143 : ContainedInLattice s_; // The \s character class.
1144 : ContainedInLattice d_; // The \d character class.
1145 : ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1146 : };
1147 :
1148 :
1149 : class BoyerMooreLookahead : public ZoneObject {
1150 : public:
1151 : BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1152 :
1153 : int length() { return length_; }
1154 : int max_char() { return max_char_; }
1155 : RegExpCompiler* compiler() { return compiler_; }
1156 :
1157 : int Count(int map_number) {
1158 540915 : return bitmaps_->at(map_number)->map_count();
1159 : }
1160 :
1161 150 : BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1162 :
1163 79612 : void Set(int map_number, int character) {
1164 159224 : if (character > max_char_) return;
1165 159224 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1166 : info->Set(character);
1167 : }
1168 :
1169 323436 : void SetInterval(int map_number, const Interval& interval) {
1170 323436 : if (interval.from() > max_char_) return;
1171 323436 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1172 161718 : if (interval.to() > max_char_) {
1173 0 : info->SetInterval(Interval(interval.from(), max_char_));
1174 : } else {
1175 161718 : info->SetInterval(interval);
1176 : }
1177 : }
1178 :
1179 5476 : void SetAll(int map_number) {
1180 10952 : bitmaps_->at(map_number)->SetAll();
1181 5476 : }
1182 :
1183 1084 : void SetRest(int from_map) {
1184 1084 : for (int i = from_map; i < length_; i++) SetAll(i);
1185 : }
1186 : void EmitSkipInstructions(RegExpMacroAssembler* masm);
1187 :
1188 : private:
1189 : // This is the value obtained by EatsAtLeast. If we do not have at least this
1190 : // many characters left in the sample string then the match is bound to fail.
1191 : // Therefore it is OK to read a character this far ahead of the current match
1192 : // point.
1193 : int length_;
1194 : RegExpCompiler* compiler_;
1195 : // 0xff for Latin1, 0xffff for UTF-16.
1196 : int max_char_;
1197 : ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1198 :
1199 : int GetSkipTable(int min_lookahead,
1200 : int max_lookahead,
1201 : Handle<ByteArray> boolean_skip_table);
1202 : bool FindWorthwhileInterval(int* from, int* to);
1203 : int FindBestInterval(
1204 : int max_number_of_chars, int old_biggest_points, int* from, int* to);
1205 : };
1206 :
1207 :
1208 : // There are many ways to generate code for a node. This class encapsulates
1209 : // the current way we should be generating. In other words it encapsulates
1210 : // the current state of the code generator. The effect of this is that we
1211 : // generate code for paths that the matcher can take through the regular
1212 : // expression. A given node in the regexp can be code-generated several times
1213 : // as it can be part of several traces. For example for the regexp:
1214 : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1215 : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1216 : // to match foo is generated only once (the traces have a common prefix). The
1217 : // code to store the capture is deferred and generated (twice) after the places
1218 : // where baz has been matched.
1219 : class Trace {
1220 : public:
1221 : // A value for a property that is either known to be true, know to be false,
1222 : // or not known.
1223 : enum TriBool {
1224 : UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1225 : };
1226 :
1227 : class DeferredAction {
1228 : public:
1229 : DeferredAction(ActionNode::ActionType action_type, int reg)
1230 250930 : : action_type_(action_type), reg_(reg), next_(nullptr) {}
1231 : DeferredAction* next() { return next_; }
1232 : bool Mentions(int reg);
1233 : int reg() { return reg_; }
1234 : ActionNode::ActionType action_type() { return action_type_; }
1235 : private:
1236 : ActionNode::ActionType action_type_;
1237 : int reg_;
1238 : DeferredAction* next_;
1239 : friend class Trace;
1240 : };
1241 :
1242 : class DeferredCapture : public DeferredAction {
1243 : public:
1244 241535 : DeferredCapture(int reg, bool is_capture, Trace* trace)
1245 : : DeferredAction(ActionNode::STORE_POSITION, reg),
1246 : cp_offset_(trace->cp_offset()),
1247 241535 : is_capture_(is_capture) { }
1248 : int cp_offset() { return cp_offset_; }
1249 : bool is_capture() { return is_capture_; }
1250 : private:
1251 : int cp_offset_;
1252 : bool is_capture_;
1253 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1254 : };
1255 :
1256 : class DeferredSetRegister : public DeferredAction {
1257 : public:
1258 : DeferredSetRegister(int reg, int value)
1259 : : DeferredAction(ActionNode::SET_REGISTER, reg),
1260 3465 : value_(value) { }
1261 : int value() { return value_; }
1262 : private:
1263 : int value_;
1264 : };
1265 :
1266 : class DeferredClearCaptures : public DeferredAction {
1267 : public:
1268 : explicit DeferredClearCaptures(Interval range)
1269 : : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1270 2186 : range_(range) { }
1271 : Interval range() { return range_; }
1272 : private:
1273 : Interval range_;
1274 : };
1275 :
1276 : class DeferredIncrementRegister : public DeferredAction {
1277 : public:
1278 : explicit DeferredIncrementRegister(int reg)
1279 : : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1280 : };
1281 :
1282 638169 : Trace()
1283 : : cp_offset_(0),
1284 : actions_(nullptr),
1285 : backtrack_(nullptr),
1286 : stop_node_(nullptr),
1287 : loop_label_(nullptr),
1288 : characters_preloaded_(0),
1289 : bound_checked_up_to_(0),
1290 : flush_budget_(100),
1291 1276338 : at_start_(UNKNOWN) {}
1292 :
1293 : // End the trace. This involves flushing the deferred actions in the trace
1294 : // and pushing a backtrack location onto the backtrack stack. Once this is
1295 : // done we can start a new trace or go to one that has already been
1296 : // generated.
1297 : void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1298 : int cp_offset() { return cp_offset_; }
1299 : DeferredAction* actions() { return actions_; }
1300 : // A trivial trace is one that has no deferred actions or other state that
1301 : // affects the assumptions used when generating code. There is no recorded
1302 : // backtrack location in a trivial trace, so with a trivial trace we will
1303 : // generate code that, on a failure to match, gets the backtrack location
1304 : // from the backtrack stack rather than using a direct jump instruction. We
1305 : // always start code generation with a trivial trace and non-trivial traces
1306 : // are created as we emit code for nodes or add to the list of deferred
1307 : // actions in the trace. The location of the code generated for a node using
1308 : // a trivial trace is recorded in a label in the node so that gotos can be
1309 : // generated to that code.
1310 2207779 : bool is_trivial() {
1311 3495158 : return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1312 1864174 : characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1313 4037815 : quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1314 : }
1315 : TriBool at_start() { return at_start_; }
1316 531567 : void set_at_start(TriBool at_start) { at_start_ = at_start; }
1317 : Label* backtrack() { return backtrack_; }
1318 : Label* loop_label() { return loop_label_; }
1319 : RegExpNode* stop_node() { return stop_node_; }
1320 : int characters_preloaded() { return characters_preloaded_; }
1321 : int bound_checked_up_to() { return bound_checked_up_to_; }
1322 : int flush_budget() { return flush_budget_; }
1323 : QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1324 : bool mentions_reg(int reg);
1325 : // Returns true if a deferred position store exists to the specified
1326 : // register and stores the offset in the out-parameter. Otherwise
1327 : // returns false.
1328 : bool GetStoredPosition(int reg, int* cp_offset);
1329 : // These set methods and AdvanceCurrentPositionInTrace should be used only on
1330 : // new traces - the intention is that traces are immutable after creation.
1331 : void add_action(DeferredAction* new_action) {
1332 : DCHECK(new_action->next_ == nullptr);
1333 250930 : new_action->next_ = actions_;
1334 250930 : actions_ = new_action;
1335 : }
1336 806187 : void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1337 11695 : void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1338 11695 : void set_loop_label(Label* label) { loop_label_ = label; }
1339 828444 : void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1340 432746 : void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1341 192638 : void set_flush_budget(int to) { flush_budget_ = to; }
1342 : void set_quick_check_performed(QuickCheckDetails* d) {
1343 224822 : quick_check_performed_ = *d;
1344 : }
1345 : void InvalidateCurrentCharacter();
1346 : void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1347 :
1348 : private:
1349 : int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1350 : void PerformDeferredActions(RegExpMacroAssembler* macro,
1351 : int max_register,
1352 : const OutSet& affected_registers,
1353 : OutSet* registers_to_pop,
1354 : OutSet* registers_to_clear,
1355 : Zone* zone);
1356 : void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1357 : int max_register,
1358 : const OutSet& registers_to_pop,
1359 : const OutSet& registers_to_clear);
1360 : int cp_offset_;
1361 : DeferredAction* actions_;
1362 : Label* backtrack_;
1363 : RegExpNode* stop_node_;
1364 : Label* loop_label_;
1365 : int characters_preloaded_;
1366 : int bound_checked_up_to_;
1367 : QuickCheckDetails quick_check_performed_;
1368 : int flush_budget_;
1369 : TriBool at_start_;
1370 : };
1371 :
1372 :
1373 : class GreedyLoopState {
1374 : public:
1375 : explicit GreedyLoopState(bool not_at_start);
1376 :
1377 : Label* label() { return &label_; }
1378 : Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1379 :
1380 : private:
1381 : Label label_;
1382 : Trace counter_backtrack_trace_;
1383 : };
1384 :
1385 :
1386 : struct PreloadState {
1387 : static const int kEatsAtLeastNotYetInitialized = -1;
1388 : bool preload_is_current_;
1389 : bool preload_has_checked_bounds_;
1390 : int preload_characters_;
1391 : int eats_at_least_;
1392 : void init() {
1393 213650 : eats_at_least_ = kEatsAtLeastNotYetInitialized;
1394 : }
1395 : };
1396 :
1397 :
1398 85760 : class NodeVisitor {
1399 : public:
1400 85760 : virtual ~NodeVisitor() = default;
1401 : #define DECLARE_VISIT(Type) \
1402 : virtual void Visit##Type(Type##Node* that) = 0;
1403 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1404 : #undef DECLARE_VISIT
1405 0 : virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1406 : };
1407 :
1408 :
1409 : // Node visitor used to add the start set of the alternatives to the
1410 : // dispatch table of a choice node.
1411 55 : class DispatchTableConstructor: public NodeVisitor {
1412 : public:
1413 : DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1414 : Zone* zone)
1415 : : table_(table),
1416 : choice_index_(-1),
1417 : ignore_case_(ignore_case),
1418 55 : zone_(zone) { }
1419 :
1420 : void BuildTable(ChoiceNode* node);
1421 :
1422 580 : void AddRange(CharacterRange range) {
1423 1160 : table()->AddRange(range, choice_index_, zone_);
1424 : }
1425 :
1426 : void AddInverse(ZoneList<CharacterRange>* ranges);
1427 :
1428 : #define DECLARE_VISIT(Type) \
1429 : virtual void Visit##Type(Type##Node* that);
1430 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1431 : #undef DECLARE_VISIT
1432 :
1433 : DispatchTable* table() { return table_; }
1434 55 : void set_choice_index(int value) { choice_index_ = value; }
1435 :
1436 : protected:
1437 : DispatchTable* table_;
1438 : int choice_index_;
1439 : bool ignore_case_;
1440 : Zone* zone_;
1441 : };
1442 :
1443 :
1444 : // Assertion propagation moves information about assertions such as
1445 : // \b to the affected nodes. For instance, in /.\b./ information must
1446 : // be propagated to the first '.' that whatever follows needs to know
1447 : // if it matched a word or a non-word, and to the second '.' that it
1448 : // has to check if it succeeds a word or non-word. In this case the
1449 : // result will be something like:
1450 : //
1451 : // +-------+ +------------+
1452 : // | . | | . |
1453 : // +-------+ ---> +------------+
1454 : // | word? | | check word |
1455 : // +-------+ +------------+
1456 85705 : class Analysis: public NodeVisitor {
1457 : public:
1458 : Analysis(Isolate* isolate, bool is_one_byte)
1459 85705 : : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
1460 : void EnsureAnalyzed(RegExpNode* node);
1461 :
1462 : #define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
1463 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1464 : #undef DECLARE_VISIT
1465 : void VisitLoopChoice(LoopChoiceNode* that) override;
1466 :
1467 : bool has_failed() { return error_message_ != nullptr; }
1468 : const char* error_message() {
1469 : DCHECK(error_message_ != nullptr);
1470 : return error_message_;
1471 : }
1472 : void fail(const char* error_message) {
1473 427 : error_message_ = error_message;
1474 : }
1475 :
1476 : Isolate* isolate() const { return isolate_; }
1477 :
1478 : private:
1479 : Isolate* isolate_;
1480 : bool is_one_byte_;
1481 : const char* error_message_;
1482 :
1483 : DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1484 : };
1485 :
1486 :
1487 : struct RegExpCompileData {
1488 : RegExpCompileData()
1489 : : tree(nullptr),
1490 : node(nullptr),
1491 : simple(true),
1492 : contains_anchor(false),
1493 761480 : capture_count(0) {}
1494 : RegExpTree* tree;
1495 : RegExpNode* node;
1496 : bool simple;
1497 : bool contains_anchor;
1498 : Handle<FixedArray> capture_name_map;
1499 : Handle<String> error;
1500 : int capture_count;
1501 : };
1502 :
1503 :
1504 : class RegExpEngine: public AllStatic {
1505 : public:
1506 : struct CompilationResult {
1507 : inline CompilationResult(Isolate* isolate, const char* error_message);
1508 : CompilationResult(Object code, int registers)
1509 85278 : : code(code), num_registers(registers) {}
1510 : const char* const error_message = nullptr;
1511 : Object const code;
1512 : int const num_registers = 0;
1513 : };
1514 :
1515 : static CompilationResult Compile(Isolate* isolate, Zone* zone,
1516 : RegExpCompileData* input,
1517 : JSRegExp::Flags flags,
1518 : Handle<String> pattern,
1519 : Handle<String> sample_subject,
1520 : bool is_one_byte);
1521 :
1522 : static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
1523 :
1524 : static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1525 : };
1526 :
1527 :
1528 : class RegExpResultsCache : public AllStatic {
1529 : public:
1530 : enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1531 :
1532 : // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1533 : // On success, the returned result is guaranteed to be a COW-array.
1534 : static Object Lookup(Heap* heap, String key_string, Object key_pattern,
1535 : FixedArray* last_match_out, ResultsCacheType type);
1536 : // Attempt to add value_array to the cache specified by type. On success,
1537 : // value_array is turned into a COW-array.
1538 : static void Enter(Isolate* isolate, Handle<String> key_string,
1539 : Handle<Object> key_pattern, Handle<FixedArray> value_array,
1540 : Handle<FixedArray> last_match_cache, ResultsCacheType type);
1541 : static void Clear(FixedArray cache);
1542 : static const int kRegExpResultsCacheSize = 0x100;
1543 :
1544 : private:
1545 : static const int kArrayEntriesPerCacheEntry = 4;
1546 : static const int kStringOffset = 0;
1547 : static const int kPatternOffset = 1;
1548 : static const int kArrayOffset = 2;
1549 : static const int kLastMatchOffset = 3;
1550 : };
1551 :
1552 : } // namespace internal
1553 : } // namespace v8
1554 :
1555 : #endif // V8_REGEXP_JSREGEXP_H_
|