Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_JSREGEXP_H_
6 : #define V8_REGEXP_JSREGEXP_H_
7 :
8 : #include "src/allocation.h"
9 : #include "src/isolate.h"
10 : #include "src/objects/js-regexp.h"
11 : #include "src/regexp/regexp-ast.h"
12 : #include "src/regexp/regexp-macro-assembler.h"
13 : #include "src/zone/zone-splay-tree.h"
14 :
15 : namespace v8 {
16 : namespace internal {
17 :
18 : class NodeVisitor;
19 : class RegExpCompiler;
20 : class RegExpMacroAssembler;
21 : class RegExpNode;
22 : class RegExpTree;
23 : class BoyerMooreLookahead;
24 :
25 : inline bool IgnoreCase(JSRegExp::Flags flags) {
26 : return (flags & JSRegExp::kIgnoreCase) != 0;
27 : }
28 :
29 : inline bool IsUnicode(JSRegExp::Flags flags) {
30 1682 : return (flags & JSRegExp::kUnicode) != 0;
31 : }
32 :
33 : inline bool IsSticky(JSRegExp::Flags flags) {
34 85739 : return (flags & JSRegExp::kSticky) != 0;
35 : }
36 :
37 : inline bool IsGlobal(JSRegExp::Flags flags) {
38 85739 : return (flags & JSRegExp::kGlobal) != 0;
39 : }
40 :
41 : inline bool DotAll(JSRegExp::Flags flags) {
42 : return (flags & JSRegExp::kDotAll) != 0;
43 : }
44 :
45 : inline bool Multiline(JSRegExp::Flags flags) {
46 : return (flags & JSRegExp::kMultiline) != 0;
47 : }
48 :
49 : inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
50 : // Both unicode and ignore_case flags are set. We need to use ICU to find
51 : // the closure over case equivalents.
52 422674 : return IsUnicode(flags) && IgnoreCase(flags);
53 : }
54 :
55 : class RegExpImpl {
56 : public:
57 : // Whether the irregexp engine generates native code or interpreter bytecode.
58 : static bool UsesNativeRegExp() { return !FLAG_regexp_interpret_all; }
59 :
60 : // Returns a string representation of a regular expression.
61 : // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
62 : // This function calls the garbage collector if necessary.
63 : static Handle<String> ToString(Handle<Object> value);
64 :
65 : // Parses the RegExp pattern and prepares the JSRegExp object with
66 : // generic data and choice of implementation - as well as what
67 : // the implementation wants to store in the data field.
68 : // Returns false if compilation fails.
69 : V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
70 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
71 : JSRegExp::Flags flags);
72 :
73 : // See ECMA-262 section 15.10.6.2.
74 : // This function calls the garbage collector if necessary.
75 : V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
76 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
77 : int index, Handle<RegExpMatchInfo> last_match_info);
78 :
79 : // Prepares a JSRegExp object with Irregexp-specific data.
80 : static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
81 : Handle<String> pattern, JSRegExp::Flags flags,
82 : int capture_register_count);
83 :
84 : static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
85 : Handle<String> pattern, JSRegExp::Flags flags,
86 : Handle<String> match_pattern);
87 :
88 : static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
89 : Handle<String> subject, int index, int32_t* output,
90 : int output_size);
91 :
92 : static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
93 : Handle<String> subject, int index,
94 : Handle<RegExpMatchInfo> last_match_info);
95 :
96 : enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
97 :
98 : // Prepare a RegExp for being executed one or more times (using
99 : // IrregexpExecOnce) on the subject.
100 : // This ensures that the regexp is compiled for the subject, and that
101 : // the subject is flat.
102 : // Returns the number of integer spaces required by IrregexpExecOnce
103 : // as its "registers" argument. If the regexp cannot be compiled,
104 : // an exception is set as pending, and this function returns negative.
105 : static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
106 : Handle<String> subject);
107 :
108 : // Execute a regular expression on the subject, starting from index.
109 : // If matching succeeds, return the number of matches. This can be larger
110 : // than one in the case of global regular expressions.
111 : // The captures and subcaptures are stored into the registers vector.
112 : // If matching fails, returns RE_FAILURE.
113 : // If execution fails, sets a pending exception and returns RE_EXCEPTION.
114 : static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
115 : Handle<String> subject, int index, int32_t* output,
116 : int output_size);
117 :
118 : // Execute an Irregexp bytecode pattern.
119 : // On a successful match, the result is a JSArray containing
120 : // captured positions. On a failure, the result is the null value.
121 : // Returns an empty handle in case of an exception.
122 : V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
123 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
124 : int index, Handle<RegExpMatchInfo> last_match_info);
125 :
126 : // Set last match info. If match is nullptr, then setting captures is
127 : // omitted.
128 : static Handle<RegExpMatchInfo> SetLastMatchInfo(
129 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
130 : Handle<String> subject, int capture_count, int32_t* match);
131 :
132 : class GlobalCache {
133 : public:
134 : GlobalCache(Handle<JSRegExp> regexp,
135 : Handle<String> subject,
136 : Isolate* isolate);
137 :
138 : V8_INLINE ~GlobalCache();
139 :
140 : // Fetch the next entry in the cache for global regexp match results.
141 : // This does not set the last match info. Upon failure, nullptr is
142 : // returned. The cause can be checked with Result(). The previous result is
143 : // still in available in memory when a failure happens.
144 : V8_INLINE int32_t* FetchNext();
145 :
146 : V8_INLINE int32_t* LastSuccessfulMatch();
147 :
148 190901 : V8_INLINE bool HasException() { return num_matches_ < 0; }
149 :
150 : private:
151 : int AdvanceZeroLength(int last_index);
152 :
153 : int num_matches_;
154 : int max_matches_;
155 : int current_match_index_;
156 : int registers_per_match_;
157 : // Pointer to the last set of captures.
158 : int32_t* register_array_;
159 : int register_array_size_;
160 : Handle<JSRegExp> regexp_;
161 : Handle<String> subject_;
162 : Isolate* isolate_;
163 : };
164 :
165 : // For acting on the JSRegExp data FixedArray.
166 : static int IrregexpMaxRegisterCount(FixedArray re);
167 : static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
168 : static void SetIrregexpCaptureNameMap(FixedArray re,
169 : Handle<FixedArray> value);
170 : static int IrregexpNumberOfCaptures(FixedArray re);
171 : static int IrregexpNumberOfRegisters(FixedArray re);
172 : static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
173 : static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
174 :
175 : // Limit the space regexps take up on the heap. In order to limit this we
176 : // would like to keep track of the amount of regexp code on the heap. This
177 : // is not tracked, however. As a conservative approximation we track the
178 : // total regexp code compiled including code that has subsequently been freed
179 : // and the total executable memory at any point.
180 : static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
181 : static const size_t kRegExpCompiledLimit = 1 * MB;
182 : static const int kRegExpTooLargeToOptimize = 20 * KB;
183 :
184 : private:
185 : static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
186 : Handle<String> sample_subject, bool is_one_byte);
187 : static inline bool EnsureCompiledIrregexp(Isolate* isolate,
188 : Handle<JSRegExp> re,
189 : Handle<String> sample_subject,
190 : bool is_one_byte);
191 : };
192 :
193 :
194 : // Represents the location of one element relative to the intersection of
195 : // two sets. Corresponds to the four areas of a Venn diagram.
196 : enum ElementInSetsRelation {
197 : kInsideNone = 0,
198 : kInsideFirst = 1,
199 : kInsideSecond = 2,
200 : kInsideBoth = 3
201 : };
202 :
203 :
204 : // A set of unsigned integers that behaves especially well on small
205 : // integers (< 32). May do zone-allocation.
206 : class OutSet: public ZoneObject {
207 : public:
208 1027423 : OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
209 : OutSet* Extend(unsigned value, Zone* zone);
210 : V8_EXPORT_PRIVATE bool Get(unsigned value) const;
211 : static const unsigned kFirstLimit = 32;
212 :
213 : private:
214 : // Destructively set a value in this set. In most cases you want
215 : // to use Extend instead to ensure that only one instance exists
216 : // that contains the same values.
217 : void Set(unsigned value, Zone* zone);
218 :
219 : // The successors are a list of sets that contain the same values
220 : // as this set and the one more value that is not present in this
221 : // set.
222 : ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
223 :
224 : OutSet(uint32_t first, ZoneList<unsigned>* remaining)
225 16997 : : first_(first), remaining_(remaining), successors_(nullptr) {}
226 : uint32_t first_;
227 : ZoneList<unsigned>* remaining_;
228 : ZoneList<OutSet*>* successors_;
229 : friend class Trace;
230 : };
231 :
232 :
233 : // A mapping from integers, specified as ranges, to a set of integers.
234 : // Used for mapping character ranges to choices.
235 2647 : class DispatchTable : public ZoneObject {
236 : public:
237 : explicit DispatchTable(Zone* zone) : tree_(zone) { }
238 :
239 : class Entry {
240 : public:
241 : Entry() : from_(0), to_(0), out_set_(nullptr) {}
242 : Entry(uc32 from, uc32 to, OutSet* out_set)
243 : : from_(from), to_(to), out_set_(out_set) {
244 : DCHECK(from <= to);
245 : }
246 : uc32 from() { return from_; }
247 : uc32 to() { return to_; }
248 4830 : void set_to(uc32 value) { to_ = value; }
249 : void AddValue(int value, Zone* zone) {
250 81858 : out_set_ = out_set_->Extend(value, zone);
251 : }
252 : OutSet* out_set() { return out_set_; }
253 : private:
254 : uc32 from_;
255 : uc32 to_;
256 : OutSet* out_set_;
257 : };
258 :
259 : class Config {
260 : public:
261 : using Key = uc32;
262 : using Value = Entry;
263 : static const uc32 kNoKey;
264 : static const Entry NoValue() { return Value(); }
265 : static inline int Compare(uc32 a, uc32 b) {
266 1587140 : if (a == b)
267 : return 0;
268 1554355 : else if (a < b)
269 : return -1;
270 : else
271 : return 1;
272 : }
273 : };
274 :
275 : V8_EXPORT_PRIVATE void AddRange(CharacterRange range, int value, Zone* zone);
276 : V8_EXPORT_PRIVATE OutSet* Get(uc32 value);
277 : void Dump();
278 :
279 : template <typename Callback>
280 : void ForEach(Callback* callback) {
281 2587 : return tree()->ForEach(callback);
282 : }
283 :
284 : private:
285 : // There can't be a static empty set since it allocates its
286 : // successors in a zone and caches them.
287 174036 : OutSet* empty() { return &empty_; }
288 : OutSet empty_;
289 : ZoneSplayTree<Config>* tree() { return &tree_; }
290 : ZoneSplayTree<Config> tree_;
291 : };
292 :
293 :
294 : // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
295 2587 : class UnicodeRangeSplitter {
296 : public:
297 : V8_EXPORT_PRIVATE UnicodeRangeSplitter(Zone* zone,
298 : ZoneList<CharacterRange>* base);
299 : void Call(uc32 from, DispatchTable::Entry entry);
300 :
301 : ZoneList<CharacterRange>* bmp() { return bmp_; }
302 : ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
303 : ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
304 : ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
305 :
306 : private:
307 : static const int kBase = 0;
308 : // Separate ranges into
309 : static const int kBmpCodePoints = 1;
310 : static const int kLeadSurrogates = 2;
311 : static const int kTrailSurrogates = 3;
312 : static const int kNonBmpCodePoints = 4;
313 :
314 : Zone* zone_;
315 : DispatchTable table_;
316 : ZoneList<CharacterRange>* bmp_;
317 : ZoneList<CharacterRange>* lead_surrogates_;
318 : ZoneList<CharacterRange>* trail_surrogates_;
319 : ZoneList<CharacterRange>* non_bmp_;
320 : };
321 :
322 : #define FOR_EACH_NODE_TYPE(VISIT) \
323 : VISIT(End) \
324 : VISIT(Action) \
325 : VISIT(Choice) \
326 : VISIT(BackReference) \
327 : VISIT(Assertion) \
328 : VISIT(Text)
329 :
330 :
331 : class Trace;
332 : struct PreloadState;
333 : class GreedyLoopState;
334 : class AlternativeGenerationList;
335 :
336 : struct NodeInfo {
337 : NodeInfo()
338 : : being_analyzed(false),
339 : been_analyzed(false),
340 : follows_word_interest(false),
341 : follows_newline_interest(false),
342 : follows_start_interest(false),
343 : at_end(false),
344 : visited(false),
345 4384835 : replacement_calculated(false) { }
346 :
347 : // Returns true if the interests and assumptions of this node
348 : // matches the given one.
349 : bool Matches(NodeInfo* that) {
350 : return (at_end == that->at_end) &&
351 : (follows_word_interest == that->follows_word_interest) &&
352 : (follows_newline_interest == that->follows_newline_interest) &&
353 : (follows_start_interest == that->follows_start_interest);
354 : }
355 :
356 : // Updates the interests of this node given the interests of the
357 : // node preceding it.
358 : void AddFromPreceding(NodeInfo* that) {
359 : at_end |= that->at_end;
360 : follows_word_interest |= that->follows_word_interest;
361 : follows_newline_interest |= that->follows_newline_interest;
362 : follows_start_interest |= that->follows_start_interest;
363 : }
364 :
365 : bool HasLookbehind() {
366 : return follows_word_interest ||
367 : follows_newline_interest ||
368 : follows_start_interest;
369 : }
370 :
371 : // Sets the interests of this node to include the interests of the
372 : // following node.
373 : void AddFromFollowing(NodeInfo* that) {
374 569751 : follows_word_interest |= that->follows_word_interest;
375 569751 : follows_newline_interest |= that->follows_newline_interest;
376 569751 : follows_start_interest |= that->follows_start_interest;
377 : }
378 :
379 : void ResetCompilationState() {
380 : being_analyzed = false;
381 : been_analyzed = false;
382 : }
383 :
384 : bool being_analyzed: 1;
385 : bool been_analyzed: 1;
386 :
387 : // These bits are set of this node has to know what the preceding
388 : // character was.
389 : bool follows_word_interest: 1;
390 : bool follows_newline_interest: 1;
391 : bool follows_start_interest: 1;
392 :
393 : bool at_end: 1;
394 : bool visited: 1;
395 : bool replacement_calculated: 1;
396 : };
397 :
398 :
399 : // Details of a quick mask-compare check that can look ahead in the
400 : // input stream.
401 : class QuickCheckDetails {
402 : public:
403 : QuickCheckDetails()
404 : : characters_(0),
405 : mask_(0),
406 : value_(0),
407 14103380 : cannot_match_(false) { }
408 : explicit QuickCheckDetails(int characters)
409 : : characters_(characters),
410 : mask_(0),
411 : value_(0),
412 787450 : cannot_match_(false) { }
413 : bool Rationalize(bool one_byte);
414 : // Merge in the information from another branch of an alternation.
415 : void Merge(QuickCheckDetails* other, int from_index);
416 : // Advance the current position by some amount.
417 : void Advance(int by, bool one_byte);
418 : void Clear();
419 : bool cannot_match() { return cannot_match_; }
420 431 : void set_cannot_match() { cannot_match_ = true; }
421 : struct Position {
422 11912664 : Position() : mask(0), value(0), determines_perfectly(false) { }
423 : uc16 mask;
424 : uc16 value;
425 : bool determines_perfectly;
426 : };
427 : int characters() { return characters_; }
428 606191 : void set_characters(int characters) { characters_ = characters; }
429 : Position* positions(int index) {
430 : DCHECK_LE(0, index);
431 : DCHECK_GT(characters_, index);
432 1629389 : return positions_ + index;
433 : }
434 : uint32_t mask() { return mask_; }
435 : uint32_t value() { return value_; }
436 :
437 : private:
438 : // How many characters do we have quick check information from. This is
439 : // the same for all branches of a choice node.
440 : int characters_;
441 : Position positions_[4];
442 : // These values are the condensate of the above array after Rationalize().
443 : uint32_t mask_;
444 : uint32_t value_;
445 : // If set to true, there is no way this quick check can match at all.
446 : // E.g., if it requires to be at the start of the input, and isn't.
447 : bool cannot_match_;
448 : };
449 :
450 :
451 : extern int kUninitializedRegExpNodePlaceHolder;
452 :
453 :
454 0 : class RegExpNode: public ZoneObject {
455 : public:
456 : explicit RegExpNode(Zone* zone)
457 : : replacement_(nullptr),
458 : on_work_list_(false),
459 : trace_count_(0),
460 13154505 : zone_(zone) {
461 4384835 : bm_info_[0] = bm_info_[1] = nullptr;
462 : }
463 : virtual ~RegExpNode();
464 : virtual void Accept(NodeVisitor* visitor) = 0;
465 : // Generates a goto to this node or actually generates the code at this point.
466 : virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
467 : // How many characters must this node consume at a minimum in order to
468 : // succeed. If we have found at least 'still_to_find' characters that
469 : // must be consumed there is no need to ask any following nodes whether
470 : // they are sure to eat any more characters. The not_at_start argument is
471 : // used to indicate that we know we are not at the start of the input. In
472 : // this case anchored branches will always fail and can be ignored when
473 : // determining how many characters are consumed on success.
474 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
475 : // Emits some quick code that checks whether the preloaded characters match.
476 : // Falls through on certain failure, jumps to the label on possible success.
477 : // If the node cannot make a quick check it does nothing and returns false.
478 : bool EmitQuickCheck(RegExpCompiler* compiler,
479 : Trace* bounds_check_trace,
480 : Trace* trace,
481 : bool preload_has_checked_bounds,
482 : Label* on_possible_success,
483 : QuickCheckDetails* details_return,
484 : bool fall_through_on_failure);
485 : // For a given number of characters this returns a mask and a value. The
486 : // next n characters are anded with the mask and compared with the value.
487 : // A comparison failure indicates the node cannot match the next n characters.
488 : // A comparison success indicates the node may match.
489 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
490 : RegExpCompiler* compiler,
491 : int characters_filled_in,
492 : bool not_at_start) = 0;
493 : static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
494 87129 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
495 : // Only returns the successor for a text node of length 1 that matches any
496 : // character and that has no guards on it.
497 78558 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
498 : RegExpCompiler* compiler) {
499 78558 : return nullptr;
500 : }
501 :
502 : // Collects information on the possible code units (mod 128) that can match if
503 : // we look forward. This is used for a Boyer-Moore-like string searching
504 : // implementation. TODO(erikcorry): This should share more code with
505 : // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
506 : // the number of nodes we are willing to look at in order to create this data.
507 : static const int kRecursionBudget = 200;
508 : bool KeepRecursing(RegExpCompiler* compiler);
509 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
510 : BoyerMooreLookahead* bm, bool not_at_start) {
511 0 : UNREACHABLE();
512 : }
513 :
514 : // If we know that the input is one-byte then there are some nodes that can
515 : // never match. This method returns a node that can be substituted for
516 : // itself, or nullptr if the node can never match.
517 14528 : virtual RegExpNode* FilterOneByte(int depth) { return this; }
518 : // Helper for FilterOneByte.
519 : RegExpNode* replacement() {
520 : DCHECK(info()->replacement_calculated);
521 : return replacement_;
522 : }
523 : RegExpNode* set_replacement(RegExpNode* replacement) {
524 161464 : info()->replacement_calculated = true;
525 161464 : replacement_ = replacement;
526 : return replacement; // For convenience.
527 : }
528 :
529 : // We want to avoid recalculating the lookahead info, so we store it on the
530 : // node. Only info that is for this node is stored. We can tell that the
531 : // info is for this node when offset == 0, so the information is calculated
532 : // relative to this node.
533 : void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
534 104194 : if (offset == 0) set_bm_info(not_at_start, bm);
535 : }
536 :
537 461879 : Label* label() { return &label_; }
538 : // If non-generic code is generated for a node (i.e. the node is not at the
539 : // start of the trace) then it cannot be reused. This variable sets a limit
540 : // on how often we allow that to happen before we insist on starting a new
541 : // trace and generating generic code for a node that can be reused by flushing
542 : // the deferred actions in the current trace and generating a goto.
543 : static const int kMaxCopiesCodeGenerated = 10;
544 :
545 : bool on_work_list() { return on_work_list_; }
546 421968 : void set_on_work_list(bool value) { on_work_list_ = value; }
547 :
548 197217 : NodeInfo* info() { return &info_; }
549 :
550 : BoyerMooreLookahead* bm_info(bool not_at_start) {
551 82469 : return bm_info_[not_at_start ? 1 : 0];
552 : }
553 :
554 : Zone* zone() const { return zone_; }
555 :
556 : protected:
557 : enum LimitResult { DONE, CONTINUE };
558 : RegExpNode* replacement_;
559 :
560 : LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
561 :
562 : void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
563 198139 : bm_info_[not_at_start ? 1 : 0] = bm;
564 : }
565 :
566 : private:
567 : static const int kFirstCharBudget = 10;
568 : Label label_;
569 : bool on_work_list_;
570 : NodeInfo info_;
571 : // This variable keeps track of how many times code has been generated for
572 : // this node (in different traces). We don't keep track of where the
573 : // generated code is located unless the code is generated at the start of
574 : // a trace, in which case it is generic and can be reused by flushing the
575 : // deferred operations in the current trace and generating a goto.
576 : int trace_count_;
577 : BoyerMooreLookahead* bm_info_[2];
578 :
579 : Zone* zone_;
580 : };
581 :
582 :
583 0 : class SeqRegExpNode: public RegExpNode {
584 : public:
585 : explicit SeqRegExpNode(RegExpNode* on_success)
586 3270685 : : RegExpNode(on_success->zone()), on_success_(on_success) { }
587 : RegExpNode* on_success() { return on_success_; }
588 : void set_on_success(RegExpNode* node) { on_success_ = node; }
589 : RegExpNode* FilterOneByte(int depth) override;
590 0 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
591 : BoyerMooreLookahead* bm, bool not_at_start) override {
592 0 : on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
593 0 : if (offset == 0) set_bm_info(not_at_start, bm);
594 0 : }
595 :
596 : protected:
597 : RegExpNode* FilterSuccessor(int depth);
598 :
599 : private:
600 : RegExpNode* on_success_;
601 : };
602 :
603 :
604 0 : class ActionNode: public SeqRegExpNode {
605 : public:
606 : enum ActionType {
607 : SET_REGISTER,
608 : INCREMENT_REGISTER,
609 : STORE_POSITION,
610 : BEGIN_SUBMATCH,
611 : POSITIVE_SUBMATCH_SUCCESS,
612 : EMPTY_MATCH_CHECK,
613 : CLEAR_CAPTURES
614 : };
615 : static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
616 : static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
617 : static ActionNode* StorePosition(int reg,
618 : bool is_capture,
619 : RegExpNode* on_success);
620 : static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
621 : static ActionNode* BeginSubmatch(int stack_pointer_reg,
622 : int position_reg,
623 : RegExpNode* on_success);
624 : static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
625 : int restore_reg,
626 : int clear_capture_count,
627 : int clear_capture_from,
628 : RegExpNode* on_success);
629 : static ActionNode* EmptyMatchCheck(int start_register,
630 : int repetition_register,
631 : int repetition_limit,
632 : RegExpNode* on_success);
633 : void Accept(NodeVisitor* visitor) override;
634 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
635 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
636 64962 : void GetQuickCheckDetails(QuickCheckDetails* details,
637 : RegExpCompiler* compiler, int filled_in,
638 : bool not_at_start) override {
639 64962 : return on_success()->GetQuickCheckDetails(
640 129924 : details, compiler, filled_in, not_at_start);
641 : }
642 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
643 : BoyerMooreLookahead* bm, bool not_at_start) override;
644 : ActionType action_type() { return action_type_; }
645 : // TODO(erikcorry): We should allow some action nodes in greedy loops.
646 114908 : int GreedyLoopTextLength() override {
647 114908 : return kNodeIsTooComplexForGreedyLoops;
648 : }
649 :
650 : private:
651 : union {
652 : struct {
653 : int reg;
654 : int value;
655 : } u_store_register;
656 : struct {
657 : int reg;
658 : } u_increment_register;
659 : struct {
660 : int reg;
661 : bool is_capture;
662 : } u_position_register;
663 : struct {
664 : int stack_pointer_register;
665 : int current_position_register;
666 : int clear_register_count;
667 : int clear_register_from;
668 : } u_submatch;
669 : struct {
670 : int start_register;
671 : int repetition_register;
672 : int repetition_limit;
673 : } u_empty_match_check;
674 : struct {
675 : int range_from;
676 : int range_to;
677 : } u_clear_captures;
678 : } data_;
679 : ActionNode(ActionType action_type, RegExpNode* on_success)
680 : : SeqRegExpNode(on_success),
681 2042178 : action_type_(action_type) { }
682 : ActionType action_type_;
683 : friend class DotPrinter;
684 : };
685 :
686 :
687 0 : class TextNode: public SeqRegExpNode {
688 : public:
689 : TextNode(ZoneList<TextElement>* elms, bool read_backward,
690 : RegExpNode* on_success)
691 1046757 : : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
692 173926 : TextNode(RegExpCharacterClass* that, bool read_backward,
693 : RegExpNode* on_success)
694 : : SeqRegExpNode(on_success),
695 : elms_(new (zone()) ZoneList<TextElement>(1, zone())),
696 347852 : read_backward_(read_backward) {
697 173926 : elms_->Add(TextElement::CharClass(that), zone());
698 173926 : }
699 : // Create TextNode for a single character class for the given ranges.
700 : static TextNode* CreateForCharacterRanges(Zone* zone,
701 : ZoneList<CharacterRange>* ranges,
702 : bool read_backward,
703 : RegExpNode* on_success,
704 : JSRegExp::Flags flags);
705 : // Create TextNode for a surrogate pair with a range given for the
706 : // lead and the trail surrogate each.
707 : static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
708 : CharacterRange trail,
709 : bool read_backward,
710 : RegExpNode* on_success,
711 : JSRegExp::Flags flags);
712 : void Accept(NodeVisitor* visitor) override;
713 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
714 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
715 : void GetQuickCheckDetails(QuickCheckDetails* details,
716 : RegExpCompiler* compiler, int characters_filled_in,
717 : bool not_at_start) override;
718 : ZoneList<TextElement>* elements() { return elms_; }
719 : bool read_backward() { return read_backward_; }
720 : void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
721 : int GreedyLoopTextLength() override;
722 : RegExpNode* GetSuccessorOfOmnivorousTextNode(
723 : RegExpCompiler* compiler) override;
724 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
725 : BoyerMooreLookahead* bm, bool not_at_start) override;
726 : void CalculateOffsets();
727 : RegExpNode* FilterOneByte(int depth) override;
728 :
729 : private:
730 : enum TextEmitPassType {
731 : NON_LATIN1_MATCH, // Check for characters that can't match.
732 : SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
733 : NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
734 : CASE_CHARACTER_MATCH, // Case-independent single character check.
735 : CHARACTER_CLASS_MATCH // Character class.
736 : };
737 : static bool SkipPass(TextEmitPassType pass, bool ignore_case);
738 : static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
739 : static const int kLastPass = CHARACTER_CLASS_MATCH;
740 : void TextEmitPass(RegExpCompiler* compiler,
741 : TextEmitPassType pass,
742 : bool preloaded,
743 : Trace* trace,
744 : bool first_element_checked,
745 : int* checked_up_to);
746 : int Length();
747 : ZoneList<TextElement>* elms_;
748 : bool read_backward_;
749 : };
750 :
751 :
752 0 : class AssertionNode: public SeqRegExpNode {
753 : public:
754 : enum AssertionType {
755 : AT_END,
756 : AT_START,
757 : AT_BOUNDARY,
758 : AT_NON_BOUNDARY,
759 : AFTER_NEWLINE
760 : };
761 2014 : static AssertionNode* AtEnd(RegExpNode* on_success) {
762 2014 : return new(on_success->zone()) AssertionNode(AT_END, on_success);
763 : }
764 3055 : static AssertionNode* AtStart(RegExpNode* on_success) {
765 3055 : return new(on_success->zone()) AssertionNode(AT_START, on_success);
766 : }
767 136 : static AssertionNode* AtBoundary(RegExpNode* on_success) {
768 136 : return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
769 : }
770 114 : static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
771 114 : return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
772 : }
773 129 : static AssertionNode* AfterNewline(RegExpNode* on_success) {
774 129 : return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
775 : }
776 : void Accept(NodeVisitor* visitor) override;
777 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
778 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
779 : void GetQuickCheckDetails(QuickCheckDetails* details,
780 : RegExpCompiler* compiler, int filled_in,
781 : bool not_at_start) override;
782 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
783 : BoyerMooreLookahead* bm, bool not_at_start) override;
784 : AssertionType assertion_type() { return assertion_type_; }
785 :
786 : private:
787 : void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
788 : enum IfPrevious { kIsNonWord, kIsWord };
789 : void BacktrackIfPrevious(RegExpCompiler* compiler,
790 : Trace* trace,
791 : IfPrevious backtrack_if_previous);
792 : AssertionNode(AssertionType t, RegExpNode* on_success)
793 5448 : : SeqRegExpNode(on_success), assertion_type_(t) { }
794 : AssertionType assertion_type_;
795 : };
796 :
797 :
798 0 : class BackReferenceNode: public SeqRegExpNode {
799 : public:
800 : BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
801 : bool read_backward, RegExpNode* on_success)
802 : : SeqRegExpNode(on_success),
803 : start_reg_(start_reg),
804 : end_reg_(end_reg),
805 : flags_(flags),
806 2376 : read_backward_(read_backward) {}
807 : void Accept(NodeVisitor* visitor) override;
808 : int start_register() { return start_reg_; }
809 : int end_register() { return end_reg_; }
810 : bool read_backward() { return read_backward_; }
811 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
812 : int EatsAtLeast(int still_to_find, int recursion_depth,
813 : bool not_at_start) override;
814 522 : void GetQuickCheckDetails(QuickCheckDetails* details,
815 : RegExpCompiler* compiler, int characters_filled_in,
816 : bool not_at_start) override {
817 522 : return;
818 : }
819 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
820 : BoyerMooreLookahead* bm, bool not_at_start) override;
821 :
822 : private:
823 : int start_reg_;
824 : int end_reg_;
825 : JSRegExp::Flags flags_;
826 : bool read_backward_;
827 : };
828 :
829 :
830 0 : class EndNode: public RegExpNode {
831 : public:
832 : enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
833 88851 : EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
834 : void Accept(NodeVisitor* visitor) override;
835 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
836 115220 : int EatsAtLeast(int still_to_find, int recursion_depth,
837 : bool not_at_start) override {
838 115220 : return 0;
839 : }
840 0 : void GetQuickCheckDetails(QuickCheckDetails* details,
841 : RegExpCompiler* compiler, int characters_filled_in,
842 : bool not_at_start) override {
843 : // Returning 0 from EatsAtLeast should ensure we never get here.
844 0 : UNREACHABLE();
845 : }
846 0 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
847 : BoyerMooreLookahead* bm, bool not_at_start) override {
848 : // Returning 0 from EatsAtLeast should ensure we never get here.
849 0 : UNREACHABLE();
850 : }
851 :
852 : private:
853 : Action action_;
854 : };
855 :
856 :
857 0 : class NegativeSubmatchSuccess: public EndNode {
858 : public:
859 : NegativeSubmatchSuccess(int stack_pointer_reg,
860 : int position_reg,
861 : int clear_capture_count,
862 : int clear_capture_start,
863 : Zone* zone)
864 : : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
865 : stack_pointer_register_(stack_pointer_reg),
866 : current_position_register_(position_reg),
867 : clear_capture_count_(clear_capture_count),
868 2812 : clear_capture_start_(clear_capture_start) { }
869 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
870 :
871 : private:
872 : int stack_pointer_register_;
873 : int current_position_register_;
874 : int clear_capture_count_;
875 : int clear_capture_start_;
876 : };
877 :
878 :
879 : class Guard: public ZoneObject {
880 : public:
881 : enum Relation { LT, GEQ };
882 : Guard(int reg, Relation op, int value)
883 : : reg_(reg),
884 : op_(op),
885 903942 : value_(value) { }
886 : int reg() { return reg_; }
887 : Relation op() { return op_; }
888 : int value() { return value_; }
889 :
890 : private:
891 : int reg_;
892 : Relation op_;
893 : int value_;
894 : };
895 :
896 :
897 : class GuardedAlternative {
898 : public:
899 : explicit GuardedAlternative(RegExpNode* node)
900 2001726 : : node_(node), guards_(nullptr) {}
901 : void AddGuard(Guard* guard, Zone* zone);
902 : RegExpNode* node() { return node_; }
903 58605 : void set_node(RegExpNode* node) { node_ = node; }
904 : ZoneList<Guard*>* guards() { return guards_; }
905 :
906 : private:
907 : RegExpNode* node_;
908 : ZoneList<Guard*>* guards_;
909 : };
910 :
911 :
912 : class AlternativeGeneration;
913 :
914 :
915 0 : class ChoiceNode: public RegExpNode {
916 : public:
917 1025299 : explicit ChoiceNode(int expected_size, Zone* zone)
918 : : RegExpNode(zone),
919 : alternatives_(new (zone)
920 : ZoneList<GuardedAlternative>(expected_size, zone)),
921 : table_(nullptr),
922 : not_at_start_(false),
923 2050598 : being_calculated_(false) {}
924 : void Accept(NodeVisitor* visitor) override;
925 : void AddAlternative(GuardedAlternative node) {
926 2122158 : alternatives()->Add(node, zone());
927 : }
928 : ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
929 : DispatchTable* GetTable(bool ignore_case);
930 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
931 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
932 : int EatsAtLeastHelper(int still_to_find,
933 : int budget,
934 : RegExpNode* ignore_this_node,
935 : bool not_at_start);
936 : void GetQuickCheckDetails(QuickCheckDetails* details,
937 : RegExpCompiler* compiler, int characters_filled_in,
938 : bool not_at_start) override;
939 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
940 : BoyerMooreLookahead* bm, bool not_at_start) override;
941 :
942 : bool being_calculated() { return being_calculated_; }
943 : bool not_at_start() { return not_at_start_; }
944 4123 : void set_not_at_start() { not_at_start_ = true; }
945 0 : void set_being_calculated(bool b) { being_calculated_ = b; }
946 473976 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
947 473976 : return true;
948 : }
949 : RegExpNode* FilterOneByte(int depth) override;
950 0 : virtual bool read_backward() { return false; }
951 :
952 : protected:
953 : int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
954 : ZoneList<GuardedAlternative>* alternatives_;
955 :
956 : private:
957 : friend class DispatchTableConstructor;
958 : friend class Analysis;
959 : void GenerateGuard(RegExpMacroAssembler* macro_assembler,
960 : Guard* guard,
961 : Trace* trace);
962 : int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
963 : void EmitOutOfLineContinuation(RegExpCompiler* compiler,
964 : Trace* trace,
965 : GuardedAlternative alternative,
966 : AlternativeGeneration* alt_gen,
967 : int preload_characters,
968 : bool next_expects_preload);
969 : void SetUpPreLoad(RegExpCompiler* compiler,
970 : Trace* current_trace,
971 : PreloadState* preloads);
972 : void AssertGuardsMentionRegisters(Trace* trace);
973 : int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
974 : Trace* EmitGreedyLoop(RegExpCompiler* compiler,
975 : Trace* trace,
976 : AlternativeGenerationList* alt_gens,
977 : PreloadState* preloads,
978 : GreedyLoopState* greedy_loop_state,
979 : int text_length);
980 : void EmitChoices(RegExpCompiler* compiler,
981 : AlternativeGenerationList* alt_gens,
982 : int first_choice,
983 : Trace* trace,
984 : PreloadState* preloads);
985 : DispatchTable* table_;
986 : // If true, this node is never checked at the start of the input.
987 : // Allows a new trace to start with at_start() set to false.
988 : bool not_at_start_;
989 : bool being_calculated_;
990 : };
991 :
992 :
993 0 : class NegativeLookaroundChoiceNode : public ChoiceNode {
994 : public:
995 2812 : explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
996 : GuardedAlternative then_do_this,
997 : Zone* zone)
998 2812 : : ChoiceNode(2, zone) {
999 : AddAlternative(this_must_fail);
1000 : AddAlternative(then_do_this);
1001 2812 : }
1002 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1003 : void GetQuickCheckDetails(QuickCheckDetails* details,
1004 : RegExpCompiler* compiler, int characters_filled_in,
1005 : bool not_at_start) override;
1006 1105 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
1007 : BoyerMooreLookahead* bm, bool not_at_start) override {
1008 2210 : alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
1009 2210 : not_at_start);
1010 1105 : if (offset == 0) set_bm_info(not_at_start, bm);
1011 1105 : }
1012 : // For a negative lookahead we don't emit the quick check for the
1013 : // alternative that is expected to fail. This is because quick check code
1014 : // starts by loading enough characters for the alternative that takes fewest
1015 : // characters, but on a negative lookahead the negative branch did not take
1016 : // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1017 5604 : bool try_to_emit_quick_check_for_alternative(bool is_first) override {
1018 5604 : return !is_first;
1019 : }
1020 : RegExpNode* FilterOneByte(int depth) override;
1021 : };
1022 :
1023 :
1024 0 : class LoopChoiceNode: public ChoiceNode {
1025 : public:
1026 : LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1027 : : ChoiceNode(2, zone),
1028 : loop_node_(nullptr),
1029 : continue_node_(nullptr),
1030 : body_can_be_zero_length_(body_can_be_zero_length),
1031 999457 : read_backward_(read_backward) {}
1032 : void AddLoopAlternative(GuardedAlternative alt);
1033 : void AddContinueAlternative(GuardedAlternative alt);
1034 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
1035 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1036 : void GetQuickCheckDetails(QuickCheckDetails* details,
1037 : RegExpCompiler* compiler, int characters_filled_in,
1038 : bool not_at_start) override;
1039 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
1040 : BoyerMooreLookahead* bm, bool not_at_start) override;
1041 : RegExpNode* loop_node() { return loop_node_; }
1042 : RegExpNode* continue_node() { return continue_node_; }
1043 : bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1044 23386 : bool read_backward() override { return read_backward_; }
1045 : void Accept(NodeVisitor* visitor) override;
1046 : RegExpNode* FilterOneByte(int depth) override;
1047 :
1048 : private:
1049 : // AddAlternative is made private for loop nodes because alternatives
1050 : // should not be added freely, we need to keep track of which node
1051 : // goes back to the node itself.
1052 : void AddAlternative(GuardedAlternative node) {
1053 : ChoiceNode::AddAlternative(node);
1054 : }
1055 :
1056 : RegExpNode* loop_node_;
1057 : RegExpNode* continue_node_;
1058 : bool body_can_be_zero_length_;
1059 : bool read_backward_;
1060 : };
1061 :
1062 :
1063 : // Improve the speed that we scan for an initial point where a non-anchored
1064 : // regexp can match by using a Boyer-Moore-like table. This is done by
1065 : // identifying non-greedy non-capturing loops in the nodes that eat any
1066 : // character one at a time. For example in the middle of the regexp
1067 : // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1068 : // inserted at the start of any non-anchored regexp.
1069 : //
1070 : // When we have found such a loop we look ahead in the nodes to find the set of
1071 : // characters that can come at given distances. For example for the regexp
1072 : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1073 : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1074 : // the lookahead info where the set of characters is reasonably constrained. In
1075 : // our example this is from index 1 to 2 (0 is not constrained). We can now
1076 : // look 3 characters ahead and if we don't find one of [f, o] (the union of
1077 : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1078 : //
1079 : // For Unicode input strings we do the same, but modulo 128.
1080 : //
1081 : // We also look at the first string fed to the regexp and use that to get a hint
1082 : // of the character frequencies in the inputs. This affects the assessment of
1083 : // whether the set of characters is 'reasonably constrained'.
1084 : //
1085 : // We also have another lookahead mechanism (called quick check in the code),
1086 : // which uses a wide load of multiple characters followed by a mask and compare
1087 : // to determine whether a match is possible at this point.
1088 : enum ContainedInLattice {
1089 : kNotYet = 0,
1090 : kLatticeIn = 1,
1091 : kLatticeOut = 2,
1092 : kLatticeUnknown = 3 // Can also mean both in and out.
1093 : };
1094 :
1095 :
1096 : inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1097 867386 : return static_cast<ContainedInLattice>(a | b);
1098 : }
1099 :
1100 :
1101 : ContainedInLattice AddRange(ContainedInLattice a,
1102 : const int* ranges,
1103 : int ranges_size,
1104 : Interval new_range);
1105 :
1106 :
1107 : class BoyerMoorePositionInfo : public ZoneObject {
1108 : public:
1109 99690 : explicit BoyerMoorePositionInfo(Zone* zone)
1110 : : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1111 : map_count_(0),
1112 : w_(kNotYet),
1113 : s_(kNotYet),
1114 : d_(kNotYet),
1115 99690 : surrogate_(kNotYet) {
1116 25620330 : for (int i = 0; i < kMapSize; i++) {
1117 12760320 : map_->Add(false, zone);
1118 : }
1119 99690 : }
1120 :
1121 : bool& at(int i) { return map_->at(i); }
1122 :
1123 : static const int kMapSize = 128;
1124 : static const int kMask = kMapSize - 1;
1125 :
1126 : int map_count() const { return map_count_; }
1127 :
1128 : void Set(int character);
1129 : void SetInterval(const Interval& interval);
1130 : void SetAll();
1131 : bool is_non_word() { return w_ == kLatticeOut; }
1132 : bool is_word() { return w_ == kLatticeIn; }
1133 :
1134 : private:
1135 : ZoneList<bool>* map_;
1136 : int map_count_; // Number of set bits in the map.
1137 : ContainedInLattice w_; // The \w character class.
1138 : ContainedInLattice s_; // The \s character class.
1139 : ContainedInLattice d_; // The \d character class.
1140 : ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1141 : };
1142 :
1143 :
1144 : class BoyerMooreLookahead : public ZoneObject {
1145 : public:
1146 : BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1147 :
1148 : int length() { return length_; }
1149 : int max_char() { return max_char_; }
1150 : RegExpCompiler* compiler() { return compiler_; }
1151 :
1152 : int Count(int map_number) {
1153 542556 : return bitmaps_->at(map_number)->map_count();
1154 : }
1155 :
1156 150 : BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1157 :
1158 : void Set(int map_number, int character) {
1159 86966 : if (character > max_char_) return;
1160 173932 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1161 : info->Set(character);
1162 : }
1163 :
1164 161207 : void SetInterval(int map_number, const Interval& interval) {
1165 161207 : if (interval.from() > max_char_) return;
1166 322414 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1167 161207 : if (interval.to() > max_char_) {
1168 0 : info->SetInterval(Interval(interval.from(), max_char_));
1169 : } else {
1170 161207 : info->SetInterval(interval);
1171 : }
1172 : }
1173 :
1174 5507 : void SetAll(int map_number) {
1175 11014 : bitmaps_->at(map_number)->SetAll();
1176 5507 : }
1177 :
1178 : void SetRest(int from_map) {
1179 1766 : for (int i = from_map; i < length_; i++) SetAll(i);
1180 : }
1181 : void EmitSkipInstructions(RegExpMacroAssembler* masm);
1182 :
1183 : private:
1184 : // This is the value obtained by EatsAtLeast. If we do not have at least this
1185 : // many characters left in the sample string then the match is bound to fail.
1186 : // Therefore it is OK to read a character this far ahead of the current match
1187 : // point.
1188 : int length_;
1189 : RegExpCompiler* compiler_;
1190 : // 0xff for Latin1, 0xffff for UTF-16.
1191 : int max_char_;
1192 : ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1193 :
1194 : int GetSkipTable(int min_lookahead,
1195 : int max_lookahead,
1196 : Handle<ByteArray> boolean_skip_table);
1197 : bool FindWorthwhileInterval(int* from, int* to);
1198 : int FindBestInterval(
1199 : int max_number_of_chars, int old_biggest_points, int* from, int* to);
1200 : };
1201 :
1202 :
1203 : // There are many ways to generate code for a node. This class encapsulates
1204 : // the current way we should be generating. In other words it encapsulates
1205 : // the current state of the code generator. The effect of this is that we
1206 : // generate code for paths that the matcher can take through the regular
1207 : // expression. A given node in the regexp can be code-generated several times
1208 : // as it can be part of several traces. For example for the regexp:
1209 : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1210 : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1211 : // to match foo is generated only once (the traces have a common prefix). The
1212 : // code to store the capture is deferred and generated (twice) after the places
1213 : // where baz has been matched.
1214 : class Trace {
1215 : public:
1216 : // A value for a property that is either known to be true, know to be false,
1217 : // or not known.
1218 : enum TriBool {
1219 : UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1220 : };
1221 :
1222 : class DeferredAction {
1223 : public:
1224 : DeferredAction(ActionNode::ActionType action_type, int reg)
1225 251202 : : action_type_(action_type), reg_(reg), next_(nullptr) {}
1226 : DeferredAction* next() { return next_; }
1227 : bool Mentions(int reg);
1228 : int reg() { return reg_; }
1229 : ActionNode::ActionType action_type() { return action_type_; }
1230 : private:
1231 : ActionNode::ActionType action_type_;
1232 : int reg_;
1233 : DeferredAction* next_;
1234 : friend class Trace;
1235 : };
1236 :
1237 : class DeferredCapture : public DeferredAction {
1238 : public:
1239 : DeferredCapture(int reg, bool is_capture, Trace* trace)
1240 : : DeferredAction(ActionNode::STORE_POSITION, reg),
1241 : cp_offset_(trace->cp_offset()),
1242 241749 : is_capture_(is_capture) { }
1243 : int cp_offset() { return cp_offset_; }
1244 : bool is_capture() { return is_capture_; }
1245 : private:
1246 : int cp_offset_;
1247 : bool is_capture_;
1248 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1249 : };
1250 :
1251 : class DeferredSetRegister : public DeferredAction {
1252 : public:
1253 : DeferredSetRegister(int reg, int value)
1254 : : DeferredAction(ActionNode::SET_REGISTER, reg),
1255 3471 : value_(value) { }
1256 : int value() { return value_; }
1257 : private:
1258 : int value_;
1259 : };
1260 :
1261 : class DeferredClearCaptures : public DeferredAction {
1262 : public:
1263 : explicit DeferredClearCaptures(Interval range)
1264 : : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1265 2232 : range_(range) { }
1266 : Interval range() { return range_; }
1267 : private:
1268 : Interval range_;
1269 : };
1270 :
1271 : class DeferredIncrementRegister : public DeferredAction {
1272 : public:
1273 : explicit DeferredIncrementRegister(int reg)
1274 : : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1275 : };
1276 :
1277 : Trace()
1278 : : cp_offset_(0),
1279 : actions_(nullptr),
1280 : backtrack_(nullptr),
1281 : stop_node_(nullptr),
1282 : loop_label_(nullptr),
1283 : characters_preloaded_(0),
1284 : bound_checked_up_to_(0),
1285 : flush_budget_(100),
1286 1278268 : at_start_(UNKNOWN) {}
1287 :
1288 : // End the trace. This involves flushing the deferred actions in the trace
1289 : // and pushing a backtrack location onto the backtrack stack. Once this is
1290 : // done we can start a new trace or go to one that has already been
1291 : // generated.
1292 : void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1293 : int cp_offset() { return cp_offset_; }
1294 : DeferredAction* actions() { return actions_; }
1295 : // A trivial trace is one that has no deferred actions or other state that
1296 : // affects the assumptions used when generating code. There is no recorded
1297 : // backtrack location in a trivial trace, so with a trivial trace we will
1298 : // generate code that, on a failure to match, gets the backtrack location
1299 : // from the backtrack stack rather than using a direct jump instruction. We
1300 : // always start code generation with a trivial trace and non-trivial traces
1301 : // are created as we emit code for nodes or add to the list of deferred
1302 : // actions in the trace. The location of the code generated for a node using
1303 : // a trivial trace is recorded in a label in the node so that gotos can be
1304 : // generated to that code.
1305 : bool is_trivial() {
1306 3502752 : return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1307 950555 : characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1308 3130658 : quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1309 : }
1310 : TriBool at_start() { return at_start_; }
1311 534005 : void set_at_start(TriBool at_start) { at_start_ = at_start; }
1312 : Label* backtrack() { return backtrack_; }
1313 : Label* loop_label() { return loop_label_; }
1314 : RegExpNode* stop_node() { return stop_node_; }
1315 : int characters_preloaded() { return characters_preloaded_; }
1316 : int bound_checked_up_to() { return bound_checked_up_to_; }
1317 : int flush_budget() { return flush_budget_; }
1318 2709889 : QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1319 : bool mentions_reg(int reg);
1320 : // Returns true if a deferred position store exists to the specified
1321 : // register and stores the offset in the out-parameter. Otherwise
1322 : // returns false.
1323 : bool GetStoredPosition(int reg, int* cp_offset);
1324 : // These set methods and AdvanceCurrentPositionInTrace should be used only on
1325 : // new traces - the intention is that traces are immutable after creation.
1326 : void add_action(DeferredAction* new_action) {
1327 : DCHECK(new_action->next_ == nullptr);
1328 251202 : new_action->next_ = actions_;
1329 251202 : actions_ = new_action;
1330 : }
1331 810865 : void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1332 11693 : void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1333 11693 : void set_loop_label(Label* label) { loop_label_ = label; }
1334 833117 : void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1335 435116 : void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1336 194702 : void set_flush_budget(int to) { flush_budget_ = to; }
1337 : void set_quick_check_performed(QuickCheckDetails* d) {
1338 226926 : quick_check_performed_ = *d;
1339 : }
1340 : void InvalidateCurrentCharacter();
1341 : void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1342 :
1343 : private:
1344 : int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1345 : void PerformDeferredActions(RegExpMacroAssembler* macro,
1346 : int max_register,
1347 : const OutSet& affected_registers,
1348 : OutSet* registers_to_pop,
1349 : OutSet* registers_to_clear,
1350 : Zone* zone);
1351 : void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1352 : int max_register,
1353 : const OutSet& registers_to_pop,
1354 : const OutSet& registers_to_clear);
1355 : int cp_offset_;
1356 : DeferredAction* actions_;
1357 : Label* backtrack_;
1358 : RegExpNode* stop_node_;
1359 : Label* loop_label_;
1360 : int characters_preloaded_;
1361 : int bound_checked_up_to_;
1362 : QuickCheckDetails quick_check_performed_;
1363 : int flush_budget_;
1364 : TriBool at_start_;
1365 : };
1366 :
1367 :
1368 : class GreedyLoopState {
1369 : public:
1370 : explicit GreedyLoopState(bool not_at_start);
1371 :
1372 11693 : Label* label() { return &label_; }
1373 11693 : Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1374 :
1375 : private:
1376 : Label label_;
1377 : Trace counter_backtrack_trace_;
1378 : };
1379 :
1380 :
1381 : struct PreloadState {
1382 : static const int kEatsAtLeastNotYetInitialized = -1;
1383 : bool preload_is_current_;
1384 : bool preload_has_checked_bounds_;
1385 : int preload_characters_;
1386 : int eats_at_least_;
1387 : void init() {
1388 213730 : eats_at_least_ = kEatsAtLeastNotYetInitialized;
1389 : }
1390 : };
1391 :
1392 :
1393 85794 : class NodeVisitor {
1394 : public:
1395 85794 : virtual ~NodeVisitor() = default;
1396 : #define DECLARE_VISIT(Type) \
1397 : virtual void Visit##Type(Type##Node* that) = 0;
1398 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1399 : #undef DECLARE_VISIT
1400 0 : virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1401 : };
1402 :
1403 :
1404 : // Node visitor used to add the start set of the alternatives to the
1405 : // dispatch table of a choice node.
1406 110 : class V8_EXPORT_PRIVATE DispatchTableConstructor : public NodeVisitor {
1407 : public:
1408 : DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1409 : Zone* zone)
1410 : : table_(table),
1411 : choice_index_(-1),
1412 : ignore_case_(ignore_case),
1413 55 : zone_(zone) { }
1414 :
1415 : void BuildTable(ChoiceNode* node);
1416 :
1417 : void AddRange(CharacterRange range) {
1418 580 : table()->AddRange(range, choice_index_, zone_);
1419 : }
1420 :
1421 : void AddInverse(ZoneList<CharacterRange>* ranges);
1422 :
1423 : #define DECLARE_VISIT(Type) \
1424 : virtual void Visit##Type(Type##Node* that);
1425 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1426 : #undef DECLARE_VISIT
1427 :
1428 : DispatchTable* table() { return table_; }
1429 55 : void set_choice_index(int value) { choice_index_ = value; }
1430 :
1431 : protected:
1432 : DispatchTable* table_;
1433 : int choice_index_;
1434 : bool ignore_case_;
1435 : Zone* zone_;
1436 : };
1437 :
1438 : // Assertion propagation moves information about assertions such as
1439 : // \b to the affected nodes. For instance, in /.\b./ information must
1440 : // be propagated to the first '.' that whatever follows needs to know
1441 : // if it matched a word or a non-word, and to the second '.' that it
1442 : // has to check if it succeeds a word or non-word. In this case the
1443 : // result will be something like:
1444 : //
1445 : // +-------+ +------------+
1446 : // | . | | . |
1447 : // +-------+ ---> +------------+
1448 : // | word? | | check word |
1449 : // +-------+ +------------+
1450 171478 : class Analysis: public NodeVisitor {
1451 : public:
1452 : Analysis(Isolate* isolate, bool is_one_byte)
1453 85739 : : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
1454 : void EnsureAnalyzed(RegExpNode* node);
1455 :
1456 : #define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
1457 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1458 : #undef DECLARE_VISIT
1459 : void VisitLoopChoice(LoopChoiceNode* that) override;
1460 :
1461 : bool has_failed() { return error_message_ != nullptr; }
1462 : const char* error_message() {
1463 : DCHECK(error_message_ != nullptr);
1464 : return error_message_;
1465 : }
1466 : void fail(const char* error_message) {
1467 340 : error_message_ = error_message;
1468 : }
1469 :
1470 : Isolate* isolate() const { return isolate_; }
1471 :
1472 : private:
1473 : Isolate* isolate_;
1474 : bool is_one_byte_;
1475 : const char* error_message_;
1476 :
1477 : DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1478 : };
1479 :
1480 :
1481 : struct RegExpCompileData {
1482 : RegExpCompileData()
1483 : : tree(nullptr),
1484 : node(nullptr),
1485 : simple(true),
1486 : contains_anchor(false),
1487 760604 : capture_count(0) {}
1488 : RegExpTree* tree;
1489 : RegExpNode* node;
1490 : bool simple;
1491 : bool contains_anchor;
1492 : Handle<FixedArray> capture_name_map;
1493 : Handle<String> error;
1494 : int capture_count;
1495 : };
1496 :
1497 :
1498 : class RegExpEngine: public AllStatic {
1499 : public:
1500 : struct CompilationResult {
1501 : inline CompilationResult(Isolate* isolate, const char* error_message);
1502 : CompilationResult(Object code, int registers)
1503 85399 : : code(code), num_registers(registers) {}
1504 : const char* const error_message = nullptr;
1505 : Object const code;
1506 : int const num_registers = 0;
1507 : };
1508 :
1509 : V8_EXPORT_PRIVATE static CompilationResult Compile(
1510 : Isolate* isolate, Zone* zone, RegExpCompileData* input,
1511 : JSRegExp::Flags flags, Handle<String> pattern,
1512 : Handle<String> sample_subject, bool is_one_byte);
1513 :
1514 : static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
1515 :
1516 : V8_EXPORT_PRIVATE static void DotPrint(const char* label, RegExpNode* node,
1517 : bool ignore_case);
1518 : };
1519 :
1520 :
1521 : class RegExpResultsCache : public AllStatic {
1522 : public:
1523 : enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1524 :
1525 : // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1526 : // On success, the returned result is guaranteed to be a COW-array.
1527 : static Object Lookup(Heap* heap, String key_string, Object key_pattern,
1528 : FixedArray* last_match_out, ResultsCacheType type);
1529 : // Attempt to add value_array to the cache specified by type. On success,
1530 : // value_array is turned into a COW-array.
1531 : static void Enter(Isolate* isolate, Handle<String> key_string,
1532 : Handle<Object> key_pattern, Handle<FixedArray> value_array,
1533 : Handle<FixedArray> last_match_cache, ResultsCacheType type);
1534 : static void Clear(FixedArray cache);
1535 : static const int kRegExpResultsCacheSize = 0x100;
1536 :
1537 : private:
1538 : static const int kArrayEntriesPerCacheEntry = 4;
1539 : static const int kStringOffset = 0;
1540 : static const int kPatternOffset = 1;
1541 : static const int kArrayOffset = 2;
1542 : static const int kLastMatchOffset = 3;
1543 : };
1544 :
1545 : } // namespace internal
1546 : } // namespace v8
1547 :
1548 : #endif // V8_REGEXP_JSREGEXP_H_
|