Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_JSREGEXP_H_
6 : #define V8_REGEXP_JSREGEXP_H_
7 :
8 : #include "src/allocation.h"
9 : #include "src/isolate.h"
10 : #include "src/objects/js-regexp.h"
11 : #include "src/regexp/regexp-ast.h"
12 : #include "src/regexp/regexp-macro-assembler.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : class NodeVisitor;
18 : class RegExpCompiler;
19 : class RegExpMacroAssembler;
20 : class RegExpNode;
21 : class RegExpTree;
22 : class BoyerMooreLookahead;
23 :
24 : inline bool IgnoreCase(JSRegExp::Flags flags) {
25 : return (flags & JSRegExp::kIgnoreCase) != 0;
26 : }
27 :
28 : inline bool IsUnicode(JSRegExp::Flags flags) {
29 87218 : return (flags & JSRegExp::kUnicode) != 0;
30 : }
31 :
32 : inline bool IsSticky(JSRegExp::Flags flags) {
33 85537 : return (flags & JSRegExp::kSticky) != 0;
34 : }
35 :
36 : inline bool IsGlobal(JSRegExp::Flags flags) {
37 85537 : return (flags & JSRegExp::kGlobal) != 0;
38 : }
39 :
40 : inline bool DotAll(JSRegExp::Flags flags) {
41 : return (flags & JSRegExp::kDotAll) != 0;
42 : }
43 :
44 : inline bool Multiline(JSRegExp::Flags flags) {
45 : return (flags & JSRegExp::kMultiline) != 0;
46 : }
47 :
48 : inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
49 : // Both unicode and ignore_case flags are set. We need to use ICU to find
50 : // the closure over case equivalents.
51 419701 : return IsUnicode(flags) && IgnoreCase(flags);
52 : }
53 :
54 : class RegExpImpl {
55 : public:
56 : // Whether the irregexp engine generates native code or interpreter bytecode.
57 : static bool UsesNativeRegExp() { return !FLAG_regexp_interpret_all; }
58 :
59 : // Returns a string representation of a regular expression.
60 : // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
61 : // This function calls the garbage collector if necessary.
62 : static Handle<String> ToString(Handle<Object> value);
63 :
64 : // Parses the RegExp pattern and prepares the JSRegExp object with
65 : // generic data and choice of implementation - as well as what
66 : // the implementation wants to store in the data field.
67 : // Returns false if compilation fails.
68 : V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
69 : Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
70 : JSRegExp::Flags flags);
71 :
72 : // See ECMA-262 section 15.10.6.2.
73 : // This function calls the garbage collector if necessary.
74 : V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
75 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
76 : int index, Handle<RegExpMatchInfo> last_match_info);
77 :
78 : // Prepares a JSRegExp object with Irregexp-specific data.
79 : static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
80 : Handle<String> pattern, JSRegExp::Flags flags,
81 : int capture_register_count);
82 :
83 : static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
84 : Handle<String> pattern, JSRegExp::Flags flags,
85 : Handle<String> match_pattern);
86 :
87 : static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
88 : Handle<String> subject, int index, int32_t* output,
89 : int output_size);
90 :
91 : static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
92 : Handle<String> subject, int index,
93 : Handle<RegExpMatchInfo> last_match_info);
94 :
95 : enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
96 :
97 : // Prepare a RegExp for being executed one or more times (using
98 : // IrregexpExecOnce) on the subject.
99 : // This ensures that the regexp is compiled for the subject, and that
100 : // the subject is flat.
101 : // Returns the number of integer spaces required by IrregexpExecOnce
102 : // as its "registers" argument. If the regexp cannot be compiled,
103 : // an exception is set as pending, and this function returns negative.
104 : static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
105 : Handle<String> subject);
106 :
107 : // Execute a regular expression on the subject, starting from index.
108 : // If matching succeeds, return the number of matches. This can be larger
109 : // than one in the case of global regular expressions.
110 : // The captures and subcaptures are stored into the registers vector.
111 : // If matching fails, returns RE_FAILURE.
112 : // If execution fails, sets a pending exception and returns RE_EXCEPTION.
113 : static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
114 : Handle<String> subject, int index, int32_t* output,
115 : int output_size);
116 :
117 : // Execute an Irregexp bytecode pattern.
118 : // On a successful match, the result is a JSArray containing
119 : // captured positions. On a failure, the result is the null value.
120 : // Returns an empty handle in case of an exception.
121 : V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
122 : Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
123 : int index, Handle<RegExpMatchInfo> last_match_info);
124 :
125 : // Set last match info. If match is nullptr, then setting captures is
126 : // omitted.
127 : static Handle<RegExpMatchInfo> SetLastMatchInfo(
128 : Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
129 : Handle<String> subject, int capture_count, int32_t* match);
130 :
131 : class GlobalCache {
132 : public:
133 : GlobalCache(Handle<JSRegExp> regexp,
134 : Handle<String> subject,
135 : Isolate* isolate);
136 :
137 : V8_INLINE ~GlobalCache();
138 :
139 : // Fetch the next entry in the cache for global regexp match results.
140 : // This does not set the last match info. Upon failure, nullptr is
141 : // returned. The cause can be checked with Result(). The previous result is
142 : // still in available in memory when a failure happens.
143 : V8_INLINE int32_t* FetchNext();
144 :
145 : V8_INLINE int32_t* LastSuccessfulMatch();
146 :
147 : V8_INLINE bool HasException() { return num_matches_ < 0; }
148 :
149 : private:
150 : int AdvanceZeroLength(int last_index);
151 :
152 : int num_matches_;
153 : int max_matches_;
154 : int current_match_index_;
155 : int registers_per_match_;
156 : // Pointer to the last set of captures.
157 : int32_t* register_array_;
158 : int register_array_size_;
159 : Handle<JSRegExp> regexp_;
160 : Handle<String> subject_;
161 : Isolate* isolate_;
162 : };
163 :
164 : // For acting on the JSRegExp data FixedArray.
165 : static int IrregexpMaxRegisterCount(FixedArray re);
166 : static void SetIrregexpMaxRegisterCount(FixedArray re, int value);
167 : static void SetIrregexpCaptureNameMap(FixedArray re,
168 : Handle<FixedArray> value);
169 : static int IrregexpNumberOfCaptures(FixedArray re);
170 : static int IrregexpNumberOfRegisters(FixedArray re);
171 : static ByteArray IrregexpByteCode(FixedArray re, bool is_one_byte);
172 : static Code IrregexpNativeCode(FixedArray re, bool is_one_byte);
173 :
174 : // Limit the space regexps take up on the heap. In order to limit this we
175 : // would like to keep track of the amount of regexp code on the heap. This
176 : // is not tracked, however. As a conservative approximation we track the
177 : // total regexp code compiled including code that has subsequently been freed
178 : // and the total executable memory at any point.
179 : static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
180 : static const size_t kRegExpCompiledLimit = 1 * MB;
181 : static const int kRegExpTooLargeToOptimize = 20 * KB;
182 :
183 : private:
184 : static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
185 : Handle<String> sample_subject, bool is_one_byte);
186 : static inline bool EnsureCompiledIrregexp(Isolate* isolate,
187 : Handle<JSRegExp> re,
188 : Handle<String> sample_subject,
189 : bool is_one_byte);
190 : };
191 :
192 :
193 : // Represents the location of one element relative to the intersection of
194 : // two sets. Corresponds to the four areas of a Venn diagram.
195 : enum ElementInSetsRelation {
196 : kInsideNone = 0,
197 : kInsideFirst = 1,
198 : kInsideSecond = 2,
199 : kInsideBoth = 3
200 : };
201 :
202 :
203 : // A set of unsigned integers that behaves especially well on small
204 : // integers (< 32). May do zone-allocation.
205 : class OutSet: public ZoneObject {
206 : public:
207 1022173 : OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
208 : OutSet* Extend(unsigned value, Zone* zone);
209 : bool Get(unsigned value) const;
210 : static const unsigned kFirstLimit = 32;
211 :
212 : private:
213 : // Destructively set a value in this set. In most cases you want
214 : // to use Extend instead to ensure that only one instance exists
215 : // that contains the same values.
216 : void Set(unsigned value, Zone* zone);
217 :
218 : // The successors are a list of sets that contain the same values
219 : // as this set and the one more value that is not present in this
220 : // set.
221 : ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
222 :
223 : OutSet(uint32_t first, ZoneList<unsigned>* remaining)
224 16992 : : first_(first), remaining_(remaining), successors_(nullptr) {}
225 : uint32_t first_;
226 : ZoneList<unsigned>* remaining_;
227 : ZoneList<OutSet*>* successors_;
228 : friend class Trace;
229 : };
230 :
231 :
232 : // A mapping from integers, specified as ranges, to a set of integers.
233 : // Used for mapping character ranges to choices.
234 : class DispatchTable : public ZoneObject {
235 : public:
236 : explicit DispatchTable(Zone* zone) : tree_(zone) { }
237 :
238 : class Entry {
239 : public:
240 : Entry() : from_(0), to_(0), out_set_(nullptr) {}
241 : Entry(uc32 from, uc32 to, OutSet* out_set)
242 : : from_(from), to_(to), out_set_(out_set) {
243 : DCHECK(from <= to);
244 : }
245 : uc32 from() { return from_; }
246 : uc32 to() { return to_; }
247 4830 : void set_to(uc32 value) { to_ = value; }
248 : void AddValue(int value, Zone* zone) {
249 80888 : out_set_ = out_set_->Extend(value, zone);
250 : }
251 : OutSet* out_set() { return out_set_; }
252 : private:
253 : uc32 from_;
254 : uc32 to_;
255 : OutSet* out_set_;
256 : };
257 :
258 : class Config {
259 : public:
260 : typedef uc32 Key;
261 : typedef Entry Value;
262 : static const uc32 kNoKey;
263 : static const Entry NoValue() { return Value(); }
264 : static inline int Compare(uc32 a, uc32 b) {
265 1570410 : if (a == b)
266 : return 0;
267 1537620 : else if (a < b)
268 : return -1;
269 : else
270 : return 1;
271 : }
272 : };
273 :
274 : void AddRange(CharacterRange range, int value, Zone* zone);
275 : OutSet* Get(uc32 value);
276 : void Dump();
277 :
278 : template <typename Callback>
279 : void ForEach(Callback* callback) {
280 2587 : return tree()->ForEach(callback);
281 : }
282 :
283 : private:
284 : // There can't be a static empty set since it allocates its
285 : // successors in a zone and caches them.
286 : OutSet* empty() { return &empty_; }
287 : OutSet empty_;
288 : ZoneSplayTree<Config>* tree() { return &tree_; }
289 : ZoneSplayTree<Config> tree_;
290 : };
291 :
292 :
293 : // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
294 : class UnicodeRangeSplitter {
295 : public:
296 : UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
297 : void Call(uc32 from, DispatchTable::Entry entry);
298 :
299 : ZoneList<CharacterRange>* bmp() { return bmp_; }
300 : ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
301 : ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
302 : ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
303 :
304 : private:
305 : static const int kBase = 0;
306 : // Separate ranges into
307 : static const int kBmpCodePoints = 1;
308 : static const int kLeadSurrogates = 2;
309 : static const int kTrailSurrogates = 3;
310 : static const int kNonBmpCodePoints = 4;
311 :
312 : Zone* zone_;
313 : DispatchTable table_;
314 : ZoneList<CharacterRange>* bmp_;
315 : ZoneList<CharacterRange>* lead_surrogates_;
316 : ZoneList<CharacterRange>* trail_surrogates_;
317 : ZoneList<CharacterRange>* non_bmp_;
318 : };
319 :
320 :
321 : #define FOR_EACH_NODE_TYPE(VISIT) \
322 : VISIT(End) \
323 : VISIT(Action) \
324 : VISIT(Choice) \
325 : VISIT(BackReference) \
326 : VISIT(Assertion) \
327 : VISIT(Text)
328 :
329 :
330 : class Trace;
331 : struct PreloadState;
332 : class GreedyLoopState;
333 : class AlternativeGenerationList;
334 :
335 : struct NodeInfo {
336 : NodeInfo()
337 : : being_analyzed(false),
338 : been_analyzed(false),
339 : follows_word_interest(false),
340 : follows_newline_interest(false),
341 : follows_start_interest(false),
342 : at_end(false),
343 : visited(false),
344 4380146 : replacement_calculated(false) { }
345 :
346 : // Returns true if the interests and assumptions of this node
347 : // matches the given one.
348 : bool Matches(NodeInfo* that) {
349 : return (at_end == that->at_end) &&
350 : (follows_word_interest == that->follows_word_interest) &&
351 : (follows_newline_interest == that->follows_newline_interest) &&
352 : (follows_start_interest == that->follows_start_interest);
353 : }
354 :
355 : // Updates the interests of this node given the interests of the
356 : // node preceding it.
357 : void AddFromPreceding(NodeInfo* that) {
358 : at_end |= that->at_end;
359 : follows_word_interest |= that->follows_word_interest;
360 : follows_newline_interest |= that->follows_newline_interest;
361 : follows_start_interest |= that->follows_start_interest;
362 : }
363 :
364 : bool HasLookbehind() {
365 : return follows_word_interest ||
366 : follows_newline_interest ||
367 : follows_start_interest;
368 : }
369 :
370 : // Sets the interests of this node to include the interests of the
371 : // following node.
372 : void AddFromFollowing(NodeInfo* that) {
373 565793 : follows_word_interest |= that->follows_word_interest;
374 565793 : follows_newline_interest |= that->follows_newline_interest;
375 565793 : follows_start_interest |= that->follows_start_interest;
376 : }
377 :
378 : void ResetCompilationState() {
379 : being_analyzed = false;
380 : been_analyzed = false;
381 : }
382 :
383 : bool being_analyzed: 1;
384 : bool been_analyzed: 1;
385 :
386 : // These bits are set of this node has to know what the preceding
387 : // character was.
388 : bool follows_word_interest: 1;
389 : bool follows_newline_interest: 1;
390 : bool follows_start_interest: 1;
391 :
392 : bool at_end: 1;
393 : bool visited: 1;
394 : bool replacement_calculated: 1;
395 : };
396 :
397 :
398 : // Details of a quick mask-compare check that can look ahead in the
399 : // input stream.
400 : class QuickCheckDetails {
401 : public:
402 : QuickCheckDetails()
403 : : characters_(0),
404 : mask_(0),
405 : value_(0),
406 14065310 : cannot_match_(false) { }
407 : explicit QuickCheckDetails(int characters)
408 : : characters_(characters),
409 : mask_(0),
410 : value_(0),
411 772835 : cannot_match_(false) { }
412 : bool Rationalize(bool one_byte);
413 : // Merge in the information from another branch of an alternation.
414 : void Merge(QuickCheckDetails* other, int from_index);
415 : // Advance the current position by some amount.
416 : void Advance(int by, bool one_byte);
417 : void Clear();
418 : bool cannot_match() { return cannot_match_; }
419 431 : void set_cannot_match() { cannot_match_ = true; }
420 : struct Position {
421 11870516 : Position() : mask(0), value(0), determines_perfectly(false) { }
422 : uc16 mask;
423 : uc16 value;
424 : bool determines_perfectly;
425 : };
426 : int characters() { return characters_; }
427 603187 : void set_characters(int characters) { characters_ = characters; }
428 : Position* positions(int index) {
429 : DCHECK_LE(0, index);
430 : DCHECK_GT(characters_, index);
431 1610710 : return positions_ + index;
432 : }
433 : uint32_t mask() { return mask_; }
434 : uint32_t value() { return value_; }
435 :
436 : private:
437 : // How many characters do we have quick check information from. This is
438 : // the same for all branches of a choice node.
439 : int characters_;
440 : Position positions_[4];
441 : // These values are the condensate of the above array after Rationalize().
442 : uint32_t mask_;
443 : uint32_t value_;
444 : // If set to true, there is no way this quick check can match at all.
445 : // E.g., if it requires to be at the start of the input, and isn't.
446 : bool cannot_match_;
447 : };
448 :
449 :
450 : extern int kUninitializedRegExpNodePlaceHolder;
451 :
452 :
453 0 : class RegExpNode: public ZoneObject {
454 : public:
455 : explicit RegExpNode(Zone* zone)
456 : : replacement_(nullptr),
457 : on_work_list_(false),
458 : trace_count_(0),
459 8760292 : zone_(zone) {
460 4380146 : bm_info_[0] = bm_info_[1] = nullptr;
461 : }
462 : virtual ~RegExpNode();
463 : virtual void Accept(NodeVisitor* visitor) = 0;
464 : // Generates a goto to this node or actually generates the code at this point.
465 : virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
466 : // How many characters must this node consume at a minimum in order to
467 : // succeed. If we have found at least 'still_to_find' characters that
468 : // must be consumed there is no need to ask any following nodes whether
469 : // they are sure to eat any more characters. The not_at_start argument is
470 : // used to indicate that we know we are not at the start of the input. In
471 : // this case anchored branches will always fail and can be ignored when
472 : // determining how many characters are consumed on success.
473 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
474 : // Emits some quick code that checks whether the preloaded characters match.
475 : // Falls through on certain failure, jumps to the label on possible success.
476 : // If the node cannot make a quick check it does nothing and returns false.
477 : bool EmitQuickCheck(RegExpCompiler* compiler,
478 : Trace* bounds_check_trace,
479 : Trace* trace,
480 : bool preload_has_checked_bounds,
481 : Label* on_possible_success,
482 : QuickCheckDetails* details_return,
483 : bool fall_through_on_failure);
484 : // For a given number of characters this returns a mask and a value. The
485 : // next n characters are anded with the mask and compared with the value.
486 : // A comparison failure indicates the node cannot match the next n characters.
487 : // A comparison success indicates the node may match.
488 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
489 : RegExpCompiler* compiler,
490 : int characters_filled_in,
491 : bool not_at_start) = 0;
492 : static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
493 87111 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
494 : // Only returns the successor for a text node of length 1 that matches any
495 : // character and that has no guards on it.
496 78471 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
497 : RegExpCompiler* compiler) {
498 78471 : return nullptr;
499 : }
500 :
501 : // Collects information on the possible code units (mod 128) that can match if
502 : // we look forward. This is used for a Boyer-Moore-like string searching
503 : // implementation. TODO(erikcorry): This should share more code with
504 : // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
505 : // the number of nodes we are willing to look at in order to create this data.
506 : static const int kRecursionBudget = 200;
507 : bool KeepRecursing(RegExpCompiler* compiler);
508 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
509 : BoyerMooreLookahead* bm, bool not_at_start) {
510 0 : UNREACHABLE();
511 : }
512 :
513 : // If we know that the input is one-byte then there are some nodes that can
514 : // never match. This method returns a node that can be substituted for
515 : // itself, or nullptr if the node can never match.
516 14357 : virtual RegExpNode* FilterOneByte(int depth) { return this; }
517 : // Helper for FilterOneByte.
518 : RegExpNode* replacement() {
519 : DCHECK(info()->replacement_calculated);
520 : return replacement_;
521 : }
522 : RegExpNode* set_replacement(RegExpNode* replacement) {
523 159878 : info()->replacement_calculated = true;
524 159878 : replacement_ = replacement;
525 : return replacement; // For convenience.
526 : }
527 :
528 : // We want to avoid recalculating the lookahead info, so we store it on the
529 : // node. Only info that is for this node is stored. We can tell that the
530 : // info is for this node when offset == 0, so the information is calculated
531 : // relative to this node.
532 : void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
533 103785 : if (offset == 0) set_bm_info(not_at_start, bm);
534 : }
535 :
536 : Label* label() { return &label_; }
537 : // If non-generic code is generated for a node (i.e. the node is not at the
538 : // start of the trace) then it cannot be reused. This variable sets a limit
539 : // on how often we allow that to happen before we insist on starting a new
540 : // trace and generating generic code for a node that can be reused by flushing
541 : // the deferred actions in the current trace and generating a goto.
542 : static const int kMaxCopiesCodeGenerated = 10;
543 :
544 : bool on_work_list() { return on_work_list_; }
545 422784 : void set_on_work_list(bool value) { on_work_list_ = value; }
546 :
547 : NodeInfo* info() { return &info_; }
548 :
549 : BoyerMooreLookahead* bm_info(bool not_at_start) {
550 82348 : return bm_info_[not_at_start ? 1 : 0];
551 : }
552 :
553 : Zone* zone() const { return zone_; }
554 :
555 : protected:
556 : enum LimitResult { DONE, CONTINUE };
557 : RegExpNode* replacement_;
558 :
559 : LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
560 :
561 : void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
562 196103 : bm_info_[not_at_start ? 1 : 0] = bm;
563 : }
564 :
565 : private:
566 : static const int kFirstCharBudget = 10;
567 : Label label_;
568 : bool on_work_list_;
569 : NodeInfo info_;
570 : // This variable keeps track of how many times code has been generated for
571 : // this node (in different traces). We don't keep track of where the
572 : // generated code is located unless the code is generated at the start of
573 : // a trace, in which case it is generic and can be reused by flushing the
574 : // deferred operations in the current trace and generating a goto.
575 : int trace_count_;
576 : BoyerMooreLookahead* bm_info_[2];
577 :
578 : Zone* zone_;
579 : };
580 :
581 :
582 0 : class SeqRegExpNode: public RegExpNode {
583 : public:
584 3266560 : explicit SeqRegExpNode(RegExpNode* on_success)
585 3266560 : : RegExpNode(on_success->zone()), on_success_(on_success) { }
586 : RegExpNode* on_success() { return on_success_; }
587 : void set_on_success(RegExpNode* node) { on_success_ = node; }
588 : RegExpNode* FilterOneByte(int depth) override;
589 0 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
590 : BoyerMooreLookahead* bm, bool not_at_start) override {
591 0 : on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
592 0 : if (offset == 0) set_bm_info(not_at_start, bm);
593 0 : }
594 :
595 : protected:
596 : RegExpNode* FilterSuccessor(int depth);
597 :
598 : private:
599 : RegExpNode* on_success_;
600 : };
601 :
602 :
603 0 : class ActionNode: public SeqRegExpNode {
604 : public:
605 : enum ActionType {
606 : SET_REGISTER,
607 : INCREMENT_REGISTER,
608 : STORE_POSITION,
609 : BEGIN_SUBMATCH,
610 : POSITIVE_SUBMATCH_SUCCESS,
611 : EMPTY_MATCH_CHECK,
612 : CLEAR_CAPTURES
613 : };
614 : static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
615 : static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
616 : static ActionNode* StorePosition(int reg,
617 : bool is_capture,
618 : RegExpNode* on_success);
619 : static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
620 : static ActionNode* BeginSubmatch(int stack_pointer_reg,
621 : int position_reg,
622 : RegExpNode* on_success);
623 : static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
624 : int restore_reg,
625 : int clear_capture_count,
626 : int clear_capture_from,
627 : RegExpNode* on_success);
628 : static ActionNode* EmptyMatchCheck(int start_register,
629 : int repetition_register,
630 : int repetition_limit,
631 : RegExpNode* on_success);
632 : void Accept(NodeVisitor* visitor) override;
633 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
634 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
635 64286 : void GetQuickCheckDetails(QuickCheckDetails* details,
636 : RegExpCompiler* compiler, int filled_in,
637 : bool not_at_start) override {
638 64286 : return on_success()->GetQuickCheckDetails(
639 64286 : details, compiler, filled_in, not_at_start);
640 : }
641 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
642 : BoyerMooreLookahead* bm, bool not_at_start) override;
643 : ActionType action_type() { return action_type_; }
644 : // TODO(erikcorry): We should allow some action nodes in greedy loops.
645 114681 : int GreedyLoopTextLength() override {
646 114681 : return kNodeIsTooComplexForGreedyLoops;
647 : }
648 :
649 : private:
650 : union {
651 : struct {
652 : int reg;
653 : int value;
654 : } u_store_register;
655 : struct {
656 : int reg;
657 : } u_increment_register;
658 : struct {
659 : int reg;
660 : bool is_capture;
661 : } u_position_register;
662 : struct {
663 : int stack_pointer_register;
664 : int current_position_register;
665 : int clear_register_count;
666 : int clear_register_from;
667 : } u_submatch;
668 : struct {
669 : int start_register;
670 : int repetition_register;
671 : int repetition_limit;
672 : } u_empty_match_check;
673 : struct {
674 : int range_from;
675 : int range_to;
676 : } u_clear_captures;
677 : } data_;
678 : ActionNode(ActionType action_type, RegExpNode* on_success)
679 : : SeqRegExpNode(on_success),
680 2041172 : action_type_(action_type) { }
681 : ActionType action_type_;
682 : friend class DotPrinter;
683 : };
684 :
685 :
686 0 : class TextNode: public SeqRegExpNode {
687 : public:
688 : TextNode(ZoneList<TextElement>* elms, bool read_backward,
689 : RegExpNode* on_success)
690 1044056 : : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
691 173569 : TextNode(RegExpCharacterClass* that, bool read_backward,
692 : RegExpNode* on_success)
693 : : SeqRegExpNode(on_success),
694 347138 : elms_(new (zone()) ZoneList<TextElement>(1, zone())),
695 347138 : read_backward_(read_backward) {
696 173569 : elms_->Add(TextElement::CharClass(that), zone());
697 173569 : }
698 : // Create TextNode for a single character class for the given ranges.
699 : static TextNode* CreateForCharacterRanges(Zone* zone,
700 : ZoneList<CharacterRange>* ranges,
701 : bool read_backward,
702 : RegExpNode* on_success,
703 : JSRegExp::Flags flags);
704 : // Create TextNode for a surrogate pair with a range given for the
705 : // lead and the trail surrogate each.
706 : static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
707 : CharacterRange trail,
708 : bool read_backward,
709 : RegExpNode* on_success,
710 : JSRegExp::Flags flags);
711 : void Accept(NodeVisitor* visitor) override;
712 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
713 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
714 : void GetQuickCheckDetails(QuickCheckDetails* details,
715 : RegExpCompiler* compiler, int characters_filled_in,
716 : bool not_at_start) override;
717 : ZoneList<TextElement>* elements() { return elms_; }
718 : bool read_backward() { return read_backward_; }
719 : void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
720 : int GreedyLoopTextLength() override;
721 : RegExpNode* GetSuccessorOfOmnivorousTextNode(
722 : RegExpCompiler* compiler) override;
723 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
724 : BoyerMooreLookahead* bm, bool not_at_start) override;
725 : void CalculateOffsets();
726 : RegExpNode* FilterOneByte(int depth) override;
727 :
728 : private:
729 : enum TextEmitPassType {
730 : NON_LATIN1_MATCH, // Check for characters that can't match.
731 : SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
732 : NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
733 : CASE_CHARACTER_MATCH, // Case-independent single character check.
734 : CHARACTER_CLASS_MATCH // Character class.
735 : };
736 : static bool SkipPass(TextEmitPassType pass, bool ignore_case);
737 : static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
738 : static const int kLastPass = CHARACTER_CLASS_MATCH;
739 : void TextEmitPass(RegExpCompiler* compiler,
740 : TextEmitPassType pass,
741 : bool preloaded,
742 : Trace* trace,
743 : bool first_element_checked,
744 : int* checked_up_to);
745 : int Length();
746 : ZoneList<TextElement>* elms_;
747 : bool read_backward_;
748 : };
749 :
750 :
751 0 : class AssertionNode: public SeqRegExpNode {
752 : public:
753 : enum AssertionType {
754 : AT_END,
755 : AT_START,
756 : AT_BOUNDARY,
757 : AT_NON_BOUNDARY,
758 : AFTER_NEWLINE
759 : };
760 2003 : static AssertionNode* AtEnd(RegExpNode* on_success) {
761 2003 : return new(on_success->zone()) AssertionNode(AT_END, on_success);
762 : }
763 3011 : static AssertionNode* AtStart(RegExpNode* on_success) {
764 3011 : return new(on_success->zone()) AssertionNode(AT_START, on_success);
765 : }
766 136 : static AssertionNode* AtBoundary(RegExpNode* on_success) {
767 136 : return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
768 : }
769 114 : static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
770 114 : return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
771 : }
772 129 : static AssertionNode* AfterNewline(RegExpNode* on_success) {
773 129 : return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
774 : }
775 : void Accept(NodeVisitor* visitor) override;
776 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
777 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
778 : void GetQuickCheckDetails(QuickCheckDetails* details,
779 : RegExpCompiler* compiler, int filled_in,
780 : bool not_at_start) override;
781 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
782 : BoyerMooreLookahead* bm, bool not_at_start) override;
783 : AssertionType assertion_type() { return assertion_type_; }
784 :
785 : private:
786 : void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
787 : enum IfPrevious { kIsNonWord, kIsWord };
788 : void BacktrackIfPrevious(RegExpCompiler* compiler,
789 : Trace* trace,
790 : IfPrevious backtrack_if_previous);
791 : AssertionNode(AssertionType t, RegExpNode* on_success)
792 5393 : : SeqRegExpNode(on_success), assertion_type_(t) { }
793 : AssertionType assertion_type_;
794 : };
795 :
796 :
797 0 : class BackReferenceNode: public SeqRegExpNode {
798 : public:
799 : BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
800 : bool read_backward, RegExpNode* on_success)
801 : : SeqRegExpNode(on_success),
802 : start_reg_(start_reg),
803 : end_reg_(end_reg),
804 : flags_(flags),
805 2370 : read_backward_(read_backward) {}
806 : void Accept(NodeVisitor* visitor) override;
807 : int start_register() { return start_reg_; }
808 : int end_register() { return end_reg_; }
809 : bool read_backward() { return read_backward_; }
810 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
811 : int EatsAtLeast(int still_to_find, int recursion_depth,
812 : bool not_at_start) override;
813 522 : void GetQuickCheckDetails(QuickCheckDetails* details,
814 : RegExpCompiler* compiler, int characters_filled_in,
815 : bool not_at_start) override {
816 522 : return;
817 : }
818 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
819 : BoyerMooreLookahead* bm, bool not_at_start) override;
820 :
821 : private:
822 : int start_reg_;
823 : int end_reg_;
824 : JSRegExp::Flags flags_;
825 : bool read_backward_;
826 : };
827 :
828 :
829 0 : class EndNode: public RegExpNode {
830 : public:
831 : enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
832 88649 : EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
833 : void Accept(NodeVisitor* visitor) override;
834 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
835 115047 : int EatsAtLeast(int still_to_find, int recursion_depth,
836 : bool not_at_start) override {
837 115047 : return 0;
838 : }
839 0 : void GetQuickCheckDetails(QuickCheckDetails* details,
840 : RegExpCompiler* compiler, int characters_filled_in,
841 : bool not_at_start) override {
842 : // Returning 0 from EatsAtLeast should ensure we never get here.
843 0 : UNREACHABLE();
844 : }
845 0 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
846 : BoyerMooreLookahead* bm, bool not_at_start) override {
847 : // Returning 0 from EatsAtLeast should ensure we never get here.
848 0 : UNREACHABLE();
849 : }
850 :
851 : private:
852 : Action action_;
853 : };
854 :
855 :
856 0 : class NegativeSubmatchSuccess: public EndNode {
857 : public:
858 : NegativeSubmatchSuccess(int stack_pointer_reg,
859 : int position_reg,
860 : int clear_capture_count,
861 : int clear_capture_start,
862 : Zone* zone)
863 : : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
864 : stack_pointer_register_(stack_pointer_reg),
865 : current_position_register_(position_reg),
866 : clear_capture_count_(clear_capture_count),
867 2812 : clear_capture_start_(clear_capture_start) { }
868 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
869 :
870 : private:
871 : int stack_pointer_register_;
872 : int current_position_register_;
873 : int clear_capture_count_;
874 : int clear_capture_start_;
875 : };
876 :
877 :
878 : class Guard: public ZoneObject {
879 : public:
880 : enum Relation { LT, GEQ };
881 : Guard(int reg, Relation op, int value)
882 : : reg_(reg),
883 : op_(op),
884 903901 : value_(value) { }
885 : int reg() { return reg_; }
886 : Relation op() { return op_; }
887 : int value() { return value_; }
888 :
889 : private:
890 : int reg_;
891 : Relation op_;
892 : int value_;
893 : };
894 :
895 :
896 : class GuardedAlternative {
897 : public:
898 : explicit GuardedAlternative(RegExpNode* node)
899 2001070 : : node_(node), guards_(nullptr) {}
900 : void AddGuard(Guard* guard, Zone* zone);
901 : RegExpNode* node() { return node_; }
902 58072 : void set_node(RegExpNode* node) { node_ = node; }
903 : ZoneList<Guard*>* guards() { return guards_; }
904 :
905 : private:
906 : RegExpNode* node_;
907 : ZoneList<Guard*>* guards_;
908 : };
909 :
910 :
911 : class AlternativeGeneration;
912 :
913 :
914 0 : class ChoiceNode: public RegExpNode {
915 : public:
916 1024937 : explicit ChoiceNode(int expected_size, Zone* zone)
917 : : RegExpNode(zone),
918 : alternatives_(new (zone)
919 1024937 : ZoneList<GuardedAlternative>(expected_size, zone)),
920 : table_(nullptr),
921 : not_at_start_(false),
922 2049874 : being_calculated_(false) {}
923 : void Accept(NodeVisitor* visitor) override;
924 2131259 : void AddAlternative(GuardedAlternative node) {
925 4250290 : alternatives()->Add(node, zone());
926 : }
927 : ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
928 : DispatchTable* GetTable(bool ignore_case);
929 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
930 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
931 : int EatsAtLeastHelper(int still_to_find,
932 : int budget,
933 : RegExpNode* ignore_this_node,
934 : bool not_at_start);
935 : void GetQuickCheckDetails(QuickCheckDetails* details,
936 : RegExpCompiler* compiler, int characters_filled_in,
937 : bool not_at_start) override;
938 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
939 : BoyerMooreLookahead* bm, bool not_at_start) override;
940 :
941 : bool being_calculated() { return being_calculated_; }
942 : bool not_at_start() { return not_at_start_; }
943 4051 : void set_not_at_start() { not_at_start_ = true; }
944 0 : void set_being_calculated(bool b) { being_calculated_ = b; }
945 470700 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
946 470700 : return true;
947 : }
948 : RegExpNode* FilterOneByte(int depth) override;
949 0 : virtual bool read_backward() { return false; }
950 :
951 : protected:
952 : int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
953 : ZoneList<GuardedAlternative>* alternatives_;
954 :
955 : private:
956 : friend class DispatchTableConstructor;
957 : friend class Analysis;
958 : void GenerateGuard(RegExpMacroAssembler* macro_assembler,
959 : Guard* guard,
960 : Trace* trace);
961 : int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
962 : void EmitOutOfLineContinuation(RegExpCompiler* compiler,
963 : Trace* trace,
964 : GuardedAlternative alternative,
965 : AlternativeGeneration* alt_gen,
966 : int preload_characters,
967 : bool next_expects_preload);
968 : void SetUpPreLoad(RegExpCompiler* compiler,
969 : Trace* current_trace,
970 : PreloadState* preloads);
971 : void AssertGuardsMentionRegisters(Trace* trace);
972 : int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
973 : Trace* EmitGreedyLoop(RegExpCompiler* compiler,
974 : Trace* trace,
975 : AlternativeGenerationList* alt_gens,
976 : PreloadState* preloads,
977 : GreedyLoopState* greedy_loop_state,
978 : int text_length);
979 : void EmitChoices(RegExpCompiler* compiler,
980 : AlternativeGenerationList* alt_gens,
981 : int first_choice,
982 : Trace* trace,
983 : PreloadState* preloads);
984 : DispatchTable* table_;
985 : // If true, this node is never checked at the start of the input.
986 : // Allows a new trace to start with at_start() set to false.
987 : bool not_at_start_;
988 : bool being_calculated_;
989 : };
990 :
991 :
992 0 : class NegativeLookaroundChoiceNode : public ChoiceNode {
993 : public:
994 2812 : explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
995 : GuardedAlternative then_do_this,
996 : Zone* zone)
997 2812 : : ChoiceNode(2, zone) {
998 : AddAlternative(this_must_fail);
999 : AddAlternative(then_do_this);
1000 2812 : }
1001 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1002 : void GetQuickCheckDetails(QuickCheckDetails* details,
1003 : RegExpCompiler* compiler, int characters_filled_in,
1004 : bool not_at_start) override;
1005 1105 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
1006 : BoyerMooreLookahead* bm, bool not_at_start) override {
1007 2210 : alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
1008 1105 : not_at_start);
1009 1105 : if (offset == 0) set_bm_info(not_at_start, bm);
1010 1105 : }
1011 : // For a negative lookahead we don't emit the quick check for the
1012 : // alternative that is expected to fail. This is because quick check code
1013 : // starts by loading enough characters for the alternative that takes fewest
1014 : // characters, but on a negative lookahead the negative branch did not take
1015 : // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1016 5604 : bool try_to_emit_quick_check_for_alternative(bool is_first) override {
1017 5604 : return !is_first;
1018 : }
1019 : RegExpNode* FilterOneByte(int depth) override;
1020 : };
1021 :
1022 :
1023 0 : class LoopChoiceNode: public ChoiceNode {
1024 : public:
1025 : LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1026 : : ChoiceNode(2, zone),
1027 : loop_node_(nullptr),
1028 : continue_node_(nullptr),
1029 : body_can_be_zero_length_(body_can_be_zero_length),
1030 999129 : read_backward_(read_backward) {}
1031 : void AddLoopAlternative(GuardedAlternative alt);
1032 : void AddContinueAlternative(GuardedAlternative alt);
1033 : void Emit(RegExpCompiler* compiler, Trace* trace) override;
1034 : int EatsAtLeast(int still_to_find, int budget, bool not_at_start) override;
1035 : void GetQuickCheckDetails(QuickCheckDetails* details,
1036 : RegExpCompiler* compiler, int characters_filled_in,
1037 : bool not_at_start) override;
1038 : void FillInBMInfo(Isolate* isolate, int offset, int budget,
1039 : BoyerMooreLookahead* bm, bool not_at_start) override;
1040 : RegExpNode* loop_node() { return loop_node_; }
1041 : RegExpNode* continue_node() { return continue_node_; }
1042 : bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1043 23194 : bool read_backward() override { return read_backward_; }
1044 : void Accept(NodeVisitor* visitor) override;
1045 : RegExpNode* FilterOneByte(int depth) override;
1046 :
1047 : private:
1048 : // AddAlternative is made private for loop nodes because alternatives
1049 : // should not be added freely, we need to keep track of which node
1050 : // goes back to the node itself.
1051 : void AddAlternative(GuardedAlternative node) {
1052 : ChoiceNode::AddAlternative(node);
1053 : }
1054 :
1055 : RegExpNode* loop_node_;
1056 : RegExpNode* continue_node_;
1057 : bool body_can_be_zero_length_;
1058 : bool read_backward_;
1059 : };
1060 :
1061 :
1062 : // Improve the speed that we scan for an initial point where a non-anchored
1063 : // regexp can match by using a Boyer-Moore-like table. This is done by
1064 : // identifying non-greedy non-capturing loops in the nodes that eat any
1065 : // character one at a time. For example in the middle of the regexp
1066 : // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1067 : // inserted at the start of any non-anchored regexp.
1068 : //
1069 : // When we have found such a loop we look ahead in the nodes to find the set of
1070 : // characters that can come at given distances. For example for the regexp
1071 : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1072 : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1073 : // the lookahead info where the set of characters is reasonably constrained. In
1074 : // our example this is from index 1 to 2 (0 is not constrained). We can now
1075 : // look 3 characters ahead and if we don't find one of [f, o] (the union of
1076 : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1077 : //
1078 : // For Unicode input strings we do the same, but modulo 128.
1079 : //
1080 : // We also look at the first string fed to the regexp and use that to get a hint
1081 : // of the character frequencies in the inputs. This affects the assessment of
1082 : // whether the set of characters is 'reasonably constrained'.
1083 : //
1084 : // We also have another lookahead mechanism (called quick check in the code),
1085 : // which uses a wide load of multiple characters followed by a mask and compare
1086 : // to determine whether a match is possible at this point.
1087 : enum ContainedInLattice {
1088 : kNotYet = 0,
1089 : kLatticeIn = 1,
1090 : kLatticeOut = 2,
1091 : kLatticeUnknown = 3 // Can also mean both in and out.
1092 : };
1093 :
1094 :
1095 : inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1096 838016 : return static_cast<ContainedInLattice>(a | b);
1097 : }
1098 :
1099 :
1100 : ContainedInLattice AddRange(ContainedInLattice a,
1101 : const int* ranges,
1102 : int ranges_size,
1103 : Interval new_range);
1104 :
1105 :
1106 : class BoyerMoorePositionInfo : public ZoneObject {
1107 : public:
1108 99251 : explicit BoyerMoorePositionInfo(Zone* zone)
1109 99251 : : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1110 : map_count_(0),
1111 : w_(kNotYet),
1112 : s_(kNotYet),
1113 : d_(kNotYet),
1114 99251 : surrogate_(kNotYet) {
1115 12803379 : for (int i = 0; i < kMapSize; i++) {
1116 12704128 : map_->Add(false, zone);
1117 : }
1118 99251 : }
1119 :
1120 : bool& at(int i) { return map_->at(i); }
1121 :
1122 : static const int kMapSize = 128;
1123 : static const int kMask = kMapSize - 1;
1124 :
1125 : int map_count() const { return map_count_; }
1126 :
1127 : void Set(int character);
1128 : void SetInterval(const Interval& interval);
1129 : void SetAll();
1130 : bool is_non_word() { return w_ == kLatticeOut; }
1131 : bool is_word() { return w_ == kLatticeIn; }
1132 :
1133 : private:
1134 : ZoneList<bool>* map_;
1135 : int map_count_; // Number of set bits in the map.
1136 : ContainedInLattice w_; // The \w character class.
1137 : ContainedInLattice s_; // The \s character class.
1138 : ContainedInLattice d_; // The \d character class.
1139 : ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1140 : };
1141 :
1142 :
1143 : class BoyerMooreLookahead : public ZoneObject {
1144 : public:
1145 : BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1146 :
1147 : int length() { return length_; }
1148 : int max_char() { return max_char_; }
1149 : RegExpCompiler* compiler() { return compiler_; }
1150 :
1151 : int Count(int map_number) {
1152 541035 : return bitmaps_->at(map_number)->map_count();
1153 : }
1154 :
1155 150 : BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1156 :
1157 79581 : void Set(int map_number, int character) {
1158 159162 : if (character > max_char_) return;
1159 159162 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1160 : info->Set(character);
1161 : }
1162 :
1163 323288 : void SetInterval(int map_number, const Interval& interval) {
1164 323288 : if (interval.from() > max_char_) return;
1165 323288 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1166 161644 : if (interval.to() > max_char_) {
1167 0 : info->SetInterval(Interval(interval.from(), max_char_));
1168 : } else {
1169 161644 : info->SetInterval(interval);
1170 : }
1171 : }
1172 :
1173 5457 : void SetAll(int map_number) {
1174 10914 : bitmaps_->at(map_number)->SetAll();
1175 5457 : }
1176 :
1177 1078 : void SetRest(int from_map) {
1178 1078 : for (int i = from_map; i < length_; i++) SetAll(i);
1179 : }
1180 : void EmitSkipInstructions(RegExpMacroAssembler* masm);
1181 :
1182 : private:
1183 : // This is the value obtained by EatsAtLeast. If we do not have at least this
1184 : // many characters left in the sample string then the match is bound to fail.
1185 : // Therefore it is OK to read a character this far ahead of the current match
1186 : // point.
1187 : int length_;
1188 : RegExpCompiler* compiler_;
1189 : // 0xff for Latin1, 0xffff for UTF-16.
1190 : int max_char_;
1191 : ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1192 :
1193 : int GetSkipTable(int min_lookahead,
1194 : int max_lookahead,
1195 : Handle<ByteArray> boolean_skip_table);
1196 : bool FindWorthwhileInterval(int* from, int* to);
1197 : int FindBestInterval(
1198 : int max_number_of_chars, int old_biggest_points, int* from, int* to);
1199 : };
1200 :
1201 :
1202 : // There are many ways to generate code for a node. This class encapsulates
1203 : // the current way we should be generating. In other words it encapsulates
1204 : // the current state of the code generator. The effect of this is that we
1205 : // generate code for paths that the matcher can take through the regular
1206 : // expression. A given node in the regexp can be code-generated several times
1207 : // as it can be part of several traces. For example for the regexp:
1208 : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1209 : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1210 : // to match foo is generated only once (the traces have a common prefix). The
1211 : // code to store the capture is deferred and generated (twice) after the places
1212 : // where baz has been matched.
1213 : class Trace {
1214 : public:
1215 : // A value for a property that is either known to be true, know to be false,
1216 : // or not known.
1217 : enum TriBool {
1218 : UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1219 : };
1220 :
1221 : class DeferredAction {
1222 : public:
1223 : DeferredAction(ActionNode::ActionType action_type, int reg)
1224 250255 : : action_type_(action_type), reg_(reg), next_(nullptr) {}
1225 : DeferredAction* next() { return next_; }
1226 : bool Mentions(int reg);
1227 : int reg() { return reg_; }
1228 : ActionNode::ActionType action_type() { return action_type_; }
1229 : private:
1230 : ActionNode::ActionType action_type_;
1231 : int reg_;
1232 : DeferredAction* next_;
1233 : friend class Trace;
1234 : };
1235 :
1236 : class DeferredCapture : public DeferredAction {
1237 : public:
1238 240962 : DeferredCapture(int reg, bool is_capture, Trace* trace)
1239 : : DeferredAction(ActionNode::STORE_POSITION, reg),
1240 : cp_offset_(trace->cp_offset()),
1241 240962 : is_capture_(is_capture) { }
1242 : int cp_offset() { return cp_offset_; }
1243 : bool is_capture() { return is_capture_; }
1244 : private:
1245 : int cp_offset_;
1246 : bool is_capture_;
1247 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1248 : };
1249 :
1250 : class DeferredSetRegister : public DeferredAction {
1251 : public:
1252 : DeferredSetRegister(int reg, int value)
1253 : : DeferredAction(ActionNode::SET_REGISTER, reg),
1254 3430 : value_(value) { }
1255 : int value() { return value_; }
1256 : private:
1257 : int value_;
1258 : };
1259 :
1260 : class DeferredClearCaptures : public DeferredAction {
1261 : public:
1262 : explicit DeferredClearCaptures(Interval range)
1263 : : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1264 2154 : range_(range) { }
1265 : Interval range() { return range_; }
1266 : private:
1267 : Interval range_;
1268 : };
1269 :
1270 : class DeferredIncrementRegister : public DeferredAction {
1271 : public:
1272 : explicit DeferredIncrementRegister(int reg)
1273 : : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1274 : };
1275 :
1276 637245 : Trace()
1277 : : cp_offset_(0),
1278 : actions_(nullptr),
1279 : backtrack_(nullptr),
1280 : stop_node_(nullptr),
1281 : loop_label_(nullptr),
1282 : characters_preloaded_(0),
1283 : bound_checked_up_to_(0),
1284 : flush_budget_(100),
1285 1274490 : at_start_(UNKNOWN) {}
1286 :
1287 : // End the trace. This involves flushing the deferred actions in the trace
1288 : // and pushing a backtrack location onto the backtrack stack. Once this is
1289 : // done we can start a new trace or go to one that has already been
1290 : // generated.
1291 : void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1292 : int cp_offset() { return cp_offset_; }
1293 : DeferredAction* actions() { return actions_; }
1294 : // A trivial trace is one that has no deferred actions or other state that
1295 : // affects the assumptions used when generating code. There is no recorded
1296 : // backtrack location in a trivial trace, so with a trivial trace we will
1297 : // generate code that, on a failure to match, gets the backtrack location
1298 : // from the backtrack stack rather than using a direct jump instruction. We
1299 : // always start code generation with a trivial trace and non-trivial traces
1300 : // are created as we emit code for nodes or add to the list of deferred
1301 : // actions in the trace. The location of the code generated for a node using
1302 : // a trivial trace is recorded in a label in the node so that gotos can be
1303 : // generated to that code.
1304 2205079 : bool is_trivial() {
1305 3490711 : return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1306 1861837 : characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1307 4032861 : quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1308 : }
1309 : TriBool at_start() { return at_start_; }
1310 530729 : void set_at_start(TriBool at_start) { at_start_ = at_start; }
1311 : Label* backtrack() { return backtrack_; }
1312 : Label* loop_label() { return loop_label_; }
1313 : RegExpNode* stop_node() { return stop_node_; }
1314 : int characters_preloaded() { return characters_preloaded_; }
1315 : int bound_checked_up_to() { return bound_checked_up_to_; }
1316 : int flush_budget() { return flush_budget_; }
1317 : QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1318 : bool mentions_reg(int reg);
1319 : // Returns true if a deferred position store exists to the specified
1320 : // register and stores the offset in the out-parameter. Otherwise
1321 : // returns false.
1322 : bool GetStoredPosition(int reg, int* cp_offset);
1323 : // These set methods and AdvanceCurrentPositionInTrace should be used only on
1324 : // new traces - the intention is that traces are immutable after creation.
1325 : void add_action(DeferredAction* new_action) {
1326 : DCHECK(new_action->next_ == nullptr);
1327 250255 : new_action->next_ = actions_;
1328 250255 : actions_ = new_action;
1329 : }
1330 805555 : void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1331 11597 : void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1332 11597 : void set_loop_label(Label* label) { loop_label_ = label; }
1333 827802 : void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1334 432202 : void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1335 192279 : void set_flush_budget(int to) { flush_budget_ = to; }
1336 : void set_quick_check_performed(QuickCheckDetails* d) {
1337 224615 : quick_check_performed_ = *d;
1338 : }
1339 : void InvalidateCurrentCharacter();
1340 : void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1341 :
1342 : private:
1343 : int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1344 : void PerformDeferredActions(RegExpMacroAssembler* macro,
1345 : int max_register,
1346 : const OutSet& affected_registers,
1347 : OutSet* registers_to_pop,
1348 : OutSet* registers_to_clear,
1349 : Zone* zone);
1350 : void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1351 : int max_register,
1352 : const OutSet& registers_to_pop,
1353 : const OutSet& registers_to_clear);
1354 : int cp_offset_;
1355 : DeferredAction* actions_;
1356 : Label* backtrack_;
1357 : RegExpNode* stop_node_;
1358 : Label* loop_label_;
1359 : int characters_preloaded_;
1360 : int bound_checked_up_to_;
1361 : QuickCheckDetails quick_check_performed_;
1362 : int flush_budget_;
1363 : TriBool at_start_;
1364 : };
1365 :
1366 :
1367 : class GreedyLoopState {
1368 : public:
1369 : explicit GreedyLoopState(bool not_at_start);
1370 :
1371 : Label* label() { return &label_; }
1372 : Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1373 :
1374 : private:
1375 : Label label_;
1376 : Trace counter_backtrack_trace_;
1377 : };
1378 :
1379 :
1380 : struct PreloadState {
1381 : static const int kEatsAtLeastNotYetInitialized = -1;
1382 : bool preload_is_current_;
1383 : bool preload_has_checked_bounds_;
1384 : int preload_characters_;
1385 : int eats_at_least_;
1386 : void init() {
1387 213389 : eats_at_least_ = kEatsAtLeastNotYetInitialized;
1388 : }
1389 : };
1390 :
1391 :
1392 85592 : class NodeVisitor {
1393 : public:
1394 85592 : virtual ~NodeVisitor() = default;
1395 : #define DECLARE_VISIT(Type) \
1396 : virtual void Visit##Type(Type##Node* that) = 0;
1397 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1398 : #undef DECLARE_VISIT
1399 0 : virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1400 : };
1401 :
1402 :
1403 : // Node visitor used to add the start set of the alternatives to the
1404 : // dispatch table of a choice node.
1405 55 : class DispatchTableConstructor: public NodeVisitor {
1406 : public:
1407 : DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1408 : Zone* zone)
1409 : : table_(table),
1410 : choice_index_(-1),
1411 : ignore_case_(ignore_case),
1412 55 : zone_(zone) { }
1413 :
1414 : void BuildTable(ChoiceNode* node);
1415 :
1416 580 : void AddRange(CharacterRange range) {
1417 1160 : table()->AddRange(range, choice_index_, zone_);
1418 : }
1419 :
1420 : void AddInverse(ZoneList<CharacterRange>* ranges);
1421 :
1422 : #define DECLARE_VISIT(Type) \
1423 : virtual void Visit##Type(Type##Node* that);
1424 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1425 : #undef DECLARE_VISIT
1426 :
1427 : DispatchTable* table() { return table_; }
1428 55 : void set_choice_index(int value) { choice_index_ = value; }
1429 :
1430 : protected:
1431 : DispatchTable* table_;
1432 : int choice_index_;
1433 : bool ignore_case_;
1434 : Zone* zone_;
1435 : };
1436 :
1437 :
1438 : // Assertion propagation moves information about assertions such as
1439 : // \b to the affected nodes. For instance, in /.\b./ information must
1440 : // be propagated to the first '.' that whatever follows needs to know
1441 : // if it matched a word or a non-word, and to the second '.' that it
1442 : // has to check if it succeeds a word or non-word. In this case the
1443 : // result will be something like:
1444 : //
1445 : // +-------+ +------------+
1446 : // | . | | . |
1447 : // +-------+ ---> +------------+
1448 : // | word? | | check word |
1449 : // +-------+ +------------+
1450 85537 : class Analysis: public NodeVisitor {
1451 : public:
1452 : Analysis(Isolate* isolate, bool is_one_byte)
1453 85537 : : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
1454 : void EnsureAnalyzed(RegExpNode* node);
1455 :
1456 : #define DECLARE_VISIT(Type) void Visit##Type(Type##Node* that) override;
1457 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1458 : #undef DECLARE_VISIT
1459 : void VisitLoopChoice(LoopChoiceNode* that) override;
1460 :
1461 : bool has_failed() { return error_message_ != nullptr; }
1462 : const char* error_message() {
1463 : DCHECK(error_message_ != nullptr);
1464 : return error_message_;
1465 : }
1466 : void fail(const char* error_message) {
1467 310 : error_message_ = error_message;
1468 : }
1469 :
1470 : Isolate* isolate() const { return isolate_; }
1471 :
1472 : private:
1473 : Isolate* isolate_;
1474 : bool is_one_byte_;
1475 : const char* error_message_;
1476 :
1477 : DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1478 : };
1479 :
1480 :
1481 : struct RegExpCompileData {
1482 : RegExpCompileData()
1483 : : tree(nullptr),
1484 : node(nullptr),
1485 : simple(true),
1486 : contains_anchor(false),
1487 704460 : capture_count(0) {}
1488 : RegExpTree* tree;
1489 : RegExpNode* node;
1490 : bool simple;
1491 : bool contains_anchor;
1492 : Handle<FixedArray> capture_name_map;
1493 : Handle<String> error;
1494 : int capture_count;
1495 : };
1496 :
1497 :
1498 : class RegExpEngine: public AllStatic {
1499 : public:
1500 : struct CompilationResult {
1501 : inline CompilationResult(Isolate* isolate, const char* error_message);
1502 : CompilationResult(Object code, int registers)
1503 85227 : : code(code), num_registers(registers) {}
1504 : const char* const error_message = nullptr;
1505 : Object const code;
1506 : int const num_registers = 0;
1507 : };
1508 :
1509 : static CompilationResult Compile(Isolate* isolate, Zone* zone,
1510 : RegExpCompileData* input,
1511 : JSRegExp::Flags flags,
1512 : Handle<String> pattern,
1513 : Handle<String> sample_subject,
1514 : bool is_one_byte);
1515 :
1516 : static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
1517 :
1518 : static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1519 : };
1520 :
1521 :
1522 : class RegExpResultsCache : public AllStatic {
1523 : public:
1524 : enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1525 :
1526 : // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1527 : // On success, the returned result is guaranteed to be a COW-array.
1528 : static Object Lookup(Heap* heap, String key_string, Object key_pattern,
1529 : FixedArray* last_match_out, ResultsCacheType type);
1530 : // Attempt to add value_array to the cache specified by type. On success,
1531 : // value_array is turned into a COW-array.
1532 : static void Enter(Isolate* isolate, Handle<String> key_string,
1533 : Handle<Object> key_pattern, Handle<FixedArray> value_array,
1534 : Handle<FixedArray> last_match_cache, ResultsCacheType type);
1535 : static void Clear(FixedArray cache);
1536 : static const int kRegExpResultsCacheSize = 0x100;
1537 :
1538 : private:
1539 : static const int kArrayEntriesPerCacheEntry = 4;
1540 : static const int kStringOffset = 0;
1541 : static const int kPatternOffset = 1;
1542 : static const int kArrayOffset = 2;
1543 : static const int kLastMatchOffset = 3;
1544 : };
1545 :
1546 : } // namespace internal
1547 : } // namespace v8
1548 :
1549 : #endif // V8_REGEXP_JSREGEXP_H_
|