Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_JSREGEXP_H_
6 : #define V8_REGEXP_JSREGEXP_H_
7 :
8 : #include "src/allocation.h"
9 : #include "src/assembler.h"
10 : #include "src/objects/js-regexp.h"
11 : #include "src/regexp/regexp-ast.h"
12 : #include "src/regexp/regexp-macro-assembler.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : class NodeVisitor;
18 : class RegExpCompiler;
19 : class RegExpMacroAssembler;
20 : class RegExpNode;
21 : class RegExpTree;
22 : class BoyerMooreLookahead;
23 :
24 : class RegExpImpl {
25 : public:
26 : // Whether V8 is compiled with native regexp support or not.
27 : static bool UsesNativeRegExp() {
28 : #ifdef V8_INTERPRETED_REGEXP
29 : return false;
30 : #else
31 : return true;
32 : #endif
33 : }
34 :
35 : // Returns a string representation of a regular expression.
36 : // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
37 : // This function calls the garbage collector if necessary.
38 : static Handle<String> ToString(Handle<Object> value);
39 :
40 : // Parses the RegExp pattern and prepares the JSRegExp object with
41 : // generic data and choice of implementation - as well as what
42 : // the implementation wants to store in the data field.
43 : // Returns false if compilation fails.
44 : MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
45 : Handle<String> pattern,
46 : JSRegExp::Flags flags);
47 :
48 : // See ECMA-262 section 15.10.6.2.
49 : // This function calls the garbage collector if necessary.
50 : V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
51 : Handle<JSRegExp> regexp, Handle<String> subject, int index,
52 : Handle<RegExpMatchInfo> last_match_info);
53 :
54 : // Prepares a JSRegExp object with Irregexp-specific data.
55 : static void IrregexpInitialize(Handle<JSRegExp> re,
56 : Handle<String> pattern,
57 : JSRegExp::Flags flags,
58 : int capture_register_count);
59 :
60 :
61 : static void AtomCompile(Handle<JSRegExp> re,
62 : Handle<String> pattern,
63 : JSRegExp::Flags flags,
64 : Handle<String> match_pattern);
65 :
66 :
67 : static int AtomExecRaw(Handle<JSRegExp> regexp,
68 : Handle<String> subject,
69 : int index,
70 : int32_t* output,
71 : int output_size);
72 :
73 : static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
74 : Handle<String> subject, int index,
75 : Handle<RegExpMatchInfo> last_match_info);
76 :
77 : enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
78 :
79 : // Prepare a RegExp for being executed one or more times (using
80 : // IrregexpExecOnce) on the subject.
81 : // This ensures that the regexp is compiled for the subject, and that
82 : // the subject is flat.
83 : // Returns the number of integer spaces required by IrregexpExecOnce
84 : // as its "registers" argument. If the regexp cannot be compiled,
85 : // an exception is set as pending, and this function returns negative.
86 : static int IrregexpPrepare(Handle<JSRegExp> regexp,
87 : Handle<String> subject);
88 :
89 : // Execute a regular expression on the subject, starting from index.
90 : // If matching succeeds, return the number of matches. This can be larger
91 : // than one in the case of global regular expressions.
92 : // The captures and subcaptures are stored into the registers vector.
93 : // If matching fails, returns RE_FAILURE.
94 : // If execution fails, sets a pending exception and returns RE_EXCEPTION.
95 : static int IrregexpExecRaw(Handle<JSRegExp> regexp,
96 : Handle<String> subject,
97 : int index,
98 : int32_t* output,
99 : int output_size);
100 :
101 : // Execute an Irregexp bytecode pattern.
102 : // On a successful match, the result is a JSArray containing
103 : // captured positions. On a failure, the result is the null value.
104 : // Returns an empty handle in case of an exception.
105 : MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
106 : Handle<JSRegExp> regexp, Handle<String> subject, int index,
107 : Handle<RegExpMatchInfo> last_match_info);
108 :
109 : // Set last match info. If match is nullptr, then setting captures is
110 : // omitted.
111 : static Handle<RegExpMatchInfo> SetLastMatchInfo(
112 : Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
113 : int capture_count, int32_t* match);
114 :
115 : class GlobalCache {
116 : public:
117 : GlobalCache(Handle<JSRegExp> regexp,
118 : Handle<String> subject,
119 : Isolate* isolate);
120 :
121 : INLINE(~GlobalCache());
122 :
123 : // Fetch the next entry in the cache for global regexp match results.
124 : // This does not set the last match info. Upon failure, nullptr is
125 : // returned. The cause can be checked with Result(). The previous result is
126 : // still in available in memory when a failure happens.
127 : INLINE(int32_t* FetchNext());
128 :
129 : INLINE(int32_t* LastSuccessfulMatch());
130 :
131 : INLINE(bool HasException()) { return num_matches_ < 0; }
132 :
133 : private:
134 : int AdvanceZeroLength(int last_index);
135 :
136 : int num_matches_;
137 : int max_matches_;
138 : int current_match_index_;
139 : int registers_per_match_;
140 : // Pointer to the last set of captures.
141 : int32_t* register_array_;
142 : int register_array_size_;
143 : Handle<JSRegExp> regexp_;
144 : Handle<String> subject_;
145 : };
146 :
147 : // For acting on the JSRegExp data FixedArray.
148 : static int IrregexpMaxRegisterCount(FixedArray* re);
149 : static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
150 : static void SetIrregexpCaptureNameMap(FixedArray* re,
151 : Handle<FixedArray> value);
152 : static int IrregexpNumberOfCaptures(FixedArray* re);
153 : static int IrregexpNumberOfRegisters(FixedArray* re);
154 : static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
155 : static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
156 :
157 : // Limit the space regexps take up on the heap. In order to limit this we
158 : // would like to keep track of the amount of regexp code on the heap. This
159 : // is not tracked, however. As a conservative approximation we track the
160 : // total regexp code compiled including code that has subsequently been freed
161 : // and the total executable memory at any point.
162 : static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
163 : static const size_t kRegExpCompiledLimit = 1 * MB;
164 : static const int kRegExpTooLargeToOptimize = 20 * KB;
165 :
166 : private:
167 : static bool CompileIrregexp(Handle<JSRegExp> re,
168 : Handle<String> sample_subject, bool is_one_byte);
169 : static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
170 : Handle<String> sample_subject,
171 : bool is_one_byte);
172 : };
173 :
174 :
175 : // Represents the location of one element relative to the intersection of
176 : // two sets. Corresponds to the four areas of a Venn diagram.
177 : enum ElementInSetsRelation {
178 : kInsideNone = 0,
179 : kInsideFirst = 1,
180 : kInsideSecond = 2,
181 : kInsideBoth = 3
182 : };
183 :
184 :
185 : // A set of unsigned integers that behaves especially well on small
186 : // integers (< 32). May do zone-allocation.
187 : class OutSet: public ZoneObject {
188 : public:
189 1073102 : OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
190 : OutSet* Extend(unsigned value, Zone* zone);
191 : bool Get(unsigned value) const;
192 : static const unsigned kFirstLimit = 32;
193 :
194 : private:
195 : // Destructively set a value in this set. In most cases you want
196 : // to use Extend instead to ensure that only one instance exists
197 : // that contains the same values.
198 : void Set(unsigned value, Zone* zone);
199 :
200 : // The successors are a list of sets that contain the same values
201 : // as this set and the one more value that is not present in this
202 : // set.
203 : ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
204 :
205 : OutSet(uint32_t first, ZoneList<unsigned>* remaining)
206 17565 : : first_(first), remaining_(remaining), successors_(nullptr) {}
207 : uint32_t first_;
208 : ZoneList<unsigned>* remaining_;
209 : ZoneList<OutSet*>* successors_;
210 : friend class Trace;
211 : };
212 :
213 :
214 : // A mapping from integers, specified as ranges, to a set of integers.
215 : // Used for mapping character ranges to choices.
216 : class DispatchTable : public ZoneObject {
217 : public:
218 : explicit DispatchTable(Zone* zone) : tree_(zone) { }
219 :
220 : class Entry {
221 : public:
222 : Entry() : from_(0), to_(0), out_set_(nullptr) {}
223 : Entry(uc32 from, uc32 to, OutSet* out_set)
224 : : from_(from), to_(to), out_set_(out_set) {
225 : DCHECK(from <= to);
226 : }
227 : uc32 from() { return from_; }
228 : uc32 to() { return to_; }
229 5398 : void set_to(uc32 value) { to_ = value; }
230 : void AddValue(int value, Zone* zone) {
231 78219 : out_set_ = out_set_->Extend(value, zone);
232 : }
233 : OutSet* out_set() { return out_set_; }
234 : private:
235 : uc32 from_;
236 : uc32 to_;
237 : OutSet* out_set_;
238 : };
239 :
240 : class Config {
241 : public:
242 : typedef uc32 Key;
243 : typedef Entry Value;
244 : static const uc32 kNoKey;
245 : static const Entry NoValue() { return Value(); }
246 : static inline int Compare(uc32 a, uc32 b) {
247 1558716 : if (a == b)
248 : return 0;
249 1521641 : else if (a < b)
250 : return -1;
251 : else
252 : return 1;
253 : }
254 : };
255 :
256 : void AddRange(CharacterRange range, int value, Zone* zone);
257 : OutSet* Get(uc32 value);
258 : void Dump();
259 :
260 : template <typename Callback>
261 : void ForEach(Callback* callback) {
262 2632 : return tree()->ForEach(callback);
263 : }
264 :
265 : private:
266 : // There can't be a static empty set since it allocates its
267 : // successors in a zone and caches them.
268 : OutSet* empty() { return &empty_; }
269 : OutSet empty_;
270 : ZoneSplayTree<Config>* tree() { return &tree_; }
271 : ZoneSplayTree<Config> tree_;
272 : };
273 :
274 :
275 : // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
276 : class UnicodeRangeSplitter {
277 : public:
278 : UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
279 : void Call(uc32 from, DispatchTable::Entry entry);
280 :
281 : ZoneList<CharacterRange>* bmp() { return bmp_; }
282 : ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
283 : ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
284 : ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
285 :
286 : private:
287 : static const int kBase = 0;
288 : // Separate ranges into
289 : static const int kBmpCodePoints = 1;
290 : static const int kLeadSurrogates = 2;
291 : static const int kTrailSurrogates = 3;
292 : static const int kNonBmpCodePoints = 4;
293 :
294 : Zone* zone_;
295 : DispatchTable table_;
296 : ZoneList<CharacterRange>* bmp_;
297 : ZoneList<CharacterRange>* lead_surrogates_;
298 : ZoneList<CharacterRange>* trail_surrogates_;
299 : ZoneList<CharacterRange>* non_bmp_;
300 : };
301 :
302 :
303 : #define FOR_EACH_NODE_TYPE(VISIT) \
304 : VISIT(End) \
305 : VISIT(Action) \
306 : VISIT(Choice) \
307 : VISIT(BackReference) \
308 : VISIT(Assertion) \
309 : VISIT(Text)
310 :
311 :
312 : class Trace;
313 : struct PreloadState;
314 : class GreedyLoopState;
315 : class AlternativeGenerationList;
316 :
317 : struct NodeInfo {
318 : NodeInfo()
319 : : being_analyzed(false),
320 : been_analyzed(false),
321 : follows_word_interest(false),
322 : follows_newline_interest(false),
323 : follows_start_interest(false),
324 : at_end(false),
325 : visited(false),
326 4880080 : replacement_calculated(false) { }
327 :
328 : // Returns true if the interests and assumptions of this node
329 : // matches the given one.
330 : bool Matches(NodeInfo* that) {
331 : return (at_end == that->at_end) &&
332 : (follows_word_interest == that->follows_word_interest) &&
333 : (follows_newline_interest == that->follows_newline_interest) &&
334 : (follows_start_interest == that->follows_start_interest);
335 : }
336 :
337 : // Updates the interests of this node given the interests of the
338 : // node preceding it.
339 : void AddFromPreceding(NodeInfo* that) {
340 : at_end |= that->at_end;
341 : follows_word_interest |= that->follows_word_interest;
342 : follows_newline_interest |= that->follows_newline_interest;
343 : follows_start_interest |= that->follows_start_interest;
344 : }
345 :
346 : bool HasLookbehind() {
347 : return follows_word_interest ||
348 : follows_newline_interest ||
349 : follows_start_interest;
350 : }
351 :
352 : // Sets the interests of this node to include the interests of the
353 : // following node.
354 : void AddFromFollowing(NodeInfo* that) {
355 632164 : follows_word_interest |= that->follows_word_interest;
356 632164 : follows_newline_interest |= that->follows_newline_interest;
357 632164 : follows_start_interest |= that->follows_start_interest;
358 : }
359 :
360 : void ResetCompilationState() {
361 : being_analyzed = false;
362 : been_analyzed = false;
363 : }
364 :
365 : bool being_analyzed: 1;
366 : bool been_analyzed: 1;
367 :
368 : // These bits are set of this node has to know what the preceding
369 : // character was.
370 : bool follows_word_interest: 1;
371 : bool follows_newline_interest: 1;
372 : bool follows_start_interest: 1;
373 :
374 : bool at_end: 1;
375 : bool visited: 1;
376 : bool replacement_calculated: 1;
377 : };
378 :
379 :
380 : // Details of a quick mask-compare check that can look ahead in the
381 : // input stream.
382 : class QuickCheckDetails {
383 : public:
384 : QuickCheckDetails()
385 : : characters_(0),
386 : mask_(0),
387 : value_(0),
388 15963540 : cannot_match_(false) { }
389 : explicit QuickCheckDetails(int characters)
390 : : characters_(characters),
391 : mask_(0),
392 : value_(0),
393 868115 : cannot_match_(false) { }
394 : bool Rationalize(bool one_byte);
395 : // Merge in the information from another branch of an alternation.
396 : void Merge(QuickCheckDetails* other, int from_index);
397 : // Advance the current position by some amount.
398 : void Advance(int by, bool one_byte);
399 : void Clear();
400 : bool cannot_match() { return cannot_match_; }
401 491 : void set_cannot_match() { cannot_match_ = true; }
402 : struct Position {
403 13465324 : Position() : mask(0), value(0), determines_perfectly(false) { }
404 : uc16 mask;
405 : uc16 value;
406 : bool determines_perfectly;
407 : };
408 : int characters() { return characters_; }
409 652878 : void set_characters(int characters) { characters_ = characters; }
410 : Position* positions(int index) {
411 : DCHECK_LE(0, index);
412 : DCHECK_GT(characters_, index);
413 1890514 : return positions_ + index;
414 : }
415 : uint32_t mask() { return mask_; }
416 : uint32_t value() { return value_; }
417 :
418 : private:
419 : // How many characters do we have quick check information from. This is
420 : // the same for all branches of a choice node.
421 : int characters_;
422 : Position positions_[4];
423 : // These values are the condensate of the above array after Rationalize().
424 : uint32_t mask_;
425 : uint32_t value_;
426 : // If set to true, there is no way this quick check can match at all.
427 : // E.g., if it requires to be at the start of the input, and isn't.
428 : bool cannot_match_;
429 : };
430 :
431 :
432 : extern int kUninitializedRegExpNodePlaceHolder;
433 :
434 :
435 : class RegExpNode: public ZoneObject {
436 : public:
437 : explicit RegExpNode(Zone* zone)
438 : : replacement_(nullptr),
439 : on_work_list_(false),
440 : trace_count_(0),
441 14640240 : zone_(zone) {
442 4880080 : bm_info_[0] = bm_info_[1] = nullptr;
443 : }
444 : virtual ~RegExpNode();
445 : virtual void Accept(NodeVisitor* visitor) = 0;
446 : // Generates a goto to this node or actually generates the code at this point.
447 : virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
448 : // How many characters must this node consume at a minimum in order to
449 : // succeed. If we have found at least 'still_to_find' characters that
450 : // must be consumed there is no need to ask any following nodes whether
451 : // they are sure to eat any more characters. The not_at_start argument is
452 : // used to indicate that we know we are not at the start of the input. In
453 : // this case anchored branches will always fail and can be ignored when
454 : // determining how many characters are consumed on success.
455 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
456 : // Emits some quick code that checks whether the preloaded characters match.
457 : // Falls through on certain failure, jumps to the label on possible success.
458 : // If the node cannot make a quick check it does nothing and returns false.
459 : bool EmitQuickCheck(RegExpCompiler* compiler,
460 : Trace* bounds_check_trace,
461 : Trace* trace,
462 : bool preload_has_checked_bounds,
463 : Label* on_possible_success,
464 : QuickCheckDetails* details_return,
465 : bool fall_through_on_failure);
466 : // For a given number of characters this returns a mask and a value. The
467 : // next n characters are anded with the mask and compared with the value.
468 : // A comparison failure indicates the node cannot match the next n characters.
469 : // A comparison success indicates the node may match.
470 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
471 : RegExpCompiler* compiler,
472 : int characters_filled_in,
473 : bool not_at_start) = 0;
474 : static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
475 98187 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
476 : // Only returns the successor for a text node of length 1 that matches any
477 : // character and that has no guards on it.
478 95615 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
479 : RegExpCompiler* compiler) {
480 95615 : return nullptr;
481 : }
482 :
483 : // Collects information on the possible code units (mod 128) that can match if
484 : // we look forward. This is used for a Boyer-Moore-like string searching
485 : // implementation. TODO(erikcorry): This should share more code with
486 : // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
487 : // the number of nodes we are willing to look at in order to create this data.
488 : static const int kRecursionBudget = 200;
489 : bool KeepRecursing(RegExpCompiler* compiler);
490 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
491 : BoyerMooreLookahead* bm, bool not_at_start) {
492 0 : UNREACHABLE();
493 : }
494 :
495 : // If we know that the input is one-byte then there are some nodes that can
496 : // never match. This method returns a node that can be substituted for
497 : // itself, or nullptr if the node can never match.
498 22309 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
499 22309 : return this;
500 : }
501 : // Helper for FilterOneByte.
502 : RegExpNode* replacement() {
503 : DCHECK(info()->replacement_calculated);
504 : return replacement_;
505 : }
506 : RegExpNode* set_replacement(RegExpNode* replacement) {
507 243130 : info()->replacement_calculated = true;
508 243130 : replacement_ = replacement;
509 : return replacement; // For convenience.
510 : }
511 :
512 : // We want to avoid recalculating the lookahead info, so we store it on the
513 : // node. Only info that is for this node is stored. We can tell that the
514 : // info is for this node when offset == 0, so the information is calculated
515 : // relative to this node.
516 : void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
517 133622 : if (offset == 0) set_bm_info(not_at_start, bm);
518 : }
519 :
520 : Label* label() { return &label_; }
521 : // If non-generic code is generated for a node (i.e. the node is not at the
522 : // start of the trace) then it cannot be reused. This variable sets a limit
523 : // on how often we allow that to happen before we insist on starting a new
524 : // trace and generating generic code for a node that can be reused by flushing
525 : // the deferred actions in the current trace and generating a goto.
526 : static const int kMaxCopiesCodeGenerated = 10;
527 :
528 : bool on_work_list() { return on_work_list_; }
529 430832 : void set_on_work_list(bool value) { on_work_list_ = value; }
530 :
531 : NodeInfo* info() { return &info_; }
532 :
533 : BoyerMooreLookahead* bm_info(bool not_at_start) {
534 89442 : return bm_info_[not_at_start ? 1 : 0];
535 : }
536 :
537 : Zone* zone() const { return zone_; }
538 :
539 : protected:
540 : enum LimitResult { DONE, CONTINUE };
541 : RegExpNode* replacement_;
542 :
543 : LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
544 :
545 : void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
546 199753 : bm_info_[not_at_start ? 1 : 0] = bm;
547 : }
548 :
549 : private:
550 : static const int kFirstCharBudget = 10;
551 : Label label_;
552 : bool on_work_list_;
553 : NodeInfo info_;
554 : // This variable keeps track of how many times code has been generated for
555 : // this node (in different traces). We don't keep track of where the
556 : // generated code is located unless the code is generated at the start of
557 : // a trace, in which case it is generic and can be reused by flushing the
558 : // deferred operations in the current trace and generating a goto.
559 : int trace_count_;
560 : BoyerMooreLookahead* bm_info_[2];
561 :
562 : Zone* zone_;
563 : };
564 :
565 :
566 0 : class SeqRegExpNode: public RegExpNode {
567 : public:
568 3638916 : explicit SeqRegExpNode(RegExpNode* on_success)
569 3638916 : : RegExpNode(on_success->zone()), on_success_(on_success) { }
570 : RegExpNode* on_success() { return on_success_; }
571 : void set_on_success(RegExpNode* node) { on_success_ = node; }
572 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
573 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
574 : BoyerMooreLookahead* bm, bool not_at_start) {
575 0 : on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
576 0 : if (offset == 0) set_bm_info(not_at_start, bm);
577 0 : }
578 :
579 : protected:
580 : RegExpNode* FilterSuccessor(int depth, bool ignore_case);
581 :
582 : private:
583 : RegExpNode* on_success_;
584 : };
585 :
586 :
587 0 : class ActionNode: public SeqRegExpNode {
588 : public:
589 : enum ActionType {
590 : SET_REGISTER,
591 : INCREMENT_REGISTER,
592 : STORE_POSITION,
593 : BEGIN_SUBMATCH,
594 : POSITIVE_SUBMATCH_SUCCESS,
595 : EMPTY_MATCH_CHECK,
596 : CLEAR_CAPTURES
597 : };
598 : static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
599 : static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
600 : static ActionNode* StorePosition(int reg,
601 : bool is_capture,
602 : RegExpNode* on_success);
603 : static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
604 : static ActionNode* BeginSubmatch(int stack_pointer_reg,
605 : int position_reg,
606 : RegExpNode* on_success);
607 : static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
608 : int restore_reg,
609 : int clear_capture_count,
610 : int clear_capture_from,
611 : RegExpNode* on_success);
612 : static ActionNode* EmptyMatchCheck(int start_register,
613 : int repetition_register,
614 : int repetition_limit,
615 : RegExpNode* on_success);
616 : virtual void Accept(NodeVisitor* visitor);
617 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
618 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
619 84910 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
620 : RegExpCompiler* compiler,
621 : int filled_in,
622 : bool not_at_start) {
623 84910 : return on_success()->GetQuickCheckDetails(
624 84910 : details, compiler, filled_in, not_at_start);
625 : }
626 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
627 : BoyerMooreLookahead* bm, bool not_at_start);
628 : ActionType action_type() { return action_type_; }
629 : // TODO(erikcorry): We should allow some action nodes in greedy loops.
630 132786 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
631 :
632 : private:
633 : union {
634 : struct {
635 : int reg;
636 : int value;
637 : } u_store_register;
638 : struct {
639 : int reg;
640 : } u_increment_register;
641 : struct {
642 : int reg;
643 : bool is_capture;
644 : } u_position_register;
645 : struct {
646 : int stack_pointer_register;
647 : int current_position_register;
648 : int clear_register_count;
649 : int clear_register_from;
650 : } u_submatch;
651 : struct {
652 : int start_register;
653 : int repetition_register;
654 : int repetition_limit;
655 : } u_empty_match_check;
656 : struct {
657 : int range_from;
658 : int range_to;
659 : } u_clear_captures;
660 : } data_;
661 : ActionNode(ActionType action_type, RegExpNode* on_success)
662 : : SeqRegExpNode(on_success),
663 2285655 : action_type_(action_type) { }
664 : ActionType action_type_;
665 : friend class DotPrinter;
666 : };
667 :
668 :
669 0 : class TextNode: public SeqRegExpNode {
670 : public:
671 : TextNode(ZoneList<TextElement>* elms, bool read_backward,
672 : RegExpNode* on_success)
673 1146916 : : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
674 194874 : TextNode(RegExpCharacterClass* that, bool read_backward,
675 : RegExpNode* on_success)
676 : : SeqRegExpNode(on_success),
677 389748 : elms_(new (zone()) ZoneList<TextElement>(1, zone())),
678 389748 : read_backward_(read_backward) {
679 194874 : elms_->Add(TextElement::CharClass(that), zone());
680 194874 : }
681 : // Create TextNode for a single character class for the given ranges.
682 : static TextNode* CreateForCharacterRanges(Zone* zone,
683 : ZoneList<CharacterRange>* ranges,
684 : bool read_backward,
685 : RegExpNode* on_success);
686 : // Create TextNode for a surrogate pair with a range given for the
687 : // lead and the trail surrogate each.
688 : static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
689 : CharacterRange trail,
690 : bool read_backward,
691 : RegExpNode* on_success);
692 : virtual void Accept(NodeVisitor* visitor);
693 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
694 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
695 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
696 : RegExpCompiler* compiler,
697 : int characters_filled_in,
698 : bool not_at_start);
699 : ZoneList<TextElement>* elements() { return elms_; }
700 : bool read_backward() { return read_backward_; }
701 : void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
702 : virtual int GreedyLoopTextLength();
703 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
704 : RegExpCompiler* compiler);
705 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
706 : BoyerMooreLookahead* bm, bool not_at_start);
707 : void CalculateOffsets();
708 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
709 :
710 : private:
711 : enum TextEmitPassType {
712 : NON_LATIN1_MATCH, // Check for characters that can't match.
713 : SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
714 : NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
715 : CASE_CHARACTER_MATCH, // Case-independent single character check.
716 : CHARACTER_CLASS_MATCH // Character class.
717 : };
718 : static bool SkipPass(int pass, bool ignore_case);
719 : static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
720 : static const int kLastPass = CHARACTER_CLASS_MATCH;
721 : void TextEmitPass(RegExpCompiler* compiler,
722 : TextEmitPassType pass,
723 : bool preloaded,
724 : Trace* trace,
725 : bool first_element_checked,
726 : int* checked_up_to);
727 : int Length();
728 : ZoneList<TextElement>* elms_;
729 : bool read_backward_;
730 : };
731 :
732 :
733 0 : class AssertionNode: public SeqRegExpNode {
734 : public:
735 : enum AssertionType {
736 : AT_END,
737 : AT_START,
738 : AT_BOUNDARY,
739 : AT_NON_BOUNDARY,
740 : AFTER_NEWLINE
741 : };
742 3474 : static AssertionNode* AtEnd(RegExpNode* on_success) {
743 3474 : return new(on_success->zone()) AssertionNode(AT_END, on_success);
744 : }
745 4277 : static AssertionNode* AtStart(RegExpNode* on_success) {
746 4277 : return new(on_success->zone()) AssertionNode(AT_START, on_success);
747 : }
748 145 : static AssertionNode* AtBoundary(RegExpNode* on_success) {
749 145 : return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
750 : }
751 124 : static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
752 124 : return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
753 : }
754 782 : static AssertionNode* AfterNewline(RegExpNode* on_success) {
755 782 : return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
756 : }
757 : virtual void Accept(NodeVisitor* visitor);
758 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
759 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
760 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
761 : RegExpCompiler* compiler,
762 : int filled_in,
763 : bool not_at_start);
764 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
765 : BoyerMooreLookahead* bm, bool not_at_start);
766 : AssertionType assertion_type() { return assertion_type_; }
767 :
768 : private:
769 : void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
770 : enum IfPrevious { kIsNonWord, kIsWord };
771 : void BacktrackIfPrevious(RegExpCompiler* compiler,
772 : Trace* trace,
773 : IfPrevious backtrack_if_previous);
774 : AssertionNode(AssertionType t, RegExpNode* on_success)
775 8802 : : SeqRegExpNode(on_success), assertion_type_(t) { }
776 : AssertionType assertion_type_;
777 : };
778 :
779 :
780 0 : class BackReferenceNode: public SeqRegExpNode {
781 : public:
782 : BackReferenceNode(int start_reg, int end_reg, bool read_backward,
783 : RegExpNode* on_success)
784 : : SeqRegExpNode(on_success),
785 : start_reg_(start_reg),
786 : end_reg_(end_reg),
787 2669 : read_backward_(read_backward) {}
788 : virtual void Accept(NodeVisitor* visitor);
789 : int start_register() { return start_reg_; }
790 : int end_register() { return end_reg_; }
791 : bool read_backward() { return read_backward_; }
792 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
793 : virtual int EatsAtLeast(int still_to_find,
794 : int recursion_depth,
795 : bool not_at_start);
796 615 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
797 : RegExpCompiler* compiler,
798 : int characters_filled_in,
799 : bool not_at_start) {
800 615 : return;
801 : }
802 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
803 : BoyerMooreLookahead* bm, bool not_at_start);
804 :
805 : private:
806 : int start_reg_;
807 : int end_reg_;
808 : bool read_backward_;
809 : };
810 :
811 :
812 0 : class EndNode: public RegExpNode {
813 : public:
814 : enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
815 97200 : EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
816 : virtual void Accept(NodeVisitor* visitor);
817 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
818 130683 : virtual int EatsAtLeast(int still_to_find,
819 : int recursion_depth,
820 130683 : bool not_at_start) { return 0; }
821 0 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
822 : RegExpCompiler* compiler,
823 : int characters_filled_in,
824 : bool not_at_start) {
825 : // Returning 0 from EatsAtLeast should ensure we never get here.
826 0 : UNREACHABLE();
827 : }
828 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
829 : BoyerMooreLookahead* bm, bool not_at_start) {
830 : // Returning 0 from EatsAtLeast should ensure we never get here.
831 0 : UNREACHABLE();
832 : }
833 :
834 : private:
835 : Action action_;
836 : };
837 :
838 :
839 0 : class NegativeSubmatchSuccess: public EndNode {
840 : public:
841 : NegativeSubmatchSuccess(int stack_pointer_reg,
842 : int position_reg,
843 : int clear_capture_count,
844 : int clear_capture_start,
845 : Zone* zone)
846 : : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
847 : stack_pointer_register_(stack_pointer_reg),
848 : current_position_register_(position_reg),
849 : clear_capture_count_(clear_capture_count),
850 2928 : clear_capture_start_(clear_capture_start) { }
851 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
852 :
853 : private:
854 : int stack_pointer_register_;
855 : int current_position_register_;
856 : int clear_capture_count_;
857 : int clear_capture_start_;
858 : };
859 :
860 :
861 : class Guard: public ZoneObject {
862 : public:
863 : enum Relation { LT, GEQ };
864 : Guard(int reg, Relation op, int value)
865 : : reg_(reg),
866 : op_(op),
867 1007347 : value_(value) { }
868 : int reg() { return reg_; }
869 : Relation op() { return op_; }
870 : int value() { return value_; }
871 :
872 : private:
873 : int reg_;
874 : Relation op_;
875 : int value_;
876 : };
877 :
878 :
879 : class GuardedAlternative {
880 : public:
881 : explicit GuardedAlternative(RegExpNode* node)
882 2227726 : : node_(node), guards_(nullptr) {}
883 : void AddGuard(Guard* guard, Zone* zone);
884 : RegExpNode* node() { return node_; }
885 87022 : void set_node(RegExpNode* node) { node_ = node; }
886 : ZoneList<Guard*>* guards() { return guards_; }
887 :
888 : private:
889 : RegExpNode* node_;
890 : ZoneList<Guard*>* guards_;
891 : };
892 :
893 :
894 : class AlternativeGeneration;
895 :
896 :
897 0 : class ChoiceNode: public RegExpNode {
898 : public:
899 1143964 : explicit ChoiceNode(int expected_size, Zone* zone)
900 : : RegExpNode(zone),
901 : alternatives_(new (zone)
902 1143964 : ZoneList<GuardedAlternative>(expected_size, zone)),
903 : table_(nullptr),
904 : not_at_start_(false),
905 2287928 : being_calculated_(false) {}
906 : virtual void Accept(NodeVisitor* visitor);
907 2354582 : void AddAlternative(GuardedAlternative node) {
908 4693935 : alternatives()->Add(node, zone());
909 : }
910 : ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
911 : DispatchTable* GetTable(bool ignore_case);
912 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
913 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
914 : int EatsAtLeastHelper(int still_to_find,
915 : int budget,
916 : RegExpNode* ignore_this_node,
917 : bool not_at_start);
918 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
919 : RegExpCompiler* compiler,
920 : int characters_filled_in,
921 : bool not_at_start);
922 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
923 : BoyerMooreLookahead* bm, bool not_at_start);
924 :
925 : bool being_calculated() { return being_calculated_; }
926 : bool not_at_start() { return not_at_start_; }
927 7831 : void set_not_at_start() { not_at_start_ = true; }
928 0 : void set_being_calculated(bool b) { being_calculated_ = b; }
929 520155 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
930 520155 : return true;
931 : }
932 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
933 0 : virtual bool read_backward() { return false; }
934 :
935 : protected:
936 : int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
937 : ZoneList<GuardedAlternative>* alternatives_;
938 :
939 : private:
940 : friend class DispatchTableConstructor;
941 : friend class Analysis;
942 : void GenerateGuard(RegExpMacroAssembler* macro_assembler,
943 : Guard* guard,
944 : Trace* trace);
945 : int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
946 : void EmitOutOfLineContinuation(RegExpCompiler* compiler,
947 : Trace* trace,
948 : GuardedAlternative alternative,
949 : AlternativeGeneration* alt_gen,
950 : int preload_characters,
951 : bool next_expects_preload);
952 : void SetUpPreLoad(RegExpCompiler* compiler,
953 : Trace* current_trace,
954 : PreloadState* preloads);
955 : void AssertGuardsMentionRegisters(Trace* trace);
956 : int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
957 : Trace* EmitGreedyLoop(RegExpCompiler* compiler,
958 : Trace* trace,
959 : AlternativeGenerationList* alt_gens,
960 : PreloadState* preloads,
961 : GreedyLoopState* greedy_loop_state,
962 : int text_length);
963 : void EmitChoices(RegExpCompiler* compiler,
964 : AlternativeGenerationList* alt_gens,
965 : int first_choice,
966 : Trace* trace,
967 : PreloadState* preloads);
968 : DispatchTable* table_;
969 : // If true, this node is never checked at the start of the input.
970 : // Allows a new trace to start with at_start() set to false.
971 : bool not_at_start_;
972 : bool being_calculated_;
973 : };
974 :
975 :
976 0 : class NegativeLookaroundChoiceNode : public ChoiceNode {
977 : public:
978 2928 : explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
979 : GuardedAlternative then_do_this,
980 : Zone* zone)
981 2928 : : ChoiceNode(2, zone) {
982 : AddAlternative(this_must_fail);
983 : AddAlternative(then_do_this);
984 2928 : }
985 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
986 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
987 : RegExpCompiler* compiler,
988 : int characters_filled_in,
989 : bool not_at_start);
990 1187 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
991 : BoyerMooreLookahead* bm, bool not_at_start) {
992 2374 : alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
993 1187 : not_at_start);
994 1187 : if (offset == 0) set_bm_info(not_at_start, bm);
995 1187 : }
996 : // For a negative lookahead we don't emit the quick check for the
997 : // alternative that is expected to fail. This is because quick check code
998 : // starts by loading enough characters for the alternative that takes fewest
999 : // characters, but on a negative lookahead the negative branch did not take
1000 : // part in that calculation (EatsAtLeast) so the assumptions don't hold.
1001 5832 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
1002 5832 : return !is_first;
1003 : }
1004 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1005 : };
1006 :
1007 :
1008 0 : class LoopChoiceNode: public ChoiceNode {
1009 : public:
1010 : LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1011 : : ChoiceNode(2, zone),
1012 : loop_node_(nullptr),
1013 : continue_node_(nullptr),
1014 : body_can_be_zero_length_(body_can_be_zero_length),
1015 1112399 : read_backward_(read_backward) {}
1016 : void AddLoopAlternative(GuardedAlternative alt);
1017 : void AddContinueAlternative(GuardedAlternative alt);
1018 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1019 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1020 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1021 : RegExpCompiler* compiler,
1022 : int characters_filled_in,
1023 : bool not_at_start);
1024 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1025 : BoyerMooreLookahead* bm, bool not_at_start);
1026 : RegExpNode* loop_node() { return loop_node_; }
1027 : RegExpNode* continue_node() { return continue_node_; }
1028 : bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1029 29618 : virtual bool read_backward() { return read_backward_; }
1030 : virtual void Accept(NodeVisitor* visitor);
1031 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1032 :
1033 : private:
1034 : // AddAlternative is made private for loop nodes because alternatives
1035 : // should not be added freely, we need to keep track of which node
1036 : // goes back to the node itself.
1037 : void AddAlternative(GuardedAlternative node) {
1038 : ChoiceNode::AddAlternative(node);
1039 : }
1040 :
1041 : RegExpNode* loop_node_;
1042 : RegExpNode* continue_node_;
1043 : bool body_can_be_zero_length_;
1044 : bool read_backward_;
1045 : };
1046 :
1047 :
1048 : // Improve the speed that we scan for an initial point where a non-anchored
1049 : // regexp can match by using a Boyer-Moore-like table. This is done by
1050 : // identifying non-greedy non-capturing loops in the nodes that eat any
1051 : // character one at a time. For example in the middle of the regexp
1052 : // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1053 : // inserted at the start of any non-anchored regexp.
1054 : //
1055 : // When we have found such a loop we look ahead in the nodes to find the set of
1056 : // characters that can come at given distances. For example for the regexp
1057 : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1058 : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1059 : // the lookahead info where the set of characters is reasonably constrained. In
1060 : // our example this is from index 1 to 2 (0 is not constrained). We can now
1061 : // look 3 characters ahead and if we don't find one of [f, o] (the union of
1062 : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1063 : //
1064 : // For Unicode input strings we do the same, but modulo 128.
1065 : //
1066 : // We also look at the first string fed to the regexp and use that to get a hint
1067 : // of the character frequencies in the inputs. This affects the assessment of
1068 : // whether the set of characters is 'reasonably constrained'.
1069 : //
1070 : // We also have another lookahead mechanism (called quick check in the code),
1071 : // which uses a wide load of multiple characters followed by a mask and compare
1072 : // to determine whether a match is possible at this point.
1073 : enum ContainedInLattice {
1074 : kNotYet = 0,
1075 : kLatticeIn = 1,
1076 : kLatticeOut = 2,
1077 : kLatticeUnknown = 3 // Can also mean both in and out.
1078 : };
1079 :
1080 :
1081 : inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1082 885017 : return static_cast<ContainedInLattice>(a | b);
1083 : }
1084 :
1085 :
1086 : ContainedInLattice AddRange(ContainedInLattice a,
1087 : const int* ranges,
1088 : int ranges_size,
1089 : Interval new_range);
1090 :
1091 :
1092 : class BoyerMoorePositionInfo : public ZoneObject {
1093 : public:
1094 130495 : explicit BoyerMoorePositionInfo(Zone* zone)
1095 130495 : : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1096 : map_count_(0),
1097 : w_(kNotYet),
1098 : s_(kNotYet),
1099 : d_(kNotYet),
1100 130495 : surrogate_(kNotYet) {
1101 16833855 : for (int i = 0; i < kMapSize; i++) {
1102 16703360 : map_->Add(false, zone);
1103 : }
1104 130495 : }
1105 :
1106 : bool& at(int i) { return map_->at(i); }
1107 :
1108 : static const int kMapSize = 128;
1109 : static const int kMask = kMapSize - 1;
1110 :
1111 : int map_count() const { return map_count_; }
1112 :
1113 : void Set(int character);
1114 : void SetInterval(const Interval& interval);
1115 : void SetAll();
1116 : bool is_non_word() { return w_ == kLatticeOut; }
1117 : bool is_word() { return w_ == kLatticeIn; }
1118 :
1119 : private:
1120 : ZoneList<bool>* map_;
1121 : int map_count_; // Number of set bits in the map.
1122 : ContainedInLattice w_; // The \w character class.
1123 : ContainedInLattice s_; // The \s character class.
1124 : ContainedInLattice d_; // The \d character class.
1125 : ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1126 : };
1127 :
1128 :
1129 : class BoyerMooreLookahead : public ZoneObject {
1130 : public:
1131 : BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1132 :
1133 : int length() { return length_; }
1134 : int max_char() { return max_char_; }
1135 : RegExpCompiler* compiler() { return compiler_; }
1136 :
1137 : int Count(int map_number) {
1138 653831 : return bitmaps_->at(map_number)->map_count();
1139 : }
1140 :
1141 167 : BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1142 :
1143 65072 : void Set(int map_number, int character) {
1144 130144 : if (character > max_char_) return;
1145 130144 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1146 : info->Set(character);
1147 : }
1148 :
1149 402942 : void SetInterval(int map_number, const Interval& interval) {
1150 402942 : if (interval.from() > max_char_) return;
1151 402942 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1152 201471 : if (interval.to() > max_char_) {
1153 0 : info->SetInterval(Interval(interval.from(), max_char_));
1154 : } else {
1155 201471 : info->SetInterval(interval);
1156 : }
1157 : }
1158 :
1159 5675 : void SetAll(int map_number) {
1160 11350 : bitmaps_->at(map_number)->SetAll();
1161 5675 : }
1162 :
1163 1145 : void SetRest(int from_map) {
1164 1145 : for (int i = from_map; i < length_; i++) SetAll(i);
1165 : }
1166 : void EmitSkipInstructions(RegExpMacroAssembler* masm);
1167 :
1168 : private:
1169 : // This is the value obtained by EatsAtLeast. If we do not have at least this
1170 : // many characters left in the sample string then the match is bound to fail.
1171 : // Therefore it is OK to read a character this far ahead of the current match
1172 : // point.
1173 : int length_;
1174 : RegExpCompiler* compiler_;
1175 : // 0xff for Latin1, 0xffff for UTF-16.
1176 : int max_char_;
1177 : ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1178 :
1179 : int GetSkipTable(int min_lookahead,
1180 : int max_lookahead,
1181 : Handle<ByteArray> boolean_skip_table);
1182 : bool FindWorthwhileInterval(int* from, int* to);
1183 : int FindBestInterval(
1184 : int max_number_of_chars, int old_biggest_points, int* from, int* to);
1185 : };
1186 :
1187 :
1188 : // There are many ways to generate code for a node. This class encapsulates
1189 : // the current way we should be generating. In other words it encapsulates
1190 : // the current state of the code generator. The effect of this is that we
1191 : // generate code for paths that the matcher can take through the regular
1192 : // expression. A given node in the regexp can be code-generated several times
1193 : // as it can be part of several traces. For example for the regexp:
1194 : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1195 : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1196 : // to match foo is generated only once (the traces have a common prefix). The
1197 : // code to store the capture is deferred and generated (twice) after the places
1198 : // where baz has been matched.
1199 : class Trace {
1200 : public:
1201 : // A value for a property that is either known to be true, know to be false,
1202 : // or not known.
1203 : enum TriBool {
1204 : UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1205 : };
1206 :
1207 : class DeferredAction {
1208 : public:
1209 : DeferredAction(ActionNode::ActionType action_type, int reg)
1210 309577 : : action_type_(action_type), reg_(reg), next_(nullptr) {}
1211 : DeferredAction* next() { return next_; }
1212 : bool Mentions(int reg);
1213 : int reg() { return reg_; }
1214 : ActionNode::ActionType action_type() { return action_type_; }
1215 : private:
1216 : ActionNode::ActionType action_type_;
1217 : int reg_;
1218 : DeferredAction* next_;
1219 : friend class Trace;
1220 : };
1221 :
1222 : class DeferredCapture : public DeferredAction {
1223 : public:
1224 292851 : DeferredCapture(int reg, bool is_capture, Trace* trace)
1225 : : DeferredAction(ActionNode::STORE_POSITION, reg),
1226 : cp_offset_(trace->cp_offset()),
1227 292851 : is_capture_(is_capture) { }
1228 : int cp_offset() { return cp_offset_; }
1229 : bool is_capture() { return is_capture_; }
1230 : private:
1231 : int cp_offset_;
1232 : bool is_capture_;
1233 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1234 : };
1235 :
1236 : class DeferredSetRegister : public DeferredAction {
1237 : public:
1238 : DeferredSetRegister(int reg, int value)
1239 : : DeferredAction(ActionNode::SET_REGISTER, reg),
1240 6013 : value_(value) { }
1241 : int value() { return value_; }
1242 : private:
1243 : int value_;
1244 : };
1245 :
1246 : class DeferredClearCaptures : public DeferredAction {
1247 : public:
1248 : explicit DeferredClearCaptures(Interval range)
1249 : : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1250 3803 : range_(range) { }
1251 : Interval range() { return range_; }
1252 : private:
1253 : Interval range_;
1254 : };
1255 :
1256 : class DeferredIncrementRegister : public DeferredAction {
1257 : public:
1258 : explicit DeferredIncrementRegister(int reg)
1259 : : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1260 : };
1261 :
1262 711490 : Trace()
1263 : : cp_offset_(0),
1264 : actions_(nullptr),
1265 : backtrack_(nullptr),
1266 : stop_node_(nullptr),
1267 : loop_label_(nullptr),
1268 : characters_preloaded_(0),
1269 : bound_checked_up_to_(0),
1270 : flush_budget_(100),
1271 1422980 : at_start_(UNKNOWN) {}
1272 :
1273 : // End the trace. This involves flushing the deferred actions in the trace
1274 : // and pushing a backtrack location onto the backtrack stack. Once this is
1275 : // done we can start a new trace or go to one that has already been
1276 : // generated.
1277 : void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1278 : int cp_offset() { return cp_offset_; }
1279 : DeferredAction* actions() { return actions_; }
1280 : // A trivial trace is one that has no deferred actions or other state that
1281 : // affects the assumptions used when generating code. There is no recorded
1282 : // backtrack location in a trivial trace, so with a trivial trace we will
1283 : // generate code that, on a failure to match, gets the backtrack location
1284 : // from the backtrack stack rather than using a direct jump instruction. We
1285 : // always start code generation with a trivial trace and non-trivial traces
1286 : // are created as we emit code for nodes or add to the list of deferred
1287 : // actions in the trace. The location of the code generated for a node using
1288 : // a trivial trace is recorded in a label in the node so that gotos can be
1289 : // generated to that code.
1290 2416014 : bool is_trivial() {
1291 3810184 : return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1292 2010312 : characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1293 4385510 : quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1294 : }
1295 : TriBool at_start() { return at_start_; }
1296 583270 : void set_at_start(TriBool at_start) { at_start_ = at_start; }
1297 : Label* backtrack() { return backtrack_; }
1298 : Label* loop_label() { return loop_label_; }
1299 : RegExpNode* stop_node() { return stop_node_; }
1300 : int characters_preloaded() { return characters_preloaded_; }
1301 : int bound_checked_up_to() { return bound_checked_up_to_; }
1302 : int flush_budget() { return flush_budget_; }
1303 : QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1304 : bool mentions_reg(int reg);
1305 : // Returns true if a deferred position store exists to the specified
1306 : // register and stores the offset in the out-parameter. Otherwise
1307 : // returns false.
1308 : bool GetStoredPosition(int reg, int* cp_offset);
1309 : // These set methods and AdvanceCurrentPositionInTrace should be used only on
1310 : // new traces - the intention is that traces are immutable after creation.
1311 : void add_action(DeferredAction* new_action) {
1312 : DCHECK(new_action->next_ == nullptr);
1313 309577 : new_action->next_ = actions_;
1314 309577 : actions_ = new_action;
1315 : }
1316 854423 : void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1317 14809 : void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1318 14809 : void set_loop_label(Label* label) { loop_label_ = label; }
1319 877736 : void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1320 474434 : void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1321 216902 : void set_flush_budget(int to) { flush_budget_ = to; }
1322 : void set_quick_check_performed(QuickCheckDetails* d) {
1323 224858 : quick_check_performed_ = *d;
1324 : }
1325 : void InvalidateCurrentCharacter();
1326 : void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1327 :
1328 : private:
1329 : int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1330 : void PerformDeferredActions(RegExpMacroAssembler* macro,
1331 : int max_register,
1332 : const OutSet& affected_registers,
1333 : OutSet* registers_to_pop,
1334 : OutSet* registers_to_clear,
1335 : Zone* zone);
1336 : void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1337 : int max_register,
1338 : const OutSet& registers_to_pop,
1339 : const OutSet& registers_to_clear);
1340 : int cp_offset_;
1341 : DeferredAction* actions_;
1342 : Label* backtrack_;
1343 : RegExpNode* stop_node_;
1344 : Label* loop_label_;
1345 : int characters_preloaded_;
1346 : int bound_checked_up_to_;
1347 : QuickCheckDetails quick_check_performed_;
1348 : int flush_budget_;
1349 : TriBool at_start_;
1350 : };
1351 :
1352 :
1353 : class GreedyLoopState {
1354 : public:
1355 : explicit GreedyLoopState(bool not_at_start);
1356 :
1357 : Label* label() { return &label_; }
1358 : Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1359 :
1360 : private:
1361 : Label label_;
1362 : Trace counter_backtrack_trace_;
1363 : };
1364 :
1365 :
1366 : struct PreloadState {
1367 : static const int kEatsAtLeastNotYetInitialized = -1;
1368 : bool preload_is_current_;
1369 : bool preload_has_checked_bounds_;
1370 : int preload_characters_;
1371 : int eats_at_least_;
1372 : void init() {
1373 245782 : eats_at_least_ = kEatsAtLeastNotYetInitialized;
1374 : }
1375 : };
1376 :
1377 :
1378 94032 : class NodeVisitor {
1379 : public:
1380 94032 : virtual ~NodeVisitor() { }
1381 : #define DECLARE_VISIT(Type) \
1382 : virtual void Visit##Type(Type##Node* that) = 0;
1383 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1384 : #undef DECLARE_VISIT
1385 0 : virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1386 : };
1387 :
1388 :
1389 : // Node visitor used to add the start set of the alternatives to the
1390 : // dispatch table of a choice node.
1391 66 : class DispatchTableConstructor: public NodeVisitor {
1392 : public:
1393 : DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1394 : Zone* zone)
1395 : : table_(table),
1396 : choice_index_(-1),
1397 : ignore_case_(ignore_case),
1398 66 : zone_(zone) { }
1399 :
1400 : void BuildTable(ChoiceNode* node);
1401 :
1402 696 : void AddRange(CharacterRange range) {
1403 1392 : table()->AddRange(range, choice_index_, zone_);
1404 : }
1405 :
1406 : void AddInverse(ZoneList<CharacterRange>* ranges);
1407 :
1408 : #define DECLARE_VISIT(Type) \
1409 : virtual void Visit##Type(Type##Node* that);
1410 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1411 : #undef DECLARE_VISIT
1412 :
1413 : DispatchTable* table() { return table_; }
1414 66 : void set_choice_index(int value) { choice_index_ = value; }
1415 :
1416 : protected:
1417 : DispatchTable* table_;
1418 : int choice_index_;
1419 : bool ignore_case_;
1420 : Zone* zone_;
1421 : };
1422 :
1423 :
1424 : // Assertion propagation moves information about assertions such as
1425 : // \b to the affected nodes. For instance, in /.\b./ information must
1426 : // be propagated to the first '.' that whatever follows needs to know
1427 : // if it matched a word or a non-word, and to the second '.' that it
1428 : // has to check if it succeeds a word or non-word. In this case the
1429 : // result will be something like:
1430 : //
1431 : // +-------+ +------------+
1432 : // | . | | . |
1433 : // +-------+ ---> +------------+
1434 : // | word? | | check word |
1435 : // +-------+ +------------+
1436 93966 : class Analysis: public NodeVisitor {
1437 : public:
1438 : Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1439 : : isolate_(isolate),
1440 : flags_(flags),
1441 : is_one_byte_(is_one_byte),
1442 93966 : error_message_(nullptr) {}
1443 : void EnsureAnalyzed(RegExpNode* node);
1444 :
1445 : #define DECLARE_VISIT(Type) \
1446 : virtual void Visit##Type(Type##Node* that);
1447 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1448 : #undef DECLARE_VISIT
1449 : virtual void VisitLoopChoice(LoopChoiceNode* that);
1450 :
1451 : bool has_failed() { return error_message_ != nullptr; }
1452 : const char* error_message() {
1453 : DCHECK(error_message_ != nullptr);
1454 : return error_message_;
1455 : }
1456 : void fail(const char* error_message) {
1457 448 : error_message_ = error_message;
1458 : }
1459 :
1460 : Isolate* isolate() const { return isolate_; }
1461 :
1462 : bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1463 : bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
1464 :
1465 : private:
1466 : Isolate* isolate_;
1467 : JSRegExp::Flags flags_;
1468 : bool is_one_byte_;
1469 : const char* error_message_;
1470 :
1471 : DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1472 : };
1473 :
1474 :
1475 : struct RegExpCompileData {
1476 : RegExpCompileData()
1477 : : tree(nullptr),
1478 : node(nullptr),
1479 : simple(true),
1480 : contains_anchor(false),
1481 795978 : capture_count(0) {}
1482 : RegExpTree* tree;
1483 : RegExpNode* node;
1484 : bool simple;
1485 : bool contains_anchor;
1486 : Handle<FixedArray> capture_name_map;
1487 : Handle<String> error;
1488 : int capture_count;
1489 : };
1490 :
1491 :
1492 : class RegExpEngine: public AllStatic {
1493 : public:
1494 : struct CompilationResult {
1495 : CompilationResult(Isolate* isolate, const char* error_message)
1496 : : error_message(error_message),
1497 458 : code(isolate->heap()->the_hole_value()),
1498 916 : num_registers(0) {}
1499 : CompilationResult(Object* code, int registers)
1500 93518 : : error_message(nullptr), code(code), num_registers(registers) {}
1501 : const char* error_message;
1502 : Object* code;
1503 : int num_registers;
1504 : };
1505 :
1506 : static CompilationResult Compile(Isolate* isolate, Zone* zone,
1507 : RegExpCompileData* input,
1508 : JSRegExp::Flags flags,
1509 : Handle<String> pattern,
1510 : Handle<String> sample_subject,
1511 : bool is_one_byte);
1512 :
1513 : static bool TooMuchRegExpCode(Handle<String> pattern);
1514 :
1515 : static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1516 : };
1517 :
1518 :
1519 : class RegExpResultsCache : public AllStatic {
1520 : public:
1521 : enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1522 :
1523 : // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1524 : // On success, the returned result is guaranteed to be a COW-array.
1525 : static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1526 : FixedArray** last_match_out, ResultsCacheType type);
1527 : // Attempt to add value_array to the cache specified by type. On success,
1528 : // value_array is turned into a COW-array.
1529 : static void Enter(Isolate* isolate, Handle<String> key_string,
1530 : Handle<Object> key_pattern, Handle<FixedArray> value_array,
1531 : Handle<FixedArray> last_match_cache, ResultsCacheType type);
1532 : static void Clear(FixedArray* cache);
1533 : static const int kRegExpResultsCacheSize = 0x100;
1534 :
1535 : private:
1536 : static const int kArrayEntriesPerCacheEntry = 4;
1537 : static const int kStringOffset = 0;
1538 : static const int kPatternOffset = 1;
1539 : static const int kArrayOffset = 2;
1540 : static const int kLastMatchOffset = 3;
1541 : };
1542 :
1543 : } // namespace internal
1544 : } // namespace v8
1545 :
1546 : #endif // V8_REGEXP_JSREGEXP_H_
|