Line data Source code
1 : // Copyright 2012 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_JSREGEXP_H_
6 : #define V8_REGEXP_JSREGEXP_H_
7 :
8 : #include "src/allocation.h"
9 : #include "src/assembler.h"
10 : #include "src/regexp/regexp-ast.h"
11 : #include "src/regexp/regexp-macro-assembler.h"
12 :
13 : namespace v8 {
14 : namespace internal {
15 :
16 : class NodeVisitor;
17 : class RegExpCompiler;
18 : class RegExpMacroAssembler;
19 : class RegExpNode;
20 : class RegExpTree;
21 : class BoyerMooreLookahead;
22 :
23 : class RegExpImpl {
24 : public:
25 : // Whether V8 is compiled with native regexp support or not.
26 : static bool UsesNativeRegExp() {
27 : #ifdef V8_INTERPRETED_REGEXP
28 : return false;
29 : #else
30 : return true;
31 : #endif
32 : }
33 :
34 : // Returns a string representation of a regular expression.
35 : // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
36 : // This function calls the garbage collector if necessary.
37 : static Handle<String> ToString(Handle<Object> value);
38 :
39 : // Parses the RegExp pattern and prepares the JSRegExp object with
40 : // generic data and choice of implementation - as well as what
41 : // the implementation wants to store in the data field.
42 : // Returns false if compilation fails.
43 : MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
44 : Handle<String> pattern,
45 : JSRegExp::Flags flags);
46 :
47 : // See ECMA-262 section 15.10.6.2.
48 : // This function calls the garbage collector if necessary.
49 : V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
50 : Handle<JSRegExp> regexp, Handle<String> subject, int index,
51 : Handle<RegExpMatchInfo> last_match_info);
52 :
53 : // Prepares a JSRegExp object with Irregexp-specific data.
54 : static void IrregexpInitialize(Handle<JSRegExp> re,
55 : Handle<String> pattern,
56 : JSRegExp::Flags flags,
57 : int capture_register_count);
58 :
59 :
60 : static void AtomCompile(Handle<JSRegExp> re,
61 : Handle<String> pattern,
62 : JSRegExp::Flags flags,
63 : Handle<String> match_pattern);
64 :
65 :
66 : static int AtomExecRaw(Handle<JSRegExp> regexp,
67 : Handle<String> subject,
68 : int index,
69 : int32_t* output,
70 : int output_size);
71 :
72 : static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
73 : Handle<String> subject, int index,
74 : Handle<RegExpMatchInfo> last_match_info);
75 :
76 : enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
77 :
78 : // Prepare a RegExp for being executed one or more times (using
79 : // IrregexpExecOnce) on the subject.
80 : // This ensures that the regexp is compiled for the subject, and that
81 : // the subject is flat.
82 : // Returns the number of integer spaces required by IrregexpExecOnce
83 : // as its "registers" argument. If the regexp cannot be compiled,
84 : // an exception is set as pending, and this function returns negative.
85 : static int IrregexpPrepare(Handle<JSRegExp> regexp,
86 : Handle<String> subject);
87 :
88 : // Execute a regular expression on the subject, starting from index.
89 : // If matching succeeds, return the number of matches. This can be larger
90 : // than one in the case of global regular expressions.
91 : // The captures and subcaptures are stored into the registers vector.
92 : // If matching fails, returns RE_FAILURE.
93 : // If execution fails, sets a pending exception and returns RE_EXCEPTION.
94 : static int IrregexpExecRaw(Handle<JSRegExp> regexp,
95 : Handle<String> subject,
96 : int index,
97 : int32_t* output,
98 : int output_size);
99 :
100 : // Execute an Irregexp bytecode pattern.
101 : // On a successful match, the result is a JSArray containing
102 : // captured positions. On a failure, the result is the null value.
103 : // Returns an empty handle in case of an exception.
104 : MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
105 : Handle<JSRegExp> regexp, Handle<String> subject, int index,
106 : Handle<RegExpMatchInfo> last_match_info);
107 :
108 : // Set last match info. If match is NULL, then setting captures is omitted.
109 : static Handle<RegExpMatchInfo> SetLastMatchInfo(
110 : Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
111 : int capture_count, int32_t* match);
112 :
113 : class GlobalCache {
114 : public:
115 : GlobalCache(Handle<JSRegExp> regexp,
116 : Handle<String> subject,
117 : Isolate* isolate);
118 :
119 : INLINE(~GlobalCache());
120 :
121 : // Fetch the next entry in the cache for global regexp match results.
122 : // This does not set the last match info. Upon failure, NULL is returned.
123 : // The cause can be checked with Result(). The previous
124 : // result is still in available in memory when a failure happens.
125 : INLINE(int32_t* FetchNext());
126 :
127 : INLINE(int32_t* LastSuccessfulMatch());
128 :
129 : INLINE(bool HasException()) { return num_matches_ < 0; }
130 :
131 : private:
132 : int AdvanceZeroLength(int last_index);
133 :
134 : int num_matches_;
135 : int max_matches_;
136 : int current_match_index_;
137 : int registers_per_match_;
138 : // Pointer to the last set of captures.
139 : int32_t* register_array_;
140 : int register_array_size_;
141 : Handle<JSRegExp> regexp_;
142 : Handle<String> subject_;
143 : };
144 :
145 : // For acting on the JSRegExp data FixedArray.
146 : static int IrregexpMaxRegisterCount(FixedArray* re);
147 : static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
148 : static void SetIrregexpCaptureNameMap(FixedArray* re,
149 : Handle<FixedArray> value);
150 : static int IrregexpNumberOfCaptures(FixedArray* re);
151 : static int IrregexpNumberOfRegisters(FixedArray* re);
152 : static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
153 : static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
154 :
155 : // Limit the space regexps take up on the heap. In order to limit this we
156 : // would like to keep track of the amount of regexp code on the heap. This
157 : // is not tracked, however. As a conservative approximation we track the
158 : // total regexp code compiled including code that has subsequently been freed
159 : // and the total executable memory at any point.
160 : static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
161 : static const size_t kRegExpCompiledLimit = 1 * MB;
162 : static const int kRegExpTooLargeToOptimize = 20 * KB;
163 :
164 : private:
165 : static bool CompileIrregexp(Handle<JSRegExp> re,
166 : Handle<String> sample_subject, bool is_one_byte);
167 : static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
168 : Handle<String> sample_subject,
169 : bool is_one_byte);
170 : };
171 :
172 :
173 : // Represents the location of one element relative to the intersection of
174 : // two sets. Corresponds to the four areas of a Venn diagram.
175 : enum ElementInSetsRelation {
176 : kInsideNone = 0,
177 : kInsideFirst = 1,
178 : kInsideSecond = 2,
179 : kInsideBoth = 3
180 : };
181 :
182 :
183 : // A set of unsigned integers that behaves especially well on small
184 : // integers (< 32). May do zone-allocation.
185 : class OutSet: public ZoneObject {
186 : public:
187 1193775 : OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
188 : OutSet* Extend(unsigned value, Zone* zone);
189 : bool Get(unsigned value) const;
190 : static const unsigned kFirstLimit = 32;
191 :
192 : private:
193 : // Destructively set a value in this set. In most cases you want
194 : // to use Extend instead to ensure that only one instance exists
195 : // that contains the same values.
196 : void Set(unsigned value, Zone* zone);
197 :
198 : // The successors are a list of sets that contain the same values
199 : // as this set and the one more value that is not present in this
200 : // set.
201 : ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
202 :
203 : OutSet(uint32_t first, ZoneList<unsigned>* remaining)
204 21069 : : first_(first), remaining_(remaining), successors_(NULL) { }
205 : uint32_t first_;
206 : ZoneList<unsigned>* remaining_;
207 : ZoneList<OutSet*>* successors_;
208 : friend class Trace;
209 : };
210 :
211 :
212 : // A mapping from integers, specified as ranges, to a set of integers.
213 : // Used for mapping character ranges to choices.
214 : class DispatchTable : public ZoneObject {
215 : public:
216 : explicit DispatchTable(Zone* zone) : tree_(zone) { }
217 :
218 : class Entry {
219 : public:
220 : Entry() : from_(0), to_(0), out_set_(NULL) { }
221 : Entry(uc32 from, uc32 to, OutSet* out_set)
222 : : from_(from), to_(to), out_set_(out_set) {
223 : DCHECK(from <= to);
224 : }
225 : uc32 from() { return from_; }
226 : uc32 to() { return to_; }
227 6467 : void set_to(uc32 value) { to_ = value; }
228 : void AddValue(int value, Zone* zone) {
229 93808 : out_set_ = out_set_->Extend(value, zone);
230 : }
231 : OutSet* out_set() { return out_set_; }
232 : private:
233 : uc32 from_;
234 : uc32 to_;
235 : OutSet* out_set_;
236 : };
237 :
238 : class Config {
239 : public:
240 : typedef uc32 Key;
241 : typedef Entry Value;
242 : static const uc32 kNoKey;
243 : static const Entry NoValue() { return Value(); }
244 : static inline int Compare(uc32 a, uc32 b) {
245 1861086 : if (a == b)
246 : return 0;
247 1816904 : else if (a < b)
248 : return -1;
249 : else
250 : return 1;
251 : }
252 : };
253 :
254 : void AddRange(CharacterRange range, int value, Zone* zone);
255 : OutSet* Get(uc32 value);
256 : void Dump();
257 :
258 : template <typename Callback>
259 : void ForEach(Callback* callback) {
260 3163 : return tree()->ForEach(callback);
261 : }
262 :
263 : private:
264 : // There can't be a static empty set since it allocates its
265 : // successors in a zone and caches them.
266 : OutSet* empty() { return &empty_; }
267 : OutSet empty_;
268 : ZoneSplayTree<Config>* tree() { return &tree_; }
269 : ZoneSplayTree<Config> tree_;
270 : };
271 :
272 :
273 : // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
274 : class UnicodeRangeSplitter {
275 : public:
276 : UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
277 : void Call(uc32 from, DispatchTable::Entry entry);
278 :
279 : ZoneList<CharacterRange>* bmp() { return bmp_; }
280 : ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
281 : ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
282 : ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
283 :
284 : private:
285 : static const int kBase = 0;
286 : // Separate ranges into
287 : static const int kBmpCodePoints = 1;
288 : static const int kLeadSurrogates = 2;
289 : static const int kTrailSurrogates = 3;
290 : static const int kNonBmpCodePoints = 4;
291 :
292 : Zone* zone_;
293 : DispatchTable table_;
294 : ZoneList<CharacterRange>* bmp_;
295 : ZoneList<CharacterRange>* lead_surrogates_;
296 : ZoneList<CharacterRange>* trail_surrogates_;
297 : ZoneList<CharacterRange>* non_bmp_;
298 : };
299 :
300 :
301 : #define FOR_EACH_NODE_TYPE(VISIT) \
302 : VISIT(End) \
303 : VISIT(Action) \
304 : VISIT(Choice) \
305 : VISIT(BackReference) \
306 : VISIT(Assertion) \
307 : VISIT(Text)
308 :
309 :
310 : class Trace;
311 : struct PreloadState;
312 : class GreedyLoopState;
313 : class AlternativeGenerationList;
314 :
315 : struct NodeInfo {
316 : NodeInfo()
317 : : being_analyzed(false),
318 : been_analyzed(false),
319 : follows_word_interest(false),
320 : follows_newline_interest(false),
321 : follows_start_interest(false),
322 : at_end(false),
323 : visited(false),
324 6915745 : replacement_calculated(false) { }
325 :
326 : // Returns true if the interests and assumptions of this node
327 : // matches the given one.
328 : bool Matches(NodeInfo* that) {
329 : return (at_end == that->at_end) &&
330 : (follows_word_interest == that->follows_word_interest) &&
331 : (follows_newline_interest == that->follows_newline_interest) &&
332 : (follows_start_interest == that->follows_start_interest);
333 : }
334 :
335 : // Updates the interests of this node given the interests of the
336 : // node preceding it.
337 : void AddFromPreceding(NodeInfo* that) {
338 : at_end |= that->at_end;
339 : follows_word_interest |= that->follows_word_interest;
340 : follows_newline_interest |= that->follows_newline_interest;
341 : follows_start_interest |= that->follows_start_interest;
342 : }
343 :
344 : bool HasLookbehind() {
345 : return follows_word_interest ||
346 : follows_newline_interest ||
347 : follows_start_interest;
348 : }
349 :
350 : // Sets the interests of this node to include the interests of the
351 : // following node.
352 : void AddFromFollowing(NodeInfo* that) {
353 662904 : follows_word_interest |= that->follows_word_interest;
354 662904 : follows_newline_interest |= that->follows_newline_interest;
355 662904 : follows_start_interest |= that->follows_start_interest;
356 : }
357 :
358 : void ResetCompilationState() {
359 : being_analyzed = false;
360 : been_analyzed = false;
361 : }
362 :
363 : bool being_analyzed: 1;
364 : bool been_analyzed: 1;
365 :
366 : // These bits are set of this node has to know what the preceding
367 : // character was.
368 : bool follows_word_interest: 1;
369 : bool follows_newline_interest: 1;
370 : bool follows_start_interest: 1;
371 :
372 : bool at_end: 1;
373 : bool visited: 1;
374 : bool replacement_calculated: 1;
375 : };
376 :
377 :
378 : // Details of a quick mask-compare check that can look ahead in the
379 : // input stream.
380 : class QuickCheckDetails {
381 : public:
382 : QuickCheckDetails()
383 : : characters_(0),
384 : mask_(0),
385 : value_(0),
386 17344640 : cannot_match_(false) { }
387 : explicit QuickCheckDetails(int characters)
388 : : characters_(characters),
389 : mask_(0),
390 : value_(0),
391 995545 : cannot_match_(false) { }
392 : bool Rationalize(bool one_byte);
393 : // Merge in the information from another branch of an alternation.
394 : void Merge(QuickCheckDetails* other, int from_index);
395 : // Advance the current position by some amount.
396 : void Advance(int by, bool one_byte);
397 : void Clear();
398 : bool cannot_match() { return cannot_match_; }
399 578 : void set_cannot_match() { cannot_match_ = true; }
400 : struct Position {
401 14672148 : Position() : mask(0), value(0), determines_perfectly(false) { }
402 : uc16 mask;
403 : uc16 value;
404 : bool determines_perfectly;
405 : };
406 : int characters() { return characters_; }
407 730201 : void set_characters(int characters) { characters_ = characters; }
408 : Position* positions(int index) {
409 : DCHECK(index >= 0);
410 : DCHECK(index < characters_);
411 2066721 : return positions_ + index;
412 : }
413 : uint32_t mask() { return mask_; }
414 : uint32_t value() { return value_; }
415 :
416 : private:
417 : // How many characters do we have quick check information from. This is
418 : // the same for all branches of a choice node.
419 : int characters_;
420 : Position positions_[4];
421 : // These values are the condensate of the above array after Rationalize().
422 : uint32_t mask_;
423 : uint32_t value_;
424 : // If set to true, there is no way this quick check can match at all.
425 : // E.g., if it requires to be at the start of the input, and isn't.
426 : bool cannot_match_;
427 : };
428 :
429 :
430 : extern int kUninitializedRegExpNodePlaceHolder;
431 :
432 :
433 : class RegExpNode: public ZoneObject {
434 : public:
435 : explicit RegExpNode(Zone* zone)
436 20747235 : : replacement_(NULL), on_work_list_(false), trace_count_(0), zone_(zone) {
437 6915745 : bm_info_[0] = bm_info_[1] = NULL;
438 : }
439 : virtual ~RegExpNode();
440 : virtual void Accept(NodeVisitor* visitor) = 0;
441 : // Generates a goto to this node or actually generates the code at this point.
442 : virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
443 : // How many characters must this node consume at a minimum in order to
444 : // succeed. If we have found at least 'still_to_find' characters that
445 : // must be consumed there is no need to ask any following nodes whether
446 : // they are sure to eat any more characters. The not_at_start argument is
447 : // used to indicate that we know we are not at the start of the input. In
448 : // this case anchored branches will always fail and can be ignored when
449 : // determining how many characters are consumed on success.
450 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
451 : // Emits some quick code that checks whether the preloaded characters match.
452 : // Falls through on certain failure, jumps to the label on possible success.
453 : // If the node cannot make a quick check it does nothing and returns false.
454 : bool EmitQuickCheck(RegExpCompiler* compiler,
455 : Trace* bounds_check_trace,
456 : Trace* trace,
457 : bool preload_has_checked_bounds,
458 : Label* on_possible_success,
459 : QuickCheckDetails* details_return,
460 : bool fall_through_on_failure);
461 : // For a given number of characters this returns a mask and a value. The
462 : // next n characters are anded with the mask and compared with the value.
463 : // A comparison failure indicates the node cannot match the next n characters.
464 : // A comparison success indicates the node may match.
465 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
466 : RegExpCompiler* compiler,
467 : int characters_filled_in,
468 : bool not_at_start) = 0;
469 : static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
470 115701 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
471 : // Only returns the successor for a text node of length 1 that matches any
472 : // character and that has no guards on it.
473 111129 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
474 : RegExpCompiler* compiler) {
475 111129 : return NULL;
476 : }
477 :
478 : // Collects information on the possible code units (mod 128) that can match if
479 : // we look forward. This is used for a Boyer-Moore-like string searching
480 : // implementation. TODO(erikcorry): This should share more code with
481 : // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
482 : // the number of nodes we are willing to look at in order to create this data.
483 : static const int kRecursionBudget = 200;
484 : bool KeepRecursing(RegExpCompiler* compiler);
485 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
486 : BoyerMooreLookahead* bm, bool not_at_start) {
487 0 : UNREACHABLE();
488 : }
489 :
490 : // If we know that the input is one-byte then there are some nodes that can
491 : // never match. This method returns a node that can be substituted for
492 : // itself, or NULL if the node can never match.
493 20163 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
494 20163 : return this;
495 : }
496 : // Helper for FilterOneByte.
497 : RegExpNode* replacement() {
498 : DCHECK(info()->replacement_calculated);
499 : return replacement_;
500 : }
501 : RegExpNode* set_replacement(RegExpNode* replacement) {
502 245975 : info()->replacement_calculated = true;
503 245975 : replacement_ = replacement;
504 : return replacement; // For convenience.
505 : }
506 :
507 : // We want to avoid recalculating the lookahead info, so we store it on the
508 : // node. Only info that is for this node is stored. We can tell that the
509 : // info is for this node when offset == 0, so the information is calculated
510 : // relative to this node.
511 : void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
512 139634 : if (offset == 0) set_bm_info(not_at_start, bm);
513 : }
514 :
515 : Label* label() { return &label_; }
516 : // If non-generic code is generated for a node (i.e. the node is not at the
517 : // start of the trace) then it cannot be reused. This variable sets a limit
518 : // on how often we allow that to happen before we insist on starting a new
519 : // trace and generating generic code for a node that can be reused by flushing
520 : // the deferred actions in the current trace and generating a goto.
521 : static const int kMaxCopiesCodeGenerated = 10;
522 :
523 : bool on_work_list() { return on_work_list_; }
524 465788 : void set_on_work_list(bool value) { on_work_list_ = value; }
525 :
526 : NodeInfo* info() { return &info_; }
527 :
528 : BoyerMooreLookahead* bm_info(bool not_at_start) {
529 87770 : return bm_info_[not_at_start ? 1 : 0];
530 : }
531 :
532 : Zone* zone() const { return zone_; }
533 :
534 : protected:
535 : enum LimitResult { DONE, CONTINUE };
536 : RegExpNode* replacement_;
537 :
538 : LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
539 :
540 : void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
541 200222 : bm_info_[not_at_start ? 1 : 0] = bm;
542 : }
543 :
544 : private:
545 : static const int kFirstCharBudget = 10;
546 : Label label_;
547 : bool on_work_list_;
548 : NodeInfo info_;
549 : // This variable keeps track of how many times code has been generated for
550 : // this node (in different traces). We don't keep track of where the
551 : // generated code is located unless the code is generated at the start of
552 : // a trace, in which case it is generic and can be reused by flushing the
553 : // deferred operations in the current trace and generating a goto.
554 : int trace_count_;
555 : BoyerMooreLookahead* bm_info_[2];
556 :
557 : Zone* zone_;
558 : };
559 :
560 :
561 0 : class SeqRegExpNode: public RegExpNode {
562 : public:
563 5169900 : explicit SeqRegExpNode(RegExpNode* on_success)
564 5169900 : : RegExpNode(on_success->zone()), on_success_(on_success) { }
565 : RegExpNode* on_success() { return on_success_; }
566 : void set_on_success(RegExpNode* node) { on_success_ = node; }
567 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
568 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
569 : BoyerMooreLookahead* bm, bool not_at_start) {
570 0 : on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
571 0 : if (offset == 0) set_bm_info(not_at_start, bm);
572 0 : }
573 :
574 : protected:
575 : RegExpNode* FilterSuccessor(int depth, bool ignore_case);
576 :
577 : private:
578 : RegExpNode* on_success_;
579 : };
580 :
581 :
582 0 : class ActionNode: public SeqRegExpNode {
583 : public:
584 : enum ActionType {
585 : SET_REGISTER,
586 : INCREMENT_REGISTER,
587 : STORE_POSITION,
588 : BEGIN_SUBMATCH,
589 : POSITIVE_SUBMATCH_SUCCESS,
590 : EMPTY_MATCH_CHECK,
591 : CLEAR_CAPTURES
592 : };
593 : static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
594 : static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
595 : static ActionNode* StorePosition(int reg,
596 : bool is_capture,
597 : RegExpNode* on_success);
598 : static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
599 : static ActionNode* BeginSubmatch(int stack_pointer_reg,
600 : int position_reg,
601 : RegExpNode* on_success);
602 : static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
603 : int restore_reg,
604 : int clear_capture_count,
605 : int clear_capture_from,
606 : RegExpNode* on_success);
607 : static ActionNode* EmptyMatchCheck(int start_register,
608 : int repetition_register,
609 : int repetition_limit,
610 : RegExpNode* on_success);
611 : virtual void Accept(NodeVisitor* visitor);
612 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
613 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
614 80720 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
615 : RegExpCompiler* compiler,
616 : int filled_in,
617 : bool not_at_start) {
618 80720 : return on_success()->GetQuickCheckDetails(
619 80720 : details, compiler, filled_in, not_at_start);
620 : }
621 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
622 : BoyerMooreLookahead* bm, bool not_at_start);
623 : ActionType action_type() { return action_type_; }
624 : // TODO(erikcorry): We should allow some action nodes in greedy loops.
625 137100 : virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
626 :
627 : private:
628 : union {
629 : struct {
630 : int reg;
631 : int value;
632 : } u_store_register;
633 : struct {
634 : int reg;
635 : } u_increment_register;
636 : struct {
637 : int reg;
638 : bool is_capture;
639 : } u_position_register;
640 : struct {
641 : int stack_pointer_register;
642 : int current_position_register;
643 : int clear_register_count;
644 : int clear_register_from;
645 : } u_submatch;
646 : struct {
647 : int start_register;
648 : int repetition_register;
649 : int repetition_limit;
650 : } u_empty_match_check;
651 : struct {
652 : int range_from;
653 : int range_to;
654 : } u_clear_captures;
655 : } data_;
656 : ActionNode(ActionType action_type, RegExpNode* on_success)
657 : : SeqRegExpNode(on_success),
658 3295122 : action_type_(action_type) { }
659 : ActionType action_type_;
660 : friend class DotPrinter;
661 : };
662 :
663 :
664 0 : class TextNode: public SeqRegExpNode {
665 : public:
666 : TextNode(ZoneList<TextElement>* elms, bool read_backward,
667 : RegExpNode* on_success)
668 1665932 : : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
669 196167 : TextNode(RegExpCharacterClass* that, bool read_backward,
670 : RegExpNode* on_success)
671 : : SeqRegExpNode(on_success),
672 392334 : elms_(new (zone()) ZoneList<TextElement>(1, zone())),
673 392334 : read_backward_(read_backward) {
674 392334 : elms_->Add(TextElement::CharClass(that), zone());
675 196167 : }
676 : // Create TextNode for a single character class for the given ranges.
677 : static TextNode* CreateForCharacterRanges(Zone* zone,
678 : ZoneList<CharacterRange>* ranges,
679 : bool read_backward,
680 : RegExpNode* on_success);
681 : // Create TextNode for a surrogate pair with a range given for the
682 : // lead and the trail surrogate each.
683 : static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
684 : CharacterRange trail,
685 : bool read_backward,
686 : RegExpNode* on_success);
687 : virtual void Accept(NodeVisitor* visitor);
688 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
689 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
690 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
691 : RegExpCompiler* compiler,
692 : int characters_filled_in,
693 : bool not_at_start);
694 : ZoneList<TextElement>* elements() { return elms_; }
695 : bool read_backward() { return read_backward_; }
696 : void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
697 : virtual int GreedyLoopTextLength();
698 : virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
699 : RegExpCompiler* compiler);
700 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
701 : BoyerMooreLookahead* bm, bool not_at_start);
702 : void CalculateOffsets();
703 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
704 :
705 : private:
706 : enum TextEmitPassType {
707 : NON_LATIN1_MATCH, // Check for characters that can't match.
708 : SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
709 : NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
710 : CASE_CHARACTER_MATCH, // Case-independent single character check.
711 : CHARACTER_CLASS_MATCH // Character class.
712 : };
713 : static bool SkipPass(int pass, bool ignore_case);
714 : static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
715 : static const int kLastPass = CHARACTER_CLASS_MATCH;
716 : void TextEmitPass(RegExpCompiler* compiler,
717 : TextEmitPassType pass,
718 : bool preloaded,
719 : Trace* trace,
720 : bool first_element_checked,
721 : int* checked_up_to);
722 : int Length();
723 : ZoneList<TextElement>* elms_;
724 : bool read_backward_;
725 : };
726 :
727 :
728 0 : class AssertionNode: public SeqRegExpNode {
729 : public:
730 : enum AssertionType {
731 : AT_END,
732 : AT_START,
733 : AT_BOUNDARY,
734 : AT_NON_BOUNDARY,
735 : AFTER_NEWLINE
736 : };
737 3809 : static AssertionNode* AtEnd(RegExpNode* on_success) {
738 3809 : return new(on_success->zone()) AssertionNode(AT_END, on_success);
739 : }
740 4673 : static AssertionNode* AtStart(RegExpNode* on_success) {
741 4673 : return new(on_success->zone()) AssertionNode(AT_START, on_success);
742 : }
743 167 : static AssertionNode* AtBoundary(RegExpNode* on_success) {
744 167 : return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
745 : }
746 143 : static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
747 143 : return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
748 : }
749 797 : static AssertionNode* AfterNewline(RegExpNode* on_success) {
750 797 : return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
751 : }
752 : virtual void Accept(NodeVisitor* visitor);
753 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
754 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
755 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
756 : RegExpCompiler* compiler,
757 : int filled_in,
758 : bool not_at_start);
759 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
760 : BoyerMooreLookahead* bm, bool not_at_start);
761 : AssertionType assertion_type() { return assertion_type_; }
762 :
763 : private:
764 : void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
765 : enum IfPrevious { kIsNonWord, kIsWord };
766 : void BacktrackIfPrevious(RegExpCompiler* compiler,
767 : Trace* trace,
768 : IfPrevious backtrack_if_previous);
769 : AssertionNode(AssertionType t, RegExpNode* on_success)
770 9589 : : SeqRegExpNode(on_success), assertion_type_(t) { }
771 : AssertionType assertion_type_;
772 : };
773 :
774 :
775 0 : class BackReferenceNode: public SeqRegExpNode {
776 : public:
777 : BackReferenceNode(int start_reg, int end_reg, bool read_backward,
778 : RegExpNode* on_success)
779 : : SeqRegExpNode(on_success),
780 : start_reg_(start_reg),
781 : end_reg_(end_reg),
782 3090 : read_backward_(read_backward) {}
783 : virtual void Accept(NodeVisitor* visitor);
784 : int start_register() { return start_reg_; }
785 : int end_register() { return end_reg_; }
786 : bool read_backward() { return read_backward_; }
787 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
788 : virtual int EatsAtLeast(int still_to_find,
789 : int recursion_depth,
790 : bool not_at_start);
791 726 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
792 : RegExpCompiler* compiler,
793 : int characters_filled_in,
794 : bool not_at_start) {
795 726 : return;
796 : }
797 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
798 : BoyerMooreLookahead* bm, bool not_at_start);
799 :
800 : private:
801 : int start_reg_;
802 : int end_reg_;
803 : bool read_backward_;
804 : };
805 :
806 :
807 0 : class EndNode: public RegExpNode {
808 : public:
809 : enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
810 96493 : EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
811 : virtual void Accept(NodeVisitor* visitor);
812 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
813 135207 : virtual int EatsAtLeast(int still_to_find,
814 : int recursion_depth,
815 135207 : bool not_at_start) { return 0; }
816 0 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
817 : RegExpCompiler* compiler,
818 : int characters_filled_in,
819 : bool not_at_start) {
820 : // Returning 0 from EatsAtLeast should ensure we never get here.
821 0 : UNREACHABLE();
822 : }
823 0 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
824 : BoyerMooreLookahead* bm, bool not_at_start) {
825 : // Returning 0 from EatsAtLeast should ensure we never get here.
826 0 : UNREACHABLE();
827 : }
828 :
829 : private:
830 : Action action_;
831 : };
832 :
833 :
834 0 : class NegativeSubmatchSuccess: public EndNode {
835 : public:
836 : NegativeSubmatchSuccess(int stack_pointer_reg,
837 : int position_reg,
838 : int clear_capture_count,
839 : int clear_capture_start,
840 : Zone* zone)
841 : : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
842 : stack_pointer_register_(stack_pointer_reg),
843 : current_position_register_(position_reg),
844 : clear_capture_count_(clear_capture_count),
845 3515 : clear_capture_start_(clear_capture_start) { }
846 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
847 :
848 : private:
849 : int stack_pointer_register_;
850 : int current_position_register_;
851 : int clear_capture_count_;
852 : int clear_capture_start_;
853 : };
854 :
855 :
856 : class Guard: public ZoneObject {
857 : public:
858 : enum Relation { LT, GEQ };
859 : Guard(int reg, Relation op, int value)
860 : : reg_(reg),
861 : op_(op),
862 1507750 : value_(value) { }
863 : int reg() { return reg_; }
864 : Relation op() { return op_; }
865 : int value() { return value_; }
866 :
867 : private:
868 : int reg_;
869 : Relation op_;
870 : int value_;
871 : };
872 :
873 :
874 : class GuardedAlternative {
875 : public:
876 3228627 : explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
877 : void AddGuard(Guard* guard, Zone* zone);
878 : RegExpNode* node() { return node_; }
879 86141 : void set_node(RegExpNode* node) { node_ = node; }
880 : ZoneList<Guard*>* guards() { return guards_; }
881 :
882 : private:
883 : RegExpNode* node_;
884 : ZoneList<Guard*>* guards_;
885 : };
886 :
887 :
888 : class AlternativeGeneration;
889 :
890 :
891 0 : class ChoiceNode: public RegExpNode {
892 : public:
893 1649352 : explicit ChoiceNode(int expected_size, Zone* zone)
894 : : RegExpNode(zone),
895 : alternatives_(new(zone)
896 1649352 : ZoneList<GuardedAlternative>(expected_size, zone)),
897 : table_(NULL),
898 : not_at_start_(false),
899 3298704 : being_calculated_(false) { }
900 : virtual void Accept(NodeVisitor* visitor);
901 3377266 : void AddAlternative(GuardedAlternative node) {
902 3377266 : alternatives()->Add(node, zone());
903 : }
904 : ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
905 : DispatchTable* GetTable(bool ignore_case);
906 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
907 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
908 : int EatsAtLeastHelper(int still_to_find,
909 : int budget,
910 : RegExpNode* ignore_this_node,
911 : bool not_at_start);
912 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
913 : RegExpCompiler* compiler,
914 : int characters_filled_in,
915 : bool not_at_start);
916 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
917 : BoyerMooreLookahead* bm, bool not_at_start);
918 :
919 : bool being_calculated() { return being_calculated_; }
920 : bool not_at_start() { return not_at_start_; }
921 7690 : void set_not_at_start() { not_at_start_ = true; }
922 0 : void set_being_calculated(bool b) { being_calculated_ = b; }
923 587302 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
924 587302 : return true;
925 : }
926 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
927 0 : virtual bool read_backward() { return false; }
928 :
929 : protected:
930 : int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
931 : ZoneList<GuardedAlternative>* alternatives_;
932 :
933 : private:
934 : friend class DispatchTableConstructor;
935 : friend class Analysis;
936 : void GenerateGuard(RegExpMacroAssembler* macro_assembler,
937 : Guard* guard,
938 : Trace* trace);
939 : int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
940 : void EmitOutOfLineContinuation(RegExpCompiler* compiler,
941 : Trace* trace,
942 : GuardedAlternative alternative,
943 : AlternativeGeneration* alt_gen,
944 : int preload_characters,
945 : bool next_expects_preload);
946 : void SetUpPreLoad(RegExpCompiler* compiler,
947 : Trace* current_trace,
948 : PreloadState* preloads);
949 : void AssertGuardsMentionRegisters(Trace* trace);
950 : int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
951 : Trace* EmitGreedyLoop(RegExpCompiler* compiler,
952 : Trace* trace,
953 : AlternativeGenerationList* alt_gens,
954 : PreloadState* preloads,
955 : GreedyLoopState* greedy_loop_state,
956 : int text_length);
957 : void EmitChoices(RegExpCompiler* compiler,
958 : AlternativeGenerationList* alt_gens,
959 : int first_choice,
960 : Trace* trace,
961 : PreloadState* preloads);
962 : DispatchTable* table_;
963 : // If true, this node is never checked at the start of the input.
964 : // Allows a new trace to start with at_start() set to false.
965 : bool not_at_start_;
966 : bool being_calculated_;
967 : };
968 :
969 :
970 0 : class NegativeLookaroundChoiceNode : public ChoiceNode {
971 : public:
972 3515 : explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
973 : GuardedAlternative then_do_this,
974 : Zone* zone)
975 3515 : : ChoiceNode(2, zone) {
976 : AddAlternative(this_must_fail);
977 : AddAlternative(then_do_this);
978 3515 : }
979 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
980 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
981 : RegExpCompiler* compiler,
982 : int characters_filled_in,
983 : bool not_at_start);
984 1423 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
985 : BoyerMooreLookahead* bm, bool not_at_start) {
986 2846 : alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
987 1423 : not_at_start);
988 1423 : if (offset == 0) set_bm_info(not_at_start, bm);
989 1423 : }
990 : // For a negative lookahead we don't emit the quick check for the
991 : // alternative that is expected to fail. This is because quick check code
992 : // starts by loading enough characters for the alternative that takes fewest
993 : // characters, but on a negative lookahead the negative branch did not take
994 : // part in that calculation (EatsAtLeast) so the assumptions don't hold.
995 7002 : virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
996 7002 : return !is_first;
997 : }
998 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
999 : };
1000 :
1001 :
1002 0 : class LoopChoiceNode: public ChoiceNode {
1003 : public:
1004 : LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1005 : : ChoiceNode(2, zone),
1006 : loop_node_(NULL),
1007 : continue_node_(NULL),
1008 : body_can_be_zero_length_(body_can_be_zero_length),
1009 1612556 : read_backward_(read_backward) {}
1010 : void AddLoopAlternative(GuardedAlternative alt);
1011 : void AddContinueAlternative(GuardedAlternative alt);
1012 : virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1013 : virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1014 : virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1015 : RegExpCompiler* compiler,
1016 : int characters_filled_in,
1017 : bool not_at_start);
1018 : virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1019 : BoyerMooreLookahead* bm, bool not_at_start);
1020 : RegExpNode* loop_node() { return loop_node_; }
1021 : RegExpNode* continue_node() { return continue_node_; }
1022 : bool body_can_be_zero_length() { return body_can_be_zero_length_; }
1023 32080 : virtual bool read_backward() { return read_backward_; }
1024 : virtual void Accept(NodeVisitor* visitor);
1025 : virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1026 :
1027 : private:
1028 : // AddAlternative is made private for loop nodes because alternatives
1029 : // should not be added freely, we need to keep track of which node
1030 : // goes back to the node itself.
1031 3225112 : void AddAlternative(GuardedAlternative node) {
1032 : ChoiceNode::AddAlternative(node);
1033 3225112 : }
1034 :
1035 : RegExpNode* loop_node_;
1036 : RegExpNode* continue_node_;
1037 : bool body_can_be_zero_length_;
1038 : bool read_backward_;
1039 : };
1040 :
1041 :
1042 : // Improve the speed that we scan for an initial point where a non-anchored
1043 : // regexp can match by using a Boyer-Moore-like table. This is done by
1044 : // identifying non-greedy non-capturing loops in the nodes that eat any
1045 : // character one at a time. For example in the middle of the regexp
1046 : // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1047 : // inserted at the start of any non-anchored regexp.
1048 : //
1049 : // When we have found such a loop we look ahead in the nodes to find the set of
1050 : // characters that can come at given distances. For example for the regexp
1051 : // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1052 : // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1053 : // the lookahead info where the set of characters is reasonably constrained. In
1054 : // our example this is from index 1 to 2 (0 is not constrained). We can now
1055 : // look 3 characters ahead and if we don't find one of [f, o] (the union of
1056 : // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1057 : //
1058 : // For Unicode input strings we do the same, but modulo 128.
1059 : //
1060 : // We also look at the first string fed to the regexp and use that to get a hint
1061 : // of the character frequencies in the inputs. This affects the assessment of
1062 : // whether the set of characters is 'reasonably constrained'.
1063 : //
1064 : // We also have another lookahead mechanism (called quick check in the code),
1065 : // which uses a wide load of multiple characters followed by a mask and compare
1066 : // to determine whether a match is possible at this point.
1067 : enum ContainedInLattice {
1068 : kNotYet = 0,
1069 : kLatticeIn = 1,
1070 : kLatticeOut = 2,
1071 : kLatticeUnknown = 3 // Can also mean both in and out.
1072 : };
1073 :
1074 :
1075 : inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1076 861889 : return static_cast<ContainedInLattice>(a | b);
1077 : }
1078 :
1079 :
1080 : ContainedInLattice AddRange(ContainedInLattice a,
1081 : const int* ranges,
1082 : int ranges_size,
1083 : Interval new_range);
1084 :
1085 :
1086 : class BoyerMoorePositionInfo : public ZoneObject {
1087 : public:
1088 113434 : explicit BoyerMoorePositionInfo(Zone* zone)
1089 : : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1090 : map_count_(0),
1091 : w_(kNotYet),
1092 : s_(kNotYet),
1093 : d_(kNotYet),
1094 113434 : surrogate_(kNotYet) {
1095 14632986 : for (int i = 0; i < kMapSize; i++) {
1096 14519552 : map_->Add(false, zone);
1097 : }
1098 113434 : }
1099 :
1100 : bool& at(int i) { return map_->at(i); }
1101 :
1102 : static const int kMapSize = 128;
1103 : static const int kMask = kMapSize - 1;
1104 :
1105 : int map_count() const { return map_count_; }
1106 :
1107 : void Set(int character);
1108 : void SetInterval(const Interval& interval);
1109 : void SetAll();
1110 : bool is_non_word() { return w_ == kLatticeOut; }
1111 : bool is_word() { return w_ == kLatticeIn; }
1112 :
1113 : private:
1114 : ZoneList<bool>* map_;
1115 : int map_count_; // Number of set bits in the map.
1116 : ContainedInLattice w_; // The \w character class.
1117 : ContainedInLattice s_; // The \s character class.
1118 : ContainedInLattice d_; // The \d character class.
1119 : ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1120 : };
1121 :
1122 :
1123 : class BoyerMooreLookahead : public ZoneObject {
1124 : public:
1125 : BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1126 :
1127 : int length() { return length_; }
1128 : int max_char() { return max_char_; }
1129 : RegExpCompiler* compiler() { return compiler_; }
1130 :
1131 : int Count(int map_number) {
1132 599221 : return bitmaps_->at(map_number)->map_count();
1133 : }
1134 :
1135 193 : BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1136 :
1137 50775 : void Set(int map_number, int character) {
1138 101550 : if (character > max_char_) return;
1139 101550 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1140 : info->Set(character);
1141 : }
1142 :
1143 442708 : void SetInterval(int map_number, const Interval& interval) {
1144 442708 : if (interval.from() > max_char_) return;
1145 442708 : BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1146 221354 : if (interval.to() > max_char_) {
1147 0 : info->SetInterval(Interval(interval.from(), max_char_));
1148 : } else {
1149 221354 : info->SetInterval(interval);
1150 : }
1151 : }
1152 :
1153 6750 : void SetAll(int map_number) {
1154 13500 : bitmaps_->at(map_number)->SetAll();
1155 6750 : }
1156 :
1157 1596 : void SetRest(int from_map) {
1158 1596 : for (int i = from_map; i < length_; i++) SetAll(i);
1159 : }
1160 : void EmitSkipInstructions(RegExpMacroAssembler* masm);
1161 :
1162 : private:
1163 : // This is the value obtained by EatsAtLeast. If we do not have at least this
1164 : // many characters left in the sample string then the match is bound to fail.
1165 : // Therefore it is OK to read a character this far ahead of the current match
1166 : // point.
1167 : int length_;
1168 : RegExpCompiler* compiler_;
1169 : // 0xff for Latin1, 0xffff for UTF-16.
1170 : int max_char_;
1171 : ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1172 :
1173 : int GetSkipTable(int min_lookahead,
1174 : int max_lookahead,
1175 : Handle<ByteArray> boolean_skip_table);
1176 : bool FindWorthwhileInterval(int* from, int* to);
1177 : int FindBestInterval(
1178 : int max_number_of_chars, int old_biggest_points, int* from, int* to);
1179 : };
1180 :
1181 :
1182 : // There are many ways to generate code for a node. This class encapsulates
1183 : // the current way we should be generating. In other words it encapsulates
1184 : // the current state of the code generator. The effect of this is that we
1185 : // generate code for paths that the matcher can take through the regular
1186 : // expression. A given node in the regexp can be code-generated several times
1187 : // as it can be part of several traces. For example for the regexp:
1188 : // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1189 : // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1190 : // to match foo is generated only once (the traces have a common prefix). The
1191 : // code to store the capture is deferred and generated (twice) after the places
1192 : // where baz has been matched.
1193 : class Trace {
1194 : public:
1195 : // A value for a property that is either known to be true, know to be false,
1196 : // or not known.
1197 : enum TriBool {
1198 : UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1199 : };
1200 :
1201 : class DeferredAction {
1202 : public:
1203 : DeferredAction(ActionNode::ActionType action_type, int reg)
1204 320303 : : action_type_(action_type), reg_(reg), next_(NULL) { }
1205 : DeferredAction* next() { return next_; }
1206 : bool Mentions(int reg);
1207 : int reg() { return reg_; }
1208 : ActionNode::ActionType action_type() { return action_type_; }
1209 : private:
1210 : ActionNode::ActionType action_type_;
1211 : int reg_;
1212 : DeferredAction* next_;
1213 : friend class Trace;
1214 : };
1215 :
1216 : class DeferredCapture : public DeferredAction {
1217 : public:
1218 302875 : DeferredCapture(int reg, bool is_capture, Trace* trace)
1219 : : DeferredAction(ActionNode::STORE_POSITION, reg),
1220 : cp_offset_(trace->cp_offset()),
1221 302875 : is_capture_(is_capture) { }
1222 : int cp_offset() { return cp_offset_; }
1223 : bool is_capture() { return is_capture_; }
1224 : private:
1225 : int cp_offset_;
1226 : bool is_capture_;
1227 : void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1228 : };
1229 :
1230 : class DeferredSetRegister : public DeferredAction {
1231 : public:
1232 : DeferredSetRegister(int reg, int value)
1233 : : DeferredAction(ActionNode::SET_REGISTER, reg),
1234 6226 : value_(value) { }
1235 : int value() { return value_; }
1236 : private:
1237 : int value_;
1238 : };
1239 :
1240 : class DeferredClearCaptures : public DeferredAction {
1241 : public:
1242 : explicit DeferredClearCaptures(Interval range)
1243 : : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1244 3897 : range_(range) { }
1245 : Interval range() { return range_; }
1246 : private:
1247 : Interval range_;
1248 : };
1249 :
1250 : class DeferredIncrementRegister : public DeferredAction {
1251 : public:
1252 : explicit DeferredIncrementRegister(int reg)
1253 : : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1254 : };
1255 :
1256 752663 : Trace()
1257 : : cp_offset_(0),
1258 : actions_(NULL),
1259 : backtrack_(NULL),
1260 : stop_node_(NULL),
1261 : loop_label_(NULL),
1262 : characters_preloaded_(0),
1263 : bound_checked_up_to_(0),
1264 : flush_budget_(100),
1265 1505326 : at_start_(UNKNOWN) { }
1266 :
1267 : // End the trace. This involves flushing the deferred actions in the trace
1268 : // and pushing a backtrack location onto the backtrack stack. Once this is
1269 : // done we can start a new trace or go to one that has already been
1270 : // generated.
1271 : void Flush(RegExpCompiler* compiler, RegExpNode* successor);
1272 : int cp_offset() { return cp_offset_; }
1273 : DeferredAction* actions() { return actions_; }
1274 : // A trivial trace is one that has no deferred actions or other state that
1275 : // affects the assumptions used when generating code. There is no recorded
1276 : // backtrack location in a trivial trace, so with a trivial trace we will
1277 : // generate code that, on a failure to match, gets the backtrack location
1278 : // from the backtrack stack rather than using a direct jump instruction. We
1279 : // always start code generation with a trivial trace and non-trivial traces
1280 : // are created as we emit code for nodes or add to the list of deferred
1281 : // actions in the trace. The location of the code generated for a node using
1282 : // a trivial trace is recorded in a label in the node so that gotos can be
1283 : // generated to that code.
1284 2601318 : bool is_trivial() {
1285 4061037 : return backtrack_ == NULL &&
1286 2744750 : actions_ == NULL &&
1287 1285031 : cp_offset_ == 0 &&
1288 1067772 : characters_preloaded_ == 0 &&
1289 1030254 : bound_checked_up_to_ == 0 &&
1290 4661826 : quick_check_performed_.characters() == 0 &&
1291 3631572 : at_start_ == UNKNOWN;
1292 : }
1293 : TriBool at_start() { return at_start_; }
1294 641236 : void set_at_start(TriBool at_start) { at_start_ = at_start; }
1295 : Label* backtrack() { return backtrack_; }
1296 : Label* loop_label() { return loop_label_; }
1297 : RegExpNode* stop_node() { return stop_node_; }
1298 : int characters_preloaded() { return characters_preloaded_; }
1299 : int bound_checked_up_to() { return bound_checked_up_to_; }
1300 : int flush_budget() { return flush_budget_; }
1301 : QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1302 : bool mentions_reg(int reg);
1303 : // Returns true if a deferred position store exists to the specified
1304 : // register and stores the offset in the out-parameter. Otherwise
1305 : // returns false.
1306 : bool GetStoredPosition(int reg, int* cp_offset);
1307 : // These set methods and AdvanceCurrentPositionInTrace should be used only on
1308 : // new traces - the intention is that traces are immutable after creation.
1309 : void add_action(DeferredAction* new_action) {
1310 : DCHECK(new_action->next_ == NULL);
1311 320303 : new_action->next_ = actions_;
1312 320303 : actions_ = new_action;
1313 : }
1314 956252 : void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
1315 16040 : void set_stop_node(RegExpNode* node) { stop_node_ = node; }
1316 16040 : void set_loop_label(Label* label) { loop_label_ = label; }
1317 984147 : void set_characters_preloaded(int count) { characters_preloaded_ = count; }
1318 544001 : void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
1319 250811 : void set_flush_budget(int to) { flush_budget_ = to; }
1320 : void set_quick_check_performed(QuickCheckDetails* d) {
1321 253946 : quick_check_performed_ = *d;
1322 : }
1323 : void InvalidateCurrentCharacter();
1324 : void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1325 :
1326 : private:
1327 : int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1328 : void PerformDeferredActions(RegExpMacroAssembler* macro,
1329 : int max_register,
1330 : const OutSet& affected_registers,
1331 : OutSet* registers_to_pop,
1332 : OutSet* registers_to_clear,
1333 : Zone* zone);
1334 : void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1335 : int max_register,
1336 : const OutSet& registers_to_pop,
1337 : const OutSet& registers_to_clear);
1338 : int cp_offset_;
1339 : DeferredAction* actions_;
1340 : Label* backtrack_;
1341 : RegExpNode* stop_node_;
1342 : Label* loop_label_;
1343 : int characters_preloaded_;
1344 : int bound_checked_up_to_;
1345 : QuickCheckDetails quick_check_performed_;
1346 : int flush_budget_;
1347 : TriBool at_start_;
1348 : };
1349 :
1350 :
1351 : class GreedyLoopState {
1352 : public:
1353 : explicit GreedyLoopState(bool not_at_start);
1354 :
1355 : Label* label() { return &label_; }
1356 : Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1357 :
1358 : private:
1359 : Label label_;
1360 : Trace counter_backtrack_trace_;
1361 : };
1362 :
1363 :
1364 : struct PreloadState {
1365 : static const int kEatsAtLeastNotYetInitialized = -1;
1366 : bool preload_is_current_;
1367 : bool preload_has_checked_bounds_;
1368 : int preload_characters_;
1369 : int eats_at_least_;
1370 : void init() {
1371 268841 : eats_at_least_ = kEatsAtLeastNotYetInitialized;
1372 : }
1373 : };
1374 :
1375 :
1376 92622 : class NodeVisitor {
1377 : public:
1378 92622 : virtual ~NodeVisitor() { }
1379 : #define DECLARE_VISIT(Type) \
1380 : virtual void Visit##Type(Type##Node* that) = 0;
1381 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1382 : #undef DECLARE_VISIT
1383 0 : virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1384 : };
1385 :
1386 :
1387 : // Node visitor used to add the start set of the alternatives to the
1388 : // dispatch table of a choice node.
1389 0 : class DispatchTableConstructor: public NodeVisitor {
1390 : public:
1391 : DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1392 : Zone* zone)
1393 : : table_(table),
1394 : choice_index_(-1),
1395 : ignore_case_(ignore_case),
1396 0 : zone_(zone) { }
1397 :
1398 : void BuildTable(ChoiceNode* node);
1399 :
1400 812 : void AddRange(CharacterRange range) {
1401 1624 : table()->AddRange(range, choice_index_, zone_);
1402 : }
1403 :
1404 : void AddInverse(ZoneList<CharacterRange>* ranges);
1405 :
1406 : #define DECLARE_VISIT(Type) \
1407 : virtual void Visit##Type(Type##Node* that);
1408 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1409 : #undef DECLARE_VISIT
1410 :
1411 : DispatchTable* table() { return table_; }
1412 0 : void set_choice_index(int value) { choice_index_ = value; }
1413 :
1414 : protected:
1415 : DispatchTable* table_;
1416 : int choice_index_;
1417 : bool ignore_case_;
1418 : Zone* zone_;
1419 : };
1420 :
1421 :
1422 : // Assertion propagation moves information about assertions such as
1423 : // \b to the affected nodes. For instance, in /.\b./ information must
1424 : // be propagated to the first '.' that whatever follows needs to know
1425 : // if it matched a word or a non-word, and to the second '.' that it
1426 : // has to check if it succeeds a word or non-word. In this case the
1427 : // result will be something like:
1428 : //
1429 : // +-------+ +------------+
1430 : // | . | | . |
1431 : // +-------+ ---> +------------+
1432 : // | word? | | check word |
1433 : // +-------+ +------------+
1434 92622 : class Analysis: public NodeVisitor {
1435 : public:
1436 : Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1437 : : isolate_(isolate),
1438 : flags_(flags),
1439 : is_one_byte_(is_one_byte),
1440 92622 : error_message_(NULL) {}
1441 : void EnsureAnalyzed(RegExpNode* node);
1442 :
1443 : #define DECLARE_VISIT(Type) \
1444 : virtual void Visit##Type(Type##Node* that);
1445 : FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1446 : #undef DECLARE_VISIT
1447 : virtual void VisitLoopChoice(LoopChoiceNode* that);
1448 :
1449 : bool has_failed() { return error_message_ != NULL; }
1450 : const char* error_message() {
1451 : DCHECK(error_message_ != NULL);
1452 : return error_message_;
1453 : }
1454 : void fail(const char* error_message) {
1455 441 : error_message_ = error_message;
1456 : }
1457 :
1458 : Isolate* isolate() const { return isolate_; }
1459 :
1460 : bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
1461 : bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
1462 :
1463 : private:
1464 : Isolate* isolate_;
1465 : JSRegExp::Flags flags_;
1466 : bool is_one_byte_;
1467 : const char* error_message_;
1468 :
1469 : DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1470 : };
1471 :
1472 :
1473 : struct RegExpCompileData {
1474 : RegExpCompileData()
1475 : : tree(NULL),
1476 : node(NULL),
1477 : simple(true),
1478 : contains_anchor(false),
1479 912522 : capture_count(0) { }
1480 : RegExpTree* tree;
1481 : RegExpNode* node;
1482 : bool simple;
1483 : bool contains_anchor;
1484 : Handle<FixedArray> capture_name_map;
1485 : Handle<String> error;
1486 : int capture_count;
1487 : };
1488 :
1489 :
1490 : class RegExpEngine: public AllStatic {
1491 : public:
1492 : struct CompilationResult {
1493 : CompilationResult(Isolate* isolate, const char* error_message)
1494 : : error_message(error_message),
1495 456 : code(isolate->heap()->the_hole_value()),
1496 912 : num_registers(0) {}
1497 : CompilationResult(Object* code, int registers)
1498 92181 : : error_message(NULL), code(code), num_registers(registers) {}
1499 : const char* error_message;
1500 : Object* code;
1501 : int num_registers;
1502 : };
1503 :
1504 : static CompilationResult Compile(Isolate* isolate, Zone* zone,
1505 : RegExpCompileData* input,
1506 : JSRegExp::Flags flags,
1507 : Handle<String> pattern,
1508 : Handle<String> sample_subject,
1509 : bool is_one_byte);
1510 :
1511 : static bool TooMuchRegExpCode(Handle<String> pattern);
1512 :
1513 : static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1514 : };
1515 :
1516 :
1517 : class RegExpResultsCache : public AllStatic {
1518 : public:
1519 : enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1520 :
1521 : // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1522 : // On success, the returned result is guaranteed to be a COW-array.
1523 : static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1524 : FixedArray** last_match_out, ResultsCacheType type);
1525 : // Attempt to add value_array to the cache specified by type. On success,
1526 : // value_array is turned into a COW-array.
1527 : static void Enter(Isolate* isolate, Handle<String> key_string,
1528 : Handle<Object> key_pattern, Handle<FixedArray> value_array,
1529 : Handle<FixedArray> last_match_cache, ResultsCacheType type);
1530 : static void Clear(FixedArray* cache);
1531 : static const int kRegExpResultsCacheSize = 0x100;
1532 :
1533 : private:
1534 : static const int kArrayEntriesPerCacheEntry = 4;
1535 : static const int kStringOffset = 0;
1536 : static const int kPatternOffset = 1;
1537 : static const int kArrayOffset = 2;
1538 : static const int kLastMatchOffset = 3;
1539 : };
1540 :
1541 : } // namespace internal
1542 : } // namespace v8
1543 :
1544 : #endif // V8_REGEXP_JSREGEXP_H_
|