Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 : #define V8_REGEXP_REGEXP_PARSER_H_
7 :
8 : #include "src/objects.h"
9 : #include "src/regexp/regexp-ast.h"
10 : #include "src/zone/zone.h"
11 :
12 : namespace v8 {
13 : namespace internal {
14 :
15 : struct RegExpCompileData;
16 :
17 :
18 : // A BufferedZoneList is an automatically growing list, just like (and backed
19 : // by) a ZoneList, that is optimized for the case of adding and removing
20 : // a single element. The last element added is stored outside the backing list,
21 : // and if no more than one element is ever added, the ZoneList isn't even
22 : // allocated.
23 : // Elements must not be NULL pointers.
24 : template <typename T, int initial_size>
25 : class BufferedZoneList {
26 : public:
27 2143904 : BufferedZoneList() : list_(NULL), last_(NULL) {}
28 :
29 : // Adds element at end of list. This element is buffered and can
30 : // be read using last() or removed using RemoveLast until a new Add or until
31 : // RemoveLast or GetList has been called.
32 7714029 : void Add(T* value, Zone* zone) {
33 7714029 : if (last_ != NULL) {
34 3046784 : if (list_ == NULL) {
35 131400 : list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 : }
37 3046784 : list_->Add(last_, zone);
38 : }
39 7714029 : last_ = value;
40 7714029 : }
41 :
42 : T* last() {
43 : DCHECK(last_ != NULL);
44 : return last_;
45 : }
46 :
47 60734 : T* RemoveLast() {
48 : DCHECK(last_ != NULL);
49 60734 : T* result = last_;
50 60734 : if ((list_ != NULL) && (list_->length() > 0))
51 75848 : last_ = list_->RemoveLast();
52 : else
53 22810 : last_ = NULL;
54 60734 : return result;
55 : }
56 :
57 : T* Get(int i) {
58 : DCHECK((0 <= i) && (i < length()));
59 20323 : if (list_ == NULL) {
60 : DCHECK_EQ(0, i);
61 0 : return last_;
62 : } else {
63 20323 : if (i == list_->length()) {
64 : DCHECK(last_ != NULL);
65 5990 : return last_;
66 : } else {
67 14333 : return list_->at(i);
68 : }
69 : }
70 : }
71 :
72 : void Clear() {
73 2965603 : list_ = NULL;
74 2965603 : last_ = NULL;
75 : }
76 :
77 : int length() {
78 10834474 : int length = (list_ == NULL) ? 0 : list_->length();
79 10834474 : return length + ((last_ == NULL) ? 0 : 1);
80 : }
81 :
82 106909 : ZoneList<T*>* GetList(Zone* zone) {
83 106909 : if (list_ == NULL) {
84 0 : list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 : }
86 106909 : if (last_ != NULL) {
87 106909 : list_->Add(last_, zone);
88 106909 : last_ = NULL;
89 : }
90 106909 : return list_;
91 : }
92 :
93 : private:
94 : ZoneList<T*>* list_;
95 : T* last_;
96 : };
97 :
98 :
99 : // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 : class RegExpBuilder : public ZoneObject {
101 : public:
102 : RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
103 : void AddCharacter(uc16 character);
104 : void AddUnicodeCharacter(uc32 character);
105 : void AddEscapedUnicodeCharacter(uc32 character);
106 : // "Adds" an empty expression. Does nothing except consume a
107 : // following quantifier
108 : void AddEmpty();
109 : void AddCharacterClass(RegExpCharacterClass* cc);
110 : void AddCharacterClassForDesugaring(uc32 c);
111 : void AddAtom(RegExpTree* tree);
112 : void AddTerm(RegExpTree* tree);
113 : void AddAssertion(RegExpTree* tree);
114 : void NewAlternative(); // '|'
115 : bool AddQuantifierToAtom(int min, int max,
116 : RegExpQuantifier::QuantifierType type);
117 : RegExpTree* ToRegExp();
118 :
119 : private:
120 : static const uc16 kNoPendingSurrogate = 0;
121 : void AddLeadSurrogate(uc16 lead_surrogate);
122 : void AddTrailSurrogate(uc16 trail_surrogate);
123 : void FlushPendingSurrogate();
124 : void FlushCharacters();
125 : void FlushText();
126 : void FlushTerms();
127 : bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
128 : bool NeedsDesugaringForIgnoreCase(uc32 c);
129 : Zone* zone() const { return zone_; }
130 : bool ignore_case() const { return ignore_case_; }
131 : bool unicode() const { return unicode_; }
132 :
133 : Zone* zone_;
134 : bool pending_empty_;
135 : bool ignore_case_;
136 : bool unicode_;
137 : ZoneList<uc16>* characters_;
138 : uc16 pending_surrogate_;
139 : BufferedZoneList<RegExpTree, 2> terms_;
140 : BufferedZoneList<RegExpTree, 2> text_;
141 : BufferedZoneList<RegExpTree, 2> alternatives_;
142 : #ifdef DEBUG
143 : enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
144 : #define LAST(x) last_added_ = x;
145 : #else
146 : #define LAST(x)
147 : #endif
148 : };
149 :
150 :
151 : class RegExpParser BASE_EMBEDDED {
152 : public:
153 : RegExpParser(FlatStringReader* in, Handle<String>* error,
154 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
155 :
156 : static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
157 : JSRegExp::Flags flags, RegExpCompileData* result);
158 :
159 : RegExpTree* ParsePattern();
160 : RegExpTree* ParseDisjunction();
161 : RegExpTree* ParseGroup();
162 : RegExpTree* ParseCharacterClass();
163 :
164 : // Parses a {...,...} quantifier and stores the range in the given
165 : // out parameters.
166 : bool ParseIntervalQuantifier(int* min_out, int* max_out);
167 :
168 : // Parses and returns a single escaped character. The character
169 : // must not be 'b' or 'B' since they are usually handle specially.
170 : uc32 ParseClassCharacterEscape();
171 :
172 : // Checks whether the following is a length-digit hexadecimal number,
173 : // and sets the value if it is.
174 : bool ParseHexEscape(int length, uc32* value);
175 : bool ParseUnicodeEscape(uc32* value);
176 : bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
177 : bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
178 :
179 : uc32 ParseOctalLiteral();
180 :
181 : // Tries to parse the input as a back reference. If successful it
182 : // stores the result in the output parameter and returns true. If
183 : // it fails it will push back the characters read so the same characters
184 : // can be reparsed.
185 : bool ParseBackReferenceIndex(int* index_out);
186 :
187 : // The default behavior is to combine surrogate pairs in unicode mode and
188 : // don't combine them otherwise (a quantifier after a surrogate pair would
189 : // then apply only to the trailing surrogate). Forcing combination is required
190 : // when parsing capture names since they can always legally contain surrogate
191 : // pairs.
192 : enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS };
193 :
194 : bool ParseClassProperty(ZoneList<CharacterRange>* result);
195 : CharacterRange ParseClassAtom(uc16* char_class);
196 : RegExpTree* ReportError(Vector<const char> message);
197 : void Advance(ScanMode mode = ScanMode::DEFAULT);
198 : void Advance(int dist, ScanMode mode = ScanMode::DEFAULT);
199 : void Reset(int pos);
200 :
201 : // Reports whether the pattern might be used as a literal search string.
202 : // Only use if the result of the parse is a single atom node.
203 : bool simple();
204 : bool contains_anchor() { return contains_anchor_; }
205 14806 : void set_contains_anchor() { contains_anchor_ = true; }
206 : int captures_started() { return captures_started_; }
207 41537 : int position() { return next_pos_ - 1; }
208 : bool failed() { return failed_; }
209 : bool dotall() const { return dotall_; }
210 : bool ignore_case() const { return ignore_case_; }
211 : bool multiline() const { return multiline_; }
212 : bool unicode() const { return unicode_; }
213 :
214 : static bool IsSyntaxCharacterOrSlash(uc32 c);
215 :
216 : static const int kMaxCaptures = 1 << 16;
217 : static const uc32 kEndMarker = (1 << 21);
218 :
219 : private:
220 : enum SubexpressionType {
221 : INITIAL,
222 : CAPTURE, // All positive values represent captures.
223 : POSITIVE_LOOKAROUND,
224 : NEGATIVE_LOOKAROUND,
225 : GROUPING
226 : };
227 :
228 : class RegExpParserState : public ZoneObject {
229 : public:
230 2143904 : RegExpParserState(RegExpParserState* previous_state,
231 : SubexpressionType group_type,
232 : RegExpLookaround::Type lookaround_type,
233 : int disjunction_capture_index,
234 : const ZoneVector<uc16>* capture_name, bool ignore_case,
235 : bool unicode, Zone* zone)
236 : : previous_state_(previous_state),
237 : builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
238 : group_type_(group_type),
239 : lookaround_type_(lookaround_type),
240 : disjunction_capture_index_(disjunction_capture_index),
241 4287808 : capture_name_(capture_name) {}
242 : // Parser state of containing expression, if any.
243 : RegExpParserState* previous_state() { return previous_state_; }
244 : bool IsSubexpression() { return previous_state_ != NULL; }
245 : // RegExpBuilder building this regexp's AST.
246 : RegExpBuilder* builder() { return builder_; }
247 : // Type of regexp being parsed (parenthesized group or entire regexp).
248 : SubexpressionType group_type() { return group_type_; }
249 : // Lookahead or Lookbehind.
250 : RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
251 : // Index in captures array of first capture in this sub-expression, if any.
252 : // Also the capture index of this sub-expression itself, if group_type
253 : // is CAPTURE.
254 : int capture_index() { return disjunction_capture_index_; }
255 : // The name of the current sub-expression, if group_type is CAPTURE. Only
256 : // used for named captures.
257 : const ZoneVector<uc16>* capture_name() { return capture_name_; }
258 :
259 : bool IsNamedCapture() const { return capture_name_ != nullptr; }
260 :
261 : // Check whether the parser is inside a capture group with the given index.
262 : bool IsInsideCaptureGroup(int index);
263 : // Check whether the parser is inside a capture group with the given name.
264 : bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
265 :
266 : private:
267 : // Linked list implementation of stack of states.
268 : RegExpParserState* previous_state_;
269 : // Builder for the stored disjunction.
270 : RegExpBuilder* builder_;
271 : // Stored disjunction type (capture, look-ahead or grouping), if any.
272 : SubexpressionType group_type_;
273 : // Stored read direction.
274 : RegExpLookaround::Type lookaround_type_;
275 : // Stored disjunction's capture index (if any).
276 : int disjunction_capture_index_;
277 : // Stored capture name (if any).
278 : const ZoneVector<uc16>* capture_name_;
279 : };
280 :
281 : // Return the 1-indexed RegExpCapture object, allocate if necessary.
282 : RegExpCapture* GetCapture(int index);
283 :
284 : // Creates a new named capture at the specified index. Must be called exactly
285 : // once for each named capture. Fails if a capture with the same name is
286 : // encountered.
287 : bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
288 :
289 : // Parses the name of a capture group (?<name>pattern). The name must adhere
290 : // to IdentifierName in the ECMAScript standard.
291 : const ZoneVector<uc16>* ParseCaptureGroupName();
292 :
293 : bool ParseNamedBackReference(RegExpBuilder* builder,
294 : RegExpParserState* state);
295 :
296 : // After the initial parsing pass, patch corresponding RegExpCapture objects
297 : // into all RegExpBackReferences. This is done after initial parsing in order
298 : // to avoid complicating cases in which references comes before the capture.
299 : void PatchNamedBackReferences();
300 :
301 : Handle<FixedArray> CreateCaptureNameMap();
302 :
303 : // Returns true iff the pattern contains named captures. May call
304 : // ScanForCaptures to look ahead at the remaining pattern.
305 : bool HasNamedCaptures();
306 :
307 : Isolate* isolate() { return isolate_; }
308 : Zone* zone() const { return zone_; }
309 :
310 : uc32 current() { return current_; }
311 : bool has_more() { return has_more_; }
312 15924220 : bool has_next() { return next_pos_ < in()->length(); }
313 : uc32 Next();
314 : uc32 ReadNext(bool update_position, ScanMode mode);
315 : FlatStringReader* in() { return in_; }
316 : void ScanForCaptures();
317 :
318 : Isolate* isolate_;
319 : Zone* zone_;
320 : Handle<String>* error_;
321 : ZoneList<RegExpCapture*>* captures_;
322 : ZoneList<RegExpCapture*>* named_captures_;
323 : ZoneList<RegExpBackReference*>* named_back_references_;
324 : FlatStringReader* in_;
325 : uc32 current_;
326 : bool dotall_;
327 : bool ignore_case_;
328 : bool multiline_;
329 : bool unicode_;
330 : int next_pos_;
331 : int captures_started_;
332 : int capture_count_; // Only valid after we have scanned for captures.
333 : bool has_more_;
334 : bool simple_;
335 : bool contains_anchor_;
336 : bool is_scanned_for_captures_;
337 : bool has_named_captures_; // Only valid after we have scanned for captures.
338 : bool failed_;
339 : };
340 :
341 : } // namespace internal
342 : } // namespace v8
343 :
344 : #endif // V8_REGEXP_REGEXP_PARSER_H_
|