Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 : #define V8_REGEXP_REGEXP_PARSER_H_
7 :
8 : #include "src/objects.h"
9 : #include "src/objects/js-regexp.h"
10 : #include "src/regexp/regexp-ast.h"
11 : #include "src/zone/zone.h"
12 :
13 : namespace v8 {
14 : namespace internal {
15 :
16 : struct RegExpCompileData;
17 :
18 : // A BufferedZoneList is an automatically growing list, just like (and backed
19 : // by) a ZoneList, that is optimized for the case of adding and removing
20 : // a single element. The last element added is stored outside the backing list,
21 : // and if no more than one element is ever added, the ZoneList isn't even
22 : // allocated.
23 : // Elements must not be nullptr pointers.
24 : template <typename T, int initial_size>
25 : class BufferedZoneList {
26 : public:
27 1699012 : BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28 :
29 : // Adds element at end of list. This element is buffered and can
30 : // be read using last() or removed using RemoveLast until a new Add or until
31 : // RemoveLast or GetList has been called.
32 6064324 : void Add(T* value, Zone* zone) {
33 6064324 : if (last_ != nullptr) {
34 2329861 : if (list_ == nullptr) {
35 113961 : list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 : }
37 2329861 : list_->Add(last_, zone);
38 : }
39 6064324 : last_ = value;
40 6064324 : }
41 :
42 : T* last() {
43 : DCHECK(last_ != nullptr);
44 : return last_;
45 : }
46 :
47 52853 : T* RemoveLast() {
48 : DCHECK(last_ != nullptr);
49 52853 : T* result = last_;
50 52853 : if ((list_ != nullptr) && (list_->length() > 0))
51 33039 : last_ = list_->RemoveLast();
52 : else
53 19814 : last_ = nullptr;
54 52853 : return result;
55 : }
56 :
57 : T* Get(int i) {
58 : DCHECK((0 <= i) && (i < length()));
59 33048 : if (list_ == nullptr) {
60 : DCHECK_EQ(0, i);
61 0 : return last_;
62 : } else {
63 16524 : if (i == list_->length()) {
64 : DCHECK(last_ != nullptr);
65 5244 : return last_;
66 : } else {
67 11280 : return list_->at(i);
68 : }
69 : }
70 : }
71 :
72 : void Clear() {
73 2413649 : list_ = nullptr;
74 2413649 : last_ = nullptr;
75 : }
76 :
77 : int length() {
78 8451920 : int length = (list_ == nullptr) ? 0 : list_->length();
79 8451920 : return length + ((last_ == nullptr) ? 0 : 1);
80 : }
81 :
82 91879 : ZoneList<T*>* GetList(Zone* zone) {
83 91879 : if (list_ == nullptr) {
84 0 : list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 : }
86 91879 : if (last_ != nullptr) {
87 91879 : list_->Add(last_, zone);
88 91879 : last_ = nullptr;
89 : }
90 91879 : return list_;
91 : }
92 :
93 : private:
94 : ZoneList<T*>* list_;
95 : T* last_;
96 : };
97 :
98 :
99 : // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 : class RegExpBuilder : public ZoneObject {
101 : public:
102 : RegExpBuilder(Zone* zone, bool ignore_case, bool unicode);
103 : void AddCharacter(uc16 character);
104 : void AddUnicodeCharacter(uc32 character);
105 : void AddEscapedUnicodeCharacter(uc32 character);
106 : // "Adds" an empty expression. Does nothing except consume a
107 : // following quantifier
108 : void AddEmpty();
109 : void AddCharacterClass(RegExpCharacterClass* cc);
110 : void AddCharacterClassForDesugaring(uc32 c);
111 : void AddAtom(RegExpTree* tree);
112 : void AddTerm(RegExpTree* tree);
113 : void AddAssertion(RegExpTree* tree);
114 : void NewAlternative(); // '|'
115 : bool AddQuantifierToAtom(int min, int max,
116 : RegExpQuantifier::QuantifierType type);
117 : RegExpTree* ToRegExp();
118 :
119 : private:
120 : static const uc16 kNoPendingSurrogate = 0;
121 : void AddLeadSurrogate(uc16 lead_surrogate);
122 : void AddTrailSurrogate(uc16 trail_surrogate);
123 : void FlushPendingSurrogate();
124 : void FlushCharacters();
125 : void FlushText();
126 : void FlushTerms();
127 : bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
128 : bool NeedsDesugaringForIgnoreCase(uc32 c);
129 : Zone* zone() const { return zone_; }
130 : bool ignore_case() const { return ignore_case_; }
131 : bool unicode() const { return unicode_; }
132 :
133 : Zone* zone_;
134 : bool pending_empty_;
135 : bool ignore_case_;
136 : bool unicode_;
137 : ZoneList<uc16>* characters_;
138 : uc16 pending_surrogate_;
139 : BufferedZoneList<RegExpTree, 2> terms_;
140 : BufferedZoneList<RegExpTree, 2> text_;
141 : BufferedZoneList<RegExpTree, 2> alternatives_;
142 : #ifdef DEBUG
143 : enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
144 : #define LAST(x) last_added_ = x;
145 : #else
146 : #define LAST(x)
147 : #endif
148 : };
149 :
150 :
151 : class RegExpParser BASE_EMBEDDED {
152 : public:
153 : RegExpParser(FlatStringReader* in, Handle<String>* error,
154 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
155 :
156 : static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
157 : JSRegExp::Flags flags, RegExpCompileData* result);
158 :
159 : RegExpTree* ParsePattern();
160 : RegExpTree* ParseDisjunction();
161 : RegExpTree* ParseGroup();
162 : RegExpTree* ParseCharacterClass();
163 :
164 : // Parses a {...,...} quantifier and stores the range in the given
165 : // out parameters.
166 : bool ParseIntervalQuantifier(int* min_out, int* max_out);
167 :
168 : // Parses and returns a single escaped character. The character
169 : // must not be 'b' or 'B' since they are usually handle specially.
170 : uc32 ParseClassCharacterEscape();
171 :
172 : // Checks whether the following is a length-digit hexadecimal number,
173 : // and sets the value if it is.
174 : bool ParseHexEscape(int length, uc32* value);
175 : bool ParseUnicodeEscape(uc32* value);
176 : bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
177 : bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
178 :
179 : uc32 ParseOctalLiteral();
180 :
181 : // Tries to parse the input as a back reference. If successful it
182 : // stores the result in the output parameter and returns true. If
183 : // it fails it will push back the characters read so the same characters
184 : // can be reparsed.
185 : bool ParseBackReferenceIndex(int* index_out);
186 :
187 : // Parse inside a class. Either add escaped class to the range, or return
188 : // false and pass parsed single character through |char_out|.
189 : void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
190 : bool add_unicode_case_equivalents, uc32* char_out,
191 : bool* is_class_escape);
192 :
193 : char ParseClassEscape();
194 :
195 : RegExpTree* ReportError(Vector<const char> message);
196 : void Advance();
197 : void Advance(int dist);
198 : void Reset(int pos);
199 :
200 : // Reports whether the pattern might be used as a literal search string.
201 : // Only use if the result of the parse is a single atom node.
202 : bool simple();
203 : bool contains_anchor() { return contains_anchor_; }
204 12576 : void set_contains_anchor() { contains_anchor_ = true; }
205 : int captures_started() { return captures_started_; }
206 36494 : int position() { return next_pos_ - 1; }
207 : bool failed() { return failed_; }
208 : bool dotall() const { return dotall_; }
209 : bool ignore_case() const { return ignore_case_; }
210 : bool multiline() const { return multiline_; }
211 : bool unicode() const { return unicode_; }
212 :
213 : static bool IsSyntaxCharacterOrSlash(uc32 c);
214 :
215 : static const int kMaxCaptures = 1 << 16;
216 : static const uc32 kEndMarker = (1 << 21);
217 :
218 : private:
219 : enum SubexpressionType {
220 : INITIAL,
221 : CAPTURE, // All positive values represent captures.
222 : POSITIVE_LOOKAROUND,
223 : NEGATIVE_LOOKAROUND,
224 : GROUPING
225 : };
226 :
227 : class RegExpParserState : public ZoneObject {
228 : public:
229 1699012 : RegExpParserState(RegExpParserState* previous_state,
230 : SubexpressionType group_type,
231 : RegExpLookaround::Type lookaround_type,
232 : int disjunction_capture_index,
233 : const ZoneVector<uc16>* capture_name, bool ignore_case,
234 : bool unicode, Zone* zone)
235 : : previous_state_(previous_state),
236 : builder_(new (zone) RegExpBuilder(zone, ignore_case, unicode)),
237 : group_type_(group_type),
238 : lookaround_type_(lookaround_type),
239 : disjunction_capture_index_(disjunction_capture_index),
240 3398024 : capture_name_(capture_name) {}
241 : // Parser state of containing expression, if any.
242 : RegExpParserState* previous_state() { return previous_state_; }
243 : bool IsSubexpression() { return previous_state_ != nullptr; }
244 : // RegExpBuilder building this regexp's AST.
245 : RegExpBuilder* builder() { return builder_; }
246 : // Type of regexp being parsed (parenthesized group or entire regexp).
247 : SubexpressionType group_type() { return group_type_; }
248 : // Lookahead or Lookbehind.
249 : RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
250 : // Index in captures array of first capture in this sub-expression, if any.
251 : // Also the capture index of this sub-expression itself, if group_type
252 : // is CAPTURE.
253 : int capture_index() { return disjunction_capture_index_; }
254 : // The name of the current sub-expression, if group_type is CAPTURE. Only
255 : // used for named captures.
256 : const ZoneVector<uc16>* capture_name() { return capture_name_; }
257 :
258 : bool IsNamedCapture() const { return capture_name_ != nullptr; }
259 :
260 : // Check whether the parser is inside a capture group with the given index.
261 : bool IsInsideCaptureGroup(int index);
262 : // Check whether the parser is inside a capture group with the given name.
263 : bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
264 :
265 : private:
266 : // Linked list implementation of stack of states.
267 : RegExpParserState* previous_state_;
268 : // Builder for the stored disjunction.
269 : RegExpBuilder* builder_;
270 : // Stored disjunction type (capture, look-ahead or grouping), if any.
271 : SubexpressionType group_type_;
272 : // Stored read direction.
273 : RegExpLookaround::Type lookaround_type_;
274 : // Stored disjunction's capture index (if any).
275 : int disjunction_capture_index_;
276 : // Stored capture name (if any).
277 : const ZoneVector<uc16>* capture_name_;
278 : };
279 :
280 : // Return the 1-indexed RegExpCapture object, allocate if necessary.
281 : RegExpCapture* GetCapture(int index);
282 :
283 : // Creates a new named capture at the specified index. Must be called exactly
284 : // once for each named capture. Fails if a capture with the same name is
285 : // encountered.
286 : bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
287 :
288 : // Parses the name of a capture group (?<name>pattern). The name must adhere
289 : // to IdentifierName in the ECMAScript standard.
290 : const ZoneVector<uc16>* ParseCaptureGroupName();
291 :
292 : bool ParseNamedBackReference(RegExpBuilder* builder,
293 : RegExpParserState* state);
294 :
295 : // After the initial parsing pass, patch corresponding RegExpCapture objects
296 : // into all RegExpBackReferences. This is done after initial parsing in order
297 : // to avoid complicating cases in which references comes before the capture.
298 : void PatchNamedBackReferences();
299 :
300 : Handle<FixedArray> CreateCaptureNameMap();
301 :
302 : // Returns true iff the pattern contains named captures. May call
303 : // ScanForCaptures to look ahead at the remaining pattern.
304 : bool HasNamedCaptures();
305 :
306 : Isolate* isolate() { return isolate_; }
307 : Zone* zone() const { return zone_; }
308 :
309 : uc32 current() { return current_; }
310 : bool has_more() { return has_more_; }
311 432005452 : bool has_next() { return next_pos_ < in()->length(); }
312 : uc32 Next();
313 : template <bool update_position>
314 : uc32 ReadNext();
315 : FlatStringReader* in() { return in_; }
316 : void ScanForCaptures();
317 :
318 : Isolate* isolate_;
319 : Zone* zone_;
320 : Handle<String>* error_;
321 : ZoneList<RegExpCapture*>* captures_;
322 : ZoneList<RegExpCapture*>* named_captures_;
323 : ZoneList<RegExpBackReference*>* named_back_references_;
324 : FlatStringReader* in_;
325 : uc32 current_;
326 : bool dotall_;
327 : bool ignore_case_;
328 : bool multiline_;
329 : bool unicode_;
330 : int next_pos_;
331 : int captures_started_;
332 : int capture_count_; // Only valid after we have scanned for captures.
333 : bool has_more_;
334 : bool simple_;
335 : bool contains_anchor_;
336 : bool is_scanned_for_captures_;
337 : bool has_named_captures_; // Only valid after we have scanned for captures.
338 : bool failed_;
339 : };
340 :
341 : } // namespace internal
342 : } // namespace v8
343 :
344 : #endif // V8_REGEXP_REGEXP_PARSER_H_
|