Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 : #define V8_REGEXP_REGEXP_PARSER_H_
7 :
8 : #include "src/objects.h"
9 : #include "src/objects/js-regexp.h"
10 : #include "src/regexp/regexp-ast.h"
11 : #include "src/zone/zone.h"
12 :
13 : namespace v8 {
14 : namespace internal {
15 :
16 : struct RegExpCompileData;
17 :
18 : // A BufferedZoneList is an automatically growing list, just like (and backed
19 : // by) a ZoneList, that is optimized for the case of adding and removing
20 : // a single element. The last element added is stored outside the backing list,
21 : // and if no more than one element is ever added, the ZoneList isn't even
22 : // allocated.
23 : // Elements must not be nullptr pointers.
24 : template <typename T, int initial_size>
25 : class BufferedZoneList {
26 : public:
27 1486350 : BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28 :
29 : // Adds element at end of list. This element is buffered and can
30 : // be read using last() or removed using RemoveLast until a new Add or until
31 : // RemoveLast or GetList has been called.
32 5439958 : void Add(T* value, Zone* zone) {
33 5439958 : if (last_ != nullptr) {
34 2072408 : if (list_ == nullptr) {
35 109006 : list_ = new (zone) ZoneList<T*>(initial_size, zone);
36 : }
37 2072408 : list_->Add(last_, zone);
38 : }
39 5439958 : last_ = value;
40 5439958 : }
41 :
42 : T* last() {
43 : DCHECK(last_ != nullptr);
44 : return last_;
45 : }
46 :
47 23771 : T* RemoveLast() {
48 : DCHECK(last_ != nullptr);
49 23771 : T* result = last_;
50 38106 : if ((list_ != nullptr) && (list_->length() > 0))
51 14335 : last_ = list_->RemoveLast();
52 : else
53 9436 : last_ = nullptr;
54 23771 : return result;
55 : }
56 :
57 : T* Get(int i) {
58 : DCHECK((0 <= i) && (i < length()));
59 107006 : if (list_ == nullptr) {
60 : DCHECK_EQ(0, i);
61 0 : return last_;
62 : } else {
63 107006 : if (i == list_->length()) {
64 : DCHECK(last_ != nullptr);
65 37536 : return last_;
66 : } else {
67 69470 : return list_->at(i);
68 : }
69 : }
70 : }
71 :
72 : void Clear() {
73 2204426 : list_ = nullptr;
74 2204426 : last_ = nullptr;
75 : }
76 :
77 : int length() {
78 7368369 : int length = (list_ == nullptr) ? 0 : list_->length();
79 7368369 : return length + ((last_ == nullptr) ? 0 : 1);
80 : }
81 :
82 62326 : ZoneList<T*>* GetList(Zone* zone) {
83 62326 : if (list_ == nullptr) {
84 0 : list_ = new (zone) ZoneList<T*>(initial_size, zone);
85 : }
86 62326 : if (last_ != nullptr) {
87 62326 : list_->Add(last_, zone);
88 62326 : last_ = nullptr;
89 : }
90 62326 : return list_;
91 : }
92 :
93 : private:
94 : ZoneList<T*>* list_;
95 : T* last_;
96 : };
97 :
98 :
99 : // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 : class RegExpBuilder : public ZoneObject {
101 : public:
102 : RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103 : void AddCharacter(uc16 character);
104 : void AddUnicodeCharacter(uc32 character);
105 : void AddEscapedUnicodeCharacter(uc32 character);
106 : // "Adds" an empty expression. Does nothing except consume a
107 : // following quantifier
108 : void AddEmpty();
109 : void AddCharacterClass(RegExpCharacterClass* cc);
110 : void AddCharacterClassForDesugaring(uc32 c);
111 : void AddAtom(RegExpTree* tree);
112 : void AddTerm(RegExpTree* tree);
113 : void AddAssertion(RegExpTree* tree);
114 : void NewAlternative(); // '|'
115 : bool AddQuantifierToAtom(int min, int max,
116 : RegExpQuantifier::QuantifierType type);
117 : void FlushText();
118 : RegExpTree* ToRegExp();
119 : JSRegExp::Flags flags() const { return flags_; }
120 1075 : void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
121 :
122 : bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
123 : bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
124 : bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
125 :
126 : private:
127 : static const uc16 kNoPendingSurrogate = 0;
128 : void AddLeadSurrogate(uc16 lead_surrogate);
129 : void AddTrailSurrogate(uc16 trail_surrogate);
130 : void FlushPendingSurrogate();
131 : void FlushCharacters();
132 : void FlushTerms();
133 : bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
134 : bool NeedsDesugaringForIgnoreCase(uc32 c);
135 : Zone* zone() const { return zone_; }
136 : bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
137 :
138 : Zone* zone_;
139 : bool pending_empty_;
140 : JSRegExp::Flags flags_;
141 : ZoneList<uc16>* characters_;
142 : uc16 pending_surrogate_;
143 : BufferedZoneList<RegExpTree, 2> terms_;
144 : BufferedZoneList<RegExpTree, 2> text_;
145 : BufferedZoneList<RegExpTree, 2> alternatives_;
146 : #ifdef DEBUG
147 : enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
148 : #define LAST(x) last_added_ = x;
149 : #else
150 : #define LAST(x)
151 : #endif
152 : };
153 :
154 : class V8_EXPORT_PRIVATE RegExpParser {
155 : public:
156 : RegExpParser(FlatStringReader* in, Handle<String>* error,
157 : JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
158 :
159 : static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
160 : JSRegExp::Flags flags, RegExpCompileData* result);
161 :
162 : RegExpTree* ParsePattern();
163 : RegExpTree* ParseDisjunction();
164 : RegExpTree* ParseGroup();
165 :
166 : // Parses a {...,...} quantifier and stores the range in the given
167 : // out parameters.
168 : bool ParseIntervalQuantifier(int* min_out, int* max_out);
169 :
170 : // Parses and returns a single escaped character. The character
171 : // must not be 'b' or 'B' since they are usually handle specially.
172 : uc32 ParseClassCharacterEscape();
173 :
174 : // Checks whether the following is a length-digit hexadecimal number,
175 : // and sets the value if it is.
176 : bool ParseHexEscape(int length, uc32* value);
177 : bool ParseUnicodeEscape(uc32* value);
178 : bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
179 :
180 : bool ParsePropertyClassName(std::vector<char>* name_1,
181 : std::vector<char>* name_2);
182 : bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to, bool negate,
183 : const std::vector<char>& name_1,
184 : const std::vector<char>& name_2);
185 :
186 : RegExpTree* GetPropertySequence(const std::vector<char>& name_1);
187 : RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
188 :
189 : uc32 ParseOctalLiteral();
190 :
191 : // Tries to parse the input as a back reference. If successful it
192 : // stores the result in the output parameter and returns true. If
193 : // it fails it will push back the characters read so the same characters
194 : // can be reparsed.
195 : bool ParseBackReferenceIndex(int* index_out);
196 :
197 : // Parse inside a class. Either add escaped class to the range, or return
198 : // false and pass parsed single character through |char_out|.
199 : void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
200 : bool add_unicode_case_equivalents, uc32* char_out,
201 : bool* is_class_escape);
202 :
203 : char ParseClassEscape();
204 :
205 : RegExpTree* ReportError(Vector<const char> message);
206 : void Advance();
207 : void Advance(int dist);
208 : void Reset(int pos);
209 :
210 : // Reports whether the pattern might be used as a literal search string.
211 : // Only use if the result of the parse is a single atom node.
212 : bool simple();
213 : bool contains_anchor() { return contains_anchor_; }
214 6184 : void set_contains_anchor() { contains_anchor_ = true; }
215 : int captures_started() { return captures_started_; }
216 96289 : int position() { return next_pos_ - 1; }
217 : bool failed() { return failed_; }
218 : // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
219 : // just read the initial flag value here.
220 : bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
221 :
222 : static bool IsSyntaxCharacterOrSlash(uc32 c);
223 :
224 : static const int kMaxCaptures = 1 << 16;
225 : static const uc32 kEndMarker = (1 << 21);
226 :
227 : private:
228 : enum SubexpressionType {
229 : INITIAL,
230 : CAPTURE, // All positive values represent captures.
231 : POSITIVE_LOOKAROUND,
232 : NEGATIVE_LOOKAROUND,
233 : GROUPING
234 : };
235 :
236 : class RegExpParserState : public ZoneObject {
237 : public:
238 : // Push a state on the stack.
239 1486200 : RegExpParserState(RegExpParserState* previous_state,
240 : SubexpressionType group_type,
241 : RegExpLookaround::Type lookaround_type,
242 : int disjunction_capture_index,
243 : const ZoneVector<uc16>* capture_name,
244 : JSRegExp::Flags flags, Zone* zone)
245 : : previous_state_(previous_state),
246 : builder_(new (zone) RegExpBuilder(zone, flags)),
247 : group_type_(group_type),
248 : lookaround_type_(lookaround_type),
249 : disjunction_capture_index_(disjunction_capture_index),
250 2972400 : capture_name_(capture_name) {}
251 : // Parser state of containing expression, if any.
252 : RegExpParserState* previous_state() const { return previous_state_; }
253 : bool IsSubexpression() { return previous_state_ != nullptr; }
254 : // RegExpBuilder building this regexp's AST.
255 : RegExpBuilder* builder() const { return builder_; }
256 : // Type of regexp being parsed (parenthesized group or entire regexp).
257 : SubexpressionType group_type() const { return group_type_; }
258 : // Lookahead or Lookbehind.
259 : RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
260 : // Index in captures array of first capture in this sub-expression, if any.
261 : // Also the capture index of this sub-expression itself, if group_type
262 : // is CAPTURE.
263 : int capture_index() const { return disjunction_capture_index_; }
264 : // The name of the current sub-expression, if group_type is CAPTURE. Only
265 : // used for named captures.
266 : const ZoneVector<uc16>* capture_name() const { return capture_name_; }
267 :
268 : bool IsNamedCapture() const { return capture_name_ != nullptr; }
269 :
270 : // Check whether the parser is inside a capture group with the given index.
271 : bool IsInsideCaptureGroup(int index);
272 : // Check whether the parser is inside a capture group with the given name.
273 : bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
274 :
275 : private:
276 : // Linked list implementation of stack of states.
277 : RegExpParserState* const previous_state_;
278 : // Builder for the stored disjunction.
279 : RegExpBuilder* const builder_;
280 : // Stored disjunction type (capture, look-ahead or grouping), if any.
281 : const SubexpressionType group_type_;
282 : // Stored read direction.
283 : const RegExpLookaround::Type lookaround_type_;
284 : // Stored disjunction's capture index (if any).
285 : const int disjunction_capture_index_;
286 : // Stored capture name (if any).
287 : const ZoneVector<uc16>* const capture_name_;
288 : };
289 :
290 : // Return the 1-indexed RegExpCapture object, allocate if necessary.
291 : RegExpCapture* GetCapture(int index);
292 :
293 : // Creates a new named capture at the specified index. Must be called exactly
294 : // once for each named capture. Fails if a capture with the same name is
295 : // encountered.
296 : bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
297 :
298 : // Parses the name of a capture group (?<name>pattern). The name must adhere
299 : // to IdentifierName in the ECMAScript standard.
300 : const ZoneVector<uc16>* ParseCaptureGroupName();
301 :
302 : bool ParseNamedBackReference(RegExpBuilder* builder,
303 : RegExpParserState* state);
304 : RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
305 :
306 : // After the initial parsing pass, patch corresponding RegExpCapture objects
307 : // into all RegExpBackReferences. This is done after initial parsing in order
308 : // to avoid complicating cases in which references comes before the capture.
309 : void PatchNamedBackReferences();
310 :
311 : Handle<FixedArray> CreateCaptureNameMap();
312 :
313 : // Returns true iff the pattern contains named captures. May call
314 : // ScanForCaptures to look ahead at the remaining pattern.
315 : bool HasNamedCaptures();
316 :
317 : Isolate* isolate() { return isolate_; }
318 : Zone* zone() const { return zone_; }
319 :
320 : uc32 current() { return current_; }
321 : bool has_more() { return has_more_; }
322 389298900 : bool has_next() { return next_pos_ < in()->length(); }
323 : uc32 Next();
324 : template <bool update_position>
325 : uc32 ReadNext();
326 : FlatStringReader* in() { return in_; }
327 : void ScanForCaptures();
328 :
329 : Isolate* isolate_;
330 : Zone* zone_;
331 : Handle<String>* error_;
332 : ZoneList<RegExpCapture*>* captures_;
333 : ZoneList<RegExpCapture*>* named_captures_;
334 : ZoneList<RegExpBackReference*>* named_back_references_;
335 : FlatStringReader* in_;
336 : uc32 current_;
337 : // These are the flags specified outside the regexp syntax ie after the
338 : // terminating '/' or in the second argument to the constructor. The current
339 : // flags are stored on the RegExpBuilder.
340 : JSRegExp::Flags top_level_flags_;
341 : int next_pos_;
342 : int captures_started_;
343 : int capture_count_; // Only valid after we have scanned for captures.
344 : bool has_more_;
345 : bool simple_;
346 : bool contains_anchor_;
347 : bool is_scanned_for_captures_;
348 : bool has_named_captures_; // Only valid after we have scanned for captures.
349 : bool failed_;
350 : };
351 :
352 : } // namespace internal
353 : } // namespace v8
354 :
355 : #endif // V8_REGEXP_REGEXP_PARSER_H_
|