Line | Count | Source |
1 | | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Rewrite POSIX and other features in re |
6 | | // to use simple extended regular expression features. |
7 | | // Also sort and simplify character classes. |
8 | | |
9 | | #include <stddef.h> |
10 | | |
11 | | #include <algorithm> |
12 | | #include <string> |
13 | | |
14 | | #include "absl/log/absl_log.h" |
15 | | #include "absl/strings/string_view.h" |
16 | | #include "re2/pod_array.h" |
17 | | #include "re2/regexp.h" |
18 | | #include "re2/walker-inl.h" |
19 | | #include "util/utf.h" |
20 | | |
21 | | namespace re2 { |
22 | | |
23 | | // Parses the regexp src and then simplifies it and sets *dst to the |
24 | | // string representation of the simplified form. Returns true on success. |
25 | | // Returns false and sets *error (if error != NULL) on error. |
26 | | bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags, |
27 | 0 | std::string* dst, RegexpStatus* status) { |
28 | 0 | Regexp* re = Parse(src, flags, status); |
29 | 0 | if (re == NULL) |
30 | 0 | return false; |
31 | 0 | Regexp* sre = re->Simplify(); |
32 | 0 | re->Decref(); |
33 | 0 | if (sre == NULL) { |
34 | 0 | if (status) { |
35 | 0 | status->set_code(kRegexpInternalError); |
36 | 0 | status->set_error_arg(src); |
37 | 0 | } |
38 | 0 | return false; |
39 | 0 | } |
40 | 0 | *dst = sre->ToString(); |
41 | 0 | sre->Decref(); |
42 | 0 | return true; |
43 | 0 | } |
44 | | |
45 | | // Assuming the simple_ flags on the children are accurate, |
46 | | // is this Regexp* simple? |
47 | 923k | bool Regexp::ComputeSimple() { |
48 | 923k | Regexp** subs; |
49 | 923k | switch (op_) { |
50 | 4.91k | case kRegexpNoMatch: |
51 | 63.0k | case kRegexpEmptyMatch: |
52 | 402k | case kRegexpLiteral: |
53 | 404k | case kRegexpLiteralString: |
54 | 413k | case kRegexpBeginLine: |
55 | 425k | case kRegexpEndLine: |
56 | 443k | case kRegexpBeginText: |
57 | 445k | case kRegexpWordBoundary: |
58 | 446k | case kRegexpNoWordBoundary: |
59 | 486k | case kRegexpEndText: |
60 | 491k | case kRegexpAnyChar: |
61 | 494k | case kRegexpAnyByte: |
62 | 494k | case kRegexpHaveMatch: |
63 | 494k | return true; |
64 | 64.1k | case kRegexpConcat: |
65 | 126k | case kRegexpAlternate: |
66 | | // These are simple as long as the subpieces are simple. |
67 | 126k | subs = sub(); |
68 | 767k | for (int i = 0; i < nsub_; i++) |
69 | 677k | if (!subs[i]->simple()) |
70 | 37.1k | return false; |
71 | 89.4k | return true; |
72 | 66.3k | case kRegexpCharClass: |
73 | | // Simple as long as the char class is not empty, not full. |
74 | 66.3k | if (ccb_ != NULL) |
75 | 58.1k | return !ccb_->empty() && !ccb_->full(); |
76 | 8.25k | return !cc_->empty() && !cc_->full(); |
77 | 87.1k | case kRegexpCapture: |
78 | 87.1k | subs = sub(); |
79 | 87.1k | return subs[0]->simple(); |
80 | 41.1k | case kRegexpStar: |
81 | 57.1k | case kRegexpPlus: |
82 | 100k | case kRegexpQuest: |
83 | 100k | subs = sub(); |
84 | 100k | if (!subs[0]->simple()) |
85 | 6.00k | return false; |
86 | 94.1k | switch (subs[0]->op_) { |
87 | 12 | case kRegexpStar: |
88 | 29 | case kRegexpPlus: |
89 | 131 | case kRegexpQuest: |
90 | 422 | case kRegexpEmptyMatch: |
91 | 2.22k | case kRegexpNoMatch: |
92 | 2.22k | return false; |
93 | 91.8k | default: |
94 | 91.8k | break; |
95 | 94.1k | } |
96 | 91.8k | return true; |
97 | 48.8k | case kRegexpRepeat: |
98 | 48.8k | return false; |
99 | 923k | } |
100 | 923k | ABSL_LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; |
101 | 0 | return false; |
102 | 0 | } |
103 | | |
104 | | // Walker subclass used by Simplify. |
105 | | // Coalesces runs of star/plus/quest/repeat of the same literal along with any |
106 | | // occurrences of that literal into repeats of that literal. It also works for |
107 | | // char classes, any char and any byte. |
108 | | // PostVisit creates the coalesced result, which should then be simplified. |
109 | | class CoalesceWalker : public Regexp::Walker<Regexp*> { |
110 | | public: |
111 | 50.4k | CoalesceWalker() {} |
112 | | virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, |
113 | | Regexp** child_args, int nchild_args); |
114 | | virtual Regexp* Copy(Regexp* re); |
115 | | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
116 | | |
117 | | private: |
118 | | // These functions are declared inside CoalesceWalker so that |
119 | | // they can edit the private fields of the Regexps they construct. |
120 | | |
121 | | // Returns true if r1 and r2 can be coalesced. In particular, ensures that |
122 | | // the parse flags are consistent. (They will not be checked again later.) |
123 | | static bool CanCoalesce(Regexp* r1, Regexp* r2); |
124 | | |
125 | | // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards |
126 | | // will be empty match and the coalesced op. In other cases, where part of a |
127 | | // literal string was removed to be coalesced, the array elements afterwards |
128 | | // will be the coalesced op and the remainder of the literal string. |
129 | | static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); |
130 | | |
131 | | CoalesceWalker(const CoalesceWalker&) = delete; |
132 | | CoalesceWalker& operator=(const CoalesceWalker&) = delete; |
133 | | }; |
134 | | |
135 | | // Walker subclass used by Simplify. |
136 | | // The simplify walk is purely post-recursive: given the simplified children, |
137 | | // PostVisit creates the simplified result. |
138 | | // The child_args are simplified Regexp*s. |
139 | | class SimplifyWalker : public Regexp::Walker<Regexp*> { |
140 | | public: |
141 | 50.4k | SimplifyWalker() {} |
142 | | virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); |
143 | | virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, |
144 | | Regexp** child_args, int nchild_args); |
145 | | virtual Regexp* Copy(Regexp* re); |
146 | | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
147 | | |
148 | | private: |
149 | | // These functions are declared inside SimplifyWalker so that |
150 | | // they can edit the private fields of the Regexps they construct. |
151 | | |
152 | | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
153 | | // Caller must Decref return value when done with it. |
154 | | static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); |
155 | | |
156 | | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
157 | | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
158 | | // Caller must Decref return value when done with it. |
159 | | static Regexp* SimplifyRepeat(Regexp* re, int min, int max, |
160 | | Regexp::ParseFlags parse_flags); |
161 | | |
162 | | // Simplifies a character class by expanding any named classes |
163 | | // into rune ranges. Does not edit re. Does not consume ref to re. |
164 | | // Caller must Decref return value when done with it. |
165 | | static Regexp* SimplifyCharClass(Regexp* re); |
166 | | |
167 | | SimplifyWalker(const SimplifyWalker&) = delete; |
168 | | SimplifyWalker& operator=(const SimplifyWalker&) = delete; |
169 | | }; |
170 | | |
171 | | // Simplifies a regular expression, returning a new regexp. |
172 | | // The new regexp uses traditional Unix egrep features only, |
173 | | // plus the Perl (?:) non-capturing parentheses. |
174 | | // Otherwise, no POSIX or Perl additions. The new regexp |
175 | | // captures exactly the same subexpressions (with the same indices) |
176 | | // as the original. |
177 | | // Does not edit current object. |
178 | | // Caller must Decref() return value when done with it. |
179 | | |
180 | 50.4k | Regexp* Regexp::Simplify() { |
181 | 50.4k | CoalesceWalker cw; |
182 | 50.4k | Regexp* cre = cw.Walk(this, NULL); |
183 | 50.4k | if (cre == NULL) |
184 | 0 | return NULL; |
185 | 50.4k | if (cw.stopped_early()) { |
186 | 0 | cre->Decref(); |
187 | 0 | return NULL; |
188 | 0 | } |
189 | 50.4k | SimplifyWalker sw; |
190 | 50.4k | Regexp* sre = sw.Walk(cre, NULL); |
191 | 50.4k | cre->Decref(); |
192 | 50.4k | if (sre == NULL) |
193 | 0 | return NULL; |
194 | 50.4k | if (sw.stopped_early()) { |
195 | 0 | sre->Decref(); |
196 | 0 | return NULL; |
197 | 0 | } |
198 | 50.4k | return sre; |
199 | 50.4k | } |
200 | | |
201 | | #define Simplify DontCallSimplify // Avoid accidental recursion |
202 | | |
203 | | // Utility function for PostVisit implementations that compares re->sub() with |
204 | | // child_args to determine whether any child_args changed. In the common case, |
205 | | // where nothing changed, calls Decref() for all child_args and returns false, |
206 | | // so PostVisit must return re->Incref(). Otherwise, returns true. |
207 | 593k | static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { |
208 | 1.72M | for (int i = 0; i < re->nsub(); i++) { |
209 | 1.22M | Regexp* sub = re->sub()[i]; |
210 | 1.22M | Regexp* newsub = child_args[i]; |
211 | 1.22M | if (newsub != sub) |
212 | 97.6k | return true; |
213 | 1.22M | } |
214 | 1.55M | for (int i = 0; i < re->nsub(); i++) { |
215 | 1.06M | Regexp* newsub = child_args[i]; |
216 | 1.06M | newsub->Decref(); |
217 | 1.06M | } |
218 | 495k | return false; |
219 | 593k | } |
220 | | |
221 | 0 | Regexp* CoalesceWalker::Copy(Regexp* re) { |
222 | 0 | return re->Incref(); |
223 | 0 | } |
224 | | |
225 | 0 | Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
226 | | // Should never be called: we use Walk(), not WalkExponential(). |
227 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
228 | | ABSL_LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; |
229 | | #endif |
230 | 0 | return re->Incref(); |
231 | 0 | } |
232 | | |
233 | | Regexp* CoalesceWalker::PostVisit(Regexp* re, |
234 | | Regexp* parent_arg, |
235 | | Regexp* pre_arg, |
236 | | Regexp** child_args, |
237 | 1.25M | int nchild_args) { |
238 | 1.25M | if (re->nsub() == 0) |
239 | 740k | return re->Incref(); |
240 | | |
241 | 512k | if (re->op() != kRegexpConcat) { |
242 | 399k | if (!ChildArgsChanged(re, child_args)) |
243 | 369k | return re->Incref(); |
244 | | |
245 | | // Something changed. Build a new op. |
246 | 29.6k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
247 | 29.6k | nre->AllocSub(re->nsub()); |
248 | 29.6k | Regexp** nre_subs = nre->sub(); |
249 | 89.6k | for (int i = 0; i < re->nsub(); i++) |
250 | 60.0k | nre_subs[i] = child_args[i]; |
251 | | // Repeats and Captures have additional data that must be copied. |
252 | 29.6k | if (re->op() == kRegexpRepeat) { |
253 | 1.74k | nre->min_ = re->min(); |
254 | 1.74k | nre->max_ = re->max(); |
255 | 27.8k | } else if (re->op() == kRegexpCapture) { |
256 | 10.3k | nre->cap_ = re->cap(); |
257 | 10.3k | } |
258 | 29.6k | return nre; |
259 | 399k | } |
260 | | |
261 | 112k | bool can_coalesce = false; |
262 | 607k | for (int i = 0; i < re->nsub(); i++) { |
263 | 502k | if (i+1 < re->nsub() && |
264 | 396k | CanCoalesce(child_args[i], child_args[i+1])) { |
265 | 7.18k | can_coalesce = true; |
266 | 7.18k | break; |
267 | 7.18k | } |
268 | 502k | } |
269 | 112k | if (!can_coalesce) { |
270 | 105k | if (!ChildArgsChanged(re, child_args)) |
271 | 91.3k | return re->Incref(); |
272 | | |
273 | | // Something changed. Build a new op. |
274 | 14.3k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
275 | 14.3k | nre->AllocSub(re->nsub()); |
276 | 14.3k | Regexp** nre_subs = nre->sub(); |
277 | 51.2k | for (int i = 0; i < re->nsub(); i++) |
278 | 36.9k | nre_subs[i] = child_args[i]; |
279 | 14.3k | return nre; |
280 | 105k | } |
281 | | |
282 | 156k | for (int i = 0; i < re->nsub(); i++) { |
283 | 149k | if (i+1 < re->nsub() && |
284 | 141k | CanCoalesce(child_args[i], child_args[i+1])) |
285 | 91.0k | DoCoalesce(&child_args[i], &child_args[i+1]); |
286 | 149k | } |
287 | | // Determine how many empty matches were left by DoCoalesce. |
288 | 7.18k | int n = 0; |
289 | 156k | for (int i = n; i < re->nsub(); i++) { |
290 | 149k | if (child_args[i]->op() == kRegexpEmptyMatch) |
291 | 79.3k | n++; |
292 | 149k | } |
293 | | // Build a new op. |
294 | 7.18k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
295 | 7.18k | nre->AllocSub(re->nsub() - n); |
296 | 7.18k | Regexp** nre_subs = nre->sub(); |
297 | 156k | for (int i = 0, j = 0; i < re->nsub(); i++) { |
298 | 149k | if (child_args[i]->op() == kRegexpEmptyMatch) { |
299 | 79.3k | child_args[i]->Decref(); |
300 | 79.3k | continue; |
301 | 79.3k | } |
302 | 69.7k | nre_subs[j] = child_args[i]; |
303 | 69.7k | j++; |
304 | 69.7k | } |
305 | 7.18k | return nre; |
306 | 112k | } |
307 | | |
308 | 538k | bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { |
309 | | // r1 must be a star/plus/quest/repeat of a literal, char class, any char or |
310 | | // any byte. |
311 | 538k | if ((r1->op() == kRegexpStar || |
312 | 513k | r1->op() == kRegexpPlus || |
313 | 497k | r1->op() == kRegexpQuest || |
314 | 467k | r1->op() == kRegexpRepeat) && |
315 | 184k | (r1->sub()[0]->op() == kRegexpLiteral || |
316 | 56.2k | r1->sub()[0]->op() == kRegexpCharClass || |
317 | 46.6k | r1->sub()[0]->op() == kRegexpAnyChar || |
318 | 140k | r1->sub()[0]->op() == kRegexpAnyByte)) { |
319 | | // r2 must be a star/plus/quest/repeat of the same literal, char class, |
320 | | // any char or any byte. |
321 | 140k | if ((r2->op() == kRegexpStar || |
322 | 134k | r2->op() == kRegexpPlus || |
323 | 128k | r2->op() == kRegexpQuest || |
324 | 82.7k | r2->op() == kRegexpRepeat) && |
325 | 82.0k | Regexp::Equal(r1->sub()[0], r2->sub()[0]) && |
326 | | // The parse flags must be consistent. |
327 | 66.2k | ((r1->parse_flags() & Regexp::NonGreedy) == |
328 | 66.2k | (r2->parse_flags() & Regexp::NonGreedy))) { |
329 | 65.8k | return true; |
330 | 65.8k | } |
331 | | // ... OR an occurrence of that literal, char class, any char or any byte |
332 | 74.3k | if (Regexp::Equal(r1->sub()[0], r2)) { |
333 | 14.0k | return true; |
334 | 14.0k | } |
335 | | // ... OR a literal string that begins with that literal. |
336 | 60.2k | if (r1->sub()[0]->op() == kRegexpLiteral && |
337 | 51.5k | r2->op() == kRegexpLiteralString && |
338 | 31.1k | r2->runes()[0] == r1->sub()[0]->rune() && |
339 | | // The parse flags must be consistent. |
340 | 18.5k | ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == |
341 | 18.5k | (r2->parse_flags() & Regexp::FoldCase))) { |
342 | 18.3k | return true; |
343 | 18.3k | } |
344 | 60.2k | } |
345 | 440k | return false; |
346 | 538k | } |
347 | | |
348 | 91.0k | void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { |
349 | 91.0k | Regexp* r1 = *r1ptr; |
350 | 91.0k | Regexp* r2 = *r2ptr; |
351 | | |
352 | 91.0k | Regexp* nre = Regexp::Repeat( |
353 | 91.0k | r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); |
354 | | |
355 | 91.0k | switch (r1->op()) { |
356 | 2.88k | case kRegexpStar: |
357 | 2.88k | nre->min_ = 0; |
358 | 2.88k | nre->max_ = -1; |
359 | 2.88k | break; |
360 | | |
361 | 3.59k | case kRegexpPlus: |
362 | 3.59k | nre->min_ = 1; |
363 | 3.59k | nre->max_ = -1; |
364 | 3.59k | break; |
365 | | |
366 | 12.7k | case kRegexpQuest: |
367 | 12.7k | nre->min_ = 0; |
368 | 12.7k | nre->max_ = 1; |
369 | 12.7k | break; |
370 | | |
371 | 71.8k | case kRegexpRepeat: |
372 | 71.8k | nre->min_ = r1->min(); |
373 | 71.8k | nre->max_ = r1->max(); |
374 | 71.8k | break; |
375 | | |
376 | 0 | default: |
377 | 0 | nre->Decref(); |
378 | 0 | ABSL_LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); |
379 | 0 | return; |
380 | 91.0k | } |
381 | | |
382 | 91.0k | switch (r2->op()) { |
383 | 1.52k | case kRegexpStar: |
384 | 1.52k | nre->max_ = -1; |
385 | 1.52k | goto LeaveEmpty; |
386 | | |
387 | 2.75k | case kRegexpPlus: |
388 | 2.75k | nre->min_++; |
389 | 2.75k | nre->max_ = -1; |
390 | 2.75k | goto LeaveEmpty; |
391 | | |
392 | 39.7k | case kRegexpQuest: |
393 | 39.7k | if (nre->max() != -1) |
394 | 39.3k | nre->max_++; |
395 | 39.7k | goto LeaveEmpty; |
396 | | |
397 | 19.9k | case kRegexpRepeat: |
398 | 19.9k | nre->min_ += r2->min(); |
399 | 19.9k | if (r2->max() == -1) |
400 | 310 | nre->max_ = -1; |
401 | 19.6k | else if (nre->max() != -1) |
402 | 18.9k | nre->max_ += r2->max(); |
403 | 19.9k | goto LeaveEmpty; |
404 | | |
405 | 9.10k | case kRegexpLiteral: |
406 | 10.3k | case kRegexpCharClass: |
407 | 10.6k | case kRegexpAnyChar: |
408 | 11.1k | case kRegexpAnyByte: |
409 | 11.1k | nre->min_++; |
410 | 11.1k | if (nre->max() != -1) |
411 | 6.56k | nre->max_++; |
412 | 11.1k | goto LeaveEmpty; |
413 | | |
414 | 79.2k | LeaveEmpty: |
415 | 79.2k | *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); |
416 | 79.2k | *r2ptr = nre; |
417 | 79.2k | break; |
418 | | |
419 | 16.0k | case kRegexpLiteralString: { |
420 | 16.0k | Rune r = r1->sub()[0]->rune(); |
421 | | // Determine how much of the literal string is removed. |
422 | | // We know that we have at least one rune. :) |
423 | 16.0k | int n = 1; |
424 | 35.8k | while (n < r2->nrunes() && r2->runes()[n] == r) |
425 | 19.8k | n++; |
426 | 16.0k | nre->min_ += n; |
427 | 16.0k | if (nre->max() != -1) |
428 | 12.6k | nre->max_ += n; |
429 | 16.0k | if (n == r2->nrunes()) |
430 | 4.21k | goto LeaveEmpty; |
431 | 11.8k | *r1ptr = nre; |
432 | 11.8k | *r2ptr = Regexp::LiteralString( |
433 | 11.8k | &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); |
434 | 11.8k | break; |
435 | 16.0k | } |
436 | | |
437 | 0 | default: |
438 | 0 | nre->Decref(); |
439 | 0 | ABSL_LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); |
440 | 0 | return; |
441 | 91.0k | } |
442 | | |
443 | 91.0k | r1->Decref(); |
444 | 91.0k | r2->Decref(); |
445 | 91.0k | } |
446 | | |
447 | 0 | Regexp* SimplifyWalker::Copy(Regexp* re) { |
448 | 0 | return re->Incref(); |
449 | 0 | } |
450 | | |
451 | 0 | Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
452 | | // Should never be called: we use Walk(), not WalkExponential(). |
453 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
454 | | ABSL_LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; |
455 | | #endif |
456 | 0 | return re->Incref(); |
457 | 0 | } |
458 | | |
459 | 520k | Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { |
460 | 520k | if (re->simple()) { |
461 | 284k | *stop = true; |
462 | 284k | return re->Incref(); |
463 | 284k | } |
464 | 236k | return NULL; |
465 | 520k | } |
466 | | |
467 | | Regexp* SimplifyWalker::PostVisit(Regexp* re, |
468 | | Regexp* parent_arg, |
469 | | Regexp* pre_arg, |
470 | | Regexp** child_args, |
471 | 236k | int nchild_args) { |
472 | 236k | switch (re->op()) { |
473 | 0 | case kRegexpNoMatch: |
474 | 4.02k | case kRegexpEmptyMatch: |
475 | 12.9k | case kRegexpLiteral: |
476 | 22.1k | case kRegexpLiteralString: |
477 | 22.1k | case kRegexpBeginLine: |
478 | 22.1k | case kRegexpEndLine: |
479 | 22.1k | case kRegexpBeginText: |
480 | 22.1k | case kRegexpWordBoundary: |
481 | 22.1k | case kRegexpNoWordBoundary: |
482 | 22.1k | case kRegexpEndText: |
483 | 22.1k | case kRegexpAnyChar: |
484 | 22.1k | case kRegexpAnyByte: |
485 | 31.8k | case kRegexpHaveMatch: |
486 | | // All these are always simple. |
487 | 31.8k | re->simple_ = true; |
488 | 31.8k | return re->Incref(); |
489 | | |
490 | 60.2k | case kRegexpConcat: |
491 | 88.4k | case kRegexpAlternate: { |
492 | | // These are simple as long as the subpieces are simple. |
493 | 88.4k | if (!ChildArgsChanged(re, child_args)) { |
494 | 34.8k | re->simple_ = true; |
495 | 34.8k | return re->Incref(); |
496 | 34.8k | } |
497 | 53.6k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
498 | 53.6k | nre->AllocSub(re->nsub()); |
499 | 53.6k | Regexp** nre_subs = nre->sub(); |
500 | 306k | for (int i = 0; i < re->nsub(); i++) |
501 | 253k | nre_subs[i] = child_args[i]; |
502 | 53.6k | nre->simple_ = true; |
503 | 53.6k | return nre; |
504 | 88.4k | } |
505 | | |
506 | 15.5k | case kRegexpCapture: { |
507 | 15.5k | Regexp* newsub = child_args[0]; |
508 | 15.5k | if (newsub == re->sub()[0]) { |
509 | 2.18k | newsub->Decref(); |
510 | 2.18k | re->simple_ = true; |
511 | 2.18k | return re->Incref(); |
512 | 2.18k | } |
513 | 13.3k | Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); |
514 | 13.3k | nre->AllocSub(1); |
515 | 13.3k | nre->sub()[0] = newsub; |
516 | 13.3k | nre->cap_ = re->cap(); |
517 | 13.3k | nre->simple_ = true; |
518 | 13.3k | return nre; |
519 | 15.5k | } |
520 | | |
521 | 9.49k | case kRegexpStar: |
522 | 13.5k | case kRegexpPlus: |
523 | 16.1k | case kRegexpQuest: { |
524 | 16.1k | Regexp* newsub = child_args[0]; |
525 | | // Special case: repeat the empty string as much as |
526 | | // you want, but it's still the empty string. |
527 | 16.1k | if (newsub->op() == kRegexpEmptyMatch) |
528 | 2.68k | return newsub; |
529 | | |
530 | | // These are simple as long as the subpiece is simple. |
531 | 13.4k | if (newsub == re->sub()[0]) { |
532 | 2.34k | newsub->Decref(); |
533 | 2.34k | re->simple_ = true; |
534 | 2.34k | return re->Incref(); |
535 | 2.34k | } |
536 | | |
537 | | // These are also idempotent if flags are constant. |
538 | 11.0k | if (re->op() == newsub->op() && |
539 | 1.67k | re->parse_flags() == newsub->parse_flags()) |
540 | 816 | return newsub; |
541 | | |
542 | 10.2k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
543 | 10.2k | nre->AllocSub(1); |
544 | 10.2k | nre->sub()[0] = newsub; |
545 | 10.2k | nre->simple_ = true; |
546 | 10.2k | return nre; |
547 | 11.0k | } |
548 | | |
549 | 79.9k | case kRegexpRepeat: { |
550 | 79.9k | Regexp* newsub = child_args[0]; |
551 | | // Special case: repeat the empty string as much as |
552 | | // you want, but it's still the empty string. |
553 | 79.9k | if (newsub->op() == kRegexpEmptyMatch) |
554 | 4.52k | return newsub; |
555 | | |
556 | 75.4k | Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, |
557 | 75.4k | re->parse_flags()); |
558 | 75.4k | newsub->Decref(); |
559 | 75.4k | nre->simple_ = true; |
560 | 75.4k | return nre; |
561 | 79.9k | } |
562 | | |
563 | 4.73k | case kRegexpCharClass: { |
564 | 4.73k | Regexp* nre = SimplifyCharClass(re); |
565 | 4.73k | nre->simple_ = true; |
566 | 4.73k | return nre; |
567 | 79.9k | } |
568 | 236k | } |
569 | | |
570 | 236k | ABSL_LOG(ERROR) << "Simplify case not handled: " << re->op(); |
571 | 0 | return re->Incref(); |
572 | 0 | } |
573 | | |
574 | | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
575 | | // Returns a new Regexp, handing the ref to the caller. |
576 | | Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, |
577 | 105k | Regexp::ParseFlags parse_flags) { |
578 | 105k | Regexp* re = new Regexp(kRegexpConcat, parse_flags); |
579 | 105k | re->AllocSub(2); |
580 | 105k | Regexp** subs = re->sub(); |
581 | 105k | subs[0] = re1; |
582 | 105k | subs[1] = re2; |
583 | 105k | return re; |
584 | 105k | } |
585 | | |
586 | | // Returns true if re is an empty-width op. |
587 | 89.3k | static bool IsEmptyOp(Regexp* re) { |
588 | 89.3k | return (re->op() == kRegexpBeginLine || |
589 | 88.5k | re->op() == kRegexpEndLine || |
590 | 87.7k | re->op() == kRegexpWordBoundary || |
591 | 87.3k | re->op() == kRegexpNoWordBoundary || |
592 | 87.0k | re->op() == kRegexpBeginText || |
593 | 85.4k | re->op() == kRegexpEndText); |
594 | 89.3k | } |
595 | | |
596 | | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
597 | | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
598 | | // Caller must Decref return value when done with it. |
599 | | // The result will *not* necessarily have the right capturing parens |
600 | | // if you call ToString() and re-parse it: (x){2} becomes (x)(x), |
601 | | // but in the Regexp* representation, both (x) are marked as $1. |
602 | | Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, |
603 | 75.4k | Regexp::ParseFlags f) { |
604 | | // For an empty-width op OR a concatenation or alternation of empty-width |
605 | | // ops, cap the repetition count at 1. |
606 | 75.4k | if (IsEmptyOp(re) || |
607 | 73.2k | ((re->op() == kRegexpConcat || |
608 | 67.2k | re->op() == kRegexpAlternate) && |
609 | 11.4k | std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) { |
610 | 2.63k | min = std::min(min, 1); |
611 | 2.63k | max = std::min(max, 1); |
612 | 2.63k | } |
613 | | |
614 | | // x{n,} means at least n matches of x. |
615 | 75.4k | if (max == -1) { |
616 | | // Special case: x{0,} is x* |
617 | 10.7k | if (min == 0) |
618 | 3.42k | return Regexp::Star(re->Incref(), f); |
619 | | |
620 | | // Special case: x{1,} is x+ |
621 | 7.36k | if (min == 1) |
622 | 2.70k | return Regexp::Plus(re->Incref(), f); |
623 | | |
624 | | // General case: x{4,} is xxxx+ |
625 | 4.66k | PODArray<Regexp*> nre_subs(min); |
626 | 36.8k | for (int i = 0; i < min-1; i++) |
627 | 32.2k | nre_subs[i] = re->Incref(); |
628 | 4.66k | nre_subs[min-1] = Regexp::Plus(re->Incref(), f); |
629 | 4.66k | return Regexp::Concat(nre_subs.data(), min, f); |
630 | 7.36k | } |
631 | | |
632 | | // Special case: (x){0} matches only empty string. |
633 | 64.6k | if (min == 0 && max == 0) |
634 | 946 | return new Regexp(kRegexpEmptyMatch, f); |
635 | | |
636 | | // Special case: x{1} is just x. |
637 | 63.6k | if (min == 1 && max == 1) |
638 | 4.45k | return re->Incref(); |
639 | | |
640 | | // General case: x{n,m} means n copies of x and m copies of x?. |
641 | | // The machine will do less work if we nest the final m copies, |
642 | | // so that x{2,5} = xx(x(x(x)?)?)? |
643 | | |
644 | | // Build leading prefix: xx. Capturing only on the last one. |
645 | 59.2k | Regexp* nre = NULL; |
646 | 59.2k | if (min > 0) { |
647 | 55.3k | PODArray<Regexp*> nre_subs(min); |
648 | 531k | for (int i = 0; i < min; i++) |
649 | 475k | nre_subs[i] = re->Incref(); |
650 | 55.3k | nre = Regexp::Concat(nre_subs.data(), min, f); |
651 | 55.3k | } |
652 | | |
653 | | // Build and attach suffix: (x(x(x)?)?)? |
654 | 59.2k | if (max > min) { |
655 | 15.6k | Regexp* suf = Regexp::Quest(re->Incref(), f); |
656 | 109k | for (int i = min+1; i < max; i++) |
657 | 93.7k | suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); |
658 | 15.6k | if (nre == NULL) |
659 | 3.92k | nre = suf; |
660 | 11.6k | else |
661 | 11.6k | nre = Concat2(nre, suf, f); |
662 | 15.6k | } |
663 | | |
664 | 59.2k | if (nre == NULL) { |
665 | | // Some degenerate case, like min > max, or min < max < 0. |
666 | | // This shouldn't happen, because the parser rejects such regexps. |
667 | 0 | ABSL_LOG(DFATAL) << "Malformed repeat of " << re->ToString() |
668 | 0 | << " min " << min << " max " << max; |
669 | 0 | return new Regexp(kRegexpNoMatch, f); |
670 | 0 | } |
671 | | |
672 | 59.2k | return nre; |
673 | 59.2k | } |
674 | | |
675 | | // Simplifies a character class. |
676 | | // Caller must Decref return value when done with it. |
677 | 4.73k | Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { |
678 | 4.73k | CharClass* cc = re->cc(); |
679 | | |
680 | | // Special cases |
681 | 4.73k | if (cc->empty()) |
682 | 2.61k | return new Regexp(kRegexpNoMatch, re->parse_flags()); |
683 | 2.11k | if (cc->full()) |
684 | 826 | return new Regexp(kRegexpAnyChar, re->parse_flags()); |
685 | | |
686 | 1.29k | return re->Incref(); |
687 | 2.11k | } |
688 | | |
689 | | } // namespace re2 |