Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Rewrite POSIX and other features in re |
6 | | // to use simple extended regular expression features. |
7 | | // Also sort and simplify character classes. |
8 | | |
9 | | #include <string> |
10 | | |
11 | | #include "util/util.h" |
12 | | #include "util/logging.h" |
13 | | #include "util/utf.h" |
14 | | #include "re2/pod_array.h" |
15 | | #include "re2/regexp.h" |
16 | | #include "re2/walker-inl.h" |
17 | | |
18 | | namespace re2 { |
19 | | |
20 | | // Parses the regexp src and then simplifies it and sets *dst to the |
21 | | // string representation of the simplified form. Returns true on success. |
22 | | // Returns false and sets *error (if error != NULL) on error. |
23 | | bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, |
24 | 0 | std::string* dst, RegexpStatus* status) { |
25 | 0 | Regexp* re = Parse(src, flags, status); |
26 | 0 | if (re == NULL) |
27 | 0 | return false; |
28 | 0 | Regexp* sre = re->Simplify(); |
29 | 0 | re->Decref(); |
30 | 0 | if (sre == NULL) { |
31 | 0 | if (status) { |
32 | 0 | status->set_code(kRegexpInternalError); |
33 | 0 | status->set_error_arg(src); |
34 | 0 | } |
35 | 0 | return false; |
36 | 0 | } |
37 | 0 | *dst = sre->ToString(); |
38 | 0 | sre->Decref(); |
39 | 0 | return true; |
40 | 0 | } |
41 | | |
42 | | // Assuming the simple_ flags on the children are accurate, |
43 | | // is this Regexp* simple? |
44 | 1.10M | bool Regexp::ComputeSimple() { |
45 | 1.10M | Regexp** subs; |
46 | 1.10M | switch (op_) { |
47 | 5.38k | case kRegexpNoMatch: |
48 | 37.8k | case kRegexpEmptyMatch: |
49 | 612k | case kRegexpLiteral: |
50 | 615k | case kRegexpLiteralString: |
51 | 621k | case kRegexpBeginLine: |
52 | 626k | case kRegexpEndLine: |
53 | 647k | case kRegexpBeginText: |
54 | 650k | case kRegexpWordBoundary: |
55 | 651k | case kRegexpNoWordBoundary: |
56 | 664k | case kRegexpEndText: |
57 | 688k | case kRegexpAnyChar: |
58 | 693k | case kRegexpAnyByte: |
59 | 693k | case kRegexpHaveMatch: |
60 | 693k | return true; |
61 | 88.8k | case kRegexpConcat: |
62 | 113k | case kRegexpAlternate: |
63 | | // These are simple as long as the subpieces are simple. |
64 | 113k | subs = sub(); |
65 | 574k | for (int i = 0; i < nsub_; i++) |
66 | 503k | if (!subs[i]->simple()) |
67 | 42.2k | return false; |
68 | 71.1k | return true; |
69 | 142k | case kRegexpCharClass: |
70 | | // Simple as long as the char class is not empty, not full. |
71 | 142k | if (ccb_ != NULL) |
72 | 140k | return !ccb_->empty() && !ccb_->full(); |
73 | 2.22k | return !cc_->empty() && !cc_->full(); |
74 | 47.8k | case kRegexpCapture: |
75 | 47.8k | subs = sub(); |
76 | 47.8k | return subs[0]->simple(); |
77 | 20.1k | case kRegexpStar: |
78 | 43.9k | case kRegexpPlus: |
79 | 72.5k | case kRegexpQuest: |
80 | 72.5k | subs = sub(); |
81 | 72.5k | if (!subs[0]->simple()) |
82 | 4.29k | return false; |
83 | 68.2k | switch (subs[0]->op_) { |
84 | 72 | case kRegexpStar: |
85 | 278 | case kRegexpPlus: |
86 | 576 | case kRegexpQuest: |
87 | 857 | case kRegexpEmptyMatch: |
88 | 2.06k | case kRegexpNoMatch: |
89 | 2.06k | return false; |
90 | 66.2k | default: |
91 | 66.2k | break; |
92 | 68.2k | } |
93 | 66.2k | return true; |
94 | 31.4k | case kRegexpRepeat: |
95 | 31.4k | return false; |
96 | 1.10M | } |
97 | 0 | LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; |
98 | 0 | return false; |
99 | 1.10M | } |
100 | | |
101 | | // Walker subclass used by Simplify. |
102 | | // Coalesces runs of star/plus/quest/repeat of the same literal along with any |
103 | | // occurrences of that literal into repeats of that literal. It also works for |
104 | | // char classes, any char and any byte. |
105 | | // PostVisit creates the coalesced result, which should then be simplified. |
106 | | class CoalesceWalker : public Regexp::Walker<Regexp*> { |
107 | | public: |
108 | 38.9k | CoalesceWalker() {} |
109 | | virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, |
110 | | Regexp** child_args, int nchild_args); |
111 | | virtual Regexp* Copy(Regexp* re); |
112 | | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
113 | | |
114 | | private: |
115 | | // These functions are declared inside CoalesceWalker so that |
116 | | // they can edit the private fields of the Regexps they construct. |
117 | | |
118 | | // Returns true if r1 and r2 can be coalesced. In particular, ensures that |
119 | | // the parse flags are consistent. (They will not be checked again later.) |
120 | | static bool CanCoalesce(Regexp* r1, Regexp* r2); |
121 | | |
122 | | // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards |
123 | | // will be empty match and the coalesced op. In other cases, where part of a |
124 | | // literal string was removed to be coalesced, the array elements afterwards |
125 | | // will be the coalesced op and the remainder of the literal string. |
126 | | static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); |
127 | | |
128 | | CoalesceWalker(const CoalesceWalker&) = delete; |
129 | | CoalesceWalker& operator=(const CoalesceWalker&) = delete; |
130 | | }; |
131 | | |
132 | | // Walker subclass used by Simplify. |
133 | | // The simplify walk is purely post-recursive: given the simplified children, |
134 | | // PostVisit creates the simplified result. |
135 | | // The child_args are simplified Regexp*s. |
136 | | class SimplifyWalker : public Regexp::Walker<Regexp*> { |
137 | | public: |
138 | 38.9k | SimplifyWalker() {} |
139 | | virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); |
140 | | virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, |
141 | | Regexp** child_args, int nchild_args); |
142 | | virtual Regexp* Copy(Regexp* re); |
143 | | virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); |
144 | | |
145 | | private: |
146 | | // These functions are declared inside SimplifyWalker so that |
147 | | // they can edit the private fields of the Regexps they construct. |
148 | | |
149 | | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
150 | | // Caller must Decref return value when done with it. |
151 | | static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); |
152 | | |
153 | | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
154 | | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
155 | | // Caller must Decref return value when done with it. |
156 | | static Regexp* SimplifyRepeat(Regexp* re, int min, int max, |
157 | | Regexp::ParseFlags parse_flags); |
158 | | |
159 | | // Simplifies a character class by expanding any named classes |
160 | | // into rune ranges. Does not edit re. Does not consume ref to re. |
161 | | // Caller must Decref return value when done with it. |
162 | | static Regexp* SimplifyCharClass(Regexp* re); |
163 | | |
164 | | SimplifyWalker(const SimplifyWalker&) = delete; |
165 | | SimplifyWalker& operator=(const SimplifyWalker&) = delete; |
166 | | }; |
167 | | |
168 | | // Simplifies a regular expression, returning a new regexp. |
169 | | // The new regexp uses traditional Unix egrep features only, |
170 | | // plus the Perl (?:) non-capturing parentheses. |
171 | | // Otherwise, no POSIX or Perl additions. The new regexp |
172 | | // captures exactly the same subexpressions (with the same indices) |
173 | | // as the original. |
174 | | // Does not edit current object. |
175 | | // Caller must Decref() return value when done with it. |
176 | | |
177 | 38.9k | Regexp* Regexp::Simplify() { |
178 | 38.9k | CoalesceWalker cw; |
179 | 38.9k | Regexp* cre = cw.Walk(this, NULL); |
180 | 38.9k | if (cre == NULL) |
181 | 0 | return NULL; |
182 | 38.9k | if (cw.stopped_early()) { |
183 | 0 | cre->Decref(); |
184 | 0 | return NULL; |
185 | 0 | } |
186 | 38.9k | SimplifyWalker sw; |
187 | 38.9k | Regexp* sre = sw.Walk(cre, NULL); |
188 | 38.9k | cre->Decref(); |
189 | 38.9k | if (sre == NULL) |
190 | 0 | return NULL; |
191 | 38.9k | if (sw.stopped_early()) { |
192 | 0 | sre->Decref(); |
193 | 0 | return NULL; |
194 | 0 | } |
195 | 38.9k | return sre; |
196 | 38.9k | } |
197 | | |
198 | | #define Simplify DontCallSimplify // Avoid accidental recursion |
199 | | |
200 | | // Utility function for PostVisit implementations that compares re->sub() with |
201 | | // child_args to determine whether any child_args changed. In the common case, |
202 | | // where nothing changed, calls Decref() for all child_args and returns false, |
203 | | // so PostVisit must return re->Incref(). Otherwise, returns true. |
204 | 279k | static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { |
205 | 967k | for (int i = 0; i < re->nsub(); i++) { |
206 | 739k | Regexp* sub = re->sub()[i]; |
207 | 739k | Regexp* newsub = child_args[i]; |
208 | 739k | if (newsub != sub) |
209 | 51.6k | return true; |
210 | 739k | } |
211 | 831k | for (int i = 0; i < re->nsub(); i++) { |
212 | 603k | Regexp* newsub = child_args[i]; |
213 | 603k | newsub->Decref(); |
214 | 603k | } |
215 | 227k | return false; |
216 | 279k | } |
217 | | |
218 | 0 | Regexp* CoalesceWalker::Copy(Regexp* re) { |
219 | 0 | return re->Incref(); |
220 | 0 | } |
221 | | |
222 | 0 | Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
223 | | // Should never be called: we use Walk(), not WalkExponential(). |
224 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
225 | | LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; |
226 | | #endif |
227 | 0 | return re->Incref(); |
228 | 0 | } |
229 | | |
230 | | Regexp* CoalesceWalker::PostVisit(Regexp* re, |
231 | | Regexp* parent_arg, |
232 | | Regexp* pre_arg, |
233 | | Regexp** child_args, |
234 | 666k | int nchild_args) { |
235 | 666k | if (re->nsub() == 0) |
236 | 445k | return re->Incref(); |
237 | | |
238 | 221k | if (re->op() != kRegexpConcat) { |
239 | 139k | if (!ChildArgsChanged(re, child_args)) |
240 | 133k | return re->Incref(); |
241 | | |
242 | | // Something changed. Build a new op. |
243 | 6.72k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
244 | 6.72k | nre->AllocSub(re->nsub()); |
245 | 6.72k | Regexp** nre_subs = nre->sub(); |
246 | 26.8k | for (int i = 0; i < re->nsub(); i++) |
247 | 20.0k | nre_subs[i] = child_args[i]; |
248 | | // Repeats and Captures have additional data that must be copied. |
249 | 6.72k | if (re->op() == kRegexpRepeat) { |
250 | 927 | nre->min_ = re->min(); |
251 | 927 | nre->max_ = re->max(); |
252 | 5.79k | } else if (re->op() == kRegexpCapture) { |
253 | 1.73k | nre->cap_ = re->cap(); |
254 | 1.73k | } |
255 | 6.72k | return nre; |
256 | 139k | } |
257 | | |
258 | 81.2k | bool can_coalesce = false; |
259 | 451k | for (int i = 0; i < re->nsub(); i++) { |
260 | 378k | if (i+1 < re->nsub() && |
261 | 378k | CanCoalesce(child_args[i], child_args[i+1])) { |
262 | 7.98k | can_coalesce = true; |
263 | 7.98k | break; |
264 | 7.98k | } |
265 | 378k | } |
266 | 81.2k | if (!can_coalesce) { |
267 | 73.3k | if (!ChildArgsChanged(re, child_args)) |
268 | 70.0k | return re->Incref(); |
269 | | |
270 | | // Something changed. Build a new op. |
271 | 3.30k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
272 | 3.30k | nre->AllocSub(re->nsub()); |
273 | 3.30k | Regexp** nre_subs = nre->sub(); |
274 | 17.6k | for (int i = 0; i < re->nsub(); i++) |
275 | 14.3k | nre_subs[i] = child_args[i]; |
276 | 3.30k | return nre; |
277 | 73.3k | } |
278 | | |
279 | 76.6k | for (int i = 0; i < re->nsub(); i++) { |
280 | 68.6k | if (i+1 < re->nsub() && |
281 | 68.6k | CanCoalesce(child_args[i], child_args[i+1])) |
282 | 16.7k | DoCoalesce(&child_args[i], &child_args[i+1]); |
283 | 68.6k | } |
284 | | // Determine how many empty matches were left by DoCoalesce. |
285 | 7.98k | int n = 0; |
286 | 76.6k | for (int i = n; i < re->nsub(); i++) { |
287 | 68.6k | if (child_args[i]->op() == kRegexpEmptyMatch) |
288 | 14.0k | n++; |
289 | 68.6k | } |
290 | | // Build a new op. |
291 | 7.98k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
292 | 7.98k | nre->AllocSub(re->nsub() - n); |
293 | 7.98k | Regexp** nre_subs = nre->sub(); |
294 | 76.6k | for (int i = 0, j = 0; i < re->nsub(); i++) { |
295 | 68.6k | if (child_args[i]->op() == kRegexpEmptyMatch) { |
296 | 14.0k | child_args[i]->Decref(); |
297 | 14.0k | continue; |
298 | 14.0k | } |
299 | 54.6k | nre_subs[j] = child_args[i]; |
300 | 54.6k | j++; |
301 | 54.6k | } |
302 | 7.98k | return nre; |
303 | 81.2k | } |
304 | | |
305 | 365k | bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { |
306 | | // r1 must be a star/plus/quest/repeat of a literal, char class, any char or |
307 | | // any byte. |
308 | 365k | if ((r1->op() == kRegexpStar || |
309 | 365k | r1->op() == kRegexpPlus || |
310 | 365k | r1->op() == kRegexpQuest || |
311 | 365k | r1->op() == kRegexpRepeat) && |
312 | 365k | (r1->sub()[0]->op() == kRegexpLiteral || |
313 | 89.3k | r1->sub()[0]->op() == kRegexpCharClass || |
314 | 89.3k | r1->sub()[0]->op() == kRegexpAnyChar || |
315 | 89.3k | r1->sub()[0]->op() == kRegexpAnyByte)) { |
316 | | // r2 must be a star/plus/quest/repeat of the same literal, char class, |
317 | | // any char or any byte. |
318 | 72.4k | if ((r2->op() == kRegexpStar || |
319 | 72.4k | r2->op() == kRegexpPlus || |
320 | 72.4k | r2->op() == kRegexpQuest || |
321 | 72.4k | r2->op() == kRegexpRepeat) && |
322 | 72.4k | Regexp::Equal(r1->sub()[0], r2->sub()[0]) && |
323 | | // The parse flags must be consistent. |
324 | 72.4k | ((r1->parse_flags() & Regexp::NonGreedy) == |
325 | 8.38k | (r2->parse_flags() & Regexp::NonGreedy))) { |
326 | 7.33k | return true; |
327 | 7.33k | } |
328 | | // ... OR an occurrence of that literal, char class, any char or any byte |
329 | 65.1k | if (Regexp::Equal(r1->sub()[0], r2)) { |
330 | 11.7k | return true; |
331 | 11.7k | } |
332 | | // ... OR a literal string that begins with that literal. |
333 | 53.3k | if (r1->sub()[0]->op() == kRegexpLiteral && |
334 | 53.3k | r2->op() == kRegexpLiteralString && |
335 | 53.3k | r2->runes()[0] == r1->sub()[0]->rune() && |
336 | | // The parse flags must be consistent. |
337 | 53.3k | ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == |
338 | 5.76k | (r2->parse_flags() & Regexp::FoldCase))) { |
339 | 5.64k | return true; |
340 | 5.64k | } |
341 | 53.3k | } |
342 | 340k | return false; |
343 | 365k | } |
344 | | |
345 | 16.7k | void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { |
346 | 16.7k | Regexp* r1 = *r1ptr; |
347 | 16.7k | Regexp* r2 = *r2ptr; |
348 | | |
349 | 16.7k | Regexp* nre = Regexp::Repeat( |
350 | 16.7k | r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); |
351 | | |
352 | 16.7k | switch (r1->op()) { |
353 | 2.85k | case kRegexpStar: |
354 | 2.85k | nre->min_ = 0; |
355 | 2.85k | nre->max_ = -1; |
356 | 2.85k | break; |
357 | | |
358 | 3.09k | case kRegexpPlus: |
359 | 3.09k | nre->min_ = 1; |
360 | 3.09k | nre->max_ = -1; |
361 | 3.09k | break; |
362 | | |
363 | 3.23k | case kRegexpQuest: |
364 | 3.23k | nre->min_ = 0; |
365 | 3.23k | nre->max_ = 1; |
366 | 3.23k | break; |
367 | | |
368 | 7.56k | case kRegexpRepeat: |
369 | 7.56k | nre->min_ = r1->min(); |
370 | 7.56k | nre->max_ = r1->max(); |
371 | 7.56k | break; |
372 | | |
373 | 0 | default: |
374 | 0 | nre->Decref(); |
375 | 0 | LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); |
376 | 0 | return; |
377 | 16.7k | } |
378 | | |
379 | 16.7k | switch (r2->op()) { |
380 | 1.02k | case kRegexpStar: |
381 | 1.02k | nre->max_ = -1; |
382 | 1.02k | goto LeaveEmpty; |
383 | | |
384 | 1.07k | case kRegexpPlus: |
385 | 1.07k | nre->min_++; |
386 | 1.07k | nre->max_ = -1; |
387 | 1.07k | goto LeaveEmpty; |
388 | | |
389 | 1.84k | case kRegexpQuest: |
390 | 1.84k | if (nre->max() != -1) |
391 | 1.36k | nre->max_++; |
392 | 1.84k | goto LeaveEmpty; |
393 | | |
394 | 1.04k | case kRegexpRepeat: |
395 | 1.04k | nre->min_ += r2->min(); |
396 | 1.04k | if (r2->max() == -1) |
397 | 268 | nre->max_ = -1; |
398 | 780 | else if (nre->max() != -1) |
399 | 557 | nre->max_ += r2->max(); |
400 | 1.04k | goto LeaveEmpty; |
401 | | |
402 | 1.92k | case kRegexpLiteral: |
403 | 4.83k | case kRegexpCharClass: |
404 | 7.90k | case kRegexpAnyChar: |
405 | 8.37k | case kRegexpAnyByte: |
406 | 8.37k | nre->min_++; |
407 | 8.37k | if (nre->max() != -1) |
408 | 2.30k | nre->max_++; |
409 | 8.37k | goto LeaveEmpty; |
410 | | |
411 | 13.9k | LeaveEmpty: |
412 | 13.9k | *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); |
413 | 13.9k | *r2ptr = nre; |
414 | 13.9k | break; |
415 | | |
416 | 3.38k | case kRegexpLiteralString: { |
417 | 3.38k | Rune r = r1->sub()[0]->rune(); |
418 | | // Determine how much of the literal string is removed. |
419 | | // We know that we have at least one rune. :) |
420 | 3.38k | int n = 1; |
421 | 7.73k | while (n < r2->nrunes() && r2->runes()[n] == r) |
422 | 4.35k | n++; |
423 | 3.38k | nre->min_ += n; |
424 | 3.38k | if (nre->max() != -1) |
425 | 1.40k | nre->max_ += n; |
426 | 3.38k | if (n == r2->nrunes()) |
427 | 605 | goto LeaveEmpty; |
428 | 2.77k | *r1ptr = nre; |
429 | 2.77k | *r2ptr = Regexp::LiteralString( |
430 | 2.77k | &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); |
431 | 2.77k | break; |
432 | 3.38k | } |
433 | | |
434 | 0 | default: |
435 | 0 | nre->Decref(); |
436 | 0 | LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); |
437 | 0 | return; |
438 | 16.7k | } |
439 | | |
440 | 16.7k | r1->Decref(); |
441 | 16.7k | r2->Decref(); |
442 | 16.7k | } |
443 | | |
444 | 0 | Regexp* SimplifyWalker::Copy(Regexp* re) { |
445 | 0 | return re->Incref(); |
446 | 0 | } |
447 | | |
448 | 0 | Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { |
449 | | // Should never be called: we use Walk(), not WalkExponential(). |
450 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
451 | | LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; |
452 | | #endif |
453 | 0 | return re->Incref(); |
454 | 0 | } |
455 | | |
456 | 371k | Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { |
457 | 371k | if (re->simple()) { |
458 | 240k | *stop = true; |
459 | 240k | return re->Incref(); |
460 | 240k | } |
461 | 130k | return NULL; |
462 | 371k | } |
463 | | |
464 | | Regexp* SimplifyWalker::PostVisit(Regexp* re, |
465 | | Regexp* parent_arg, |
466 | | Regexp* pre_arg, |
467 | | Regexp** child_args, |
468 | 130k | int nchild_args) { |
469 | 130k | switch (re->op()) { |
470 | 0 | case kRegexpNoMatch: |
471 | 2.30k | case kRegexpEmptyMatch: |
472 | 7.68k | case kRegexpLiteral: |
473 | 12.3k | case kRegexpLiteralString: |
474 | 12.3k | case kRegexpBeginLine: |
475 | 12.3k | case kRegexpEndLine: |
476 | 12.3k | case kRegexpBeginText: |
477 | 12.3k | case kRegexpWordBoundary: |
478 | 12.3k | case kRegexpNoWordBoundary: |
479 | 12.3k | case kRegexpEndText: |
480 | 12.3k | case kRegexpAnyChar: |
481 | 12.3k | case kRegexpAnyByte: |
482 | 12.3k | case kRegexpHaveMatch: |
483 | | // All these are always simple. |
484 | 12.3k | re->simple_ = true; |
485 | 12.3k | return re->Incref(); |
486 | | |
487 | 45.7k | case kRegexpConcat: |
488 | 66.1k | case kRegexpAlternate: { |
489 | | // These are simple as long as the subpieces are simple. |
490 | 66.1k | if (!ChildArgsChanged(re, child_args)) { |
491 | 24.5k | re->simple_ = true; |
492 | 24.5k | return re->Incref(); |
493 | 24.5k | } |
494 | 41.6k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
495 | 41.6k | nre->AllocSub(re->nsub()); |
496 | 41.6k | Regexp** nre_subs = nre->sub(); |
497 | 247k | for (int i = 0; i < re->nsub(); i++) |
498 | 206k | nre_subs[i] = child_args[i]; |
499 | 41.6k | nre->simple_ = true; |
500 | 41.6k | return nre; |
501 | 66.1k | } |
502 | | |
503 | 5.11k | case kRegexpCapture: { |
504 | 5.11k | Regexp* newsub = child_args[0]; |
505 | 5.11k | if (newsub == re->sub()[0]) { |
506 | 951 | newsub->Decref(); |
507 | 951 | re->simple_ = true; |
508 | 951 | return re->Incref(); |
509 | 951 | } |
510 | 4.16k | Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); |
511 | 4.16k | nre->AllocSub(1); |
512 | 4.16k | nre->sub()[0] = newsub; |
513 | 4.16k | nre->cap_ = re->cap(); |
514 | 4.16k | nre->simple_ = true; |
515 | 4.16k | return nre; |
516 | 5.11k | } |
517 | | |
518 | 2.62k | case kRegexpStar: |
519 | 3.96k | case kRegexpPlus: |
520 | 6.34k | case kRegexpQuest: { |
521 | 6.34k | Regexp* newsub = child_args[0]; |
522 | | // Special case: repeat the empty string as much as |
523 | | // you want, but it's still the empty string. |
524 | 6.34k | if (newsub->op() == kRegexpEmptyMatch) |
525 | 401 | return newsub; |
526 | | |
527 | | // These are simple as long as the subpiece is simple. |
528 | 5.94k | if (newsub == re->sub()[0]) { |
529 | 1.88k | newsub->Decref(); |
530 | 1.88k | re->simple_ = true; |
531 | 1.88k | return re->Incref(); |
532 | 1.88k | } |
533 | | |
534 | | // These are also idempotent if flags are constant. |
535 | 4.06k | if (re->op() == newsub->op() && |
536 | 4.06k | re->parse_flags() == newsub->parse_flags()) |
537 | 484 | return newsub; |
538 | | |
539 | 3.58k | Regexp* nre = new Regexp(re->op(), re->parse_flags()); |
540 | 3.58k | nre->AllocSub(1); |
541 | 3.58k | nre->sub()[0] = newsub; |
542 | 3.58k | nre->simple_ = true; |
543 | 3.58k | return nre; |
544 | 4.06k | } |
545 | | |
546 | 36.8k | case kRegexpRepeat: { |
547 | 36.8k | Regexp* newsub = child_args[0]; |
548 | | // Special case: repeat the empty string as much as |
549 | | // you want, but it's still the empty string. |
550 | 36.8k | if (newsub->op() == kRegexpEmptyMatch) |
551 | 296 | return newsub; |
552 | | |
553 | 36.5k | Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, |
554 | 36.5k | re->parse_flags()); |
555 | 36.5k | newsub->Decref(); |
556 | 36.5k | nre->simple_ = true; |
557 | 36.5k | return nre; |
558 | 36.8k | } |
559 | | |
560 | 4.18k | case kRegexpCharClass: { |
561 | 4.18k | Regexp* nre = SimplifyCharClass(re); |
562 | 4.18k | nre->simple_ = true; |
563 | 4.18k | return nre; |
564 | 36.8k | } |
565 | 130k | } |
566 | | |
567 | 0 | LOG(ERROR) << "Simplify case not handled: " << re->op(); |
568 | 0 | return re->Incref(); |
569 | 130k | } |
570 | | |
571 | | // Creates a concatenation of two Regexp, consuming refs to re1 and re2. |
572 | | // Returns a new Regexp, handing the ref to the caller. |
573 | | Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, |
574 | 2.38M | Regexp::ParseFlags parse_flags) { |
575 | 2.38M | Regexp* re = new Regexp(kRegexpConcat, parse_flags); |
576 | 2.38M | re->AllocSub(2); |
577 | 2.38M | Regexp** subs = re->sub(); |
578 | 2.38M | subs[0] = re1; |
579 | 2.38M | subs[1] = re2; |
580 | 2.38M | return re; |
581 | 2.38M | } |
582 | | |
583 | | // Simplifies the expression re{min,max} in terms of *, +, and ?. |
584 | | // Returns a new regexp. Does not edit re. Does not consume reference to re. |
585 | | // Caller must Decref return value when done with it. |
586 | | // The result will *not* necessarily have the right capturing parens |
587 | | // if you call ToString() and re-parse it: (x){2} becomes (x)(x), |
588 | | // but in the Regexp* representation, both (x) are marked as $1. |
589 | | Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, |
590 | 36.5k | Regexp::ParseFlags f) { |
591 | | // x{n,} means at least n matches of x. |
592 | 36.5k | if (max == -1) { |
593 | | // Special case: x{0,} is x* |
594 | 8.14k | if (min == 0) |
595 | 880 | return Regexp::Star(re->Incref(), f); |
596 | | |
597 | | // Special case: x{1,} is x+ |
598 | 7.26k | if (min == 1) |
599 | 2.90k | return Regexp::Plus(re->Incref(), f); |
600 | | |
601 | | // General case: x{4,} is xxxx+ |
602 | 4.36k | PODArray<Regexp*> nre_subs(min); |
603 | 28.4k | for (int i = 0; i < min-1; i++) |
604 | 24.1k | nre_subs[i] = re->Incref(); |
605 | 4.36k | nre_subs[min-1] = Regexp::Plus(re->Incref(), f); |
606 | 4.36k | return Regexp::Concat(nre_subs.data(), min, f); |
607 | 7.26k | } |
608 | | |
609 | | // Special case: (x){0} matches only empty string. |
610 | 28.4k | if (min == 0 && max == 0) |
611 | 1.37k | return new Regexp(kRegexpEmptyMatch, f); |
612 | | |
613 | | // Special case: x{1} is just x. |
614 | 27.0k | if (min == 1 && max == 1) |
615 | 833 | return re->Incref(); |
616 | | |
617 | | // General case: x{n,m} means n copies of x and m copies of x?. |
618 | | // The machine will do less work if we nest the final m copies, |
619 | | // so that x{2,5} = xx(x(x(x)?)?)? |
620 | | |
621 | | // Build leading prefix: xx. Capturing only on the last one. |
622 | 26.2k | Regexp* nre = NULL; |
623 | 26.2k | if (min > 0) { |
624 | 19.7k | PODArray<Regexp*> nre_subs(min); |
625 | 3.89M | for (int i = 0; i < min; i++) |
626 | 3.87M | nre_subs[i] = re->Incref(); |
627 | 19.7k | nre = Regexp::Concat(nre_subs.data(), min, f); |
628 | 19.7k | } |
629 | | |
630 | | // Build and attach suffix: (x(x(x)?)?)? |
631 | 26.2k | if (max > min) { |
632 | 17.2k | Regexp* suf = Regexp::Quest(re->Incref(), f); |
633 | 2.39M | for (int i = min+1; i < max; i++) |
634 | 2.37M | suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); |
635 | 17.2k | if (nre == NULL) |
636 | 6.45k | nre = suf; |
637 | 10.8k | else |
638 | 10.8k | nre = Concat2(nre, suf, f); |
639 | 17.2k | } |
640 | | |
641 | 26.2k | if (nre == NULL) { |
642 | | // Some degenerate case, like min > max, or min < max < 0. |
643 | | // This shouldn't happen, because the parser rejects such regexps. |
644 | 0 | LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max; |
645 | 0 | return new Regexp(kRegexpNoMatch, f); |
646 | 0 | } |
647 | | |
648 | 26.2k | return nre; |
649 | 26.2k | } |
650 | | |
651 | | // Simplifies a character class. |
652 | | // Caller must Decref return value when done with it. |
653 | 4.18k | Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { |
654 | 4.18k | CharClass* cc = re->cc(); |
655 | | |
656 | | // Special cases |
657 | 4.18k | if (cc->empty()) |
658 | 1.27k | return new Regexp(kRegexpNoMatch, re->parse_flags()); |
659 | 2.90k | if (cc->full()) |
660 | 146 | return new Regexp(kRegexpAnyChar, re->parse_flags()); |
661 | | |
662 | 2.76k | return re->Incref(); |
663 | 2.90k | } |
664 | | |
665 | | } // namespace re2 |