Coverage Report

Created: 2024-01-20 12:39

/src/re2/re2/simplify.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <string>
10
11
#include "util/util.h"
12
#include "util/logging.h"
13
#include "util/utf.h"
14
#include "re2/pod_array.h"
15
#include "re2/regexp.h"
16
#include "re2/walker-inl.h"
17
18
namespace re2 {
19
20
// Parses the regexp src and then simplifies it and sets *dst to the
21
// string representation of the simplified form.  Returns true on success.
22
// Returns false and sets *error (if error != NULL) on error.
23
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24
0
                            std::string* dst, RegexpStatus* status) {
25
0
  Regexp* re = Parse(src, flags, status);
26
0
  if (re == NULL)
27
0
    return false;
28
0
  Regexp* sre = re->Simplify();
29
0
  re->Decref();
30
0
  if (sre == NULL) {
31
0
    if (status) {
32
0
      status->set_code(kRegexpInternalError);
33
0
      status->set_error_arg(src);
34
0
    }
35
0
    return false;
36
0
  }
37
0
  *dst = sre->ToString();
38
0
  sre->Decref();
39
0
  return true;
40
0
}
41
42
// Assuming the simple_ flags on the children are accurate,
43
// is this Regexp* simple?
44
1.10M
bool Regexp::ComputeSimple() {
45
1.10M
  Regexp** subs;
46
1.10M
  switch (op_) {
47
5.38k
    case kRegexpNoMatch:
48
37.8k
    case kRegexpEmptyMatch:
49
612k
    case kRegexpLiteral:
50
615k
    case kRegexpLiteralString:
51
621k
    case kRegexpBeginLine:
52
626k
    case kRegexpEndLine:
53
647k
    case kRegexpBeginText:
54
650k
    case kRegexpWordBoundary:
55
651k
    case kRegexpNoWordBoundary:
56
664k
    case kRegexpEndText:
57
688k
    case kRegexpAnyChar:
58
693k
    case kRegexpAnyByte:
59
693k
    case kRegexpHaveMatch:
60
693k
      return true;
61
88.8k
    case kRegexpConcat:
62
113k
    case kRegexpAlternate:
63
      // These are simple as long as the subpieces are simple.
64
113k
      subs = sub();
65
574k
      for (int i = 0; i < nsub_; i++)
66
503k
        if (!subs[i]->simple())
67
42.2k
          return false;
68
71.1k
      return true;
69
142k
    case kRegexpCharClass:
70
      // Simple as long as the char class is not empty, not full.
71
142k
      if (ccb_ != NULL)
72
140k
        return !ccb_->empty() && !ccb_->full();
73
2.22k
      return !cc_->empty() && !cc_->full();
74
47.8k
    case kRegexpCapture:
75
47.8k
      subs = sub();
76
47.8k
      return subs[0]->simple();
77
20.1k
    case kRegexpStar:
78
43.9k
    case kRegexpPlus:
79
72.5k
    case kRegexpQuest:
80
72.5k
      subs = sub();
81
72.5k
      if (!subs[0]->simple())
82
4.29k
        return false;
83
68.2k
      switch (subs[0]->op_) {
84
72
        case kRegexpStar:
85
278
        case kRegexpPlus:
86
576
        case kRegexpQuest:
87
857
        case kRegexpEmptyMatch:
88
2.06k
        case kRegexpNoMatch:
89
2.06k
          return false;
90
66.2k
        default:
91
66.2k
          break;
92
68.2k
      }
93
66.2k
      return true;
94
31.4k
    case kRegexpRepeat:
95
31.4k
      return false;
96
1.10M
  }
97
0
  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
98
0
  return false;
99
1.10M
}
100
101
// Walker subclass used by Simplify.
102
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
103
// occurrences of that literal into repeats of that literal. It also works for
104
// char classes, any char and any byte.
105
// PostVisit creates the coalesced result, which should then be simplified.
106
class CoalesceWalker : public Regexp::Walker<Regexp*> {
107
 public:
108
38.9k
  CoalesceWalker() {}
109
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
110
                            Regexp** child_args, int nchild_args);
111
  virtual Regexp* Copy(Regexp* re);
112
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114
 private:
115
  // These functions are declared inside CoalesceWalker so that
116
  // they can edit the private fields of the Regexps they construct.
117
118
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
119
  // the parse flags are consistent. (They will not be checked again later.)
120
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
121
122
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
123
  // will be empty match and the coalesced op. In other cases, where part of a
124
  // literal string was removed to be coalesced, the array elements afterwards
125
  // will be the coalesced op and the remainder of the literal string.
126
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
127
128
  CoalesceWalker(const CoalesceWalker&) = delete;
129
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
130
};
131
132
// Walker subclass used by Simplify.
133
// The simplify walk is purely post-recursive: given the simplified children,
134
// PostVisit creates the simplified result.
135
// The child_args are simplified Regexp*s.
136
class SimplifyWalker : public Regexp::Walker<Regexp*> {
137
 public:
138
38.9k
  SimplifyWalker() {}
139
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
140
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
141
                            Regexp** child_args, int nchild_args);
142
  virtual Regexp* Copy(Regexp* re);
143
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
144
145
 private:
146
  // These functions are declared inside SimplifyWalker so that
147
  // they can edit the private fields of the Regexps they construct.
148
149
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
150
  // Caller must Decref return value when done with it.
151
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
152
153
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
154
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
155
  // Caller must Decref return value when done with it.
156
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
157
                                Regexp::ParseFlags parse_flags);
158
159
  // Simplifies a character class by expanding any named classes
160
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
161
  // Caller must Decref return value when done with it.
162
  static Regexp* SimplifyCharClass(Regexp* re);
163
164
  SimplifyWalker(const SimplifyWalker&) = delete;
165
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
166
};
167
168
// Simplifies a regular expression, returning a new regexp.
169
// The new regexp uses traditional Unix egrep features only,
170
// plus the Perl (?:) non-capturing parentheses.
171
// Otherwise, no POSIX or Perl additions.  The new regexp
172
// captures exactly the same subexpressions (with the same indices)
173
// as the original.
174
// Does not edit current object.
175
// Caller must Decref() return value when done with it.
176
177
38.9k
Regexp* Regexp::Simplify() {
178
38.9k
  CoalesceWalker cw;
179
38.9k
  Regexp* cre = cw.Walk(this, NULL);
180
38.9k
  if (cre == NULL)
181
0
    return NULL;
182
38.9k
  if (cw.stopped_early()) {
183
0
    cre->Decref();
184
0
    return NULL;
185
0
  }
186
38.9k
  SimplifyWalker sw;
187
38.9k
  Regexp* sre = sw.Walk(cre, NULL);
188
38.9k
  cre->Decref();
189
38.9k
  if (sre == NULL)
190
0
    return NULL;
191
38.9k
  if (sw.stopped_early()) {
192
0
    sre->Decref();
193
0
    return NULL;
194
0
  }
195
38.9k
  return sre;
196
38.9k
}
197
198
#define Simplify DontCallSimplify  // Avoid accidental recursion
199
200
// Utility function for PostVisit implementations that compares re->sub() with
201
// child_args to determine whether any child_args changed. In the common case,
202
// where nothing changed, calls Decref() for all child_args and returns false,
203
// so PostVisit must return re->Incref(). Otherwise, returns true.
204
279k
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
205
967k
  for (int i = 0; i < re->nsub(); i++) {
206
739k
    Regexp* sub = re->sub()[i];
207
739k
    Regexp* newsub = child_args[i];
208
739k
    if (newsub != sub)
209
51.6k
      return true;
210
739k
  }
211
831k
  for (int i = 0; i < re->nsub(); i++) {
212
603k
    Regexp* newsub = child_args[i];
213
603k
    newsub->Decref();
214
603k
  }
215
227k
  return false;
216
279k
}
217
218
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
219
0
  return re->Incref();
220
0
}
221
222
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
223
  // Should never be called: we use Walk(), not WalkExponential().
224
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
225
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
226
#endif
227
0
  return re->Incref();
228
0
}
229
230
Regexp* CoalesceWalker::PostVisit(Regexp* re,
231
                                  Regexp* parent_arg,
232
                                  Regexp* pre_arg,
233
                                  Regexp** child_args,
234
666k
                                  int nchild_args) {
235
666k
  if (re->nsub() == 0)
236
445k
    return re->Incref();
237
238
221k
  if (re->op() != kRegexpConcat) {
239
139k
    if (!ChildArgsChanged(re, child_args))
240
133k
      return re->Incref();
241
242
    // Something changed. Build a new op.
243
6.72k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
244
6.72k
    nre->AllocSub(re->nsub());
245
6.72k
    Regexp** nre_subs = nre->sub();
246
26.8k
    for (int i = 0; i < re->nsub(); i++)
247
20.0k
      nre_subs[i] = child_args[i];
248
    // Repeats and Captures have additional data that must be copied.
249
6.72k
    if (re->op() == kRegexpRepeat) {
250
927
      nre->min_ = re->min();
251
927
      nre->max_ = re->max();
252
5.79k
    } else if (re->op() == kRegexpCapture) {
253
1.73k
      nre->cap_ = re->cap();
254
1.73k
    }
255
6.72k
    return nre;
256
139k
  }
257
258
81.2k
  bool can_coalesce = false;
259
451k
  for (int i = 0; i < re->nsub(); i++) {
260
378k
    if (i+1 < re->nsub() &&
261
378k
        CanCoalesce(child_args[i], child_args[i+1])) {
262
7.98k
      can_coalesce = true;
263
7.98k
      break;
264
7.98k
    }
265
378k
  }
266
81.2k
  if (!can_coalesce) {
267
73.3k
    if (!ChildArgsChanged(re, child_args))
268
70.0k
      return re->Incref();
269
270
    // Something changed. Build a new op.
271
3.30k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
272
3.30k
    nre->AllocSub(re->nsub());
273
3.30k
    Regexp** nre_subs = nre->sub();
274
17.6k
    for (int i = 0; i < re->nsub(); i++)
275
14.3k
      nre_subs[i] = child_args[i];
276
3.30k
    return nre;
277
73.3k
  }
278
279
76.6k
  for (int i = 0; i < re->nsub(); i++) {
280
68.6k
    if (i+1 < re->nsub() &&
281
68.6k
        CanCoalesce(child_args[i], child_args[i+1]))
282
16.7k
      DoCoalesce(&child_args[i], &child_args[i+1]);
283
68.6k
  }
284
  // Determine how many empty matches were left by DoCoalesce.
285
7.98k
  int n = 0;
286
76.6k
  for (int i = n; i < re->nsub(); i++) {
287
68.6k
    if (child_args[i]->op() == kRegexpEmptyMatch)
288
14.0k
      n++;
289
68.6k
  }
290
  // Build a new op.
291
7.98k
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
292
7.98k
  nre->AllocSub(re->nsub() - n);
293
7.98k
  Regexp** nre_subs = nre->sub();
294
76.6k
  for (int i = 0, j = 0; i < re->nsub(); i++) {
295
68.6k
    if (child_args[i]->op() == kRegexpEmptyMatch) {
296
14.0k
      child_args[i]->Decref();
297
14.0k
      continue;
298
14.0k
    }
299
54.6k
    nre_subs[j] = child_args[i];
300
54.6k
    j++;
301
54.6k
  }
302
7.98k
  return nre;
303
81.2k
}
304
305
365k
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
306
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
307
  // any byte.
308
365k
  if ((r1->op() == kRegexpStar ||
309
365k
       r1->op() == kRegexpPlus ||
310
365k
       r1->op() == kRegexpQuest ||
311
365k
       r1->op() == kRegexpRepeat) &&
312
365k
      (r1->sub()[0]->op() == kRegexpLiteral ||
313
89.3k
       r1->sub()[0]->op() == kRegexpCharClass ||
314
89.3k
       r1->sub()[0]->op() == kRegexpAnyChar ||
315
89.3k
       r1->sub()[0]->op() == kRegexpAnyByte)) {
316
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
317
    // any char or any byte.
318
72.4k
    if ((r2->op() == kRegexpStar ||
319
72.4k
         r2->op() == kRegexpPlus ||
320
72.4k
         r2->op() == kRegexpQuest ||
321
72.4k
         r2->op() == kRegexpRepeat) &&
322
72.4k
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
323
        // The parse flags must be consistent.
324
72.4k
        ((r1->parse_flags() & Regexp::NonGreedy) ==
325
8.38k
         (r2->parse_flags() & Regexp::NonGreedy))) {
326
7.33k
      return true;
327
7.33k
    }
328
    // ... OR an occurrence of that literal, char class, any char or any byte
329
65.1k
    if (Regexp::Equal(r1->sub()[0], r2)) {
330
11.7k
      return true;
331
11.7k
    }
332
    // ... OR a literal string that begins with that literal.
333
53.3k
    if (r1->sub()[0]->op() == kRegexpLiteral &&
334
53.3k
        r2->op() == kRegexpLiteralString &&
335
53.3k
        r2->runes()[0] == r1->sub()[0]->rune() &&
336
        // The parse flags must be consistent.
337
53.3k
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
338
5.76k
         (r2->parse_flags() & Regexp::FoldCase))) {
339
5.64k
      return true;
340
5.64k
    }
341
53.3k
  }
342
340k
  return false;
343
365k
}
344
345
16.7k
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
346
16.7k
  Regexp* r1 = *r1ptr;
347
16.7k
  Regexp* r2 = *r2ptr;
348
349
16.7k
  Regexp* nre = Regexp::Repeat(
350
16.7k
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
351
352
16.7k
  switch (r1->op()) {
353
2.85k
    case kRegexpStar:
354
2.85k
      nre->min_ = 0;
355
2.85k
      nre->max_ = -1;
356
2.85k
      break;
357
358
3.09k
    case kRegexpPlus:
359
3.09k
      nre->min_ = 1;
360
3.09k
      nre->max_ = -1;
361
3.09k
      break;
362
363
3.23k
    case kRegexpQuest:
364
3.23k
      nre->min_ = 0;
365
3.23k
      nre->max_ = 1;
366
3.23k
      break;
367
368
7.56k
    case kRegexpRepeat:
369
7.56k
      nre->min_ = r1->min();
370
7.56k
      nre->max_ = r1->max();
371
7.56k
      break;
372
373
0
    default:
374
0
      nre->Decref();
375
0
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
376
0
      return;
377
16.7k
  }
378
379
16.7k
  switch (r2->op()) {
380
1.02k
    case kRegexpStar:
381
1.02k
      nre->max_ = -1;
382
1.02k
      goto LeaveEmpty;
383
384
1.07k
    case kRegexpPlus:
385
1.07k
      nre->min_++;
386
1.07k
      nre->max_ = -1;
387
1.07k
      goto LeaveEmpty;
388
389
1.84k
    case kRegexpQuest:
390
1.84k
      if (nre->max() != -1)
391
1.36k
        nre->max_++;
392
1.84k
      goto LeaveEmpty;
393
394
1.04k
    case kRegexpRepeat:
395
1.04k
      nre->min_ += r2->min();
396
1.04k
      if (r2->max() == -1)
397
268
        nre->max_ = -1;
398
780
      else if (nre->max() != -1)
399
557
        nre->max_ += r2->max();
400
1.04k
      goto LeaveEmpty;
401
402
1.92k
    case kRegexpLiteral:
403
4.83k
    case kRegexpCharClass:
404
7.90k
    case kRegexpAnyChar:
405
8.37k
    case kRegexpAnyByte:
406
8.37k
      nre->min_++;
407
8.37k
      if (nre->max() != -1)
408
2.30k
        nre->max_++;
409
8.37k
      goto LeaveEmpty;
410
411
13.9k
    LeaveEmpty:
412
13.9k
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
413
13.9k
      *r2ptr = nre;
414
13.9k
      break;
415
416
3.38k
    case kRegexpLiteralString: {
417
3.38k
      Rune r = r1->sub()[0]->rune();
418
      // Determine how much of the literal string is removed.
419
      // We know that we have at least one rune. :)
420
3.38k
      int n = 1;
421
7.73k
      while (n < r2->nrunes() && r2->runes()[n] == r)
422
4.35k
        n++;
423
3.38k
      nre->min_ += n;
424
3.38k
      if (nre->max() != -1)
425
1.40k
        nre->max_ += n;
426
3.38k
      if (n == r2->nrunes())
427
605
        goto LeaveEmpty;
428
2.77k
      *r1ptr = nre;
429
2.77k
      *r2ptr = Regexp::LiteralString(
430
2.77k
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
431
2.77k
      break;
432
3.38k
    }
433
434
0
    default:
435
0
      nre->Decref();
436
0
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
437
0
      return;
438
16.7k
  }
439
440
16.7k
  r1->Decref();
441
16.7k
  r2->Decref();
442
16.7k
}
443
444
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
445
0
  return re->Incref();
446
0
}
447
448
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
449
  // Should never be called: we use Walk(), not WalkExponential().
450
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
451
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
452
#endif
453
0
  return re->Incref();
454
0
}
455
456
371k
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
457
371k
  if (re->simple()) {
458
240k
    *stop = true;
459
240k
    return re->Incref();
460
240k
  }
461
130k
  return NULL;
462
371k
}
463
464
Regexp* SimplifyWalker::PostVisit(Regexp* re,
465
                                  Regexp* parent_arg,
466
                                  Regexp* pre_arg,
467
                                  Regexp** child_args,
468
130k
                                  int nchild_args) {
469
130k
  switch (re->op()) {
470
0
    case kRegexpNoMatch:
471
2.30k
    case kRegexpEmptyMatch:
472
7.68k
    case kRegexpLiteral:
473
12.3k
    case kRegexpLiteralString:
474
12.3k
    case kRegexpBeginLine:
475
12.3k
    case kRegexpEndLine:
476
12.3k
    case kRegexpBeginText:
477
12.3k
    case kRegexpWordBoundary:
478
12.3k
    case kRegexpNoWordBoundary:
479
12.3k
    case kRegexpEndText:
480
12.3k
    case kRegexpAnyChar:
481
12.3k
    case kRegexpAnyByte:
482
12.3k
    case kRegexpHaveMatch:
483
      // All these are always simple.
484
12.3k
      re->simple_ = true;
485
12.3k
      return re->Incref();
486
487
45.7k
    case kRegexpConcat:
488
66.1k
    case kRegexpAlternate: {
489
      // These are simple as long as the subpieces are simple.
490
66.1k
      if (!ChildArgsChanged(re, child_args)) {
491
24.5k
        re->simple_ = true;
492
24.5k
        return re->Incref();
493
24.5k
      }
494
41.6k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
495
41.6k
      nre->AllocSub(re->nsub());
496
41.6k
      Regexp** nre_subs = nre->sub();
497
247k
      for (int i = 0; i < re->nsub(); i++)
498
206k
        nre_subs[i] = child_args[i];
499
41.6k
      nre->simple_ = true;
500
41.6k
      return nre;
501
66.1k
    }
502
503
5.11k
    case kRegexpCapture: {
504
5.11k
      Regexp* newsub = child_args[0];
505
5.11k
      if (newsub == re->sub()[0]) {
506
951
        newsub->Decref();
507
951
        re->simple_ = true;
508
951
        return re->Incref();
509
951
      }
510
4.16k
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
511
4.16k
      nre->AllocSub(1);
512
4.16k
      nre->sub()[0] = newsub;
513
4.16k
      nre->cap_ = re->cap();
514
4.16k
      nre->simple_ = true;
515
4.16k
      return nre;
516
5.11k
    }
517
518
2.62k
    case kRegexpStar:
519
3.96k
    case kRegexpPlus:
520
6.34k
    case kRegexpQuest: {
521
6.34k
      Regexp* newsub = child_args[0];
522
      // Special case: repeat the empty string as much as
523
      // you want, but it's still the empty string.
524
6.34k
      if (newsub->op() == kRegexpEmptyMatch)
525
401
        return newsub;
526
527
      // These are simple as long as the subpiece is simple.
528
5.94k
      if (newsub == re->sub()[0]) {
529
1.88k
        newsub->Decref();
530
1.88k
        re->simple_ = true;
531
1.88k
        return re->Incref();
532
1.88k
      }
533
534
      // These are also idempotent if flags are constant.
535
4.06k
      if (re->op() == newsub->op() &&
536
4.06k
          re->parse_flags() == newsub->parse_flags())
537
484
        return newsub;
538
539
3.58k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
540
3.58k
      nre->AllocSub(1);
541
3.58k
      nre->sub()[0] = newsub;
542
3.58k
      nre->simple_ = true;
543
3.58k
      return nre;
544
4.06k
    }
545
546
36.8k
    case kRegexpRepeat: {
547
36.8k
      Regexp* newsub = child_args[0];
548
      // Special case: repeat the empty string as much as
549
      // you want, but it's still the empty string.
550
36.8k
      if (newsub->op() == kRegexpEmptyMatch)
551
296
        return newsub;
552
553
36.5k
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
554
36.5k
                                   re->parse_flags());
555
36.5k
      newsub->Decref();
556
36.5k
      nre->simple_ = true;
557
36.5k
      return nre;
558
36.8k
    }
559
560
4.18k
    case kRegexpCharClass: {
561
4.18k
      Regexp* nre = SimplifyCharClass(re);
562
4.18k
      nre->simple_ = true;
563
4.18k
      return nre;
564
36.8k
    }
565
130k
  }
566
567
0
  LOG(ERROR) << "Simplify case not handled: " << re->op();
568
0
  return re->Incref();
569
130k
}
570
571
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
572
// Returns a new Regexp, handing the ref to the caller.
573
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
574
2.38M
                                Regexp::ParseFlags parse_flags) {
575
2.38M
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
576
2.38M
  re->AllocSub(2);
577
2.38M
  Regexp** subs = re->sub();
578
2.38M
  subs[0] = re1;
579
2.38M
  subs[1] = re2;
580
2.38M
  return re;
581
2.38M
}
582
583
// Simplifies the expression re{min,max} in terms of *, +, and ?.
584
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
585
// Caller must Decref return value when done with it.
586
// The result will *not* necessarily have the right capturing parens
587
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
588
// but in the Regexp* representation, both (x) are marked as $1.
589
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
590
36.5k
                                       Regexp::ParseFlags f) {
591
  // x{n,} means at least n matches of x.
592
36.5k
  if (max == -1) {
593
    // Special case: x{0,} is x*
594
8.14k
    if (min == 0)
595
880
      return Regexp::Star(re->Incref(), f);
596
597
    // Special case: x{1,} is x+
598
7.26k
    if (min == 1)
599
2.90k
      return Regexp::Plus(re->Incref(), f);
600
601
    // General case: x{4,} is xxxx+
602
4.36k
    PODArray<Regexp*> nre_subs(min);
603
28.4k
    for (int i = 0; i < min-1; i++)
604
24.1k
      nre_subs[i] = re->Incref();
605
4.36k
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
606
4.36k
    return Regexp::Concat(nre_subs.data(), min, f);
607
7.26k
  }
608
609
  // Special case: (x){0} matches only empty string.
610
28.4k
  if (min == 0 && max == 0)
611
1.37k
    return new Regexp(kRegexpEmptyMatch, f);
612
613
  // Special case: x{1} is just x.
614
27.0k
  if (min == 1 && max == 1)
615
833
    return re->Incref();
616
617
  // General case: x{n,m} means n copies of x and m copies of x?.
618
  // The machine will do less work if we nest the final m copies,
619
  // so that x{2,5} = xx(x(x(x)?)?)?
620
621
  // Build leading prefix: xx.  Capturing only on the last one.
622
26.2k
  Regexp* nre = NULL;
623
26.2k
  if (min > 0) {
624
19.7k
    PODArray<Regexp*> nre_subs(min);
625
3.89M
    for (int i = 0; i < min; i++)
626
3.87M
      nre_subs[i] = re->Incref();
627
19.7k
    nre = Regexp::Concat(nre_subs.data(), min, f);
628
19.7k
  }
629
630
  // Build and attach suffix: (x(x(x)?)?)?
631
26.2k
  if (max > min) {
632
17.2k
    Regexp* suf = Regexp::Quest(re->Incref(), f);
633
2.39M
    for (int i = min+1; i < max; i++)
634
2.37M
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
635
17.2k
    if (nre == NULL)
636
6.45k
      nre = suf;
637
10.8k
    else
638
10.8k
      nre = Concat2(nre, suf, f);
639
17.2k
  }
640
641
26.2k
  if (nre == NULL) {
642
    // Some degenerate case, like min > max, or min < max < 0.
643
    // This shouldn't happen, because the parser rejects such regexps.
644
0
    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
645
0
    return new Regexp(kRegexpNoMatch, f);
646
0
  }
647
648
26.2k
  return nre;
649
26.2k
}
650
651
// Simplifies a character class.
652
// Caller must Decref return value when done with it.
653
4.18k
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
654
4.18k
  CharClass* cc = re->cc();
655
656
  // Special cases
657
4.18k
  if (cc->empty())
658
1.27k
    return new Regexp(kRegexpNoMatch, re->parse_flags());
659
2.90k
  if (cc->full())
660
146
    return new Regexp(kRegexpAnyChar, re->parse_flags());
661
662
2.76k
  return re->Incref();
663
2.90k
}
664
665
}  // namespace re2