Coverage Report

Created: 2024-05-28 04:30

/src/re2/re2/simplify.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <string>
10
11
#include "util/util.h"
12
#include "util/logging.h"
13
#include "util/utf.h"
14
#include "re2/pod_array.h"
15
#include "re2/regexp.h"
16
#include "re2/walker-inl.h"
17
18
namespace re2 {
19
20
// Parses the regexp src and then simplifies it and sets *dst to the
21
// string representation of the simplified form.  Returns true on success.
22
// Returns false and sets *error (if error != NULL) on error.
23
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24
0
                            std::string* dst, RegexpStatus* status) {
25
0
  Regexp* re = Parse(src, flags, status);
26
0
  if (re == NULL)
27
0
    return false;
28
0
  Regexp* sre = re->Simplify();
29
0
  re->Decref();
30
0
  if (sre == NULL) {
31
0
    if (status) {
32
0
      status->set_code(kRegexpInternalError);
33
0
      status->set_error_arg(src);
34
0
    }
35
0
    return false;
36
0
  }
37
0
  *dst = sre->ToString();
38
0
  sre->Decref();
39
0
  return true;
40
0
}
41
42
// Assuming the simple_ flags on the children are accurate,
43
// is this Regexp* simple?
44
21.2M
bool Regexp::ComputeSimple() {
45
21.2M
  Regexp** subs;
46
21.2M
  switch (op_) {
47
29.1k
    case kRegexpNoMatch:
48
112k
    case kRegexpEmptyMatch:
49
17.6M
    case kRegexpLiteral:
50
17.6M
    case kRegexpLiteralString:
51
17.7M
    case kRegexpBeginLine:
52
17.7M
    case kRegexpEndLine:
53
17.8M
    case kRegexpBeginText:
54
17.8M
    case kRegexpWordBoundary:
55
17.8M
    case kRegexpNoWordBoundary:
56
17.8M
    case kRegexpEndText:
57
17.9M
    case kRegexpAnyChar:
58
17.9M
    case kRegexpAnyByte:
59
17.9M
    case kRegexpHaveMatch:
60
17.9M
      return true;
61
882k
    case kRegexpConcat:
62
936k
    case kRegexpAlternate:
63
      // These are simple as long as the subpieces are simple.
64
936k
      subs = sub();
65
5.06M
      for (int i = 0; i < nsub_; i++)
66
4.19M
        if (!subs[i]->simple())
67
72.7k
          return false;
68
863k
      return true;
69
2.07M
    case kRegexpCharClass:
70
      // Simple as long as the char class is not empty, not full.
71
2.07M
      if (ccb_ != NULL)
72
2.07M
        return !ccb_->empty() && !ccb_->full();
73
2.90k
      return !cc_->empty() && !cc_->full();
74
55.6k
    case kRegexpCapture:
75
55.6k
      subs = sub();
76
55.6k
      return subs[0]->simple();
77
75.1k
    case kRegexpStar:
78
117k
    case kRegexpPlus:
79
171k
    case kRegexpQuest:
80
171k
      subs = sub();
81
171k
      if (!subs[0]->simple())
82
6.43k
        return false;
83
164k
      switch (subs[0]->op_) {
84
208
        case kRegexpStar:
85
350
        case kRegexpPlus:
86
527
        case kRegexpQuest:
87
866
        case kRegexpEmptyMatch:
88
3.19k
        case kRegexpNoMatch:
89
3.19k
          return false;
90
161k
        default:
91
161k
          break;
92
164k
      }
93
161k
      return true;
94
50.1k
    case kRegexpRepeat:
95
50.1k
      return false;
96
21.2M
  }
97
0
  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
98
0
  return false;
99
21.2M
}
100
101
// Walker subclass used by Simplify.
102
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
103
// occurrences of that literal into repeats of that literal. It also works for
104
// char classes, any char and any byte.
105
// PostVisit creates the coalesced result, which should then be simplified.
106
class CoalesceWalker : public Regexp::Walker<Regexp*> {
107
 public:
108
2.36M
  CoalesceWalker() {}
109
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
110
                            Regexp** child_args, int nchild_args);
111
  virtual Regexp* Copy(Regexp* re);
112
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114
 private:
115
  // These functions are declared inside CoalesceWalker so that
116
  // they can edit the private fields of the Regexps they construct.
117
118
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
119
  // the parse flags are consistent. (They will not be checked again later.)
120
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
121
122
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
123
  // will be empty match and the coalesced op. In other cases, where part of a
124
  // literal string was removed to be coalesced, the array elements afterwards
125
  // will be the coalesced op and the remainder of the literal string.
126
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
127
128
  CoalesceWalker(const CoalesceWalker&) = delete;
129
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
130
};
131
132
// Walker subclass used by Simplify.
133
// The simplify walk is purely post-recursive: given the simplified children,
134
// PostVisit creates the simplified result.
135
// The child_args are simplified Regexp*s.
136
class SimplifyWalker : public Regexp::Walker<Regexp*> {
137
 public:
138
2.36M
  SimplifyWalker() {}
139
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
140
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
141
                            Regexp** child_args, int nchild_args);
142
  virtual Regexp* Copy(Regexp* re);
143
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
144
145
 private:
146
  // These functions are declared inside SimplifyWalker so that
147
  // they can edit the private fields of the Regexps they construct.
148
149
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
150
  // Caller must Decref return value when done with it.
151
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
152
153
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
154
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
155
  // Caller must Decref return value when done with it.
156
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
157
                                Regexp::ParseFlags parse_flags);
158
159
  // Simplifies a character class by expanding any named classes
160
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
161
  // Caller must Decref return value when done with it.
162
  static Regexp* SimplifyCharClass(Regexp* re);
163
164
  SimplifyWalker(const SimplifyWalker&) = delete;
165
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
166
};
167
168
// Simplifies a regular expression, returning a new regexp.
169
// The new regexp uses traditional Unix egrep features only,
170
// plus the Perl (?:) non-capturing parentheses.
171
// Otherwise, no POSIX or Perl additions.  The new regexp
172
// captures exactly the same subexpressions (with the same indices)
173
// as the original.
174
// Does not edit current object.
175
// Caller must Decref() return value when done with it.
176
177
2.36M
Regexp* Regexp::Simplify() {
178
2.36M
  CoalesceWalker cw;
179
2.36M
  Regexp* cre = cw.Walk(this, NULL);
180
2.36M
  if (cre == NULL)
181
0
    return NULL;
182
2.36M
  if (cw.stopped_early()) {
183
0
    cre->Decref();
184
0
    return NULL;
185
0
  }
186
2.36M
  SimplifyWalker sw;
187
2.36M
  Regexp* sre = sw.Walk(cre, NULL);
188
2.36M
  cre->Decref();
189
2.36M
  if (sre == NULL)
190
0
    return NULL;
191
2.36M
  if (sw.stopped_early()) {
192
0
    sre->Decref();
193
0
    return NULL;
194
0
  }
195
2.36M
  return sre;
196
2.36M
}
197
198
#define Simplify DontCallSimplify  // Avoid accidental recursion
199
200
// Utility function for PostVisit implementations that compares re->sub() with
201
// child_args to determine whether any child_args changed. In the common case,
202
// where nothing changed, calls Decref() for all child_args and returns false,
203
// so PostVisit must return re->Incref(). Otherwise, returns true.
204
1.21M
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
205
5.69M
  for (int i = 0; i < re->nsub(); i++) {
206
4.56M
    Regexp* sub = re->sub()[i];
207
4.56M
    Regexp* newsub = child_args[i];
208
4.56M
    if (newsub != sub)
209
86.9k
      return true;
210
4.56M
  }
211
5.45M
  for (int i = 0; i < re->nsub(); i++) {
212
4.32M
    Regexp* newsub = child_args[i];
213
4.32M
    newsub->Decref();
214
4.32M
  }
215
1.12M
  return false;
216
1.21M
}
217
218
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
219
0
  return re->Incref();
220
0
}
221
222
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
223
  // Should never be called: we use Walk(), not WalkExponential().
224
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
225
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
226
#endif
227
0
  return re->Incref();
228
0
}
229
230
Regexp* CoalesceWalker::PostVisit(Regexp* re,
231
                                  Regexp* parent_arg,
232
                                  Regexp* pre_arg,
233
                                  Regexp** child_args,
234
6.71M
                                  int nchild_args) {
235
6.71M
  if (re->nsub() == 0)
236
5.60M
    return re->Incref();
237
238
1.11M
  if (re->op() != kRegexpConcat) {
239
251k
    if (!ChildArgsChanged(re, child_args))
240
239k
      return re->Incref();
241
242
    // Something changed. Build a new op.
243
12.7k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
244
12.7k
    nre->AllocSub(re->nsub());
245
12.7k
    Regexp** nre_subs = nre->sub();
246
47.4k
    for (int i = 0; i < re->nsub(); i++)
247
34.6k
      nre_subs[i] = child_args[i];
248
    // Repeats and Captures have additional data that must be copied.
249
12.7k
    if (re->op() == kRegexpRepeat) {
250
1.65k
      nre->min_ = re->min();
251
1.65k
      nre->max_ = re->max();
252
11.0k
    } else if (re->op() == kRegexpCapture) {
253
3.04k
      nre->cap_ = re->cap();
254
3.04k
    }
255
12.7k
    return nre;
256
251k
  }
257
258
858k
  bool can_coalesce = false;
259
4.72M
  for (int i = 0; i < re->nsub(); i++) {
260
3.88M
    if (i+1 < re->nsub() &&
261
3.88M
        CanCoalesce(child_args[i], child_args[i+1])) {
262
13.6k
      can_coalesce = true;
263
13.6k
      break;
264
13.6k
    }
265
3.88M
  }
266
858k
  if (!can_coalesce) {
267
844k
    if (!ChildArgsChanged(re, child_args))
268
838k
      return re->Incref();
269
270
    // Something changed. Build a new op.
271
6.43k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
272
6.43k
    nre->AllocSub(re->nsub());
273
6.43k
    Regexp** nre_subs = nre->sub();
274
30.3k
    for (int i = 0; i < re->nsub(); i++)
275
23.9k
      nre_subs[i] = child_args[i];
276
6.43k
    return nre;
277
844k
  }
278
279
127k
  for (int i = 0; i < re->nsub(); i++) {
280
113k
    if (i+1 < re->nsub() &&
281
113k
        CanCoalesce(child_args[i], child_args[i+1]))
282
31.5k
      DoCoalesce(&child_args[i], &child_args[i+1]);
283
113k
  }
284
  // Determine how many empty matches were left by DoCoalesce.
285
13.6k
  int n = 0;
286
127k
  for (int i = n; i < re->nsub(); i++) {
287
113k
    if (child_args[i]->op() == kRegexpEmptyMatch)
288
26.3k
      n++;
289
113k
  }
290
  // Build a new op.
291
13.6k
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
292
13.6k
  nre->AllocSub(re->nsub() - n);
293
13.6k
  Regexp** nre_subs = nre->sub();
294
127k
  for (int i = 0, j = 0; i < re->nsub(); i++) {
295
113k
    if (child_args[i]->op() == kRegexpEmptyMatch) {
296
26.3k
      child_args[i]->Decref();
297
26.3k
      continue;
298
26.3k
    }
299
87.1k
    nre_subs[j] = child_args[i];
300
87.1k
    j++;
301
87.1k
  }
302
13.6k
  return nre;
303
858k
}
304
305
3.13M
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
306
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
307
  // any byte.
308
3.13M
  if ((r1->op() == kRegexpStar ||
309
3.13M
       r1->op() == kRegexpPlus ||
310
3.13M
       r1->op() == kRegexpQuest ||
311
3.13M
       r1->op() == kRegexpRepeat) &&
312
3.13M
      (r1->sub()[0]->op() == kRegexpLiteral ||
313
172k
       r1->sub()[0]->op() == kRegexpCharClass ||
314
172k
       r1->sub()[0]->op() == kRegexpAnyChar ||
315
172k
       r1->sub()[0]->op() == kRegexpAnyByte)) {
316
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
317
    // any char or any byte.
318
148k
    if ((r2->op() == kRegexpStar ||
319
148k
         r2->op() == kRegexpPlus ||
320
148k
         r2->op() == kRegexpQuest ||
321
148k
         r2->op() == kRegexpRepeat) &&
322
148k
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
323
        // The parse flags must be consistent.
324
148k
        ((r1->parse_flags() & Regexp::NonGreedy) ==
325
13.6k
         (r2->parse_flags() & Regexp::NonGreedy))) {
326
11.9k
      return true;
327
11.9k
    }
328
    // ... OR an occurrence of that literal, char class, any char or any byte
329
136k
    if (Regexp::Equal(r1->sub()[0], r2)) {
330
20.6k
      return true;
331
20.6k
    }
332
    // ... OR a literal string that begins with that literal.
333
115k
    if (r1->sub()[0]->op() == kRegexpLiteral &&
334
115k
        r2->op() == kRegexpLiteralString &&
335
115k
        r2->runes()[0] == r1->sub()[0]->rune() &&
336
        // The parse flags must be consistent.
337
115k
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
338
12.6k
         (r2->parse_flags() & Regexp::FoldCase))) {
339
12.5k
      return true;
340
12.5k
    }
341
115k
  }
342
3.09M
  return false;
343
3.13M
}
344
345
31.5k
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
346
31.5k
  Regexp* r1 = *r1ptr;
347
31.5k
  Regexp* r2 = *r2ptr;
348
349
31.5k
  Regexp* nre = Regexp::Repeat(
350
31.5k
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
351
352
31.5k
  switch (r1->op()) {
353
5.49k
    case kRegexpStar:
354
5.49k
      nre->min_ = 0;
355
5.49k
      nre->max_ = -1;
356
5.49k
      break;
357
358
3.94k
    case kRegexpPlus:
359
3.94k
      nre->min_ = 1;
360
3.94k
      nre->max_ = -1;
361
3.94k
      break;
362
363
5.95k
    case kRegexpQuest:
364
5.95k
      nre->min_ = 0;
365
5.95k
      nre->max_ = 1;
366
5.95k
      break;
367
368
16.2k
    case kRegexpRepeat:
369
16.2k
      nre->min_ = r1->min();
370
16.2k
      nre->max_ = r1->max();
371
16.2k
      break;
372
373
0
    default:
374
0
      nre->Decref();
375
0
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
376
0
      return;
377
31.5k
  }
378
379
31.5k
  switch (r2->op()) {
380
2.70k
    case kRegexpStar:
381
2.70k
      nre->max_ = -1;
382
2.70k
      goto LeaveEmpty;
383
384
1.67k
    case kRegexpPlus:
385
1.67k
      nre->min_++;
386
1.67k
      nre->max_ = -1;
387
1.67k
      goto LeaveEmpty;
388
389
3.13k
    case kRegexpQuest:
390
3.13k
      if (nre->max() != -1)
391
2.24k
        nre->max_++;
392
3.13k
      goto LeaveEmpty;
393
394
1.27k
    case kRegexpRepeat:
395
1.27k
      nre->min_ += r2->min();
396
1.27k
      if (r2->max() == -1)
397
321
        nre->max_ = -1;
398
956
      else if (nre->max() != -1)
399
648
        nre->max_ += r2->max();
400
1.27k
      goto LeaveEmpty;
401
402
4.41k
    case kRegexpLiteral:
403
9.47k
    case kRegexpCharClass:
404
14.6k
    case kRegexpAnyChar:
405
15.1k
    case kRegexpAnyByte:
406
15.1k
      nre->min_++;
407
15.1k
      if (nre->max() != -1)
408
5.71k
        nre->max_++;
409
15.1k
      goto LeaveEmpty;
410
411
26.3k
    LeaveEmpty:
412
26.3k
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
413
26.3k
      *r2ptr = nre;
414
26.3k
      break;
415
416
7.62k
    case kRegexpLiteralString: {
417
7.62k
      Rune r = r1->sub()[0]->rune();
418
      // Determine how much of the literal string is removed.
419
      // We know that we have at least one rune. :)
420
7.62k
      int n = 1;
421
20.1k
      while (n < r2->nrunes() && r2->runes()[n] == r)
422
12.5k
        n++;
423
7.62k
      nre->min_ += n;
424
7.62k
      if (nre->max() != -1)
425
3.47k
        nre->max_ += n;
426
7.62k
      if (n == r2->nrunes())
427
2.37k
        goto LeaveEmpty;
428
5.25k
      *r1ptr = nre;
429
5.25k
      *r2ptr = Regexp::LiteralString(
430
5.25k
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
431
5.25k
      break;
432
7.62k
    }
433
434
0
    default:
435
0
      nre->Decref();
436
0
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
437
0
      return;
438
31.5k
  }
439
440
31.5k
  r1->Decref();
441
31.5k
  r2->Decref();
442
31.5k
}
443
444
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
445
0
  return re->Incref();
446
0
}
447
448
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
449
  // Should never be called: we use Walk(), not WalkExponential().
450
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
451
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
452
#endif
453
0
  return re->Incref();
454
0
}
455
456
2.94M
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
457
2.94M
  if (re->simple()) {
458
2.72M
    *stop = true;
459
2.72M
    return re->Incref();
460
2.72M
  }
461
218k
  return NULL;
462
2.94M
}
463
464
Regexp* SimplifyWalker::PostVisit(Regexp* re,
465
                                  Regexp* parent_arg,
466
                                  Regexp* pre_arg,
467
                                  Regexp** child_args,
468
218k
                                  int nchild_args) {
469
218k
  switch (re->op()) {
470
0
    case kRegexpNoMatch:
471
5.09k
    case kRegexpEmptyMatch:
472
16.3k
    case kRegexpLiteral:
473
25.3k
    case kRegexpLiteralString:
474
25.3k
    case kRegexpBeginLine:
475
25.3k
    case kRegexpEndLine:
476
25.3k
    case kRegexpBeginText:
477
25.3k
    case kRegexpWordBoundary:
478
25.3k
    case kRegexpNoWordBoundary:
479
25.3k
    case kRegexpEndText:
480
25.3k
    case kRegexpAnyChar:
481
25.3k
    case kRegexpAnyByte:
482
25.3k
    case kRegexpHaveMatch:
483
      // All these are always simple.
484
25.3k
      re->simple_ = true;
485
25.3k
      return re->Incref();
486
487
76.6k
    case kRegexpConcat:
488
116k
    case kRegexpAlternate: {
489
      // These are simple as long as the subpieces are simple.
490
116k
      if (!ChildArgsChanged(re, child_args)) {
491
48.5k
        re->simple_ = true;
492
48.5k
        return re->Incref();
493
48.5k
      }
494
67.7k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
495
67.7k
      nre->AllocSub(re->nsub());
496
67.7k
      Regexp** nre_subs = nre->sub();
497
426k
      for (int i = 0; i < re->nsub(); i++)
498
358k
        nre_subs[i] = child_args[i];
499
67.7k
      nre->simple_ = true;
500
67.7k
      return nre;
501
116k
    }
502
503
6.89k
    case kRegexpCapture: {
504
6.89k
      Regexp* newsub = child_args[0];
505
6.89k
      if (newsub == re->sub()[0]) {
506
1.87k
        newsub->Decref();
507
1.87k
        re->simple_ = true;
508
1.87k
        return re->Incref();
509
1.87k
      }
510
5.01k
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
511
5.01k
      nre->AllocSub(1);
512
5.01k
      nre->sub()[0] = newsub;
513
5.01k
      nre->cap_ = re->cap();
514
5.01k
      nre->simple_ = true;
515
5.01k
      return nre;
516
6.89k
    }
517
518
3.10k
    case kRegexpStar:
519
5.06k
    case kRegexpPlus:
520
8.64k
    case kRegexpQuest: {
521
8.64k
      Regexp* newsub = child_args[0];
522
      // Special case: repeat the empty string as much as
523
      // you want, but it's still the empty string.
524
8.64k
      if (newsub->op() == kRegexpEmptyMatch)
525
574
        return newsub;
526
527
      // These are simple as long as the subpiece is simple.
528
8.06k
      if (newsub == re->sub()[0]) {
529
2.57k
        newsub->Decref();
530
2.57k
        re->simple_ = true;
531
2.57k
        return re->Incref();
532
2.57k
      }
533
534
      // These are also idempotent if flags are constant.
535
5.48k
      if (re->op() == newsub->op() &&
536
5.48k
          re->parse_flags() == newsub->parse_flags())
537
499
        return newsub;
538
539
4.98k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
540
4.98k
      nre->AllocSub(1);
541
4.98k
      nre->sub()[0] = newsub;
542
4.98k
      nre->simple_ = true;
543
4.98k
      return nre;
544
5.48k
    }
545
546
54.1k
    case kRegexpRepeat: {
547
54.1k
      Regexp* newsub = child_args[0];
548
      // Special case: repeat the empty string as much as
549
      // you want, but it's still the empty string.
550
54.1k
      if (newsub->op() == kRegexpEmptyMatch)
551
1.44k
        return newsub;
552
553
52.7k
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
554
52.7k
                                   re->parse_flags());
555
52.7k
      newsub->Decref();
556
52.7k
      nre->simple_ = true;
557
52.7k
      return nre;
558
54.1k
    }
559
560
7.28k
    case kRegexpCharClass: {
561
7.28k
      Regexp* nre = SimplifyCharClass(re);
562
7.28k
      nre->simple_ = true;
563
7.28k
      return nre;
564
54.1k
    }
565
218k
  }
566
567
0
  LOG(ERROR) << "Simplify case not handled: " << re->op();
568
0
  return re->Incref();
569
218k
}
570
571
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
572
// Returns a new Regexp, handing the ref to the caller.
573
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
574
1.45M
                                Regexp::ParseFlags parse_flags) {
575
1.45M
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
576
1.45M
  re->AllocSub(2);
577
1.45M
  Regexp** subs = re->sub();
578
1.45M
  subs[0] = re1;
579
1.45M
  subs[1] = re2;
580
1.45M
  return re;
581
1.45M
}
582
583
// Simplifies the expression re{min,max} in terms of *, +, and ?.
584
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
585
// Caller must Decref return value when done with it.
586
// The result will *not* necessarily have the right capturing parens
587
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
588
// but in the Regexp* representation, both (x) are marked as $1.
589
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
590
52.7k
                                       Regexp::ParseFlags f) {
591
  // x{n,} means at least n matches of x.
592
52.7k
  if (max == -1) {
593
    // Special case: x{0,} is x*
594
13.8k
    if (min == 0)
595
1.72k
      return Regexp::Star(re->Incref(), f);
596
597
    // Special case: x{1,} is x+
598
12.1k
    if (min == 1)
599
4.42k
      return Regexp::Plus(re->Incref(), f);
600
601
    // General case: x{4,} is xxxx+
602
7.67k
    PODArray<Regexp*> nre_subs(min);
603
445k
    for (int i = 0; i < min-1; i++)
604
438k
      nre_subs[i] = re->Incref();
605
7.67k
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
606
7.67k
    return Regexp::Concat(nre_subs.data(), min, f);
607
12.1k
  }
608
609
  // Special case: (x){0} matches only empty string.
610
38.9k
  if (min == 0 && max == 0)
611
1.50k
    return new Regexp(kRegexpEmptyMatch, f);
612
613
  // Special case: x{1} is just x.
614
37.4k
  if (min == 1 && max == 1)
615
650
    return re->Incref();
616
617
  // General case: x{n,m} means n copies of x and m copies of x?.
618
  // The machine will do less work if we nest the final m copies,
619
  // so that x{2,5} = xx(x(x(x)?)?)?
620
621
  // Build leading prefix: xx.  Capturing only on the last one.
622
36.7k
  Regexp* nre = NULL;
623
36.7k
  if (min > 0) {
624
32.0k
    PODArray<Regexp*> nre_subs(min);
625
7.36M
    for (int i = 0; i < min; i++)
626
7.33M
      nre_subs[i] = re->Incref();
627
32.0k
    nre = Regexp::Concat(nre_subs.data(), min, f);
628
32.0k
  }
629
630
  // Build and attach suffix: (x(x(x)?)?)?
631
36.7k
  if (max > min) {
632
19.8k
    Regexp* suf = Regexp::Quest(re->Incref(), f);
633
1.45M
    for (int i = min+1; i < max; i++)
634
1.43M
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
635
19.8k
    if (nre == NULL)
636
4.71k
      nre = suf;
637
15.0k
    else
638
15.0k
      nre = Concat2(nre, suf, f);
639
19.8k
  }
640
641
36.7k
  if (nre == NULL) {
642
    // Some degenerate case, like min > max, or min < max < 0.
643
    // This shouldn't happen, because the parser rejects such regexps.
644
0
    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
645
0
    return new Regexp(kRegexpNoMatch, f);
646
0
  }
647
648
36.7k
  return nre;
649
36.7k
}
650
651
// Simplifies a character class.
652
// Caller must Decref return value when done with it.
653
7.28k
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
654
7.28k
  CharClass* cc = re->cc();
655
656
  // Special cases
657
7.28k
  if (cc->empty())
658
1.76k
    return new Regexp(kRegexpNoMatch, re->parse_flags());
659
5.52k
  if (cc->full())
660
247
    return new Regexp(kRegexpAnyChar, re->parse_flags());
661
662
5.27k
  return re->Incref();
663
5.52k
}
664
665
}  // namespace re2