Coverage Report

Created: 2024-05-08 16:16

/src/re2/re2/simplify.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <string>
10
11
#include "util/util.h"
12
#include "util/logging.h"
13
#include "util/utf.h"
14
#include "re2/pod_array.h"
15
#include "re2/regexp.h"
16
#include "re2/walker-inl.h"
17
18
namespace re2 {
19
20
// Parses the regexp src and then simplifies it and sets *dst to the
21
// string representation of the simplified form.  Returns true on success.
22
// Returns false and sets *error (if error != NULL) on error.
23
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24
0
                            std::string* dst, RegexpStatus* status) {
25
0
  Regexp* re = Parse(src, flags, status);
26
0
  if (re == NULL)
27
0
    return false;
28
0
  Regexp* sre = re->Simplify();
29
0
  re->Decref();
30
0
  if (sre == NULL) {
31
0
    if (status) {
32
0
      status->set_code(kRegexpInternalError);
33
0
      status->set_error_arg(src);
34
0
    }
35
0
    return false;
36
0
  }
37
0
  *dst = sre->ToString();
38
0
  sre->Decref();
39
0
  return true;
40
0
}
41
42
// Assuming the simple_ flags on the children are accurate,
43
// is this Regexp* simple?
44
612k
bool Regexp::ComputeSimple() {
45
612k
  Regexp** subs;
46
612k
  switch (op_) {
47
4.05k
    case kRegexpNoMatch:
48
24.3k
    case kRegexpEmptyMatch:
49
367k
    case kRegexpLiteral:
50
368k
    case kRegexpLiteralString:
51
373k
    case kRegexpBeginLine:
52
376k
    case kRegexpEndLine:
53
392k
    case kRegexpBeginText:
54
393k
    case kRegexpWordBoundary:
55
394k
    case kRegexpNoWordBoundary:
56
401k
    case kRegexpEndText:
57
414k
    case kRegexpAnyChar:
58
416k
    case kRegexpAnyByte:
59
416k
    case kRegexpHaveMatch:
60
416k
      return true;
61
42.2k
    case kRegexpConcat:
62
55.3k
    case kRegexpAlternate:
63
      // These are simple as long as the subpieces are simple.
64
55.3k
      subs = sub();
65
308k
      for (int i = 0; i < nsub_; i++)
66
272k
        if (!subs[i]->simple())
67
19.6k
          return false;
68
35.6k
      return true;
69
78.6k
    case kRegexpCharClass:
70
      // Simple as long as the char class is not empty, not full.
71
78.6k
      if (ccb_ != NULL)
72
77.9k
        return !ccb_->empty() && !ccb_->full();
73
714
      return !cc_->empty() && !cc_->full();
74
16.3k
    case kRegexpCapture:
75
16.3k
      subs = sub();
76
16.3k
      return subs[0]->simple();
77
10.3k
    case kRegexpStar:
78
20.9k
    case kRegexpPlus:
79
33.3k
    case kRegexpQuest:
80
33.3k
      subs = sub();
81
33.3k
      if (!subs[0]->simple())
82
1.17k
        return false;
83
32.1k
      switch (subs[0]->op_) {
84
23
        case kRegexpStar:
85
64
        case kRegexpPlus:
86
135
        case kRegexpQuest:
87
313
        case kRegexpEmptyMatch:
88
1.11k
        case kRegexpNoMatch:
89
1.11k
          return false;
90
31.0k
        default:
91
31.0k
          break;
92
32.1k
      }
93
31.0k
      return true;
94
13.3k
    case kRegexpRepeat:
95
13.3k
      return false;
96
612k
  }
97
0
  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
98
0
  return false;
99
612k
}
100
101
// Walker subclass used by Simplify.
102
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
103
// occurrences of that literal into repeats of that literal. It also works for
104
// char classes, any char and any byte.
105
// PostVisit creates the coalesced result, which should then be simplified.
106
class CoalesceWalker : public Regexp::Walker<Regexp*> {
107
 public:
108
22.9k
  CoalesceWalker() {}
109
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
110
                            Regexp** child_args, int nchild_args);
111
  virtual Regexp* Copy(Regexp* re);
112
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114
 private:
115
  // These functions are declared inside CoalesceWalker so that
116
  // they can edit the private fields of the Regexps they construct.
117
118
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
119
  // the parse flags are consistent. (They will not be checked again later.)
120
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
121
122
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
123
  // will be empty match and the coalesced op. In other cases, where part of a
124
  // literal string was removed to be coalesced, the array elements afterwards
125
  // will be the coalesced op and the remainder of the literal string.
126
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
127
128
  CoalesceWalker(const CoalesceWalker&) = delete;
129
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
130
};
131
132
// Walker subclass used by Simplify.
133
// The simplify walk is purely post-recursive: given the simplified children,
134
// PostVisit creates the simplified result.
135
// The child_args are simplified Regexp*s.
136
class SimplifyWalker : public Regexp::Walker<Regexp*> {
137
 public:
138
22.9k
  SimplifyWalker() {}
139
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
140
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
141
                            Regexp** child_args, int nchild_args);
142
  virtual Regexp* Copy(Regexp* re);
143
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
144
145
 private:
146
  // These functions are declared inside SimplifyWalker so that
147
  // they can edit the private fields of the Regexps they construct.
148
149
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
150
  // Caller must Decref return value when done with it.
151
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
152
153
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
154
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
155
  // Caller must Decref return value when done with it.
156
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
157
                                Regexp::ParseFlags parse_flags);
158
159
  // Simplifies a character class by expanding any named classes
160
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
161
  // Caller must Decref return value when done with it.
162
  static Regexp* SimplifyCharClass(Regexp* re);
163
164
  SimplifyWalker(const SimplifyWalker&) = delete;
165
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
166
};
167
168
// Simplifies a regular expression, returning a new regexp.
169
// The new regexp uses traditional Unix egrep features only,
170
// plus the Perl (?:) non-capturing parentheses.
171
// Otherwise, no POSIX or Perl additions.  The new regexp
172
// captures exactly the same subexpressions (with the same indices)
173
// as the original.
174
// Does not edit current object.
175
// Caller must Decref() return value when done with it.
176
177
22.9k
Regexp* Regexp::Simplify() {
178
22.9k
  CoalesceWalker cw;
179
22.9k
  Regexp* cre = cw.Walk(this, NULL);
180
22.9k
  if (cre == NULL)
181
0
    return NULL;
182
22.9k
  if (cw.stopped_early()) {
183
0
    cre->Decref();
184
0
    return NULL;
185
0
  }
186
22.9k
  SimplifyWalker sw;
187
22.9k
  Regexp* sre = sw.Walk(cre, NULL);
188
22.9k
  cre->Decref();
189
22.9k
  if (sre == NULL)
190
0
    return NULL;
191
22.9k
  if (sw.stopped_early()) {
192
0
    sre->Decref();
193
0
    return NULL;
194
0
  }
195
22.9k
  return sre;
196
22.9k
}
197
198
#define Simplify DontCallSimplify  // Avoid accidental recursion
199
200
// Utility function for PostVisit implementations that compares re->sub() with
201
// child_args to determine whether any child_args changed. In the common case,
202
// where nothing changed, calls Decref() for all child_args and returns false,
203
// so PostVisit must return re->Incref(). Otherwise, returns true.
204
134k
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
205
504k
  for (int i = 0; i < re->nsub(); i++) {
206
395k
    Regexp* sub = re->sub()[i];
207
395k
    Regexp* newsub = child_args[i];
208
395k
    if (newsub != sub)
209
25.4k
      return true;
210
395k
  }
211
439k
  for (int i = 0; i < re->nsub(); i++) {
212
330k
    Regexp* newsub = child_args[i];
213
330k
    newsub->Decref();
214
330k
  }
215
109k
  return false;
216
134k
}
217
218
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
219
0
  return re->Incref();
220
0
}
221
222
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
223
  // Should never be called: we use Walk(), not WalkExponential().
224
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
225
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
226
#endif
227
0
  return re->Incref();
228
0
}
229
230
Regexp* CoalesceWalker::PostVisit(Regexp* re,
231
                                  Regexp* parent_arg,
232
                                  Regexp* pre_arg,
233
                                  Regexp** child_args,
234
362k
                                  int nchild_args) {
235
362k
  if (re->nsub() == 0)
236
257k
    return re->Incref();
237
238
105k
  if (re->op() != kRegexpConcat) {
239
64.1k
    if (!ChildArgsChanged(re, child_args))
240
60.9k
      return re->Incref();
241
242
    // Something changed. Build a new op.
243
3.14k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
244
3.14k
    nre->AllocSub(re->nsub());
245
3.14k
    Regexp** nre_subs = nre->sub();
246
13.3k
    for (int i = 0; i < re->nsub(); i++)
247
10.1k
      nre_subs[i] = child_args[i];
248
    // Repeats and Captures have additional data that must be copied.
249
3.14k
    if (re->op() == kRegexpRepeat) {
250
346
      nre->min_ = re->min();
251
346
      nre->max_ = re->max();
252
2.80k
    } else if (re->op() == kRegexpCapture) {
253
521
      nre->cap_ = re->cap();
254
521
    }
255
3.14k
    return nre;
256
64.1k
  }
257
258
41.1k
  bool can_coalesce = false;
259
246k
  for (int i = 0; i < re->nsub(); i++) {
260
209k
    if (i+1 < re->nsub() &&
261
209k
        CanCoalesce(child_args[i], child_args[i+1])) {
262
4.13k
      can_coalesce = true;
263
4.13k
      break;
264
4.13k
    }
265
209k
  }
266
41.1k
  if (!can_coalesce) {
267
37.0k
    if (!ChildArgsChanged(re, child_args))
268
35.3k
      return re->Incref();
269
270
    // Something changed. Build a new op.
271
1.66k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
272
1.66k
    nre->AllocSub(re->nsub());
273
1.66k
    Regexp** nre_subs = nre->sub();
274
7.85k
    for (int i = 0; i < re->nsub(); i++)
275
6.19k
      nre_subs[i] = child_args[i];
276
1.66k
    return nre;
277
37.0k
  }
278
279
42.7k
  for (int i = 0; i < re->nsub(); i++) {
280
38.6k
    if (i+1 < re->nsub() &&
281
38.6k
        CanCoalesce(child_args[i], child_args[i+1]))
282
9.32k
      DoCoalesce(&child_args[i], &child_args[i+1]);
283
38.6k
  }
284
  // Determine how many empty matches were left by DoCoalesce.
285
4.13k
  int n = 0;
286
42.7k
  for (int i = n; i < re->nsub(); i++) {
287
38.6k
    if (child_args[i]->op() == kRegexpEmptyMatch)
288
7.34k
      n++;
289
38.6k
  }
290
  // Build a new op.
291
4.13k
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
292
4.13k
  nre->AllocSub(re->nsub() - n);
293
4.13k
  Regexp** nre_subs = nre->sub();
294
42.7k
  for (int i = 0, j = 0; i < re->nsub(); i++) {
295
38.6k
    if (child_args[i]->op() == kRegexpEmptyMatch) {
296
7.34k
      child_args[i]->Decref();
297
7.34k
      continue;
298
7.34k
    }
299
31.2k
    nre_subs[j] = child_args[i];
300
31.2k
    j++;
301
31.2k
  }
302
4.13k
  return nre;
303
41.1k
}
304
305
207k
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
306
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
307
  // any byte.
308
207k
  if ((r1->op() == kRegexpStar ||
309
207k
       r1->op() == kRegexpPlus ||
310
207k
       r1->op() == kRegexpQuest ||
311
207k
       r1->op() == kRegexpRepeat) &&
312
207k
      (r1->sub()[0]->op() == kRegexpLiteral ||
313
45.8k
       r1->sub()[0]->op() == kRegexpCharClass ||
314
45.8k
       r1->sub()[0]->op() == kRegexpAnyChar ||
315
45.8k
       r1->sub()[0]->op() == kRegexpAnyByte)) {
316
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
317
    // any char or any byte.
318
38.6k
    if ((r2->op() == kRegexpStar ||
319
38.6k
         r2->op() == kRegexpPlus ||
320
38.6k
         r2->op() == kRegexpQuest ||
321
38.6k
         r2->op() == kRegexpRepeat) &&
322
38.6k
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
323
        // The parse flags must be consistent.
324
38.6k
        ((r1->parse_flags() & Regexp::NonGreedy) ==
325
3.14k
         (r2->parse_flags() & Regexp::NonGreedy))) {
326
2.76k
      return true;
327
2.76k
    }
328
    // ... OR an occurrence of that literal, char class, any char or any byte
329
35.8k
    if (Regexp::Equal(r1->sub()[0], r2)) {
330
6.32k
      return true;
331
6.32k
    }
332
    // ... OR a literal string that begins with that literal.
333
29.5k
    if (r1->sub()[0]->op() == kRegexpLiteral &&
334
29.5k
        r2->op() == kRegexpLiteralString &&
335
29.5k
        r2->runes()[0] == r1->sub()[0]->rune() &&
336
        // The parse flags must be consistent.
337
29.5k
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
338
4.40k
         (r2->parse_flags() & Regexp::FoldCase))) {
339
4.37k
      return true;
340
4.37k
    }
341
29.5k
  }
342
193k
  return false;
343
207k
}
344
345
9.32k
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
346
9.32k
  Regexp* r1 = *r1ptr;
347
9.32k
  Regexp* r2 = *r2ptr;
348
349
9.32k
  Regexp* nre = Regexp::Repeat(
350
9.32k
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
351
352
9.32k
  switch (r1->op()) {
353
1.50k
    case kRegexpStar:
354
1.50k
      nre->min_ = 0;
355
1.50k
      nre->max_ = -1;
356
1.50k
      break;
357
358
1.73k
    case kRegexpPlus:
359
1.73k
      nre->min_ = 1;
360
1.73k
      nre->max_ = -1;
361
1.73k
      break;
362
363
1.80k
    case kRegexpQuest:
364
1.80k
      nre->min_ = 0;
365
1.80k
      nre->max_ = 1;
366
1.80k
      break;
367
368
4.29k
    case kRegexpRepeat:
369
4.29k
      nre->min_ = r1->min();
370
4.29k
      nre->max_ = r1->max();
371
4.29k
      break;
372
373
0
    default:
374
0
      nre->Decref();
375
0
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
376
0
      return;
377
9.32k
  }
378
379
9.32k
  switch (r2->op()) {
380
526
    case kRegexpStar:
381
526
      nre->max_ = -1;
382
526
      goto LeaveEmpty;
383
384
508
    case kRegexpPlus:
385
508
      nre->min_++;
386
508
      nre->max_ = -1;
387
508
      goto LeaveEmpty;
388
389
651
    case kRegexpQuest:
390
651
      if (nre->max() != -1)
391
460
        nre->max_++;
392
651
      goto LeaveEmpty;
393
394
286
    case kRegexpRepeat:
395
286
      nre->min_ += r2->min();
396
286
      if (r2->max() == -1)
397
85
        nre->max_ = -1;
398
201
      else if (nre->max() != -1)
399
151
        nre->max_ += r2->max();
400
286
      goto LeaveEmpty;
401
402
1.09k
    case kRegexpLiteral:
403
2.91k
    case kRegexpCharClass:
404
4.65k
    case kRegexpAnyChar:
405
4.82k
    case kRegexpAnyByte:
406
4.82k
      nre->min_++;
407
4.82k
      if (nre->max() != -1)
408
1.07k
        nre->max_++;
409
4.82k
      goto LeaveEmpty;
410
411
7.33k
    LeaveEmpty:
412
7.33k
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
413
7.33k
      *r2ptr = nre;
414
7.33k
      break;
415
416
2.53k
    case kRegexpLiteralString: {
417
2.53k
      Rune r = r1->sub()[0]->rune();
418
      // Determine how much of the literal string is removed.
419
      // We know that we have at least one rune. :)
420
2.53k
      int n = 1;
421
6.32k
      while (n < r2->nrunes() && r2->runes()[n] == r)
422
3.79k
        n++;
423
2.53k
      nre->min_ += n;
424
2.53k
      if (nre->max() != -1)
425
1.15k
        nre->max_ += n;
426
2.53k
      if (n == r2->nrunes())
427
539
        goto LeaveEmpty;
428
1.99k
      *r1ptr = nre;
429
1.99k
      *r2ptr = Regexp::LiteralString(
430
1.99k
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
431
1.99k
      break;
432
2.53k
    }
433
434
0
    default:
435
0
      nre->Decref();
436
0
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
437
0
      return;
438
9.32k
  }
439
440
9.32k
  r1->Decref();
441
9.32k
  r2->Decref();
442
9.32k
}
443
444
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
445
0
  return re->Incref();
446
0
}
447
448
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
449
  // Should never be called: we use Walk(), not WalkExponential().
450
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
451
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
452
#endif
453
0
  return re->Incref();
454
0
}
455
456
195k
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
457
195k
  if (re->simple()) {
458
130k
    *stop = true;
459
130k
    return re->Incref();
460
130k
  }
461
64.1k
  return NULL;
462
195k
}
463
464
Regexp* SimplifyWalker::PostVisit(Regexp* re,
465
                                  Regexp* parent_arg,
466
                                  Regexp* pre_arg,
467
                                  Regexp** child_args,
468
64.1k
                                  int nchild_args) {
469
64.1k
  switch (re->op()) {
470
0
    case kRegexpNoMatch:
471
1.13k
    case kRegexpEmptyMatch:
472
4.49k
    case kRegexpLiteral:
473
7.41k
    case kRegexpLiteralString:
474
7.41k
    case kRegexpBeginLine:
475
7.41k
    case kRegexpEndLine:
476
7.41k
    case kRegexpBeginText:
477
7.41k
    case kRegexpWordBoundary:
478
7.41k
    case kRegexpNoWordBoundary:
479
7.41k
    case kRegexpEndText:
480
7.41k
    case kRegexpAnyChar:
481
7.41k
    case kRegexpAnyByte:
482
7.41k
    case kRegexpHaveMatch:
483
      // All these are always simple.
484
7.41k
      re->simple_ = true;
485
7.41k
      return re->Incref();
486
487
22.4k
    case kRegexpConcat:
488
33.8k
    case kRegexpAlternate: {
489
      // These are simple as long as the subpieces are simple.
490
33.8k
      if (!ChildArgsChanged(re, child_args)) {
491
13.2k
        re->simple_ = true;
492
13.2k
        return re->Incref();
493
13.2k
      }
494
20.6k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
495
20.6k
      nre->AllocSub(re->nsub());
496
20.6k
      Regexp** nre_subs = nre->sub();
497
126k
      for (int i = 0; i < re->nsub(); i++)
498
106k
        nre_subs[i] = child_args[i];
499
20.6k
      nre->simple_ = true;
500
20.6k
      return nre;
501
33.8k
    }
502
503
1.39k
    case kRegexpCapture: {
504
1.39k
      Regexp* newsub = child_args[0];
505
1.39k
      if (newsub == re->sub()[0]) {
506
384
        newsub->Decref();
507
384
        re->simple_ = true;
508
384
        return re->Incref();
509
384
      }
510
1.01k
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
511
1.01k
      nre->AllocSub(1);
512
1.01k
      nre->sub()[0] = newsub;
513
1.01k
      nre->cap_ = re->cap();
514
1.01k
      nre->simple_ = true;
515
1.01k
      return nre;
516
1.39k
    }
517
518
830
    case kRegexpStar:
519
1.31k
    case kRegexpPlus:
520
2.27k
    case kRegexpQuest: {
521
2.27k
      Regexp* newsub = child_args[0];
522
      // Special case: repeat the empty string as much as
523
      // you want, but it's still the empty string.
524
2.27k
      if (newsub->op() == kRegexpEmptyMatch)
525
218
        return newsub;
526
527
      // These are simple as long as the subpiece is simple.
528
2.06k
      if (newsub == re->sub()[0]) {
529
897
        newsub->Decref();
530
897
        re->simple_ = true;
531
897
        return re->Incref();
532
897
      }
533
534
      // These are also idempotent if flags are constant.
535
1.16k
      if (re->op() == newsub->op() &&
536
1.16k
          re->parse_flags() == newsub->parse_flags())
537
184
        return newsub;
538
539
979
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
540
979
      nre->AllocSub(1);
541
979
      nre->sub()[0] = newsub;
542
979
      nre->simple_ = true;
543
979
      return nre;
544
1.16k
    }
545
546
17.0k
    case kRegexpRepeat: {
547
17.0k
      Regexp* newsub = child_args[0];
548
      // Special case: repeat the empty string as much as
549
      // you want, but it's still the empty string.
550
17.0k
      if (newsub->op() == kRegexpEmptyMatch)
551
94
        return newsub;
552
553
16.9k
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
554
16.9k
                                   re->parse_flags());
555
16.9k
      newsub->Decref();
556
16.9k
      nre->simple_ = true;
557
16.9k
      return nre;
558
17.0k
    }
559
560
2.21k
    case kRegexpCharClass: {
561
2.21k
      Regexp* nre = SimplifyCharClass(re);
562
2.21k
      nre->simple_ = true;
563
2.21k
      return nre;
564
17.0k
    }
565
64.1k
  }
566
567
0
  LOG(ERROR) << "Simplify case not handled: " << re->op();
568
0
  return re->Incref();
569
64.1k
}
570
571
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
572
// Returns a new Regexp, handing the ref to the caller.
573
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
574
505k
                                Regexp::ParseFlags parse_flags) {
575
505k
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
576
505k
  re->AllocSub(2);
577
505k
  Regexp** subs = re->sub();
578
505k
  subs[0] = re1;
579
505k
  subs[1] = re2;
580
505k
  return re;
581
505k
}
582
583
// Simplifies the expression re{min,max} in terms of *, +, and ?.
584
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
585
// Caller must Decref return value when done with it.
586
// The result will *not* necessarily have the right capturing parens
587
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
588
// but in the Regexp* representation, both (x) are marked as $1.
589
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
590
16.9k
                                       Regexp::ParseFlags f) {
591
  // x{n,} means at least n matches of x.
592
16.9k
  if (max == -1) {
593
    // Special case: x{0,} is x*
594
4.19k
    if (min == 0)
595
300
      return Regexp::Star(re->Incref(), f);
596
597
    // Special case: x{1,} is x+
598
3.89k
    if (min == 1)
599
1.44k
      return Regexp::Plus(re->Incref(), f);
600
601
    // General case: x{4,} is xxxx+
602
2.45k
    PODArray<Regexp*> nre_subs(min);
603
17.8k
    for (int i = 0; i < min-1; i++)
604
15.4k
      nre_subs[i] = re->Incref();
605
2.45k
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
606
2.45k
    return Regexp::Concat(nre_subs.data(), min, f);
607
3.89k
  }
608
609
  // Special case: (x){0} matches only empty string.
610
12.7k
  if (min == 0 && max == 0)
611
907
    return new Regexp(kRegexpEmptyMatch, f);
612
613
  // Special case: x{1} is just x.
614
11.8k
  if (min == 1 && max == 1)
615
182
    return re->Incref();
616
617
  // General case: x{n,m} means n copies of x and m copies of x?.
618
  // The machine will do less work if we nest the final m copies,
619
  // so that x{2,5} = xx(x(x(x)?)?)?
620
621
  // Build leading prefix: xx.  Capturing only on the last one.
622
11.6k
  Regexp* nre = NULL;
623
11.6k
  if (min > 0) {
624
10.3k
    PODArray<Regexp*> nre_subs(min);
625
2.38M
    for (int i = 0; i < min; i++)
626
2.37M
      nre_subs[i] = re->Incref();
627
10.3k
    nre = Regexp::Concat(nre_subs.data(), min, f);
628
10.3k
  }
629
630
  // Build and attach suffix: (x(x(x)?)?)?
631
11.6k
  if (max > min) {
632
6.05k
    Regexp* suf = Regexp::Quest(re->Incref(), f);
633
506k
    for (int i = min+1; i < max; i++)
634
500k
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
635
6.05k
    if (nre == NULL)
636
1.29k
      nre = suf;
637
4.76k
    else
638
4.76k
      nre = Concat2(nre, suf, f);
639
6.05k
  }
640
641
11.6k
  if (nre == NULL) {
642
    // Some degenerate case, like min > max, or min < max < 0.
643
    // This shouldn't happen, because the parser rejects such regexps.
644
0
    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
645
0
    return new Regexp(kRegexpNoMatch, f);
646
0
  }
647
648
11.6k
  return nre;
649
11.6k
}
650
651
// Simplifies a character class.
652
// Caller must Decref return value when done with it.
653
2.21k
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
654
2.21k
  CharClass* cc = re->cc();
655
656
  // Special cases
657
2.21k
  if (cc->empty())
658
340
    return new Regexp(kRegexpNoMatch, re->parse_flags());
659
1.87k
  if (cc->full())
660
31
    return new Regexp(kRegexpAnyChar, re->parse_flags());
661
662
1.83k
  return re->Incref();
663
1.87k
}
664
665
}  // namespace re2