Coverage Report

Created: 2025-06-24 07:53

/src/duckdb/third_party/re2/re2/simplify.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <string>
10
11
#include "util/util.h"
12
#include "util/logging.h"
13
#include "util/utf.h"
14
#include "re2/pod_array.h"
15
#include "re2/regexp.h"
16
#include "re2/walker-inl.h"
17
18
namespace duckdb_re2 {
19
20
// Parses the regexp src and then simplifies it and sets *dst to the
21
// string representation of the simplified form.  Returns true on success.
22
// Returns false and sets *error (if error != NULL) on error.
23
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24
0
                            std::string* dst, RegexpStatus* status) {
25
0
  Regexp* re = Parse(src, flags, status);
26
0
  if (re == NULL)
27
0
    return false;
28
0
  Regexp* sre = re->Simplify();
29
0
  re->Decref();
30
0
  if (sre == NULL) {
31
0
    if (status) {
32
0
      status->set_code(kRegexpInternalError);
33
0
      status->set_error_arg(src);
34
0
    }
35
0
    return false;
36
0
  }
37
0
  *dst = sre->ToString();
38
0
  sre->Decref();
39
0
  return true;
40
0
}
41
42
// Assuming the simple_ flags on the children are accurate,
43
// is this Regexp* simple?
44
1.72M
bool Regexp::ComputeSimple() {
45
1.72M
  Regexp** subs;
46
1.72M
  switch (op_) {
47
0
    case kRegexpNoMatch:
48
344k
    case kRegexpEmptyMatch:
49
882k
    case kRegexpLiteral:
50
882k
    case kRegexpLiteralString:
51
882k
    case kRegexpBeginLine:
52
882k
    case kRegexpEndLine:
53
926k
    case kRegexpBeginText:
54
927k
    case kRegexpWordBoundary:
55
928k
    case kRegexpNoWordBoundary:
56
936k
    case kRegexpEndText:
57
936k
    case kRegexpAnyChar:
58
937k
    case kRegexpAnyByte:
59
937k
    case kRegexpHaveMatch:
60
937k
      return true;
61
46.1k
    case kRegexpConcat:
62
48.0k
    case kRegexpAlternate:
63
      // These are simple as long as the subpieces are simple.
64
48.0k
      subs = sub();
65
1.07M
      for (int i = 0; i < nsub_; i++)
66
1.02M
        if (!subs[i]->simple())
67
1.42k
          return false;
68
46.5k
      return true;
69
636k
    case kRegexpCharClass:
70
      // Simple as long as the char class is not empty, not full.
71
636k
      if (arguments.char_class.ccb_ != NULL)
72
636k
        return !arguments.char_class.ccb_->empty() && !arguments.char_class.ccb_->full();
73
33
      return !arguments.char_class.cc_->empty() && !arguments.char_class.cc_->full();
74
3.03k
    case kRegexpCapture:
75
3.03k
      subs = sub();
76
3.03k
      return subs[0]->simple();
77
19.2k
    case kRegexpStar:
78
54.4k
    case kRegexpPlus:
79
104k
    case kRegexpQuest:
80
104k
      subs = sub();
81
104k
      if (!subs[0]->simple())
82
0
        return false;
83
104k
      switch (subs[0]->op_) {
84
0
        case kRegexpStar:
85
0
        case kRegexpPlus:
86
0
        case kRegexpQuest:
87
0
        case kRegexpEmptyMatch:
88
0
        case kRegexpNoMatch:
89
0
          return false;
90
104k
        default:
91
104k
          break;
92
104k
      }
93
104k
      return true;
94
36
    case kRegexpRepeat:
95
36
      return false;
96
1.72M
  }
97
0
  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
98
0
  return false;
99
1.72M
}
100
101
// Walker subclass used by Simplify.
102
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
103
// occurrences of that literal into repeats of that literal. It also works for
104
// char classes, any char and any byte.
105
// PostVisit creates the coalesced result, which should then be simplified.
106
class CoalesceWalker : public Regexp::Walker<Regexp*> {
107
 public:
108
4.88k
  CoalesceWalker() {}
109
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
110
                            Regexp** child_args, int nchild_args);
111
  virtual Regexp* Copy(Regexp* re);
112
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114
 private:
115
  // These functions are declared inside CoalesceWalker so that
116
  // they can edit the private fields of the Regexps they construct.
117
118
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
119
  // the parse flags are consistent. (They will not be checked again later.)
120
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
121
122
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
123
  // will be empty match and the coalesced op. In other cases, where part of a
124
  // literal string was removed to be coalesced, the array elements afterwards
125
  // will be the coalesced op and the remainder of the literal string.
126
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
127
128
  CoalesceWalker(const CoalesceWalker&) = delete;
129
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
130
};
131
132
// Walker subclass used by Simplify.
133
// The simplify walk is purely post-recursive: given the simplified children,
134
// PostVisit creates the simplified result.
135
// The child_args are simplified Regexp*s.
136
class SimplifyWalker : public Regexp::Walker<Regexp*> {
137
 public:
138
4.88k
  SimplifyWalker() {}
139
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
140
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
141
                            Regexp** child_args, int nchild_args);
142
  virtual Regexp* Copy(Regexp* re);
143
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
144
145
 private:
146
  // These functions are declared inside SimplifyWalker so that
147
  // they can edit the private fields of the Regexps they construct.
148
149
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
150
  // Caller must Decref return value when done with it.
151
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
152
153
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
154
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
155
  // Caller must Decref return value when done with it.
156
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
157
                                Regexp::ParseFlags parse_flags);
158
159
  // Simplifies a character class by expanding any named classes
160
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
161
  // Caller must Decref return value when done with it.
162
  static Regexp* SimplifyCharClass(Regexp* re);
163
164
  SimplifyWalker(const SimplifyWalker&) = delete;
165
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
166
};
167
168
// Simplifies a regular expression, returning a new regexp.
169
// The new regexp uses traditional Unix egrep features only,
170
// plus the Perl (?:) non-capturing parentheses.
171
// Otherwise, no POSIX or Perl additions.  The new regexp
172
// captures exactly the same subexpressions (with the same indices)
173
// as the original.
174
// Does not edit current object.
175
// Caller must Decref() return value when done with it.
176
177
4.88k
Regexp* Regexp::Simplify() {
178
4.88k
  CoalesceWalker cw;
179
4.88k
  Regexp* cre = cw.Walk(this, NULL);
180
4.88k
  if (cre == NULL)
181
0
    return NULL;
182
4.88k
  if (cw.stopped_early()) {
183
0
    cre->Decref();
184
0
    return NULL;
185
0
  }
186
4.88k
  SimplifyWalker sw;
187
4.88k
  Regexp* sre = sw.Walk(cre, NULL);
188
4.88k
  cre->Decref();
189
4.88k
  if (sre == NULL)
190
0
    return NULL;
191
4.88k
  if (sw.stopped_early()) {
192
0
    sre->Decref();
193
0
    return NULL;
194
0
  }
195
4.88k
  return sre;
196
4.88k
}
197
198
#define Simplify DontCallSimplify  // Avoid accidental recursion
199
200
// Utility function for PostVisit implementations that compares re->sub() with
201
// child_args to determine whether any child_args changed. In the common case,
202
// where nothing changed, calls Decref() for all child_args and returns false,
203
// so PostVisit must return re->Incref(). Otherwise, returns true.
204
216k
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
205
707k
  for (int i = 0; i < re->nsub(); i++) {
206
561k
    Regexp* sub = re->sub()[i];
207
561k
    Regexp* newsub = child_args[i];
208
561k
    if (newsub != sub)
209
71.6k
      return true;
210
561k
  }
211
450k
  for (int i = 0; i < re->nsub(); i++) {
212
305k
    Regexp* newsub = child_args[i];
213
305k
    newsub->Decref();
214
305k
  }
215
145k
  return false;
216
216k
}
217
218
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
219
0
  return re->Incref();
220
0
}
221
222
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
223
  // Should never be called: we use Walk(), not WalkExponential().
224
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
225
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
226
#endif
227
0
  return re->Incref();
228
0
}
229
230
Regexp* CoalesceWalker::PostVisit(Regexp* re,
231
                                  Regexp* parent_arg,
232
                                  Regexp* pre_arg,
233
                                  Regexp** child_args,
234
1.49M
                                  int nchild_args) {
235
1.49M
  if (re->nsub() == 0)
236
1.31M
    return re->Incref();
237
238
183k
  if (re->op() != kRegexpConcat) {
239
114k
    if (!ChildArgsChanged(re, child_args))
240
109k
      return re->Incref();
241
242
    // Something changed. Build a new op.
243
4.78k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
244
4.78k
    nre->AllocSub(re->nsub());
245
4.78k
    Regexp** nre_subs = nre->sub();
246
422k
    for (int i = 0; i < re->nsub(); i++)
247
418k
      nre_subs[i] = child_args[i];
248
    // Repeats and Captures have additional data that must be copied.
249
4.78k
    if (re->op() == kRegexpRepeat) {
250
0
      nre->arguments.repeat.min_ = re->min();
251
0
      nre->arguments.repeat.max_ = re->max();
252
4.78k
    } else if (re->op() == kRegexpCapture) {
253
84
      nre->arguments.capture.cap_ = re->cap();
254
84
    }
255
4.78k
    return nre;
256
114k
  }
257
258
68.9k
  bool can_coalesce = false;
259
374k
  for (int i = 0; i < re->nsub(); i++) {
260
339k
    if (i+1 < re->nsub() &&
261
339k
        CanCoalesce(child_args[i], child_args[i+1])) {
262
34.0k
      can_coalesce = true;
263
34.0k
      break;
264
34.0k
    }
265
339k
  }
266
68.9k
  if (!can_coalesce) {
267
34.8k
    if (!ChildArgsChanged(re, child_args))
268
20.8k
      return re->Incref();
269
270
    // Something changed. Build a new op.
271
14.0k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
272
14.0k
    nre->AllocSub(re->nsub());
273
14.0k
    Regexp** nre_subs = nre->sub();
274
42.7k
    for (int i = 0; i < re->nsub(); i++)
275
28.7k
      nre_subs[i] = child_args[i];
276
14.0k
    return nre;
277
34.8k
  }
278
279
805k
  for (int i = 0; i < re->nsub(); i++) {
280
771k
    if (i+1 < re->nsub() &&
281
771k
        CanCoalesce(child_args[i], child_args[i+1]))
282
164k
      DoCoalesce(&child_args[i], &child_args[i+1]);
283
771k
  }
284
  // Determine how many empty matches were left by DoCoalesce.
285
34.0k
  int n = 0;
286
805k
  for (int i = n; i < re->nsub(); i++) {
287
771k
    if (child_args[i]->op() == kRegexpEmptyMatch)
288
99.7k
      n++;
289
771k
  }
290
  // Build a new op.
291
34.0k
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
292
34.0k
  nre->AllocSub(re->nsub() - n);
293
34.0k
  Regexp** nre_subs = nre->sub();
294
805k
  for (int i = 0, j = 0; i < re->nsub(); i++) {
295
771k
    if (child_args[i]->op() == kRegexpEmptyMatch) {
296
99.7k
      child_args[i]->Decref();
297
99.7k
      continue;
298
99.7k
    }
299
671k
    nre_subs[j] = child_args[i];
300
671k
    j++;
301
671k
  }
302
34.0k
  return nre;
303
68.9k
}
304
305
1.04M
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
306
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
307
  // any byte.
308
1.04M
  if ((r1->op() == kRegexpStar ||
309
1.04M
       r1->op() == kRegexpPlus ||
310
1.04M
       r1->op() == kRegexpQuest ||
311
1.04M
       r1->op() == kRegexpRepeat) &&
312
1.04M
      (r1->sub()[0]->op() == kRegexpLiteral ||
313
240k
       r1->sub()[0]->op() == kRegexpCharClass ||
314
240k
       r1->sub()[0]->op() == kRegexpAnyChar ||
315
240k
       r1->sub()[0]->op() == kRegexpAnyByte)) {
316
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
317
    // any char or any byte.
318
239k
    if ((r2->op() == kRegexpStar ||
319
239k
         r2->op() == kRegexpPlus ||
320
239k
         r2->op() == kRegexpQuest ||
321
239k
         r2->op() == kRegexpRepeat) &&
322
239k
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
323
        // The parse flags must be consistent.
324
239k
        ((r1->parse_flags() & Regexp::NonGreedy) ==
325
2.04k
         (r2->parse_flags() & Regexp::NonGreedy))) {
326
2.03k
      return true;
327
2.03k
    }
328
    // ... OR an occurrence of that literal, char class, any char or any byte
329
237k
    if (Regexp::Equal(r1->sub()[0], r2)) {
330
102k
      return true;
331
102k
    }
332
    // ... OR a literal string that begins with that literal.
333
134k
    if (r1->sub()[0]->op() == kRegexpLiteral &&
334
134k
        r2->op() == kRegexpLiteralString &&
335
134k
        r2->runes()[0] == r1->sub()[0]->rune() &&
336
        // The parse flags must be consistent.
337
134k
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
338
93.3k
         (r2->parse_flags() & Regexp::FoldCase))) {
339
93.3k
      return true;
340
93.3k
    }
341
134k
  }
342
843k
  return false;
343
1.04M
}
344
345
164k
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
346
164k
  Regexp* r1 = *r1ptr;
347
164k
  Regexp* r2 = *r2ptr;
348
349
164k
  Regexp* nre = Regexp::Repeat(
350
164k
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
351
352
164k
  switch (r1->op()) {
353
12.5k
    case kRegexpStar:
354
12.5k
      nre->arguments.repeat.min_ = 0;
355
12.5k
      nre->arguments.repeat.max_ = -1;
356
12.5k
      break;
357
358
25.8k
    case kRegexpPlus:
359
25.8k
      nre->arguments.repeat.min_ = 1;
360
25.8k
      nre->arguments.repeat.max_ = -1;
361
25.8k
      break;
362
363
37.7k
    case kRegexpQuest:
364
37.7k
      nre->arguments.repeat.min_ = 0;
365
37.7k
      nre->arguments.repeat.max_ = 1;
366
37.7k
      break;
367
368
88.2k
    case kRegexpRepeat:
369
88.2k
      nre->arguments.repeat.min_ = r1->min();
370
88.2k
      nre->arguments.repeat.max_ = r1->max();
371
88.2k
      break;
372
373
0
    default:
374
0
      nre->Decref();
375
0
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
376
0
      return;
377
164k
  }
378
379
164k
  switch (r2->op()) {
380
686
    case kRegexpStar:
381
686
      nre->arguments.repeat.max_ = -1;
382
686
      goto LeaveEmpty;
383
384
565
    case kRegexpPlus:
385
565
      nre->arguments.repeat.min_++;
386
565
      nre->arguments.repeat.max_ = -1;
387
565
      goto LeaveEmpty;
388
389
732
    case kRegexpQuest:
390
732
      if (nre->max() != -1)
391
158
        nre->arguments.repeat.max_++;
392
732
      goto LeaveEmpty;
393
394
0
    case kRegexpRepeat:
395
0
      nre->arguments.repeat.min_ += r2->min();
396
0
      if (r2->max() == -1)
397
0
        nre->arguments.repeat.max_ = -1;
398
0
      else if (nre->max() != -1)
399
0
        nre->arguments.repeat.max_ += r2->max();
400
0
      goto LeaveEmpty;
401
402
125
    case kRegexpLiteral:
403
97.6k
    case kRegexpCharClass:
404
97.6k
    case kRegexpAnyChar:
405
97.6k
    case kRegexpAnyByte:
406
97.6k
      nre->arguments.repeat.min_++;
407
97.6k
      if (nre->max() != -1)
408
53.0k
        nre->arguments.repeat.max_++;
409
97.6k
      goto LeaveEmpty;
410
411
99.7k
    LeaveEmpty:
412
99.7k
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
413
99.7k
      *r2ptr = nre;
414
99.7k
      break;
415
416
64.6k
    case kRegexpLiteralString: {
417
64.6k
      Rune r = r1->sub()[0]->rune();
418
      // Determine how much of the literal string is removed.
419
      // We know that we have at least one rune. :)
420
64.6k
      int n = 1;
421
366k
      while (n < r2->nrunes() && r2->runes()[n] == r)
422
301k
        n++;
423
64.6k
      nre->arguments.repeat.min_ += n;
424
64.6k
      if (nre->max() != -1)
425
28.0k
        nre->arguments.repeat.max_ += n;
426
64.6k
      if (n == r2->nrunes())
427
142
        goto LeaveEmpty;
428
64.4k
      *r1ptr = nre;
429
64.4k
      *r2ptr = Regexp::LiteralString(
430
64.4k
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
431
64.4k
      break;
432
64.6k
    }
433
434
0
    default:
435
0
      nre->Decref();
436
0
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
437
0
      return;
438
164k
  }
439
440
164k
  r1->Decref();
441
164k
  r2->Decref();
442
164k
}
443
444
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
445
0
  return re->Incref();
446
0
}
447
448
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
449
  // Should never be called: we use Walk(), not WalkExponential().
450
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
451
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
452
#endif
453
0
  return re->Incref();
454
0
}
455
456
1.23M
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
457
1.23M
  if (re->simple()) {
458
1.00M
    *stop = true;
459
1.00M
    return re->Incref();
460
1.00M
  }
461
221k
  return NULL;
462
1.23M
}
463
464
Regexp* SimplifyWalker::PostVisit(Regexp* re,
465
                                  Regexp* parent_arg,
466
                                  Regexp* pre_arg,
467
                                  Regexp** child_args,
468
221k
                                  int nchild_args) {
469
221k
  switch (re->op()) {
470
0
    case kRegexpNoMatch:
471
47
    case kRegexpEmptyMatch:
472
23.5k
    case kRegexpLiteral:
473
70.1k
    case kRegexpLiteralString:
474
70.1k
    case kRegexpBeginLine:
475
70.1k
    case kRegexpEndLine:
476
70.1k
    case kRegexpBeginText:
477
70.1k
    case kRegexpWordBoundary:
478
70.1k
    case kRegexpNoWordBoundary:
479
70.1k
    case kRegexpEndText:
480
70.1k
    case kRegexpAnyChar:
481
70.1k
    case kRegexpAnyByte:
482
70.1k
    case kRegexpHaveMatch:
483
      // All these are always simple.
484
70.1k
      re->simple_ = true;
485
70.1k
      return re->Incref();
486
487
57.9k
    case kRegexpConcat:
488
67.7k
    case kRegexpAlternate: {
489
      // These are simple as long as the subpieces are simple.
490
67.7k
      if (!ChildArgsChanged(re, child_args)) {
491
14.8k
        re->simple_ = true;
492
14.8k
        return re->Incref();
493
14.8k
      }
494
52.8k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
495
52.8k
      nre->AllocSub(re->nsub());
496
52.8k
      Regexp** nre_subs = nre->sub();
497
1.17M
      for (int i = 0; i < re->nsub(); i++)
498
1.11M
        nre_subs[i] = child_args[i];
499
52.8k
      nre->simple_ = true;
500
52.8k
      return nre;
501
67.7k
    }
502
503
292
    case kRegexpCapture: {
504
292
      Regexp* newsub = child_args[0];
505
292
      if (newsub == re->sub()[0]) {
506
172
        newsub->Decref();
507
172
        re->simple_ = true;
508
172
        return re->Incref();
509
172
      }
510
120
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
511
120
      nre->AllocSub(1);
512
120
      nre->sub()[0] = newsub;
513
120
      nre->arguments.capture.cap_ = re->cap();
514
120
      nre->simple_ = true;
515
120
      return nre;
516
292
    }
517
518
0
    case kRegexpStar:
519
0
    case kRegexpPlus:
520
0
    case kRegexpQuest: {
521
0
      Regexp* newsub = child_args[0];
522
      // Special case: repeat the empty string as much as
523
      // you want, but it's still the empty string.
524
0
      if (newsub->op() == kRegexpEmptyMatch)
525
0
        return newsub;
526
527
      // These are simple as long as the subpiece is simple.
528
0
      if (newsub == re->sub()[0]) {
529
0
        newsub->Decref();
530
0
        re->simple_ = true;
531
0
        return re->Incref();
532
0
      }
533
534
      // These are also idempotent if flags are constant.
535
0
      if (re->op() == newsub->op() &&
536
0
          re->parse_flags() == newsub->parse_flags())
537
0
        return newsub;
538
539
0
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
540
0
      nre->AllocSub(1);
541
0
      nre->sub()[0] = newsub;
542
0
      nre->simple_ = true;
543
0
      return nre;
544
0
    }
545
546
76.0k
    case kRegexpRepeat: {
547
76.0k
      Regexp* newsub = child_args[0];
548
      // Special case: repeat the empty string as much as
549
      // you want, but it's still the empty string.
550
76.0k
      if (newsub->op() == kRegexpEmptyMatch)
551
0
        return newsub;
552
553
76.0k
      Regexp* nre = SimplifyRepeat(newsub, re->arguments.repeat.min_, re->arguments.repeat.max_,
554
76.0k
                                   re->parse_flags());
555
76.0k
      newsub->Decref();
556
76.0k
      nre->simple_ = true;
557
76.0k
      return nre;
558
76.0k
    }
559
560
7.77k
    case kRegexpCharClass: {
561
7.77k
      Regexp* nre = SimplifyCharClass(re);
562
7.77k
      nre->simple_ = true;
563
7.77k
      return nre;
564
76.0k
    }
565
221k
  }
566
567
0
  LOG(ERROR) << "Simplify case not handled: " << re->op();
568
0
  return re->Incref();
569
221k
}
570
571
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
572
// Returns a new Regexp, handing the ref to the caller.
573
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
574
37.2k
                                Regexp::ParseFlags parse_flags) {
575
37.2k
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
576
37.2k
  re->AllocSub(2);
577
37.2k
  Regexp** subs = re->sub();
578
37.2k
  subs[0] = re1;
579
37.2k
  subs[1] = re2;
580
37.2k
  return re;
581
37.2k
}
582
583
// Simplifies the expression re{min,max} in terms of *, +, and ?.
584
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
585
// Caller must Decref return value when done with it.
586
// The result will *not* necessarily have the right capturing parens
587
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
588
// but in the Regexp* representation, both (x) are marked as $1.
589
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
590
76.0k
                                       Regexp::ParseFlags f) {
591
  // x{n,} means at least n matches of x.
592
76.0k
  if (max == -1) {
593
    // Special case: x{0,} is x*
594
38.9k
    if (min == 0)
595
49
      return Regexp::Star(re->Incref(), f);
596
597
    // Special case: x{1,} is x+
598
38.8k
    if (min == 1)
599
10.4k
      return Regexp::Plus(re->Incref(), f);
600
601
    // General case: x{4,} is xxxx+
602
28.4k
    PODArray<Regexp*> nre_subs(min);
603
105k
    for (int i = 0; i < min-1; i++)
604
77.5k
      nre_subs[i] = re->Incref();
605
28.4k
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
606
28.4k
    return Regexp::Concat(nre_subs.data(), min, f);
607
38.8k
  }
608
609
  // Special case: (x){0} matches only empty string.
610
37.1k
  if (min == 0 && max == 0)
611
0
    return new Regexp(kRegexpEmptyMatch, f);
612
613
  // Special case: x{1} is just x.
614
37.1k
  if (min == 1 && max == 1)
615
0
    return re->Incref();
616
617
  // General case: x{n,m} means n copies of x and m copies of x?.
618
  // The machine will do less work if we nest the final m copies,
619
  // so that x{2,5} = xx(x(x(x)?)?)?
620
621
  // Build leading prefix: xx.  Capturing only on the last one.
622
37.1k
  Regexp* nre = NULL;
623
37.1k
  if (min > 0) {
624
37.1k
    PODArray<Regexp*> nre_subs(min);
625
411k
    for (int i = 0; i < min; i++)
626
374k
      nre_subs[i] = re->Incref();
627
37.1k
    nre = Regexp::Concat(nre_subs.data(), min, f);
628
37.1k
  }
629
630
  // Build and attach suffix: (x(x(x)?)?)?
631
37.1k
  if (max > min) {
632
37.1k
    Regexp* suf = Regexp::Quest(re->Incref(), f);
633
37.2k
    for (int i = min+1; i < max; i++)
634
158
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
635
37.1k
    if (nre == NULL)
636
14
      nre = suf;
637
37.1k
    else
638
37.1k
      nre = Concat2(nre, suf, f);
639
37.1k
  }
640
641
37.1k
  if (nre == NULL) {
642
    // Some degenerate case, like min > max, or min < max < 0.
643
    // This shouldn't happen, because the parser rejects such regexps.
644
0
    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
645
0
    return new Regexp(kRegexpNoMatch, f);
646
0
  }
647
648
37.1k
  return nre;
649
37.1k
}
650
651
// Simplifies a character class.
652
// Caller must Decref return value when done with it.
653
7.77k
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
654
7.77k
  CharClass* cc = re->cc();
655
656
  // Special cases
657
7.77k
  if (cc->empty())
658
0
    return new Regexp(kRegexpNoMatch, re->parse_flags());
659
7.77k
  if (cc->full())
660
0
    return new Regexp(kRegexpAnyChar, re->parse_flags());
661
662
7.77k
  return re->Incref();
663
7.77k
}
664
665
}  // namespace re2