Coverage Report

Created: 2025-10-27 06:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/re2/re2/simplify.cc
Line
Count
Source
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <stddef.h>
10
11
#include <algorithm>
12
#include <string>
13
14
#include "absl/log/absl_log.h"
15
#include "absl/strings/string_view.h"
16
#include "re2/pod_array.h"
17
#include "re2/regexp.h"
18
#include "re2/walker-inl.h"
19
#include "util/utf.h"
20
21
namespace re2 {
22
23
// Parses the regexp src and then simplifies it and sets *dst to the
24
// string representation of the simplified form.  Returns true on success.
25
// Returns false and sets *error (if error != NULL) on error.
26
bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
27
0
                            std::string* dst, RegexpStatus* status) {
28
0
  Regexp* re = Parse(src, flags, status);
29
0
  if (re == NULL)
30
0
    return false;
31
0
  Regexp* sre = re->Simplify();
32
0
  re->Decref();
33
0
  if (sre == NULL) {
34
0
    if (status) {
35
0
      status->set_code(kRegexpInternalError);
36
0
      status->set_error_arg(src);
37
0
    }
38
0
    return false;
39
0
  }
40
0
  *dst = sre->ToString();
41
0
  sre->Decref();
42
0
  return true;
43
0
}
44
45
// Assuming the simple_ flags on the children are accurate,
46
// is this Regexp* simple?
47
923k
bool Regexp::ComputeSimple() {
48
923k
  Regexp** subs;
49
923k
  switch (op_) {
50
4.91k
    case kRegexpNoMatch:
51
63.0k
    case kRegexpEmptyMatch:
52
402k
    case kRegexpLiteral:
53
404k
    case kRegexpLiteralString:
54
413k
    case kRegexpBeginLine:
55
425k
    case kRegexpEndLine:
56
443k
    case kRegexpBeginText:
57
445k
    case kRegexpWordBoundary:
58
446k
    case kRegexpNoWordBoundary:
59
486k
    case kRegexpEndText:
60
491k
    case kRegexpAnyChar:
61
494k
    case kRegexpAnyByte:
62
494k
    case kRegexpHaveMatch:
63
494k
      return true;
64
64.1k
    case kRegexpConcat:
65
126k
    case kRegexpAlternate:
66
      // These are simple as long as the subpieces are simple.
67
126k
      subs = sub();
68
767k
      for (int i = 0; i < nsub_; i++)
69
677k
        if (!subs[i]->simple())
70
37.1k
          return false;
71
89.4k
      return true;
72
66.3k
    case kRegexpCharClass:
73
      // Simple as long as the char class is not empty, not full.
74
66.3k
      if (ccb_ != NULL)
75
58.1k
        return !ccb_->empty() && !ccb_->full();
76
8.25k
      return !cc_->empty() && !cc_->full();
77
87.1k
    case kRegexpCapture:
78
87.1k
      subs = sub();
79
87.1k
      return subs[0]->simple();
80
41.1k
    case kRegexpStar:
81
57.1k
    case kRegexpPlus:
82
100k
    case kRegexpQuest:
83
100k
      subs = sub();
84
100k
      if (!subs[0]->simple())
85
6.00k
        return false;
86
94.1k
      switch (subs[0]->op_) {
87
12
        case kRegexpStar:
88
29
        case kRegexpPlus:
89
131
        case kRegexpQuest:
90
422
        case kRegexpEmptyMatch:
91
2.22k
        case kRegexpNoMatch:
92
2.22k
          return false;
93
91.8k
        default:
94
91.8k
          break;
95
94.1k
      }
96
91.8k
      return true;
97
48.8k
    case kRegexpRepeat:
98
48.8k
      return false;
99
923k
  }
100
923k
  ABSL_LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
101
0
  return false;
102
0
}
103
104
// Walker subclass used by Simplify.
105
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
106
// occurrences of that literal into repeats of that literal. It also works for
107
// char classes, any char and any byte.
108
// PostVisit creates the coalesced result, which should then be simplified.
109
class CoalesceWalker : public Regexp::Walker<Regexp*> {
110
 public:
111
50.4k
  CoalesceWalker() {}
112
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
113
                            Regexp** child_args, int nchild_args);
114
  virtual Regexp* Copy(Regexp* re);
115
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
116
117
 private:
118
  // These functions are declared inside CoalesceWalker so that
119
  // they can edit the private fields of the Regexps they construct.
120
121
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
122
  // the parse flags are consistent. (They will not be checked again later.)
123
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
124
125
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
126
  // will be empty match and the coalesced op. In other cases, where part of a
127
  // literal string was removed to be coalesced, the array elements afterwards
128
  // will be the coalesced op and the remainder of the literal string.
129
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
130
131
  CoalesceWalker(const CoalesceWalker&) = delete;
132
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
133
};
134
135
// Walker subclass used by Simplify.
136
// The simplify walk is purely post-recursive: given the simplified children,
137
// PostVisit creates the simplified result.
138
// The child_args are simplified Regexp*s.
139
class SimplifyWalker : public Regexp::Walker<Regexp*> {
140
 public:
141
50.4k
  SimplifyWalker() {}
142
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
143
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
144
                            Regexp** child_args, int nchild_args);
145
  virtual Regexp* Copy(Regexp* re);
146
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
147
148
 private:
149
  // These functions are declared inside SimplifyWalker so that
150
  // they can edit the private fields of the Regexps they construct.
151
152
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
153
  // Caller must Decref return value when done with it.
154
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
155
156
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
157
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
158
  // Caller must Decref return value when done with it.
159
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
160
                                Regexp::ParseFlags parse_flags);
161
162
  // Simplifies a character class by expanding any named classes
163
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
164
  // Caller must Decref return value when done with it.
165
  static Regexp* SimplifyCharClass(Regexp* re);
166
167
  SimplifyWalker(const SimplifyWalker&) = delete;
168
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
169
};
170
171
// Simplifies a regular expression, returning a new regexp.
172
// The new regexp uses traditional Unix egrep features only,
173
// plus the Perl (?:) non-capturing parentheses.
174
// Otherwise, no POSIX or Perl additions.  The new regexp
175
// captures exactly the same subexpressions (with the same indices)
176
// as the original.
177
// Does not edit current object.
178
// Caller must Decref() return value when done with it.
179
180
50.4k
Regexp* Regexp::Simplify() {
181
50.4k
  CoalesceWalker cw;
182
50.4k
  Regexp* cre = cw.Walk(this, NULL);
183
50.4k
  if (cre == NULL)
184
0
    return NULL;
185
50.4k
  if (cw.stopped_early()) {
186
0
    cre->Decref();
187
0
    return NULL;
188
0
  }
189
50.4k
  SimplifyWalker sw;
190
50.4k
  Regexp* sre = sw.Walk(cre, NULL);
191
50.4k
  cre->Decref();
192
50.4k
  if (sre == NULL)
193
0
    return NULL;
194
50.4k
  if (sw.stopped_early()) {
195
0
    sre->Decref();
196
0
    return NULL;
197
0
  }
198
50.4k
  return sre;
199
50.4k
}
200
201
#define Simplify DontCallSimplify  // Avoid accidental recursion
202
203
// Utility function for PostVisit implementations that compares re->sub() with
204
// child_args to determine whether any child_args changed. In the common case,
205
// where nothing changed, calls Decref() for all child_args and returns false,
206
// so PostVisit must return re->Incref(). Otherwise, returns true.
207
593k
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
208
1.72M
  for (int i = 0; i < re->nsub(); i++) {
209
1.22M
    Regexp* sub = re->sub()[i];
210
1.22M
    Regexp* newsub = child_args[i];
211
1.22M
    if (newsub != sub)
212
97.6k
      return true;
213
1.22M
  }
214
1.55M
  for (int i = 0; i < re->nsub(); i++) {
215
1.06M
    Regexp* newsub = child_args[i];
216
1.06M
    newsub->Decref();
217
1.06M
  }
218
495k
  return false;
219
593k
}
220
221
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
222
0
  return re->Incref();
223
0
}
224
225
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
226
  // Should never be called: we use Walk(), not WalkExponential().
227
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
228
  ABSL_LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
229
#endif
230
0
  return re->Incref();
231
0
}
232
233
Regexp* CoalesceWalker::PostVisit(Regexp* re,
234
                                  Regexp* parent_arg,
235
                                  Regexp* pre_arg,
236
                                  Regexp** child_args,
237
1.25M
                                  int nchild_args) {
238
1.25M
  if (re->nsub() == 0)
239
740k
    return re->Incref();
240
241
512k
  if (re->op() != kRegexpConcat) {
242
399k
    if (!ChildArgsChanged(re, child_args))
243
369k
      return re->Incref();
244
245
    // Something changed. Build a new op.
246
29.6k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
247
29.6k
    nre->AllocSub(re->nsub());
248
29.6k
    Regexp** nre_subs = nre->sub();
249
89.6k
    for (int i = 0; i < re->nsub(); i++)
250
60.0k
      nre_subs[i] = child_args[i];
251
    // Repeats and Captures have additional data that must be copied.
252
29.6k
    if (re->op() == kRegexpRepeat) {
253
1.74k
      nre->min_ = re->min();
254
1.74k
      nre->max_ = re->max();
255
27.8k
    } else if (re->op() == kRegexpCapture) {
256
10.3k
      nre->cap_ = re->cap();
257
10.3k
    }
258
29.6k
    return nre;
259
399k
  }
260
261
112k
  bool can_coalesce = false;
262
607k
  for (int i = 0; i < re->nsub(); i++) {
263
502k
    if (i+1 < re->nsub() &&
264
396k
        CanCoalesce(child_args[i], child_args[i+1])) {
265
7.18k
      can_coalesce = true;
266
7.18k
      break;
267
7.18k
    }
268
502k
  }
269
112k
  if (!can_coalesce) {
270
105k
    if (!ChildArgsChanged(re, child_args))
271
91.3k
      return re->Incref();
272
273
    // Something changed. Build a new op.
274
14.3k
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
275
14.3k
    nre->AllocSub(re->nsub());
276
14.3k
    Regexp** nre_subs = nre->sub();
277
51.2k
    for (int i = 0; i < re->nsub(); i++)
278
36.9k
      nre_subs[i] = child_args[i];
279
14.3k
    return nre;
280
105k
  }
281
282
156k
  for (int i = 0; i < re->nsub(); i++) {
283
149k
    if (i+1 < re->nsub() &&
284
141k
        CanCoalesce(child_args[i], child_args[i+1]))
285
91.0k
      DoCoalesce(&child_args[i], &child_args[i+1]);
286
149k
  }
287
  // Determine how many empty matches were left by DoCoalesce.
288
7.18k
  int n = 0;
289
156k
  for (int i = n; i < re->nsub(); i++) {
290
149k
    if (child_args[i]->op() == kRegexpEmptyMatch)
291
79.3k
      n++;
292
149k
  }
293
  // Build a new op.
294
7.18k
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
295
7.18k
  nre->AllocSub(re->nsub() - n);
296
7.18k
  Regexp** nre_subs = nre->sub();
297
156k
  for (int i = 0, j = 0; i < re->nsub(); i++) {
298
149k
    if (child_args[i]->op() == kRegexpEmptyMatch) {
299
79.3k
      child_args[i]->Decref();
300
79.3k
      continue;
301
79.3k
    }
302
69.7k
    nre_subs[j] = child_args[i];
303
69.7k
    j++;
304
69.7k
  }
305
7.18k
  return nre;
306
112k
}
307
308
538k
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
309
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
310
  // any byte.
311
538k
  if ((r1->op() == kRegexpStar ||
312
513k
       r1->op() == kRegexpPlus ||
313
497k
       r1->op() == kRegexpQuest ||
314
467k
       r1->op() == kRegexpRepeat) &&
315
184k
      (r1->sub()[0]->op() == kRegexpLiteral ||
316
56.2k
       r1->sub()[0]->op() == kRegexpCharClass ||
317
46.6k
       r1->sub()[0]->op() == kRegexpAnyChar ||
318
140k
       r1->sub()[0]->op() == kRegexpAnyByte)) {
319
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
320
    // any char or any byte.
321
140k
    if ((r2->op() == kRegexpStar ||
322
134k
         r2->op() == kRegexpPlus ||
323
128k
         r2->op() == kRegexpQuest ||
324
82.7k
         r2->op() == kRegexpRepeat) &&
325
82.0k
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
326
        // The parse flags must be consistent.
327
66.2k
        ((r1->parse_flags() & Regexp::NonGreedy) ==
328
66.2k
         (r2->parse_flags() & Regexp::NonGreedy))) {
329
65.8k
      return true;
330
65.8k
    }
331
    // ... OR an occurrence of that literal, char class, any char or any byte
332
74.3k
    if (Regexp::Equal(r1->sub()[0], r2)) {
333
14.0k
      return true;
334
14.0k
    }
335
    // ... OR a literal string that begins with that literal.
336
60.2k
    if (r1->sub()[0]->op() == kRegexpLiteral &&
337
51.5k
        r2->op() == kRegexpLiteralString &&
338
31.1k
        r2->runes()[0] == r1->sub()[0]->rune() &&
339
        // The parse flags must be consistent.
340
18.5k
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
341
18.5k
         (r2->parse_flags() & Regexp::FoldCase))) {
342
18.3k
      return true;
343
18.3k
    }
344
60.2k
  }
345
440k
  return false;
346
538k
}
347
348
91.0k
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
349
91.0k
  Regexp* r1 = *r1ptr;
350
91.0k
  Regexp* r2 = *r2ptr;
351
352
91.0k
  Regexp* nre = Regexp::Repeat(
353
91.0k
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
354
355
91.0k
  switch (r1->op()) {
356
2.88k
    case kRegexpStar:
357
2.88k
      nre->min_ = 0;
358
2.88k
      nre->max_ = -1;
359
2.88k
      break;
360
361
3.59k
    case kRegexpPlus:
362
3.59k
      nre->min_ = 1;
363
3.59k
      nre->max_ = -1;
364
3.59k
      break;
365
366
12.7k
    case kRegexpQuest:
367
12.7k
      nre->min_ = 0;
368
12.7k
      nre->max_ = 1;
369
12.7k
      break;
370
371
71.8k
    case kRegexpRepeat:
372
71.8k
      nre->min_ = r1->min();
373
71.8k
      nre->max_ = r1->max();
374
71.8k
      break;
375
376
0
    default:
377
0
      nre->Decref();
378
0
      ABSL_LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
379
0
      return;
380
91.0k
  }
381
382
91.0k
  switch (r2->op()) {
383
1.52k
    case kRegexpStar:
384
1.52k
      nre->max_ = -1;
385
1.52k
      goto LeaveEmpty;
386
387
2.75k
    case kRegexpPlus:
388
2.75k
      nre->min_++;
389
2.75k
      nre->max_ = -1;
390
2.75k
      goto LeaveEmpty;
391
392
39.7k
    case kRegexpQuest:
393
39.7k
      if (nre->max() != -1)
394
39.3k
        nre->max_++;
395
39.7k
      goto LeaveEmpty;
396
397
19.9k
    case kRegexpRepeat:
398
19.9k
      nre->min_ += r2->min();
399
19.9k
      if (r2->max() == -1)
400
310
        nre->max_ = -1;
401
19.6k
      else if (nre->max() != -1)
402
18.9k
        nre->max_ += r2->max();
403
19.9k
      goto LeaveEmpty;
404
405
9.10k
    case kRegexpLiteral:
406
10.3k
    case kRegexpCharClass:
407
10.6k
    case kRegexpAnyChar:
408
11.1k
    case kRegexpAnyByte:
409
11.1k
      nre->min_++;
410
11.1k
      if (nre->max() != -1)
411
6.56k
        nre->max_++;
412
11.1k
      goto LeaveEmpty;
413
414
79.2k
    LeaveEmpty:
415
79.2k
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
416
79.2k
      *r2ptr = nre;
417
79.2k
      break;
418
419
16.0k
    case kRegexpLiteralString: {
420
16.0k
      Rune r = r1->sub()[0]->rune();
421
      // Determine how much of the literal string is removed.
422
      // We know that we have at least one rune. :)
423
16.0k
      int n = 1;
424
35.8k
      while (n < r2->nrunes() && r2->runes()[n] == r)
425
19.8k
        n++;
426
16.0k
      nre->min_ += n;
427
16.0k
      if (nre->max() != -1)
428
12.6k
        nre->max_ += n;
429
16.0k
      if (n == r2->nrunes())
430
4.21k
        goto LeaveEmpty;
431
11.8k
      *r1ptr = nre;
432
11.8k
      *r2ptr = Regexp::LiteralString(
433
11.8k
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
434
11.8k
      break;
435
16.0k
    }
436
437
0
    default:
438
0
      nre->Decref();
439
0
      ABSL_LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
440
0
      return;
441
91.0k
  }
442
443
91.0k
  r1->Decref();
444
91.0k
  r2->Decref();
445
91.0k
}
446
447
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
448
0
  return re->Incref();
449
0
}
450
451
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
452
  // Should never be called: we use Walk(), not WalkExponential().
453
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
454
  ABSL_LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
455
#endif
456
0
  return re->Incref();
457
0
}
458
459
520k
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
460
520k
  if (re->simple()) {
461
284k
    *stop = true;
462
284k
    return re->Incref();
463
284k
  }
464
236k
  return NULL;
465
520k
}
466
467
Regexp* SimplifyWalker::PostVisit(Regexp* re,
468
                                  Regexp* parent_arg,
469
                                  Regexp* pre_arg,
470
                                  Regexp** child_args,
471
236k
                                  int nchild_args) {
472
236k
  switch (re->op()) {
473
0
    case kRegexpNoMatch:
474
4.02k
    case kRegexpEmptyMatch:
475
12.9k
    case kRegexpLiteral:
476
22.1k
    case kRegexpLiteralString:
477
22.1k
    case kRegexpBeginLine:
478
22.1k
    case kRegexpEndLine:
479
22.1k
    case kRegexpBeginText:
480
22.1k
    case kRegexpWordBoundary:
481
22.1k
    case kRegexpNoWordBoundary:
482
22.1k
    case kRegexpEndText:
483
22.1k
    case kRegexpAnyChar:
484
22.1k
    case kRegexpAnyByte:
485
31.8k
    case kRegexpHaveMatch:
486
      // All these are always simple.
487
31.8k
      re->simple_ = true;
488
31.8k
      return re->Incref();
489
490
60.2k
    case kRegexpConcat:
491
88.4k
    case kRegexpAlternate: {
492
      // These are simple as long as the subpieces are simple.
493
88.4k
      if (!ChildArgsChanged(re, child_args)) {
494
34.8k
        re->simple_ = true;
495
34.8k
        return re->Incref();
496
34.8k
      }
497
53.6k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
498
53.6k
      nre->AllocSub(re->nsub());
499
53.6k
      Regexp** nre_subs = nre->sub();
500
306k
      for (int i = 0; i < re->nsub(); i++)
501
253k
        nre_subs[i] = child_args[i];
502
53.6k
      nre->simple_ = true;
503
53.6k
      return nre;
504
88.4k
    }
505
506
15.5k
    case kRegexpCapture: {
507
15.5k
      Regexp* newsub = child_args[0];
508
15.5k
      if (newsub == re->sub()[0]) {
509
2.18k
        newsub->Decref();
510
2.18k
        re->simple_ = true;
511
2.18k
        return re->Incref();
512
2.18k
      }
513
13.3k
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
514
13.3k
      nre->AllocSub(1);
515
13.3k
      nre->sub()[0] = newsub;
516
13.3k
      nre->cap_ = re->cap();
517
13.3k
      nre->simple_ = true;
518
13.3k
      return nre;
519
15.5k
    }
520
521
9.49k
    case kRegexpStar:
522
13.5k
    case kRegexpPlus:
523
16.1k
    case kRegexpQuest: {
524
16.1k
      Regexp* newsub = child_args[0];
525
      // Special case: repeat the empty string as much as
526
      // you want, but it's still the empty string.
527
16.1k
      if (newsub->op() == kRegexpEmptyMatch)
528
2.68k
        return newsub;
529
530
      // These are simple as long as the subpiece is simple.
531
13.4k
      if (newsub == re->sub()[0]) {
532
2.34k
        newsub->Decref();
533
2.34k
        re->simple_ = true;
534
2.34k
        return re->Incref();
535
2.34k
      }
536
537
      // These are also idempotent if flags are constant.
538
11.0k
      if (re->op() == newsub->op() &&
539
1.67k
          re->parse_flags() == newsub->parse_flags())
540
816
        return newsub;
541
542
10.2k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
543
10.2k
      nre->AllocSub(1);
544
10.2k
      nre->sub()[0] = newsub;
545
10.2k
      nre->simple_ = true;
546
10.2k
      return nre;
547
11.0k
    }
548
549
79.9k
    case kRegexpRepeat: {
550
79.9k
      Regexp* newsub = child_args[0];
551
      // Special case: repeat the empty string as much as
552
      // you want, but it's still the empty string.
553
79.9k
      if (newsub->op() == kRegexpEmptyMatch)
554
4.52k
        return newsub;
555
556
75.4k
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
557
75.4k
                                   re->parse_flags());
558
75.4k
      newsub->Decref();
559
75.4k
      nre->simple_ = true;
560
75.4k
      return nre;
561
79.9k
    }
562
563
4.73k
    case kRegexpCharClass: {
564
4.73k
      Regexp* nre = SimplifyCharClass(re);
565
4.73k
      nre->simple_ = true;
566
4.73k
      return nre;
567
79.9k
    }
568
236k
  }
569
570
236k
  ABSL_LOG(ERROR) << "Simplify case not handled: " << re->op();
571
0
  return re->Incref();
572
0
}
573
574
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
575
// Returns a new Regexp, handing the ref to the caller.
576
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
577
105k
                                Regexp::ParseFlags parse_flags) {
578
105k
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
579
105k
  re->AllocSub(2);
580
105k
  Regexp** subs = re->sub();
581
105k
  subs[0] = re1;
582
105k
  subs[1] = re2;
583
105k
  return re;
584
105k
}
585
586
// Returns true if re is an empty-width op.
587
89.3k
static bool IsEmptyOp(Regexp* re) {
588
89.3k
  return (re->op() == kRegexpBeginLine ||
589
88.5k
          re->op() == kRegexpEndLine ||
590
87.7k
          re->op() == kRegexpWordBoundary ||
591
87.3k
          re->op() == kRegexpNoWordBoundary ||
592
87.0k
          re->op() == kRegexpBeginText ||
593
85.4k
          re->op() == kRegexpEndText);
594
89.3k
}
595
596
// Simplifies the expression re{min,max} in terms of *, +, and ?.
597
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
598
// Caller must Decref return value when done with it.
599
// The result will *not* necessarily have the right capturing parens
600
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
601
// but in the Regexp* representation, both (x) are marked as $1.
602
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
603
75.4k
                                       Regexp::ParseFlags f) {
604
  // For an empty-width op OR a concatenation or alternation of empty-width
605
  // ops, cap the repetition count at 1.
606
75.4k
  if (IsEmptyOp(re) ||
607
73.2k
      ((re->op() == kRegexpConcat ||
608
67.2k
        re->op() == kRegexpAlternate) &&
609
11.4k
       std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
610
2.63k
    min = std::min(min, 1);
611
2.63k
    max = std::min(max, 1);
612
2.63k
  }
613
614
  // x{n,} means at least n matches of x.
615
75.4k
  if (max == -1) {
616
    // Special case: x{0,} is x*
617
10.7k
    if (min == 0)
618
3.42k
      return Regexp::Star(re->Incref(), f);
619
620
    // Special case: x{1,} is x+
621
7.36k
    if (min == 1)
622
2.70k
      return Regexp::Plus(re->Incref(), f);
623
624
    // General case: x{4,} is xxxx+
625
4.66k
    PODArray<Regexp*> nre_subs(min);
626
36.8k
    for (int i = 0; i < min-1; i++)
627
32.2k
      nre_subs[i] = re->Incref();
628
4.66k
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
629
4.66k
    return Regexp::Concat(nre_subs.data(), min, f);
630
7.36k
  }
631
632
  // Special case: (x){0} matches only empty string.
633
64.6k
  if (min == 0 && max == 0)
634
946
    return new Regexp(kRegexpEmptyMatch, f);
635
636
  // Special case: x{1} is just x.
637
63.6k
  if (min == 1 && max == 1)
638
4.45k
    return re->Incref();
639
640
  // General case: x{n,m} means n copies of x and m copies of x?.
641
  // The machine will do less work if we nest the final m copies,
642
  // so that x{2,5} = xx(x(x(x)?)?)?
643
644
  // Build leading prefix: xx.  Capturing only on the last one.
645
59.2k
  Regexp* nre = NULL;
646
59.2k
  if (min > 0) {
647
55.3k
    PODArray<Regexp*> nre_subs(min);
648
531k
    for (int i = 0; i < min; i++)
649
475k
      nre_subs[i] = re->Incref();
650
55.3k
    nre = Regexp::Concat(nre_subs.data(), min, f);
651
55.3k
  }
652
653
  // Build and attach suffix: (x(x(x)?)?)?
654
59.2k
  if (max > min) {
655
15.6k
    Regexp* suf = Regexp::Quest(re->Incref(), f);
656
109k
    for (int i = min+1; i < max; i++)
657
93.7k
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
658
15.6k
    if (nre == NULL)
659
3.92k
      nre = suf;
660
11.6k
    else
661
11.6k
      nre = Concat2(nre, suf, f);
662
15.6k
  }
663
664
59.2k
  if (nre == NULL) {
665
    // Some degenerate case, like min > max, or min < max < 0.
666
    // This shouldn't happen, because the parser rejects such regexps.
667
0
    ABSL_LOG(DFATAL) << "Malformed repeat of " << re->ToString()
668
0
                     << " min " << min << " max " << max;
669
0
    return new Regexp(kRegexpNoMatch, f);
670
0
  }
671
672
59.2k
  return nre;
673
59.2k
}
674
675
// Simplifies a character class.
676
// Caller must Decref return value when done with it.
677
4.73k
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
678
4.73k
  CharClass* cc = re->cc();
679
680
  // Special cases
681
4.73k
  if (cc->empty())
682
2.61k
    return new Regexp(kRegexpNoMatch, re->parse_flags());
683
2.11k
  if (cc->full())
684
826
    return new Regexp(kRegexpAnyChar, re->parse_flags());
685
686
1.29k
  return re->Incref();
687
2.11k
}
688
689
}  // namespace re2