Coverage Report

Created: 2026-05-27 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/external/re2~/re2/simplify.cc
Line
Count
Source
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <stddef.h>
10
11
#include <algorithm>
12
#include <string>
13
14
#include "absl/log/absl_log.h"
15
#include "absl/strings/string_view.h"
16
#include "re2/pod_array.h"
17
#include "re2/regexp.h"
18
#include "re2/walker-inl.h"
19
#include "util/utf.h"
20
21
namespace re2 {
22
23
// Parses the regexp src and then simplifies it and sets *dst to the
24
// string representation of the simplified form.  Returns true on success.
25
// Returns false and sets *error (if error != NULL) on error.
26
bool Regexp::SimplifyRegexp(absl::string_view src, ParseFlags flags,
27
0
                            std::string* dst, RegexpStatus* status) {
28
0
  Regexp* re = Parse(src, flags, status);
29
0
  if (re == NULL)
30
0
    return false;
31
0
  Regexp* sre = re->Simplify();
32
0
  re->Decref();
33
0
  if (sre == NULL) {
34
0
    if (status) {
35
0
      status->set_code(kRegexpInternalError);
36
0
      status->set_error_arg(src);
37
0
    }
38
0
    return false;
39
0
  }
40
0
  *dst = sre->ToString();
41
0
  sre->Decref();
42
0
  return true;
43
0
}
44
45
// Assuming the simple_ flags on the children are accurate,
46
// is this Regexp* simple?
47
0
bool Regexp::ComputeSimple() {
48
0
  Regexp** subs;
49
0
  switch (op_) {
50
0
    case kRegexpNoMatch:
51
0
    case kRegexpEmptyMatch:
52
0
    case kRegexpLiteral:
53
0
    case kRegexpLiteralString:
54
0
    case kRegexpBeginLine:
55
0
    case kRegexpEndLine:
56
0
    case kRegexpBeginText:
57
0
    case kRegexpWordBoundary:
58
0
    case kRegexpNoWordBoundary:
59
0
    case kRegexpEndText:
60
0
    case kRegexpAnyChar:
61
0
    case kRegexpAnyByte:
62
0
    case kRegexpHaveMatch:
63
0
      return true;
64
0
    case kRegexpConcat:
65
0
    case kRegexpAlternate:
66
      // These are simple as long as the subpieces are simple.
67
0
      subs = sub();
68
0
      for (int i = 0; i < nsub_; i++)
69
0
        if (!subs[i]->simple())
70
0
          return false;
71
0
      return true;
72
0
    case kRegexpCharClass:
73
      // Simple as long as the char class is not empty, not full.
74
0
      if (ccb_ != NULL)
75
0
        return !ccb_->empty() && !ccb_->full();
76
0
      return !cc_->empty() && !cc_->full();
77
0
    case kRegexpCapture:
78
0
      subs = sub();
79
0
      return subs[0]->simple();
80
0
    case kRegexpStar:
81
0
    case kRegexpPlus:
82
0
    case kRegexpQuest:
83
0
      subs = sub();
84
0
      if (!subs[0]->simple())
85
0
        return false;
86
0
      switch (subs[0]->op_) {
87
0
        case kRegexpStar:
88
0
        case kRegexpPlus:
89
0
        case kRegexpQuest:
90
0
        case kRegexpEmptyMatch:
91
0
        case kRegexpNoMatch:
92
0
          return false;
93
0
        default:
94
0
          break;
95
0
      }
96
0
      return true;
97
0
    case kRegexpRepeat:
98
0
      return false;
99
0
  }
100
0
  ABSL_LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
101
0
  return false;
102
0
}
103
104
// Walker subclass used by Simplify.
105
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
106
// occurrences of that literal into repeats of that literal. It also works for
107
// char classes, any char and any byte.
108
// PostVisit creates the coalesced result, which should then be simplified.
109
class CoalesceWalker : public Regexp::Walker<Regexp*> {
110
 public:
111
0
  CoalesceWalker() {}
112
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
113
                            Regexp** child_args, int nchild_args);
114
  virtual Regexp* Copy(Regexp* re);
115
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
116
117
 private:
118
  // These functions are declared inside CoalesceWalker so that
119
  // they can edit the private fields of the Regexps they construct.
120
121
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
122
  // the parse flags are consistent. (They will not be checked again later.)
123
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
124
125
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
126
  // will be empty match and the coalesced op. In other cases, where part of a
127
  // literal string was removed to be coalesced, the array elements afterwards
128
  // will be the coalesced op and the remainder of the literal string.
129
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
130
131
  CoalesceWalker(const CoalesceWalker&) = delete;
132
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
133
};
134
135
// Walker subclass used by Simplify.
136
// The simplify walk is purely post-recursive: given the simplified children,
137
// PostVisit creates the simplified result.
138
// The child_args are simplified Regexp*s.
139
class SimplifyWalker : public Regexp::Walker<Regexp*> {
140
 public:
141
0
  SimplifyWalker() {}
142
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
143
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
144
                            Regexp** child_args, int nchild_args);
145
  virtual Regexp* Copy(Regexp* re);
146
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
147
148
 private:
149
  // These functions are declared inside SimplifyWalker so that
150
  // they can edit the private fields of the Regexps they construct.
151
152
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
153
  // Caller must Decref return value when done with it.
154
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
155
156
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
157
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
158
  // Caller must Decref return value when done with it.
159
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
160
                                Regexp::ParseFlags parse_flags);
161
162
  // Simplifies a character class by expanding any named classes
163
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
164
  // Caller must Decref return value when done with it.
165
  static Regexp* SimplifyCharClass(Regexp* re);
166
167
  SimplifyWalker(const SimplifyWalker&) = delete;
168
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
169
};
170
171
// Simplifies a regular expression, returning a new regexp.
172
// The new regexp uses traditional Unix egrep features only,
173
// plus the Perl (?:) non-capturing parentheses.
174
// Otherwise, no POSIX or Perl additions.  The new regexp
175
// captures exactly the same subexpressions (with the same indices)
176
// as the original.
177
// Does not edit current object.
178
// Caller must Decref() return value when done with it.
179
180
0
Regexp* Regexp::Simplify() {
181
0
  CoalesceWalker cw;
182
0
  Regexp* cre = cw.Walk(this, NULL);
183
0
  if (cre == NULL)
184
0
    return NULL;
185
0
  if (cw.stopped_early()) {
186
0
    cre->Decref();
187
0
    return NULL;
188
0
  }
189
0
  SimplifyWalker sw;
190
0
  Regexp* sre = sw.Walk(cre, NULL);
191
0
  cre->Decref();
192
0
  if (sre == NULL)
193
0
    return NULL;
194
0
  if (sw.stopped_early()) {
195
0
    sre->Decref();
196
0
    return NULL;
197
0
  }
198
0
  return sre;
199
0
}
200
201
#define Simplify DontCallSimplify  // Avoid accidental recursion
202
203
// Utility function for PostVisit implementations that compares re->sub() with
204
// child_args to determine whether any child_args changed. In the common case,
205
// where nothing changed, calls Decref() for all child_args and returns false,
206
// so PostVisit must return re->Incref(). Otherwise, returns true.
207
0
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
208
0
  for (int i = 0; i < re->nsub(); i++) {
209
0
    Regexp* sub = re->sub()[i];
210
0
    Regexp* newsub = child_args[i];
211
0
    if (newsub != sub)
212
0
      return true;
213
0
  }
214
0
  for (int i = 0; i < re->nsub(); i++) {
215
0
    Regexp* newsub = child_args[i];
216
0
    newsub->Decref();
217
0
  }
218
0
  return false;
219
0
}
220
221
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
222
0
  return re->Incref();
223
0
}
224
225
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
226
  // Should never be called: we use Walk(), not WalkExponential().
227
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
228
  ABSL_LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
229
#endif
230
0
  return re->Incref();
231
0
}
232
233
Regexp* CoalesceWalker::PostVisit(Regexp* re,
234
                                  Regexp* parent_arg,
235
                                  Regexp* pre_arg,
236
                                  Regexp** child_args,
237
0
                                  int nchild_args) {
238
0
  if (re->nsub() == 0)
239
0
    return re->Incref();
240
241
0
  if (re->op() != kRegexpConcat) {
242
0
    if (!ChildArgsChanged(re, child_args))
243
0
      return re->Incref();
244
245
    // Something changed. Build a new op.
246
0
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
247
0
    nre->AllocSub(re->nsub());
248
0
    Regexp** nre_subs = nre->sub();
249
0
    for (int i = 0; i < re->nsub(); i++)
250
0
      nre_subs[i] = child_args[i];
251
    // Repeats and Captures have additional data that must be copied.
252
0
    if (re->op() == kRegexpRepeat) {
253
0
      nre->min_ = re->min();
254
0
      nre->max_ = re->max();
255
0
    } else if (re->op() == kRegexpCapture) {
256
0
      nre->cap_ = re->cap();
257
0
    }
258
0
    return nre;
259
0
  }
260
261
0
  bool can_coalesce = false;
262
0
  for (int i = 0; i < re->nsub(); i++) {
263
0
    if (i+1 < re->nsub() &&
264
0
        CanCoalesce(child_args[i], child_args[i+1])) {
265
0
      can_coalesce = true;
266
0
      break;
267
0
    }
268
0
  }
269
0
  if (!can_coalesce) {
270
0
    if (!ChildArgsChanged(re, child_args))
271
0
      return re->Incref();
272
273
    // Something changed. Build a new op.
274
0
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
275
0
    nre->AllocSub(re->nsub());
276
0
    Regexp** nre_subs = nre->sub();
277
0
    for (int i = 0; i < re->nsub(); i++)
278
0
      nre_subs[i] = child_args[i];
279
0
    return nre;
280
0
  }
281
282
0
  for (int i = 0; i < re->nsub(); i++) {
283
0
    if (i+1 < re->nsub() &&
284
0
        CanCoalesce(child_args[i], child_args[i+1]))
285
0
      DoCoalesce(&child_args[i], &child_args[i+1]);
286
0
  }
287
  // Determine how many empty matches were left by DoCoalesce.
288
0
  int n = 0;
289
0
  for (int i = n; i < re->nsub(); i++) {
290
0
    if (child_args[i]->op() == kRegexpEmptyMatch)
291
0
      n++;
292
0
  }
293
  // Build a new op.
294
0
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
295
0
  nre->AllocSub(re->nsub() - n);
296
0
  Regexp** nre_subs = nre->sub();
297
0
  for (int i = 0, j = 0; i < re->nsub(); i++) {
298
0
    if (child_args[i]->op() == kRegexpEmptyMatch) {
299
0
      child_args[i]->Decref();
300
0
      continue;
301
0
    }
302
0
    nre_subs[j] = child_args[i];
303
0
    j++;
304
0
  }
305
0
  return nre;
306
0
}
307
308
0
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
309
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
310
  // any byte.
311
0
  if ((r1->op() == kRegexpStar ||
312
0
       r1->op() == kRegexpPlus ||
313
0
       r1->op() == kRegexpQuest ||
314
0
       r1->op() == kRegexpRepeat) &&
315
0
      (r1->sub()[0]->op() == kRegexpLiteral ||
316
0
       r1->sub()[0]->op() == kRegexpCharClass ||
317
0
       r1->sub()[0]->op() == kRegexpAnyChar ||
318
0
       r1->sub()[0]->op() == kRegexpAnyByte)) {
319
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
320
    // any char or any byte.
321
0
    if ((r2->op() == kRegexpStar ||
322
0
         r2->op() == kRegexpPlus ||
323
0
         r2->op() == kRegexpQuest ||
324
0
         r2->op() == kRegexpRepeat) &&
325
0
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
326
        // The parse flags must be consistent.
327
0
        ((r1->parse_flags() & Regexp::NonGreedy) ==
328
0
         (r2->parse_flags() & Regexp::NonGreedy))) {
329
0
      return true;
330
0
    }
331
    // ... OR an occurrence of that literal, char class, any char or any byte
332
0
    if (Regexp::Equal(r1->sub()[0], r2)) {
333
0
      return true;
334
0
    }
335
    // ... OR a literal string that begins with that literal.
336
0
    if (r1->sub()[0]->op() == kRegexpLiteral &&
337
0
        r2->op() == kRegexpLiteralString &&
338
0
        r2->runes()[0] == r1->sub()[0]->rune() &&
339
        // The parse flags must be consistent.
340
0
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
341
0
         (r2->parse_flags() & Regexp::FoldCase))) {
342
0
      return true;
343
0
    }
344
0
  }
345
0
  return false;
346
0
}
347
348
0
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
349
0
  Regexp* r1 = *r1ptr;
350
0
  Regexp* r2 = *r2ptr;
351
352
0
  Regexp* nre = Regexp::Repeat(
353
0
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
354
355
0
  switch (r1->op()) {
356
0
    case kRegexpStar:
357
0
      nre->min_ = 0;
358
0
      nre->max_ = -1;
359
0
      break;
360
361
0
    case kRegexpPlus:
362
0
      nre->min_ = 1;
363
0
      nre->max_ = -1;
364
0
      break;
365
366
0
    case kRegexpQuest:
367
0
      nre->min_ = 0;
368
0
      nre->max_ = 1;
369
0
      break;
370
371
0
    case kRegexpRepeat:
372
0
      nre->min_ = r1->min();
373
0
      nre->max_ = r1->max();
374
0
      break;
375
376
0
    default:
377
0
      nre->Decref();
378
0
      ABSL_LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
379
0
      return;
380
0
  }
381
382
0
  switch (r2->op()) {
383
0
    case kRegexpStar:
384
0
      nre->max_ = -1;
385
0
      goto LeaveEmpty;
386
387
0
    case kRegexpPlus:
388
0
      nre->min_++;
389
0
      nre->max_ = -1;
390
0
      goto LeaveEmpty;
391
392
0
    case kRegexpQuest:
393
0
      if (nre->max() != -1)
394
0
        nre->max_++;
395
0
      goto LeaveEmpty;
396
397
0
    case kRegexpRepeat:
398
0
      nre->min_ += r2->min();
399
0
      if (r2->max() == -1)
400
0
        nre->max_ = -1;
401
0
      else if (nre->max() != -1)
402
0
        nre->max_ += r2->max();
403
0
      goto LeaveEmpty;
404
405
0
    case kRegexpLiteral:
406
0
    case kRegexpCharClass:
407
0
    case kRegexpAnyChar:
408
0
    case kRegexpAnyByte:
409
0
      nre->min_++;
410
0
      if (nre->max() != -1)
411
0
        nre->max_++;
412
0
      goto LeaveEmpty;
413
414
0
    LeaveEmpty:
415
0
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
416
0
      *r2ptr = nre;
417
0
      break;
418
419
0
    case kRegexpLiteralString: {
420
0
      Rune r = r1->sub()[0]->rune();
421
      // Determine how much of the literal string is removed.
422
      // We know that we have at least one rune. :)
423
0
      int n = 1;
424
0
      while (n < r2->nrunes() && r2->runes()[n] == r)
425
0
        n++;
426
0
      nre->min_ += n;
427
0
      if (nre->max() != -1)
428
0
        nre->max_ += n;
429
0
      if (n == r2->nrunes())
430
0
        goto LeaveEmpty;
431
0
      *r1ptr = nre;
432
0
      *r2ptr = Regexp::LiteralString(
433
0
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
434
0
      break;
435
0
    }
436
437
0
    default:
438
0
      nre->Decref();
439
0
      ABSL_LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
440
0
      return;
441
0
  }
442
443
0
  r1->Decref();
444
0
  r2->Decref();
445
0
}
446
447
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
448
0
  return re->Incref();
449
0
}
450
451
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
452
  // Should never be called: we use Walk(), not WalkExponential().
453
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
454
  ABSL_LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
455
#endif
456
0
  return re->Incref();
457
0
}
458
459
0
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
460
0
  if (re->simple()) {
461
0
    *stop = true;
462
0
    return re->Incref();
463
0
  }
464
0
  return NULL;
465
0
}
466
467
Regexp* SimplifyWalker::PostVisit(Regexp* re,
468
                                  Regexp* parent_arg,
469
                                  Regexp* pre_arg,
470
                                  Regexp** child_args,
471
0
                                  int nchild_args) {
472
0
  switch (re->op()) {
473
0
    case kRegexpNoMatch:
474
0
    case kRegexpEmptyMatch:
475
0
    case kRegexpLiteral:
476
0
    case kRegexpLiteralString:
477
0
    case kRegexpBeginLine:
478
0
    case kRegexpEndLine:
479
0
    case kRegexpBeginText:
480
0
    case kRegexpWordBoundary:
481
0
    case kRegexpNoWordBoundary:
482
0
    case kRegexpEndText:
483
0
    case kRegexpAnyChar:
484
0
    case kRegexpAnyByte:
485
0
    case kRegexpHaveMatch:
486
      // All these are always simple.
487
0
      re->simple_ = true;
488
0
      return re->Incref();
489
490
0
    case kRegexpConcat:
491
0
    case kRegexpAlternate: {
492
      // These are simple as long as the subpieces are simple.
493
0
      if (!ChildArgsChanged(re, child_args)) {
494
0
        re->simple_ = true;
495
0
        return re->Incref();
496
0
      }
497
0
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
498
0
      nre->AllocSub(re->nsub());
499
0
      Regexp** nre_subs = nre->sub();
500
0
      for (int i = 0; i < re->nsub(); i++)
501
0
        nre_subs[i] = child_args[i];
502
0
      nre->simple_ = true;
503
0
      return nre;
504
0
    }
505
506
0
    case kRegexpCapture: {
507
0
      Regexp* newsub = child_args[0];
508
0
      if (newsub == re->sub()[0]) {
509
0
        newsub->Decref();
510
0
        re->simple_ = true;
511
0
        return re->Incref();
512
0
      }
513
0
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
514
0
      nre->AllocSub(1);
515
0
      nre->sub()[0] = newsub;
516
0
      nre->cap_ = re->cap();
517
0
      nre->simple_ = true;
518
0
      return nre;
519
0
    }
520
521
0
    case kRegexpStar:
522
0
    case kRegexpPlus:
523
0
    case kRegexpQuest: {
524
0
      Regexp* newsub = child_args[0];
525
      // Special case: repeat the empty string as much as
526
      // you want, but it's still the empty string.
527
0
      if (newsub->op() == kRegexpEmptyMatch)
528
0
        return newsub;
529
530
      // These are simple as long as the subpiece is simple.
531
0
      if (newsub == re->sub()[0]) {
532
0
        newsub->Decref();
533
0
        re->simple_ = true;
534
0
        return re->Incref();
535
0
      }
536
537
      // These are also idempotent if flags are constant.
538
0
      if (re->op() == newsub->op() &&
539
0
          re->parse_flags() == newsub->parse_flags())
540
0
        return newsub;
541
542
0
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
543
0
      nre->AllocSub(1);
544
0
      nre->sub()[0] = newsub;
545
0
      nre->simple_ = true;
546
0
      return nre;
547
0
    }
548
549
0
    case kRegexpRepeat: {
550
0
      Regexp* newsub = child_args[0];
551
      // Special case: repeat the empty string as much as
552
      // you want, but it's still the empty string.
553
0
      if (newsub->op() == kRegexpEmptyMatch)
554
0
        return newsub;
555
556
0
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
557
0
                                   re->parse_flags());
558
0
      newsub->Decref();
559
0
      nre->simple_ = true;
560
0
      return nre;
561
0
    }
562
563
0
    case kRegexpCharClass: {
564
0
      Regexp* nre = SimplifyCharClass(re);
565
0
      nre->simple_ = true;
566
0
      return nre;
567
0
    }
568
0
  }
569
570
0
  ABSL_LOG(ERROR) << "Simplify case not handled: " << re->op();
571
0
  return re->Incref();
572
0
}
573
574
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
575
// Returns a new Regexp, handing the ref to the caller.
576
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
577
0
                                Regexp::ParseFlags parse_flags) {
578
0
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
579
0
  re->AllocSub(2);
580
0
  Regexp** subs = re->sub();
581
0
  subs[0] = re1;
582
0
  subs[1] = re2;
583
0
  return re;
584
0
}
585
586
// Returns true if re is an empty-width op.
587
0
static bool IsEmptyOp(Regexp* re) {
588
0
  return (re->op() == kRegexpBeginLine ||
589
0
          re->op() == kRegexpEndLine ||
590
0
          re->op() == kRegexpWordBoundary ||
591
0
          re->op() == kRegexpNoWordBoundary ||
592
0
          re->op() == kRegexpBeginText ||
593
0
          re->op() == kRegexpEndText);
594
0
}
595
596
// Simplifies the expression re{min,max} in terms of *, +, and ?.
597
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
598
// Caller must Decref return value when done with it.
599
// The result will *not* necessarily have the right capturing parens
600
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
601
// but in the Regexp* representation, both (x) are marked as $1.
602
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
603
0
                                       Regexp::ParseFlags f) {
604
  // For an empty-width op OR a concatenation or alternation of empty-width
605
  // ops, cap the repetition count at 1.
606
0
  if (IsEmptyOp(re) ||
607
0
      ((re->op() == kRegexpConcat ||
608
0
        re->op() == kRegexpAlternate) &&
609
0
       std::all_of(re->sub(), re->sub() + re->nsub(), IsEmptyOp))) {
610
0
    min = std::min(min, 1);
611
0
    max = std::min(max, 1);
612
0
  }
613
614
  // x{n,} means at least n matches of x.
615
0
  if (max == -1) {
616
    // Special case: x{0,} is x*
617
0
    if (min == 0)
618
0
      return Regexp::Star(re->Incref(), f);
619
620
    // Special case: x{1,} is x+
621
0
    if (min == 1)
622
0
      return Regexp::Plus(re->Incref(), f);
623
624
    // General case: x{4,} is xxxx+
625
0
    PODArray<Regexp*> nre_subs(min);
626
0
    for (int i = 0; i < min-1; i++)
627
0
      nre_subs[i] = re->Incref();
628
0
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
629
0
    return Regexp::Concat(nre_subs.data(), min, f);
630
0
  }
631
632
  // Special case: (x){0} matches only empty string.
633
0
  if (min == 0 && max == 0)
634
0
    return new Regexp(kRegexpEmptyMatch, f);
635
636
  // Special case: x{1} is just x.
637
0
  if (min == 1 && max == 1)
638
0
    return re->Incref();
639
640
  // General case: x{n,m} means n copies of x and m copies of x?.
641
  // The machine will do less work if we nest the final m copies,
642
  // so that x{2,5} = xx(x(x(x)?)?)?
643
644
  // Build leading prefix: xx.  Capturing only on the last one.
645
0
  Regexp* nre = NULL;
646
0
  if (min > 0) {
647
0
    PODArray<Regexp*> nre_subs(min);
648
0
    for (int i = 0; i < min; i++)
649
0
      nre_subs[i] = re->Incref();
650
0
    nre = Regexp::Concat(nre_subs.data(), min, f);
651
0
  }
652
653
  // Build and attach suffix: (x(x(x)?)?)?
654
0
  if (max > min) {
655
0
    Regexp* suf = Regexp::Quest(re->Incref(), f);
656
0
    for (int i = min+1; i < max; i++)
657
0
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
658
0
    if (nre == NULL)
659
0
      nre = suf;
660
0
    else
661
0
      nre = Concat2(nre, suf, f);
662
0
  }
663
664
0
  if (nre == NULL) {
665
    // Some degenerate case, like min > max, or min < max < 0.
666
    // This shouldn't happen, because the parser rejects such regexps.
667
0
    ABSL_LOG(DFATAL) << "Malformed repeat of " << re->ToString()
668
0
                     << " min " << min << " max " << max;
669
0
    return new Regexp(kRegexpNoMatch, f);
670
0
  }
671
672
0
  return nre;
673
0
}
674
675
// Simplifies a character class.
676
// Caller must Decref return value when done with it.
677
0
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
678
0
  CharClass* cc = re->cc();
679
680
  // Special cases
681
0
  if (cc->empty())
682
0
    return new Regexp(kRegexpNoMatch, re->parse_flags());
683
0
  if (cc->full())
684
0
    return new Regexp(kRegexpAnyChar, re->parse_flags());
685
686
0
  return re->Incref();
687
0
}
688
689
}  // namespace re2