Coverage Report

Created: 2024-08-17 11:04

/src/re2/re2/simplify.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Rewrite POSIX and other features in re
6
// to use simple extended regular expression features.
7
// Also sort and simplify character classes.
8
9
#include <string>
10
11
#include "util/util.h"
12
#include "util/logging.h"
13
#include "util/utf.h"
14
#include "re2/pod_array.h"
15
#include "re2/regexp.h"
16
#include "re2/walker-inl.h"
17
18
namespace re2 {
19
20
// Parses the regexp src and then simplifies it and sets *dst to the
21
// string representation of the simplified form.  Returns true on success.
22
// Returns false and sets *error (if error != NULL) on error.
23
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24
0
                            std::string* dst, RegexpStatus* status) {
25
0
  Regexp* re = Parse(src, flags, status);
26
0
  if (re == NULL)
27
0
    return false;
28
0
  Regexp* sre = re->Simplify();
29
0
  re->Decref();
30
0
  if (sre == NULL) {
31
0
    if (status) {
32
0
      status->set_code(kRegexpInternalError);
33
0
      status->set_error_arg(src);
34
0
    }
35
0
    return false;
36
0
  }
37
0
  *dst = sre->ToString();
38
0
  sre->Decref();
39
0
  return true;
40
0
}
41
42
// Assuming the simple_ flags on the children are accurate,
43
// is this Regexp* simple?
44
202k
bool Regexp::ComputeSimple() {
45
202k
  Regexp** subs;
46
202k
  switch (op_) {
47
950
    case kRegexpNoMatch:
48
5.69k
    case kRegexpEmptyMatch:
49
133k
    case kRegexpLiteral:
50
133k
    case kRegexpLiteralString:
51
135k
    case kRegexpBeginLine:
52
136k
    case kRegexpEndLine:
53
139k
    case kRegexpBeginText:
54
139k
    case kRegexpWordBoundary:
55
139k
    case kRegexpNoWordBoundary:
56
142k
    case kRegexpEndText:
57
148k
    case kRegexpAnyChar:
58
148k
    case kRegexpAnyByte:
59
148k
    case kRegexpHaveMatch:
60
148k
      return true;
61
12.0k
    case kRegexpConcat:
62
15.4k
    case kRegexpAlternate:
63
      // These are simple as long as the subpieces are simple.
64
15.4k
      subs = sub();
65
84.9k
      for (int i = 0; i < nsub_; i++)
66
74.0k
        if (!subs[i]->simple())
67
4.46k
          return false;
68
10.9k
      return true;
69
24.5k
    case kRegexpCharClass:
70
      // Simple as long as the char class is not empty, not full.
71
24.5k
      if (ccb_ != NULL)
72
24.3k
        return !ccb_->empty() && !ccb_->full();
73
181
      return !cc_->empty() && !cc_->full();
74
3.59k
    case kRegexpCapture:
75
3.59k
      subs = sub();
76
3.59k
      return subs[0]->simple();
77
2.25k
    case kRegexpStar:
78
5.73k
    case kRegexpPlus:
79
7.90k
    case kRegexpQuest:
80
7.90k
      subs = sub();
81
7.90k
      if (!subs[0]->simple())
82
529
        return false;
83
7.37k
      switch (subs[0]->op_) {
84
3
        case kRegexpStar:
85
9
        case kRegexpPlus:
86
30
        case kRegexpQuest:
87
50
        case kRegexpEmptyMatch:
88
190
        case kRegexpNoMatch:
89
190
          return false;
90
7.18k
        default:
91
7.18k
          break;
92
7.37k
      }
93
7.18k
      return true;
94
2.97k
    case kRegexpRepeat:
95
2.97k
      return false;
96
202k
  }
97
0
  LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
98
0
  return false;
99
202k
}
100
101
// Walker subclass used by Simplify.
102
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
103
// occurrences of that literal into repeats of that literal. It also works for
104
// char classes, any char and any byte.
105
// PostVisit creates the coalesced result, which should then be simplified.
106
class CoalesceWalker : public Regexp::Walker<Regexp*> {
107
 public:
108
11.6k
  CoalesceWalker() {}
109
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
110
                            Regexp** child_args, int nchild_args);
111
  virtual Regexp* Copy(Regexp* re);
112
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114
 private:
115
  // These functions are declared inside CoalesceWalker so that
116
  // they can edit the private fields of the Regexps they construct.
117
118
  // Returns true if r1 and r2 can be coalesced. In particular, ensures that
119
  // the parse flags are consistent. (They will not be checked again later.)
120
  static bool CanCoalesce(Regexp* r1, Regexp* r2);
121
122
  // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
123
  // will be empty match and the coalesced op. In other cases, where part of a
124
  // literal string was removed to be coalesced, the array elements afterwards
125
  // will be the coalesced op and the remainder of the literal string.
126
  static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
127
128
  CoalesceWalker(const CoalesceWalker&) = delete;
129
  CoalesceWalker& operator=(const CoalesceWalker&) = delete;
130
};
131
132
// Walker subclass used by Simplify.
133
// The simplify walk is purely post-recursive: given the simplified children,
134
// PostVisit creates the simplified result.
135
// The child_args are simplified Regexp*s.
136
class SimplifyWalker : public Regexp::Walker<Regexp*> {
137
 public:
138
11.6k
  SimplifyWalker() {}
139
  virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
140
  virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
141
                            Regexp** child_args, int nchild_args);
142
  virtual Regexp* Copy(Regexp* re);
143
  virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
144
145
 private:
146
  // These functions are declared inside SimplifyWalker so that
147
  // they can edit the private fields of the Regexps they construct.
148
149
  // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
150
  // Caller must Decref return value when done with it.
151
  static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
152
153
  // Simplifies the expression re{min,max} in terms of *, +, and ?.
154
  // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
155
  // Caller must Decref return value when done with it.
156
  static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
157
                                Regexp::ParseFlags parse_flags);
158
159
  // Simplifies a character class by expanding any named classes
160
  // into rune ranges.  Does not edit re.  Does not consume ref to re.
161
  // Caller must Decref return value when done with it.
162
  static Regexp* SimplifyCharClass(Regexp* re);
163
164
  SimplifyWalker(const SimplifyWalker&) = delete;
165
  SimplifyWalker& operator=(const SimplifyWalker&) = delete;
166
};
167
168
// Simplifies a regular expression, returning a new regexp.
169
// The new regexp uses traditional Unix egrep features only,
170
// plus the Perl (?:) non-capturing parentheses.
171
// Otherwise, no POSIX or Perl additions.  The new regexp
172
// captures exactly the same subexpressions (with the same indices)
173
// as the original.
174
// Does not edit current object.
175
// Caller must Decref() return value when done with it.
176
177
11.6k
Regexp* Regexp::Simplify() {
178
11.6k
  CoalesceWalker cw;
179
11.6k
  Regexp* cre = cw.Walk(this, NULL);
180
11.6k
  if (cre == NULL)
181
0
    return NULL;
182
11.6k
  if (cw.stopped_early()) {
183
0
    cre->Decref();
184
0
    return NULL;
185
0
  }
186
11.6k
  SimplifyWalker sw;
187
11.6k
  Regexp* sre = sw.Walk(cre, NULL);
188
11.6k
  cre->Decref();
189
11.6k
  if (sre == NULL)
190
0
    return NULL;
191
11.6k
  if (sw.stopped_early()) {
192
0
    sre->Decref();
193
0
    return NULL;
194
0
  }
195
11.6k
  return sre;
196
11.6k
}
197
198
#define Simplify DontCallSimplify  // Avoid accidental recursion
199
200
// Utility function for PostVisit implementations that compares re->sub() with
201
// child_args to determine whether any child_args changed. In the common case,
202
// where nothing changed, calls Decref() for all child_args and returns false,
203
// so PostVisit must return re->Incref(). Otherwise, returns true.
204
34.1k
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
205
135k
  for (int i = 0; i < re->nsub(); i++) {
206
106k
    Regexp* sub = re->sub()[i];
207
106k
    Regexp* newsub = child_args[i];
208
106k
    if (newsub != sub)
209
5.98k
      return true;
210
106k
  }
211
116k
  for (int i = 0; i < re->nsub(); i++) {
212
88.7k
    Regexp* newsub = child_args[i];
213
88.7k
    newsub->Decref();
214
88.7k
  }
215
28.2k
  return false;
216
34.1k
}
217
218
0
Regexp* CoalesceWalker::Copy(Regexp* re) {
219
0
  return re->Incref();
220
0
}
221
222
0
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
223
  // Should never be called: we use Walk(), not WalkExponential().
224
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
225
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
226
#endif
227
0
  return re->Incref();
228
0
}
229
230
Regexp* CoalesceWalker::PostVisit(Regexp* re,
231
                                  Regexp* parent_arg,
232
                                  Regexp* pre_arg,
233
                                  Regexp** child_args,
234
101k
                                  int nchild_args) {
235
101k
  if (re->nsub() == 0)
236
75.1k
    return re->Incref();
237
238
26.7k
  if (re->op() != kRegexpConcat) {
239
14.6k
    if (!ChildArgsChanged(re, child_args))
240
13.8k
      return re->Incref();
241
242
    // Something changed. Build a new op.
243
861
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
244
861
    nre->AllocSub(re->nsub());
245
861
    Regexp** nre_subs = nre->sub();
246
3.65k
    for (int i = 0; i < re->nsub(); i++)
247
2.79k
      nre_subs[i] = child_args[i];
248
    // Repeats and Captures have additional data that must be copied.
249
861
    if (re->op() == kRegexpRepeat) {
250
76
      nre->min_ = re->min();
251
76
      nre->max_ = re->max();
252
785
    } else if (re->op() == kRegexpCapture) {
253
181
      nre->cap_ = re->cap();
254
181
    }
255
861
    return nre;
256
14.6k
  }
257
258
12.0k
  bool can_coalesce = false;
259
71.3k
  for (int i = 0; i < re->nsub(); i++) {
260
60.2k
    if (i+1 < re->nsub() &&
261
60.2k
        CanCoalesce(child_args[i], child_args[i+1])) {
262
995
      can_coalesce = true;
263
995
      break;
264
995
    }
265
60.2k
  }
266
12.0k
  if (!can_coalesce) {
267
11.0k
    if (!ChildArgsChanged(re, child_args))
268
10.6k
      return re->Incref();
269
270
    // Something changed. Build a new op.
271
441
    Regexp* nre = new Regexp(re->op(), re->parse_flags());
272
441
    nre->AllocSub(re->nsub());
273
441
    Regexp** nre_subs = nre->sub();
274
2.57k
    for (int i = 0; i < re->nsub(); i++)
275
2.13k
      nre_subs[i] = child_args[i];
276
441
    return nre;
277
11.0k
  }
278
279
9.30k
  for (int i = 0; i < re->nsub(); i++) {
280
8.30k
    if (i+1 < re->nsub() &&
281
8.30k
        CanCoalesce(child_args[i], child_args[i+1]))
282
2.27k
      DoCoalesce(&child_args[i], &child_args[i+1]);
283
8.30k
  }
284
  // Determine how many empty matches were left by DoCoalesce.
285
995
  int n = 0;
286
9.30k
  for (int i = n; i < re->nsub(); i++) {
287
8.30k
    if (child_args[i]->op() == kRegexpEmptyMatch)
288
2.00k
      n++;
289
8.30k
  }
290
  // Build a new op.
291
995
  Regexp* nre = new Regexp(re->op(), re->parse_flags());
292
995
  nre->AllocSub(re->nsub() - n);
293
995
  Regexp** nre_subs = nre->sub();
294
9.30k
  for (int i = 0, j = 0; i < re->nsub(); i++) {
295
8.30k
    if (child_args[i]->op() == kRegexpEmptyMatch) {
296
2.00k
      child_args[i]->Decref();
297
2.00k
      continue;
298
2.00k
    }
299
6.30k
    nre_subs[j] = child_args[i];
300
6.30k
    j++;
301
6.30k
  }
302
995
  return nre;
303
12.0k
}
304
305
56.4k
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
306
  // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
307
  // any byte.
308
56.4k
  if ((r1->op() == kRegexpStar ||
309
56.4k
       r1->op() == kRegexpPlus ||
310
56.4k
       r1->op() == kRegexpQuest ||
311
56.4k
       r1->op() == kRegexpRepeat) &&
312
56.4k
      (r1->sub()[0]->op() == kRegexpLiteral ||
313
9.89k
       r1->sub()[0]->op() == kRegexpCharClass ||
314
9.89k
       r1->sub()[0]->op() == kRegexpAnyChar ||
315
9.89k
       r1->sub()[0]->op() == kRegexpAnyByte)) {
316
    // r2 must be a star/plus/quest/repeat of the same literal, char class,
317
    // any char or any byte.
318
8.08k
    if ((r2->op() == kRegexpStar ||
319
8.08k
         r2->op() == kRegexpPlus ||
320
8.08k
         r2->op() == kRegexpQuest ||
321
8.08k
         r2->op() == kRegexpRepeat) &&
322
8.08k
        Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
323
        // The parse flags must be consistent.
324
8.08k
        ((r1->parse_flags() & Regexp::NonGreedy) ==
325
1.10k
         (r2->parse_flags() & Regexp::NonGreedy))) {
326
1.05k
      return true;
327
1.05k
    }
328
    // ... OR an occurrence of that literal, char class, any char or any byte
329
7.03k
    if (Regexp::Equal(r1->sub()[0], r2)) {
330
1.65k
      return true;
331
1.65k
    }
332
    // ... OR a literal string that begins with that literal.
333
5.37k
    if (r1->sub()[0]->op() == kRegexpLiteral &&
334
5.37k
        r2->op() == kRegexpLiteralString &&
335
5.37k
        r2->runes()[0] == r1->sub()[0]->rune() &&
336
        // The parse flags must be consistent.
337
5.37k
        ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
338
575
         (r2->parse_flags() & Regexp::FoldCase))) {
339
557
      return true;
340
557
    }
341
5.37k
  }
342
53.1k
  return false;
343
56.4k
}
344
345
2.27k
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
346
2.27k
  Regexp* r1 = *r1ptr;
347
2.27k
  Regexp* r2 = *r2ptr;
348
349
2.27k
  Regexp* nre = Regexp::Repeat(
350
2.27k
      r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
351
352
2.27k
  switch (r1->op()) {
353
275
    case kRegexpStar:
354
275
      nre->min_ = 0;
355
275
      nre->max_ = -1;
356
275
      break;
357
358
529
    case kRegexpPlus:
359
529
      nre->min_ = 1;
360
529
      nre->max_ = -1;
361
529
      break;
362
363
388
    case kRegexpQuest:
364
388
      nre->min_ = 0;
365
388
      nre->max_ = 1;
366
388
      break;
367
368
1.07k
    case kRegexpRepeat:
369
1.07k
      nre->min_ = r1->min();
370
1.07k
      nre->max_ = r1->max();
371
1.07k
      break;
372
373
0
    default:
374
0
      nre->Decref();
375
0
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
376
0
      return;
377
2.27k
  }
378
379
2.27k
  switch (r2->op()) {
380
85
    case kRegexpStar:
381
85
      nre->max_ = -1;
382
85
      goto LeaveEmpty;
383
384
287
    case kRegexpPlus:
385
287
      nre->min_++;
386
287
      nre->max_ = -1;
387
287
      goto LeaveEmpty;
388
389
227
    case kRegexpQuest:
390
227
      if (nre->max() != -1)
391
185
        nre->max_++;
392
227
      goto LeaveEmpty;
393
394
63
    case kRegexpRepeat:
395
63
      nre->min_ += r2->min();
396
63
      if (r2->max() == -1)
397
20
        nre->max_ = -1;
398
43
      else if (nre->max() != -1)
399
20
        nre->max_ += r2->max();
400
63
      goto LeaveEmpty;
401
402
152
    case kRegexpLiteral:
403
675
    case kRegexpCharClass:
404
1.22k
    case kRegexpAnyChar:
405
1.26k
    case kRegexpAnyByte:
406
1.26k
      nre->min_++;
407
1.26k
      if (nre->max() != -1)
408
295
        nre->max_++;
409
1.26k
      goto LeaveEmpty;
410
411
2.00k
    LeaveEmpty:
412
2.00k
      *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
413
2.00k
      *r2ptr = nre;
414
2.00k
      break;
415
416
345
    case kRegexpLiteralString: {
417
345
      Rune r = r1->sub()[0]->rune();
418
      // Determine how much of the literal string is removed.
419
      // We know that we have at least one rune. :)
420
345
      int n = 1;
421
923
      while (n < r2->nrunes() && r2->runes()[n] == r)
422
578
        n++;
423
345
      nre->min_ += n;
424
345
      if (nre->max() != -1)
425
171
        nre->max_ += n;
426
345
      if (n == r2->nrunes())
427
76
        goto LeaveEmpty;
428
269
      *r1ptr = nre;
429
269
      *r2ptr = Regexp::LiteralString(
430
269
          &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
431
269
      break;
432
345
    }
433
434
0
    default:
435
0
      nre->Decref();
436
0
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
437
0
      return;
438
2.27k
  }
439
440
2.27k
  r1->Decref();
441
2.27k
  r2->Decref();
442
2.27k
}
443
444
0
Regexp* SimplifyWalker::Copy(Regexp* re) {
445
0
  return re->Incref();
446
0
}
447
448
0
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
449
  // Should never be called: we use Walk(), not WalkExponential().
450
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
451
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
452
#endif
453
0
  return re->Incref();
454
0
}
455
456
54.7k
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
457
54.7k
  if (re->simple()) {
458
39.5k
    *stop = true;
459
39.5k
    return re->Incref();
460
39.5k
  }
461
15.2k
  return NULL;
462
54.7k
}
463
464
Regexp* SimplifyWalker::PostVisit(Regexp* re,
465
                                  Regexp* parent_arg,
466
                                  Regexp* pre_arg,
467
                                  Regexp** child_args,
468
15.2k
                                  int nchild_args) {
469
15.2k
  switch (re->op()) {
470
0
    case kRegexpNoMatch:
471
360
    case kRegexpEmptyMatch:
472
1.07k
    case kRegexpLiteral:
473
1.61k
    case kRegexpLiteralString:
474
1.61k
    case kRegexpBeginLine:
475
1.61k
    case kRegexpEndLine:
476
1.61k
    case kRegexpBeginText:
477
1.61k
    case kRegexpWordBoundary:
478
1.61k
    case kRegexpNoWordBoundary:
479
1.61k
    case kRegexpEndText:
480
1.61k
    case kRegexpAnyChar:
481
1.61k
    case kRegexpAnyByte:
482
1.61k
    case kRegexpHaveMatch:
483
      // All these are always simple.
484
1.61k
      re->simple_ = true;
485
1.61k
      return re->Incref();
486
487
5.62k
    case kRegexpConcat:
488
8.39k
    case kRegexpAlternate: {
489
      // These are simple as long as the subpieces are simple.
490
8.39k
      if (!ChildArgsChanged(re, child_args)) {
491
3.71k
        re->simple_ = true;
492
3.71k
        return re->Incref();
493
3.71k
      }
494
4.68k
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
495
4.68k
      nre->AllocSub(re->nsub());
496
4.68k
      Regexp** nre_subs = nre->sub();
497
31.4k
      for (int i = 0; i < re->nsub(); i++)
498
26.7k
        nre_subs[i] = child_args[i];
499
4.68k
      nre->simple_ = true;
500
4.68k
      return nre;
501
8.39k
    }
502
503
313
    case kRegexpCapture: {
504
313
      Regexp* newsub = child_args[0];
505
313
      if (newsub == re->sub()[0]) {
506
73
        newsub->Decref();
507
73
        re->simple_ = true;
508
73
        return re->Incref();
509
73
      }
510
240
      Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
511
240
      nre->AllocSub(1);
512
240
      nre->sub()[0] = newsub;
513
240
      nre->cap_ = re->cap();
514
240
      nre->simple_ = true;
515
240
      return nre;
516
313
    }
517
518
358
    case kRegexpStar:
519
524
    case kRegexpPlus:
520
715
    case kRegexpQuest: {
521
715
      Regexp* newsub = child_args[0];
522
      // Special case: repeat the empty string as much as
523
      // you want, but it's still the empty string.
524
715
      if (newsub->op() == kRegexpEmptyMatch)
525
38
        return newsub;
526
527
      // These are simple as long as the subpiece is simple.
528
677
      if (newsub == re->sub()[0]) {
529
186
        newsub->Decref();
530
186
        re->simple_ = true;
531
186
        return re->Incref();
532
186
      }
533
534
      // These are also idempotent if flags are constant.
535
491
      if (re->op() == newsub->op() &&
536
491
          re->parse_flags() == newsub->parse_flags())
537
126
        return newsub;
538
539
365
      Regexp* nre = new Regexp(re->op(), re->parse_flags());
540
365
      nre->AllocSub(1);
541
365
      nre->sub()[0] = newsub;
542
365
      nre->simple_ = true;
543
365
      return nre;
544
491
    }
545
546
3.65k
    case kRegexpRepeat: {
547
3.65k
      Regexp* newsub = child_args[0];
548
      // Special case: repeat the empty string as much as
549
      // you want, but it's still the empty string.
550
3.65k
      if (newsub->op() == kRegexpEmptyMatch)
551
32
        return newsub;
552
553
3.62k
      Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
554
3.62k
                                   re->parse_flags());
555
3.62k
      newsub->Decref();
556
3.62k
      nre->simple_ = true;
557
3.62k
      return nre;
558
3.65k
    }
559
560
530
    case kRegexpCharClass: {
561
530
      Regexp* nre = SimplifyCharClass(re);
562
530
      nre->simple_ = true;
563
530
      return nre;
564
3.65k
    }
565
15.2k
  }
566
567
0
  LOG(ERROR) << "Simplify case not handled: " << re->op();
568
0
  return re->Incref();
569
15.2k
}
570
571
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
572
// Returns a new Regexp, handing the ref to the caller.
573
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
574
270k
                                Regexp::ParseFlags parse_flags) {
575
270k
  Regexp* re = new Regexp(kRegexpConcat, parse_flags);
576
270k
  re->AllocSub(2);
577
270k
  Regexp** subs = re->sub();
578
270k
  subs[0] = re1;
579
270k
  subs[1] = re2;
580
270k
  return re;
581
270k
}
582
583
// Simplifies the expression re{min,max} in terms of *, +, and ?.
584
// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
585
// Caller must Decref return value when done with it.
586
// The result will *not* necessarily have the right capturing parens
587
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
588
// but in the Regexp* representation, both (x) are marked as $1.
589
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
590
3.62k
                                       Regexp::ParseFlags f) {
591
  // x{n,} means at least n matches of x.
592
3.62k
  if (max == -1) {
593
    // Special case: x{0,} is x*
594
940
    if (min == 0)
595
97
      return Regexp::Star(re->Incref(), f);
596
597
    // Special case: x{1,} is x+
598
843
    if (min == 1)
599
209
      return Regexp::Plus(re->Incref(), f);
600
601
    // General case: x{4,} is xxxx+
602
634
    PODArray<Regexp*> nre_subs(min);
603
2.85k
    for (int i = 0; i < min-1; i++)
604
2.22k
      nre_subs[i] = re->Incref();
605
634
    nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
606
634
    return Regexp::Concat(nre_subs.data(), min, f);
607
843
  }
608
609
  // Special case: (x){0} matches only empty string.
610
2.68k
  if (min == 0 && max == 0)
611
108
    return new Regexp(kRegexpEmptyMatch, f);
612
613
  // Special case: x{1} is just x.
614
2.57k
  if (min == 1 && max == 1)
615
93
    return re->Incref();
616
617
  // General case: x{n,m} means n copies of x and m copies of x?.
618
  // The machine will do less work if we nest the final m copies,
619
  // so that x{2,5} = xx(x(x(x)?)?)?
620
621
  // Build leading prefix: xx.  Capturing only on the last one.
622
2.48k
  Regexp* nre = NULL;
623
2.48k
  if (min > 0) {
624
1.93k
    PODArray<Regexp*> nre_subs(min);
625
520k
    for (int i = 0; i < min; i++)
626
518k
      nre_subs[i] = re->Incref();
627
1.93k
    nre = Regexp::Concat(nre_subs.data(), min, f);
628
1.93k
  }
629
630
  // Build and attach suffix: (x(x(x)?)?)?
631
2.48k
  if (max > min) {
632
1.51k
    Regexp* suf = Regexp::Quest(re->Incref(), f);
633
270k
    for (int i = min+1; i < max; i++)
634
269k
      suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
635
1.51k
    if (nre == NULL)
636
552
      nre = suf;
637
965
    else
638
965
      nre = Concat2(nre, suf, f);
639
1.51k
  }
640
641
2.48k
  if (nre == NULL) {
642
    // Some degenerate case, like min > max, or min < max < 0.
643
    // This shouldn't happen, because the parser rejects such regexps.
644
0
    LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
645
0
    return new Regexp(kRegexpNoMatch, f);
646
0
  }
647
648
2.48k
  return nre;
649
2.48k
}
650
651
// Simplifies a character class.
652
// Caller must Decref return value when done with it.
653
530
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
654
530
  CharClass* cc = re->cc();
655
656
  // Special cases
657
530
  if (cc->empty())
658
111
    return new Regexp(kRegexpNoMatch, re->parse_flags());
659
419
  if (cc->full())
660
7
    return new Regexp(kRegexpAnyChar, re->parse_flags());
661
662
412
  return re->Incref();
663
419
}
664
665
}  // namespace re2