Coverage Report

Created: 2025-06-16 06:26

/src/re2/re2/parse.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Regular expression parser.
6
7
// The parser is a simple precedence-based parser with a
8
// manual stack.  The parsing work is done by the methods
9
// of the ParseState class.  The Regexp::Parse function is
10
// essentially just a lexer that calls the ParseState method
11
// for each token.
12
13
// The parser recognizes POSIX extended regular expressions
14
// excluding backreferences, collating elements, and collating
15
// classes.  It also allows the empty string as a regular expression
16
// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17
// See regexp.h for rationale.
18
19
#include <stddef.h>
20
#include <stdint.h>
21
#include <string.h>
22
23
#include <algorithm>
24
#include <string>
25
#include <vector>
26
27
#include "absl/base/attributes.h"
28
#include "absl/base/macros.h"
29
#include "absl/log/absl_log.h"
30
#include "absl/strings/ascii.h"
31
#include "absl/strings/string_view.h"
32
#include "re2/pod_array.h"
33
#include "re2/regexp.h"
34
#include "re2/unicode_casefold.h"
35
#include "re2/unicode_groups.h"
36
#include "re2/walker-inl.h"
37
#include "util/utf.h"
38
39
#if defined(RE2_USE_ICU)
40
#include "unicode/uniset.h"
41
#include "unicode/unistr.h"
42
#include "unicode/utypes.h"
43
#endif
44
45
namespace re2 {
46
47
// Controls the maximum repeat count permitted by the parser.
48
static int maximum_repeat_count = 1000;
49
50
12.6k
void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
51
12.6k
  maximum_repeat_count = i;
52
12.6k
}
53
54
// Regular expression parse state.
55
// The list of parsed regexps so far is maintained as a vector of
56
// Regexp pointers called the stack.  Left parenthesis and vertical
57
// bar markers are also placed on the stack, as Regexps with
58
// non-standard opcodes.
59
// Scanning a left parenthesis causes the parser to push a left parenthesis
60
// marker on the stack.
61
// Scanning a vertical bar causes the parser to pop the stack until it finds a
62
// vertical bar or left parenthesis marker (not popping the marker),
63
// concatenate all the popped results, and push them back on
64
// the stack (DoConcatenation).
65
// Scanning a right parenthesis causes the parser to act as though it
66
// has seen a vertical bar, which then leaves the top of the stack in the
67
// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
68
// The parser pops all this off the stack and creates an alternation of the
69
// regexps (DoAlternation).
70
71
class Regexp::ParseState {
72
 public:
73
  ParseState(ParseFlags flags, absl::string_view whole_regexp,
74
             RegexpStatus* status);
75
  ~ParseState();
76
77
358k
  ParseFlags flags() { return flags_; }
78
3.62k
  int rune_max() { return rune_max_; }
79
80
  // Parse methods.  All public methods return a bool saying
81
  // whether parsing should continue.  If a method returns
82
  // false, it has set fields in *status_, and the parser
83
  // should return NULL.
84
85
  // Pushes the given regular expression onto the stack.
86
  // Could check for too much memory used here.
87
  bool PushRegexp(Regexp* re);
88
89
  // Pushes the literal rune r onto the stack.
90
  bool PushLiteral(Rune r);
91
92
  // Pushes a regexp with the given op (and no args) onto the stack.
93
  bool PushSimpleOp(RegexpOp op);
94
95
  // Pushes a ^ onto the stack.
96
  bool PushCaret();
97
98
  // Pushes a \b (word == true) or \B (word == false) onto the stack.
99
  bool PushWordBoundary(bool word);
100
101
  // Pushes a $ onto the stack.
102
  bool PushDollar();
103
104
  // Pushes a . onto the stack
105
  bool PushDot();
106
107
  // Pushes a repeat operator regexp onto the stack.
108
  // A valid argument for the operator must already be on the stack.
109
  // s is the name of the operator, for use in error messages.
110
  bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy);
111
112
  // Pushes a repetition regexp onto the stack.
113
  // A valid argument for the operator must already be on the stack.
114
  bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy);
115
116
  // Checks whether a particular regexp op is a marker.
117
  bool IsMarker(RegexpOp op);
118
119
  // Processes a left parenthesis in the input.
120
  // Pushes a marker onto the stack.
121
  bool DoLeftParen(absl::string_view name);
122
  bool DoLeftParenNoCapture();
123
124
  // Processes a vertical bar in the input.
125
  bool DoVerticalBar();
126
127
  // Processes a right parenthesis in the input.
128
  bool DoRightParen();
129
130
  // Processes the end of input, returning the final regexp.
131
  Regexp* DoFinish();
132
133
  // Finishes the regexp if necessary, preparing it for use
134
  // in a more complicated expression.
135
  // If it is a CharClassBuilder, converts into a CharClass.
136
  Regexp* FinishRegexp(Regexp*);
137
138
  // These routines don't manipulate the parse stack
139
  // directly, but they do need to look at flags_.
140
  // ParseCharClass also manipulates the internals of Regexp
141
  // while creating *out_re.
142
143
  // Parse a character class into *out_re.
144
  // Removes parsed text from s.
145
  bool ParseCharClass(absl::string_view* s, Regexp** out_re,
146
                      RegexpStatus* status);
147
148
  // Parse a character class character into *rp.
149
  // Removes parsed text from s.
150
  bool ParseCCCharacter(absl::string_view* s, Rune* rp,
151
                        absl::string_view whole_class,
152
                        RegexpStatus* status);
153
154
  // Parse a character class range into rr.
155
  // Removes parsed text from s.
156
  bool ParseCCRange(absl::string_view* s, RuneRange* rr,
157
                    absl::string_view whole_class,
158
                    RegexpStatus* status);
159
160
  // Parse a Perl flag set or non-capturing group from s.
161
  bool ParsePerlFlags(absl::string_view* s);
162
163
  // Finishes the current concatenation,
164
  // collapsing it into a single regexp on the stack.
165
  void DoConcatenation();
166
167
  // Finishes the current alternation,
168
  // collapsing it to a single regexp on the stack.
169
  void DoAlternation();
170
171
  // Generalized DoAlternation/DoConcatenation.
172
  void DoCollapse(RegexpOp op);
173
174
  // Maybe concatenate Literals into LiteralString.
175
  bool MaybeConcatString(int r, ParseFlags flags);
176
177
private:
178
  ParseFlags flags_;
179
  absl::string_view whole_regexp_;
180
  RegexpStatus* status_;
181
  Regexp* stacktop_;
182
  int ncap_;  // number of capturing parens seen
183
  int rune_max_;  // maximum char value for this encoding
184
185
  ParseState(const ParseState&) = delete;
186
  ParseState& operator=(const ParseState&) = delete;
187
};
188
189
// Pseudo-operators - only on parse stack.
190
const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
191
const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
192
193
Regexp::ParseState::ParseState(ParseFlags flags,
194
                               absl::string_view whole_regexp,
195
                               RegexpStatus* status)
196
32.6k
  : flags_(flags), whole_regexp_(whole_regexp),
197
32.6k
    status_(status), stacktop_(NULL), ncap_(0) {
198
32.6k
  if (flags_ & Latin1)
199
16.4k
    rune_max_ = 0xFF;
200
16.1k
  else
201
16.1k
    rune_max_ = Runemax;
202
32.6k
}
203
204
// Cleans up by freeing all the regexps on the stack.
205
32.6k
Regexp::ParseState::~ParseState() {
206
32.6k
  Regexp* next;
207
34.8k
  for (Regexp* re = stacktop_; re != NULL; re = next) {
208
2.22k
    next = re->down_;
209
2.22k
    re->down_ = NULL;
210
2.22k
    if (re->op() == kLeftParen)
211
983
      delete re->name_;
212
2.22k
    re->Decref();
213
2.22k
  }
214
32.6k
}
215
216
// Finishes the regexp if necessary, preparing it for use in
217
// a more complex expression.
218
// If it is a CharClassBuilder, converts into a CharClass.
219
743k
Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
220
743k
  if (re == NULL)
221
0
    return NULL;
222
743k
  re->down_ = NULL;
223
224
743k
  if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
225
58.4k
    CharClassBuilder* ccb = re->ccb_;
226
58.4k
    re->ccb_ = NULL;
227
58.4k
    re->cc_ = ccb->GetCharClass();
228
58.4k
    delete ccb;
229
58.4k
  }
230
231
743k
  return re;
232
743k
}
233
234
// Pushes the given regular expression onto the stack.
235
// Could check for too much memory used here.
236
810k
bool Regexp::ParseState::PushRegexp(Regexp* re) {
237
810k
  MaybeConcatString(-1, NoParseFlags);
238
239
  // Special case: a character class of one character is just
240
  // a literal.  This is a common idiom for escaping
241
  // single characters (e.g., [.] instead of \.), and some
242
  // analysis does better with fewer character classes.
243
  // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
244
810k
  if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
245
80.4k
    re->ccb_->RemoveAbove(rune_max_);
246
80.4k
    if (re->ccb_->size() == 1) {
247
872
      Rune r = re->ccb_->begin()->lo;
248
872
      re->Decref();
249
872
      re = new Regexp(kRegexpLiteral, flags_);
250
872
      re->rune_ = r;
251
79.5k
    } else if (re->ccb_->size() == 2) {
252
39.6k
      Rune r = re->ccb_->begin()->lo;
253
39.6k
      if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
254
20.4k
        re->Decref();
255
20.4k
        re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
256
20.4k
        re->rune_ = r + 'a' - 'A';
257
20.4k
      }
258
39.6k
    }
259
80.4k
  }
260
261
810k
  if (!IsMarker(re->op()))
262
610k
    re->simple_ = re->ComputeSimple();
263
810k
  re->down_ = stacktop_;
264
810k
  stacktop_ = re;
265
810k
  return true;
266
810k
}
267
268
// Searches the case folding tables and returns the CaseFold* that contains r.
269
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
270
// If there isn't one, returns NULL.
271
4.24M
const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) {
272
4.24M
  const CaseFold* ef = f + n;
273
274
  // Binary search for entry containing r.
275
35.1M
  while (n > 0) {
276
33.7M
    int m = n/2;
277
33.7M
    if (f[m].lo <= r && r <= f[m].hi)
278
2.79M
      return &f[m];
279
30.9M
    if (r < f[m].lo) {
280
17.4M
      n = m;
281
17.4M
    } else {
282
13.4M
      f += m+1;
283
13.4M
      n -= m+1;
284
13.4M
    }
285
30.9M
  }
286
287
  // There is no entry that contains r, but f points
288
  // where it would have been.  Unless f points at
289
  // the end of the array, it points at the next entry
290
  // after r.
291
1.44M
  if (f < ef)
292
1.39M
    return f;
293
294
  // No entry contains r; no entry contains runes > r.
295
55.4k
  return NULL;
296
1.44M
}
297
298
// Returns the result of applying the fold f to the rune r.
299
181k
Rune ApplyFold(const CaseFold* f, Rune r) {
300
181k
  switch (f->delta) {
301
50.0k
    default:
302
50.0k
      return r + f->delta;
303
304
80.6k
    case EvenOddSkip:  // even <-> odd but only applies to every other
305
80.6k
      if ((r - f->lo) % 2)
306
40.4k
        return r;
307
40.2k
      ABSL_FALLTHROUGH_INTENDED;
308
83.4k
    case EvenOdd:  // even <-> odd
309
83.4k
      if (r%2 == 0)
310
57.4k
        return r + 1;
311
26.0k
      return r - 1;
312
313
3.08k
    case OddEvenSkip:  // odd <-> even but only applies to every other
314
3.08k
      if ((r - f->lo) % 2)
315
1.18k
        return r;
316
1.90k
      ABSL_FALLTHROUGH_INTENDED;
317
6.11k
    case OddEven:  // odd <-> even
318
6.11k
      if (r%2 == 1)
319
4.67k
        return r + 1;
320
1.44k
      return r - 1;
321
181k
  }
322
181k
}
323
324
// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
325
// Examples:
326
//   CycleFoldRune('A') = 'a'
327
//   CycleFoldRune('a') = 'A'
328
//
329
//   CycleFoldRune('K') = 'k'
330
//   CycleFoldRune('k') = 0x212A (Kelvin)
331
//   CycleFoldRune(0x212A) = 'K'
332
//
333
//   CycleFoldRune('?') = '?'
334
111k
Rune CycleFoldRune(Rune r) {
335
111k
  const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
336
111k
  if (f == NULL || r < f->lo)
337
32.4k
    return r;
338
79.5k
  return ApplyFold(f, r);
339
111k
}
340
341
// Add lo-hi to the class, along with their fold-equivalent characters.
342
74.6k
static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
343
12.8M
  while (lo <= hi) {
344
12.7M
    cc->AddRange(lo, lo);
345
12.7M
    if ('A' <= lo && lo <= 'Z') {
346
50.3k
      cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
347
50.3k
    }
348
12.7M
    if ('a' <= lo && lo <= 'z') {
349
45.6k
      cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
350
45.6k
    }
351
12.7M
    lo++;
352
12.7M
  }
353
74.6k
}
354
355
// Add lo-hi to the class, along with their fold-equivalent characters.
356
// If lo-hi is already in the class, assume that the fold-equivalent
357
// chars are there too, so there's no work to do.
358
3.53M
static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
359
  // AddFoldedRange calls itself recursively for each rune in the fold cycle.
360
  // Most folding cycles are small: there aren't any bigger than four in the
361
  // current Unicode tables.  make_unicode_casefold.py checks that
362
  // the cycles are not too long, and we double-check here using depth.
363
3.53M
  if (depth > 10) {
364
0
    ABSL_LOG(DFATAL) << "AddFoldedRange recurses too much.";
365
0
    return;
366
0
  }
367
368
3.53M
  if (!cc->AddRange(lo, hi))  // lo-hi was already there? we're done
369
2.09M
    return;
370
371
5.39M
  while (lo <= hi) {
372
4.00M
    const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
373
4.00M
    if (f == NULL)  // lo has no fold, nor does anything above lo
374
52.9k
      break;
375
3.95M
    if (lo < f->lo) {  // lo has no fold; next rune with a fold is f->lo
376
1.34M
      lo = f->lo;
377
1.34M
      continue;
378
1.34M
    }
379
380
    // Add in the result of folding the range lo - f->hi
381
    // and that range's fold, recursively.
382
2.60M
    Rune lo1 = lo;
383
2.60M
    Rune hi1 = std::min<Rune>(hi, f->hi);
384
2.60M
    switch (f->delta) {
385
2.11M
      default:
386
2.11M
        lo1 += f->delta;
387
2.11M
        hi1 += f->delta;
388
2.11M
        break;
389
305k
      case EvenOdd:
390
305k
        if (lo1%2 == 1)
391
2.39k
          lo1--;
392
305k
        if (hi1%2 == 0)
393
81.0k
          hi1++;
394
305k
        break;
395
192k
      case OddEven:
396
192k
        if (lo1%2 == 0)
397
434
          lo1--;
398
192k
        if (hi1%2 == 1)
399
47.2k
          hi1++;
400
192k
        break;
401
2.60M
    }
402
2.60M
    AddFoldedRange(cc, lo1, hi1, depth+1);
403
404
    // Pick up where this fold left off.
405
2.60M
    lo = f->hi + 1;
406
2.60M
  }
407
1.44M
}
408
409
// Pushes the literal rune r onto the stack.
410
523k
bool Regexp::ParseState::PushLiteral(Rune r) {
411
  // Do case folding if needed.
412
523k
  if (flags_ & FoldCase) {
413
200k
    if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
414
142k
                            ('a' <= r && r <= 'z'))) {
415
13.4k
      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
416
13.4k
      re->ccb_ = new CharClassBuilder;
417
13.4k
      AddFoldedRangeLatin1(re->ccb_, r, r);
418
13.4k
      return PushRegexp(re);
419
13.4k
    }
420
187k
    if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
421
25.2k
      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
422
25.2k
      re->ccb_ = new CharClassBuilder;
423
25.2k
      Rune r1 = r;
424
54.2k
      do {
425
54.2k
        if (!(flags_ & NeverNL) || r != '\n') {
426
54.2k
          re->ccb_->AddRange(r, r);
427
54.2k
        }
428
54.2k
        r = CycleFoldRune(r);
429
54.2k
      } while (r != r1);
430
25.2k
      return PushRegexp(re);
431
25.2k
    }
432
187k
  }
433
434
  // Exclude newline if applicable.
435
485k
  if ((flags_ & NeverNL) && r == '\n')
436
11.7k
    return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
437
438
  // No fancy stuff worked.  Ordinary literal.
439
473k
  if (MaybeConcatString(r, flags_))
440
185k
    return true;
441
442
288k
  Regexp* re = new Regexp(kRegexpLiteral, flags_);
443
288k
  re->rune_ = r;
444
288k
  return PushRegexp(re);
445
473k
}
446
447
// Pushes a ^ onto the stack.
448
26.1k
bool Regexp::ParseState::PushCaret() {
449
26.1k
  if (flags_ & OneLine) {
450
20.9k
    return PushSimpleOp(kRegexpBeginText);
451
20.9k
  }
452
5.16k
  return PushSimpleOp(kRegexpBeginLine);
453
26.1k
}
454
455
// Pushes a \b or \B onto the stack.
456
3.55k
bool Regexp::ParseState::PushWordBoundary(bool word) {
457
3.55k
  if (word)
458
1.98k
    return PushSimpleOp(kRegexpWordBoundary);
459
1.57k
  return PushSimpleOp(kRegexpNoWordBoundary);
460
3.55k
}
461
462
// Pushes a $ onto the stack.
463
51.4k
bool Regexp::ParseState::PushDollar() {
464
51.4k
  if (flags_ & OneLine) {
465
    // Clumsy marker so that MimicsPCRE() can tell whether
466
    // this kRegexpEndText was a $ and not a \z.
467
41.5k
    Regexp::ParseFlags oflags = flags_;
468
41.5k
    flags_ = flags_ | WasDollar;
469
41.5k
    bool ret = PushSimpleOp(kRegexpEndText);
470
41.5k
    flags_ = oflags;
471
41.5k
    return ret;
472
41.5k
  }
473
9.92k
  return PushSimpleOp(kRegexpEndLine);
474
51.4k
}
475
476
// Pushes a . onto the stack.
477
13.8k
bool Regexp::ParseState::PushDot() {
478
13.8k
  if ((flags_ & DotNL) && !(flags_ & NeverNL))
479
4.28k
    return PushSimpleOp(kRegexpAnyChar);
480
  // Rewrite . into [^\n]
481
9.58k
  Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
482
9.58k
  re->ccb_ = new CharClassBuilder;
483
9.58k
  re->ccb_->AddRange(0, '\n' - 1);
484
9.58k
  re->ccb_->AddRange('\n' + 1, rune_max_);
485
9.58k
  return PushRegexp(re);
486
13.8k
}
487
488
// Pushes a regexp with the given op (and no args) onto the stack.
489
203k
bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
490
203k
  Regexp* re = new Regexp(op, flags_);
491
203k
  return PushRegexp(re);
492
203k
}
493
494
// Pushes a repeat operator regexp onto the stack.
495
// A valid argument for the operator must already be on the stack.
496
// The char c is the name of the operator, for use in error messages.
497
bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
498
98.7k
                                      bool nongreedy) {
499
98.7k
  if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
500
16
    status_->set_code(kRegexpRepeatArgument);
501
16
    status_->set_error_arg(s);
502
16
    return false;
503
16
  }
504
98.7k
  Regexp::ParseFlags fl = flags_;
505
98.7k
  if (nongreedy)
506
2.01k
    fl = fl ^ NonGreedy;
507
508
  // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but
509
  // they're mostly for use during simplification, not during parsing.
510
98.7k
  if (op == stacktop_->op() && fl == stacktop_->parse_flags())
511
5.12k
    return true;
512
513
  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
514
  // op is a repeat, we just have to check that stacktop_->op() is too,
515
  // then adjust stacktop_.
516
93.5k
  if ((stacktop_->op() == kRegexpStar ||
517
93.5k
       stacktop_->op() == kRegexpPlus ||
518
93.5k
       stacktop_->op() == kRegexpQuest) &&
519
93.5k
      fl == stacktop_->parse_flags()) {
520
894
    stacktop_->op_ = kRegexpStar;
521
894
    return true;
522
894
  }
523
524
92.6k
  Regexp* re = new Regexp(op, fl);
525
92.6k
  re->AllocSub(1);
526
92.6k
  re->down_ = stacktop_->down_;
527
92.6k
  re->sub()[0] = FinishRegexp(stacktop_);
528
92.6k
  re->simple_ = re->ComputeSimple();
529
92.6k
  stacktop_ = re;
530
92.6k
  return true;
531
93.5k
}
532
533
// RepetitionWalker reports whether the repetition regexp is valid.
534
// Valid means that the combination of the top-level repetition
535
// and any inner repetitions does not exceed n copies of the
536
// innermost thing.
537
// This rewalks the regexp tree and is called for every repetition,
538
// so we have to worry about inducing quadratic behavior in the parser.
539
// We avoid this by only using RepetitionWalker when min or max >= 2.
540
// In that case the depth of any >= 2 nesting can only get to 9 without
541
// triggering a parse error, so each subtree can only be rewalked 9 times.
542
class RepetitionWalker : public Regexp::Walker<int> {
543
 public:
544
39.8k
  RepetitionWalker() {}
545
  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
546
  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
547
                        int* child_args, int nchild_args);
548
  virtual int ShortVisit(Regexp* re, int parent_arg);
549
550
 private:
551
  RepetitionWalker(const RepetitionWalker&) = delete;
552
  RepetitionWalker& operator=(const RepetitionWalker&) = delete;
553
};
554
555
378k
int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
556
378k
  int arg = parent_arg;
557
378k
  if (re->op() == kRegexpRepeat) {
558
44.4k
    int m = re->max();
559
44.4k
    if (m < 0) {
560
1.09k
      m = re->min();
561
1.09k
    }
562
44.4k
    if (m > 0) {
563
43.2k
      arg /= m;
564
43.2k
    }
565
44.4k
  }
566
378k
  return arg;
567
378k
}
568
569
int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
570
378k
                                int* child_args, int nchild_args) {
571
378k
  int arg = pre_arg;
572
718k
  for (int i = 0; i < nchild_args; i++) {
573
339k
    if (child_args[i] < arg) {
574
5.17k
      arg = child_args[i];
575
5.17k
    }
576
339k
  }
577
378k
  return arg;
578
378k
}
579
580
0
int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
581
  // Should never be called: we use Walk(), not WalkExponential().
582
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
583
  ABSL_LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
584
#endif
585
0
  return 0;
586
0
}
587
588
// Pushes a repetition regexp onto the stack.
589
// A valid argument for the operator must already be on the stack.
590
bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
591
48.3k
                                        bool nongreedy) {
592
48.3k
  if ((max != -1 && max < min) ||
593
48.3k
      min > maximum_repeat_count ||
594
48.3k
      max > maximum_repeat_count) {
595
54
    status_->set_code(kRegexpRepeatSize);
596
54
    status_->set_error_arg(s);
597
54
    return false;
598
54
  }
599
48.3k
  if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
600
10
    status_->set_code(kRegexpRepeatArgument);
601
10
    status_->set_error_arg(s);
602
10
    return false;
603
10
  }
604
48.3k
  Regexp::ParseFlags fl = flags_;
605
48.3k
  if (nongreedy)
606
329
    fl = fl ^ NonGreedy;
607
48.3k
  Regexp* re = new Regexp(kRegexpRepeat, fl);
608
48.3k
  re->min_ = min;
609
48.3k
  re->max_ = max;
610
48.3k
  re->AllocSub(1);
611
48.3k
  re->down_ = stacktop_->down_;
612
48.3k
  re->sub()[0] = FinishRegexp(stacktop_);
613
48.3k
  re->simple_ = re->ComputeSimple();
614
48.3k
  stacktop_ = re;
615
48.3k
  if (min >= 2 || max >= 2) {
616
39.8k
    RepetitionWalker w;
617
39.8k
    if (w.Walk(stacktop_, maximum_repeat_count) == 0) {
618
18
      status_->set_code(kRegexpRepeatSize);
619
18
      status_->set_error_arg(s);
620
18
      return false;
621
18
    }
622
39.8k
  }
623
48.3k
  return true;
624
48.3k
}
625
626
// Checks whether a particular regexp op is a marker.
627
2.84M
bool Regexp::ParseState::IsMarker(RegexpOp op) {
628
2.84M
  return op >= kLeftParen;
629
2.84M
}
630
631
// Processes a left parenthesis in the input.
632
// Pushes a marker onto the stack.
633
46.9k
bool Regexp::ParseState::DoLeftParen(absl::string_view name) {
634
46.9k
  Regexp* re = new Regexp(kLeftParen, flags_);
635
46.9k
  re->cap_ = ++ncap_;
636
46.9k
  if (name.data() != NULL)
637
8.95k
    re->name_ = new std::string(name);
638
46.9k
  return PushRegexp(re);
639
46.9k
}
640
641
// Pushes a non-capturing marker onto the stack.
642
37.6k
bool Regexp::ParseState::DoLeftParenNoCapture() {
643
37.6k
  Regexp* re = new Regexp(kLeftParen, flags_);
644
37.6k
  re->cap_ = -1;
645
37.6k
  return PushRegexp(re);
646
37.6k
}
647
648
// Processes a vertical bar in the input.
649
228k
bool Regexp::ParseState::DoVerticalBar() {
650
228k
  MaybeConcatString(-1, NoParseFlags);
651
228k
  DoConcatenation();
652
653
  // Below the vertical bar is a list to alternate.
654
  // Above the vertical bar is a list to concatenate.
655
  // We just did the concatenation, so either swap
656
  // the result below the vertical bar or push a new
657
  // vertical bar on the stack.
658
228k
  Regexp* r1;
659
228k
  Regexp* r2;
660
228k
  if ((r1 = stacktop_) != NULL &&
661
228k
      (r2 = r1->down_) != NULL &&
662
228k
      r2->op() == kVerticalBar) {
663
113k
    Regexp* r3;
664
113k
    if ((r3 = r2->down_) != NULL &&
665
113k
        (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
666
      // AnyChar is above or below the vertical bar. Let it subsume
667
      // the other when the other is Literal, CharClass or AnyChar.
668
1.07k
      if (r3->op() == kRegexpAnyChar &&
669
1.07k
          (r1->op() == kRegexpLiteral ||
670
539
           r1->op() == kRegexpCharClass ||
671
539
           r1->op() == kRegexpAnyChar)) {
672
        // Discard r1.
673
387
        stacktop_ = r2;
674
387
        r1->Decref();
675
387
        return true;
676
387
      }
677
691
      if (r1->op() == kRegexpAnyChar &&
678
691
          (r3->op() == kRegexpLiteral ||
679
539
           r3->op() == kRegexpCharClass ||
680
539
           r3->op() == kRegexpAnyChar)) {
681
        // Rearrange the stack and discard r3.
682
345
        r1->down_ = r3->down_;
683
345
        r2->down_ = r1;
684
345
        stacktop_ = r2;
685
345
        r3->Decref();
686
345
        return true;
687
345
      }
688
691
    }
689
    // Swap r1 below vertical bar (r2).
690
112k
    r1->down_ = r2->down_;
691
112k
    r2->down_ = r1;
692
112k
    stacktop_ = r2;
693
112k
    return true;
694
113k
  }
695
115k
  return PushSimpleOp(kVerticalBar);
696
228k
}
697
698
// Processes a right parenthesis in the input.
699
83.6k
bool Regexp::ParseState::DoRightParen() {
700
  // Finish the current concatenation and alternation.
701
83.6k
  DoAlternation();
702
703
  // The stack should be: LeftParen regexp
704
  // Remove the LeftParen, leaving the regexp,
705
  // parenthesized.
706
83.6k
  Regexp* r1;
707
83.6k
  Regexp* r2;
708
83.6k
  if ((r1 = stacktop_) == NULL ||
709
83.6k
      (r2 = r1->down_) == NULL ||
710
83.6k
      r2->op() != kLeftParen) {
711
51
    status_->set_code(kRegexpUnexpectedParen);
712
51
    status_->set_error_arg(whole_regexp_);
713
51
    return false;
714
51
  }
715
716
  // Pop off r1, r2.  Will Decref or reuse below.
717
83.6k
  stacktop_ = r2->down_;
718
719
  // Restore flags from when paren opened.
720
83.6k
  Regexp* re = r2;
721
83.6k
  flags_ = re->parse_flags();
722
723
  // Rewrite LeftParen as capture if needed.
724
83.6k
  if (re->cap_ > 0) {
725
46.2k
    re->op_ = kRegexpCapture;
726
    // re->cap_ is already set
727
46.2k
    re->AllocSub(1);
728
46.2k
    re->sub()[0] = FinishRegexp(r1);
729
46.2k
    re->simple_ = re->ComputeSimple();
730
46.2k
  } else {
731
37.3k
    re->Decref();
732
37.3k
    re = r1;
733
37.3k
  }
734
83.6k
  return PushRegexp(re);
735
83.6k
}
736
737
// Processes the end of input, returning the final regexp.
738
31.3k
Regexp* Regexp::ParseState::DoFinish() {
739
31.3k
  DoAlternation();
740
31.3k
  Regexp* re = stacktop_;
741
31.3k
  if (re != NULL && re->down_ != NULL) {
742
276
    status_->set_code(kRegexpMissingParen);
743
276
    status_->set_error_arg(whole_regexp_);
744
276
    return NULL;
745
276
  }
746
31.0k
  stacktop_ = NULL;
747
31.0k
  return FinishRegexp(re);
748
31.3k
}
749
750
// Returns the leading regexp that re starts with.
751
// The returned Regexp* points into a piece of re,
752
// so it must not be used after the caller calls re->Decref().
753
202k
Regexp* Regexp::LeadingRegexp(Regexp* re) {
754
202k
  if (re->op() == kRegexpEmptyMatch)
755
61.2k
    return NULL;
756
141k
  if (re->op() == kRegexpConcat && re->nsub() >= 2) {
757
55.7k
    Regexp** sub = re->sub();
758
55.7k
    if (sub[0]->op() == kRegexpEmptyMatch)
759
277
      return NULL;
760
55.4k
    return sub[0];
761
55.7k
  }
762
85.5k
  return re;
763
141k
}
764
765
// Removes LeadingRegexp(re) from re and returns what's left.
766
// Consumes the reference to re and may edit it in place.
767
// If caller wants to hold on to LeadingRegexp(re),
768
// must have already Incref'ed it.
769
22.1k
Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
770
22.1k
  if (re->op() == kRegexpEmptyMatch)
771
0
    return re;
772
22.1k
  if (re->op() == kRegexpConcat && re->nsub() >= 2) {
773
18.5k
    Regexp** sub = re->sub();
774
18.5k
    if (sub[0]->op() == kRegexpEmptyMatch)
775
0
      return re;
776
18.5k
    sub[0]->Decref();
777
18.5k
    sub[0] = NULL;
778
18.5k
    if (re->nsub() == 2) {
779
      // Collapse concatenation to single regexp.
780
2.41k
      Regexp* nre = sub[1];
781
2.41k
      sub[1] = NULL;
782
2.41k
      re->Decref();
783
2.41k
      return nre;
784
2.41k
    }
785
    // 3 or more -> 2 or more.
786
16.1k
    re->nsub_--;
787
16.1k
    memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
788
16.1k
    return re;
789
18.5k
  }
790
3.59k
  Regexp::ParseFlags pf = re->parse_flags();
791
3.59k
  re->Decref();
792
3.59k
  return new Regexp(kRegexpEmptyMatch, pf);
793
22.1k
}
794
795
// Returns the leading string that re starts with.
796
// The returned Rune* points into a piece of re,
797
// so it must not be used after the caller calls re->Decref().
798
Rune* Regexp::LeadingString(Regexp* re, int* nrune,
799
212k
                            Regexp::ParseFlags* flags) {
800
266k
  while (re->op() == kRegexpConcat && re->nsub() > 0)
801
54.1k
    re = re->sub()[0];
802
803
212k
  *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ &
804
212k
                                           (Regexp::FoldCase | Regexp::Latin1));
805
806
212k
  if (re->op() == kRegexpLiteral) {
807
38.9k
    *nrune = 1;
808
38.9k
    return &re->rune_;
809
38.9k
  }
810
811
173k
  if (re->op() == kRegexpLiteralString) {
812
36.4k
    *nrune = re->nrunes_;
813
36.4k
    return re->runes_;
814
36.4k
  }
815
816
136k
  *nrune = 0;
817
136k
  return NULL;
818
173k
}
819
820
// Removes the first n leading runes from the beginning of re.
821
// Edits re in place.
822
16.1k
void Regexp::RemoveLeadingString(Regexp* re, int n) {
823
  // Chase down concats to find first string.
824
  // For regexps generated by parser, nested concats are
825
  // flattened except when doing so would overflow the 16-bit
826
  // limit on the size of a concatenation, so we should never
827
  // see more than two here.
828
16.1k
  Regexp* stk[4];
829
16.1k
  size_t d = 0;
830
21.0k
  while (re->op() == kRegexpConcat) {
831
4.83k
    if (d < ABSL_ARRAYSIZE(stk))
832
4.83k
      stk[d++] = re;
833
4.83k
    re = re->sub()[0];
834
4.83k
  }
835
836
  // Remove leading string from re.
837
16.1k
  if (re->op() == kRegexpLiteral) {
838
9.87k
    re->rune_ = 0;
839
9.87k
    re->op_ = kRegexpEmptyMatch;
840
9.87k
  } else if (re->op() == kRegexpLiteralString) {
841
6.31k
    if (n >= re->nrunes_) {
842
680
      delete[] re->runes_;
843
680
      re->runes_ = NULL;
844
680
      re->nrunes_ = 0;
845
680
      re->op_ = kRegexpEmptyMatch;
846
5.63k
    } else if (n == re->nrunes_ - 1) {
847
1.46k
      Rune rune = re->runes_[re->nrunes_ - 1];
848
1.46k
      delete[] re->runes_;
849
1.46k
      re->runes_ = NULL;
850
1.46k
      re->nrunes_ = 0;
851
1.46k
      re->rune_ = rune;
852
1.46k
      re->op_ = kRegexpLiteral;
853
4.16k
    } else {
854
4.16k
      re->nrunes_ -= n;
855
4.16k
      memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
856
4.16k
    }
857
6.31k
  }
858
859
  // If re is now empty, concatenations might simplify too.
860
21.0k
  while (d > 0) {
861
4.83k
    re = stk[--d];
862
4.83k
    Regexp** sub = re->sub();
863
4.83k
    if (sub[0]->op() == kRegexpEmptyMatch) {
864
2.56k
      sub[0]->Decref();
865
2.56k
      sub[0] = NULL;
866
      // Delete first element of concat.
867
2.56k
      switch (re->nsub()) {
868
0
        case 0:
869
0
        case 1:
870
          // Impossible.
871
0
          ABSL_LOG(DFATAL) << "Concat of " << re->nsub();
872
0
          re->submany_ = NULL;
873
0
          re->op_ = kRegexpEmptyMatch;
874
0
          break;
875
876
1.47k
        case 2: {
877
          // Replace re with sub[1].
878
1.47k
          Regexp* old = sub[1];
879
1.47k
          sub[1] = NULL;
880
1.47k
          re->Swap(old);
881
1.47k
          old->Decref();
882
1.47k
          break;
883
0
        }
884
885
1.09k
        default:
886
          // Slide down.
887
1.09k
          re->nsub_--;
888
1.09k
          memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
889
1.09k
          break;
890
2.56k
      }
891
2.56k
    }
892
4.83k
  }
893
16.1k
}
894
895
// In the context of factoring alternations, a Splice is: a factored prefix or
896
// merged character class computed by one iteration of one round of factoring;
897
// the span of subexpressions of the alternation to be "spliced" (i.e. removed
898
// and replaced); and, for a factored prefix, the number of suffixes after any
899
// factoring that might have subsequently been performed on them. For a merged
900
// character class, there are no suffixes, of course, so the field is ignored.
901
struct Splice {
902
  Splice(Regexp* prefix, Regexp** sub, int nsub)
903
23.2k
      : prefix(prefix),
904
23.2k
        sub(sub),
905
23.2k
        nsub(nsub),
906
23.2k
        nsuffix(-1) {}
907
908
  Regexp* prefix;
909
  Regexp** sub;
910
  int nsub;
911
  int nsuffix;
912
};
913
914
// Named so because it is used to implement an explicit stack, a Frame is: the
915
// span of subexpressions of the alternation to be factored; the current round
916
// of factoring; any Splices computed; and, for a factored prefix, an iterator
917
// to the next Splice to be factored (i.e. in another Frame) because suffixes.
918
struct Frame {
919
  Frame(Regexp** sub, int nsub)
920
66.1k
      : sub(sub),
921
66.1k
        nsub(nsub),
922
66.1k
        round(0) {}
923
924
  Regexp** sub;
925
  int nsub;
926
  int round;
927
  std::vector<Splice> splices;
928
  int spliceidx;
929
};
930
931
// Bundled into a class for friend access to Regexp without needing to declare
932
// (or define) Splice in regexp.h.
933
class FactorAlternationImpl {
934
 public:
935
  static void Round1(Regexp** sub, int nsub,
936
                     Regexp::ParseFlags flags,
937
                     std::vector<Splice>* splices);
938
  static void Round2(Regexp** sub, int nsub,
939
                     Regexp::ParseFlags flags,
940
                     std::vector<Splice>* splices);
941
  static void Round3(Regexp** sub, int nsub,
942
                     Regexp::ParseFlags flags,
943
                     std::vector<Splice>* splices);
944
};
945
946
// Factors common prefixes from alternation.
947
// For example,
948
//     ABC|ABD|AEF|BCX|BCY
949
// simplifies to
950
//     A(B(C|D)|EF)|BC(X|Y)
951
// and thence to
952
//     A(B[CD]|EF)|BC[XY]
953
//
954
// Rewrites sub to contain simplified list to alternate and returns
955
// the new length of sub.  Adjusts reference counts accordingly
956
// (incoming sub[i] decremented, outgoing sub[i] incremented).
957
50.0k
int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
958
50.0k
  std::vector<Frame> stk;
959
50.0k
  stk.emplace_back(sub, nsub);
960
961
280k
  for (;;) {
962
280k
    auto& sub = stk.back().sub;
963
280k
    auto& nsub = stk.back().nsub;
964
280k
    auto& round = stk.back().round;
965
280k
    auto& splices = stk.back().splices;
966
280k
    auto& spliceidx = stk.back().spliceidx;
967
968
280k
    if (splices.empty()) {
969
      // Advance to the next round of factoring. Note that this covers
970
      // the initialised state: when splices is empty and round is 0.
971
245k
      round++;
972
245k
    } else if (spliceidx < static_cast<int>(splices.size())) {
973
      // We have at least one more Splice to factor. Recurse logically.
974
16.1k
      stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
975
16.1k
      continue;
976
19.2k
    } else {
977
      // We have no more Splices to factor. Apply them.
978
19.2k
      auto iter = splices.begin();
979
19.2k
      int out = 0;
980
42.4k
      for (int i = 0; i < nsub; ) {
981
        // Copy until we reach where the next Splice begins.
982
33.0k
        while (sub + i < iter->sub)
983
9.81k
          sub[out++] = sub[i++];
984
23.2k
        switch (round) {
985
6.52k
          case 1:
986
16.1k
          case 2: {
987
            // Assemble the Splice prefix and the suffixes.
988
16.1k
            Regexp* re[2];
989
16.1k
            re[0] = iter->prefix;
990
16.1k
            re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags);
991
16.1k
            sub[out++] = Regexp::Concat(re, 2, flags);
992
16.1k
            i += iter->nsub;
993
16.1k
            break;
994
6.52k
          }
995
7.12k
          case 3:
996
            // Just use the Splice prefix.
997
7.12k
            sub[out++] = iter->prefix;
998
7.12k
            i += iter->nsub;
999
7.12k
            break;
1000
0
          default:
1001
0
            ABSL_LOG(DFATAL) << "unknown round: " << round;
1002
0
            break;
1003
23.2k
        }
1004
        // If we are done, copy until the end of sub.
1005
23.2k
        if (++iter == splices.end()) {
1006
28.5k
          while (i < nsub)
1007
9.33k
            sub[out++] = sub[i++];
1008
19.2k
        }
1009
23.2k
      }
1010
19.2k
      splices.clear();
1011
19.2k
      nsub = out;
1012
      // Advance to the next round of factoring.
1013
19.2k
      round++;
1014
19.2k
    }
1015
1016
264k
    switch (round) {
1017
66.1k
      case 1:
1018
66.1k
        FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
1019
66.1k
        break;
1020
66.1k
      case 2:
1021
66.1k
        FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
1022
66.1k
        break;
1023
66.1k
      case 3:
1024
66.1k
        FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1025
66.1k
        break;
1026
66.1k
      case 4:
1027
66.1k
        if (stk.size() == 1) {
1028
          // We are at the top of the stack. Just return.
1029
50.0k
          return nsub;
1030
50.0k
        } else {
1031
          // Pop the stack and set the number of suffixes.
1032
          // (Note that references will be invalidated!)
1033
16.1k
          int nsuffix = nsub;
1034
16.1k
          stk.pop_back();
1035
16.1k
          stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1036
16.1k
          ++stk.back().spliceidx;
1037
16.1k
          continue;
1038
16.1k
        }
1039
0
      default:
1040
0
        ABSL_LOG(DFATAL) << "unknown round: " << round;
1041
0
        break;
1042
264k
    }
1043
1044
    // Set spliceidx depending on whether we have Splices to factor.
1045
198k
    if (splices.empty() || round == 3) {
1046
185k
      spliceidx = static_cast<int>(splices.size());
1047
185k
    } else {
1048
12.8k
      spliceidx = 0;
1049
12.8k
    }
1050
198k
  }
1051
50.0k
}
1052
1053
void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
1054
                                   Regexp::ParseFlags flags,
1055
66.1k
                                   std::vector<Splice>* splices) {
1056
  // Round 1: Factor out common literal prefixes.
1057
66.1k
  int start = 0;
1058
66.1k
  Rune* rune = NULL;
1059
66.1k
  int nrune = 0;
1060
66.1k
  Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
1061
344k
  for (int i = 0; i <= nsub; i++) {
1062
    // Invariant: sub[start:i] consists of regexps that all
1063
    // begin with rune[0:nrune].
1064
278k
    Rune* rune_i = NULL;
1065
278k
    int nrune_i = 0;
1066
278k
    Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
1067
278k
    if (i < nsub) {
1068
212k
      rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
1069
212k
      if (runeflags_i == runeflags) {
1070
145k
        int same = 0;
1071
161k
        while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1072
15.3k
          same++;
1073
145k
        if (same > 0) {
1074
          // Matches at least one rune in current range.  Keep going around.
1075
9.66k
          nrune = same;
1076
9.66k
          continue;
1077
9.66k
        }
1078
145k
      }
1079
212k
    }
1080
1081
    // Found end of a run with common leading literal string:
1082
    // sub[start:i] all begin with rune[0:nrune],
1083
    // but sub[i] does not even begin with rune[0].
1084
268k
    if (i == start) {
1085
      // Nothing to do - first iteration.
1086
202k
    } else if (i == start+1) {
1087
      // Just one: don't bother factoring.
1088
195k
    } else {
1089
6.52k
      Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
1090
22.7k
      for (int j = start; j < i; j++)
1091
16.1k
        Regexp::RemoveLeadingString(sub[j], nrune);
1092
6.52k
      splices->emplace_back(prefix, sub + start, i - start);
1093
6.52k
    }
1094
1095
    // Prepare for next iteration (if there is one).
1096
268k
    if (i < nsub) {
1097
202k
      start = i;
1098
202k
      rune = rune_i;
1099
202k
      nrune = nrune_i;
1100
202k
      runeflags = runeflags_i;
1101
202k
    }
1102
268k
  }
1103
66.1k
}
1104
1105
void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
1106
                                   Regexp::ParseFlags flags,
1107
66.1k
                                   std::vector<Splice>* splices) {
1108
  // Round 2: Factor out common simple prefixes,
1109
  // just the first piece of each concatenation.
1110
  // This will be good enough a lot of the time.
1111
  //
1112
  // Complex subexpressions (e.g. involving quantifiers)
1113
  // are not safe to factor because that collapses their
1114
  // distinct paths through the automaton, which affects
1115
  // correctness in some cases.
1116
66.1k
  int start = 0;
1117
66.1k
  Regexp* first = NULL;
1118
334k
  for (int i = 0; i <= nsub; i++) {
1119
    // Invariant: sub[start:i] consists of regexps that all
1120
    // begin with first.
1121
268k
    Regexp* first_i = NULL;
1122
268k
    if (i < nsub) {
1123
202k
      first_i = Regexp::LeadingRegexp(sub[i]);
1124
202k
      if (first != NULL &&
1125
          // first must be an empty-width op
1126
          // OR a char class, any char or any byte
1127
          // OR a fixed repeat of a literal, char class, any char or any byte.
1128
202k
          (first->op() == kRegexpBeginLine ||
1129
90.6k
           first->op() == kRegexpEndLine ||
1130
90.6k
           first->op() == kRegexpWordBoundary ||
1131
90.6k
           first->op() == kRegexpNoWordBoundary ||
1132
90.6k
           first->op() == kRegexpBeginText ||
1133
90.6k
           first->op() == kRegexpEndText ||
1134
90.6k
           first->op() == kRegexpCharClass ||
1135
90.6k
           first->op() == kRegexpAnyChar ||
1136
90.6k
           first->op() == kRegexpAnyByte ||
1137
90.6k
           (first->op() == kRegexpRepeat &&
1138
72.7k
            first->min() == first->max() &&
1139
72.7k
            (first->sub()[0]->op() == kRegexpLiteral ||
1140
2.73k
             first->sub()[0]->op() == kRegexpCharClass ||
1141
2.73k
             first->sub()[0]->op() == kRegexpAnyChar ||
1142
2.73k
             first->sub()[0]->op() == kRegexpAnyByte))) &&
1143
202k
          Regexp::Equal(first, first_i))
1144
12.4k
        continue;
1145
202k
    }
1146
1147
    // Found end of a run with common leading regexp:
1148
    // sub[start:i] all begin with first,
1149
    // but sub[i] does not.
1150
256k
    if (i == start) {
1151
      // Nothing to do - first iteration.
1152
189k
    } else if (i == start+1) {
1153
      // Just one: don't bother factoring.
1154
180k
    } else {
1155
9.61k
      Regexp* prefix = first->Incref();
1156
31.7k
      for (int j = start; j < i; j++)
1157
22.1k
        sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
1158
9.61k
      splices->emplace_back(prefix, sub + start, i - start);
1159
9.61k
    }
1160
1161
    // Prepare for next iteration (if there is one).
1162
256k
    if (i < nsub) {
1163
189k
      start = i;
1164
189k
      first = first_i;
1165
189k
    }
1166
256k
  }
1167
66.1k
}
1168
1169
void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
1170
                                   Regexp::ParseFlags flags,
1171
66.1k
                                   std::vector<Splice>* splices) {
1172
  // Round 3: Merge runs of literals and/or character classes.
1173
66.1k
  int start = 0;
1174
66.1k
  Regexp* first = NULL;
1175
322k
  for (int i = 0; i <= nsub; i++) {
1176
    // Invariant: sub[start:i] consists of regexps that all
1177
    // are either literals (i.e. runes) or character classes.
1178
256k
    Regexp* first_i = NULL;
1179
256k
    if (i < nsub) {
1180
189k
      first_i = sub[i];
1181
189k
      if (first != NULL &&
1182
189k
          (first->op() == kRegexpLiteral ||
1183
123k
           first->op() == kRegexpCharClass) &&
1184
189k
          (first_i->op() == kRegexpLiteral ||
1185
18.8k
           first_i->op() == kRegexpCharClass))
1186
9.34k
        continue;
1187
189k
    }
1188
1189
    // Found end of a run of Literal/CharClass:
1190
    // sub[start:i] all are either one or the other,
1191
    // but sub[i] is not.
1192
246k
    if (i == start) {
1193
      // Nothing to do - first iteration.
1194
180k
    } else if (i == start+1) {
1195
      // Just one: don't bother factoring.
1196
173k
    } else {
1197
7.12k
      CharClassBuilder ccb;
1198
23.5k
      for (int j = start; j < i; j++) {
1199
16.4k
        Regexp* re = sub[j];
1200
16.4k
        if (re->op() == kRegexpCharClass) {
1201
1.80k
          CharClass* cc = re->cc();
1202
35.5k
          for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
1203
33.7k
            ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
1204
14.6k
        } else if (re->op() == kRegexpLiteral) {
1205
14.6k
          if (re->parse_flags() & Regexp::FoldCase) {
1206
            // AddFoldedRange() can terminate prematurely if the character class
1207
            // already contains the rune. For example, if it contains 'a' and we
1208
            // want to add folded 'a', it sees 'a' and stops without adding 'A'.
1209
            // To avoid that, we use an empty character class and then merge it.
1210
4.15k
            CharClassBuilder tmp;
1211
4.15k
            tmp.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1212
4.15k
            ccb.AddCharClass(&tmp);
1213
10.5k
          } else {
1214
10.5k
            ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1215
10.5k
          }
1216
14.6k
        } else {
1217
0
          ABSL_LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
1218
0
                           << re->ToString();
1219
0
        }
1220
16.4k
        re->Decref();
1221
16.4k
      }
1222
7.12k
      Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
1223
7.12k
      splices->emplace_back(re, sub + start, i - start);
1224
7.12k
    }
1225
1226
    // Prepare for next iteration (if there is one).
1227
246k
    if (i < nsub) {
1228
180k
      start = i;
1229
180k
      first = first_i;
1230
180k
    }
1231
246k
  }
1232
66.1k
}
1233
1234
// Collapse the regexps on top of the stack, down to the
1235
// first marker, into a new op node (op == kRegexpAlternate
1236
// or op == kRegexpConcat).
1237
343k
void Regexp::ParseState::DoCollapse(RegexpOp op) {
1238
  // Scan backward to marker, counting children of composite.
1239
343k
  int n = 0;
1240
343k
  Regexp* next = NULL;
1241
343k
  Regexp* sub;
1242
1.11M
  for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1243
766k
    next = sub->down_;
1244
766k
    if (sub->op_ == op)
1245
6.48k
      n += sub->nsub_;
1246
760k
    else
1247
760k
      n++;
1248
766k
  }
1249
1250
  // If there's just one child, leave it alone.
1251
  // (Concat of one thing is that one thing; alternate of one thing is same.)
1252
343k
  if (stacktop_ != NULL && stacktop_->down_ == next)
1253
237k
    return;
1254
1255
  // Construct op (alternation or concatenation), flattening op of op.
1256
106k
  PODArray<Regexp*> subs(n);
1257
106k
  next = NULL;
1258
106k
  int i = n;
1259
635k
  for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1260
529k
    next = sub->down_;
1261
529k
    if (sub->op_ == op) {
1262
4.48k
      Regexp** sub_subs = sub->sub();
1263
93.0k
      for (int k = sub->nsub_ - 1; k >= 0; k--)
1264
88.5k
        subs[--i] = sub_subs[k]->Incref();
1265
4.48k
      sub->Decref();
1266
524k
    } else {
1267
524k
      subs[--i] = FinishRegexp(sub);
1268
524k
    }
1269
529k
  }
1270
1271
106k
  Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
1272
106k
  re->simple_ = re->ComputeSimple();
1273
106k
  re->down_ = next;
1274
106k
  stacktop_ = re;
1275
106k
}
1276
1277
// Finishes the current concatenation,
1278
// collapsing it into a single regexp on the stack.
1279
228k
void Regexp::ParseState::DoConcatenation() {
1280
228k
  Regexp* r1 = stacktop_;
1281
228k
  if (r1 == NULL || IsMarker(r1->op())) {
1282
    // empty concatenation is special case
1283
58.2k
    Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
1284
58.2k
    PushRegexp(re);
1285
58.2k
  }
1286
228k
  DoCollapse(kRegexpConcat);
1287
228k
}
1288
1289
// Finishes the current alternation,
1290
// collapsing it to a single regexp on the stack.
1291
114k
void Regexp::ParseState::DoAlternation() {
1292
114k
  DoVerticalBar();
1293
  // Now stack top is kVerticalBar.
1294
114k
  Regexp* r1 = stacktop_;
1295
114k
  stacktop_ = r1->down_;
1296
114k
  r1->Decref();
1297
114k
  DoCollapse(kRegexpAlternate);
1298
114k
}
1299
1300
// Incremental conversion of concatenated literals into strings.
1301
// If top two elements on stack are both literal or string,
1302
// collapse into single string.
1303
// Don't walk down the stack -- the parser calls this frequently
1304
// enough that below the bottom two is known to be collapsed.
1305
// Only called when another regexp is about to be pushed
1306
// on the stack, so that the topmost literal is not being considered.
1307
// (Otherwise ab* would turn into (ab)*.)
1308
// If r >= 0, consider pushing a literal r on the stack.
1309
// Return whether that happened.
1310
1.51M
bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1311
1.51M
  Regexp* re1;
1312
1.51M
  Regexp* re2;
1313
1.51M
  if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
1314
140k
    return false;
1315
1316
1.37M
  if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1317
851k
    return false;
1318
520k
  if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1319
263k
    return false;
1320
256k
  if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1321
541
    return false;
1322
1323
256k
  if (re2->op_ == kRegexpLiteral) {
1324
    // convert into string
1325
75.1k
    Rune rune = re2->rune_;
1326
75.1k
    re2->op_ = kRegexpLiteralString;
1327
75.1k
    re2->nrunes_ = 0;
1328
75.1k
    re2->runes_ = NULL;
1329
75.1k
    re2->AddRuneToString(rune);
1330
75.1k
  }
1331
1332
  // push re1 into re2.
1333
256k
  if (re1->op_ == kRegexpLiteral) {
1334
255k
    re2->AddRuneToString(re1->rune_);
1335
255k
  } else {
1336
37.7k
    for (int i = 0; i < re1->nrunes_; i++)
1337
36.5k
      re2->AddRuneToString(re1->runes_[i]);
1338
1.17k
    re1->nrunes_ = 0;
1339
1.17k
    delete[] re1->runes_;
1340
1.17k
    re1->runes_ = NULL;
1341
1.17k
  }
1342
1343
  // reuse re1 if possible
1344
256k
  if (r >= 0) {
1345
185k
    re1->op_ = kRegexpLiteral;
1346
185k
    re1->rune_ = r;
1347
185k
    re1->parse_flags_ = static_cast<uint16_t>(flags);
1348
185k
    return true;
1349
185k
  }
1350
1351
71.2k
  stacktop_ = re2;
1352
71.2k
  re1->Decref();
1353
71.2k
  return false;
1354
256k
}
1355
1356
// Lexing routines.
1357
1358
// Parses a decimal integer, storing it in *np.
1359
// Sets *s to span the remainder of the string.
1360
63.2k
static bool ParseInteger(absl::string_view* s, int* np) {
1361
63.2k
  if (s->empty() || !absl::ascii_isdigit((*s)[0] & 0xFF))
1362
5.48k
    return false;
1363
  // Disallow leading zeros.
1364
57.7k
  if (s->size() >= 2 && (*s)[0] == '0' && absl::ascii_isdigit((*s)[1] & 0xFF))
1365
653
    return false;
1366
57.0k
  int n = 0;
1367
57.0k
  int c;
1368
120k
  while (!s->empty() && absl::ascii_isdigit(c = (*s)[0] & 0xFF)) {
1369
    // Avoid overflow.
1370
63.6k
    if (n >= 100000000)
1371
288
      return false;
1372
63.3k
    n = n*10 + c - '0';
1373
63.3k
    s->remove_prefix(1);  // digit
1374
63.3k
  }
1375
56.8k
  *np = n;
1376
56.8k
  return true;
1377
57.0k
}
1378
1379
// Parses a repetition suffix like {1,2} or {2} or {2,}.
1380
// Sets *s to span the remainder of the string on success.
1381
// Sets *lo and *hi to the given range.
1382
// In the case of {2,}, the high number is unbounded;
1383
// sets *hi to -1 to signify this.
1384
// {,2} is NOT a valid suffix.
1385
// The Maybe in the name signifies that the regexp parse
1386
// doesn't fail even if ParseRepetition does, so the string_view
1387
// s must NOT be edited unless MaybeParseRepetition returns true.
1388
57.1k
static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) {
1389
57.1k
  absl::string_view s = *sp;
1390
57.1k
  if (s.empty() || s[0] != '{')
1391
0
    return false;
1392
57.1k
  s.remove_prefix(1);  // '{'
1393
57.1k
  if (!ParseInteger(&s, lo))
1394
5.49k
    return false;
1395
51.6k
  if (s.empty())
1396
83
    return false;
1397
51.5k
  if (s[0] == ',') {
1398
8.88k
    s.remove_prefix(1);  // ','
1399
8.88k
    if (s.empty())
1400
22
      return false;
1401
8.86k
    if (s[0] == '}') {
1402
      // {2,} means at least 2
1403
2.73k
      *hi = -1;
1404
6.12k
    } else {
1405
      // {2,4} means 2, 3, or 4.
1406
6.12k
      if (!ParseInteger(&s, hi))
1407
929
        return false;
1408
6.12k
    }
1409
42.6k
  } else {
1410
    // {2} means exactly two
1411
42.6k
    *hi = *lo;
1412
42.6k
  }
1413
50.5k
  if (s.empty() || s[0] != '}')
1414
2.19k
    return false;
1415
48.3k
  s.remove_prefix(1);  // '}'
1416
48.3k
  *sp = s;
1417
48.3k
  return true;
1418
50.5k
}
1419
1420
// Removes the next Rune from the string_view and stores it in *r.
1421
// Returns number of bytes removed from sp.
1422
// Behaves as though there is a terminating NUL at the end of sp.
1423
// Argument order is backwards from usual Google style
1424
// but consistent with chartorune.
1425
static int StringViewToRune(Rune* r, absl::string_view* sp,
1426
703k
                            RegexpStatus* status) {
1427
  // fullrune() takes int, not size_t. However, it just looks
1428
  // at the leading byte and treats any length >= 4 the same.
1429
703k
  if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
1430
703k
    int n = chartorune(r, sp->data());
1431
    // Some copies of chartorune have a bug that accepts
1432
    // encodings of values in (10FFFF, 1FFFFF] as valid.
1433
    // Those values break the character class algorithm,
1434
    // which assumes Runemax is the largest rune.
1435
703k
    if (*r > Runemax) {
1436
18
      n = 1;
1437
18
      *r = Runeerror;
1438
18
    }
1439
703k
    if (!(n == 1 && *r == Runeerror)) {  // no decoding error
1440
703k
      sp->remove_prefix(n);
1441
703k
      return n;
1442
703k
    }
1443
703k
  }
1444
1445
201
  if (status != NULL) {
1446
201
    status->set_code(kRegexpBadUTF8);
1447
201
    status->set_error_arg(absl::string_view());
1448
201
  }
1449
201
  return -1;
1450
703k
}
1451
1452
// Returns whether name is valid UTF-8.
1453
// If not, sets status to kRegexpBadUTF8.
1454
9.38k
static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) {
1455
9.38k
  absl::string_view t = s;
1456
9.38k
  Rune r;
1457
61.4k
  while (!t.empty()) {
1458
52.1k
    if (StringViewToRune(&r, &t, status) < 0)
1459
13
      return false;
1460
52.1k
  }
1461
9.36k
  return true;
1462
9.38k
}
1463
1464
// Is c a hex digit?
1465
3.92k
static int IsHex(int c) {
1466
3.92k
  return ('0' <= c && c <= '9') ||
1467
3.92k
         ('A' <= c && c <= 'F') ||
1468
3.92k
         ('a' <= c && c <= 'f');
1469
3.92k
}
1470
1471
// Convert hex digit to value.
1472
3.32k
static int UnHex(int c) {
1473
3.32k
  if ('0' <= c && c <= '9')
1474
1.55k
    return c - '0';
1475
1.76k
  if ('A' <= c && c <= 'F')
1476
827
    return c - 'A' + 10;
1477
939
  if ('a' <= c && c <= 'f')
1478
939
    return c - 'a' + 10;
1479
0
  ABSL_LOG(DFATAL) << "Bad hex digit " << c;
1480
0
  return 0;
1481
0
}
1482
1483
// Parse an escape sequence (e.g., \n, \{).
1484
// Sets *s to span the remainder of the string.
1485
// Sets *rp to the named character.
1486
static bool ParseEscape(absl::string_view* s, Rune* rp,
1487
6.17k
                        RegexpStatus* status, int rune_max) {
1488
6.17k
  const char* begin = s->data();
1489
6.17k
  if (s->empty() || (*s)[0] != '\\') {
1490
    // Should not happen - caller always checks.
1491
0
    status->set_code(kRegexpInternalError);
1492
0
    status->set_error_arg(absl::string_view());
1493
0
    return false;
1494
0
  }
1495
6.17k
  if (s->size() == 1) {
1496
62
    status->set_code(kRegexpTrailingBackslash);
1497
62
    status->set_error_arg(absl::string_view());
1498
62
    return false;
1499
62
  }
1500
6.11k
  Rune c, c1;
1501
6.11k
  s->remove_prefix(1);  // backslash
1502
6.11k
  if (StringViewToRune(&c, s, status) < 0)
1503
20
    return false;
1504
6.09k
  int code;
1505
6.09k
  switch (c) {
1506
533
    default:
1507
533
      if (c < Runeself && !absl::ascii_isalnum(c)) {
1508
        // Escaped non-word characters are always themselves.
1509
        // PCRE is not quite so rigorous: it accepts things like
1510
        // \q, but we don't.  We once rejected \_, but too many
1511
        // programs and people insist on using it, so allow \_.
1512
496
        *rp = c;
1513
496
        return true;
1514
496
      }
1515
37
      goto BadEscape;
1516
1517
    // Octal escapes.
1518
183
    case '1':
1519
420
    case '2':
1520
891
    case '3':
1521
1.09k
    case '4':
1522
1.31k
    case '5':
1523
1.53k
    case '6':
1524
1.62k
    case '7':
1525
      // Single non-zero octal digit is a backreference; not supported.
1526
1.62k
      if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
1527
18
        goto BadEscape;
1528
1.60k
      ABSL_FALLTHROUGH_INTENDED;
1529
2.15k
    case '0':
1530
      // consume up to three octal digits; already have one.
1531
2.15k
      code = c - '0';
1532
2.15k
      if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
1533
1.64k
        code = code * 8 + c - '0';
1534
1.64k
        s->remove_prefix(1);  // digit
1535
1.64k
        if (!s->empty()) {
1536
1.53k
          c = (*s)[0];
1537
1.53k
          if ('0' <= c && c <= '7') {
1538
345
            code = code * 8 + c - '0';
1539
345
            s->remove_prefix(1);  // digit
1540
345
          }
1541
1.53k
        }
1542
1.64k
      }
1543
2.15k
      if (code > rune_max)
1544
2
        goto BadEscape;
1545
2.15k
      *rp = code;
1546
2.15k
      return true;
1547
1548
    // Hexadecimal escapes
1549
1.59k
    case 'x':
1550
1.59k
      if (s->empty())
1551
1
        goto BadEscape;
1552
1.59k
      if (StringViewToRune(&c, s, status) < 0)
1553
3
        return false;
1554
1.59k
      if (c == '{') {
1555
        // Any number of digits in braces.
1556
        // Update n as we consume the string, so that
1557
        // the whole thing gets shown in the error message.
1558
        // Perl accepts any text at all; it ignores all text
1559
        // after the first non-hex digit.  We require only hex digits,
1560
        // and at least one.
1561
521
        if (StringViewToRune(&c, s, status) < 0)
1562
1
          return false;
1563
520
        int nhex = 0;
1564
520
        code = 0;
1565
1.86k
        while (IsHex(c)) {
1566
1.39k
          nhex++;
1567
1.39k
          code = code * 16 + UnHex(c);
1568
1.39k
          if (code > rune_max)
1569
27
            goto BadEscape;
1570
1.37k
          if (s->empty())
1571
22
            goto BadEscape;
1572
1.35k
          if (StringViewToRune(&c, s, status) < 0)
1573
1
            return false;
1574
1.35k
        }
1575
470
        if (c != '}' || nhex == 0)
1576
36
          goto BadEscape;
1577
434
        *rp = code;
1578
434
        return true;
1579
470
      }
1580
      // Easy case: two hex digits.
1581
1.07k
      if (s->empty())
1582
28
        goto BadEscape;
1583
1.04k
      if (StringViewToRune(&c1, s, status) < 0)
1584
1
        return false;
1585
1.04k
      if (!IsHex(c) || !IsHex(c1))
1586
81
        goto BadEscape;
1587
963
      *rp = UnHex(c) * 16 + UnHex(c1);
1588
963
      return true;
1589
1590
    // C escapes.
1591
651
    case 'n':
1592
651
      *rp = '\n';
1593
651
      return true;
1594
278
    case 'r':
1595
278
      *rp = '\r';
1596
278
      return true;
1597
207
    case 't':
1598
207
      *rp = '\t';
1599
207
      return true;
1600
1601
    // Less common C escapes.
1602
252
    case 'a':
1603
252
      *rp = '\a';
1604
252
      return true;
1605
201
    case 'f':
1606
201
      *rp = '\f';
1607
201
      return true;
1608
202
    case 'v':
1609
202
      *rp = '\v';
1610
202
      return true;
1611
1612
    // This code is disabled to avoid misparsing
1613
    // the Perl word-boundary \b as a backspace
1614
    // when in POSIX regexp mode.  Surprisingly,
1615
    // in Perl, \b means word-boundary but [\b]
1616
    // means backspace.  We don't support that:
1617
    // if you want a backspace embed a literal
1618
    // backspace character or use \x08.
1619
    //
1620
    // case 'b':
1621
    //   *rp = '\b';
1622
    //   return true;
1623
6.09k
  }
1624
1625
252
BadEscape:
1626
  // Unrecognized escape sequence.
1627
252
  status->set_code(kRegexpBadEscape);
1628
252
  status->set_error_arg(
1629
252
      absl::string_view(begin, static_cast<size_t>(s->data() - begin)));
1630
252
  return false;
1631
6.09k
}
1632
1633
// Add a range to the character class, but exclude newline if asked.
1634
// Also handle case folding.
1635
void CharClassBuilder::AddRangeFlags(
1636
2.04M
    Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1637
1638
  // Take out \n if the flags say so.
1639
2.04M
  bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1640
2.04M
               (parse_flags & Regexp::NeverNL);
1641
2.04M
  if (cutnl && lo <= '\n' && '\n' <= hi) {
1642
10.6k
    if (lo < '\n')
1643
9.04k
      AddRangeFlags(lo, '\n' - 1, parse_flags);
1644
10.6k
    if (hi > '\n')
1645
8.89k
      AddRangeFlags('\n' + 1, hi, parse_flags);
1646
10.6k
    return;
1647
10.6k
  }
1648
1649
  // If folding case, add fold-equivalent characters too.
1650
2.03M
  if (parse_flags & Regexp::FoldCase) {
1651
988k
    if (parse_flags & Regexp::Latin1) {
1652
61.1k
      AddFoldedRangeLatin1(this, lo, hi);
1653
926k
    } else {
1654
926k
      AddFoldedRange(this, lo, hi, 0);
1655
926k
    }
1656
1.04M
  } else {
1657
1.04M
    AddRange(lo, hi);
1658
1.04M
  }
1659
2.03M
}
1660
1661
// Look for a group with the given name.
1662
static const UGroup* LookupGroup(absl::string_view name,
1663
12.0k
                                 const UGroup* groups, int ngroups) {
1664
  // Simple name lookup.
1665
489k
  for (int i = 0; i < ngroups; i++)
1666
485k
    if (absl::string_view(groups[i].name) == name)
1667
7.70k
      return &groups[i];
1668
4.32k
  return NULL;
1669
12.0k
}
1670
1671
// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1672
1.15k
static const UGroup* LookupPosixGroup(absl::string_view name) {
1673
1.15k
  return LookupGroup(name, posix_groups, num_posix_groups);
1674
1.15k
}
1675
1676
6.28k
static const UGroup* LookupPerlGroup(absl::string_view name) {
1677
6.28k
  return LookupGroup(name, perl_groups, num_perl_groups);
1678
6.28k
}
1679
1680
#if !defined(RE2_USE_ICU)
1681
// Fake UGroup containing all Runes
1682
static URange16 any16[] = { { 0, 65535 } };
1683
static URange32 any32[] = { { 65536, Runemax } };
1684
static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
1685
1686
// Look for a Unicode group with the given name (e.g., "Han")
1687
4.60k
static const UGroup* LookupUnicodeGroup(absl::string_view name) {
1688
  // Special case: "Any" means any.
1689
4.60k
  if (name == absl::string_view("Any"))
1690
11
    return &anygroup;
1691
4.58k
  return LookupGroup(name, unicode_groups, num_unicode_groups);
1692
4.60k
}
1693
#endif
1694
1695
// Add a UGroup or its negation to the character class.
1696
static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
1697
10.1k
                      Regexp::ParseFlags parse_flags) {
1698
10.1k
  if (sign == +1) {
1699
733k
    for (int i = 0; i < g->nr16; i++) {
1700
727k
      cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1701
727k
    }
1702
510k
    for (int i = 0; i < g->nr32; i++) {
1703
505k
      cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1704
505k
    }
1705
5.45k
  } else {
1706
4.68k
    if (parse_flags & Regexp::FoldCase) {
1707
      // Normally adding a case-folded group means
1708
      // adding all the extra fold-equivalent runes too.
1709
      // But if we're adding the negation of the group,
1710
      // we have to exclude all the runes that are fold-equivalent
1711
      // to what's already missing.  Too hard, so do in two steps.
1712
2.42k
      CharClassBuilder ccb1;
1713
2.42k
      AddUGroup(&ccb1, g, +1, parse_flags);
1714
      // If the flags say to take out \n, put it in, so that negating will take it out.
1715
      // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
1716
2.42k
      bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1717
2.42k
                   (parse_flags & Regexp::NeverNL);
1718
2.42k
      if (cutnl) {
1719
1.00k
        ccb1.AddRange('\n', '\n');
1720
1.00k
      }
1721
2.42k
      ccb1.Negate();
1722
2.42k
      cc->AddCharClass(&ccb1);
1723
2.42k
      return;
1724
2.42k
    }
1725
2.25k
    int next = 0;
1726
406k
    for (int i = 0; i < g->nr16; i++) {
1727
404k
      if (next < g->r16[i].lo)
1728
404k
        cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
1729
404k
      next = g->r16[i].hi + 1;
1730
404k
    }
1731
271k
    for (int i = 0; i < g->nr32; i++) {
1732
268k
      if (next < g->r32[i].lo)
1733
268k
        cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
1734
268k
      next = g->r32[i].hi + 1;
1735
268k
    }
1736
2.25k
    if (next <= Runemax)
1737
2.25k
      cc->AddRangeFlags(next, Runemax, parse_flags);
1738
2.25k
  }
1739
10.1k
}
1740
1741
// Maybe parse a Perl character class escape sequence.
1742
// Only recognizes the Perl character classes (\d \s \w \D \S \W),
1743
// not the Perl empty-string classes (\b \B \A \Z \z).
1744
// On success, sets *s to span the remainder of the string
1745
// and returns the corresponding UGroup.
1746
// The string_view must *NOT* be edited unless the call succeeds.
1747
const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
1748
74.1k
                                     Regexp::ParseFlags parse_flags) {
1749
74.1k
  if (!(parse_flags & Regexp::PerlClasses))
1750
14.5k
    return NULL;
1751
59.5k
  if (s->size() < 2 || (*s)[0] != '\\')
1752
53.2k
    return NULL;
1753
  // Could use StringViewToRune, but there aren't
1754
  // any non-ASCII Perl group names.
1755
6.28k
  absl::string_view name(s->data(), 2);
1756
6.28k
  const UGroup* g = LookupPerlGroup(name);
1757
6.28k
  if (g == NULL)
1758
4.21k
    return NULL;
1759
2.06k
  s->remove_prefix(name.size());
1760
2.06k
  return g;
1761
6.28k
}
1762
1763
enum ParseStatus {
1764
  kParseOk,  // Did some parsing.
1765
  kParseError,  // Found an error.
1766
  kParseNothing,  // Decided not to parse.
1767
};
1768
1769
// Maybe parses a Unicode character group like \p{Han} or \P{Han}
1770
// (the latter is a negated group).
1771
ParseStatus ParseUnicodeGroup(absl::string_view* s,
1772
                              Regexp::ParseFlags parse_flags,
1773
4.66k
                              CharClassBuilder* cc, RegexpStatus* status) {
1774
  // Decide whether to parse.
1775
4.66k
  if (!(parse_flags & Regexp::UnicodeGroups))
1776
2
    return kParseNothing;
1777
4.66k
  if (s->size() < 2 || (*s)[0] != '\\')
1778
0
    return kParseNothing;
1779
4.66k
  Rune c = (*s)[1];
1780
4.66k
  if (c != 'p' && c != 'P')
1781
0
    return kParseNothing;
1782
1783
  // Committed to parse.  Results:
1784
4.66k
  int sign = +1;  // -1 = negated char class
1785
4.66k
  if (c == 'P')
1786
3.24k
    sign = -sign;
1787
4.66k
  absl::string_view seq = *s;  // \p{Han} or \pL
1788
4.66k
  absl::string_view name;  // Han or L
1789
4.66k
  s->remove_prefix(2);  // '\\', 'p'
1790
1791
4.66k
  if (!StringViewToRune(&c, s, status))
1792
0
    return kParseError;
1793
4.66k
  if (c != '{') {
1794
    // Name is the bit of string we just skipped over for c.
1795
4.34k
    const char* p = seq.data() + 2;
1796
4.34k
    name = absl::string_view(p, static_cast<size_t>(s->data() - p));
1797
4.34k
  } else {
1798
    // Name is in braces. Look for closing }
1799
318
    size_t end = s->find('}', 0);
1800
318
    if (end == absl::string_view::npos) {
1801
57
      if (!IsValidUTF8(seq, status))
1802
4
        return kParseError;
1803
53
      status->set_code(kRegexpBadCharRange);
1804
53
      status->set_error_arg(seq);
1805
53
      return kParseError;
1806
57
    }
1807
261
    name = absl::string_view(s->data(), end);  // without '}'
1808
261
    s->remove_prefix(end + 1);  // with '}'
1809
261
    if (!IsValidUTF8(name, status))
1810
3
      return kParseError;
1811
261
  }
1812
1813
  // Chop seq where s now begins.
1814
4.60k
  seq = absl::string_view(seq.data(), static_cast<size_t>(s->data() - seq.data()));
1815
1816
4.60k
  if (!name.empty() && name[0] == '^') {
1817
4
    sign = -sign;
1818
4
    name.remove_prefix(1);  // '^'
1819
4
  }
1820
1821
4.60k
#if !defined(RE2_USE_ICU)
1822
  // Look up the group in the RE2 Unicode data.
1823
4.60k
  const UGroup* g = LookupUnicodeGroup(name);
1824
4.60k
  if (g == NULL) {
1825
102
    status->set_code(kRegexpBadCharRange);
1826
102
    status->set_error_arg(seq);
1827
102
    return kParseError;
1828
102
  }
1829
1830
4.49k
  AddUGroup(cc, g, sign, parse_flags);
1831
#else
1832
  // Look up the group in the ICU Unicode data. Because ICU provides full
1833
  // Unicode properties support, this could be more than a lookup by name.
1834
  ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1835
      std::string("\\p{") + std::string(name) + std::string("}"));
1836
  UErrorCode uerr = U_ZERO_ERROR;
1837
  ::icu::UnicodeSet uset(ustr, uerr);
1838
  if (U_FAILURE(uerr)) {
1839
    status->set_code(kRegexpBadCharRange);
1840
    status->set_error_arg(seq);
1841
    return kParseError;
1842
  }
1843
1844
  // Convert the UnicodeSet to a URange32 and UGroup that we can add.
1845
  int nr = uset.getRangeCount();
1846
  PODArray<URange32> r(nr);
1847
  for (int i = 0; i < nr; i++) {
1848
    r[i].lo = uset.getRangeStart(i);
1849
    r[i].hi = uset.getRangeEnd(i);
1850
  }
1851
  UGroup g = {"", +1, 0, 0, r.data(), nr};
1852
  AddUGroup(cc, &g, sign, parse_flags);
1853
#endif
1854
1855
4.49k
  return kParseOk;
1856
4.60k
}
1857
1858
// Parses a character class name like [:alnum:].
1859
// Sets *s to span the remainder of the string.
1860
// Adds the ranges corresponding to the class to ranges.
1861
static ParseStatus ParseCCName(absl::string_view* s,
1862
                               Regexp::ParseFlags parse_flags,
1863
1.46k
                               CharClassBuilder* cc, RegexpStatus* status) {
1864
  // Check begins with [:
1865
1.46k
  const char* p = s->data();
1866
1.46k
  const char* ep = s->data() + s->size();
1867
1.46k
  if (ep - p < 2 || p[0] != '[' || p[1] != ':')
1868
0
    return kParseNothing;
1869
1870
  // Look for closing :].
1871
1.46k
  const char* q;
1872
19.6k
  for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
1873
18.1k
    ;
1874
1875
  // If no closing :], then ignore.
1876
1.46k
  if (q > ep-2)
1877
314
    return kParseNothing;
1878
1879
  // Got it.  Check that it's valid.
1880
1.15k
  q += 2;
1881
1.15k
  absl::string_view name(p, static_cast<size_t>(q - p));
1882
1883
1.15k
  const UGroup* g = LookupPosixGroup(name);
1884
1.15k
  if (g == NULL) {
1885
7
    status->set_code(kRegexpBadCharRange);
1886
7
    status->set_error_arg(name);
1887
7
    return kParseError;
1888
7
  }
1889
1890
1.14k
  s->remove_prefix(name.size());
1891
1.14k
  AddUGroup(cc, g, g->sign, parse_flags);
1892
1.14k
  return kParseOk;
1893
1.15k
}
1894
1895
// Parses a character inside a character class.
1896
// There are fewer special characters here than in the rest of the regexp.
1897
// Sets *s to span the remainder of the string.
1898
// Sets *rp to the character.
1899
bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
1900
                                          absl::string_view whole_class,
1901
81.9k
                                          RegexpStatus* status) {
1902
81.9k
  if (s->empty()) {
1903
0
    status->set_code(kRegexpMissingBracket);
1904
0
    status->set_error_arg(whole_class);
1905
0
    return false;
1906
0
  }
1907
1908
  // Allow regular escape sequences even though
1909
  // many need not be escaped in this context.
1910
81.9k
  if ((*s)[0] == '\\')
1911
2.55k
    return ParseEscape(s, rp, status, rune_max_);
1912
1913
  // Otherwise take the next rune.
1914
79.4k
  return StringViewToRune(rp, s, status) >= 0;
1915
81.9k
}
1916
1917
// Parses a character class character, or, if the character
1918
// is followed by a hyphen, parses a character class range.
1919
// For single characters, rr->lo == rr->hi.
1920
// Sets *s to span the remainder of the string.
1921
// Sets *rp to the character.
1922
bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
1923
                                      absl::string_view whole_class,
1924
68.4k
                                      RegexpStatus* status) {
1925
68.4k
  absl::string_view os = *s;
1926
68.4k
  if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1927
18
    return false;
1928
  // [a-] means (a|-), so check for final ].
1929
68.4k
  if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
1930
13.5k
    s->remove_prefix(1);  // '-'
1931
13.5k
    if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1932
1
      return false;
1933
13.5k
    if (rr->hi < rr->lo) {
1934
2
      status->set_code(kRegexpBadCharRange);
1935
2
      status->set_error_arg(absl::string_view(
1936
2
          os.data(), static_cast<size_t>(s->data() - os.data())));
1937
2
      return false;
1938
2
    }
1939
54.8k
  } else {
1940
54.8k
    rr->hi = rr->lo;
1941
54.8k
  }
1942
68.4k
  return true;
1943
68.4k
}
1944
1945
// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1946
// Sets *s to span the remainder of the string.
1947
// Sets *out_re to the regexp for the class.
1948
bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
1949
25.7k
                                        RegexpStatus* status) {
1950
25.7k
  absl::string_view whole_class = *s;
1951
25.7k
  if (s->empty() || (*s)[0] != '[') {
1952
    // Caller checked this.
1953
0
    status->set_code(kRegexpInternalError);
1954
0
    status->set_error_arg(absl::string_view());
1955
0
    return false;
1956
0
  }
1957
25.7k
  bool negated = false;
1958
25.7k
  Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
1959
25.7k
  re->ccb_ = new CharClassBuilder;
1960
25.7k
  s->remove_prefix(1);  // '['
1961
25.7k
  if (!s->empty() && (*s)[0] == '^') {
1962
4.67k
    s->remove_prefix(1);  // '^'
1963
4.67k
    negated = true;
1964
4.67k
    if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
1965
      // If NL can't match implicitly, then pretend
1966
      // negated classes include a leading \n.
1967
3.50k
      re->ccb_->AddRange('\n', '\n');
1968
3.50k
    }
1969
4.67k
  }
1970
25.7k
  bool first = true;  // ] is okay as first char in class
1971
95.5k
  while (!s->empty() && ((*s)[0] != ']' || first)) {
1972
    // - is only okay unescaped as first or last in class.
1973
    // Except that Perl allows - anywhere.
1974
69.8k
    if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
1975
69.8k
        (s->size() == 1 || (*s)[1] != ']')) {
1976
12
      absl::string_view t = *s;
1977
12
      t.remove_prefix(1);  // '-'
1978
12
      Rune r;
1979
12
      int n = StringViewToRune(&r, &t, status);
1980
12
      if (n < 0) {
1981
4
        re->Decref();
1982
4
        return false;
1983
4
      }
1984
8
      status->set_code(kRegexpBadCharRange);
1985
8
      status->set_error_arg(absl::string_view(s->data(), 1+n));
1986
8
      re->Decref();
1987
8
      return false;
1988
12
    }
1989
69.7k
    first = false;
1990
1991
    // Look for [:alnum:] etc.
1992
69.7k
    if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
1993
1.46k
      switch (ParseCCName(s, flags_, re->ccb_, status)) {
1994
1.14k
        case kParseOk:
1995
1.14k
          continue;
1996
7
        case kParseError:
1997
7
          re->Decref();
1998
7
          return false;
1999
314
        case kParseNothing:
2000
314
          break;
2001
1.46k
      }
2002
1.46k
    }
2003
2004
    // Look for Unicode character group like \p{Han}
2005
68.6k
    if (s->size() > 2 &&
2006
68.6k
        (*s)[0] == '\\' &&
2007
68.6k
        ((*s)[1] == 'p' || (*s)[1] == 'P')) {
2008
54
      switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
2009
51
        case kParseOk:
2010
51
          continue;
2011
2
        case kParseError:
2012
2
          re->Decref();
2013
2
          return false;
2014
1
        case kParseNothing:
2015
1
          break;
2016
54
      }
2017
54
    }
2018
2019
    // Look for Perl character class symbols (extension).
2020
68.5k
    const UGroup* g = MaybeParsePerlCCEscape(s, flags_);
2021
68.5k
    if (g != NULL) {
2022
171
      AddUGroup(re->ccb_, g, g->sign, flags_);
2023
171
      continue;
2024
171
    }
2025
2026
    // Otherwise assume single character or simple range.
2027
68.4k
    RuneRange rr;
2028
68.4k
    if (!ParseCCRange(s, &rr, whole_class, status)) {
2029
21
      re->Decref();
2030
21
      return false;
2031
21
    }
2032
    // AddRangeFlags is usually called in response to a class like
2033
    // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
2034
    // Regexp::ClassNL is set.  In an explicit range or singleton
2035
    // like we just parsed, we do not filter \n out, so set ClassNL
2036
    // in the flags.
2037
68.4k
    re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
2038
68.4k
  }
2039
25.7k
  if (s->empty()) {
2040
309
    status->set_code(kRegexpMissingBracket);
2041
309
    status->set_error_arg(whole_class);
2042
309
    re->Decref();
2043
309
    return false;
2044
309
  }
2045
25.3k
  s->remove_prefix(1);  // ']'
2046
2047
25.3k
  if (negated)
2048
4.66k
    re->ccb_->Negate();
2049
2050
25.3k
  *out_re = re;
2051
25.3k
  return true;
2052
25.7k
}
2053
2054
// Returns whether name is a valid capture name.
2055
8.99k
static bool IsValidCaptureName(absl::string_view name) {
2056
8.99k
  if (name.empty())
2057
1
    return false;
2058
2059
  // Historically, we effectively used [0-9A-Za-z_]+ to validate; that
2060
  // followed Python 2 except for not restricting the first character.
2061
  // As of Python 3, Unicode characters beyond ASCII are also allowed;
2062
  // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
2063
  // Pc categories, but again without restricting the first character.
2064
  // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
2065
  // performs it for identifiers, but seemingly not for capture names;
2066
  // if they start doing that for capture names, we won't follow suit.
2067
8.99k
  static const CharClass* const cc = []() {
2068
1
    CharClassBuilder ccb;
2069
1
    for (absl::string_view group :
2070
1
         {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
2071
10
      AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups),
2072
10
                +1, Regexp::NoParseFlags);
2073
1
    return ccb.GetCharClass();
2074
1
  }();
2075
2076
8.99k
  absl::string_view t = name;
2077
8.99k
  Rune r;
2078
49.3k
  while (!t.empty()) {
2079
40.4k
    if (StringViewToRune(&r, &t, NULL) < 0)
2080
0
      return false;
2081
40.4k
    if (cc->Contains(r))
2082
40.3k
      continue;
2083
35
    return false;
2084
40.4k
  }
2085
8.95k
  return true;
2086
8.99k
}
2087
2088
// Parses a Perl flag setting or non-capturing group or both,
2089
// like (?i) or (?: or (?i:.  Removes from s, updates parse state.
2090
// The caller must check that s begins with "(?".
2091
// Returns true on success.  If the Perl flag is not
2092
// well-formed or not supported, sets status_ and returns false.
2093
10.5k
bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
2094
10.5k
  absl::string_view t = *s;
2095
2096
  // Caller is supposed to check this.
2097
10.5k
  if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
2098
0
    status_->set_code(kRegexpInternalError);
2099
0
    ABSL_LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
2100
0
    return false;
2101
0
  }
2102
2103
  // Check for look-around assertions. This is NOT because we support them! ;)
2104
  // As per https://github.com/google/re2/issues/468, we really want to report
2105
  // kRegexpBadPerlOp (not kRegexpBadNamedCapture) for look-behind assertions.
2106
  // Additionally, it would be nice to report not "(?<", but "(?<=" or "(?<!".
2107
10.5k
  if ((t.size() > 3 && (t[2] == '=' || t[2] == '!')) ||
2108
10.5k
      (t.size() > 4 && t[2] == '<' && (t[3] == '=' || t[3] == '!'))) {
2109
4
    status_->set_code(kRegexpBadPerlOp);
2110
4
    status_->set_error_arg(absl::string_view(t.data(), t[2] == '<' ? 4 : 3));
2111
4
    return false;
2112
4
  }
2113
2114
  // Check for named captures, first introduced in Python's regexp library.
2115
  // As usual, there are three slightly different syntaxes:
2116
  //
2117
  //   (?P<name>expr)   the original, introduced by Python
2118
  //   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
2119
  //   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
2120
  //
2121
  // Perl 5.10 gave in and implemented the Python version too,
2122
  // but they claim that the last two are the preferred forms.
2123
  // PCRE and languages based on it (specifically, PHP and Ruby)
2124
  // support all three as well.  EcmaScript 4 uses only the Python form.
2125
  //
2126
  // In both the open source world (via Code Search) and the
2127
  // Google source tree, (?P<name>expr) and (?<name>expr) are the
2128
  // dominant forms of named captures and both are supported.
2129
10.4k
  if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
2130
10.4k
      (t.size() > 3 && t[2] == '<')) {
2131
    // Pull out name.
2132
9.06k
    size_t begin = t[2] == 'P' ? 4 : 3;
2133
9.06k
    size_t end = t.find('>', begin);
2134
9.06k
    if (end == absl::string_view::npos) {
2135
68
      if (!IsValidUTF8(t, status_))
2136
3
        return false;
2137
65
      status_->set_code(kRegexpBadNamedCapture);
2138
65
      status_->set_error_arg(t);
2139
65
      return false;
2140
68
    }
2141
2142
8.99k
    absl::string_view capture(t.data(), end+1);
2143
8.99k
    absl::string_view name(t.data()+begin, end-begin);
2144
8.99k
    if (!IsValidUTF8(name, status_))
2145
3
      return false;
2146
8.99k
    if (!IsValidCaptureName(name)) {
2147
36
      status_->set_code(kRegexpBadNamedCapture);
2148
36
      status_->set_error_arg(capture);
2149
36
      return false;
2150
36
    }
2151
2152
8.95k
    if (!DoLeftParen(name)) {
2153
      // DoLeftParen's failure set status_.
2154
0
      return false;
2155
0
    }
2156
2157
8.95k
    s->remove_prefix(capture.size());
2158
8.95k
    return true;
2159
8.95k
  }
2160
2161
1.43k
  t.remove_prefix(2);  // "(?"
2162
2163
1.43k
  bool negated = false;
2164
1.43k
  bool sawflags = false;
2165
1.43k
  int nflags = flags_;
2166
1.43k
  Rune c;
2167
5.44k
  for (bool done = false; !done; ) {
2168
4.06k
    if (t.empty())
2169
21
      goto BadPerlOp;
2170
4.04k
    if (StringViewToRune(&c, &t, status_) < 0)
2171
6
      return false;
2172
4.04k
    switch (c) {
2173
32
      default:
2174
32
        goto BadPerlOp;
2175
2176
      // Parse flags.
2177
1.10k
      case 'i':
2178
1.10k
        sawflags = true;
2179
1.10k
        if (negated)
2180
366
          nflags &= ~FoldCase;
2181
742
        else
2182
742
          nflags |= FoldCase;
2183
1.10k
        break;
2184
2185
597
      case 'm':  // opposite of our OneLine
2186
597
        sawflags = true;
2187
597
        if (negated)
2188
387
          nflags |= OneLine;
2189
210
        else
2190
210
          nflags &= ~OneLine;
2191
597
        break;
2192
2193
78
      case 's':
2194
78
        sawflags = true;
2195
78
        if (negated)
2196
39
          nflags &= ~DotNL;
2197
39
        else
2198
39
          nflags |= DotNL;
2199
78
        break;
2200
2201
556
      case 'U':
2202
556
        sawflags = true;
2203
556
        if (negated)
2204
250
          nflags &= ~NonGreedy;
2205
306
        else
2206
306
          nflags |= NonGreedy;
2207
556
        break;
2208
2209
      // Negation
2210
296
      case '-':
2211
296
        if (negated)
2212
1
          goto BadPerlOp;
2213
295
        negated = true;
2214
295
        sawflags = false;
2215
295
        break;
2216
2217
      // Open new group.
2218
225
      case ':':
2219
225
        if (!DoLeftParenNoCapture()) {
2220
          // DoLeftParenNoCapture's failure set status_.
2221
0
          return false;
2222
0
        }
2223
225
        done = true;
2224
225
        break;
2225
2226
      // Finish flags.
2227
1.15k
      case ')':
2228
1.15k
        done = true;
2229
1.15k
        break;
2230
4.04k
    }
2231
4.04k
  }
2232
2233
1.37k
  if (negated && !sawflags)
2234
1
    goto BadPerlOp;
2235
2236
1.37k
  flags_ = static_cast<Regexp::ParseFlags>(nflags);
2237
1.37k
  *s = t;
2238
1.37k
  return true;
2239
2240
55
BadPerlOp:
2241
55
  status_->set_code(kRegexpBadPerlOp);
2242
55
  status_->set_error_arg(
2243
55
      absl::string_view(s->data(), static_cast<size_t>(t.data() - s->data())));
2244
55
  return false;
2245
1.37k
}
2246
2247
// Converts latin1 (assumed to be encoded as Latin1 bytes)
2248
// into UTF8 encoding in string.
2249
// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2250
// deprecated and because it rejects code points 0x80-0x9F.
2251
16.4k
void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) {
2252
16.4k
  char buf[UTFmax];
2253
2254
16.4k
  utf->clear();
2255
1.17M
  for (size_t i = 0; i < latin1.size(); i++) {
2256
1.15M
    Rune r = latin1[i] & 0xFF;
2257
1.15M
    int n = runetochar(buf, &r);
2258
1.15M
    utf->append(buf, n);
2259
1.15M
  }
2260
16.4k
}
2261
2262
// Parses the regular expression given by s,
2263
// returning the corresponding Regexp tree.
2264
// The caller must Decref the return value when done with it.
2265
// Returns NULL on error.
2266
Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
2267
32.6k
                      RegexpStatus* status) {
2268
  // Make status non-NULL (easier on everyone else).
2269
32.6k
  RegexpStatus xstatus;
2270
32.6k
  if (status == NULL)
2271
0
    status = &xstatus;
2272
2273
32.6k
  ParseState ps(global_flags, s, status);
2274
32.6k
  absl::string_view t = s;
2275
2276
  // Convert regexp to UTF-8 (easier on the rest of the parser).
2277
32.6k
  if (global_flags & Latin1) {
2278
16.4k
    std::string* tmp = new std::string;
2279
16.4k
    ConvertLatin1ToUTF8(t, tmp);
2280
16.4k
    status->set_tmp(tmp);
2281
16.4k
    t = *tmp;
2282
16.4k
  }
2283
2284
32.6k
  if (global_flags & Literal) {
2285
    // Special parse loop for literal string.
2286
17.8k
    while (!t.empty()) {
2287
16.5k
      Rune r;
2288
16.5k
      if (StringViewToRune(&r, &t, status) < 0)
2289
34
        return NULL;
2290
16.5k
      if (!ps.PushLiteral(r))
2291
0
        return NULL;
2292
16.5k
    }
2293
1.25k
    return ps.DoFinish();
2294
1.28k
  }
2295
2296
31.3k
  absl::string_view lastunary = absl::string_view();
2297
1.09M
  while (!t.empty()) {
2298
1.06M
    absl::string_view isunary = absl::string_view();
2299
1.06M
    switch (t[0]) {
2300
493k
      default: {
2301
493k
        Rune r;
2302
493k
        if (StringViewToRune(&r, &t, status) < 0)
2303
92
          return NULL;
2304
493k
        if (!ps.PushLiteral(r))
2305
0
          return NULL;
2306
493k
        break;
2307
493k
      }
2308
2309
493k
      case '(':
2310
        // "(?" introduces Perl escape.
2311
85.9k
        if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
2312
          // Flag changes and non-capturing groups.
2313
10.5k
          if (!ps.ParsePerlFlags(&t))
2314
172
            return NULL;
2315
10.3k
          break;
2316
10.5k
        }
2317
75.4k
        if (ps.flags() & NeverCapture) {
2318
37.3k
          if (!ps.DoLeftParenNoCapture())
2319
0
            return NULL;
2320
38.0k
        } else {
2321
38.0k
          if (!ps.DoLeftParen(absl::string_view()))
2322
0
            return NULL;
2323
38.0k
        }
2324
75.4k
        t.remove_prefix(1);  // '('
2325
75.4k
        break;
2326
2327
113k
      case '|':
2328
113k
        if (!ps.DoVerticalBar())
2329
0
          return NULL;
2330
113k
        t.remove_prefix(1);  // '|'
2331
113k
        break;
2332
2333
83.6k
      case ')':
2334
83.6k
        if (!ps.DoRightParen())
2335
51
          return NULL;
2336
83.6k
        t.remove_prefix(1);  // ')'
2337
83.6k
        break;
2338
2339
26.1k
      case '^':  // Beginning of line.
2340
26.1k
        if (!ps.PushCaret())
2341
0
          return NULL;
2342
26.1k
        t.remove_prefix(1);  // '^'
2343
26.1k
        break;
2344
2345
51.4k
      case '$':  // End of line.
2346
51.4k
        if (!ps.PushDollar())
2347
0
          return NULL;
2348
51.4k
        t.remove_prefix(1);  // '$'
2349
51.4k
        break;
2350
2351
13.8k
      case '.':  // Any character (possibly except newline).
2352
13.8k
        if (!ps.PushDot())
2353
0
          return NULL;
2354
13.8k
        t.remove_prefix(1);  // '.'
2355
13.8k
        break;
2356
2357
25.7k
      case '[': {  // Character class.
2358
25.7k
        Regexp* re;
2359
25.7k
        if (!ps.ParseCharClass(&t, &re, status))
2360
351
          return NULL;
2361
25.3k
        if (!ps.PushRegexp(re))
2362
0
          return NULL;
2363
25.3k
        break;
2364
25.3k
      }
2365
2366
39.8k
      case '*': {  // Zero or more.
2367
39.8k
        RegexpOp op;
2368
39.8k
        op = kRegexpStar;
2369
39.8k
        goto Rep;
2370
19.3k
      case '+':  // One or more.
2371
19.3k
        op = kRegexpPlus;
2372
19.3k
        goto Rep;
2373
39.4k
      case '?':  // Zero or one.
2374
39.4k
        op = kRegexpQuest;
2375
39.4k
        goto Rep;
2376
98.7k
      Rep:
2377
98.7k
        absl::string_view opstr = t;
2378
98.7k
        bool nongreedy = false;
2379
98.7k
        t.remove_prefix(1);  // '*' or '+' or '?'
2380
98.7k
        if (ps.flags() & PerlX) {
2381
21.1k
          if (!t.empty() && t[0] == '?') {
2382
2.01k
            nongreedy = true;
2383
2.01k
            t.remove_prefix(1);  // '?'
2384
2.01k
          }
2385
21.1k
          if (!lastunary.empty()) {
2386
            // In Perl it is not allowed to stack repetition operators:
2387
            //   a** is a syntax error, not a double-star.
2388
            // (and a++ means something else entirely, which we don't support!)
2389
8
            status->set_code(kRegexpRepeatOp);
2390
8
            status->set_error_arg(absl::string_view(
2391
8
                lastunary.data(),
2392
8
                static_cast<size_t>(t.data() - lastunary.data())));
2393
8
            return NULL;
2394
8
          }
2395
21.1k
        }
2396
98.7k
        opstr = absl::string_view(opstr.data(),
2397
98.7k
                                  static_cast<size_t>(t.data() - opstr.data()));
2398
98.7k
        if (!ps.PushRepeatOp(op, opstr, nongreedy))
2399
16
          return NULL;
2400
98.7k
        isunary = opstr;
2401
98.7k
        break;
2402
98.7k
      }
2403
2404
57.1k
      case '{': {  // Counted repetition.
2405
57.1k
        int lo, hi;
2406
57.1k
        absl::string_view opstr = t;
2407
57.1k
        if (!MaybeParseRepetition(&t, &lo, &hi)) {
2408
          // Treat like a literal.
2409
8.71k
          if (!ps.PushLiteral('{'))
2410
0
            return NULL;
2411
8.71k
          t.remove_prefix(1);  // '{'
2412
8.71k
          break;
2413
8.71k
        }
2414
48.3k
        bool nongreedy = false;
2415
48.3k
        if (ps.flags() & PerlX) {
2416
27.7k
          if (!t.empty() && t[0] == '?') {
2417
330
            nongreedy = true;
2418
330
            t.remove_prefix(1);  // '?'
2419
330
          }
2420
27.7k
          if (!lastunary.empty()) {
2421
            // Not allowed to stack repetition operators.
2422
2
            status->set_code(kRegexpRepeatOp);
2423
2
            status->set_error_arg(absl::string_view(
2424
2
                lastunary.data(),
2425
2
                static_cast<size_t>(t.data() - lastunary.data())));
2426
2
            return NULL;
2427
2
          }
2428
27.7k
        }
2429
48.3k
        opstr = absl::string_view(opstr.data(),
2430
48.3k
                                  static_cast<size_t>(t.data() - opstr.data()));
2431
48.3k
        if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2432
82
          return NULL;
2433
48.3k
        isunary = opstr;
2434
48.3k
        break;
2435
48.3k
      }
2436
2437
17.3k
      case '\\': {  // Escaped character or Perl sequence.
2438
        // \b and \B: word boundary or not
2439
17.3k
        if ((ps.flags() & Regexp::PerlB) &&
2440
17.3k
            t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
2441
3.55k
          if (!ps.PushWordBoundary(t[1] == 'b'))
2442
0
            return NULL;
2443
3.55k
          t.remove_prefix(2);  // '\\', 'b'
2444
3.55k
          break;
2445
3.55k
        }
2446
2447
13.8k
        if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
2448
11.9k
          if (t[1] == 'A') {
2449
210
            if (!ps.PushSimpleOp(kRegexpBeginText))
2450
0
              return NULL;
2451
210
            t.remove_prefix(2);  // '\\', 'A'
2452
210
            break;
2453
210
          }
2454
11.7k
          if (t[1] == 'z') {
2455
765
            if (!ps.PushSimpleOp(kRegexpEndText))
2456
0
              return NULL;
2457
765
            t.remove_prefix(2);  // '\\', 'z'
2458
765
            break;
2459
765
          }
2460
          // Do not recognize \Z, because this library can't
2461
          // implement the exact Perl/PCRE semantics.
2462
          // (This library treats "(?-m)$" as \z, even though
2463
          // in Perl and PCRE it is equivalent to \Z.)
2464
2465
11.0k
          if (t[1] == 'C') {  // \C: any byte [sic]
2466
2.32k
            if (!ps.PushSimpleOp(kRegexpAnyByte))
2467
0
              return NULL;
2468
2.32k
            t.remove_prefix(2);  // '\\', 'C'
2469
2.32k
            break;
2470
2.32k
          }
2471
2472
8.69k
          if (t[1] == 'Q') {  // \Q ... \E: the ... is always literals
2473
399
            t.remove_prefix(2);  // '\\', 'Q'
2474
2.73k
            while (!t.empty()) {
2475
2.54k
              if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
2476
201
                t.remove_prefix(2);  // '\\', 'E'
2477
201
                break;
2478
201
              }
2479
2.34k
              Rune r;
2480
2.34k
              if (StringViewToRune(&r, &t, status) < 0)
2481
8
                return NULL;
2482
2.33k
              if (!ps.PushLiteral(r))
2483
0
                return NULL;
2484
2.33k
            }
2485
391
            break;
2486
399
          }
2487
8.69k
        }
2488
2489
10.1k
        if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
2490
4.60k
          Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2491
4.60k
          re->ccb_ = new CharClassBuilder;
2492
4.60k
          switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2493
4.44k
            case kParseOk:
2494
4.44k
              if (!ps.PushRegexp(re))
2495
0
                return NULL;
2496
4.44k
              goto Break2;
2497
4.44k
            case kParseError:
2498
160
              re->Decref();
2499
160
              return NULL;
2500
1
            case kParseNothing:
2501
1
              re->Decref();
2502
1
              break;
2503
4.60k
          }
2504
4.60k
        }
2505
2506
5.51k
        const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags());
2507
5.51k
        if (g != NULL) {
2508
1.89k
          Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2509
1.89k
          re->ccb_ = new CharClassBuilder;
2510
1.89k
          AddUGroup(re->ccb_, g, g->sign, ps.flags());
2511
1.89k
          if (!ps.PushRegexp(re))
2512
0
            return NULL;
2513
1.89k
          break;
2514
1.89k
        }
2515
2516
3.62k
        Rune r;
2517
3.62k
        if (!ParseEscape(&t, &r, status, ps.rune_max()))
2518
332
          return NULL;
2519
3.28k
        if (!ps.PushLiteral(r))
2520
0
          return NULL;
2521
3.28k
        break;
2522
3.28k
      }
2523
1.06M
    }
2524
1.06M
  Break2:
2525
1.06M
    lastunary = isunary;
2526
1.06M
  }
2527
30.0k
  return ps.DoFinish();
2528
31.3k
}
2529
2530
}  // namespace re2