Coverage Report

Created: 2025-06-09 06:44

/src/re2/re2/parse.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Regular expression parser.
6
7
// The parser is a simple precedence-based parser with a
8
// manual stack.  The parsing work is done by the methods
9
// of the ParseState class.  The Regexp::Parse function is
10
// essentially just a lexer that calls the ParseState method
11
// for each token.
12
13
// The parser recognizes POSIX extended regular expressions
14
// excluding backreferences, collating elements, and collating
15
// classes.  It also allows the empty string as a regular expression
16
// and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.
17
// See regexp.h for rationale.
18
19
#include <stddef.h>
20
#include <stdint.h>
21
#include <string.h>
22
23
#include <algorithm>
24
#include <string>
25
#include <vector>
26
27
#include "absl/base/attributes.h"
28
#include "absl/base/macros.h"
29
#include "absl/log/absl_log.h"
30
#include "absl/strings/ascii.h"
31
#include "absl/strings/string_view.h"
32
#include "re2/pod_array.h"
33
#include "re2/regexp.h"
34
#include "re2/unicode_casefold.h"
35
#include "re2/unicode_groups.h"
36
#include "re2/walker-inl.h"
37
#include "util/utf.h"
38
39
#if defined(RE2_USE_ICU)
40
#include "unicode/uniset.h"
41
#include "unicode/unistr.h"
42
#include "unicode/utypes.h"
43
#endif
44
45
namespace re2 {
46
47
// Controls the maximum repeat count permitted by the parser.
48
static int maximum_repeat_count = 1000;
49
50
12.4k
void Regexp::FUZZING_ONLY_set_maximum_repeat_count(int i) {
51
12.4k
  maximum_repeat_count = i;
52
12.4k
}
53
54
// Regular expression parse state.
55
// The list of parsed regexps so far is maintained as a vector of
56
// Regexp pointers called the stack.  Left parenthesis and vertical
57
// bar markers are also placed on the stack, as Regexps with
58
// non-standard opcodes.
59
// Scanning a left parenthesis causes the parser to push a left parenthesis
60
// marker on the stack.
61
// Scanning a vertical bar causes the parser to pop the stack until it finds a
62
// vertical bar or left parenthesis marker (not popping the marker),
63
// concatenate all the popped results, and push them back on
64
// the stack (DoConcatenation).
65
// Scanning a right parenthesis causes the parser to act as though it
66
// has seen a vertical bar, which then leaves the top of the stack in the
67
// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
68
// The parser pops all this off the stack and creates an alternation of the
69
// regexps (DoAlternation).
70
71
class Regexp::ParseState {
72
 public:
73
  ParseState(ParseFlags flags, absl::string_view whole_regexp,
74
             RegexpStatus* status);
75
  ~ParseState();
76
77
365k
  ParseFlags flags() { return flags_; }
78
3.50k
  int rune_max() { return rune_max_; }
79
80
  // Parse methods.  All public methods return a bool saying
81
  // whether parsing should continue.  If a method returns
82
  // false, it has set fields in *status_, and the parser
83
  // should return NULL.
84
85
  // Pushes the given regular expression onto the stack.
86
  // Could check for too much memory used here.
87
  bool PushRegexp(Regexp* re);
88
89
  // Pushes the literal rune r onto the stack.
90
  bool PushLiteral(Rune r);
91
92
  // Pushes a regexp with the given op (and no args) onto the stack.
93
  bool PushSimpleOp(RegexpOp op);
94
95
  // Pushes a ^ onto the stack.
96
  bool PushCaret();
97
98
  // Pushes a \b (word == true) or \B (word == false) onto the stack.
99
  bool PushWordBoundary(bool word);
100
101
  // Pushes a $ onto the stack.
102
  bool PushDollar();
103
104
  // Pushes a . onto the stack
105
  bool PushDot();
106
107
  // Pushes a repeat operator regexp onto the stack.
108
  // A valid argument for the operator must already be on the stack.
109
  // s is the name of the operator, for use in error messages.
110
  bool PushRepeatOp(RegexpOp op, absl::string_view s, bool nongreedy);
111
112
  // Pushes a repetition regexp onto the stack.
113
  // A valid argument for the operator must already be on the stack.
114
  bool PushRepetition(int min, int max, absl::string_view s, bool nongreedy);
115
116
  // Checks whether a particular regexp op is a marker.
117
  bool IsMarker(RegexpOp op);
118
119
  // Processes a left parenthesis in the input.
120
  // Pushes a marker onto the stack.
121
  bool DoLeftParen(absl::string_view name);
122
  bool DoLeftParenNoCapture();
123
124
  // Processes a vertical bar in the input.
125
  bool DoVerticalBar();
126
127
  // Processes a right parenthesis in the input.
128
  bool DoRightParen();
129
130
  // Processes the end of input, returning the final regexp.
131
  Regexp* DoFinish();
132
133
  // Finishes the regexp if necessary, preparing it for use
134
  // in a more complicated expression.
135
  // If it is a CharClassBuilder, converts into a CharClass.
136
  Regexp* FinishRegexp(Regexp*);
137
138
  // These routines don't manipulate the parse stack
139
  // directly, but they do need to look at flags_.
140
  // ParseCharClass also manipulates the internals of Regexp
141
  // while creating *out_re.
142
143
  // Parse a character class into *out_re.
144
  // Removes parsed text from s.
145
  bool ParseCharClass(absl::string_view* s, Regexp** out_re,
146
                      RegexpStatus* status);
147
148
  // Parse a character class character into *rp.
149
  // Removes parsed text from s.
150
  bool ParseCCCharacter(absl::string_view* s, Rune* rp,
151
                        absl::string_view whole_class,
152
                        RegexpStatus* status);
153
154
  // Parse a character class range into rr.
155
  // Removes parsed text from s.
156
  bool ParseCCRange(absl::string_view* s, RuneRange* rr,
157
                    absl::string_view whole_class,
158
                    RegexpStatus* status);
159
160
  // Parse a Perl flag set or non-capturing group from s.
161
  bool ParsePerlFlags(absl::string_view* s);
162
163
  // Finishes the current concatenation,
164
  // collapsing it into a single regexp on the stack.
165
  void DoConcatenation();
166
167
  // Finishes the current alternation,
168
  // collapsing it to a single regexp on the stack.
169
  void DoAlternation();
170
171
  // Generalized DoAlternation/DoConcatenation.
172
  void DoCollapse(RegexpOp op);
173
174
  // Maybe concatenate Literals into LiteralString.
175
  bool MaybeConcatString(int r, ParseFlags flags);
176
177
private:
178
  ParseFlags flags_;
179
  absl::string_view whole_regexp_;
180
  RegexpStatus* status_;
181
  Regexp* stacktop_;
182
  int ncap_;  // number of capturing parens seen
183
  int rune_max_;  // maximum char value for this encoding
184
185
  ParseState(const ParseState&) = delete;
186
  ParseState& operator=(const ParseState&) = delete;
187
};
188
189
// Pseudo-operators - only on parse stack.
190
const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);
191
const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);
192
193
Regexp::ParseState::ParseState(ParseFlags flags,
194
                               absl::string_view whole_regexp,
195
                               RegexpStatus* status)
196
32.0k
  : flags_(flags), whole_regexp_(whole_regexp),
197
32.0k
    status_(status), stacktop_(NULL), ncap_(0) {
198
32.0k
  if (flags_ & Latin1)
199
16.1k
    rune_max_ = 0xFF;
200
15.8k
  else
201
15.8k
    rune_max_ = Runemax;
202
32.0k
}
203
204
// Cleans up by freeing all the regexps on the stack.
205
32.0k
Regexp::ParseState::~ParseState() {
206
32.0k
  Regexp* next;
207
34.4k
  for (Regexp* re = stacktop_; re != NULL; re = next) {
208
2.40k
    next = re->down_;
209
2.40k
    re->down_ = NULL;
210
2.40k
    if (re->op() == kLeftParen)
211
951
      delete re->name_;
212
2.40k
    re->Decref();
213
2.40k
  }
214
32.0k
}
215
216
// Finishes the regexp if necessary, preparing it for use in
217
// a more complex expression.
218
// If it is a CharClassBuilder, converts into a CharClass.
219
736k
Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {
220
736k
  if (re == NULL)
221
0
    return NULL;
222
736k
  re->down_ = NULL;
223
224
736k
  if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
225
58.4k
    CharClassBuilder* ccb = re->ccb_;
226
58.4k
    re->ccb_ = NULL;
227
58.4k
    re->cc_ = ccb->GetCharClass();
228
58.4k
    delete ccb;
229
58.4k
  }
230
231
736k
  return re;
232
736k
}
233
234
// Pushes the given regular expression onto the stack.
235
// Could check for too much memory used here.
236
808k
bool Regexp::ParseState::PushRegexp(Regexp* re) {
237
808k
  MaybeConcatString(-1, NoParseFlags);
238
239
  // Special case: a character class of one character is just
240
  // a literal.  This is a common idiom for escaping
241
  // single characters (e.g., [.] instead of \.), and some
242
  // analysis does better with fewer character classes.
243
  // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
244
808k
  if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
245
82.6k
    re->ccb_->RemoveAbove(rune_max_);
246
82.6k
    if (re->ccb_->size() == 1) {
247
957
      Rune r = re->ccb_->begin()->lo;
248
957
      re->Decref();
249
957
      re = new Regexp(kRegexpLiteral, flags_);
250
957
      re->rune_ = r;
251
81.7k
    } else if (re->ccb_->size() == 2) {
252
41.8k
      Rune r = re->ccb_->begin()->lo;
253
41.8k
      if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {
254
22.6k
        re->Decref();
255
22.6k
        re = new Regexp(kRegexpLiteral, flags_ | FoldCase);
256
22.6k
        re->rune_ = r + 'a' - 'A';
257
22.6k
      }
258
41.8k
    }
259
82.6k
  }
260
261
808k
  if (!IsMarker(re->op()))
262
605k
    re->simple_ = re->ComputeSimple();
263
808k
  re->down_ = stacktop_;
264
808k
  stacktop_ = re;
265
808k
  return true;
266
808k
}
267
268
// Searches the case folding tables and returns the CaseFold* that contains r.
269
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
270
// If there isn't one, returns NULL.
271
4.31M
const CaseFold* LookupCaseFold(const CaseFold* f, int n, Rune r) {
272
4.31M
  const CaseFold* ef = f + n;
273
274
  // Binary search for entry containing r.
275
35.7M
  while (n > 0) {
276
34.3M
    int m = n/2;
277
34.3M
    if (f[m].lo <= r && r <= f[m].hi)
278
2.87M
      return &f[m];
279
31.4M
    if (r < f[m].lo) {
280
17.8M
      n = m;
281
17.8M
    } else {
282
13.6M
      f += m+1;
283
13.6M
      n -= m+1;
284
13.6M
    }
285
31.4M
  }
286
287
  // There is no entry that contains r, but f points
288
  // where it would have been.  Unless f points at
289
  // the end of the array, it points at the next entry
290
  // after r.
291
1.44M
  if (f < ef)
292
1.38M
    return f;
293
294
  // No entry contains r; no entry contains runes > r.
295
53.4k
  return NULL;
296
1.44M
}
297
298
// Returns the result of applying the fold f to the rune r.
299
180k
Rune ApplyFold(const CaseFold* f, Rune r) {
300
180k
  switch (f->delta) {
301
52.9k
    default:
302
52.9k
      return r + f->delta;
303
304
79.8k
    case EvenOddSkip:  // even <-> odd but only applies to every other
305
79.8k
      if ((r - f->lo) % 2)
306
39.9k
        return r;
307
39.8k
      ABSL_FALLTHROUGH_INTENDED;
308
81.2k
    case EvenOdd:  // even <-> odd
309
81.2k
      if (r%2 == 0)
310
55.9k
        return r + 1;
311
25.2k
      return r - 1;
312
313
2.95k
    case OddEvenSkip:  // odd <-> even but only applies to every other
314
2.95k
      if ((r - f->lo) % 2)
315
1.22k
        return r;
316
1.72k
      ABSL_FALLTHROUGH_INTENDED;
317
5.46k
    case OddEven:  // odd <-> even
318
5.46k
      if (r%2 == 1)
319
4.22k
        return r + 1;
320
1.24k
      return r - 1;
321
180k
  }
322
180k
}
323
324
// Returns the next Rune in r's folding cycle (see unicode_casefold.h).
325
// Examples:
326
//   CycleFoldRune('A') = 'a'
327
//   CycleFoldRune('a') = 'A'
328
//
329
//   CycleFoldRune('K') = 'k'
330
//   CycleFoldRune('k') = 0x212A (Kelvin)
331
//   CycleFoldRune(0x212A) = 'K'
332
//
333
//   CycleFoldRune('?') = '?'
334
112k
Rune CycleFoldRune(Rune r) {
335
112k
  const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
336
112k
  if (f == NULL || r < f->lo)
337
32.6k
    return r;
338
79.8k
  return ApplyFold(f, r);
339
112k
}
340
341
// Add lo-hi to the class, along with their fold-equivalent characters.
342
69.7k
static void AddFoldedRangeLatin1(CharClassBuilder* cc, Rune lo, Rune hi) {
343
10.2M
  while (lo <= hi) {
344
10.1M
    cc->AddRange(lo, lo);
345
10.1M
    if ('A' <= lo && lo <= 'Z') {
346
39.7k
      cc->AddRange(lo - 'A' + 'a', lo - 'A' + 'a');
347
39.7k
    }
348
10.1M
    if ('a' <= lo && lo <= 'z') {
349
34.0k
      cc->AddRange(lo - 'a' + 'A', lo - 'a' + 'A');
350
34.0k
    }
351
10.1M
    lo++;
352
10.1M
  }
353
69.7k
}
354
355
// Add lo-hi to the class, along with their fold-equivalent characters.
356
// If lo-hi is already in the class, assume that the fold-equivalent
357
// chars are there too, so there's no work to do.
358
3.60M
static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {
359
  // AddFoldedRange calls itself recursively for each rune in the fold cycle.
360
  // Most folding cycles are small: there aren't any bigger than four in the
361
  // current Unicode tables.  make_unicode_casefold.py checks that
362
  // the cycles are not too long, and we double-check here using depth.
363
3.60M
  if (depth > 10) {
364
0
    ABSL_LOG(DFATAL) << "AddFoldedRange recurses too much.";
365
0
    return;
366
0
  }
367
368
3.60M
  if (!cc->AddRange(lo, hi))  // lo-hi was already there? we're done
369
2.14M
    return;
370
371
5.48M
  while (lo <= hi) {
372
4.08M
    const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
373
4.08M
    if (f == NULL)  // lo has no fold, nor does anything above lo
374
50.8k
      break;
375
4.03M
    if (lo < f->lo) {  // lo has no fold; next rune with a fold is f->lo
376
1.33M
      lo = f->lo;
377
1.33M
      continue;
378
1.33M
    }
379
380
    // Add in the result of folding the range lo - f->hi
381
    // and that range's fold, recursively.
382
2.69M
    Rune lo1 = lo;
383
2.69M
    Rune hi1 = std::min<Rune>(hi, f->hi);
384
2.69M
    switch (f->delta) {
385
2.17M
      default:
386
2.17M
        lo1 += f->delta;
387
2.17M
        hi1 += f->delta;
388
2.17M
        break;
389
316k
      case EvenOdd:
390
316k
        if (lo1%2 == 1)
391
2.37k
          lo1--;
392
316k
        if (hi1%2 == 0)
393
85.0k
          hi1++;
394
316k
        break;
395
199k
      case OddEven:
396
199k
        if (lo1%2 == 0)
397
489
          lo1--;
398
199k
        if (hi1%2 == 1)
399
49.5k
          hi1++;
400
199k
        break;
401
2.69M
    }
402
2.69M
    AddFoldedRange(cc, lo1, hi1, depth+1);
403
404
    // Pick up where this fold left off.
405
2.69M
    lo = f->hi + 1;
406
2.69M
  }
407
1.45M
}
408
409
// Pushes the literal rune r onto the stack.
410
546k
bool Regexp::ParseState::PushLiteral(Rune r) {
411
  // Do case folding if needed.
412
546k
  if (flags_ & FoldCase) {
413
216k
    if (flags_ & Latin1 && (('A' <= r && r <= 'Z') ||
414
158k
                            ('a' <= r && r <= 'z'))) {
415
15.3k
      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
416
15.3k
      re->ccb_ = new CharClassBuilder;
417
15.3k
      AddFoldedRangeLatin1(re->ccb_, r, r);
418
15.3k
      return PushRegexp(re);
419
15.3k
    }
420
201k
    if (!(flags_ & Latin1) && CycleFoldRune(r) != r) {
421
25.3k
      Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
422
25.3k
      re->ccb_ = new CharClassBuilder;
423
25.3k
      Rune r1 = r;
424
54.5k
      do {
425
54.5k
        if (!(flags_ & NeverNL) || r != '\n') {
426
54.5k
          re->ccb_->AddRange(r, r);
427
54.5k
        }
428
54.5k
        r = CycleFoldRune(r);
429
54.5k
      } while (r != r1);
430
25.3k
      return PushRegexp(re);
431
25.3k
    }
432
201k
  }
433
434
  // Exclude newline if applicable.
435
505k
  if ((flags_ & NeverNL) && r == '\n')
436
11.2k
    return PushRegexp(new Regexp(kRegexpNoMatch, flags_));
437
438
  // No fancy stuff worked.  Ordinary literal.
439
494k
  if (MaybeConcatString(r, flags_))
440
198k
    return true;
441
442
295k
  Regexp* re = new Regexp(kRegexpLiteral, flags_);
443
295k
  re->rune_ = r;
444
295k
  return PushRegexp(re);
445
494k
}
446
447
// Pushes a ^ onto the stack.
448
21.0k
bool Regexp::ParseState::PushCaret() {
449
21.0k
  if (flags_ & OneLine) {
450
15.7k
    return PushSimpleOp(kRegexpBeginText);
451
15.7k
  }
452
5.30k
  return PushSimpleOp(kRegexpBeginLine);
453
21.0k
}
454
455
// Pushes a \b or \B onto the stack.
456
3.14k
bool Regexp::ParseState::PushWordBoundary(bool word) {
457
3.14k
  if (word)
458
1.54k
    return PushSimpleOp(kRegexpWordBoundary);
459
1.60k
  return PushSimpleOp(kRegexpNoWordBoundary);
460
3.14k
}
461
462
// Pushes a $ onto the stack.
463
39.5k
bool Regexp::ParseState::PushDollar() {
464
39.5k
  if (flags_ & OneLine) {
465
    // Clumsy marker so that MimicsPCRE() can tell whether
466
    // this kRegexpEndText was a $ and not a \z.
467
28.3k
    Regexp::ParseFlags oflags = flags_;
468
28.3k
    flags_ = flags_ | WasDollar;
469
28.3k
    bool ret = PushSimpleOp(kRegexpEndText);
470
28.3k
    flags_ = oflags;
471
28.3k
    return ret;
472
28.3k
  }
473
11.1k
  return PushSimpleOp(kRegexpEndLine);
474
39.5k
}
475
476
// Pushes a . onto the stack.
477
13.3k
bool Regexp::ParseState::PushDot() {
478
13.3k
  if ((flags_ & DotNL) && !(flags_ & NeverNL))
479
4.30k
    return PushSimpleOp(kRegexpAnyChar);
480
  // Rewrite . into [^\n]
481
9.00k
  Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
482
9.00k
  re->ccb_ = new CharClassBuilder;
483
9.00k
  re->ccb_->AddRange(0, '\n' - 1);
484
9.00k
  re->ccb_->AddRange('\n' + 1, rune_max_);
485
9.00k
  return PushRegexp(re);
486
13.3k
}
487
488
// Pushes a regexp with the given op (and no args) onto the stack.
489
187k
bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {
490
187k
  Regexp* re = new Regexp(op, flags_);
491
187k
  return PushRegexp(re);
492
187k
}
493
494
// Pushes a repeat operator regexp onto the stack.
495
// A valid argument for the operator must already be on the stack.
496
// The char c is the name of the operator, for use in error messages.
497
bool Regexp::ParseState::PushRepeatOp(RegexpOp op, absl::string_view s,
498
103k
                                      bool nongreedy) {
499
103k
  if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
500
15
    status_->set_code(kRegexpRepeatArgument);
501
15
    status_->set_error_arg(s);
502
15
    return false;
503
15
  }
504
103k
  Regexp::ParseFlags fl = flags_;
505
103k
  if (nongreedy)
506
2.13k
    fl = fl ^ NonGreedy;
507
508
  // Squash **, ++ and ??. Regexp::Star() et al. handle this too, but
509
  // they're mostly for use during simplification, not during parsing.
510
103k
  if (op == stacktop_->op() && fl == stacktop_->parse_flags())
511
5.90k
    return true;
512
513
  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
514
  // op is a repeat, we just have to check that stacktop_->op() is too,
515
  // then adjust stacktop_.
516
98.0k
  if ((stacktop_->op() == kRegexpStar ||
517
98.0k
       stacktop_->op() == kRegexpPlus ||
518
98.0k
       stacktop_->op() == kRegexpQuest) &&
519
98.0k
      fl == stacktop_->parse_flags()) {
520
925
    stacktop_->op_ = kRegexpStar;
521
925
    return true;
522
925
  }
523
524
97.0k
  Regexp* re = new Regexp(op, fl);
525
97.0k
  re->AllocSub(1);
526
97.0k
  re->down_ = stacktop_->down_;
527
97.0k
  re->sub()[0] = FinishRegexp(stacktop_);
528
97.0k
  re->simple_ = re->ComputeSimple();
529
97.0k
  stacktop_ = re;
530
97.0k
  return true;
531
98.0k
}
532
533
// RepetitionWalker reports whether the repetition regexp is valid.
534
// Valid means that the combination of the top-level repetition
535
// and any inner repetitions does not exceed n copies of the
536
// innermost thing.
537
// This rewalks the regexp tree and is called for every repetition,
538
// so we have to worry about inducing quadratic behavior in the parser.
539
// We avoid this by only using RepetitionWalker when min or max >= 2.
540
// In that case the depth of any >= 2 nesting can only get to 9 without
541
// triggering a parse error, so each subtree can only be rewalked 9 times.
542
class RepetitionWalker : public Regexp::Walker<int> {
543
 public:
544
40.2k
  RepetitionWalker() {}
545
  virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
546
  virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
547
                        int* child_args, int nchild_args);
548
  virtual int ShortVisit(Regexp* re, int parent_arg);
549
550
 private:
551
  RepetitionWalker(const RepetitionWalker&) = delete;
552
  RepetitionWalker& operator=(const RepetitionWalker&) = delete;
553
};
554
555
392k
int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
556
392k
  int arg = parent_arg;
557
392k
  if (re->op() == kRegexpRepeat) {
558
44.9k
    int m = re->max();
559
44.9k
    if (m < 0) {
560
1.00k
      m = re->min();
561
1.00k
    }
562
44.9k
    if (m > 0) {
563
43.8k
      arg /= m;
564
43.8k
    }
565
44.9k
  }
566
392k
  return arg;
567
392k
}
568
569
int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
570
392k
                                int* child_args, int nchild_args) {
571
392k
  int arg = pre_arg;
572
745k
  for (int i = 0; i < nchild_args; i++) {
573
352k
    if (child_args[i] < arg) {
574
5.26k
      arg = child_args[i];
575
5.26k
    }
576
352k
  }
577
392k
  return arg;
578
392k
}
579
580
0
int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
581
  // Should never be called: we use Walk(), not WalkExponential().
582
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
583
  ABSL_LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
584
#endif
585
0
  return 0;
586
0
}
587
588
// Pushes a repetition regexp onto the stack.
589
// A valid argument for the operator must already be on the stack.
590
bool Regexp::ParseState::PushRepetition(int min, int max, absl::string_view s,
591
48.9k
                                        bool nongreedy) {
592
48.9k
  if ((max != -1 && max < min) ||
593
48.9k
      min > maximum_repeat_count ||
594
48.9k
      max > maximum_repeat_count) {
595
41
    status_->set_code(kRegexpRepeatSize);
596
41
    status_->set_error_arg(s);
597
41
    return false;
598
41
  }
599
48.9k
  if (stacktop_ == NULL || IsMarker(stacktop_->op())) {
600
10
    status_->set_code(kRegexpRepeatArgument);
601
10
    status_->set_error_arg(s);
602
10
    return false;
603
10
  }
604
48.9k
  Regexp::ParseFlags fl = flags_;
605
48.9k
  if (nongreedy)
606
313
    fl = fl ^ NonGreedy;
607
48.9k
  Regexp* re = new Regexp(kRegexpRepeat, fl);
608
48.9k
  re->min_ = min;
609
48.9k
  re->max_ = max;
610
48.9k
  re->AllocSub(1);
611
48.9k
  re->down_ = stacktop_->down_;
612
48.9k
  re->sub()[0] = FinishRegexp(stacktop_);
613
48.9k
  re->simple_ = re->ComputeSimple();
614
48.9k
  stacktop_ = re;
615
48.9k
  if (min >= 2 || max >= 2) {
616
40.2k
    RepetitionWalker w;
617
40.2k
    if (w.Walk(stacktop_, maximum_repeat_count) == 0) {
618
16
      status_->set_code(kRegexpRepeatSize);
619
16
      status_->set_error_arg(s);
620
16
      return false;
621
16
    }
622
40.2k
  }
623
48.9k
  return true;
624
48.9k
}
625
626
// Checks whether a particular regexp op is a marker.
627
2.84M
bool Regexp::ParseState::IsMarker(RegexpOp op) {
628
2.84M
  return op >= kLeftParen;
629
2.84M
}
630
631
// Processes a left parenthesis in the input.
632
// Pushes a marker onto the stack.
633
48.5k
bool Regexp::ParseState::DoLeftParen(absl::string_view name) {
634
48.5k
  Regexp* re = new Regexp(kLeftParen, flags_);
635
48.5k
  re->cap_ = ++ncap_;
636
48.5k
  if (name.data() != NULL)
637
9.29k
    re->name_ = new std::string(name);
638
48.5k
  return PushRegexp(re);
639
48.5k
}
640
641
// Pushes a non-capturing marker onto the stack.
642
38.2k
bool Regexp::ParseState::DoLeftParenNoCapture() {
643
38.2k
  Regexp* re = new Regexp(kLeftParen, flags_);
644
38.2k
  re->cap_ = -1;
645
38.2k
  return PushRegexp(re);
646
38.2k
}
647
648
// Processes a vertical bar in the input.
649
231k
bool Regexp::ParseState::DoVerticalBar() {
650
231k
  MaybeConcatString(-1, NoParseFlags);
651
231k
  DoConcatenation();
652
653
  // Below the vertical bar is a list to alternate.
654
  // Above the vertical bar is a list to concatenate.
655
  // We just did the concatenation, so either swap
656
  // the result below the vertical bar or push a new
657
  // vertical bar on the stack.
658
231k
  Regexp* r1;
659
231k
  Regexp* r2;
660
231k
  if ((r1 = stacktop_) != NULL &&
661
231k
      (r2 = r1->down_) != NULL &&
662
231k
      r2->op() == kVerticalBar) {
663
114k
    Regexp* r3;
664
114k
    if ((r3 = r2->down_) != NULL &&
665
114k
        (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
666
      // AnyChar is above or below the vertical bar. Let it subsume
667
      // the other when the other is Literal, CharClass or AnyChar.
668
1.01k
      if (r3->op() == kRegexpAnyChar &&
669
1.01k
          (r1->op() == kRegexpLiteral ||
670
531
           r1->op() == kRegexpCharClass ||
671
531
           r1->op() == kRegexpAnyChar)) {
672
        // Discard r1.
673
418
        stacktop_ = r2;
674
418
        r1->Decref();
675
418
        return true;
676
418
      }
677
592
      if (r1->op() == kRegexpAnyChar &&
678
592
          (r3->op() == kRegexpLiteral ||
679
479
           r3->op() == kRegexpCharClass ||
680
479
           r3->op() == kRegexpAnyChar)) {
681
        // Rearrange the stack and discard r3.
682
351
        r1->down_ = r3->down_;
683
351
        r2->down_ = r1;
684
351
        stacktop_ = r2;
685
351
        r3->Decref();
686
351
        return true;
687
351
      }
688
592
    }
689
    // Swap r1 below vertical bar (r2).
690
114k
    r1->down_ = r2->down_;
691
114k
    r2->down_ = r1;
692
114k
    stacktop_ = r2;
693
114k
    return true;
694
114k
  }
695
116k
  return PushSimpleOp(kVerticalBar);
696
231k
}
697
698
// Processes a right parenthesis in the input.
699
85.8k
bool Regexp::ParseState::DoRightParen() {
700
  // Finish the current concatenation and alternation.
701
85.8k
  DoAlternation();
702
703
  // The stack should be: LeftParen regexp
704
  // Remove the LeftParen, leaving the regexp,
705
  // parenthesized.
706
85.8k
  Regexp* r1;
707
85.8k
  Regexp* r2;
708
85.8k
  if ((r1 = stacktop_) == NULL ||
709
85.8k
      (r2 = r1->down_) == NULL ||
710
85.8k
      r2->op() != kLeftParen) {
711
58
    status_->set_code(kRegexpUnexpectedParen);
712
58
    status_->set_error_arg(whole_regexp_);
713
58
    return false;
714
58
  }
715
716
  // Pop off r1, r2.  Will Decref or reuse below.
717
85.8k
  stacktop_ = r2->down_;
718
719
  // Restore flags from when paren opened.
720
85.8k
  Regexp* re = r2;
721
85.8k
  flags_ = re->parse_flags();
722
723
  // Rewrite LeftParen as capture if needed.
724
85.8k
  if (re->cap_ > 0) {
725
47.8k
    re->op_ = kRegexpCapture;
726
    // re->cap_ is already set
727
47.8k
    re->AllocSub(1);
728
47.8k
    re->sub()[0] = FinishRegexp(r1);
729
47.8k
    re->simple_ = re->ComputeSimple();
730
47.8k
  } else {
731
37.9k
    re->Decref();
732
37.9k
    re = r1;
733
37.9k
  }
734
85.8k
  return PushRegexp(re);
735
85.8k
}
736
737
// Processes the end of input, returning the final regexp.
738
30.7k
Regexp* Regexp::ParseState::DoFinish() {
739
30.7k
  DoAlternation();
740
30.7k
  Regexp* re = stacktop_;
741
30.7k
  if (re != NULL && re->down_ != NULL) {
742
259
    status_->set_code(kRegexpMissingParen);
743
259
    status_->set_error_arg(whole_regexp_);
744
259
    return NULL;
745
259
  }
746
30.4k
  stacktop_ = NULL;
747
30.4k
  return FinishRegexp(re);
748
30.7k
}
749
750
// Returns the leading regexp that re starts with.
751
// The returned Regexp* points into a piece of re,
752
// so it must not be used after the caller calls re->Decref().
753
212k
Regexp* Regexp::LeadingRegexp(Regexp* re) {
754
212k
  if (re->op() == kRegexpEmptyMatch)
755
66.9k
    return NULL;
756
145k
  if (re->op() == kRegexpConcat && re->nsub() >= 2) {
757
55.7k
    Regexp** sub = re->sub();
758
55.7k
    if (sub[0]->op() == kRegexpEmptyMatch)
759
406
      return NULL;
760
55.3k
    return sub[0];
761
55.7k
  }
762
89.9k
  return re;
763
145k
}
764
765
// Removes LeadingRegexp(re) from re and returns what's left.
766
// Consumes the reference to re and may edit it in place.
767
// If caller wants to hold on to LeadingRegexp(re),
768
// must have already Incref'ed it.
769
21.8k
Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {
770
21.8k
  if (re->op() == kRegexpEmptyMatch)
771
0
    return re;
772
21.8k
  if (re->op() == kRegexpConcat && re->nsub() >= 2) {
773
18.3k
    Regexp** sub = re->sub();
774
18.3k
    if (sub[0]->op() == kRegexpEmptyMatch)
775
0
      return re;
776
18.3k
    sub[0]->Decref();
777
18.3k
    sub[0] = NULL;
778
18.3k
    if (re->nsub() == 2) {
779
      // Collapse concatenation to single regexp.
780
2.33k
      Regexp* nre = sub[1];
781
2.33k
      sub[1] = NULL;
782
2.33k
      re->Decref();
783
2.33k
      return nre;
784
2.33k
    }
785
    // 3 or more -> 2 or more.
786
15.9k
    re->nsub_--;
787
15.9k
    memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
788
15.9k
    return re;
789
18.3k
  }
790
3.52k
  Regexp::ParseFlags pf = re->parse_flags();
791
3.52k
  re->Decref();
792
3.52k
  return new Regexp(kRegexpEmptyMatch, pf);
793
21.8k
}
794
795
// Returns the leading string that re starts with.
796
// The returned Rune* points into a piece of re,
797
// so it must not be used after the caller calls re->Decref().
798
Rune* Regexp::LeadingString(Regexp* re, int* nrune,
799
223k
                            Regexp::ParseFlags* flags) {
800
276k
  while (re->op() == kRegexpConcat && re->nsub() > 0)
801
53.3k
    re = re->sub()[0];
802
803
223k
  *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ &
804
223k
                                           (Regexp::FoldCase | Regexp::Latin1));
805
806
223k
  if (re->op() == kRegexpLiteral) {
807
44.8k
    *nrune = 1;
808
44.8k
    return &re->rune_;
809
44.8k
  }
810
811
178k
  if (re->op() == kRegexpLiteralString) {
812
37.1k
    *nrune = re->nrunes_;
813
37.1k
    return re->runes_;
814
37.1k
  }
815
816
141k
  *nrune = 0;
817
141k
  return NULL;
818
178k
}
819
820
// Removes the first n leading runes from the beginning of re.
821
// Edits re in place.
822
18.3k
void Regexp::RemoveLeadingString(Regexp* re, int n) {
823
  // Chase down concats to find first string.
824
  // For regexps generated by parser, nested concats are
825
  // flattened except when doing so would overflow the 16-bit
826
  // limit on the size of a concatenation, so we should never
827
  // see more than two here.
828
18.3k
  Regexp* stk[4];
829
18.3k
  size_t d = 0;
830
23.3k
  while (re->op() == kRegexpConcat) {
831
4.97k
    if (d < ABSL_ARRAYSIZE(stk))
832
4.97k
      stk[d++] = re;
833
4.97k
    re = re->sub()[0];
834
4.97k
  }
835
836
  // Remove leading string from re.
837
18.3k
  if (re->op() == kRegexpLiteral) {
838
11.7k
    re->rune_ = 0;
839
11.7k
    re->op_ = kRegexpEmptyMatch;
840
11.7k
  } else if (re->op() == kRegexpLiteralString) {
841
6.66k
    if (n >= re->nrunes_) {
842
674
      delete[] re->runes_;
843
674
      re->runes_ = NULL;
844
674
      re->nrunes_ = 0;
845
674
      re->op_ = kRegexpEmptyMatch;
846
5.98k
    } else if (n == re->nrunes_ - 1) {
847
1.49k
      Rune rune = re->runes_[re->nrunes_ - 1];
848
1.49k
      delete[] re->runes_;
849
1.49k
      re->runes_ = NULL;
850
1.49k
      re->nrunes_ = 0;
851
1.49k
      re->rune_ = rune;
852
1.49k
      re->op_ = kRegexpLiteral;
853
4.49k
    } else {
854
4.49k
      re->nrunes_ -= n;
855
4.49k
      memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);
856
4.49k
    }
857
6.66k
  }
858
859
  // If re is now empty, concatenations might simplify too.
860
23.3k
  while (d > 0) {
861
4.97k
    re = stk[--d];
862
4.97k
    Regexp** sub = re->sub();
863
4.97k
    if (sub[0]->op() == kRegexpEmptyMatch) {
864
2.62k
      sub[0]->Decref();
865
2.62k
      sub[0] = NULL;
866
      // Delete first element of concat.
867
2.62k
      switch (re->nsub()) {
868
0
        case 0:
869
0
        case 1:
870
          // Impossible.
871
0
          ABSL_LOG(DFATAL) << "Concat of " << re->nsub();
872
0
          re->submany_ = NULL;
873
0
          re->op_ = kRegexpEmptyMatch;
874
0
          break;
875
876
1.47k
        case 2: {
877
          // Replace re with sub[1].
878
1.47k
          Regexp* old = sub[1];
879
1.47k
          sub[1] = NULL;
880
1.47k
          re->Swap(old);
881
1.47k
          old->Decref();
882
1.47k
          break;
883
0
        }
884
885
1.14k
        default:
886
          // Slide down.
887
1.14k
          re->nsub_--;
888
1.14k
          memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);
889
1.14k
          break;
890
2.62k
      }
891
2.62k
    }
892
4.97k
  }
893
18.3k
}
894
895
// In the context of factoring alternations, a Splice is: a factored prefix or
896
// merged character class computed by one iteration of one round of factoring;
897
// the span of subexpressions of the alternation to be "spliced" (i.e. removed
898
// and replaced); and, for a factored prefix, the number of suffixes after any
899
// factoring that might have subsequently been performed on them. For a merged
900
// character class, there are no suffixes, of course, so the field is ignored.
901
struct Splice {
902
  Splice(Regexp* prefix, Regexp** sub, int nsub)
903
23.8k
      : prefix(prefix),
904
23.8k
        sub(sub),
905
23.8k
        nsub(nsub),
906
23.8k
        nsuffix(-1) {}
907
908
  Regexp* prefix;
909
  Regexp** sub;
910
  int nsub;
911
  int nsuffix;
912
};
913
914
// Named so because it is used to implement an explicit stack, a Frame is: the
915
// span of subexpressions of the alternation to be factored; the current round
916
// of factoring; any Splices computed; and, for a factored prefix, an iterator
917
// to the next Splice to be factored (i.e. in another Frame) because suffixes.
918
struct Frame {
919
  Frame(Regexp** sub, int nsub)
920
67.1k
      : sub(sub),
921
67.1k
        nsub(nsub),
922
67.1k
        round(0) {}
923
924
  Regexp** sub;
925
  int nsub;
926
  int round;
927
  std::vector<Splice> splices;
928
  int spliceidx;
929
};
930
931
// Bundled into a class for friend access to Regexp without needing to declare
932
// (or define) Splice in regexp.h.
933
class FactorAlternationImpl {
934
 public:
935
  static void Round1(Regexp** sub, int nsub,
936
                     Regexp::ParseFlags flags,
937
                     std::vector<Splice>* splices);
938
  static void Round2(Regexp** sub, int nsub,
939
                     Regexp::ParseFlags flags,
940
                     std::vector<Splice>* splices);
941
  static void Round3(Regexp** sub, int nsub,
942
                     Regexp::ParseFlags flags,
943
                     std::vector<Splice>* splices);
944
};
945
946
// Factors common prefixes from alternation.
947
// For example,
948
//     ABC|ABD|AEF|BCX|BCY
949
// simplifies to
950
//     A(B(C|D)|EF)|BC(X|Y)
951
// and thence to
952
//     A(B[CD]|EF)|BC[XY]
953
//
954
// Rewrites sub to contain simplified list to alternate and returns
955
// the new length of sub.  Adjusts reference counts accordingly
956
// (incoming sub[i] decremented, outgoing sub[i] incremented).
957
50.0k
int Regexp::FactorAlternation(Regexp** sub, int nsub, ParseFlags flags) {
958
50.0k
  std::vector<Frame> stk;
959
50.0k
  stk.emplace_back(sub, nsub);
960
961
285k
  for (;;) {
962
285k
    auto& sub = stk.back().sub;
963
285k
    auto& nsub = stk.back().nsub;
964
285k
    auto& round = stk.back().round;
965
285k
    auto& splices = stk.back().splices;
966
285k
    auto& spliceidx = stk.back().spliceidx;
967
968
285k
    if (splices.empty()) {
969
      // Advance to the next round of factoring. Note that this covers
970
      // the initialised state: when splices is empty and round is 0.
971
249k
      round++;
972
249k
    } else if (spliceidx < static_cast<int>(splices.size())) {
973
      // We have at least one more Splice to factor. Recurse logically.
974
17.0k
      stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
975
17.0k
      continue;
976
19.3k
    } else {
977
      // We have no more Splices to factor. Apply them.
978
19.3k
      auto iter = splices.begin();
979
19.3k
      int out = 0;
980
43.1k
      for (int i = 0; i < nsub; ) {
981
        // Copy until we reach where the next Splice begins.
982
32.1k
        while (sub + i < iter->sub)
983
8.34k
          sub[out++] = sub[i++];
984
23.8k
        switch (round) {
985
7.47k
          case 1:
986
17.0k
          case 2: {
987
            // Assemble the Splice prefix and the suffixes.
988
17.0k
            Regexp* re[2];
989
17.0k
            re[0] = iter->prefix;
990
17.0k
            re[1] = Regexp::AlternateNoFactor(iter->sub, iter->nsuffix, flags);
991
17.0k
            sub[out++] = Regexp::Concat(re, 2, flags);
992
17.0k
            i += iter->nsub;
993
17.0k
            break;
994
7.47k
          }
995
6.79k
          case 3:
996
            // Just use the Splice prefix.
997
6.79k
            sub[out++] = iter->prefix;
998
6.79k
            i += iter->nsub;
999
6.79k
            break;
1000
0
          default:
1001
0
            ABSL_LOG(DFATAL) << "unknown round: " << round;
1002
0
            break;
1003
23.8k
        }
1004
        // If we are done, copy until the end of sub.
1005
23.8k
        if (++iter == splices.end()) {
1006
29.6k
          while (i < nsub)
1007
10.3k
            sub[out++] = sub[i++];
1008
19.3k
        }
1009
23.8k
      }
1010
19.3k
      splices.clear();
1011
19.3k
      nsub = out;
1012
      // Advance to the next round of factoring.
1013
19.3k
      round++;
1014
19.3k
    }
1015
1016
268k
    switch (round) {
1017
67.1k
      case 1:
1018
67.1k
        FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
1019
67.1k
        break;
1020
67.1k
      case 2:
1021
67.1k
        FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
1022
67.1k
        break;
1023
67.1k
      case 3:
1024
67.1k
        FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1025
67.1k
        break;
1026
67.1k
      case 4:
1027
67.1k
        if (stk.size() == 1) {
1028
          // We are at the top of the stack. Just return.
1029
50.0k
          return nsub;
1030
50.0k
        } else {
1031
          // Pop the stack and set the number of suffixes.
1032
          // (Note that references will be invalidated!)
1033
17.0k
          int nsuffix = nsub;
1034
17.0k
          stk.pop_back();
1035
17.0k
          stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1036
17.0k
          ++stk.back().spliceidx;
1037
17.0k
          continue;
1038
17.0k
        }
1039
0
      default:
1040
0
        ABSL_LOG(DFATAL) << "unknown round: " << round;
1041
0
        break;
1042
268k
    }
1043
1044
    // Set spliceidx depending on whether we have Splices to factor.
1045
201k
    if (splices.empty() || round == 3) {
1046
188k
      spliceidx = static_cast<int>(splices.size());
1047
188k
    } else {
1048
12.9k
      spliceidx = 0;
1049
12.9k
    }
1050
201k
  }
1051
50.0k
}
1052
1053
void FactorAlternationImpl::Round1(Regexp** sub, int nsub,
1054
                                   Regexp::ParseFlags flags,
1055
67.1k
                                   std::vector<Splice>* splices) {
1056
  // Round 1: Factor out common literal prefixes.
1057
67.1k
  int start = 0;
1058
67.1k
  Rune* rune = NULL;
1059
67.1k
  int nrune = 0;
1060
67.1k
  Regexp::ParseFlags runeflags = Regexp::NoParseFlags;
1061
357k
  for (int i = 0; i <= nsub; i++) {
1062
    // Invariant: sub[start:i] consists of regexps that all
1063
    // begin with rune[0:nrune].
1064
290k
    Rune* rune_i = NULL;
1065
290k
    int nrune_i = 0;
1066
290k
    Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;
1067
290k
    if (i < nsub) {
1068
223k
      rune_i = Regexp::LeadingString(sub[i], &nrune_i, &runeflags_i);
1069
223k
      if (runeflags_i == runeflags) {
1070
156k
        int same = 0;
1071
173k
        while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1072
16.6k
          same++;
1073
156k
        if (same > 0) {
1074
          // Matches at least one rune in current range.  Keep going around.
1075
10.9k
          nrune = same;
1076
10.9k
          continue;
1077
10.9k
        }
1078
156k
      }
1079
223k
    }
1080
1081
    // Found end of a run with common leading literal string:
1082
    // sub[start:i] all begin with rune[0:nrune],
1083
    // but sub[i] does not even begin with rune[0].
1084
279k
    if (i == start) {
1085
      // Nothing to do - first iteration.
1086
212k
    } else if (i == start+1) {
1087
      // Just one: don't bother factoring.
1088
205k
    } else {
1089
7.47k
      Regexp* prefix = Regexp::LiteralString(rune, nrune, runeflags);
1090
25.8k
      for (int j = start; j < i; j++)
1091
18.3k
        Regexp::RemoveLeadingString(sub[j], nrune);
1092
7.47k
      splices->emplace_back(prefix, sub + start, i - start);
1093
7.47k
    }
1094
1095
    // Prepare for next iteration (if there is one).
1096
279k
    if (i < nsub) {
1097
212k
      start = i;
1098
212k
      rune = rune_i;
1099
212k
      nrune = nrune_i;
1100
212k
      runeflags = runeflags_i;
1101
212k
    }
1102
279k
  }
1103
67.1k
}
1104
1105
void FactorAlternationImpl::Round2(Regexp** sub, int nsub,
1106
                                   Regexp::ParseFlags flags,
1107
67.1k
                                   std::vector<Splice>* splices) {
1108
  // Round 2: Factor out common simple prefixes,
1109
  // just the first piece of each concatenation.
1110
  // This will be good enough a lot of the time.
1111
  //
1112
  // Complex subexpressions (e.g. involving quantifiers)
1113
  // are not safe to factor because that collapses their
1114
  // distinct paths through the automaton, which affects
1115
  // correctness in some cases.
1116
67.1k
  int start = 0;
1117
67.1k
  Regexp* first = NULL;
1118
346k
  for (int i = 0; i <= nsub; i++) {
1119
    // Invariant: sub[start:i] consists of regexps that all
1120
    // begin with first.
1121
279k
    Regexp* first_i = NULL;
1122
279k
    if (i < nsub) {
1123
212k
      first_i = Regexp::LeadingRegexp(sub[i]);
1124
212k
      if (first != NULL &&
1125
          // first must be an empty-width op
1126
          // OR a char class, any char or any byte
1127
          // OR a fixed repeat of a literal, char class, any char or any byte.
1128
212k
          (first->op() == kRegexpBeginLine ||
1129
94.6k
           first->op() == kRegexpEndLine ||
1130
94.6k
           first->op() == kRegexpWordBoundary ||
1131
94.6k
           first->op() == kRegexpNoWordBoundary ||
1132
94.6k
           first->op() == kRegexpBeginText ||
1133
94.6k
           first->op() == kRegexpEndText ||
1134
94.6k
           first->op() == kRegexpCharClass ||
1135
94.6k
           first->op() == kRegexpAnyChar ||
1136
94.6k
           first->op() == kRegexpAnyByte ||
1137
94.6k
           (first->op() == kRegexpRepeat &&
1138
77.4k
            first->min() == first->max() &&
1139
77.4k
            (first->sub()[0]->op() == kRegexpLiteral ||
1140
2.77k
             first->sub()[0]->op() == kRegexpCharClass ||
1141
2.77k
             first->sub()[0]->op() == kRegexpAnyChar ||
1142
2.77k
             first->sub()[0]->op() == kRegexpAnyByte))) &&
1143
212k
          Regexp::Equal(first, first_i))
1144
12.2k
        continue;
1145
212k
    }
1146
1147
    // Found end of a run with common leading regexp:
1148
    // sub[start:i] all begin with first,
1149
    // but sub[i] does not.
1150
267k
    if (i == start) {
1151
      // Nothing to do - first iteration.
1152
200k
    } else if (i == start+1) {
1153
      // Just one: don't bother factoring.
1154
190k
    } else {
1155
9.58k
      Regexp* prefix = first->Incref();
1156
31.4k
      for (int j = start; j < i; j++)
1157
21.8k
        sub[j] = Regexp::RemoveLeadingRegexp(sub[j]);
1158
9.58k
      splices->emplace_back(prefix, sub + start, i - start);
1159
9.58k
    }
1160
1161
    // Prepare for next iteration (if there is one).
1162
267k
    if (i < nsub) {
1163
200k
      start = i;
1164
200k
      first = first_i;
1165
200k
    }
1166
267k
  }
1167
67.1k
}
1168
1169
void FactorAlternationImpl::Round3(Regexp** sub, int nsub,
1170
                                   Regexp::ParseFlags flags,
1171
67.1k
                                   std::vector<Splice>* splices) {
1172
  // Round 3: Merge runs of literals and/or character classes.
1173
67.1k
  int start = 0;
1174
67.1k
  Regexp* first = NULL;
1175
334k
  for (int i = 0; i <= nsub; i++) {
1176
    // Invariant: sub[start:i] consists of regexps that all
1177
    // are either literals (i.e. runes) or character classes.
1178
267k
    Regexp* first_i = NULL;
1179
267k
    if (i < nsub) {
1180
200k
      first_i = sub[i];
1181
200k
      if (first != NULL &&
1182
200k
          (first->op() == kRegexpLiteral ||
1183
133k
           first->op() == kRegexpCharClass) &&
1184
200k
          (first_i->op() == kRegexpLiteral ||
1185
22.3k
           first_i->op() == kRegexpCharClass))
1186
8.90k
        continue;
1187
200k
    }
1188
1189
    // Found end of a run of Literal/CharClass:
1190
    // sub[start:i] all are either one or the other,
1191
    // but sub[i] is not.
1192
258k
    if (i == start) {
1193
      // Nothing to do - first iteration.
1194
191k
    } else if (i == start+1) {
1195
      // Just one: don't bother factoring.
1196
184k
    } else {
1197
6.79k
      CharClassBuilder ccb;
1198
22.4k
      for (int j = start; j < i; j++) {
1199
15.6k
        Regexp* re = sub[j];
1200
15.6k
        if (re->op() == kRegexpCharClass) {
1201
1.81k
          CharClass* cc = re->cc();
1202
30.5k
          for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
1203
28.7k
            ccb.AddRangeFlags(it->lo, it->hi, re->parse_flags());
1204
13.8k
        } else if (re->op() == kRegexpLiteral) {
1205
13.8k
          if (re->parse_flags() & Regexp::FoldCase) {
1206
            // AddFoldedRange() can terminate prematurely if the character class
1207
            // already contains the rune. For example, if it contains 'a' and we
1208
            // want to add folded 'a', it sees 'a' and stops without adding 'A'.
1209
            // To avoid that, we use an empty character class and then merge it.
1210
3.95k
            CharClassBuilder tmp;
1211
3.95k
            tmp.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1212
3.95k
            ccb.AddCharClass(&tmp);
1213
9.93k
          } else {
1214
9.93k
            ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());
1215
9.93k
          }
1216
13.8k
        } else {
1217
0
          ABSL_LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "
1218
0
                           << re->ToString();
1219
0
        }
1220
15.6k
        re->Decref();
1221
15.6k
      }
1222
6.79k
      Regexp* re = Regexp::NewCharClass(ccb.GetCharClass(), flags & ~Regexp::FoldCase);
1223
6.79k
      splices->emplace_back(re, sub + start, i - start);
1224
6.79k
    }
1225
1226
    // Prepare for next iteration (if there is one).
1227
258k
    if (i < nsub) {
1228
191k
      start = i;
1229
191k
      first = first_i;
1230
191k
    }
1231
258k
  }
1232
67.1k
}
1233
1234
// Collapse the regexps on top of the stack, down to the
1235
// first marker, into a new op node (op == kRegexpAlternate
1236
// or op == kRegexpConcat).
1237
348k
void Regexp::ParseState::DoCollapse(RegexpOp op) {
1238
  // Scan backward to marker, counting children of composite.
1239
348k
  int n = 0;
1240
348k
  Regexp* next = NULL;
1241
348k
  Regexp* sub;
1242
1.10M
  for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1243
759k
    next = sub->down_;
1244
759k
    if (sub->op_ == op)
1245
6.39k
      n += sub->nsub_;
1246
753k
    else
1247
753k
      n++;
1248
759k
  }
1249
1250
  // If there's just one child, leave it alone.
1251
  // (Concat of one thing is that one thing; alternate of one thing is same.)
1252
348k
  if (stacktop_ != NULL && stacktop_->down_ == next)
1253
242k
    return;
1254
1255
  // Construct op (alternation or concatenation), flattening op of op.
1256
105k
  PODArray<Regexp*> subs(n);
1257
105k
  next = NULL;
1258
105k
  int i = n;
1259
621k
  for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {
1260
516k
    next = sub->down_;
1261
516k
    if (sub->op_ == op) {
1262
4.50k
      Regexp** sub_subs = sub->sub();
1263
73.6k
      for (int k = sub->nsub_ - 1; k >= 0; k--)
1264
69.1k
        subs[--i] = sub_subs[k]->Incref();
1265
4.50k
      sub->Decref();
1266
512k
    } else {
1267
512k
      subs[--i] = FinishRegexp(sub);
1268
512k
    }
1269
516k
  }
1270
1271
105k
  Regexp* re = ConcatOrAlternate(op, subs.data(), n, flags_, true);
1272
105k
  re->simple_ = re->ComputeSimple();
1273
105k
  re->down_ = next;
1274
105k
  stacktop_ = re;
1275
105k
}
1276
1277
// Finishes the current concatenation,
1278
// collapsing it into a single regexp on the stack.
1279
231k
void Regexp::ParseState::DoConcatenation() {
1280
231k
  Regexp* r1 = stacktop_;
1281
231k
  if (r1 == NULL || IsMarker(r1->op())) {
1282
    // empty concatenation is special case
1283
59.2k
    Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);
1284
59.2k
    PushRegexp(re);
1285
59.2k
  }
1286
231k
  DoCollapse(kRegexpConcat);
1287
231k
}
1288
1289
// Finishes the current alternation,
1290
// collapsing it to a single regexp on the stack.
1291
116k
void Regexp::ParseState::DoAlternation() {
1292
116k
  DoVerticalBar();
1293
  // Now stack top is kVerticalBar.
1294
116k
  Regexp* r1 = stacktop_;
1295
116k
  stacktop_ = r1->down_;
1296
116k
  r1->Decref();
1297
116k
  DoCollapse(kRegexpAlternate);
1298
116k
}
1299
1300
// Incremental conversion of concatenated literals into strings.
1301
// If top two elements on stack are both literal or string,
1302
// collapse into single string.
1303
// Don't walk down the stack -- the parser calls this frequently
1304
// enough that below the bottom two is known to be collapsed.
1305
// Only called when another regexp is about to be pushed
1306
// on the stack, so that the topmost literal is not being considered.
1307
// (Otherwise ab* would turn into (ab)*.)
1308
// If r >= 0, consider pushing a literal r on the stack.
1309
// Return whether that happened.
1310
1.53M
bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {
1311
1.53M
  Regexp* re1;
1312
1.53M
  Regexp* re2;
1313
1.53M
  if ((re1 = stacktop_) == NULL || (re2 = re1->down_) == NULL)
1314
138k
    return false;
1315
1316
1.39M
  if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)
1317
849k
    return false;
1318
546k
  if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)
1319
272k
    return false;
1320
274k
  if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))
1321
628
    return false;
1322
1323
274k
  if (re2->op_ == kRegexpLiteral) {
1324
    // convert into string
1325
77.2k
    Rune rune = re2->rune_;
1326
77.2k
    re2->op_ = kRegexpLiteralString;
1327
77.2k
    re2->nrunes_ = 0;
1328
77.2k
    re2->runes_ = NULL;
1329
77.2k
    re2->AddRuneToString(rune);
1330
77.2k
  }
1331
1332
  // push re1 into re2.
1333
274k
  if (re1->op_ == kRegexpLiteral) {
1334
272k
    re2->AddRuneToString(re1->rune_);
1335
272k
  } else {
1336
42.5k
    for (int i = 0; i < re1->nrunes_; i++)
1337
41.0k
      re2->AddRuneToString(re1->runes_[i]);
1338
1.42k
    re1->nrunes_ = 0;
1339
1.42k
    delete[] re1->runes_;
1340
1.42k
    re1->runes_ = NULL;
1341
1.42k
  }
1342
1343
  // reuse re1 if possible
1344
274k
  if (r >= 0) {
1345
198k
    re1->op_ = kRegexpLiteral;
1346
198k
    re1->rune_ = r;
1347
198k
    re1->parse_flags_ = static_cast<uint16_t>(flags);
1348
198k
    return true;
1349
198k
  }
1350
1351
75.3k
  stacktop_ = re2;
1352
75.3k
  re1->Decref();
1353
75.3k
  return false;
1354
274k
}
1355
1356
// Lexing routines.
1357
1358
// Parses a decimal integer, storing it in *np.
1359
// Sets *s to span the remainder of the string.
1360
64.8k
static bool ParseInteger(absl::string_view* s, int* np) {
1361
64.8k
  if (s->empty() || !absl::ascii_isdigit((*s)[0] & 0xFF))
1362
5.67k
    return false;
1363
  // Disallow leading zeros.
1364
59.1k
  if (s->size() >= 2 && (*s)[0] == '0' && absl::ascii_isdigit((*s)[1] & 0xFF))
1365
667
    return false;
1366
58.4k
  int n = 0;
1367
58.4k
  int c;
1368
123k
  while (!s->empty() && absl::ascii_isdigit(c = (*s)[0] & 0xFF)) {
1369
    // Avoid overflow.
1370
65.4k
    if (n >= 100000000)
1371
301
      return false;
1372
65.1k
    n = n*10 + c - '0';
1373
65.1k
    s->remove_prefix(1);  // digit
1374
65.1k
  }
1375
58.1k
  *np = n;
1376
58.1k
  return true;
1377
58.4k
}
1378
1379
// Parses a repetition suffix like {1,2} or {2} or {2,}.
1380
// Sets *s to span the remainder of the string on success.
1381
// Sets *lo and *hi to the given range.
1382
// In the case of {2,}, the high number is unbounded;
1383
// sets *hi to -1 to signify this.
1384
// {,2} is NOT a valid suffix.
1385
// The Maybe in the name signifies that the regexp parse
1386
// doesn't fail even if ParseRepetition does, so the string_view
1387
// s must NOT be edited unless MaybeParseRepetition returns true.
1388
58.1k
static bool MaybeParseRepetition(absl::string_view* sp, int* lo, int* hi) {
1389
58.1k
  absl::string_view s = *sp;
1390
58.1k
  if (s.empty() || s[0] != '{')
1391
0
    return false;
1392
58.1k
  s.remove_prefix(1);  // '{'
1393
58.1k
  if (!ParseInteger(&s, lo))
1394
5.68k
    return false;
1395
52.4k
  if (s.empty())
1396
151
    return false;
1397
52.2k
  if (s[0] == ',') {
1398
9.44k
    s.remove_prefix(1);  // ','
1399
9.44k
    if (s.empty())
1400
26
      return false;
1401
9.42k
    if (s[0] == '}') {
1402
      // {2,} means at least 2
1403
2.70k
      *hi = -1;
1404
6.71k
    } else {
1405
      // {2,4} means 2, 3, or 4.
1406
6.71k
      if (!ParseInteger(&s, hi))
1407
954
        return false;
1408
6.71k
    }
1409
42.8k
  } else {
1410
    // {2} means exactly two
1411
42.8k
    *hi = *lo;
1412
42.8k
  }
1413
51.2k
  if (s.empty() || s[0] != '}')
1414
2.28k
    return false;
1415
49.0k
  s.remove_prefix(1);  // '}'
1416
49.0k
  *sp = s;
1417
49.0k
  return true;
1418
51.2k
}
1419
1420
// Removes the next Rune from the string_view and stores it in *r.
1421
// Returns number of bytes removed from sp.
1422
// Behaves as though there is a terminating NUL at the end of sp.
1423
// Argument order is backwards from usual Google style
1424
// but consistent with chartorune.
1425
static int StringViewToRune(Rune* r, absl::string_view* sp,
1426
734k
                            RegexpStatus* status) {
1427
  // fullrune() takes int, not size_t. However, it just looks
1428
  // at the leading byte and treats any length >= 4 the same.
1429
734k
  if (fullrune(sp->data(), static_cast<int>(std::min(size_t{4}, sp->size())))) {
1430
734k
    int n = chartorune(r, sp->data());
1431
    // Some copies of chartorune have a bug that accepts
1432
    // encodings of values in (10FFFF, 1FFFFF] as valid.
1433
    // Those values break the character class algorithm,
1434
    // which assumes Runemax is the largest rune.
1435
734k
    if (*r > Runemax) {
1436
20
      n = 1;
1437
20
      *r = Runeerror;
1438
20
    }
1439
734k
    if (!(n == 1 && *r == Runeerror)) {  // no decoding error
1440
734k
      sp->remove_prefix(n);
1441
734k
      return n;
1442
734k
    }
1443
734k
  }
1444
1445
207
  if (status != NULL) {
1446
207
    status->set_code(kRegexpBadUTF8);
1447
207
    status->set_error_arg(absl::string_view());
1448
207
  }
1449
207
  return -1;
1450
734k
}
1451
1452
// Returns whether name is valid UTF-8.
1453
// If not, sets status to kRegexpBadUTF8.
1454
9.72k
static bool IsValidUTF8(absl::string_view s, RegexpStatus* status) {
1455
9.72k
  absl::string_view t = s;
1456
9.72k
  Rune r;
1457
63.7k
  while (!t.empty()) {
1458
54.0k
    if (StringViewToRune(&r, &t, status) < 0)
1459
20
      return false;
1460
54.0k
  }
1461
9.70k
  return true;
1462
9.72k
}
1463
1464
// Is c a hex digit?
1465
3.97k
static int IsHex(int c) {
1466
3.97k
  return ('0' <= c && c <= '9') ||
1467
3.97k
         ('A' <= c && c <= 'F') ||
1468
3.97k
         ('a' <= c && c <= 'f');
1469
3.97k
}
1470
1471
// Convert hex digit to value.
1472
3.39k
static int UnHex(int c) {
1473
3.39k
  if ('0' <= c && c <= '9')
1474
1.59k
    return c - '0';
1475
1.79k
  if ('A' <= c && c <= 'F')
1476
830
    return c - 'A' + 10;
1477
968
  if ('a' <= c && c <= 'f')
1478
968
    return c - 'a' + 10;
1479
0
  ABSL_LOG(DFATAL) << "Bad hex digit " << c;
1480
0
  return 0;
1481
0
}
1482
1483
// Parse an escape sequence (e.g., \n, \{).
1484
// Sets *s to span the remainder of the string.
1485
// Sets *rp to the named character.
1486
static bool ParseEscape(absl::string_view* s, Rune* rp,
1487
6.22k
                        RegexpStatus* status, int rune_max) {
1488
6.22k
  const char* begin = s->data();
1489
6.22k
  if (s->empty() || (*s)[0] != '\\') {
1490
    // Should not happen - caller always checks.
1491
0
    status->set_code(kRegexpInternalError);
1492
0
    status->set_error_arg(absl::string_view());
1493
0
    return false;
1494
0
  }
1495
6.22k
  if (s->size() == 1) {
1496
55
    status->set_code(kRegexpTrailingBackslash);
1497
55
    status->set_error_arg(absl::string_view());
1498
55
    return false;
1499
55
  }
1500
6.17k
  Rune c, c1;
1501
6.17k
  s->remove_prefix(1);  // backslash
1502
6.17k
  if (StringViewToRune(&c, s, status) < 0)
1503
18
    return false;
1504
6.15k
  int code;
1505
6.15k
  switch (c) {
1506
539
    default:
1507
539
      if (c < Runeself && !absl::ascii_isalnum(c)) {
1508
        // Escaped non-word characters are always themselves.
1509
        // PCRE is not quite so rigorous: it accepts things like
1510
        // \q, but we don't.  We once rejected \_, but too many
1511
        // programs and people insist on using it, so allow \_.
1512
507
        *rp = c;
1513
507
        return true;
1514
507
      }
1515
32
      goto BadEscape;
1516
1517
    // Octal escapes.
1518
142
    case '1':
1519
376
    case '2':
1520
857
    case '3':
1521
1.06k
    case '4':
1522
1.28k
    case '5':
1523
1.51k
    case '6':
1524
1.59k
    case '7':
1525
      // Single non-zero octal digit is a backreference; not supported.
1526
1.59k
      if (s->empty() || (*s)[0] < '0' || (*s)[0] > '7')
1527
16
        goto BadEscape;
1528
1.58k
      ABSL_FALLTHROUGH_INTENDED;
1529
2.10k
    case '0':
1530
      // consume up to three octal digits; already have one.
1531
2.10k
      code = c - '0';
1532
2.10k
      if (!s->empty() && '0' <= (c = (*s)[0]) && c <= '7') {
1533
1.61k
        code = code * 8 + c - '0';
1534
1.61k
        s->remove_prefix(1);  // digit
1535
1.61k
        if (!s->empty()) {
1536
1.51k
          c = (*s)[0];
1537
1.51k
          if ('0' <= c && c <= '7') {
1538
358
            code = code * 8 + c - '0';
1539
358
            s->remove_prefix(1);  // digit
1540
358
          }
1541
1.51k
        }
1542
1.61k
      }
1543
2.10k
      if (code > rune_max)
1544
3
        goto BadEscape;
1545
2.09k
      *rp = code;
1546
2.09k
      return true;
1547
1548
    // Hexadecimal escapes
1549
1.62k
    case 'x':
1550
1.62k
      if (s->empty())
1551
1
        goto BadEscape;
1552
1.62k
      if (StringViewToRune(&c, s, status) < 0)
1553
3
        return false;
1554
1.62k
      if (c == '{') {
1555
        // Any number of digits in braces.
1556
        // Update n as we consume the string, so that
1557
        // the whole thing gets shown in the error message.
1558
        // Perl accepts any text at all; it ignores all text
1559
        // after the first non-hex digit.  We require only hex digits,
1560
        // and at least one.
1561
520
        if (StringViewToRune(&c, s, status) < 0)
1562
1
          return false;
1563
519
        int nhex = 0;
1564
519
        code = 0;
1565
1.87k
        while (IsHex(c)) {
1566
1.40k
          nhex++;
1567
1.40k
          code = code * 16 + UnHex(c);
1568
1.40k
          if (code > rune_max)
1569
24
            goto BadEscape;
1570
1.37k
          if (s->empty())
1571
21
            goto BadEscape;
1572
1.35k
          if (StringViewToRune(&c, s, status) < 0)
1573
1
            return false;
1574
1.35k
        }
1575
473
        if (c != '}' || nhex == 0)
1576
41
          goto BadEscape;
1577
432
        *rp = code;
1578
432
        return true;
1579
473
      }
1580
      // Easy case: two hex digits.
1581
1.10k
      if (s->empty())
1582
38
        goto BadEscape;
1583
1.06k
      if (StringViewToRune(&c1, s, status) < 0)
1584
1
        return false;
1585
1.06k
      if (!IsHex(c) || !IsHex(c1))
1586
68
        goto BadEscape;
1587
996
      *rp = UnHex(c) * 16 + UnHex(c1);
1588
996
      return true;
1589
1590
    // C escapes.
1591
615
    case 'n':
1592
615
      *rp = '\n';
1593
615
      return true;
1594
299
    case 'r':
1595
299
      *rp = '\r';
1596
299
      return true;
1597
234
    case 't':
1598
234
      *rp = '\t';
1599
234
      return true;
1600
1601
    // Less common C escapes.
1602
243
    case 'a':
1603
243
      *rp = '\a';
1604
243
      return true;
1605
213
    case 'f':
1606
213
      *rp = '\f';
1607
213
      return true;
1608
267
    case 'v':
1609
267
      *rp = '\v';
1610
267
      return true;
1611
1612
    // This code is disabled to avoid misparsing
1613
    // the Perl word-boundary \b as a backspace
1614
    // when in POSIX regexp mode.  Surprisingly,
1615
    // in Perl, \b means word-boundary but [\b]
1616
    // means backspace.  We don't support that:
1617
    // if you want a backspace embed a literal
1618
    // backspace character or use \x08.
1619
    //
1620
    // case 'b':
1621
    //   *rp = '\b';
1622
    //   return true;
1623
6.15k
  }
1624
1625
244
BadEscape:
1626
  // Unrecognized escape sequence.
1627
244
  status->set_code(kRegexpBadEscape);
1628
244
  status->set_error_arg(
1629
244
      absl::string_view(begin, static_cast<size_t>(s->data() - begin)));
1630
244
  return false;
1631
6.15k
}
1632
1633
// Add a range to the character class, but exclude newline if asked.
1634
// Also handle case folding.
1635
void CharClassBuilder::AddRangeFlags(
1636
1.94M
    Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {
1637
1638
  // Take out \n if the flags say so.
1639
1.94M
  bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1640
1.94M
               (parse_flags & Regexp::NeverNL);
1641
1.94M
  if (cutnl && lo <= '\n' && '\n' <= hi) {
1642
10.4k
    if (lo < '\n')
1643
8.86k
      AddRangeFlags(lo, '\n' - 1, parse_flags);
1644
10.4k
    if (hi > '\n')
1645
8.73k
      AddRangeFlags('\n' + 1, hi, parse_flags);
1646
10.4k
    return;
1647
10.4k
  }
1648
1649
  // If folding case, add fold-equivalent characters too.
1650
1.93M
  if (parse_flags & Regexp::FoldCase) {
1651
961k
    if (parse_flags & Regexp::Latin1) {
1652
54.4k
      AddFoldedRangeLatin1(this, lo, hi);
1653
907k
    } else {
1654
907k
      AddFoldedRange(this, lo, hi, 0);
1655
907k
    }
1656
969k
  } else {
1657
969k
    AddRange(lo, hi);
1658
969k
  }
1659
1.93M
}
1660
1661
// Look for a group with the given name.
1662
static const UGroup* LookupGroup(absl::string_view name,
1663
11.8k
                                 const UGroup* groups, int ngroups) {
1664
  // Simple name lookup.
1665
470k
  for (int i = 0; i < ngroups; i++)
1666
466k
    if (absl::string_view(groups[i].name) == name)
1667
7.21k
      return &groups[i];
1668
4.62k
  return NULL;
1669
11.8k
}
1670
1671
// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
1672
995
static const UGroup* LookupPosixGroup(absl::string_view name) {
1673
995
  return LookupGroup(name, posix_groups, num_posix_groups);
1674
995
}
1675
1676
6.46k
static const UGroup* LookupPerlGroup(absl::string_view name) {
1677
6.46k
  return LookupGroup(name, perl_groups, num_perl_groups);
1678
6.46k
}
1679
1680
#if !defined(RE2_USE_ICU)
1681
// Fake UGroup containing all Runes
1682
static URange16 any16[] = { { 0, 65535 } };
1683
static URange32 any32[] = { { 65536, Runemax } };
1684
static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
1685
1686
// Look for a Unicode group with the given name (e.g., "Han")
1687
4.38k
static const UGroup* LookupUnicodeGroup(absl::string_view name) {
1688
  // Special case: "Any" means any.
1689
4.38k
  if (name == absl::string_view("Any"))
1690
11
    return &anygroup;
1691
4.36k
  return LookupGroup(name, unicode_groups, num_unicode_groups);
1692
4.38k
}
1693
#endif
1694
1695
// Add a UGroup or its negation to the character class.
1696
static void AddUGroup(CharClassBuilder* cc, const UGroup* g, int sign,
1697
9.36k
                      Regexp::ParseFlags parse_flags) {
1698
9.36k
  if (sign == +1) {
1699
714k
    for (int i = 0; i < g->nr16; i++) {
1700
709k
      cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);
1701
709k
    }
1702
498k
    for (int i = 0; i < g->nr32; i++) {
1703
493k
      cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);
1704
493k
    }
1705
5.10k
  } else {
1706
4.26k
    if (parse_flags & Regexp::FoldCase) {
1707
      // Normally adding a case-folded group means
1708
      // adding all the extra fold-equivalent runes too.
1709
      // But if we're adding the negation of the group,
1710
      // we have to exclude all the runes that are fold-equivalent
1711
      // to what's already missing.  Too hard, so do in two steps.
1712
2.13k
      CharClassBuilder ccb1;
1713
2.13k
      AddUGroup(&ccb1, g, +1, parse_flags);
1714
      // If the flags say to take out \n, put it in, so that negating will take it out.
1715
      // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
1716
2.13k
      bool cutnl = !(parse_flags & Regexp::ClassNL) ||
1717
2.13k
                   (parse_flags & Regexp::NeverNL);
1718
2.13k
      if (cutnl) {
1719
1.06k
        ccb1.AddRange('\n', '\n');
1720
1.06k
      }
1721
2.13k
      ccb1.Negate();
1722
2.13k
      cc->AddCharClass(&ccb1);
1723
2.13k
      return;
1724
2.13k
    }
1725
2.12k
    int next = 0;
1726
365k
    for (int i = 0; i < g->nr16; i++) {
1727
363k
      if (next < g->r16[i].lo)
1728
363k
        cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);
1729
363k
      next = g->r16[i].hi + 1;
1730
363k
    }
1731
240k
    for (int i = 0; i < g->nr32; i++) {
1732
238k
      if (next < g->r32[i].lo)
1733
238k
        cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);
1734
238k
      next = g->r32[i].hi + 1;
1735
238k
    }
1736
2.12k
    if (next <= Runemax)
1737
2.12k
      cc->AddRangeFlags(next, Runemax, parse_flags);
1738
2.12k
  }
1739
9.36k
}
1740
1741
// Maybe parse a Perl character class escape sequence.
1742
// Only recognizes the Perl character classes (\d \s \w \D \S \W),
1743
// not the Perl empty-string classes (\b \B \A \Z \z).
1744
// On success, sets *s to span the remainder of the string
1745
// and returns the corresponding UGroup.
1746
// The string_view must *NOT* be edited unless the call succeeds.
1747
const UGroup* MaybeParsePerlCCEscape(absl::string_view* s,
1748
79.3k
                                     Regexp::ParseFlags parse_flags) {
1749
79.3k
  if (!(parse_flags & Regexp::PerlClasses))
1750
13.9k
    return NULL;
1751
65.3k
  if (s->size() < 2 || (*s)[0] != '\\')
1752
58.9k
    return NULL;
1753
  // Could use StringViewToRune, but there aren't
1754
  // any non-ASCII Perl group names.
1755
6.46k
  absl::string_view name(s->data(), 2);
1756
6.46k
  const UGroup* g = LookupPerlGroup(name);
1757
6.46k
  if (g == NULL)
1758
4.50k
    return NULL;
1759
1.96k
  s->remove_prefix(name.size());
1760
1.96k
  return g;
1761
6.46k
}
1762
1763
enum ParseStatus {
1764
  kParseOk,  // Did some parsing.
1765
  kParseError,  // Found an error.
1766
  kParseNothing,  // Decided not to parse.
1767
};
1768
1769
// Maybe parses a Unicode character group like \p{Han} or \P{Han}
1770
// (the latter is a negated group).
1771
ParseStatus ParseUnicodeGroup(absl::string_view* s,
1772
                              Regexp::ParseFlags parse_flags,
1773
4.44k
                              CharClassBuilder* cc, RegexpStatus* status) {
1774
  // Decide whether to parse.
1775
4.44k
  if (!(parse_flags & Regexp::UnicodeGroups))
1776
2
    return kParseNothing;
1777
4.44k
  if (s->size() < 2 || (*s)[0] != '\\')
1778
0
    return kParseNothing;
1779
4.44k
  Rune c = (*s)[1];
1780
4.44k
  if (c != 'p' && c != 'P')
1781
0
    return kParseNothing;
1782
1783
  // Committed to parse.  Results:
1784
4.44k
  int sign = +1;  // -1 = negated char class
1785
4.44k
  if (c == 'P')
1786
2.99k
    sign = -sign;
1787
4.44k
  absl::string_view seq = *s;  // \p{Han} or \pL
1788
4.44k
  absl::string_view name;  // Han or L
1789
4.44k
  s->remove_prefix(2);  // '\\', 'p'
1790
1791
4.44k
  if (!StringViewToRune(&c, s, status))
1792
0
    return kParseError;
1793
4.44k
  if (c != '{') {
1794
    // Name is the bit of string we just skipped over for c.
1795
4.11k
    const char* p = seq.data() + 2;
1796
4.11k
    name = absl::string_view(p, static_cast<size_t>(s->data() - p));
1797
4.11k
  } else {
1798
    // Name is in braces. Look for closing }
1799
335
    size_t end = s->find('}', 0);
1800
335
    if (end == absl::string_view::npos) {
1801
58
      if (!IsValidUTF8(seq, status))
1802
5
        return kParseError;
1803
53
      status->set_code(kRegexpBadCharRange);
1804
53
      status->set_error_arg(seq);
1805
53
      return kParseError;
1806
58
    }
1807
277
    name = absl::string_view(s->data(), end);  // without '}'
1808
277
    s->remove_prefix(end + 1);  // with '}'
1809
277
    if (!IsValidUTF8(name, status))
1810
8
      return kParseError;
1811
277
  }
1812
1813
  // Chop seq where s now begins.
1814
4.38k
  seq = absl::string_view(seq.data(), static_cast<size_t>(s->data() - seq.data()));
1815
1816
4.38k
  if (!name.empty() && name[0] == '^') {
1817
4
    sign = -sign;
1818
4
    name.remove_prefix(1);  // '^'
1819
4
  }
1820
1821
4.38k
#if !defined(RE2_USE_ICU)
1822
  // Look up the group in the RE2 Unicode data.
1823
4.38k
  const UGroup* g = LookupUnicodeGroup(name);
1824
4.38k
  if (g == NULL) {
1825
113
    status->set_code(kRegexpBadCharRange);
1826
113
    status->set_error_arg(seq);
1827
113
    return kParseError;
1828
113
  }
1829
1830
4.26k
  AddUGroup(cc, g, sign, parse_flags);
1831
#else
1832
  // Look up the group in the ICU Unicode data. Because ICU provides full
1833
  // Unicode properties support, this could be more than a lookup by name.
1834
  ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
1835
      std::string("\\p{") + std::string(name) + std::string("}"));
1836
  UErrorCode uerr = U_ZERO_ERROR;
1837
  ::icu::UnicodeSet uset(ustr, uerr);
1838
  if (U_FAILURE(uerr)) {
1839
    status->set_code(kRegexpBadCharRange);
1840
    status->set_error_arg(seq);
1841
    return kParseError;
1842
  }
1843
1844
  // Convert the UnicodeSet to a URange32 and UGroup that we can add.
1845
  int nr = uset.getRangeCount();
1846
  PODArray<URange32> r(nr);
1847
  for (int i = 0; i < nr; i++) {
1848
    r[i].lo = uset.getRangeStart(i);
1849
    r[i].hi = uset.getRangeEnd(i);
1850
  }
1851
  UGroup g = {"", +1, 0, 0, r.data(), nr};
1852
  AddUGroup(cc, &g, sign, parse_flags);
1853
#endif
1854
1855
4.26k
  return kParseOk;
1856
4.38k
}
1857
1858
// Parses a character class name like [:alnum:].
1859
// Sets *s to span the remainder of the string.
1860
// Adds the ranges corresponding to the class to ranges.
1861
static ParseStatus ParseCCName(absl::string_view* s,
1862
                               Regexp::ParseFlags parse_flags,
1863
1.30k
                               CharClassBuilder* cc, RegexpStatus* status) {
1864
  // Check begins with [:
1865
1.30k
  const char* p = s->data();
1866
1.30k
  const char* ep = s->data() + s->size();
1867
1.30k
  if (ep - p < 2 || p[0] != '[' || p[1] != ':')
1868
0
    return kParseNothing;
1869
1870
  // Look for closing :].
1871
1.30k
  const char* q;
1872
17.8k
  for (q = p+2; q <= ep-2 && (*q != ':' || *(q+1) != ']'); q++)
1873
16.5k
    ;
1874
1875
  // If no closing :], then ignore.
1876
1.30k
  if (q > ep-2)
1877
310
    return kParseNothing;
1878
1879
  // Got it.  Check that it's valid.
1880
995
  q += 2;
1881
995
  absl::string_view name(p, static_cast<size_t>(q - p));
1882
1883
995
  const UGroup* g = LookupPosixGroup(name);
1884
995
  if (g == NULL) {
1885
6
    status->set_code(kRegexpBadCharRange);
1886
6
    status->set_error_arg(name);
1887
6
    return kParseError;
1888
6
  }
1889
1890
989
  s->remove_prefix(name.size());
1891
989
  AddUGroup(cc, g, g->sign, parse_flags);
1892
989
  return kParseOk;
1893
995
}
1894
1895
// Parses a character inside a character class.
1896
// There are fewer special characters here than in the rest of the regexp.
1897
// Sets *s to span the remainder of the string.
1898
// Sets *rp to the character.
1899
bool Regexp::ParseState::ParseCCCharacter(absl::string_view* s, Rune* rp,
1900
                                          absl::string_view whole_class,
1901
88.3k
                                          RegexpStatus* status) {
1902
88.3k
  if (s->empty()) {
1903
0
    status->set_code(kRegexpMissingBracket);
1904
0
    status->set_error_arg(whole_class);
1905
0
    return false;
1906
0
  }
1907
1908
  // Allow regular escape sequences even though
1909
  // many need not be escaped in this context.
1910
88.3k
  if ((*s)[0] == '\\')
1911
2.71k
    return ParseEscape(s, rp, status, rune_max_);
1912
1913
  // Otherwise take the next rune.
1914
85.6k
  return StringViewToRune(rp, s, status) >= 0;
1915
88.3k
}
1916
1917
// Parses a character class character, or, if the character
1918
// is followed by a hyphen, parses a character class range.
1919
// For single characters, rr->lo == rr->hi.
1920
// Sets *s to span the remainder of the string.
1921
// Sets *rp to the character.
1922
bool Regexp::ParseState::ParseCCRange(absl::string_view* s, RuneRange* rr,
1923
                                      absl::string_view whole_class,
1924
73.9k
                                      RegexpStatus* status) {
1925
73.9k
  absl::string_view os = *s;
1926
73.9k
  if (!ParseCCCharacter(s, &rr->lo, whole_class, status))
1927
18
    return false;
1928
  // [a-] means (a|-), so check for final ].
1929
73.8k
  if (s->size() >= 2 && (*s)[0] == '-' && (*s)[1] != ']') {
1930
14.4k
    s->remove_prefix(1);  // '-'
1931
14.4k
    if (!ParseCCCharacter(s, &rr->hi, whole_class, status))
1932
1
      return false;
1933
14.4k
    if (rr->hi < rr->lo) {
1934
1
      status->set_code(kRegexpBadCharRange);
1935
1
      status->set_error_arg(absl::string_view(
1936
1
          os.data(), static_cast<size_t>(s->data() - os.data())));
1937
1
      return false;
1938
1
    }
1939
59.4k
  } else {
1940
59.4k
    rr->hi = rr->lo;
1941
59.4k
  }
1942
73.8k
  return true;
1943
73.8k
}
1944
1945
// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
1946
// Sets *s to span the remainder of the string.
1947
// Sets *out_re to the regexp for the class.
1948
bool Regexp::ParseState::ParseCharClass(absl::string_view* s, Regexp** out_re,
1949
26.9k
                                        RegexpStatus* status) {
1950
26.9k
  absl::string_view whole_class = *s;
1951
26.9k
  if (s->empty() || (*s)[0] != '[') {
1952
    // Caller checked this.
1953
0
    status->set_code(kRegexpInternalError);
1954
0
    status->set_error_arg(absl::string_view());
1955
0
    return false;
1956
0
  }
1957
26.9k
  bool negated = false;
1958
26.9k
  Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);
1959
26.9k
  re->ccb_ = new CharClassBuilder;
1960
26.9k
  s->remove_prefix(1);  // '['
1961
26.9k
  if (!s->empty() && (*s)[0] == '^') {
1962
4.54k
    s->remove_prefix(1);  // '^'
1963
4.54k
    negated = true;
1964
4.54k
    if (!(flags_ & ClassNL) || (flags_ & NeverNL)) {
1965
      // If NL can't match implicitly, then pretend
1966
      // negated classes include a leading \n.
1967
3.29k
      re->ccb_->AddRange('\n', '\n');
1968
3.29k
    }
1969
4.54k
  }
1970
26.9k
  bool first = true;  // ] is okay as first char in class
1971
102k
  while (!s->empty() && ((*s)[0] != ']' || first)) {
1972
    // - is only okay unescaped as first or last in class.
1973
    // Except that Perl allows - anywhere.
1974
75.1k
    if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&
1975
75.1k
        (s->size() == 1 || (*s)[1] != ']')) {
1976
37
      absl::string_view t = *s;
1977
37
      t.remove_prefix(1);  // '-'
1978
37
      Rune r;
1979
37
      int n = StringViewToRune(&r, &t, status);
1980
37
      if (n < 0) {
1981
5
        re->Decref();
1982
5
        return false;
1983
5
      }
1984
32
      status->set_code(kRegexpBadCharRange);
1985
32
      status->set_error_arg(absl::string_view(s->data(), 1+n));
1986
32
      re->Decref();
1987
32
      return false;
1988
37
    }
1989
75.1k
    first = false;
1990
1991
    // Look for [:alnum:] etc.
1992
75.1k
    if (s->size() > 2 && (*s)[0] == '[' && (*s)[1] == ':') {
1993
1.30k
      switch (ParseCCName(s, flags_, re->ccb_, status)) {
1994
989
        case kParseOk:
1995
989
          continue;
1996
6
        case kParseError:
1997
6
          re->Decref();
1998
6
          return false;
1999
310
        case kParseNothing:
2000
310
          break;
2001
1.30k
      }
2002
1.30k
    }
2003
2004
    // Look for Unicode character group like \p{Han}
2005
74.1k
    if (s->size() > 2 &&
2006
74.1k
        (*s)[0] == '\\' &&
2007
74.1k
        ((*s)[1] == 'p' || (*s)[1] == 'P')) {
2008
50
      switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {
2009
47
        case kParseOk:
2010
47
          continue;
2011
2
        case kParseError:
2012
2
          re->Decref();
2013
2
          return false;
2014
1
        case kParseNothing:
2015
1
          break;
2016
50
      }
2017
50
    }
2018
2019
    // Look for Perl character class symbols (extension).
2020
74.0k
    const UGroup* g = MaybeParsePerlCCEscape(s, flags_);
2021
74.0k
    if (g != NULL) {
2022
148
      AddUGroup(re->ccb_, g, g->sign, flags_);
2023
148
      continue;
2024
148
    }
2025
2026
    // Otherwise assume single character or simple range.
2027
73.9k
    RuneRange rr;
2028
73.9k
    if (!ParseCCRange(s, &rr, whole_class, status)) {
2029
20
      re->Decref();
2030
20
      return false;
2031
20
    }
2032
    // AddRangeFlags is usually called in response to a class like
2033
    // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
2034
    // Regexp::ClassNL is set.  In an explicit range or singleton
2035
    // like we just parsed, we do not filter \n out, so set ClassNL
2036
    // in the flags.
2037
73.8k
    re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ | Regexp::ClassNL);
2038
73.8k
  }
2039
26.9k
  if (s->empty()) {
2040
312
    status->set_code(kRegexpMissingBracket);
2041
312
    status->set_error_arg(whole_class);
2042
312
    re->Decref();
2043
312
    return false;
2044
312
  }
2045
26.6k
  s->remove_prefix(1);  // ']'
2046
2047
26.6k
  if (negated)
2048
4.53k
    re->ccb_->Negate();
2049
2050
26.6k
  *out_re = re;
2051
26.6k
  return true;
2052
26.9k
}
2053
2054
// Returns whether name is a valid capture name.
2055
9.32k
static bool IsValidCaptureName(absl::string_view name) {
2056
9.32k
  if (name.empty())
2057
1
    return false;
2058
2059
  // Historically, we effectively used [0-9A-Za-z_]+ to validate; that
2060
  // followed Python 2 except for not restricting the first character.
2061
  // As of Python 3, Unicode characters beyond ASCII are also allowed;
2062
  // accordingly, we permit the Lu, Ll, Lt, Lm, Lo, Nl, Mn, Mc, Nd and
2063
  // Pc categories, but again without restricting the first character.
2064
  // Also, Unicode normalization (e.g. NFKC) isn't performed: Python 3
2065
  // performs it for identifiers, but seemingly not for capture names;
2066
  // if they start doing that for capture names, we won't follow suit.
2067
9.32k
  static const CharClass* const cc = []() {
2068
1
    CharClassBuilder ccb;
2069
1
    for (absl::string_view group :
2070
1
         {"Lu", "Ll", "Lt", "Lm", "Lo", "Nl", "Mn", "Mc", "Nd", "Pc"})
2071
10
      AddUGroup(&ccb, LookupGroup(group, unicode_groups, num_unicode_groups),
2072
10
                +1, Regexp::NoParseFlags);
2073
1
    return ccb.GetCharClass();
2074
1
  }();
2075
2076
9.32k
  absl::string_view t = name;
2077
9.32k
  Rune r;
2078
51.8k
  while (!t.empty()) {
2079
42.6k
    if (StringViewToRune(&r, &t, NULL) < 0)
2080
0
      return false;
2081
42.6k
    if (cc->Contains(r))
2082
42.5k
      continue;
2083
33
    return false;
2084
42.6k
  }
2085
9.29k
  return true;
2086
9.32k
}
2087
2088
// Parses a Perl flag setting or non-capturing group or both,
2089
// like (?i) or (?: or (?i:.  Removes from s, updates parse state.
2090
// The caller must check that s begins with "(?".
2091
// Returns true on success.  If the Perl flag is not
2092
// well-formed or not supported, sets status_ and returns false.
2093
10.7k
bool Regexp::ParseState::ParsePerlFlags(absl::string_view* s) {
2094
10.7k
  absl::string_view t = *s;
2095
2096
  // Caller is supposed to check this.
2097
10.7k
  if (!(flags_ & PerlX) || t.size() < 2 || t[0] != '(' || t[1] != '?') {
2098
0
    status_->set_code(kRegexpInternalError);
2099
0
    ABSL_LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";
2100
0
    return false;
2101
0
  }
2102
2103
  // Check for look-around assertions. This is NOT because we support them! ;)
2104
  // As per https://github.com/google/re2/issues/468, we really want to report
2105
  // kRegexpBadPerlOp (not kRegexpBadNamedCapture) for look-behind assertions.
2106
  // Additionally, it would be nice to report not "(?<", but "(?<=" or "(?<!".
2107
10.7k
  if ((t.size() > 3 && (t[2] == '=' || t[2] == '!')) ||
2108
10.7k
      (t.size() > 4 && t[2] == '<' && (t[3] == '=' || t[3] == '!'))) {
2109
4
    status_->set_code(kRegexpBadPerlOp);
2110
4
    status_->set_error_arg(absl::string_view(t.data(), t[2] == '<' ? 4 : 3));
2111
4
    return false;
2112
4
  }
2113
2114
  // Check for named captures, first introduced in Python's regexp library.
2115
  // As usual, there are three slightly different syntaxes:
2116
  //
2117
  //   (?P<name>expr)   the original, introduced by Python
2118
  //   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
2119
  //   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
2120
  //
2121
  // Perl 5.10 gave in and implemented the Python version too,
2122
  // but they claim that the last two are the preferred forms.
2123
  // PCRE and languages based on it (specifically, PHP and Ruby)
2124
  // support all three as well.  EcmaScript 4 uses only the Python form.
2125
  //
2126
  // In both the open source world (via Code Search) and the
2127
  // Google source tree, (?P<name>expr) and (?<name>expr) are the
2128
  // dominant forms of named captures and both are supported.
2129
10.7k
  if ((t.size() > 4 && t[2] == 'P' && t[3] == '<') ||
2130
10.7k
      (t.size() > 3 && t[2] == '<')) {
2131
    // Pull out name.
2132
9.39k
    size_t begin = t[2] == 'P' ? 4 : 3;
2133
9.39k
    size_t end = t.find('>', begin);
2134
9.39k
    if (end == absl::string_view::npos) {
2135
63
      if (!IsValidUTF8(t, status_))
2136
4
        return false;
2137
59
      status_->set_code(kRegexpBadNamedCapture);
2138
59
      status_->set_error_arg(t);
2139
59
      return false;
2140
63
    }
2141
2142
9.33k
    absl::string_view capture(t.data(), end+1);
2143
9.33k
    absl::string_view name(t.data()+begin, end-begin);
2144
9.33k
    if (!IsValidUTF8(name, status_))
2145
3
      return false;
2146
9.32k
    if (!IsValidCaptureName(name)) {
2147
34
      status_->set_code(kRegexpBadNamedCapture);
2148
34
      status_->set_error_arg(capture);
2149
34
      return false;
2150
34
    }
2151
2152
9.29k
    if (!DoLeftParen(name)) {
2153
      // DoLeftParen's failure set status_.
2154
0
      return false;
2155
0
    }
2156
2157
9.29k
    s->remove_prefix(capture.size());
2158
9.29k
    return true;
2159
9.29k
  }
2160
2161
1.31k
  t.remove_prefix(2);  // "(?"
2162
2163
1.31k
  bool negated = false;
2164
1.31k
  bool sawflags = false;
2165
1.31k
  int nflags = flags_;
2166
1.31k
  Rune c;
2167
4.83k
  for (bool done = false; !done; ) {
2168
3.59k
    if (t.empty())
2169
22
      goto BadPerlOp;
2170
3.57k
    if (StringViewToRune(&c, &t, status_) < 0)
2171
3
      return false;
2172
3.56k
    switch (c) {
2173
44
      default:
2174
44
        goto BadPerlOp;
2175
2176
      // Parse flags.
2177
725
      case 'i':
2178
725
        sawflags = true;
2179
725
        if (negated)
2180
360
          nflags &= ~FoldCase;
2181
365
        else
2182
365
          nflags |= FoldCase;
2183
725
        break;
2184
2185
600
      case 'm':  // opposite of our OneLine
2186
600
        sawflags = true;
2187
600
        if (negated)
2188
374
          nflags |= OneLine;
2189
226
        else
2190
226
          nflags &= ~OneLine;
2191
600
        break;
2192
2193
100
      case 's':
2194
100
        sawflags = true;
2195
100
        if (negated)
2196
45
          nflags &= ~DotNL;
2197
55
        else
2198
55
          nflags |= DotNL;
2199
100
        break;
2200
2201
560
      case 'U':
2202
560
        sawflags = true;
2203
560
        if (negated)
2204
251
          nflags &= ~NonGreedy;
2205
309
        else
2206
309
          nflags |= NonGreedy;
2207
560
        break;
2208
2209
      // Negation
2210
294
      case '-':
2211
294
        if (negated)
2212
1
          goto BadPerlOp;
2213
293
        negated = true;
2214
293
        sawflags = false;
2215
293
        break;
2216
2217
      // Open new group.
2218
225
      case ':':
2219
225
        if (!DoLeftParenNoCapture()) {
2220
          // DoLeftParenNoCapture's failure set status_.
2221
0
          return false;
2222
0
        }
2223
225
        done = true;
2224
225
        break;
2225
2226
      // Finish flags.
2227
1.01k
      case ')':
2228
1.01k
        done = true;
2229
1.01k
        break;
2230
3.56k
    }
2231
3.56k
  }
2232
2233
1.24k
  if (negated && !sawflags)
2234
1
    goto BadPerlOp;
2235
2236
1.24k
  flags_ = static_cast<Regexp::ParseFlags>(nflags);
2237
1.24k
  *s = t;
2238
1.24k
  return true;
2239
2240
68
BadPerlOp:
2241
68
  status_->set_code(kRegexpBadPerlOp);
2242
68
  status_->set_error_arg(
2243
68
      absl::string_view(s->data(), static_cast<size_t>(t.data() - s->data())));
2244
68
  return false;
2245
1.24k
}
2246
2247
// Converts latin1 (assumed to be encoded as Latin1 bytes)
2248
// into UTF8 encoding in string.
2249
// Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2250
// deprecated and because it rejects code points 0x80-0x9F.
2251
16.1k
void ConvertLatin1ToUTF8(absl::string_view latin1, std::string* utf) {
2252
16.1k
  char buf[UTFmax];
2253
2254
16.1k
  utf->clear();
2255
1.20M
  for (size_t i = 0; i < latin1.size(); i++) {
2256
1.18M
    Rune r = latin1[i] & 0xFF;
2257
1.18M
    int n = runetochar(buf, &r);
2258
1.18M
    utf->append(buf, n);
2259
1.18M
  }
2260
16.1k
}
2261
2262
// Parses the regular expression given by s,
2263
// returning the corresponding Regexp tree.
2264
// The caller must Decref the return value when done with it.
2265
// Returns NULL on error.
2266
Regexp* Regexp::Parse(absl::string_view s, ParseFlags global_flags,
2267
32.0k
                      RegexpStatus* status) {
2268
  // Make status non-NULL (easier on everyone else).
2269
32.0k
  RegexpStatus xstatus;
2270
32.0k
  if (status == NULL)
2271
0
    status = &xstatus;
2272
2273
32.0k
  ParseState ps(global_flags, s, status);
2274
32.0k
  absl::string_view t = s;
2275
2276
  // Convert regexp to UTF-8 (easier on the rest of the parser).
2277
32.0k
  if (global_flags & Latin1) {
2278
16.1k
    std::string* tmp = new std::string;
2279
16.1k
    ConvertLatin1ToUTF8(t, tmp);
2280
16.1k
    status->set_tmp(tmp);
2281
16.1k
    t = *tmp;
2282
16.1k
  }
2283
2284
32.0k
  if (global_flags & Literal) {
2285
    // Special parse loop for literal string.
2286
17.2k
    while (!t.empty()) {
2287
15.9k
      Rune r;
2288
15.9k
      if (StringViewToRune(&r, &t, status) < 0)
2289
33
        return NULL;
2290
15.9k
      if (!ps.PushLiteral(r))
2291
0
        return NULL;
2292
15.9k
    }
2293
1.31k
    return ps.DoFinish();
2294
1.34k
  }
2295
2296
30.7k
  absl::string_view lastunary = absl::string_view();
2297
1.11M
  while (!t.empty()) {
2298
1.08M
    absl::string_view isunary = absl::string_view();
2299
1.08M
    switch (t[0]) {
2300
514k
      default: {
2301
514k
        Rune r;
2302
514k
        if (StringViewToRune(&r, &t, status) < 0)
2303
100
          return NULL;
2304
514k
        if (!ps.PushLiteral(r))
2305
0
          return NULL;
2306
514k
        break;
2307
514k
      }
2308
2309
514k
      case '(':
2310
        // "(?" introduces Perl escape.
2311
87.9k
        if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {
2312
          // Flag changes and non-capturing groups.
2313
10.7k
          if (!ps.ParsePerlFlags(&t))
2314
175
            return NULL;
2315
10.5k
          break;
2316
10.7k
        }
2317
77.2k
        if (ps.flags() & NeverCapture) {
2318
38.0k
          if (!ps.DoLeftParenNoCapture())
2319
0
            return NULL;
2320
39.2k
        } else {
2321
39.2k
          if (!ps.DoLeftParen(absl::string_view()))
2322
0
            return NULL;
2323
39.2k
        }
2324
77.2k
        t.remove_prefix(1);  // '('
2325
77.2k
        break;
2326
2327
114k
      case '|':
2328
114k
        if (!ps.DoVerticalBar())
2329
0
          return NULL;
2330
114k
        t.remove_prefix(1);  // '|'
2331
114k
        break;
2332
2333
85.8k
      case ')':
2334
85.8k
        if (!ps.DoRightParen())
2335
58
          return NULL;
2336
85.8k
        t.remove_prefix(1);  // ')'
2337
85.8k
        break;
2338
2339
21.0k
      case '^':  // Beginning of line.
2340
21.0k
        if (!ps.PushCaret())
2341
0
          return NULL;
2342
21.0k
        t.remove_prefix(1);  // '^'
2343
21.0k
        break;
2344
2345
39.5k
      case '$':  // End of line.
2346
39.5k
        if (!ps.PushDollar())
2347
0
          return NULL;
2348
39.5k
        t.remove_prefix(1);  // '$'
2349
39.5k
        break;
2350
2351
13.3k
      case '.':  // Any character (possibly except newline).
2352
13.3k
        if (!ps.PushDot())
2353
0
          return NULL;
2354
13.3k
        t.remove_prefix(1);  // '.'
2355
13.3k
        break;
2356
2357
26.9k
      case '[': {  // Character class.
2358
26.9k
        Regexp* re;
2359
26.9k
        if (!ps.ParseCharClass(&t, &re, status))
2360
377
          return NULL;
2361
26.6k
        if (!ps.PushRegexp(re))
2362
0
          return NULL;
2363
26.6k
        break;
2364
26.6k
      }
2365
2366
41.4k
      case '*': {  // Zero or more.
2367
41.4k
        RegexpOp op;
2368
41.4k
        op = kRegexpStar;
2369
41.4k
        goto Rep;
2370
19.1k
      case '+':  // One or more.
2371
19.1k
        op = kRegexpPlus;
2372
19.1k
        goto Rep;
2373
43.3k
      case '?':  // Zero or one.
2374
43.3k
        op = kRegexpQuest;
2375
43.3k
        goto Rep;
2376
103k
      Rep:
2377
103k
        absl::string_view opstr = t;
2378
103k
        bool nongreedy = false;
2379
103k
        t.remove_prefix(1);  // '*' or '+' or '?'
2380
103k
        if (ps.flags() & PerlX) {
2381
22.5k
          if (!t.empty() && t[0] == '?') {
2382
2.13k
            nongreedy = true;
2383
2.13k
            t.remove_prefix(1);  // '?'
2384
2.13k
          }
2385
22.5k
          if (!lastunary.empty()) {
2386
            // In Perl it is not allowed to stack repetition operators:
2387
            //   a** is a syntax error, not a double-star.
2388
            // (and a++ means something else entirely, which we don't support!)
2389
5
            status->set_code(kRegexpRepeatOp);
2390
5
            status->set_error_arg(absl::string_view(
2391
5
                lastunary.data(),
2392
5
                static_cast<size_t>(t.data() - lastunary.data())));
2393
5
            return NULL;
2394
5
          }
2395
22.5k
        }
2396
103k
        opstr = absl::string_view(opstr.data(),
2397
103k
                                  static_cast<size_t>(t.data() - opstr.data()));
2398
103k
        if (!ps.PushRepeatOp(op, opstr, nongreedy))
2399
15
          return NULL;
2400
103k
        isunary = opstr;
2401
103k
        break;
2402
103k
      }
2403
2404
58.1k
      case '{': {  // Counted repetition.
2405
58.1k
        int lo, hi;
2406
58.1k
        absl::string_view opstr = t;
2407
58.1k
        if (!MaybeParseRepetition(&t, &lo, &hi)) {
2408
          // Treat like a literal.
2409
9.10k
          if (!ps.PushLiteral('{'))
2410
0
            return NULL;
2411
9.10k
          t.remove_prefix(1);  // '{'
2412
9.10k
          break;
2413
9.10k
        }
2414
49.0k
        bool nongreedy = false;
2415
49.0k
        if (ps.flags() & PerlX) {
2416
28.5k
          if (!t.empty() && t[0] == '?') {
2417
314
            nongreedy = true;
2418
314
            t.remove_prefix(1);  // '?'
2419
314
          }
2420
28.5k
          if (!lastunary.empty()) {
2421
            // Not allowed to stack repetition operators.
2422
2
            status->set_code(kRegexpRepeatOp);
2423
2
            status->set_error_arg(absl::string_view(
2424
2
                lastunary.data(),
2425
2
                static_cast<size_t>(t.data() - lastunary.data())));
2426
2
            return NULL;
2427
2
          }
2428
28.5k
        }
2429
48.9k
        opstr = absl::string_view(opstr.data(),
2430
48.9k
                                  static_cast<size_t>(t.data() - opstr.data()));
2431
48.9k
        if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
2432
67
          return NULL;
2433
48.9k
        isunary = opstr;
2434
48.9k
        break;
2435
48.9k
      }
2436
2437
16.4k
      case '\\': {  // Escaped character or Perl sequence.
2438
        // \b and \B: word boundary or not
2439
16.4k
        if ((ps.flags() & Regexp::PerlB) &&
2440
16.4k
            t.size() >= 2 && (t[1] == 'b' || t[1] == 'B')) {
2441
3.14k
          if (!ps.PushWordBoundary(t[1] == 'b'))
2442
0
            return NULL;
2443
3.14k
          t.remove_prefix(2);  // '\\', 'b'
2444
3.14k
          break;
2445
3.14k
        }
2446
2447
13.3k
        if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {
2448
11.8k
          if (t[1] == 'A') {
2449
207
            if (!ps.PushSimpleOp(kRegexpBeginText))
2450
0
              return NULL;
2451
207
            t.remove_prefix(2);  // '\\', 'A'
2452
207
            break;
2453
207
          }
2454
11.6k
          if (t[1] == 'z') {
2455
765
            if (!ps.PushSimpleOp(kRegexpEndText))
2456
0
              return NULL;
2457
765
            t.remove_prefix(2);  // '\\', 'z'
2458
765
            break;
2459
765
          }
2460
          // Do not recognize \Z, because this library can't
2461
          // implement the exact Perl/PCRE semantics.
2462
          // (This library treats "(?-m)$" as \z, even though
2463
          // in Perl and PCRE it is equivalent to \Z.)
2464
2465
10.8k
          if (t[1] == 'C') {  // \C: any byte [sic]
2466
2.23k
            if (!ps.PushSimpleOp(kRegexpAnyByte))
2467
0
              return NULL;
2468
2.23k
            t.remove_prefix(2);  // '\\', 'C'
2469
2.23k
            break;
2470
2.23k
          }
2471
2472
8.62k
          if (t[1] == 'Q') {  // \Q ... \E: the ... is always literals
2473
423
            t.remove_prefix(2);  // '\\', 'Q'
2474
3.59k
            while (!t.empty()) {
2475
3.40k
              if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {
2476
225
                t.remove_prefix(2);  // '\\', 'E'
2477
225
                break;
2478
225
              }
2479
3.17k
              Rune r;
2480
3.17k
              if (StringViewToRune(&r, &t, status) < 0)
2481
9
                return NULL;
2482
3.16k
              if (!ps.PushLiteral(r))
2483
0
                return NULL;
2484
3.16k
            }
2485
414
            break;
2486
423
          }
2487
8.62k
        }
2488
2489
9.72k
        if (t.size() >= 2 && (t[1] == 'p' || t[1] == 'P')) {
2490
4.39k
          Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2491
4.39k
          re->ccb_ = new CharClassBuilder;
2492
4.39k
          switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {
2493
4.22k
            case kParseOk:
2494
4.22k
              if (!ps.PushRegexp(re))
2495
0
                return NULL;
2496
4.22k
              goto Break2;
2497
4.22k
            case kParseError:
2498
177
              re->Decref();
2499
177
              return NULL;
2500
1
            case kParseNothing:
2501
1
              re->Decref();
2502
1
              break;
2503
4.39k
          }
2504
4.39k
        }
2505
2506
5.32k
        const UGroup* g = MaybeParsePerlCCEscape(&t, ps.flags());
2507
5.32k
        if (g != NULL) {
2508
1.81k
          Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
2509
1.81k
          re->ccb_ = new CharClassBuilder;
2510
1.81k
          AddUGroup(re->ccb_, g, g->sign, ps.flags());
2511
1.81k
          if (!ps.PushRegexp(re))
2512
0
            return NULL;
2513
1.81k
          break;
2514
1.81k
        }
2515
2516
3.50k
        Rune r;
2517
3.50k
        if (!ParseEscape(&t, &r, status, ps.rune_max()))
2518
313
          return NULL;
2519
3.19k
        if (!ps.PushLiteral(r))
2520
0
          return NULL;
2521
3.19k
        break;
2522
3.19k
      }
2523
1.08M
    }
2524
1.08M
  Break2:
2525
1.08M
    lastunary = isunary;
2526
1.08M
  }
2527
29.4k
  return ps.DoFinish();
2528
30.7k
}
2529
2530
}  // namespace re2