Coverage Report

Created: 2026-05-27 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/external/re2~/re2/regexp.h
Line
Count
Source
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
#ifndef RE2_REGEXP_H_
6
#define RE2_REGEXP_H_
7
8
// --- SPONSORED LINK --------------------------------------------------
9
// If you want to use this library for regular expression matching,
10
// you should use re2/re2.h, which provides a class RE2 that
11
// mimics the PCRE interface provided by PCRE's C++ wrappers.
12
// This header describes the low-level interface used to implement RE2
13
// and may change in backwards-incompatible ways from time to time.
14
// In contrast, RE2's interface will not.
15
// ---------------------------------------------------------------------
16
17
// Regular expression library: parsing, execution, and manipulation
18
// of regular expressions.
19
//
20
// Any operation that traverses the Regexp structures should be written
21
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
22
// regular expressions such as x++++++++++++++++++++... might cause recursive
23
// traversals to overflow the stack.
24
//
25
// It is the caller's responsibility to provide appropriate mutual exclusion
26
// around manipulation of the regexps.  RE2 does this.
27
//
28
// PARSING
29
//
30
// Regexp::Parse parses regular expressions encoded in UTF-8.
31
// The default syntax is POSIX extended regular expressions,
32
// with the following changes:
33
//
34
//   1.  Backreferences (optional in POSIX EREs) are not supported.
35
//         (Supporting them precludes the use of DFA-based
36
//          matching engines.)
37
//
38
//   2.  Collating elements and collation classes are not supported.
39
//         (No one has needed or wanted them.)
40
//
41
// The exact syntax accepted can be modified by passing flags to
42
// Regexp::Parse.  In particular, many of the basic Perl additions
43
// are available.  The flags are documented below (search for LikePerl).
44
//
45
// If parsed with the flag Regexp::Latin1, both the regular expression
46
// and the input to the matching routines are assumed to be encoded in
47
// Latin-1, not UTF-8.
48
//
49
// EXECUTION
50
//
51
// Once Regexp has parsed a regular expression, it provides methods
52
// to search text using that regular expression.  These methods are
53
// implemented via calling out to other regular expression libraries.
54
// (Let's call them the sublibraries.)
55
//
56
// To call a sublibrary, Regexp does not simply prepare a
57
// string version of the regular expression and hand it to the
58
// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
59
// corresponding internal representation used by the sublibrary.
60
// This has the drawback of needing to know the internal representation
61
// used by the sublibrary, but it has two important benefits:
62
//
63
//   1. The syntax and meaning of regular expressions is guaranteed
64
//      to be that used by Regexp's parser, not the syntax expected
65
//      by the sublibrary.  Regexp might accept a restricted or
66
//      expanded syntax for regular expressions as compared with
67
//      the sublibrary.  As long as Regexp can translate from its
68
//      internal form into the sublibrary's, clients need not know
69
//      exactly which sublibrary they are using.
70
//
71
//   2. The sublibrary parsers are bypassed.  For whatever reason,
72
//      sublibrary regular expression parsers often have security
73
//      problems.  For example, plan9grep's regular expression parser
74
//      has a buffer overflow in its handling of large character
75
//      classes, and PCRE's parser has had buffer overflow problems
76
//      in the past.  Security-team requires sandboxing of sublibrary
77
//      regular expression parsers.  Avoiding the sublibrary parsers
78
//      avoids the sandbox.
79
//
80
// The execution methods we use now are provided by the compiled form,
81
// Prog, described in prog.h
82
//
83
// MANIPULATION
84
//
85
// Unlike other regular expression libraries, Regexp makes its parsed
86
// form accessible to clients, so that client code can analyze the
87
// parsed regular expressions.
88
89
#include <stddef.h>
90
#include <stdint.h>
91
92
#include <map>
93
#include <set>
94
#include <string>
95
96
#include "absl/log/absl_check.h"
97
#include "absl/log/absl_log.h"
98
#include "absl/strings/string_view.h"
99
#include "util/utf.h"
100
101
namespace re2 {
102
103
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
104
enum RegexpOp {
105
  // Matches no strings.
106
  kRegexpNoMatch = 1,
107
108
  // Matches empty string.
109
  kRegexpEmptyMatch,
110
111
  // Matches rune_.
112
  kRegexpLiteral,
113
114
  // Matches runes_.
115
  kRegexpLiteralString,
116
117
  // Matches concatenation of sub_[0..nsub-1].
118
  kRegexpConcat,
119
  // Matches union of sub_[0..nsub-1].
120
  kRegexpAlternate,
121
122
  // Matches sub_[0] zero or more times.
123
  kRegexpStar,
124
  // Matches sub_[0] one or more times.
125
  kRegexpPlus,
126
  // Matches sub_[0] zero or one times.
127
  kRegexpQuest,
128
129
  // Matches sub_[0] at least min_ times, at most max_ times.
130
  // max_ == -1 means no upper limit.
131
  kRegexpRepeat,
132
133
  // Parenthesized (capturing) subexpression.  Index is cap_.
134
  // Optionally, capturing name is name_.
135
  kRegexpCapture,
136
137
  // Matches any character.
138
  kRegexpAnyChar,
139
140
  // Matches any byte [sic].
141
  kRegexpAnyByte,
142
143
  // Matches empty string at beginning of line.
144
  kRegexpBeginLine,
145
  // Matches empty string at end of line.
146
  kRegexpEndLine,
147
148
  // Matches word boundary "\b".
149
  kRegexpWordBoundary,
150
  // Matches not-a-word boundary "\B".
151
  kRegexpNoWordBoundary,
152
153
  // Matches empty string at beginning of text.
154
  kRegexpBeginText,
155
  // Matches empty string at end of text.
156
  kRegexpEndText,
157
158
  // Matches character class given by cc_.
159
  kRegexpCharClass,
160
161
  // Forces match of entire expression right now,
162
  // with match ID match_id_ (used by RE2::Set).
163
  kRegexpHaveMatch,
164
165
  kMaxRegexpOp = kRegexpHaveMatch,
166
};
167
168
// Keep in sync with string list in regexp.cc
169
enum RegexpStatusCode {
170
  // No error
171
  kRegexpSuccess = 0,
172
173
  // Unexpected error
174
  kRegexpInternalError,
175
176
  // Parse errors
177
  kRegexpBadEscape,          // bad escape sequence
178
  kRegexpBadCharClass,       // bad character class
179
  kRegexpBadCharRange,       // bad character class range
180
  kRegexpMissingBracket,     // missing closing ]
181
  kRegexpMissingParen,       // missing closing )
182
  kRegexpUnexpectedParen,    // unexpected closing )
183
  kRegexpTrailingBackslash,  // at end of regexp
184
  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
185
  kRegexpRepeatSize,         // bad repetition argument
186
  kRegexpRepeatOp,           // bad repetition operator
187
  kRegexpBadPerlOp,          // bad perl operator
188
  kRegexpBadUTF8,            // invalid UTF-8 in regexp
189
  kRegexpBadNamedCapture,    // bad named capture
190
};
191
192
// Error status for certain operations.
193
class RegexpStatus {
194
 public:
195
0
  RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
196
0
  ~RegexpStatus() { delete tmp_; }
197
198
0
  void set_code(RegexpStatusCode code) { code_ = code; }
199
0
  void set_error_arg(absl::string_view error_arg) { error_arg_ = error_arg; }
200
0
  void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
201
0
  RegexpStatusCode code() const { return code_; }
202
0
  absl::string_view error_arg() const { return error_arg_; }
203
0
  bool ok() const { return code() == kRegexpSuccess; }
204
205
  // Copies state from status.
206
  void Copy(const RegexpStatus& status);
207
208
  // Returns text equivalent of code, e.g.:
209
  //   "Bad character class"
210
  static std::string CodeText(RegexpStatusCode code);
211
212
  // Returns text describing error, e.g.:
213
  //   "Bad character class: [z-a]"
214
  std::string Text() const;
215
216
 private:
217
  RegexpStatusCode code_;        // Kind of error.
218
  absl::string_view error_arg_;  // Piece of regexp containing syntax error.
219
  std::string* tmp_;             // Temporary storage, possibly for error_arg_.
220
221
  RegexpStatus(const RegexpStatus&) = delete;
222
  RegexpStatus& operator=(const RegexpStatus&) = delete;
223
};
224
225
// Compiled form; see prog.h
226
class Prog;
227
228
struct RuneRange {
229
0
  RuneRange() : lo(0), hi(0) { }
230
0
  RuneRange(int l, int h) : lo(l), hi(h) { }
231
  Rune lo;
232
  Rune hi;
233
};
234
235
// Less-than on RuneRanges treats a == b if they overlap at all.
236
// This lets us look in a set to find the range covering a particular Rune.
237
struct RuneRangeLess {
238
0
  bool operator()(const RuneRange& a, const RuneRange& b) const {
239
0
    return a.hi < b.lo;
240
0
  }
241
};
242
243
class CharClassBuilder;
244
245
class CharClass {
246
 public:
247
  void Delete();
248
249
  typedef RuneRange* iterator;
250
0
  iterator begin() { return ranges_; }
251
0
  iterator end() { return ranges_ + nranges_; }
252
253
0
  int size() { return nrunes_; }
254
0
  bool empty() { return nrunes_ == 0; }
255
0
  bool full() { return nrunes_ == Runemax+1; }
256
0
  bool FoldsASCII() { return folds_ascii_; }
257
258
  bool Contains(Rune r) const;
259
  CharClass* Negate();
260
261
 private:
262
  CharClass();  // not implemented
263
  ~CharClass();  // not implemented
264
  static CharClass* New(size_t maxranges);
265
266
  friend class CharClassBuilder;
267
268
  bool folds_ascii_;
269
  int nrunes_;
270
  RuneRange *ranges_;
271
  int nranges_;
272
273
  CharClass(const CharClass&) = delete;
274
  CharClass& operator=(const CharClass&) = delete;
275
};
276
277
class Regexp {
278
 public:
279
280
  // Flags for parsing.  Can be ORed together.
281
  enum ParseFlags {
282
    NoParseFlags  = 0,
283
    FoldCase      = 1<<0,   // Fold case during matching (case-insensitive).
284
    Literal       = 1<<1,   // Treat s as literal string instead of a regexp.
285
    ClassNL       = 1<<2,   // Allow char classes like [^a-z] and \D and \s
286
                            // and [[:space:]] to match newline.
287
    DotNL         = 1<<3,   // Allow . to match newline.
288
    MatchNL       = ClassNL | DotNL,
289
    OneLine       = 1<<4,   // Treat ^ and $ as only matching at beginning and
290
                            // end of text, not around embedded newlines.
291
                            // (Perl's default)
292
    Latin1        = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
293
    NonGreedy     = 1<<6,   // Repetition operators are non-greedy by default.
294
    PerlClasses   = 1<<7,   // Allow Perl character classes like \d.
295
    PerlB         = 1<<8,   // Allow Perl's \b and \B.
296
    PerlX         = 1<<9,   // Perl extensions:
297
                            //   non-capturing parens - (?: )
298
                            //   non-greedy operators - *? +? ?? {}?
299
                            //   flag edits - (?i) (?-i) (?i: )
300
                            //     i - FoldCase
301
                            //     m - !OneLine
302
                            //     s - DotNL
303
                            //     U - NonGreedy
304
                            //   line ends: \A \z
305
                            //   \Q and \E to disable/enable metacharacters
306
                            //   (?P<name>expr) for named captures
307
                            //   \C to match any single byte
308
    UnicodeGroups = 1<<10,  // Allow \p{Han} for Unicode Han group
309
                            //   and \P{Han} for its negation.
310
    NeverNL       = 1<<11,  // Never match NL, even if the regexp mentions
311
                            //   it explicitly.
312
    NeverCapture  = 1<<12,  // Parse all parens as non-capturing.
313
314
    // As close to Perl as we can get.
315
    LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
316
                    UnicodeGroups,
317
318
    // Internal use only.
319
    WasDollar     = 1<<13,  // on kRegexpEndText: was $ in regexp text
320
    AllParseFlags = (1<<14)-1,
321
  };
322
323
  // Get.  No set, Regexps are logically immutable once created.
324
0
  RegexpOp op() { return static_cast<RegexpOp>(op_); }
325
0
  int nsub() { return nsub_; }
326
0
  bool simple() { return simple_ != 0; }
327
0
  ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
328
  int Ref();  // For testing.
329
330
0
  Regexp** sub() {
331
0
    if(nsub_ <= 1)
332
0
      return &subone_;
333
0
    else
334
0
      return submany_;
335
0
  }
336
337
0
  int min() {
338
0
    ABSL_DCHECK_EQ(op_, kRegexpRepeat);
339
0
    return min_;
340
0
  }
341
0
  int max() {
342
0
    ABSL_DCHECK_EQ(op_, kRegexpRepeat);
343
0
    return max_;
344
0
  }
345
0
  Rune rune() {
346
0
    ABSL_DCHECK_EQ(op_, kRegexpLiteral);
347
0
    return rune_;
348
0
  }
349
0
  CharClass* cc() {
350
0
    ABSL_DCHECK_EQ(op_, kRegexpCharClass);
351
0
    return cc_;
352
0
  }
353
0
  int cap() {
354
0
    ABSL_DCHECK_EQ(op_, kRegexpCapture);
355
0
    return cap_;
356
0
  }
357
0
  const std::string* name() {
358
0
    ABSL_DCHECK_EQ(op_, kRegexpCapture);
359
0
    return name_;
360
0
  }
361
0
  Rune* runes() {
362
0
    ABSL_DCHECK_EQ(op_, kRegexpLiteralString);
363
0
    return runes_;
364
0
  }
365
0
  int nrunes() {
366
0
    ABSL_DCHECK_EQ(op_, kRegexpLiteralString);
367
0
    return nrunes_;
368
0
  }
369
0
  int match_id() {
370
0
    ABSL_DCHECK_EQ(op_, kRegexpHaveMatch);
371
0
    return match_id_;
372
0
  }
373
374
  // Increments reference count, returns object as convenience.
375
  Regexp* Incref();
376
377
  // Decrements reference count and deletes this object if count reaches 0.
378
  void Decref();
379
380
  // Parses string s to produce regular expression, returned.
381
  // Caller must release return value with re->Decref().
382
  // On failure, sets *status (if status != NULL) and returns NULL.
383
  static Regexp* Parse(absl::string_view s, ParseFlags flags,
384
                       RegexpStatus* status);
385
386
  // Returns a _new_ simplified version of the current regexp.
387
  // Does not edit the current regexp.
388
  // Caller must release return value with re->Decref().
389
  // Simplified means that counted repetition has been rewritten
390
  // into simpler terms and all Perl/POSIX features have been
391
  // removed.  The result will capture exactly the same
392
  // subexpressions the original did, unless formatted with ToString.
393
  Regexp* Simplify();
394
  friend class CoalesceWalker;
395
  friend class SimplifyWalker;
396
397
  // Parses the regexp src and then simplifies it and sets *dst to the
398
  // string representation of the simplified form.  Returns true on success.
399
  // Returns false and sets *status (if status != NULL) on parse error.
400
  static bool SimplifyRegexp(absl::string_view src, ParseFlags flags,
401
                             std::string* dst, RegexpStatus* status);
402
403
  // Returns the number of capturing groups in the regexp.
404
  int NumCaptures();
405
  friend class NumCapturesWalker;
406
407
  // Returns a map from names to capturing group indices,
408
  // or NULL if the regexp contains no named capture groups.
409
  // The caller is responsible for deleting the map.
410
  std::map<std::string, int>* NamedCaptures();
411
412
  // Returns a map from capturing group indices to capturing group
413
  // names or NULL if the regexp contains no named capture groups. The
414
  // caller is responsible for deleting the map.
415
  std::map<int, std::string>* CaptureNames();
416
417
  // Returns a string representation of the current regexp,
418
  // using as few parentheses as possible.
419
  std::string ToString();
420
421
  // Convenience functions.  They consume the passed reference,
422
  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
423
  // They do not consume allocated arrays like subs or runes.
424
  static Regexp* Plus(Regexp* sub, ParseFlags flags);
425
  static Regexp* Star(Regexp* sub, ParseFlags flags);
426
  static Regexp* Quest(Regexp* sub, ParseFlags flags);
427
  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
428
  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
429
  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
430
  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
431
  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
432
  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
433
  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
434
  static Regexp* HaveMatch(int match_id, ParseFlags flags);
435
436
  // Like Alternate but does not factor out common prefixes.
437
  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
438
439
  // Debugging function.  Returns string format for regexp
440
  // that makes structure clear.  Does NOT use regexp syntax.
441
  std::string Dump();
442
443
  // Helper traversal class, defined fully in walker-inl.h.
444
  template<typename T> class Walker;
445
446
  // Compile to Prog.  See prog.h
447
  // Reverse prog expects to be run over text backward.
448
  // Construction and execution of prog will
449
  // stay within approximately max_mem bytes of memory.
450
  // If max_mem <= 0, a reasonable default is used.
451
  Prog* CompileToProg(int64_t max_mem);
452
  Prog* CompileToReverseProg(int64_t max_mem);
453
454
  // Whether to expect this library to find exactly the same answer as PCRE
455
  // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
456
  // obscure cases behave differently.  Technically this is more a property
457
  // of the Prog than the Regexp, but the computation is much easier to do
458
  // on the Regexp.  See mimics_pcre.cc for the exact conditions.
459
  bool MimicsPCRE();
460
461
  // Benchmarking function.
462
  void NullWalk();
463
464
  // Whether every match of this regexp must be anchored and
465
  // begin with a non-empty fixed string (perhaps after ASCII
466
  // case-folding).  If so, returns the prefix and the sub-regexp that
467
  // follows it.
468
  // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
469
  // regardless of the return value.
470
  bool RequiredPrefix(std::string* prefix, bool* foldcase,
471
                      Regexp** suffix);
472
473
  // Whether every match of this regexp must be unanchored and
474
  // begin with a non-empty fixed string (perhaps after ASCII
475
  // case-folding).  If so, returns the prefix.
476
  // Callers should expect *prefix and *foldcase to be "zeroed"
477
  // regardless of the return value.
478
  bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
479
480
  // Controls the maximum repeat count permitted by the parser.
481
  // FOR FUZZING ONLY.
482
  static void FUZZING_ONLY_set_maximum_repeat_count(int i);
483
484
 private:
485
  // Constructor allocates vectors as appropriate for operator.
486
  explicit Regexp(RegexpOp op, ParseFlags parse_flags);
487
488
  // Use Decref() instead of delete to release Regexps.
489
  // This is private to catch deletes at compile time.
490
  ~Regexp();
491
  void Destroy();
492
  bool QuickDestroy();
493
494
  // Helpers for Parse.  Listed here so they can edit Regexps.
495
  class ParseState;
496
497
  friend class ParseState;
498
  friend bool ParseCharClass(absl::string_view* s, Regexp** out_re,
499
                             RegexpStatus* status);
500
501
  // Helper for testing [sic].
502
  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
503
504
  // Computes whether Regexp is already simple.
505
  bool ComputeSimple();
506
507
  // Constructor that generates a Star, Plus or Quest,
508
  // squashing the pair if sub is also a Star, Plus or Quest.
509
  static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
510
511
  // Constructor that generates a concatenation or alternation,
512
  // enforcing the limit on the number of subexpressions for
513
  // a particular Regexp.
514
  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
515
                                   ParseFlags flags, bool can_factor);
516
517
  // Returns the leading string that re starts with.
518
  // The returned Rune* points into a piece of re,
519
  // so it must not be used after the caller calls re->Decref().
520
  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
521
522
  // Removes the first n leading runes from the beginning of re.
523
  // Edits re in place.
524
  static void RemoveLeadingString(Regexp* re, int n);
525
526
  // Returns the leading regexp in re's top-level concatenation.
527
  // The returned Regexp* points at re or a sub-expression of re,
528
  // so it must not be used after the caller calls re->Decref().
529
  static Regexp* LeadingRegexp(Regexp* re);
530
531
  // Removes LeadingRegexp(re) from re and returns the remainder.
532
  // Might edit re in place.
533
  static Regexp* RemoveLeadingRegexp(Regexp* re);
534
535
  // Simplifies an alternation of literal strings by factoring out
536
  // common prefixes.
537
  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
538
  friend class FactorAlternationImpl;
539
540
  // Is a == b?  Only efficient on regexps that have not been through
541
  // Simplify yet - the expansion of a kRegexpRepeat will make this
542
  // take a long time.  Do not call on such regexps, hence private.
543
  static bool Equal(Regexp* a, Regexp* b);
544
545
  // Allocate space for n sub-regexps.
546
0
  void AllocSub(int n) {
547
0
    ABSL_DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
548
0
    if (n > 1)
549
0
      submany_ = new Regexp*[n];
550
0
    nsub_ = static_cast<uint16_t>(n);
551
0
  }
552
553
  // Add Rune to LiteralString
554
  void AddRuneToString(Rune r);
555
556
  // Swaps this with that, in place.
557
  void Swap(Regexp *that);
558
559
  // Operator.  See description of operators above.
560
  // uint8_t instead of RegexpOp to control space usage.
561
  uint8_t op_;
562
563
  // Is this regexp structure already simple
564
  // (has it been returned by Simplify)?
565
  // uint8_t instead of bool to control space usage.
566
  uint8_t simple_;
567
568
  // Flags saved from parsing and used during execution.
569
  // (Only FoldCase is used.)
570
  // uint16_t instead of ParseFlags to control space usage.
571
  uint16_t parse_flags_;
572
573
  // Reference count.  Exists so that SimplifyRegexp can build
574
  // regexp structures that are dags rather than trees to avoid
575
  // exponential blowup in space requirements.
576
  // uint16_t to control space usage.
577
  // The standard regexp routines will never generate a
578
  // ref greater than the maximum repeat count (kMaxRepeat),
579
  // but even so, Incref and Decref consult an overflow map
580
  // when ref_ reaches kMaxRef.
581
  uint16_t ref_;
582
  static const uint16_t kMaxRef = 0xffff;
583
584
  // Subexpressions.
585
  // uint16_t to control space usage.
586
  // Concat and Alternate handle larger numbers of subexpressions
587
  // by building concatenation or alternation trees.
588
  // Other routines should call Concat or Alternate instead of
589
  // filling in sub() by hand.
590
  uint16_t nsub_;
591
  static const uint16_t kMaxNsub = 0xffff;
592
  union {
593
    Regexp** submany_;  // if nsub_ > 1
594
    Regexp* subone_;  // if nsub_ == 1
595
  };
596
597
  // Extra space for parse and teardown stacks.
598
  Regexp* down_;
599
600
  // Arguments to operator.  See description of operators above.
601
  union {
602
    struct {  // Repeat
603
      int max_;
604
      int min_;
605
    };
606
    struct {  // Capture
607
      int cap_;
608
      std::string* name_;
609
    };
610
    struct {  // LiteralString
611
      int nrunes_;
612
      Rune* runes_;
613
    };
614
    struct {  // CharClass
615
      // These two could be in separate union members,
616
      // but it wouldn't save any space (there are other two-word structs)
617
      // and keeping them separate avoids confusion during parsing.
618
      CharClass* cc_;
619
      CharClassBuilder* ccb_;
620
    };
621
    Rune rune_;  // Literal
622
    int match_id_;  // HaveMatch
623
    void *the_union_[2];  // as big as any other element, for memset
624
  };
625
626
  Regexp(const Regexp&) = delete;
627
  Regexp& operator=(const Regexp&) = delete;
628
};
629
630
// Character class set: contains non-overlapping, non-abutting RuneRanges.
631
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
632
633
class CharClassBuilder {
634
 public:
635
  CharClassBuilder();
636
637
  typedef RuneRangeSet::iterator iterator;
638
0
  iterator begin() { return ranges_.begin(); }
639
0
  iterator end() { return ranges_.end(); }
640
641
0
  int size() { return nrunes_; }
642
0
  bool empty() { return nrunes_ == 0; }
643
0
  bool full() { return nrunes_ == Runemax+1; }
644
645
  bool Contains(Rune r);
646
  bool FoldsASCII();
647
  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
648
  CharClassBuilder* Copy();
649
  void AddCharClass(CharClassBuilder* cc);
650
  void Negate();
651
  void RemoveAbove(Rune r);
652
  CharClass* GetCharClass();
653
  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
654
655
 private:
656
  static const uint32_t AlphaMask = (1<<26) - 1;
657
  uint32_t upper_;  // bitmap of A-Z
658
  uint32_t lower_;  // bitmap of a-z
659
  int nrunes_;
660
  RuneRangeSet ranges_;
661
662
  CharClassBuilder(const CharClassBuilder&) = delete;
663
  CharClassBuilder& operator=(const CharClassBuilder&) = delete;
664
};
665
666
// Bitwise ops on ParseFlags produce ParseFlags.
667
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
668
0
                                    Regexp::ParseFlags b) {
669
0
  return static_cast<Regexp::ParseFlags>(
670
0
      static_cast<int>(a) | static_cast<int>(b));
671
0
}
672
673
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
674
0
                                    Regexp::ParseFlags b) {
675
0
  return static_cast<Regexp::ParseFlags>(
676
0
      static_cast<int>(a) ^ static_cast<int>(b));
677
0
}
678
679
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
680
0
                                    Regexp::ParseFlags b) {
681
0
  return static_cast<Regexp::ParseFlags>(
682
0
      static_cast<int>(a) & static_cast<int>(b));
683
0
}
684
685
0
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
686
  // Attempting to produce a value out of enum's range has undefined behaviour.
687
0
  return static_cast<Regexp::ParseFlags>(
688
0
      ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
689
0
}
690
691
}  // namespace re2
692
693
#endif  // RE2_REGEXP_H_