Coverage Report

Created: 2024-05-04 01:14

/src/re2/re2/regexp.h
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
#ifndef RE2_REGEXP_H_
6
#define RE2_REGEXP_H_
7
8
// --- SPONSORED LINK --------------------------------------------------
9
// If you want to use this library for regular expression matching,
10
// you should use re2/re2.h, which provides a class RE2 that
11
// mimics the PCRE interface provided by PCRE's C++ wrappers.
12
// This header describes the low-level interface used to implement RE2
13
// and may change in backwards-incompatible ways from time to time.
14
// In contrast, RE2's interface will not.
15
// ---------------------------------------------------------------------
16
17
// Regular expression library: parsing, execution, and manipulation
18
// of regular expressions.
19
//
20
// Any operation that traverses the Regexp structures should be written
21
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
22
// regular expressions such as x++++++++++++++++++++... might cause recursive
23
// traversals to overflow the stack.
24
//
25
// It is the caller's responsibility to provide appropriate mutual exclusion
26
// around manipulation of the regexps.  RE2 does this.
27
//
28
// PARSING
29
//
30
// Regexp::Parse parses regular expressions encoded in UTF-8.
31
// The default syntax is POSIX extended regular expressions,
32
// with the following changes:
33
//
34
//   1.  Backreferences (optional in POSIX EREs) are not supported.
35
//         (Supporting them precludes the use of DFA-based
36
//          matching engines.)
37
//
38
//   2.  Collating elements and collation classes are not supported.
39
//         (No one has needed or wanted them.)
40
//
41
// The exact syntax accepted can be modified by passing flags to
42
// Regexp::Parse.  In particular, many of the basic Perl additions
43
// are available.  The flags are documented below (search for LikePerl).
44
//
45
// If parsed with the flag Regexp::Latin1, both the regular expression
46
// and the input to the matching routines are assumed to be encoded in
47
// Latin-1, not UTF-8.
48
//
49
// EXECUTION
50
//
51
// Once Regexp has parsed a regular expression, it provides methods
52
// to search text using that regular expression.  These methods are
53
// implemented via calling out to other regular expression libraries.
54
// (Let's call them the sublibraries.)
55
//
56
// To call a sublibrary, Regexp does not simply prepare a
57
// string version of the regular expression and hand it to the
58
// sublibrary.  Instead, Regexp prepares, from its own parsed form, the
59
// corresponding internal representation used by the sublibrary.
60
// This has the drawback of needing to know the internal representation
61
// used by the sublibrary, but it has two important benefits:
62
//
63
//   1. The syntax and meaning of regular expressions is guaranteed
64
//      to be that used by Regexp's parser, not the syntax expected
65
//      by the sublibrary.  Regexp might accept a restricted or
66
//      expanded syntax for regular expressions as compared with
67
//      the sublibrary.  As long as Regexp can translate from its
68
//      internal form into the sublibrary's, clients need not know
69
//      exactly which sublibrary they are using.
70
//
71
//   2. The sublibrary parsers are bypassed.  For whatever reason,
72
//      sublibrary regular expression parsers often have security
73
//      problems.  For example, plan9grep's regular expression parser
74
//      has a buffer overflow in its handling of large character
75
//      classes, and PCRE's parser has had buffer overflow problems
76
//      in the past.  Security-team requires sandboxing of sublibrary
77
//      regular expression parsers.  Avoiding the sublibrary parsers
78
//      avoids the sandbox.
79
//
80
// The execution methods we use now are provided by the compiled form,
81
// Prog, described in prog.h
82
//
83
// MANIPULATION
84
//
85
// Unlike other regular expression libraries, Regexp makes its parsed
86
// form accessible to clients, so that client code can analyze the
87
// parsed regular expressions.
88
89
#include <stddef.h>
90
#include <stdint.h>
91
#include <map>
92
#include <set>
93
#include <string>
94
95
#include "util/util.h"
96
#include "util/logging.h"
97
#include "util/utf.h"
98
#include "re2/stringpiece.h"
99
100
namespace re2 {
101
102
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
103
enum RegexpOp {
104
  // Matches no strings.
105
  kRegexpNoMatch = 1,
106
107
  // Matches empty string.
108
  kRegexpEmptyMatch,
109
110
  // Matches rune_.
111
  kRegexpLiteral,
112
113
  // Matches runes_.
114
  kRegexpLiteralString,
115
116
  // Matches concatenation of sub_[0..nsub-1].
117
  kRegexpConcat,
118
  // Matches union of sub_[0..nsub-1].
119
  kRegexpAlternate,
120
121
  // Matches sub_[0] zero or more times.
122
  kRegexpStar,
123
  // Matches sub_[0] one or more times.
124
  kRegexpPlus,
125
  // Matches sub_[0] zero or one times.
126
  kRegexpQuest,
127
128
  // Matches sub_[0] at least min_ times, at most max_ times.
129
  // max_ == -1 means no upper limit.
130
  kRegexpRepeat,
131
132
  // Parenthesized (capturing) subexpression.  Index is cap_.
133
  // Optionally, capturing name is name_.
134
  kRegexpCapture,
135
136
  // Matches any character.
137
  kRegexpAnyChar,
138
139
  // Matches any byte [sic].
140
  kRegexpAnyByte,
141
142
  // Matches empty string at beginning of line.
143
  kRegexpBeginLine,
144
  // Matches empty string at end of line.
145
  kRegexpEndLine,
146
147
  // Matches word boundary "\b".
148
  kRegexpWordBoundary,
149
  // Matches not-a-word boundary "\B".
150
  kRegexpNoWordBoundary,
151
152
  // Matches empty string at beginning of text.
153
  kRegexpBeginText,
154
  // Matches empty string at end of text.
155
  kRegexpEndText,
156
157
  // Matches character class given by cc_.
158
  kRegexpCharClass,
159
160
  // Forces match of entire expression right now,
161
  // with match ID match_id_ (used by RE2::Set).
162
  kRegexpHaveMatch,
163
164
  kMaxRegexpOp = kRegexpHaveMatch,
165
};
166
167
// Keep in sync with string list in regexp.cc
168
enum RegexpStatusCode {
169
  // No error
170
  kRegexpSuccess = 0,
171
172
  // Unexpected error
173
  kRegexpInternalError,
174
175
  // Parse errors
176
  kRegexpBadEscape,          // bad escape sequence
177
  kRegexpBadCharClass,       // bad character class
178
  kRegexpBadCharRange,       // bad character class range
179
  kRegexpMissingBracket,     // missing closing ]
180
  kRegexpMissingParen,       // missing closing )
181
  kRegexpUnexpectedParen,    // unexpected closing )
182
  kRegexpTrailingBackslash,  // at end of regexp
183
  kRegexpRepeatArgument,     // repeat argument missing, e.g. "*"
184
  kRegexpRepeatSize,         // bad repetition argument
185
  kRegexpRepeatOp,           // bad repetition operator
186
  kRegexpBadPerlOp,          // bad perl operator
187
  kRegexpBadUTF8,            // invalid UTF-8 in regexp
188
  kRegexpBadNamedCapture,    // bad named capture
189
};
190
191
// Error status for certain operations.
192
class RegexpStatus {
193
 public:
194
46.4k
  RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
195
46.4k
  ~RegexpStatus() { delete tmp_; }
196
197
5.49k
  void set_code(RegexpStatusCode code) { code_ = code; }
198
5.49k
  void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
199
18.3k
  void set_tmp(std::string* tmp) { delete tmp_; tmp_ = tmp; }
200
5.45k
  RegexpStatusCode code() const { return code_; }
201
5.45k
  const StringPiece& error_arg() const { return error_arg_; }
202
0
  bool ok() const { return code() == kRegexpSuccess; }
203
204
  // Copies state from status.
205
  void Copy(const RegexpStatus& status);
206
207
  // Returns text equivalent of code, e.g.:
208
  //   "Bad character class"
209
  static std::string CodeText(RegexpStatusCode code);
210
211
  // Returns text describing error, e.g.:
212
  //   "Bad character class: [z-a]"
213
  std::string Text() const;
214
215
 private:
216
  RegexpStatusCode code_;  // Kind of error
217
  StringPiece error_arg_;  // Piece of regexp containing syntax error.
218
  std::string* tmp_;       // Temporary storage, possibly where error_arg_ is.
219
220
  RegexpStatus(const RegexpStatus&) = delete;
221
  RegexpStatus& operator=(const RegexpStatus&) = delete;
222
};
223
224
// Compiled form; see prog.h
225
class Prog;
226
227
struct RuneRange {
228
54.6k
  RuneRange() : lo(0), hi(0) { }
229
10.4M
  RuneRange(int l, int h) : lo(l), hi(h) { }
230
  Rune lo;
231
  Rune hi;
232
};
233
234
// Less-than on RuneRanges treats a == b if they overlap at all.
235
// This lets us look in a set to find the range covering a particular Rune.
236
struct RuneRangeLess {
237
106M
  bool operator()(const RuneRange& a, const RuneRange& b) const {
238
106M
    return a.hi < b.lo;
239
106M
  }
240
};
241
242
class CharClassBuilder;
243
244
class CharClass {
245
 public:
246
  void Delete();
247
248
  typedef RuneRange* iterator;
249
1.91M
  iterator begin() { return ranges_; }
250
27.4M
  iterator end() { return ranges_ + nranges_; }
251
252
15.6k
  int size() { return nrunes_; }
253
1.88M
  bool empty() { return nrunes_ == 0; }
254
3.83k
  bool full() { return nrunes_ == Runemax+1; }
255
1.88M
  bool FoldsASCII() { return folds_ascii_; }
256
257
  bool Contains(Rune r) const;
258
  CharClass* Negate();
259
260
 private:
261
  CharClass();  // not implemented
262
  ~CharClass();  // not implemented
263
  static CharClass* New(size_t maxranges);
264
265
  friend class CharClassBuilder;
266
267
  bool folds_ascii_;
268
  int nrunes_;
269
  RuneRange *ranges_;
270
  int nranges_;
271
272
  CharClass(const CharClass&) = delete;
273
  CharClass& operator=(const CharClass&) = delete;
274
};
275
276
class Regexp {
277
 public:
278
279
  // Flags for parsing.  Can be ORed together.
280
  enum ParseFlags {
281
    NoParseFlags  = 0,
282
    FoldCase      = 1<<0,   // Fold case during matching (case-insensitive).
283
    Literal       = 1<<1,   // Treat s as literal string instead of a regexp.
284
    ClassNL       = 1<<2,   // Allow char classes like [^a-z] and \D and \s
285
                            // and [[:space:]] to match newline.
286
    DotNL         = 1<<3,   // Allow . to match newline.
287
    MatchNL       = ClassNL | DotNL,
288
    OneLine       = 1<<4,   // Treat ^ and $ as only matching at beginning and
289
                            // end of text, not around embedded newlines.
290
                            // (Perl's default)
291
    Latin1        = 1<<5,   // Regexp and text are in Latin1, not UTF-8.
292
    NonGreedy     = 1<<6,   // Repetition operators are non-greedy by default.
293
    PerlClasses   = 1<<7,   // Allow Perl character classes like \d.
294
    PerlB         = 1<<8,   // Allow Perl's \b and \B.
295
    PerlX         = 1<<9,   // Perl extensions:
296
                            //   non-capturing parens - (?: )
297
                            //   non-greedy operators - *? +? ?? {}?
298
                            //   flag edits - (?i) (?-i) (?i: )
299
                            //     i - FoldCase
300
                            //     m - !OneLine
301
                            //     s - DotNL
302
                            //     U - NonGreedy
303
                            //   line ends: \A \z
304
                            //   \Q and \E to disable/enable metacharacters
305
                            //   (?P<name>expr) for named captures
306
                            //   \C to match any single byte
307
    UnicodeGroups = 1<<10,  // Allow \p{Han} for Unicode Han group
308
                            //   and \P{Han} for its negation.
309
    NeverNL       = 1<<11,  // Never match NL, even if the regexp mentions
310
                            //   it explicitly.
311
    NeverCapture  = 1<<12,  // Parse all parens as non-capturing.
312
313
    // As close to Perl as we can get.
314
    LikePerl      = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
315
                    UnicodeGroups,
316
317
    // Internal use only.
318
    WasDollar     = 1<<13,  // on kRegexpEndText: was $ in regexp text
319
    AllParseFlags = (1<<14)-1,
320
  };
321
322
  // Get.  No set, Regexps are logically immutable once created.
323
18.8M
  RegexpOp op() { return static_cast<RegexpOp>(op_); }
324
2.17M
  int nsub() { return nsub_; }
325
488k
  bool simple() { return simple_ != 0; }
326
19.2M
  ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
327
  int Ref();  // For testing.
328
329
20.7M
  Regexp** sub() {
330
20.7M
    if(nsub_ <= 1)
331
6.44M
      return &subone_;
332
14.3M
    else
333
14.3M
      return submany_;
334
20.7M
  }
335
336
11.9k
  int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
337
32.9k
  int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
338
1.80M
  Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
339
1.90M
  CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
340
1.60M
  int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
341
0
  const std::string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
342
15.8M
  Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
343
20.0M
  int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
344
0
  int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
345
346
  // Increments reference count, returns object as convenience.
347
  Regexp* Incref();
348
349
  // Decrements reference count and deletes this object if count reaches 0.
350
  void Decref();
351
352
  // Parses string s to produce regular expression, returned.
353
  // Caller must release return value with re->Decref().
354
  // On failure, sets *status (if status != NULL) and returns NULL.
355
  static Regexp* Parse(const StringPiece& s, ParseFlags flags,
356
                       RegexpStatus* status);
357
358
  // Returns a _new_ simplified version of the current regexp.
359
  // Does not edit the current regexp.
360
  // Caller must release return value with re->Decref().
361
  // Simplified means that counted repetition has been rewritten
362
  // into simpler terms and all Perl/POSIX features have been
363
  // removed.  The result will capture exactly the same
364
  // subexpressions the original did, unless formatted with ToString.
365
  Regexp* Simplify();
366
  friend class CoalesceWalker;
367
  friend class SimplifyWalker;
368
369
  // Parses the regexp src and then simplifies it and sets *dst to the
370
  // string representation of the simplified form.  Returns true on success.
371
  // Returns false and sets *status (if status != NULL) on parse error.
372
  static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
373
                             std::string* dst, RegexpStatus* status);
374
375
  // Returns the number of capturing groups in the regexp.
376
  int NumCaptures();
377
  friend class NumCapturesWalker;
378
379
  // Returns a map from names to capturing group indices,
380
  // or NULL if the regexp contains no named capture groups.
381
  // The caller is responsible for deleting the map.
382
  std::map<std::string, int>* NamedCaptures();
383
384
  // Returns a map from capturing group indices to capturing group
385
  // names or NULL if the regexp contains no named capture groups. The
386
  // caller is responsible for deleting the map.
387
  std::map<int, std::string>* CaptureNames();
388
389
  // Returns a string representation of the current regexp,
390
  // using as few parentheses as possible.
391
  std::string ToString();
392
393
  // Convenience functions.  They consume the passed reference,
394
  // so in many cases you should use, e.g., Plus(re->Incref(), flags).
395
  // They do not consume allocated arrays like subs or runes.
396
  static Regexp* Plus(Regexp* sub, ParseFlags flags);
397
  static Regexp* Star(Regexp* sub, ParseFlags flags);
398
  static Regexp* Quest(Regexp* sub, ParseFlags flags);
399
  static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
400
  static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
401
  static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
402
  static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
403
  static Regexp* NewLiteral(Rune rune, ParseFlags flags);
404
  static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
405
  static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
406
  static Regexp* HaveMatch(int match_id, ParseFlags flags);
407
408
  // Like Alternate but does not factor out common prefixes.
409
  static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
410
411
  // Debugging function.  Returns string format for regexp
412
  // that makes structure clear.  Does NOT use regexp syntax.
413
  std::string Dump();
414
415
  // Helper traversal class, defined fully in walker-inl.h.
416
  template<typename T> class Walker;
417
418
  // Compile to Prog.  See prog.h
419
  // Reverse prog expects to be run over text backward.
420
  // Construction and execution of prog will
421
  // stay within approximately max_mem bytes of memory.
422
  // If max_mem <= 0, a reasonable default is used.
423
  Prog* CompileToProg(int64_t max_mem);
424
  Prog* CompileToReverseProg(int64_t max_mem);
425
426
  // Whether to expect this library to find exactly the same answer as PCRE
427
  // when running this regexp.  Most regexps do mimic PCRE exactly, but a few
428
  // obscure cases behave differently.  Technically this is more a property
429
  // of the Prog than the Regexp, but the computation is much easier to do
430
  // on the Regexp.  See mimics_pcre.cc for the exact conditions.
431
  bool MimicsPCRE();
432
433
  // Benchmarking function.
434
  void NullWalk();
435
436
  // Whether every match of this regexp must be anchored and
437
  // begin with a non-empty fixed string (perhaps after ASCII
438
  // case-folding).  If so, returns the prefix and the sub-regexp that
439
  // follows it.
440
  // Callers should expect *prefix, *foldcase and *suffix to be "zeroed"
441
  // regardless of the return value.
442
  bool RequiredPrefix(std::string* prefix, bool* foldcase,
443
                      Regexp** suffix);
444
445
  // Whether every match of this regexp must be unanchored and
446
  // begin with a non-empty fixed string (perhaps after ASCII
447
  // case-folding).  If so, returns the prefix.
448
  // Callers should expect *prefix and *foldcase to be "zeroed"
449
  // regardless of the return value.
450
  bool RequiredPrefixForAccel(std::string* prefix, bool* foldcase);
451
452
  // Controls the maximum repeat count permitted by the parser.
453
  // FOR FUZZING ONLY.
454
  static void FUZZING_ONLY_set_maximum_repeat_count(int i);
455
456
 private:
457
  // Constructor allocates vectors as appropriate for operator.
458
  explicit Regexp(RegexpOp op, ParseFlags parse_flags);
459
460
  // Use Decref() instead of delete to release Regexps.
461
  // This is private to catch deletes at compile time.
462
  ~Regexp();
463
  void Destroy();
464
  bool QuickDestroy();
465
466
  // Helpers for Parse.  Listed here so they can edit Regexps.
467
  class ParseState;
468
469
  friend class ParseState;
470
  friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
471
                             RegexpStatus* status);
472
473
  // Helper for testing [sic].
474
  friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
475
476
  // Computes whether Regexp is already simple.
477
  bool ComputeSimple();
478
479
  // Constructor that generates a Star, Plus or Quest,
480
  // squashing the pair if sub is also a Star, Plus or Quest.
481
  static Regexp* StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags);
482
483
  // Constructor that generates a concatenation or alternation,
484
  // enforcing the limit on the number of subexpressions for
485
  // a particular Regexp.
486
  static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
487
                                   ParseFlags flags, bool can_factor);
488
489
  // Returns the leading string that re starts with.
490
  // The returned Rune* points into a piece of re,
491
  // so it must not be used after the caller calls re->Decref().
492
  static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
493
494
  // Removes the first n leading runes from the beginning of re.
495
  // Edits re in place.
496
  static void RemoveLeadingString(Regexp* re, int n);
497
498
  // Returns the leading regexp in re's top-level concatenation.
499
  // The returned Regexp* points at re or a sub-expression of re,
500
  // so it must not be used after the caller calls re->Decref().
501
  static Regexp* LeadingRegexp(Regexp* re);
502
503
  // Removes LeadingRegexp(re) from re and returns the remainder.
504
  // Might edit re in place.
505
  static Regexp* RemoveLeadingRegexp(Regexp* re);
506
507
  // Simplifies an alternation of literal strings by factoring out
508
  // common prefixes.
509
  static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
510
  friend class FactorAlternationImpl;
511
512
  // Is a == b?  Only efficient on regexps that have not been through
513
  // Simplify yet - the expansion of a kRegexpRepeat will make this
514
  // take a long time.  Do not call on such regexps, hence private.
515
  static bool Equal(Regexp* a, Regexp* b);
516
517
  // Allocate space for n sub-regexps.
518
1.36M
  void AllocSub(int n) {
519
1.36M
    DCHECK(n >= 0 && static_cast<uint16_t>(n) == n);
520
1.36M
    if (n > 1)
521
694k
      submany_ = new Regexp*[n];
522
1.36M
    nsub_ = static_cast<uint16_t>(n);
523
1.36M
  }
524
525
  // Add Rune to LiteralString
526
  void AddRuneToString(Rune r);
527
528
  // Swaps this with that, in place.
529
  void Swap(Regexp *that);
530
531
  // Operator.  See description of operators above.
532
  // uint8_t instead of RegexpOp to control space usage.
533
  uint8_t op_;
534
535
  // Is this regexp structure already simple
536
  // (has it been returned by Simplify)?
537
  // uint8_t instead of bool to control space usage.
538
  uint8_t simple_;
539
540
  // Flags saved from parsing and used during execution.
541
  // (Only FoldCase is used.)
542
  // uint16_t instead of ParseFlags to control space usage.
543
  uint16_t parse_flags_;
544
545
  // Reference count.  Exists so that SimplifyRegexp can build
546
  // regexp structures that are dags rather than trees to avoid
547
  // exponential blowup in space requirements.
548
  // uint16_t to control space usage.
549
  // The standard regexp routines will never generate a
550
  // ref greater than the maximum repeat count (kMaxRepeat),
551
  // but even so, Incref and Decref consult an overflow map
552
  // when ref_ reaches kMaxRef.
553
  uint16_t ref_;
554
  static const uint16_t kMaxRef = 0xffff;
555
556
  // Subexpressions.
557
  // uint16_t to control space usage.
558
  // Concat and Alternate handle larger numbers of subexpressions
559
  // by building concatenation or alternation trees.
560
  // Other routines should call Concat or Alternate instead of
561
  // filling in sub() by hand.
562
  uint16_t nsub_;
563
  static const uint16_t kMaxNsub = 0xffff;
564
  union {
565
    Regexp** submany_;  // if nsub_ > 1
566
    Regexp* subone_;  // if nsub_ == 1
567
  };
568
569
  // Extra space for parse and teardown stacks.
570
  Regexp* down_;
571
572
  // Arguments to operator.  See description of operators above.
573
  union {
574
    struct {  // Repeat
575
      int max_;
576
      int min_;
577
    };
578
    struct {  // Capture
579
      int cap_;
580
      std::string* name_;
581
    };
582
    struct {  // LiteralString
583
      int nrunes_;
584
      Rune* runes_;
585
    };
586
    struct {  // CharClass
587
      // These two could be in separate union members,
588
      // but it wouldn't save any space (there are other two-word structs)
589
      // and keeping them separate avoids confusion during parsing.
590
      CharClass* cc_;
591
      CharClassBuilder* ccb_;
592
    };
593
    Rune rune_;  // Literal
594
    int match_id_;  // HaveMatch
595
    void *the_union_[2];  // as big as any other element, for memset
596
  };
597
598
  Regexp(const Regexp&) = delete;
599
  Regexp& operator=(const Regexp&) = delete;
600
};
601
602
// Character class set: contains non-overlapping, non-abutting RuneRanges.
603
typedef std::set<RuneRange, RuneRangeLess> RuneRangeSet;
604
605
class CharClassBuilder {
606
 public:
607
  CharClassBuilder();
608
609
  typedef RuneRangeSet::iterator iterator;
610
260k
  iterator begin() { return ranges_.begin(); }
611
9.42M
  iterator end() { return ranges_.end(); }
612
613
401k
  int size() { return nrunes_; }
614
69.8k
  bool empty() { return nrunes_ == 0; }
615
69.0k
  bool full() { return nrunes_ == Runemax+1; }
616
617
  bool Contains(Rune r);
618
  bool FoldsASCII();
619
  bool AddRange(Rune lo, Rune hi);  // returns whether class changed
620
  CharClassBuilder* Copy();
621
  void AddCharClass(CharClassBuilder* cc);
622
  void Negate();
623
  void RemoveAbove(Rune r);
624
  CharClass* GetCharClass();
625
  void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
626
627
 private:
628
  static const uint32_t AlphaMask = (1<<26) - 1;
629
  uint32_t upper_;  // bitmap of A-Z
630
  uint32_t lower_;  // bitmap of a-z
631
  int nrunes_;
632
  RuneRangeSet ranges_;
633
634
  CharClassBuilder(const CharClassBuilder&) = delete;
635
  CharClassBuilder& operator=(const CharClassBuilder&) = delete;
636
};
637
638
// Bitwise ops on ParseFlags produce ParseFlags.
639
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a,
640
182k
                                    Regexp::ParseFlags b) {
641
182k
  return static_cast<Regexp::ParseFlags>(
642
182k
      static_cast<int>(a) | static_cast<int>(b));
643
182k
}
644
645
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a,
646
13.5k
                                    Regexp::ParseFlags b) {
647
13.5k
  return static_cast<Regexp::ParseFlags>(
648
13.5k
      static_cast<int>(a) ^ static_cast<int>(b));
649
13.5k
}
650
651
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a,
652
24.1M
                                    Regexp::ParseFlags b) {
653
24.1M
  return static_cast<Regexp::ParseFlags>(
654
24.1M
      static_cast<int>(a) & static_cast<int>(b));
655
24.1M
}
656
657
215k
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a) {
658
  // Attempting to produce a value out of enum's range has undefined behaviour.
659
215k
  return static_cast<Regexp::ParseFlags>(
660
215k
      ~static_cast<int>(a) & static_cast<int>(Regexp::AllParseFlags));
661
215k
}
662
663
}  // namespace re2
664
665
#endif  // RE2_REGEXP_H_