Coverage Report

Created: 2026-05-27 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/external/re2~/re2/prog.h
Line
Count
Source
1
// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
#ifndef RE2_PROG_H_
6
#define RE2_PROG_H_
7
8
// Compiled representation of regular expressions.
9
// See regexp.h for the Regexp class, which represents a regular
10
// expression symbolically.
11
12
#include <stdint.h>
13
14
#include <cstring>
15
#include <functional>
16
#include <string>
17
#include <type_traits>
18
#include <vector>
19
20
#include "absl/base/call_once.h"
21
#include "absl/log/absl_check.h"
22
#include "absl/log/absl_log.h"
23
#include "absl/strings/string_view.h"
24
#include "re2/pod_array.h"
25
#include "re2/re2.h"
26
#include "re2/sparse_array.h"
27
#include "re2/sparse_set.h"
28
29
namespace re2 {
30
31
// Opcodes for Inst
32
enum InstOp {
33
  kInstAlt = 0,      // choose between out_ and out1_
34
  kInstAltMatch,     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
35
  kInstByteRange,    // next (possible case-folded) byte must be in [lo_, hi_]
36
  kInstCapture,      // capturing parenthesis number cap_
37
  kInstEmptyWidth,   // empty-width special (^ $ ...); bit(s) set in empty_
38
  kInstMatch,        // found a match!
39
  kInstNop,          // no-op; occasionally unavoidable
40
  kInstFail,         // never match; occasionally unavoidable
41
  kNumInst,
42
};
43
44
// Bit flags for empty-width specials
45
enum EmptyOp {
46
  kEmptyBeginLine        = 1<<0,      // ^ - beginning of line
47
  kEmptyEndLine          = 1<<1,      // $ - end of line
48
  kEmptyBeginText        = 1<<2,      // \A - beginning of text
49
  kEmptyEndText          = 1<<3,      // \z - end of text
50
  kEmptyWordBoundary     = 1<<4,      // \b - word boundary
51
  kEmptyNonWordBoundary  = 1<<5,      // \B - not \b
52
  kEmptyAllFlags         = (1<<6)-1,
53
};
54
55
class DFA;
56
class Regexp;
57
58
// Compiled form of regexp program.
59
class Prog {
60
 public:
61
  Prog();
62
  ~Prog();
63
64
  // Single instruction in regexp program.
65
  class Inst {
66
   public:
67
    // See the assertion below for why this is so.
68
    Inst() = default;
69
70
    // Copyable.
71
    Inst(const Inst&) = default;
72
    Inst& operator=(const Inst&) = default;
73
74
    // Constructors per opcode
75
    void InitAlt(uint32_t out, uint32_t out1);
76
    void InitByteRange(int lo, int hi, int foldcase, uint32_t out);
77
    void InitCapture(int cap, uint32_t out);
78
    void InitEmptyWidth(EmptyOp empty, uint32_t out);
79
    void InitMatch(int id);
80
    void InitNop(uint32_t out);
81
    void InitFail();
82
83
    // Getters
84
0
    int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
85
0
    InstOp opcode() { return static_cast<InstOp>(out_opcode_ & 7); }
86
0
    int last() { return (out_opcode_ >> 3) & 1; }
87
0
    int out() { return out_opcode_ >> 4; }
88
0
    int out1() {
89
0
      ABSL_DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch);
90
0
      return out1_;
91
0
    }
92
0
    int cap() {
93
0
      ABSL_DCHECK_EQ(opcode(), kInstCapture);
94
0
      return cap_;
95
0
    }
96
0
    int lo() {
97
0
      ABSL_DCHECK_EQ(opcode(), kInstByteRange);
98
0
      return lo_;
99
0
    }
100
0
    int hi() {
101
0
      ABSL_DCHECK_EQ(opcode(), kInstByteRange);
102
0
      return hi_;
103
0
    }
104
0
    int foldcase() {
105
0
      ABSL_DCHECK_EQ(opcode(), kInstByteRange);
106
0
      return hint_foldcase_ & 1;
107
0
    }
108
0
    int hint() {
109
0
      ABSL_DCHECK_EQ(opcode(), kInstByteRange);
110
0
      return hint_foldcase_ >> 1;
111
0
    }
112
0
    int match_id() {
113
0
      ABSL_DCHECK_EQ(opcode(), kInstMatch);
114
0
      return match_id_;
115
0
    }
116
0
    EmptyOp empty() {
117
0
      ABSL_DCHECK_EQ(opcode(), kInstEmptyWidth);
118
0
      return empty_;
119
0
    }
120
121
0
    bool greedy(Prog* p) {
122
0
      ABSL_DCHECK_EQ(opcode(), kInstAltMatch);
123
0
      return p->inst(out())->opcode() == kInstByteRange ||
124
0
             (p->inst(out())->opcode() == kInstNop &&
125
0
              p->inst(p->inst(out())->out())->opcode() == kInstByteRange);
126
0
    }
127
128
    // Does this inst (an kInstByteRange) match c?
129
0
    inline bool Matches(int c) {
130
0
      ABSL_DCHECK_EQ(opcode(), kInstByteRange);
131
0
      if (foldcase() && 'A' <= c && c <= 'Z')
132
0
        c += 'a' - 'A';
133
0
      return lo_ <= c && c <= hi_;
134
0
    }
135
136
    // Returns string representation for debugging.
137
    std::string Dump();
138
139
    // Maximum instruction id.
140
    // (Must fit in out_opcode_. PatchList/last steal another bit.)
141
    static const int kMaxInst = (1<<28) - 1;
142
143
   private:
144
0
    void set_opcode(InstOp opcode) {
145
0
      out_opcode_ = (out()<<4) | (last()<<3) | opcode;
146
0
    }
147
148
0
    void set_last() {
149
0
      out_opcode_ = (out()<<4) | (1<<3) | opcode();
150
0
    }
151
152
0
    void set_out(int out) {
153
0
      out_opcode_ = (out<<4) | (last()<<3) | opcode();
154
0
    }
155
156
0
    void set_out_opcode(int out, InstOp opcode) {
157
0
      out_opcode_ = (out<<4) | (last()<<3) | opcode;
158
0
    }
159
160
    uint32_t out_opcode_;  // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
161
    union {                // additional instruction arguments:
162
      uint32_t out1_;      // opcode == kInstAlt
163
                           //   alternate next instruction
164
165
      int32_t cap_;        // opcode == kInstCapture
166
                           //   Index of capture register (holds text
167
                           //   position recorded by capturing parentheses).
168
                           //   For \n (the submatch for the nth parentheses),
169
                           //   the left parenthesis captures into register 2*n
170
                           //   and the right one captures into register 2*n+1.
171
172
      int32_t match_id_;   // opcode == kInstMatch
173
                           //   Match ID to identify this match (for re2::Set).
174
175
      struct {             // opcode == kInstByteRange
176
        uint8_t lo_;       //   byte range is lo_-hi_ inclusive
177
        uint8_t hi_;       //
178
        uint16_t hint_foldcase_;  // 15 bits: hint, 1 (low) bit: foldcase
179
                           //   hint to execution engines: the delta to the
180
                           //   next instruction (in the current list) worth
181
                           //   exploring iff this instruction matched; 0
182
                           //   means there are no remaining possibilities,
183
                           //   which is most likely for character classes.
184
                           //   foldcase: A-Z -> a-z before checking range.
185
      };
186
187
      EmptyOp empty_;       // opcode == kInstEmptyWidth
188
                            //   empty_ is bitwise OR of kEmpty* flags above.
189
    };
190
191
    friend class Compiler;
192
    friend struct PatchList;
193
    friend class Prog;
194
  };
195
196
  // Inst must be trivial so that we can freely clear it with memset(3).
197
  // Arrays of Inst are initialised by copying the initial elements with
198
  // memmove(3) and then clearing any remaining elements with memset(3).
199
  static_assert(std::is_trivial<Inst>::value, "Inst must be trivial");
200
201
  // Whether to anchor the search.
202
  enum Anchor {
203
    kUnanchored,  // match anywhere
204
    kAnchored,    // match only starting at beginning of text
205
  };
206
207
  // Kind of match to look for (for anchor != kFullMatch)
208
  //
209
  // kLongestMatch mode finds the overall longest
210
  // match but still makes its submatch choices the way
211
  // Perl would, not in the way prescribed by POSIX.
212
  // The POSIX rules are much more expensive to implement,
213
  // and no one has needed them.
214
  //
215
  // kFullMatch is not strictly necessary -- we could use
216
  // kLongestMatch and then check the length of the match -- but
217
  // the matching code can run faster if it knows to consider only
218
  // full matches.
219
  enum MatchKind {
220
    kFirstMatch,     // like Perl, PCRE
221
    kLongestMatch,   // like egrep or POSIX
222
    kFullMatch,      // match only entire text; implies anchor==kAnchored
223
    kManyMatch       // for SearchDFA, records set of matches
224
  };
225
226
0
  Inst *inst(int id) { return &inst_[id]; }
227
0
  int start() { return start_; }
228
0
  void set_start(int start) { start_ = start; }
229
0
  int start_unanchored() { return start_unanchored_; }
230
0
  void set_start_unanchored(int start) { start_unanchored_ = start; }
231
0
  int size() { return size_; }
232
0
  bool reversed() { return reversed_; }
233
0
  void set_reversed(bool reversed) { reversed_ = reversed; }
234
0
  int list_count() { return list_count_; }
235
0
  int inst_count(InstOp op) { return inst_count_[op]; }
236
0
  uint16_t* list_heads() { return list_heads_.data(); }
237
0
  size_t bit_state_text_max_size() { return bit_state_text_max_size_; }
238
0
  int64_t dfa_mem() { return dfa_mem_; }
239
0
  void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
240
0
  bool anchor_start() { return anchor_start_; }
241
0
  void set_anchor_start(bool b) { anchor_start_ = b; }
242
0
  bool anchor_end() { return anchor_end_; }
243
0
  void set_anchor_end(bool b) { anchor_end_ = b; }
244
0
  int bytemap_range() { return bytemap_range_; }
245
0
  const uint8_t* bytemap() { return bytemap_; }
246
0
  bool can_prefix_accel() { return prefix_size_ != 0; }
247
248
  // Accelerates to the first likely occurrence of the prefix.
249
  // Returns a pointer to the first byte or NULL if not found.
250
0
  const void* PrefixAccel(const void* data, size_t size) {
251
0
    ABSL_DCHECK(can_prefix_accel());
252
0
    if (prefix_foldcase_) {
253
0
      return PrefixAccel_ShiftDFA(data, size);
254
0
    } else if (prefix_size_ != 1) {
255
0
      return PrefixAccel_FrontAndBack(data, size);
256
0
    } else {
257
0
      return memchr(data, prefix_front_, size);
258
0
    }
259
0
  }
260
261
  // Configures prefix accel using the analysis performed during compilation.
262
  void ConfigurePrefixAccel(const std::string& prefix, bool prefix_foldcase);
263
264
  // An implementation of prefix accel that uses prefix_dfa_ to perform
265
  // case-insensitive search.
266
  const void* PrefixAccel_ShiftDFA(const void* data, size_t size);
267
268
  // An implementation of prefix accel that looks for prefix_front_ and
269
  // prefix_back_ to return fewer false positives than memchr(3) alone.
270
  const void* PrefixAccel_FrontAndBack(const void* data, size_t size);
271
272
  // Returns string representation of program for debugging.
273
  std::string Dump();
274
  std::string DumpUnanchored();
275
  std::string DumpByteMap();
276
277
  // Returns the set of kEmpty flags that are in effect at
278
  // position p within context.
279
  static uint32_t EmptyFlags(absl::string_view context, const char* p);
280
281
  // Returns whether byte c is a word character: ASCII only.
282
  // Used by the implementation of \b and \B.
283
  // This is not right for Unicode, but:
284
  //   - it's hard to get right in a byte-at-a-time matching world
285
  //     (the DFA has only one-byte lookahead).
286
  //   - even if the lookahead were possible, the Progs would be huge.
287
  // This crude approximation is the same one PCRE uses.
288
0
  static bool IsWordChar(uint8_t c) {
289
0
    return ('A' <= c && c <= 'Z') ||
290
0
           ('a' <= c && c <= 'z') ||
291
0
           ('0' <= c && c <= '9') ||
292
0
           c == '_';
293
0
  }
294
295
  // Execution engines.  They all search for the regexp (run the prog)
296
  // in text, which is in the larger context (used for ^ $ \b etc).
297
  // Anchor and kind control the kind of search.
298
  // Returns true if match found, false if not.
299
  // If match found, fills match[0..nmatch-1] with submatch info.
300
  // match[0] is overall match, match[1] is first set of parens, etc.
301
  // If a particular submatch is not matched during the regexp match,
302
  // it is set to NULL.
303
  //
304
  // Matching text == absl::string_view() is treated as any other empty
305
  // string, but note that on return, it will not be possible to distinguish
306
  // submatches that matched that empty string from submatches that didn't
307
  // match anything.  Either way, match[i] == NULL.
308
309
  // Search using NFA: can find submatches but kind of slow.
310
  bool SearchNFA(absl::string_view text, absl::string_view context,
311
                 Anchor anchor, MatchKind kind, absl::string_view* match,
312
                 int nmatch);
313
314
  // Search using DFA: much faster than NFA but only finds
315
  // end of match and can use a lot more memory.
316
  // Returns whether a match was found.
317
  // If the DFA runs out of memory, sets *failed to true and returns false.
318
  // If matches != NULL and kind == kManyMatch and there is a match,
319
  // SearchDFA fills matches with the match IDs of the final matching state.
320
  bool SearchDFA(absl::string_view text, absl::string_view context,
321
                 Anchor anchor, MatchKind kind, absl::string_view* match0,
322
                 bool* failed, SparseSet* matches);
323
324
  // The callback issued after building each DFA state with BuildEntireDFA().
325
  // If next is null, then the memory budget has been exhausted and building
326
  // will halt. Otherwise, the state has been built and next points to an array
327
  // of bytemap_range()+1 slots holding the next states as per the bytemap and
328
  // kByteEndText. The number of the state is implied by the callback sequence:
329
  // the first callback is for state 0, the second callback is for state 1, ...
330
  // match indicates whether the state is a matching state.
331
  using DFAStateCallback = std::function<void(const int* next, bool match)>;
332
333
  // Build the entire DFA for the given match kind.
334
  // Usually the DFA is built out incrementally, as needed, which
335
  // avoids lots of unnecessary work.
336
  // If cb is not empty, it receives one callback per state built.
337
  // Returns the number of states built.
338
  // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
339
  int BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb);
340
341
  // Compute bytemap.
342
  void ComputeByteMap();
343
344
  // Run peep-hole optimizer on program.
345
  void Optimize();
346
347
  // One-pass NFA: only correct if IsOnePass() is true,
348
  // but much faster than NFA (competitive with PCRE)
349
  // for those expressions.
350
  bool IsOnePass();
351
  bool SearchOnePass(absl::string_view text, absl::string_view context,
352
                     Anchor anchor, MatchKind kind, absl::string_view* match,
353
                     int nmatch);
354
355
  // Bit-state backtracking.  Fast on small cases but uses memory
356
  // proportional to the product of the list count and the text size.
357
0
  bool CanBitState() { return list_heads_.data() != NULL; }
358
  bool SearchBitState(absl::string_view text, absl::string_view context,
359
                      Anchor anchor, MatchKind kind, absl::string_view* match,
360
                      int nmatch);
361
362
  static const int kMaxOnePassCapture = 5;  // $0 through $4
363
364
  // Backtracking search: the gold standard against which the other
365
  // implementations are checked.  FOR TESTING ONLY.
366
  // It allocates a ton of memory to avoid running forever.
367
  // It is also recursive, so can't use in production (will overflow stacks).
368
  // The name "Unsafe" here is supposed to be a flag that
369
  // you should not be using this function.
370
  bool UnsafeSearchBacktrack(absl::string_view text, absl::string_view context,
371
                             Anchor anchor, MatchKind kind,
372
                             absl::string_view* match, int nmatch);
373
374
  // Computes range for any strings matching regexp. The min and max can in
375
  // some cases be arbitrarily precise, so the caller gets to specify the
376
  // maximum desired length of string returned.
377
  //
378
  // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
379
  // string s that is an anchored match for this regexp satisfies
380
  //   min <= s && s <= max.
381
  //
382
  // Note that PossibleMatchRange() will only consider the first copy of an
383
  // infinitely repeated element (i.e., any regexp element followed by a '*' or
384
  // '+' operator). Regexps with "{N}" constructions are not affected, as those
385
  // do not compile down to infinite repetitions.
386
  //
387
  // Returns true on success, false on error.
388
  bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
389
390
  // Outputs the program fanout into the given sparse array.
391
  void Fanout(SparseArray<int>* fanout);
392
393
  // Compiles a collection of regexps to Prog.  Each regexp will have
394
  // its own Match instruction recording the index in the output vector.
395
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
396
397
  // Flattens the Prog from "tree" form to "list" form. This is an in-place
398
  // operation in the sense that the old instructions are lost.
399
  void Flatten();
400
401
  // Walks the Prog; the "successor roots" or predecessors of the reachable
402
  // instructions are marked in rootmap or predmap/predvec, respectively.
403
  // reachable and stk are preallocated scratch structures.
404
  void MarkSuccessors(SparseArray<int>* rootmap,
405
                      SparseArray<int>* predmap,
406
                      std::vector<std::vector<int>>* predvec,
407
                      SparseSet* reachable, std::vector<int>* stk);
408
409
  // Walks the Prog from the given "root" instruction; the "dominator root"
410
  // of the reachable instructions (if such exists) is marked in rootmap.
411
  // reachable and stk are preallocated scratch structures.
412
  void MarkDominator(int root, SparseArray<int>* rootmap,
413
                     SparseArray<int>* predmap,
414
                     std::vector<std::vector<int>>* predvec,
415
                     SparseSet* reachable, std::vector<int>* stk);
416
417
  // Walks the Prog from the given "root" instruction; the reachable
418
  // instructions are emitted in "list" form and appended to flat.
419
  // reachable and stk are preallocated scratch structures.
420
  void EmitList(int root, SparseArray<int>* rootmap,
421
                std::vector<Inst>* flat,
422
                SparseSet* reachable, std::vector<int>* stk);
423
424
  // Computes hints for ByteRange instructions in [begin, end).
425
  void ComputeHints(std::vector<Inst>* flat, int begin, int end);
426
427
  // Controls whether the DFA should bail out early if the NFA would be faster.
428
  // FOR TESTING ONLY.
429
  static void TESTING_ONLY_set_dfa_should_bail_when_slow(bool b);
430
431
 private:
432
  friend class Compiler;
433
434
  DFA* GetDFA(MatchKind kind);
435
  void DeleteDFA(DFA* dfa);
436
437
  bool anchor_start_;       // regexp has explicit start anchor
438
  bool anchor_end_;         // regexp has explicit end anchor
439
  bool reversed_;           // whether program runs backward over input
440
  bool did_flatten_;        // has Flatten been called?
441
  bool did_onepass_;        // has IsOnePass been called?
442
443
  int start_;               // entry point for program
444
  int start_unanchored_;    // unanchored entry point for program
445
  int size_;                // number of instructions
446
  int bytemap_range_;       // bytemap_[x] < bytemap_range_
447
448
  bool prefix_foldcase_;    // whether prefix is case-insensitive
449
  size_t prefix_size_;      // size of prefix (0 if no prefix)
450
  union {
451
    uint64_t* prefix_dfa_;  // "Shift DFA" for prefix
452
    struct {
453
      int prefix_front_;    // first byte of prefix
454
      int prefix_back_;     // last byte of prefix
455
    };
456
  };
457
458
  int list_count_;                  // count of lists (see above)
459
  int inst_count_[kNumInst];        // count of instructions by opcode
460
  PODArray<uint16_t> list_heads_;   // sparse array enumerating list heads
461
                                    // not populated if size_ is overly large
462
  size_t bit_state_text_max_size_;  // upper bound (inclusive) on text.size()
463
464
  PODArray<Inst> inst_;              // pointer to instruction array
465
  PODArray<uint8_t> onepass_nodes_;  // data for OnePass nodes
466
467
  int64_t dfa_mem_;         // Maximum memory for DFAs.
468
  DFA* dfa_first_;          // DFA cached for kFirstMatch/kManyMatch
469
  DFA* dfa_longest_;        // DFA cached for kLongestMatch/kFullMatch
470
471
  uint8_t bytemap_[256];    // map from input bytes to byte classes
472
473
  absl::once_flag dfa_first_once_;
474
  absl::once_flag dfa_longest_once_;
475
476
  Prog(const Prog&) = delete;
477
  Prog& operator=(const Prog&) = delete;
478
};
479
480
// std::string_view in MSVC has iterators that aren't just pointers and
481
// that don't allow comparisons between different objects - not even if
482
// those objects are views into the same string! Thus, we provide these
483
// conversion functions for convenience.
484
0
static inline const char* BeginPtr(absl::string_view s) {
485
0
  return s.data();
486
0
}
Unexecuted instantiation: re2.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: compile.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: prog.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: dfa.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: onepass.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: nfa.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: bitstate.cc:re2::BeginPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
487
0
static inline const char* EndPtr(absl::string_view s) {
488
0
  return s.data() + s.size();
489
0
}
Unexecuted instantiation: re2.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: compile.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: prog.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: dfa.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: onepass.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: nfa.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
Unexecuted instantiation: bitstate.cc:re2::EndPtr(std::__1::basic_string_view<char, std::__1::char_traits<char> >)
490
491
}  // namespace re2
492
493
#endif  // RE2_PROG_H_