Coverage Report

Created: 2026-05-27 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/external/re2~/re2/dfa.cc
Line
Count
Source
1
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// A DFA (deterministic finite automaton)-based regular expression search.
6
//
7
// The DFA search has two main parts: the construction of the automaton,
8
// which is represented by a graph of State structures, and the execution
9
// of the automaton over a given input string.
10
//
11
// The basic idea is that the State graph is constructed so that the
12
// execution can simply start with a state s, and then for each byte c in
13
// the input string, execute "s = s->next[c]", checking at each point whether
14
// the current s represents a matching state.
15
//
16
// The simple explanation just given does convey the essence of this code,
17
// but it omits the details of how the State graph gets constructed as well
18
// as some performance-driven optimizations to the execution of the automaton.
19
// All these details are explained in the comments for the code following
20
// the definition of class DFA.
21
//
22
// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
23
24
#include <stddef.h>
25
#include <stdint.h>
26
#include <stdio.h>
27
#include <string.h>
28
29
#include <algorithm>
30
#include <atomic>
31
#include <deque>
32
#include <memory>
33
#include <string>
34
#include <utility>
35
#include <vector>
36
37
#include "absl/base/call_once.h"
38
#include "absl/base/thread_annotations.h"
39
#include "absl/container/flat_hash_map.h"
40
#include "absl/container/flat_hash_set.h"
41
#include "absl/hash/hash.h"
42
#include "absl/log/absl_check.h"
43
#include "absl/log/absl_log.h"
44
#include "absl/strings/str_format.h"
45
#include "absl/strings/string_view.h"
46
#include "absl/synchronization/mutex.h"
47
#include "absl/types/span.h"
48
#include "re2/pod_array.h"
49
#include "re2/prog.h"
50
#include "re2/re2.h"
51
#include "re2/sparse_set.h"
52
#include "util/strutil.h"
53
54
// Silence "zero-sized array in struct/union" warning for DFA::State::next_.
55
#ifdef _MSC_VER
56
#pragma warning(disable: 4200)
57
#endif
58
59
namespace re2 {
60
61
// Controls whether the DFA should bail out early if the NFA would be faster.
62
static bool dfa_should_bail_when_slow = true;
63
64
0
void Prog::TESTING_ONLY_set_dfa_should_bail_when_slow(bool b) {
65
0
  dfa_should_bail_when_slow = b;
66
0
}
67
68
// Changing this to true compiles in prints that trace execution of the DFA.
69
// Generates a lot of output -- only useful for debugging.
70
static const bool ExtraDebug = false;
71
72
// A DFA implementation of a regular expression program.
73
// Since this is entirely a forward declaration mandated by C++,
74
// some of the comments here are better understood after reading
75
// the comments in the sections that follow the DFA definition.
76
class DFA {
77
 public:
78
  DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem);
79
  ~DFA();
80
0
  bool ok() const { return !init_failed_; }
81
0
  Prog::MatchKind kind() { return kind_; }
82
83
  // Searches for the regular expression in text, which is considered
84
  // as a subsection of context for the purposes of interpreting flags
85
  // like ^ and $ and \A and \z.
86
  // Returns whether a match was found.
87
  // If a match is found, sets *ep to the end point of the best match in text.
88
  // If "anchored", the match must begin at the start of text.
89
  // If "want_earliest_match", the match that ends first is used, not
90
  //   necessarily the best one.
91
  // If "run_forward" is true, the DFA runs from text.begin() to text.end().
92
  //   If it is false, the DFA runs from text.end() to text.begin(),
93
  //   returning the leftmost end of the match instead of the rightmost one.
94
  // If the DFA cannot complete the search (for example, if it is out of
95
  //   memory), it sets *failed and returns false.
96
  bool Search(absl::string_view text, absl::string_view context, bool anchored,
97
              bool want_earliest_match, bool run_forward, bool* failed,
98
              const char** ep, SparseSet* matches);
99
100
  // Builds out all states for the entire DFA.
101
  // If cb is not empty, it receives one callback per state built.
102
  // Returns the number of states built.
103
  // FOR TESTING OR EXPERIMENTAL PURPOSES ONLY.
104
  int BuildAllStates(const Prog::DFAStateCallback& cb);
105
106
  // Computes min and max for matching strings.  Won't return strings
107
  // bigger than maxlen.
108
  bool PossibleMatchRange(std::string* min, std::string* max, int maxlen);
109
110
  // These data structures are logically private, but C++ makes it too
111
  // difficult to mark them as such.
112
  class RWLocker;
113
  class StateSaver;
114
  class Workq;
115
116
  // A single DFA state.  The DFA is represented as a graph of these
117
  // States, linked by the next_ pointers.  If in state s and reading
118
  // byte c, the next state should be s->next_[c].
119
  struct State {
120
0
    inline bool IsMatch() const { return (flag_ & kFlagMatch) != 0; }
121
122
    template <typename H>
123
0
    friend H AbslHashValue(H h, const State& a) {
124
0
      const absl::Span<const int> ainst(a.inst_, a.ninst_);
125
0
      return H::combine(std::move(h), a.flag_, ainst);
126
0
    }
127
128
0
    friend bool operator==(const State& a, const State& b) {
129
0
      const absl::Span<const int> ainst(a.inst_, a.ninst_);
130
0
      const absl::Span<const int> binst(b.inst_, b.ninst_);
131
0
      return &a == &b || (a.flag_ == b.flag_ && ainst == binst);
132
0
    }
133
134
    int* inst_;         // Instruction pointers in the state.
135
    int ninst_;         // # of inst_ pointers.
136
    uint32_t flag_;     // Empty string bitfield flags in effect on the way
137
                        // into this state, along with kFlagMatch if this
138
                        // is a matching state.
139
140
    std::atomic<State*> next_[];    // Outgoing arrows from State,
141
                                    // one per input byte class
142
  };
143
144
  enum {
145
    kByteEndText = 256,         // imaginary byte at end of text
146
147
    kFlagEmptyMask = 0xFF,      // State.flag_: bits holding kEmptyXXX flags
148
    kFlagMatch = 0x0100,        // State.flag_: this is a matching state
149
    kFlagLastWord = 0x0200,     // State.flag_: last byte was a word char
150
    kFlagNeedShift = 16,        // needed kEmpty bits are or'ed in shifted left
151
  };
152
153
  struct StateHash {
154
0
    size_t operator()(const State* a) const {
155
0
      ABSL_DCHECK(a != NULL);
156
0
      return absl::Hash<State>()(*a);
157
0
    }
158
  };
159
160
  struct StateEqual {
161
0
    bool operator()(const State* a, const State* b) const {
162
0
      ABSL_DCHECK(a != NULL);
163
0
      ABSL_DCHECK(b != NULL);
164
0
      return *a == *b;
165
0
    }
166
  };
167
168
  typedef absl::flat_hash_set<State*, StateHash, StateEqual> StateSet;
169
170
 private:
171
  // Make it easier to swap in a scalable reader-writer mutex.
172
  using CacheMutex = absl::Mutex;
173
174
  enum {
175
    // Indices into start_ for unanchored searches.
176
    // Add kStartAnchored for anchored searches.
177
    kStartBeginText = 0,          // text at beginning of context
178
    kStartBeginLine = 2,          // text at beginning of line
179
    kStartAfterWordChar = 4,      // text follows a word character
180
    kStartAfterNonWordChar = 6,   // text follows non-word character
181
    kMaxStart = 8,
182
183
    kStartAnchored = 1,
184
  };
185
186
  // Resets the DFA State cache, flushing all saved State* information.
187
  // Releases and reacquires cache_mutex_ via cache_lock, so any
188
  // State* existing before the call are not valid after the call.
189
  // Use a StateSaver to preserve important states across the call.
190
  // cache_mutex_.r <= L < mutex_
191
  // After: cache_mutex_.w <= L < mutex_
192
  void ResetCache(RWLocker* cache_lock);
193
194
  // Looks up and returns the State corresponding to a Workq.
195
  // L >= mutex_
196
  State* WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag);
197
198
  // Looks up and returns a State matching the inst, ninst, and flag.
199
  // L >= mutex_
200
  State* CachedState(int* inst, int ninst, uint32_t flag);
201
202
  // Clear the cache entirely.
203
  // Must hold cache_mutex_.w or be in destructor.
204
  void ClearCache();
205
206
  // Converts a State into a Workq: the opposite of WorkqToCachedState.
207
  // L >= mutex_
208
  void StateToWorkq(State* s, Workq* q);
209
210
  // Runs a State on a given byte, returning the next state.
211
  State* RunStateOnByteUnlocked(State*, int);  // cache_mutex_.r <= L < mutex_
212
  State* RunStateOnByte(State*, int);          // L >= mutex_
213
214
  // Runs a Workq on a given byte followed by a set of empty-string flags,
215
  // producing a new Workq in nq.  If a match instruction is encountered,
216
  // sets *ismatch to true.
217
  // L >= mutex_
218
  void RunWorkqOnByte(Workq* q, Workq* nq,
219
                      int c, uint32_t flag, bool* ismatch);
220
221
  // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
222
  // L >= mutex_
223
  void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint32_t flag);
224
225
  // Adds the instruction id to the Workq, following empty arrows
226
  // according to flag.
227
  // L >= mutex_
228
  void AddToQueue(Workq* q, int id, uint32_t flag);
229
230
  // For debugging, returns a text representation of State.
231
  static std::string DumpState(State* state);
232
233
  // For debugging, returns a text representation of a Workq.
234
  static std::string DumpWorkq(Workq* q);
235
236
  // Search parameters
237
  struct SearchParams {
238
    SearchParams(absl::string_view text, absl::string_view context,
239
                 RWLocker* cache_lock)
240
0
      : text(text),
241
0
        context(context),
242
0
        anchored(false),
243
0
        can_prefix_accel(false),
244
0
        want_earliest_match(false),
245
0
        run_forward(false),
246
0
        start(NULL),
247
0
        cache_lock(cache_lock),
248
0
        failed(false),
249
0
        ep(NULL),
250
0
        matches(NULL) {}
251
252
    absl::string_view text;
253
    absl::string_view context;
254
    bool anchored;
255
    bool can_prefix_accel;
256
    bool want_earliest_match;
257
    bool run_forward;
258
    State* start;
259
    RWLocker* cache_lock;
260
    bool failed;     // "out" parameter: whether search gave up
261
    const char* ep;  // "out" parameter: end pointer for match
262
    SparseSet* matches;
263
264
   private:
265
    SearchParams(const SearchParams&) = delete;
266
    SearchParams& operator=(const SearchParams&) = delete;
267
  };
268
269
  // Before each search, the parameters to Search are analyzed by
270
  // AnalyzeSearch to determine the state in which to start.
271
  struct StartInfo {
272
0
    StartInfo() : start(NULL) {}
273
    std::atomic<State*> start;
274
  };
275
276
  // Fills in params->start and params->can_prefix_accel using
277
  // the other search parameters.  Returns true on success,
278
  // false on failure.
279
  // cache_mutex_.r <= L < mutex_
280
  bool AnalyzeSearch(SearchParams* params);
281
  bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
282
                           uint32_t flags);
283
284
  // The generic search loop, inlined to create specialized versions.
285
  // cache_mutex_.r <= L < mutex_
286
  // Might unlock and relock cache_mutex_ via params->cache_lock.
287
  template <bool can_prefix_accel,
288
            bool want_earliest_match,
289
            bool run_forward>
290
  inline bool InlinedSearchLoop(SearchParams* params);
291
292
  // The specialized versions of InlinedSearchLoop.  The three letters
293
  // at the ends of the name denote the true/false values used as the
294
  // last three parameters of InlinedSearchLoop.
295
  // cache_mutex_.r <= L < mutex_
296
  // Might unlock and relock cache_mutex_ via params->cache_lock.
297
  bool SearchFFF(SearchParams* params);
298
  bool SearchFFT(SearchParams* params);
299
  bool SearchFTF(SearchParams* params);
300
  bool SearchFTT(SearchParams* params);
301
  bool SearchTFF(SearchParams* params);
302
  bool SearchTFT(SearchParams* params);
303
  bool SearchTTF(SearchParams* params);
304
  bool SearchTTT(SearchParams* params);
305
306
  // The main search loop: calls an appropriate specialized version of
307
  // InlinedSearchLoop.
308
  // cache_mutex_.r <= L < mutex_
309
  // Might unlock and relock cache_mutex_ via params->cache_lock.
310
  bool FastSearchLoop(SearchParams* params);
311
312
313
  // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
314
0
  int ByteMap(int c) {
315
0
    if (c == kByteEndText)
316
0
      return prog_->bytemap_range();
317
0
    return prog_->bytemap()[c];
318
0
  }
319
320
  // Constant after initialization.
321
  Prog* prog_;              // The regular expression program to run.
322
  Prog::MatchKind kind_;    // The kind of DFA.
323
  bool init_failed_;        // initialization failed (out of memory)
324
325
  absl::Mutex mutex_;  // mutex_ >= cache_mutex_.r
326
327
  // Scratch areas, protected by mutex_.
328
  Workq* q0_;             // Two pre-allocated work queues.
329
  Workq* q1_;
330
  PODArray<int> stack_;   // Pre-allocated stack for AddToQueue
331
332
  // State* cache.  Many threads use and add to the cache simultaneously,
333
  // holding cache_mutex_ for reading and mutex_ (above) when adding.
334
  // If the cache fills and needs to be discarded, the discarding is done
335
  // while holding cache_mutex_ for writing, to avoid interrupting other
336
  // readers.  Any State* pointers are only valid while cache_mutex_
337
  // is held.
338
  CacheMutex cache_mutex_;
339
  int64_t mem_budget_;     // Total memory budget for all States.
340
  int64_t state_budget_;   // Amount of memory remaining for new States.
341
  StateSet state_cache_;   // All States computed so far.
342
  StartInfo start_[kMaxStart];
343
344
  DFA(const DFA&) = delete;
345
  DFA& operator=(const DFA&) = delete;
346
};
347
348
// Shorthand for casting to uint8_t*.
349
0
static inline const uint8_t* BytePtr(const void* v) {
350
0
  return reinterpret_cast<const uint8_t*>(v);
351
0
}
352
353
// Work queues
354
355
// Marks separate thread groups of different priority
356
// in the work queue when in leftmost-longest matching mode.
357
0
#define Mark (-1)
358
359
// Separates the match IDs from the instructions in inst_.
360
// Used only for "many match" DFA states.
361
0
#define MatchSep (-2)
362
363
// Internally, the DFA uses a sparse array of
364
// program instruction pointers as a work queue.
365
// In leftmost longest mode, marks separate sections
366
// of workq that started executing at different
367
// locations in the string (earlier locations first).
368
class DFA::Workq : public SparseSet {
369
 public:
370
  // Constructor: n is number of normal slots, maxmark number of mark slots.
371
  Workq(int n, int maxmark) :
372
0
    SparseSet(n+maxmark),
373
0
    n_(n),
374
0
    maxmark_(maxmark),
375
0
    nextmark_(n),
376
0
    last_was_mark_(true) {
377
0
  }
378
379
0
  bool is_mark(int i) { return i >= n_; }
380
381
0
  int maxmark() { return maxmark_; }
382
383
0
  void clear() {
384
0
    SparseSet::clear();
385
0
    nextmark_ = n_;
386
0
  }
387
388
0
  void mark() {
389
0
    if (last_was_mark_)
390
0
      return;
391
0
    last_was_mark_ = false;
392
0
    SparseSet::insert_new(nextmark_++);
393
0
  }
394
395
0
  int size() {
396
0
    return n_ + maxmark_;
397
0
  }
398
399
0
  void insert(int id) {
400
0
    if (contains(id))
401
0
      return;
402
0
    insert_new(id);
403
0
  }
404
405
0
  void insert_new(int id) {
406
0
    last_was_mark_ = false;
407
0
    SparseSet::insert_new(id);
408
0
  }
409
410
 private:
411
  int n_;                // size excluding marks
412
  int maxmark_;          // maximum number of marks
413
  int nextmark_;         // id of next mark
414
  bool last_was_mark_;   // last inserted was mark
415
416
  Workq(const Workq&) = delete;
417
  Workq& operator=(const Workq&) = delete;
418
};
419
420
DFA::DFA(Prog* prog, Prog::MatchKind kind, int64_t max_mem)
421
0
  : prog_(prog),
422
0
    kind_(kind),
423
0
    init_failed_(false),
424
0
    q0_(NULL),
425
0
    q1_(NULL),
426
0
    mem_budget_(max_mem) {
427
0
  if (ExtraDebug)
428
0
    absl::FPrintF(stderr, "\nkind %d\n%s\n", kind_, prog_->DumpUnanchored());
429
0
  int nmark = 0;
430
0
  if (kind_ == Prog::kLongestMatch)
431
0
    nmark = prog_->size();
432
  // See DFA::AddToQueue() for why this is so.
433
0
  int nstack = prog_->inst_count(kInstCapture) +
434
0
               prog_->inst_count(kInstEmptyWidth) +
435
0
               prog_->inst_count(kInstNop) +
436
0
               nmark + 1;  // + 1 for start inst
437
438
  // Account for space needed for DFA, q0, q1, stack.
439
0
  mem_budget_ -= sizeof(DFA);
440
0
  mem_budget_ -= (prog_->size() + nmark) *
441
0
                 (sizeof(int)+sizeof(int)) * 2;  // q0, q1
442
0
  mem_budget_ -= nstack * sizeof(int);  // stack
443
0
  if (mem_budget_ < 0) {
444
0
    init_failed_ = true;
445
0
    return;
446
0
  }
447
448
0
  state_budget_ = mem_budget_;
449
450
  // Make sure there is a reasonable amount of working room left.
451
  // At minimum, the search requires room for two states in order
452
  // to limp along, restarting frequently.  We'll get better performance
453
  // if there is room for a larger number of states, say 20.
454
  // Note that a state stores list heads only, so we use the program
455
  // list count for the upper bound, not the program size.
456
0
  int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
457
0
  int64_t one_state = sizeof(State) + nnext*sizeof(std::atomic<State*>) +
458
0
                      (prog_->list_count()+nmark)*sizeof(int);
459
0
  if (state_budget_ < 20*one_state) {
460
0
    init_failed_ = true;
461
0
    return;
462
0
  }
463
464
0
  q0_ = new Workq(prog_->size(), nmark);
465
0
  q1_ = new Workq(prog_->size(), nmark);
466
0
  stack_ = PODArray<int>(nstack);
467
0
}
468
469
0
DFA::~DFA() {
470
0
  delete q0_;
471
0
  delete q1_;
472
0
  ClearCache();
473
0
}
474
475
// In the DFA state graph, s->next[c] == NULL means that the
476
// state has not yet been computed and needs to be.  We need
477
// a different special value to signal that s->next[c] is a
478
// state that can never lead to a match (and thus the search
479
// can be called off).  Hence DeadState.
480
0
#define DeadState reinterpret_cast<State*>(1)
481
482
// Signals that the rest of the string matches no matter what it is.
483
0
#define FullMatchState reinterpret_cast<State*>(2)
484
485
0
#define SpecialStateMax FullMatchState
486
487
// Debugging printouts
488
489
// For debugging, returns a string representation of the work queue.
490
0
std::string DFA::DumpWorkq(Workq* q) {
491
0
  std::string s;
492
0
  const char* sep = "";
493
0
  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
494
0
    if (q->is_mark(*it)) {
495
0
      s += "|";
496
0
      sep = "";
497
0
    } else {
498
0
      s += absl::StrFormat("%s%d", sep, *it);
499
0
      sep = ",";
500
0
    }
501
0
  }
502
0
  return s;
503
0
}
504
505
// For debugging, returns a string representation of the state.
506
0
std::string DFA::DumpState(State* state) {
507
0
  if (state == NULL)
508
0
    return "_";
509
0
  if (state == DeadState)
510
0
    return "X";
511
0
  if (state == FullMatchState)
512
0
    return "*";
513
0
  std::string s;
514
0
  const char* sep = "";
515
0
  s += absl::StrFormat("(%p)", state);
516
0
  for (int i = 0; i < state->ninst_; i++) {
517
0
    if (state->inst_[i] == Mark) {
518
0
      s += "|";
519
0
      sep = "";
520
0
    } else if (state->inst_[i] == MatchSep) {
521
0
      s += "||";
522
0
      sep = "";
523
0
    } else {
524
0
      s += absl::StrFormat("%s%d", sep, state->inst_[i]);
525
0
      sep = ",";
526
0
    }
527
0
  }
528
0
  s += absl::StrFormat(" flag=%#x", state->flag_);
529
0
  return s;
530
0
}
531
532
//////////////////////////////////////////////////////////////////////
533
//
534
// DFA state graph construction.
535
//
536
// The DFA state graph is a heavily-linked collection of State* structures.
537
// The state_cache_ is a set of all the State structures ever allocated,
538
// so that if the same state is reached by two different paths,
539
// the same State structure can be used.  This reduces allocation
540
// requirements and also avoids duplication of effort across the two
541
// identical states.
542
//
543
// A State is defined by an ordered list of instruction ids and a flag word.
544
//
545
// The choice of an ordered list of instructions differs from a typical
546
// textbook DFA implementation, which would use an unordered set.
547
// Textbook descriptions, however, only care about whether
548
// the DFA matches, not where it matches in the text.  To decide where the
549
// DFA matches, we need to mimic the behavior of the dominant backtracking
550
// implementations like PCRE, which try one possible regular expression
551
// execution, then another, then another, stopping when one of them succeeds.
552
// The DFA execution tries these many executions in parallel, representing
553
// each by an instruction id.  These pointers are ordered in the State.inst_
554
// list in the same order that the executions would happen in a backtracking
555
// search: if a match is found during execution of inst_[2], inst_[i] for i>=3
556
// can be discarded.
557
//
558
// Textbooks also typically do not consider context-aware empty string operators
559
// like ^ or $.  These are handled by the flag word, which specifies the set
560
// of empty-string operators that should be matched when executing at the
561
// current text position.  These flag bits are defined in prog.h.
562
// The flag word also contains two DFA-specific bits: kFlagMatch if the state
563
// is a matching state (one that reached a kInstMatch in the program)
564
// and kFlagLastWord if the last processed byte was a word character, for the
565
// implementation of \B and \b.
566
//
567
// The flag word also contains, shifted up 16 bits, the bits looked for by
568
// any kInstEmptyWidth instructions in the state.  These provide a useful
569
// summary indicating when new flags might be useful.
570
//
571
// The permanent representation of a State's instruction ids is just an array,
572
// but while a state is being analyzed, these instruction ids are represented
573
// as a Workq, which is an array that allows iteration in insertion order.
574
575
// NOTE(rsc): The choice of State construction determines whether the DFA
576
// mimics backtracking implementations (so-called leftmost first matching) or
577
// traditional DFA implementations (so-called leftmost longest matching as
578
// prescribed by POSIX).  This implementation chooses to mimic the
579
// backtracking implementations, because we want to replace PCRE.  To get
580
// POSIX behavior, the states would need to be considered not as a simple
581
// ordered list of instruction ids, but as a list of unordered sets of instruction
582
// ids.  A match by a state in one set would inhibit the running of sets
583
// farther down the list but not other instruction ids in the same set.  Each
584
// set would correspond to matches beginning at a given point in the string.
585
// This is implemented by separating different sets with Mark pointers.
586
587
// Looks in the State cache for a State matching q, flag.
588
// If one is found, returns it.  If one is not found, allocates one,
589
// inserts it in the cache, and returns it.
590
// If mq is not null, MatchSep and the match IDs in mq will be appended
591
// to the State.
592
0
DFA::State* DFA::WorkqToCachedState(Workq* q, Workq* mq, uint32_t flag) {
593
  //mutex_.AssertHeld();
594
595
  // Construct array of instruction ids for the new state.
596
  // In some cases, kInstAltMatch may trigger an upgrade to FullMatchState.
597
  // Otherwise, "compress" q down to list heads for storage; StateToWorkq()
598
  // will "decompress" it for computation by exploring from each list head.
599
  //
600
  // Historically, only kInstByteRange, kInstEmptyWidth and kInstMatch were
601
  // useful to keep, but it turned out that kInstAlt was necessary to keep:
602
  //
603
  // > [*] kInstAlt would seem useless to record in a state, since
604
  // > we've already followed both its arrows and saved all the
605
  // > interesting states we can reach from there.  The problem
606
  // > is that one of the empty-width instructions might lead
607
  // > back to the same kInstAlt (if an empty-width operator is starred),
608
  // > producing a different evaluation order depending on whether
609
  // > we keep the kInstAlt to begin with.  Sigh.
610
  // > A specific case that this affects is /(^|a)+/ matching "a".
611
  // > If we don't save the kInstAlt, we will match the whole "a" (0,1)
612
  // > but in fact the correct leftmost-first match is the leading "" (0,0).
613
  //
614
  // Recall that flattening transformed the Prog from "tree" form to "list"
615
  // form: in the former, kInstAlt existed explicitly... and abundantly; in
616
  // the latter, it's implied between the instructions that compose a list.
617
  // Thus, because the information wasn't lost, the bug doesn't remanifest.
618
0
  PODArray<int> inst(q->size());
619
0
  int n = 0;
620
0
  uint32_t needflags = 0;  // flags needed by kInstEmptyWidth instructions
621
0
  bool sawmatch = false;   // whether queue contains guaranteed kInstMatch
622
0
  bool sawmark = false;    // whether queue contains a Mark
623
0
  if (ExtraDebug)
624
0
    absl::FPrintF(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q), flag);
625
0
  for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
626
0
    int id = *it;
627
0
    if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
628
0
      break;
629
0
    if (q->is_mark(id)) {
630
0
      if (n > 0 && inst[n-1] != Mark) {
631
0
        sawmark = true;
632
0
        inst[n++] = Mark;
633
0
      }
634
0
      continue;
635
0
    }
636
0
    Prog::Inst* ip = prog_->inst(id);
637
0
    switch (ip->opcode()) {
638
0
      case kInstAltMatch:
639
        // This state will continue to a match no matter what
640
        // the rest of the input is.  If it is the highest priority match
641
        // being considered, return the special FullMatchState
642
        // to indicate that it's all matches from here out.
643
0
        if (kind_ != Prog::kManyMatch &&
644
0
            (kind_ != Prog::kFirstMatch ||
645
0
             (it == q->begin() && ip->greedy(prog_))) &&
646
0
            (kind_ != Prog::kLongestMatch || !sawmark) &&
647
0
            (flag & kFlagMatch)) {
648
0
          if (ExtraDebug)
649
0
            absl::FPrintF(stderr, " -> FullMatchState\n");
650
0
          return FullMatchState;
651
0
        }
652
0
        [[fallthrough]];
653
0
      default:
654
        // Record iff id is the head of its list, which must
655
        // be the case if id-1 is the last of *its* list. :)
656
0
        if (prog_->inst(id-1)->last())
657
0
          inst[n++] = *it;
658
0
        if (ip->opcode() == kInstEmptyWidth)
659
0
          needflags |= ip->empty();
660
0
        if (ip->opcode() == kInstMatch && !prog_->anchor_end())
661
0
          sawmatch = true;
662
0
        break;
663
0
    }
664
0
  }
665
0
  ABSL_DCHECK_LE(n, q->size());
666
0
  if (n > 0 && inst[n-1] == Mark)
667
0
    n--;
668
669
  // If there are no empty-width instructions waiting to execute,
670
  // then the extra flag bits will not be used, so there is no
671
  // point in saving them.  (Discarding them reduces the number
672
  // of distinct states.)
673
0
  if (needflags == 0)
674
0
    flag &= kFlagMatch;
675
676
  // NOTE(rsc): The code above cannot do flag &= needflags,
677
  // because if the right flags were present to pass the current
678
  // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
679
  // might be reached that in turn need different flags.
680
  // The only sure thing is that if there are no kInstEmptyWidth
681
  // instructions at all, no flags will be needed.
682
  // We could do the extra work to figure out the full set of
683
  // possibly needed flags by exploring past the kInstEmptyWidth
684
  // instructions, but the check above -- are any flags needed
685
  // at all? -- handles the most common case.  More fine-grained
686
  // analysis can only be justified by measurements showing that
687
  // too many redundant states are being allocated.
688
689
  // If there are no Insts in the list, it's a dead state,
690
  // which is useful to signal with a special pointer so that
691
  // the execution loop can stop early.  This is only okay
692
  // if the state is *not* a matching state.
693
0
  if (n == 0 && flag == 0) {
694
0
    if (ExtraDebug)
695
0
      absl::FPrintF(stderr, " -> DeadState\n");
696
0
    return DeadState;
697
0
  }
698
699
  // If we're in longest match mode, the state is a sequence of
700
  // unordered state sets separated by Marks.  Sort each set
701
  // to canonicalize, to reduce the number of distinct sets stored.
702
0
  if (kind_ == Prog::kLongestMatch) {
703
0
    int* ip = inst.data();
704
0
    int* ep = ip + n;
705
0
    while (ip < ep) {
706
0
      int* markp = ip;
707
0
      while (markp < ep && *markp != Mark)
708
0
        markp++;
709
0
      std::sort(ip, markp);
710
0
      if (markp < ep)
711
0
        markp++;
712
0
      ip = markp;
713
0
    }
714
0
  }
715
716
  // If we're in many match mode, canonicalize for similar reasons:
717
  // we have an unordered set of states (i.e. we don't have Marks)
718
  // and sorting will reduce the number of distinct sets stored.
719
0
  if (kind_ == Prog::kManyMatch) {
720
0
    int* ip = inst.data();
721
0
    int* ep = ip + n;
722
0
    std::sort(ip, ep);
723
0
  }
724
725
  // Append MatchSep and the match IDs in mq if necessary.
726
0
  if (mq != NULL) {
727
0
    inst[n++] = MatchSep;
728
0
    for (Workq::iterator i = mq->begin(); i != mq->end(); ++i) {
729
0
      int id = *i;
730
0
      Prog::Inst* ip = prog_->inst(id);
731
0
      if (ip->opcode() == kInstMatch)
732
0
        inst[n++] = ip->match_id();
733
0
    }
734
0
  }
735
736
  // Save the needed empty-width flags in the top bits for use later.
737
0
  flag |= needflags << kFlagNeedShift;
738
739
0
  State* state = CachedState(inst.data(), n, flag);
740
0
  return state;
741
0
}
742
743
// Looks in the State cache for a State matching inst, ninst, flag.
744
// If one is found, returns it.  If one is not found, allocates one,
745
// inserts it in the cache, and returns it.
746
0
DFA::State* DFA::CachedState(int* inst, int ninst, uint32_t flag) {
747
  //mutex_.AssertHeld();
748
749
  // Look in the cache for a pre-existing state.
750
  // We have to initialise the struct like this because otherwise
751
  // MSVC will complain about the flexible array member. :(
752
0
  State state;
753
0
  state.inst_ = inst;
754
0
  state.ninst_ = ninst;
755
0
  state.flag_ = flag;
756
0
  StateSet::iterator it = state_cache_.find(&state);
757
0
  if (it != state_cache_.end()) {
758
0
    if (ExtraDebug)
759
0
      absl::FPrintF(stderr, " -cached-> %s\n", DumpState(*it));
760
0
    return *it;
761
0
  }
762
763
  // Must have enough memory for new state.
764
  // In addition to what we're going to allocate,
765
  // the state cache hash table seems to incur about 18 bytes per
766
  // State*. Worst case for non-small sets is it being half full, where each
767
  // value present takes up 1 byte hash sample plus the pointer itself.
768
0
  const int kStateCacheOverhead = 18;
769
0
  int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
770
0
  int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>);
771
0
  int instmem = ninst*sizeof(int);
772
0
  if (mem_budget_ < mem + instmem + kStateCacheOverhead) {
773
0
    mem_budget_ = -1;
774
0
    return NULL;
775
0
  }
776
0
  mem_budget_ -= mem + instmem + kStateCacheOverhead;
777
778
  // Allocate new state along with room for next_ and inst_.
779
  // inst_ is stored separately since it's colder; this also
780
  // means that the States for a given DFA are the same size
781
  // class, so the allocator can hopefully pack them better.
782
0
  char* space = std::allocator<char>().allocate(mem);
783
0
  State* s = new (space) State;
784
0
  (void) new (s->next_) std::atomic<State*>[nnext];
785
  // Work around a unfortunate bug in older versions of libstdc++.
786
  // (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=64658)
787
0
  for (int i = 0; i < nnext; i++)
788
0
    (void) new (s->next_ + i) std::atomic<State*>(NULL);
789
0
  s->inst_ = std::allocator<int>().allocate(ninst);
790
0
  (void) new (s->inst_) int[ninst];
791
0
  memmove(s->inst_, inst, instmem);
792
0
  s->ninst_ = ninst;
793
0
  s->flag_ = flag;
794
0
  if (ExtraDebug)
795
0
    absl::FPrintF(stderr, " -> %s\n", DumpState(s));
796
797
  // Put state in cache and return it.
798
0
  state_cache_.insert(s);
799
0
  return s;
800
0
}
801
802
// Clear the cache.  Must hold cache_mutex_.w or be in destructor.
803
0
void DFA::ClearCache() {
804
0
  StateSet::iterator begin = state_cache_.begin();
805
0
  StateSet::iterator end = state_cache_.end();
806
0
  while (begin != end) {
807
0
    StateSet::iterator tmp = begin;
808
0
    ++begin;
809
    // Deallocate the instruction array, which is stored separately as above.
810
0
    std::allocator<int>().deallocate((*tmp)->inst_, (*tmp)->ninst_);
811
    // Deallocate the blob of memory that we allocated in DFA::CachedState().
812
    // We recompute mem in order to benefit from sized delete where possible.
813
0
    int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
814
0
    int mem = sizeof(State) + nnext*sizeof(std::atomic<State*>);
815
0
    std::allocator<char>().deallocate(reinterpret_cast<char*>(*tmp), mem);
816
0
  }
817
0
  state_cache_.clear();
818
0
}
819
820
// Copies insts in state s to the work queue q.
821
0
void DFA::StateToWorkq(State* s, Workq* q) {
822
0
  q->clear();
823
0
  for (int i = 0; i < s->ninst_; i++) {
824
0
    if (s->inst_[i] == Mark) {
825
0
      q->mark();
826
0
    } else if (s->inst_[i] == MatchSep) {
827
      // Nothing after this is an instruction!
828
0
      break;
829
0
    } else {
830
      // Explore from the head of the list.
831
0
      AddToQueue(q, s->inst_[i], s->flag_ & kFlagEmptyMask);
832
0
    }
833
0
  }
834
0
}
835
836
// Adds ip to the work queue, following empty arrows according to flag.
837
0
void DFA::AddToQueue(Workq* q, int id, uint32_t flag) {
838
839
  // Use stack_ to hold our stack of instructions yet to process.
840
  // It was preallocated as follows:
841
  //   one entry per Capture;
842
  //   one entry per EmptyWidth; and
843
  //   one entry per Nop.
844
  // This reflects the maximum number of stack pushes that each can
845
  // perform. (Each instruction can be processed at most once.)
846
  // When using marks, we also added nmark == prog_->size().
847
  // (Otherwise, nmark == 0.)
848
0
  int* stk = stack_.data();
849
0
  int nstk = 0;
850
851
0
  stk[nstk++] = id;
852
0
  while (nstk > 0) {
853
0
    ABSL_DCHECK_LE(nstk, stack_.size());
854
0
    id = stk[--nstk];
855
856
0
  Loop:
857
0
    if (id == Mark) {
858
0
      q->mark();
859
0
      continue;
860
0
    }
861
862
0
    if (id == 0)
863
0
      continue;
864
865
    // If ip is already on the queue, nothing to do.
866
    // Otherwise add it.  We don't actually keep all the
867
    // ones that get added, but adding all of them here
868
    // increases the likelihood of q->contains(id),
869
    // reducing the amount of duplicated work.
870
0
    if (q->contains(id))
871
0
      continue;
872
0
    q->insert_new(id);
873
874
    // Process instruction.
875
0
    Prog::Inst* ip = prog_->inst(id);
876
0
    switch (ip->opcode()) {
877
0
      default:
878
0
        ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
879
0
        break;
880
881
0
      case kInstByteRange:  // just save these on the queue
882
0
      case kInstMatch:
883
0
        if (ip->last())
884
0
          break;
885
0
        id = id+1;
886
0
        goto Loop;
887
888
0
      case kInstCapture:    // DFA treats captures as no-ops.
889
0
      case kInstNop:
890
0
        if (!ip->last())
891
0
          stk[nstk++] = id+1;
892
893
        // If this instruction is the [00-FF]* loop at the beginning of
894
        // a leftmost-longest unanchored search, separate with a Mark so
895
        // that future threads (which will start farther to the right in
896
        // the input string) are lower priority than current threads.
897
0
        if (ip->opcode() == kInstNop && q->maxmark() > 0 &&
898
0
            id == prog_->start_unanchored() && id != prog_->start())
899
0
          stk[nstk++] = Mark;
900
0
        id = ip->out();
901
0
        goto Loop;
902
903
0
      case kInstAltMatch:
904
0
        ABSL_DCHECK(!ip->last());
905
0
        id = id+1;
906
0
        goto Loop;
907
908
0
      case kInstEmptyWidth:
909
0
        if (!ip->last())
910
0
          stk[nstk++] = id+1;
911
912
        // Continue on if we have all the right flag bits.
913
0
        if (ip->empty() & ~flag)
914
0
          break;
915
0
        id = ip->out();
916
0
        goto Loop;
917
0
    }
918
0
  }
919
0
}
920
921
// Running of work queues.  In the work queue, order matters:
922
// the queue is sorted in priority order.  If instruction i comes before j,
923
// then the instructions that i produces during the run must come before
924
// the ones that j produces.  In order to keep this invariant, all the
925
// work queue runners have to take an old queue to process and then
926
// also a new queue to fill in.  It's not acceptable to add to the end of
927
// an existing queue, because new instructions will not end up in the
928
// correct position.
929
930
// Runs the work queue, processing the empty strings indicated by flag.
931
// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
932
// both ^ and $.  It is important that callers pass all flags at once:
933
// processing both ^ and $ is not the same as first processing only ^
934
// and then processing only $.  Doing the two-step sequence won't match
935
// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
936
// exhibited by existing implementations).
937
0
void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint32_t flag) {
938
0
  newq->clear();
939
0
  for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
940
0
    if (oldq->is_mark(*i))
941
0
      AddToQueue(newq, Mark, flag);
942
0
    else
943
0
      AddToQueue(newq, *i, flag);
944
0
  }
945
0
}
946
947
// Runs the work queue, processing the single byte c followed by any empty
948
// strings indicated by flag.  For example, c == 'a' and flag == kEmptyEndLine,
949
// means to match c$.  Sets the bool *ismatch to true if the end of the
950
// regular expression program has been reached (the regexp has matched).
951
void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
952
0
                         int c, uint32_t flag, bool* ismatch) {
953
  //mutex_.AssertHeld();
954
955
0
  newq->clear();
956
0
  for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
957
0
    if (oldq->is_mark(*i)) {
958
0
      if (*ismatch)
959
0
        return;
960
0
      newq->mark();
961
0
      continue;
962
0
    }
963
0
    int id = *i;
964
0
    Prog::Inst* ip = prog_->inst(id);
965
0
    switch (ip->opcode()) {
966
0
      default:
967
0
        ABSL_LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
968
0
        break;
969
970
0
      case kInstFail:        // never succeeds
971
0
      case kInstCapture:     // already followed
972
0
      case kInstNop:         // already followed
973
0
      case kInstAltMatch:    // already followed
974
0
      case kInstEmptyWidth:  // already followed
975
0
        break;
976
977
0
      case kInstByteRange:   // can follow if c is in range
978
0
        if (!ip->Matches(c))
979
0
          break;
980
0
        AddToQueue(newq, ip->out(), flag);
981
0
        if (ip->hint() != 0) {
982
          // We have a hint, but we must cancel out the
983
          // increment that will occur after the break.
984
0
          i += ip->hint() - 1;
985
0
        } else {
986
          // We have no hint, so we must find the end
987
          // of the current list and then skip to it.
988
0
          Prog::Inst* ip0 = ip;
989
0
          while (!ip->last())
990
0
            ++ip;
991
0
          i += ip - ip0;
992
0
        }
993
0
        break;
994
995
0
      case kInstMatch:
996
0
        if (prog_->anchor_end() && c != kByteEndText &&
997
0
            kind_ != Prog::kManyMatch)
998
0
          break;
999
0
        *ismatch = true;
1000
0
        if (kind_ == Prog::kFirstMatch) {
1001
          // Can stop processing work queue since we found a match.
1002
0
          return;
1003
0
        }
1004
0
        break;
1005
0
    }
1006
0
  }
1007
1008
0
  if (ExtraDebug)
1009
0
    absl::FPrintF(stderr, "%s on %d[%#x] -> %s [%d]\n",
1010
0
                  DumpWorkq(oldq), c, flag, DumpWorkq(newq), *ismatch);
1011
0
}
1012
1013
// Processes input byte c in state, returning new state.
1014
// Caller does not hold mutex.
1015
0
DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
1016
  // Keep only one RunStateOnByte going
1017
  // even if the DFA is being run by multiple threads.
1018
0
  absl::MutexLock l(&mutex_);
1019
0
  return RunStateOnByte(state, c);
1020
0
}
1021
1022
// Processes input byte c in state, returning new state.
1023
0
DFA::State* DFA::RunStateOnByte(State* state, int c) {
1024
  //mutex_.AssertHeld();
1025
1026
0
  if (state <= SpecialStateMax) {
1027
0
    if (state == FullMatchState) {
1028
      // It is convenient for routines like PossibleMatchRange
1029
      // if we implement RunStateOnByte for FullMatchState:
1030
      // once you get into this state you never get out,
1031
      // so it's pretty easy.
1032
0
      return FullMatchState;
1033
0
    }
1034
0
    if (state == DeadState) {
1035
0
      ABSL_LOG(DFATAL) << "DeadState in RunStateOnByte";
1036
0
      return NULL;
1037
0
    }
1038
0
    if (state == NULL) {
1039
0
      ABSL_LOG(DFATAL) << "NULL state in RunStateOnByte";
1040
0
      return NULL;
1041
0
    }
1042
0
    ABSL_LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
1043
0
    return NULL;
1044
0
  }
1045
1046
  // If someone else already computed this, return it.
1047
0
  State* ns = state->next_[ByteMap(c)].load(std::memory_order_relaxed);
1048
0
  if (ns != NULL)
1049
0
    return ns;
1050
1051
  // Convert state into Workq.
1052
0
  StateToWorkq(state, q0_);
1053
1054
  // Flags marking the kinds of empty-width things (^ $ etc)
1055
  // around this byte.  Before the byte we have the flags recorded
1056
  // in the State structure itself.  After the byte we have
1057
  // nothing yet (but that will change: read on).
1058
0
  uint32_t needflag = state->flag_ >> kFlagNeedShift;
1059
0
  uint32_t beforeflag = state->flag_ & kFlagEmptyMask;
1060
0
  uint32_t oldbeforeflag = beforeflag;
1061
0
  uint32_t afterflag = 0;
1062
1063
0
  if (c == '\n') {
1064
    // Insert implicit $ and ^ around \n
1065
0
    beforeflag |= kEmptyEndLine;
1066
0
    afterflag |= kEmptyBeginLine;
1067
0
  }
1068
1069
0
  if (c == kByteEndText) {
1070
    // Insert implicit $ and \z before the fake "end text" byte.
1071
0
    beforeflag |= kEmptyEndLine | kEmptyEndText;
1072
0
  }
1073
1074
  // The state flag kFlagLastWord says whether the last
1075
  // byte processed was a word character.  Use that info to
1076
  // insert empty-width (non-)word boundaries.
1077
0
  bool islastword = (state->flag_ & kFlagLastWord) != 0;
1078
0
  bool isword = c != kByteEndText && Prog::IsWordChar(static_cast<uint8_t>(c));
1079
0
  if (isword == islastword)
1080
0
    beforeflag |= kEmptyNonWordBoundary;
1081
0
  else
1082
0
    beforeflag |= kEmptyWordBoundary;
1083
1084
  // Okay, finally ready to run.
1085
  // Only useful to rerun on empty string if there are new, useful flags.
1086
0
  if (beforeflag & ~oldbeforeflag & needflag) {
1087
0
    RunWorkqOnEmptyString(q0_, q1_, beforeflag);
1088
0
    using std::swap;
1089
0
    swap(q0_, q1_);
1090
0
  }
1091
0
  bool ismatch = false;
1092
0
  RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch);
1093
0
  using std::swap;
1094
0
  swap(q0_, q1_);
1095
1096
  // Save afterflag along with ismatch and isword in new state.
1097
0
  uint32_t flag = afterflag;
1098
0
  if (ismatch)
1099
0
    flag |= kFlagMatch;
1100
0
  if (isword)
1101
0
    flag |= kFlagLastWord;
1102
1103
0
  if (ismatch && kind_ == Prog::kManyMatch)
1104
0
    ns = WorkqToCachedState(q0_, q1_, flag);
1105
0
  else
1106
0
    ns = WorkqToCachedState(q0_, NULL, flag);
1107
1108
  // Flush ns before linking to it.
1109
  // Write barrier before updating state->next_ so that the
1110
  // main search loop can proceed without any locking, for speed.
1111
  // (Otherwise it would need one mutex operation per input byte.)
1112
0
  state->next_[ByteMap(c)].store(ns, std::memory_order_release);
1113
0
  return ns;
1114
0
}
1115
1116
1117
//////////////////////////////////////////////////////////////////////
1118
// DFA cache reset.
1119
1120
// Reader-writer lock helper.
1121
//
1122
// The DFA uses a reader-writer mutex to protect the state graph itself.
1123
// Traversing the state graph requires holding the mutex for reading,
1124
// and discarding the state graph and starting over requires holding the
1125
// lock for writing.  If a search needs to expand the graph but is out
1126
// of memory, it will need to drop its read lock and then acquire the
1127
// write lock.  Since it cannot then atomically downgrade from write lock
1128
// to read lock, it runs the rest of the search holding the write lock.
1129
// (This probably helps avoid repeated contention, but really the decision
1130
// is forced by the Mutex interface.)  It's a bit complicated to keep
1131
// track of whether the lock is held for reading or writing and thread
1132
// that through the search, so instead we encapsulate it in the RWLocker
1133
// and pass that around.
1134
1135
class DFA::RWLocker {
1136
 public:
1137
  explicit RWLocker(CacheMutex* mu);
1138
  ~RWLocker();
1139
1140
  // If the lock is only held for reading right now,
1141
  // drop the read lock and re-acquire for writing.
1142
  // Subsequent calls to LockForWriting are no-ops.
1143
  // Notice that the lock is *released* temporarily.
1144
  void LockForWriting();
1145
1146
 private:
1147
  CacheMutex* mu_;
1148
  bool writing_;
1149
1150
  RWLocker(const RWLocker&) = delete;
1151
  RWLocker& operator=(const RWLocker&) = delete;
1152
};
1153
1154
0
DFA::RWLocker::RWLocker(CacheMutex* mu) : mu_(mu), writing_(false) {
1155
0
  mu_->ReaderLock();
1156
0
}
1157
1158
// This function is marked as ABSL_NO_THREAD_SAFETY_ANALYSIS because
1159
// the annotations don't support lock upgrade.
1160
0
void DFA::RWLocker::LockForWriting() ABSL_NO_THREAD_SAFETY_ANALYSIS {
1161
0
  if (!writing_) {
1162
0
    mu_->ReaderUnlock();
1163
0
    mu_->WriterLock();
1164
0
    writing_ = true;
1165
0
  }
1166
0
}
1167
1168
0
DFA::RWLocker::~RWLocker() {
1169
0
  if (!writing_)
1170
0
    mu_->ReaderUnlock();
1171
0
  else
1172
0
    mu_->WriterUnlock();
1173
0
}
1174
1175
1176
// When the DFA's State cache fills, we discard all the states in the
1177
// cache and start over.  Many threads can be using and adding to the
1178
// cache at the same time, so we synchronize using the cache_mutex_
1179
// to keep from stepping on other threads.  Specifically, all the
1180
// threads using the current cache hold cache_mutex_ for reading.
1181
// When a thread decides to flush the cache, it drops cache_mutex_
1182
// and then re-acquires it for writing.  That ensures there are no
1183
// other threads accessing the cache anymore.  The rest of the search
1184
// runs holding cache_mutex_ for writing, avoiding any contention
1185
// with or cache pollution caused by other threads.
1186
1187
0
void DFA::ResetCache(RWLocker* cache_lock) {
1188
  // Re-acquire the cache_mutex_ for writing (exclusive use).
1189
0
  cache_lock->LockForWriting();
1190
1191
0
  hooks::GetDFAStateCacheResetHook()({
1192
0
      state_budget_,
1193
0
      state_cache_.size(),
1194
0
  });
1195
1196
  // Clear the cache, reset the memory budget.
1197
0
  for (int i = 0; i < kMaxStart; i++)
1198
0
    start_[i].start.store(NULL, std::memory_order_relaxed);
1199
0
  ClearCache();
1200
0
  mem_budget_ = state_budget_;
1201
0
}
1202
1203
// Typically, a couple States do need to be preserved across a cache
1204
// reset, like the State at the current point in the search.
1205
// The StateSaver class helps keep States across cache resets.
1206
// It makes a copy of the state's guts outside the cache (before the reset)
1207
// and then can be asked, after the reset, to recreate the State
1208
// in the new cache.  For example, in a DFA method ("this" is a DFA):
1209
//
1210
//   StateSaver saver(this, s);
1211
//   ResetCache(cache_lock);
1212
//   s = saver.Restore();
1213
//
1214
// The saver should always have room in the cache to re-create the state,
1215
// because resetting the cache locks out all other threads, and the cache
1216
// is known to have room for at least a couple states (otherwise the DFA
1217
// constructor fails).
1218
1219
class DFA::StateSaver {
1220
 public:
1221
  explicit StateSaver(DFA* dfa, State* state);
1222
  ~StateSaver();
1223
1224
  // Recreates and returns a state equivalent to the
1225
  // original state passed to the constructor.
1226
  // Returns NULL if the cache has filled, but
1227
  // since the DFA guarantees to have room in the cache
1228
  // for a couple states, should never return NULL
1229
  // if used right after ResetCache.
1230
  State* Restore();
1231
1232
 private:
1233
  DFA* dfa_;         // the DFA to use
1234
  int* inst_;        // saved info from State
1235
  int ninst_;
1236
  uint32_t flag_;
1237
  bool is_special_;  // whether original state was special
1238
  State* special_;   // if is_special_, the original state
1239
1240
  StateSaver(const StateSaver&) = delete;
1241
  StateSaver& operator=(const StateSaver&) = delete;
1242
};
1243
1244
0
DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
1245
0
  dfa_ = dfa;
1246
0
  if (state <= SpecialStateMax) {
1247
0
    inst_ = NULL;
1248
0
    ninst_ = 0;
1249
0
    flag_ = 0;
1250
0
    is_special_ = true;
1251
0
    special_ = state;
1252
0
    return;
1253
0
  }
1254
0
  is_special_ = false;
1255
0
  special_ = NULL;
1256
0
  flag_ = state->flag_;
1257
0
  ninst_ = state->ninst_;
1258
0
  inst_ = new int[ninst_];
1259
0
  memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
1260
0
}
1261
1262
0
DFA::StateSaver::~StateSaver() {
1263
0
  if (!is_special_)
1264
0
    delete[] inst_;
1265
0
}
1266
1267
0
DFA::State* DFA::StateSaver::Restore() {
1268
0
  if (is_special_)
1269
0
    return special_;
1270
0
  absl::MutexLock l(&dfa_->mutex_);
1271
0
  State* s = dfa_->CachedState(inst_, ninst_, flag_);
1272
0
  if (s == NULL)
1273
0
    ABSL_LOG(DFATAL) << "StateSaver failed to restore state.";
1274
0
  return s;
1275
0
}
1276
1277
1278
//////////////////////////////////////////////////////////////////////
1279
//
1280
// DFA execution.
1281
//
1282
// The basic search loop is easy: start in a state s and then for each
1283
// byte c in the input, s = s->next[c].
1284
//
1285
// This simple description omits a few efficiency-driven complications.
1286
//
1287
// First, the State graph is constructed incrementally: it is possible
1288
// that s->next[c] is null, indicating that that state has not been
1289
// fully explored.  In this case, RunStateOnByte must be invoked to
1290
// determine the next state, which is cached in s->next[c] to save
1291
// future effort.  An alternative reason for s->next[c] to be null is
1292
// that the DFA has reached a so-called "dead state", in which any match
1293
// is no longer possible.  In this case RunStateOnByte will return NULL
1294
// and the processing of the string can stop early.
1295
//
1296
// Second, a 256-element pointer array for s->next_ makes each State
1297
// quite large (2kB on 64-bit machines).  Instead, dfa->bytemap_[]
1298
// maps from bytes to "byte classes" and then next_ only needs to have
1299
// as many pointers as there are byte classes.  A byte class is simply a
1300
// range of bytes that the regexp never distinguishes between.
1301
// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
1302
// 'a', 'b' to 'c', and 'c' to 0xFF.  The bytemap slows us a little bit
1303
// but in exchange we typically cut the size of a State (and thus our
1304
// memory footprint) by about 5-10x.  The comments still refer to
1305
// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
1306
//
1307
// Third, it is common for a DFA for an unanchored match to begin in a
1308
// state in which only one particular byte value can take the DFA to a
1309
// different state.  That is, s->next[c] != s for only one c.  In this
1310
// situation, the DFA can do better than executing the simple loop.
1311
// Instead, it can call memchr to search very quickly for the byte c.
1312
// Whether the start state has this property is determined during a
1313
// pre-compilation pass and the "can_prefix_accel" argument is set.
1314
//
1315
// Fourth, the desired behavior is to search for the leftmost-best match
1316
// (approximately, the same one that Perl would find), which is not
1317
// necessarily the match ending earliest in the string.  Each time a
1318
// match is found, it must be noted, but the DFA must continue on in
1319
// hope of finding a higher-priority match.  In some cases, the caller only
1320
// cares whether there is any match at all, not which one is found.
1321
// The "want_earliest_match" flag causes the search to stop at the first
1322
// match found.
1323
//
1324
// Fifth, one algorithm that uses the DFA needs it to run over the
1325
// input string backward, beginning at the end and ending at the beginning.
1326
// Passing false for the "run_forward" flag causes the DFA to run backward.
1327
//
1328
// The checks for these last three cases, which in a naive implementation
1329
// would be performed once per input byte, slow the general loop enough
1330
// to merit specialized versions of the search loop for each of the
1331
// eight possible settings of the three booleans.  Rather than write
1332
// eight different functions, we write one general implementation and then
1333
// inline it to create the specialized ones.
1334
//
1335
// Note that matches are delayed by one byte, to make it easier to
1336
// accomodate match conditions depending on the next input byte (like $ and \b).
1337
// When s->next[c]->IsMatch(), it means that there is a match ending just
1338
// *before* byte c.
1339
1340
// The generic search loop.  Searches text for a match, returning
1341
// the pointer to the end of the chosen match, or NULL if no match.
1342
// The bools are equal to the same-named variables in params, but
1343
// making them function arguments lets the inliner specialize
1344
// this function to each combination (see two paragraphs above).
1345
template <bool can_prefix_accel,
1346
          bool want_earliest_match,
1347
          bool run_forward>
1348
0
inline bool DFA::InlinedSearchLoop(SearchParams* params) {
1349
0
  State* start = params->start;
1350
0
  const uint8_t* bp = BytePtr(params->text.data());  // start of text
1351
0
  const uint8_t* p = bp;                             // text scanning point
1352
0
  const uint8_t* ep = BytePtr(params->text.data() +
1353
0
                              params->text.size());  // end of text
1354
0
  const uint8_t* resetp = NULL;                      // p at last cache reset
1355
0
  if (!run_forward) {
1356
0
    using std::swap;
1357
0
    swap(p, ep);
1358
0
  }
1359
1360
0
  const uint8_t* bytemap = prog_->bytemap();
1361
0
  const uint8_t* lastmatch = NULL;   // most recent matching position in text
1362
0
  bool matched = false;
1363
1364
0
  State* s = start;
1365
0
  if (ExtraDebug)
1366
0
    absl::FPrintF(stderr, "@stx: %s\n", DumpState(s));
1367
1368
0
  if (s->IsMatch()) {
1369
0
    matched = true;
1370
0
    lastmatch = p;
1371
0
    if (ExtraDebug)
1372
0
      absl::FPrintF(stderr, "match @stx! [%s]\n", DumpState(s));
1373
0
    if (params->matches != NULL) {
1374
0
      for (int i = s->ninst_ - 1; i >= 0; i--) {
1375
0
        int id = s->inst_[i];
1376
0
        if (id == MatchSep)
1377
0
          break;
1378
0
        params->matches->insert(id);
1379
0
      }
1380
0
    }
1381
0
    if (want_earliest_match) {
1382
0
      params->ep = reinterpret_cast<const char*>(lastmatch);
1383
0
      return true;
1384
0
    }
1385
0
  }
1386
1387
0
  while (p != ep) {
1388
0
    if (ExtraDebug)
1389
0
      absl::FPrintF(stderr, "@%d: %s\n", p - bp, DumpState(s));
1390
1391
0
    if (can_prefix_accel && s == start) {
1392
      // In start state, only way out is to find the prefix,
1393
      // so we use prefix accel (e.g. memchr) to skip ahead.
1394
      // If not found, we can skip to the end of the string.
1395
0
      p = BytePtr(prog_->PrefixAccel(p, ep - p));
1396
0
      if (p == NULL) {
1397
0
        p = ep;
1398
0
        break;
1399
0
      }
1400
0
    }
1401
1402
0
    int c;
1403
0
    if (run_forward)
1404
0
      c = *p++;
1405
0
    else
1406
0
      c = *--p;
1407
1408
    // Note that multiple threads might be consulting
1409
    // s->next_[bytemap[c]] simultaneously.
1410
    // RunStateOnByte takes care of the appropriate locking,
1411
    // including a memory barrier so that the unlocked access
1412
    // (sometimes known as "double-checked locking") is safe.
1413
    // The alternative would be either one DFA per thread
1414
    // or one mutex operation per input byte.
1415
    //
1416
    // ns == DeadState means the state is known to be dead
1417
    // (no more matches are possible).
1418
    // ns == NULL means the state has not yet been computed
1419
    // (need to call RunStateOnByteUnlocked).
1420
    // RunStateOnByte returns ns == NULL if it is out of memory.
1421
    // ns == FullMatchState means the rest of the string matches.
1422
    //
1423
    // Okay to use bytemap[] not ByteMap() here, because
1424
    // c is known to be an actual byte and not kByteEndText.
1425
1426
0
    State* ns = s->next_[bytemap[c]].load(std::memory_order_acquire);
1427
0
    if (ns == NULL) {
1428
0
      ns = RunStateOnByteUnlocked(s, c);
1429
0
      if (ns == NULL) {
1430
        // After we reset the cache, we hold cache_mutex exclusively,
1431
        // so if resetp != NULL, it means we filled the DFA state
1432
        // cache with this search alone (without any other threads).
1433
        // Benchmarks show that doing a state computation on every
1434
        // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
1435
        // same at about 2 MB/s.  Unless we're processing an average
1436
        // of 10 bytes per state computation, fail so that RE2 can
1437
        // fall back to the NFA.  However, RE2::Set cannot fall back,
1438
        // so we just have to keep on keeping on in that case.
1439
0
        if (dfa_should_bail_when_slow && resetp != NULL &&
1440
0
            static_cast<size_t>(p - resetp) < 10*state_cache_.size() &&
1441
0
            kind_ != Prog::kManyMatch) {
1442
0
          params->failed = true;
1443
0
          return false;
1444
0
        }
1445
0
        resetp = p;
1446
1447
        // Prepare to save start and s across the reset.
1448
0
        StateSaver save_start(this, start);
1449
0
        StateSaver save_s(this, s);
1450
1451
        // Discard all the States in the cache.
1452
0
        ResetCache(params->cache_lock);
1453
1454
        // Restore start and s so we can continue.
1455
0
        if ((start = save_start.Restore()) == NULL ||
1456
0
            (s = save_s.Restore()) == NULL) {
1457
          // Restore already did ABSL_LOG(DFATAL).
1458
0
          params->failed = true;
1459
0
          return false;
1460
0
        }
1461
0
        ns = RunStateOnByteUnlocked(s, c);
1462
0
        if (ns == NULL) {
1463
0
          ABSL_LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
1464
0
          params->failed = true;
1465
0
          return false;
1466
0
        }
1467
0
      }
1468
0
    }
1469
0
    if (ns <= SpecialStateMax) {
1470
0
      if (ns == DeadState) {
1471
0
        params->ep = reinterpret_cast<const char*>(lastmatch);
1472
0
        return matched;
1473
0
      }
1474
      // FullMatchState
1475
0
      params->ep = reinterpret_cast<const char*>(ep);
1476
0
      return true;
1477
0
    }
1478
1479
0
    s = ns;
1480
0
    if (s->IsMatch()) {
1481
0
      matched = true;
1482
      // The DFA notices the match one byte late,
1483
      // so adjust p before using it in the match.
1484
0
      if (run_forward)
1485
0
        lastmatch = p - 1;
1486
0
      else
1487
0
        lastmatch = p + 1;
1488
0
      if (ExtraDebug)
1489
0
        absl::FPrintF(stderr, "match @%d! [%s]\n", lastmatch - bp, DumpState(s));
1490
0
      if (params->matches != NULL) {
1491
0
        for (int i = s->ninst_ - 1; i >= 0; i--) {
1492
0
          int id = s->inst_[i];
1493
0
          if (id == MatchSep)
1494
0
            break;
1495
0
          params->matches->insert(id);
1496
0
        }
1497
0
      }
1498
0
      if (want_earliest_match) {
1499
0
        params->ep = reinterpret_cast<const char*>(lastmatch);
1500
0
        return true;
1501
0
      }
1502
0
    }
1503
0
  }
1504
1505
  // Process one more byte to see if it triggers a match.
1506
  // (Remember, matches are delayed one byte.)
1507
0
  if (ExtraDebug)
1508
0
    absl::FPrintF(stderr, "@etx: %s\n", DumpState(s));
1509
1510
0
  int lastbyte;
1511
0
  if (run_forward) {
1512
0
    if (EndPtr(params->text) == EndPtr(params->context))
1513
0
      lastbyte = kByteEndText;
1514
0
    else
1515
0
      lastbyte = EndPtr(params->text)[0] & 0xFF;
1516
0
  } else {
1517
0
    if (BeginPtr(params->text) == BeginPtr(params->context))
1518
0
      lastbyte = kByteEndText;
1519
0
    else
1520
0
      lastbyte = BeginPtr(params->text)[-1] & 0xFF;
1521
0
  }
1522
1523
0
  State* ns = s->next_[ByteMap(lastbyte)].load(std::memory_order_acquire);
1524
0
  if (ns == NULL) {
1525
0
    ns = RunStateOnByteUnlocked(s, lastbyte);
1526
0
    if (ns == NULL) {
1527
0
      StateSaver save_s(this, s);
1528
0
      ResetCache(params->cache_lock);
1529
0
      if ((s = save_s.Restore()) == NULL) {
1530
0
        params->failed = true;
1531
0
        return false;
1532
0
      }
1533
0
      ns = RunStateOnByteUnlocked(s, lastbyte);
1534
0
      if (ns == NULL) {
1535
0
        ABSL_LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
1536
0
        params->failed = true;
1537
0
        return false;
1538
0
      }
1539
0
    }
1540
0
  }
1541
0
  if (ns <= SpecialStateMax) {
1542
0
    if (ns == DeadState) {
1543
0
      params->ep = reinterpret_cast<const char*>(lastmatch);
1544
0
      return matched;
1545
0
    }
1546
    // FullMatchState
1547
0
    params->ep = reinterpret_cast<const char*>(ep);
1548
0
    return true;
1549
0
  }
1550
1551
0
  s = ns;
1552
0
  if (s->IsMatch()) {
1553
0
    matched = true;
1554
0
    lastmatch = p;
1555
0
    if (ExtraDebug)
1556
0
      absl::FPrintF(stderr, "match @etx! [%s]\n", DumpState(s));
1557
0
    if (params->matches != NULL) {
1558
0
      for (int i = s->ninst_ - 1; i >= 0; i--) {
1559
0
        int id = s->inst_[i];
1560
0
        if (id == MatchSep)
1561
0
          break;
1562
0
        params->matches->insert(id);
1563
0
      }
1564
0
    }
1565
0
  }
1566
1567
0
  params->ep = reinterpret_cast<const char*>(lastmatch);
1568
0
  return matched;
1569
0
}
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<false, false, false>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<false, false, true>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<false, true, false>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<false, true, true>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<true, false, false>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<true, false, true>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<true, true, false>(re2::DFA::SearchParams*)
Unexecuted instantiation: bool re2::DFA::InlinedSearchLoop<true, true, true>(re2::DFA::SearchParams*)
1570
1571
// Inline specializations of the general loop.
1572
0
bool DFA::SearchFFF(SearchParams* params) {
1573
0
  return InlinedSearchLoop<false, false, false>(params);
1574
0
}
1575
0
bool DFA::SearchFFT(SearchParams* params) {
1576
0
  return InlinedSearchLoop<false, false, true>(params);
1577
0
}
1578
0
bool DFA::SearchFTF(SearchParams* params) {
1579
0
  return InlinedSearchLoop<false, true, false>(params);
1580
0
}
1581
0
bool DFA::SearchFTT(SearchParams* params) {
1582
0
  return InlinedSearchLoop<false, true, true>(params);
1583
0
}
1584
0
bool DFA::SearchTFF(SearchParams* params) {
1585
0
  return InlinedSearchLoop<true, false, false>(params);
1586
0
}
1587
0
bool DFA::SearchTFT(SearchParams* params) {
1588
0
  return InlinedSearchLoop<true, false, true>(params);
1589
0
}
1590
0
bool DFA::SearchTTF(SearchParams* params) {
1591
0
  return InlinedSearchLoop<true, true, false>(params);
1592
0
}
1593
0
bool DFA::SearchTTT(SearchParams* params) {
1594
0
  return InlinedSearchLoop<true, true, true>(params);
1595
0
}
1596
1597
// For performance, calls the appropriate specialized version
1598
// of InlinedSearchLoop.
1599
0
bool DFA::FastSearchLoop(SearchParams* params) {
1600
  // Because the methods are private, the Searches array
1601
  // cannot be declared at top level.
1602
0
  static bool (DFA::*Searches[])(SearchParams*) = {
1603
0
    &DFA::SearchFFF,
1604
0
    &DFA::SearchFFT,
1605
0
    &DFA::SearchFTF,
1606
0
    &DFA::SearchFTT,
1607
0
    &DFA::SearchTFF,
1608
0
    &DFA::SearchTFT,
1609
0
    &DFA::SearchTTF,
1610
0
    &DFA::SearchTTT,
1611
0
  };
1612
1613
0
  int index = 4 * params->can_prefix_accel +
1614
0
              2 * params->want_earliest_match +
1615
0
              1 * params->run_forward;
1616
0
  return (this->*Searches[index])(params);
1617
0
}
1618
1619
1620
// The discussion of DFA execution above ignored the question of how
1621
// to determine the initial state for the search loop.  There are two
1622
// factors that influence the choice of start state.
1623
//
1624
// The first factor is whether the search is anchored or not.
1625
// The regexp program (Prog*) itself has
1626
// two different entry points: one for anchored searches and one for
1627
// unanchored searches.  (The unanchored version starts with a leading ".*?"
1628
// and then jumps to the anchored one.)
1629
//
1630
// The second factor is where text appears in the larger context, which
1631
// determines which empty-string operators can be matched at the beginning
1632
// of execution.  If text is at the very beginning of context, \A and ^ match.
1633
// Otherwise if text is at the beginning of a line, then ^ matches.
1634
// Otherwise it matters whether the character before text is a word character
1635
// or a non-word character.
1636
//
1637
// The two cases (unanchored vs not) and four cases (empty-string flags)
1638
// combine to make the eight cases recorded in the DFA's begin_text_[2],
1639
// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
1640
// StartInfos.  The start state for each is filled in the first time it
1641
// is used for an actual search.
1642
1643
// Examines text, context, and anchored to determine the right start
1644
// state for the DFA search loop.  Fills in params and returns true on success.
1645
// Returns false on failure.
1646
0
bool DFA::AnalyzeSearch(SearchParams* params) {
1647
0
  absl::string_view text = params->text;
1648
0
  absl::string_view context = params->context;
1649
1650
  // Sanity check: make sure that text lies within context.
1651
0
  if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) {
1652
0
    ABSL_LOG(DFATAL) << "context does not contain text";
1653
0
    params->start = DeadState;
1654
0
    return true;
1655
0
  }
1656
1657
  // Determine correct search type.
1658
0
  int start;
1659
0
  uint32_t flags;
1660
0
  if (params->run_forward) {
1661
0
    if (BeginPtr(text) == BeginPtr(context)) {
1662
0
      start = kStartBeginText;
1663
0
      flags = kEmptyBeginText|kEmptyBeginLine;
1664
0
    } else if (BeginPtr(text)[-1] == '\n') {
1665
0
      start = kStartBeginLine;
1666
0
      flags = kEmptyBeginLine;
1667
0
    } else if (Prog::IsWordChar(BeginPtr(text)[-1] & 0xFF)) {
1668
0
      start = kStartAfterWordChar;
1669
0
      flags = kFlagLastWord;
1670
0
    } else {
1671
0
      start = kStartAfterNonWordChar;
1672
0
      flags = 0;
1673
0
    }
1674
0
  } else {
1675
0
    if (EndPtr(text) == EndPtr(context)) {
1676
0
      start = kStartBeginText;
1677
0
      flags = kEmptyBeginText|kEmptyBeginLine;
1678
0
    } else if (EndPtr(text)[0] == '\n') {
1679
0
      start = kStartBeginLine;
1680
0
      flags = kEmptyBeginLine;
1681
0
    } else if (Prog::IsWordChar(EndPtr(text)[0] & 0xFF)) {
1682
0
      start = kStartAfterWordChar;
1683
0
      flags = kFlagLastWord;
1684
0
    } else {
1685
0
      start = kStartAfterNonWordChar;
1686
0
      flags = 0;
1687
0
    }
1688
0
  }
1689
0
  if (params->anchored)
1690
0
    start |= kStartAnchored;
1691
0
  StartInfo* info = &start_[start];
1692
1693
  // Try once without cache_lock for writing.
1694
  // Try again after resetting the cache
1695
  // (ResetCache will relock cache_lock for writing).
1696
0
  if (!AnalyzeSearchHelper(params, info, flags)) {
1697
0
    ResetCache(params->cache_lock);
1698
0
    if (!AnalyzeSearchHelper(params, info, flags)) {
1699
0
      params->failed = true;
1700
0
      ABSL_LOG(DFATAL) << "Failed to analyze start state.";
1701
0
      return false;
1702
0
    }
1703
0
  }
1704
1705
0
  params->start = info->start.load(std::memory_order_acquire);
1706
1707
  // Even if we could prefix accel, we cannot do so when anchored and,
1708
  // less obviously, we cannot do so when we are going to need flags.
1709
  // This trick works only when there is a single byte that leads to a
1710
  // different state!
1711
0
  if (prog_->can_prefix_accel() &&
1712
0
      !params->anchored &&
1713
0
      params->start > SpecialStateMax &&
1714
0
      params->start->flag_ >> kFlagNeedShift == 0)
1715
0
    params->can_prefix_accel = true;
1716
1717
0
  if (ExtraDebug)
1718
0
    absl::FPrintF(stderr, "anchored=%d fwd=%d flags=%#x state=%s can_prefix_accel=%d\n",
1719
0
                  params->anchored, params->run_forward, flags,
1720
0
                  DumpState(params->start), params->can_prefix_accel);
1721
1722
0
  return true;
1723
0
}
1724
1725
// Fills in info if needed.  Returns true on success, false on failure.
1726
bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
1727
0
                              uint32_t flags) {
1728
  // Quick check.
1729
0
  State* start = info->start.load(std::memory_order_acquire);
1730
0
  if (start != NULL)
1731
0
    return true;
1732
1733
0
  absl::MutexLock l(&mutex_);
1734
0
  start = info->start.load(std::memory_order_relaxed);
1735
0
  if (start != NULL)
1736
0
    return true;
1737
1738
0
  q0_->clear();
1739
0
  AddToQueue(q0_,
1740
0
             params->anchored ? prog_->start() : prog_->start_unanchored(),
1741
0
             flags);
1742
0
  start = WorkqToCachedState(q0_, NULL, flags);
1743
0
  if (start == NULL)
1744
0
    return false;
1745
1746
  // Synchronize with "quick check" above.
1747
0
  info->start.store(start, std::memory_order_release);
1748
0
  return true;
1749
0
}
1750
1751
// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
1752
bool DFA::Search(absl::string_view text, absl::string_view context,
1753
                 bool anchored, bool want_earliest_match, bool run_forward,
1754
0
                 bool* failed, const char** epp, SparseSet* matches) {
1755
0
  *epp = NULL;
1756
0
  if (!ok()) {
1757
0
    *failed = true;
1758
0
    return false;
1759
0
  }
1760
0
  *failed = false;
1761
1762
0
  if (ExtraDebug) {
1763
0
    absl::FPrintF(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored());
1764
0
    absl::FPrintF(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
1765
0
                  text, anchored, want_earliest_match, run_forward, kind_);
1766
0
  }
1767
1768
0
  RWLocker l(&cache_mutex_);
1769
0
  SearchParams params(text, context, &l);
1770
0
  params.anchored = anchored;
1771
0
  params.want_earliest_match = want_earliest_match;
1772
0
  params.run_forward = run_forward;
1773
  // matches should be null except when using RE2::Set.
1774
0
  ABSL_DCHECK(matches == NULL || kind_ == Prog::kManyMatch);
1775
0
  params.matches = matches;
1776
1777
0
  if (!AnalyzeSearch(&params)) {
1778
0
    *failed = true;
1779
0
    return false;
1780
0
  }
1781
0
  if (params.start == DeadState)
1782
0
    return false;
1783
0
  if (params.start == FullMatchState) {
1784
0
    if (run_forward == want_earliest_match)
1785
0
      *epp = text.data();
1786
0
    else
1787
0
      *epp = text.data() + text.size();
1788
0
    return true;
1789
0
  }
1790
0
  if (ExtraDebug)
1791
0
    absl::FPrintF(stderr, "start %s\n", DumpState(params.start));
1792
0
  bool ret = FastSearchLoop(&params);
1793
0
  if (params.failed) {
1794
0
    *failed = true;
1795
0
    return false;
1796
0
  }
1797
0
  *epp = params.ep;
1798
0
  return ret;
1799
0
}
1800
1801
0
DFA* Prog::GetDFA(MatchKind kind) {
1802
  // For a forward DFA, half the memory goes to each DFA.
1803
  // However, if it is a "many match" DFA, then there is
1804
  // no counterpart with which the memory must be shared.
1805
  //
1806
  // For a reverse DFA, all the memory goes to the
1807
  // "longest match" DFA, because RE2 never does reverse
1808
  // "first match" searches.
1809
0
  if (kind == kFirstMatch) {
1810
0
    absl::call_once(dfa_first_once_, [](Prog* prog) {
1811
0
      prog->dfa_first_ = new DFA(prog, kFirstMatch, prog->dfa_mem_ / 2);
1812
0
    }, this);
1813
0
    return dfa_first_;
1814
0
  } else if (kind == kManyMatch) {
1815
0
    absl::call_once(dfa_first_once_, [](Prog* prog) {
1816
0
      prog->dfa_first_ = new DFA(prog, kManyMatch, prog->dfa_mem_);
1817
0
    }, this);
1818
0
    return dfa_first_;
1819
0
  } else {
1820
0
    absl::call_once(dfa_longest_once_, [](Prog* prog) {
1821
0
      if (!prog->reversed_)
1822
0
        prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_ / 2);
1823
0
      else
1824
0
        prog->dfa_longest_ = new DFA(prog, kLongestMatch, prog->dfa_mem_);
1825
0
    }, this);
1826
0
    return dfa_longest_;
1827
0
  }
1828
0
}
1829
1830
0
void Prog::DeleteDFA(DFA* dfa) {
1831
0
  delete dfa;
1832
0
}
1833
1834
// Executes the regexp program to search in text,
1835
// which itself is inside the larger context.  (As a convenience,
1836
// passing a NULL context is equivalent to passing text.)
1837
// Returns true if a match is found, false if not.
1838
// If a match is found, fills in match0->end() to point at the end of the match
1839
// and sets match0->begin() to text.begin(), since the DFA can't track
1840
// where the match actually began.
1841
//
1842
// This is the only external interface (class DFA only exists in this file).
1843
//
1844
bool Prog::SearchDFA(absl::string_view text, absl::string_view context,
1845
                     Anchor anchor, MatchKind kind, absl::string_view* match0,
1846
0
                     bool* failed, SparseSet* matches) {
1847
0
  *failed = false;
1848
1849
0
  if (context.data() == NULL)
1850
0
    context = text;
1851
0
  bool caret = anchor_start();
1852
0
  bool dollar = anchor_end();
1853
0
  if (reversed_) {
1854
0
    using std::swap;
1855
0
    swap(caret, dollar);
1856
0
  }
1857
0
  if (caret && BeginPtr(context) != BeginPtr(text))
1858
0
    return false;
1859
0
  if (dollar && EndPtr(context) != EndPtr(text))
1860
0
    return false;
1861
1862
  // Handle full match by running an anchored longest match
1863
  // and then checking if it covers all of text.
1864
0
  bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
1865
0
  bool endmatch = false;
1866
0
  if (kind == kManyMatch) {
1867
    // This is split out in order to avoid clobbering kind.
1868
0
  } else if (kind == kFullMatch || anchor_end()) {
1869
0
    endmatch = true;
1870
0
    kind = kLongestMatch;
1871
0
  }
1872
1873
  // If the caller doesn't care where the match is (just whether one exists),
1874
  // then we can stop at the very first match we find, the so-called
1875
  // "earliest match".
1876
0
  bool want_earliest_match = false;
1877
0
  if (kind == kManyMatch) {
1878
    // This is split out in order to avoid clobbering kind.
1879
0
    if (matches == NULL) {
1880
0
      want_earliest_match = true;
1881
0
    }
1882
0
  } else if (match0 == NULL && !endmatch) {
1883
0
    want_earliest_match = true;
1884
0
    kind = kLongestMatch;
1885
0
  }
1886
1887
0
  DFA* dfa = GetDFA(kind);
1888
0
  const char* ep;
1889
0
  bool matched = dfa->Search(text, context, anchored,
1890
0
                             want_earliest_match, !reversed_,
1891
0
                             failed, &ep, matches);
1892
0
  if (*failed) {
1893
0
    hooks::GetDFASearchFailureHook()({
1894
        // Nothing yet...
1895
0
    });
1896
0
    return false;
1897
0
  }
1898
0
  if (!matched)
1899
0
    return false;
1900
0
  if (endmatch && ep != (reversed_ ? text.data() : text.data() + text.size()))
1901
0
    return false;
1902
1903
  // If caller cares, record the boundary of the match.
1904
  // We only know where it ends, so use the boundary of text
1905
  // as the beginning.
1906
0
  if (match0) {
1907
0
    if (reversed_)
1908
0
      *match0 =
1909
0
          absl::string_view(ep, static_cast<size_t>(text.data() + text.size() - ep));
1910
0
    else
1911
0
      *match0 =
1912
0
          absl::string_view(text.data(), static_cast<size_t>(ep - text.data()));
1913
0
  }
1914
0
  return true;
1915
0
}
1916
1917
// Build out all states in DFA.  Returns number of states.
1918
0
int DFA::BuildAllStates(const Prog::DFAStateCallback& cb) {
1919
0
  if (!ok())
1920
0
    return 0;
1921
1922
  // Pick out start state for unanchored search
1923
  // at beginning of text.
1924
0
  RWLocker l(&cache_mutex_);
1925
0
  SearchParams params(absl::string_view(), absl::string_view(), &l);
1926
0
  params.anchored = false;
1927
0
  if (!AnalyzeSearch(&params) ||
1928
0
      params.start == NULL ||
1929
0
      params.start == DeadState)
1930
0
    return 0;
1931
1932
  // Add start state to work queue.
1933
  // Note that any State* that we handle here must point into the cache,
1934
  // so we can simply depend on pointer-as-a-number hashing and equality.
1935
0
  absl::flat_hash_map<State*, int> m;
1936
0
  std::deque<State*> q;
1937
0
  m.emplace(params.start, static_cast<int>(m.size()));
1938
0
  q.push_back(params.start);
1939
1940
  // Compute the input bytes needed to cover all of the next pointers.
1941
0
  int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
1942
0
  std::vector<int> input(nnext);
1943
0
  for (int c = 0; c < 256; c++) {
1944
0
    int b = prog_->bytemap()[c];
1945
0
    while (c < 256-1 && prog_->bytemap()[c+1] == b)
1946
0
      c++;
1947
0
    input[b] = c;
1948
0
  }
1949
0
  input[prog_->bytemap_range()] = kByteEndText;
1950
1951
  // Scratch space for the output.
1952
0
  std::vector<int> output(nnext);
1953
1954
  // Flood to expand every state.
1955
0
  bool oom = false;
1956
0
  while (!q.empty()) {
1957
0
    State* s = q.front();
1958
0
    q.pop_front();
1959
0
    for (int c : input) {
1960
0
      State* ns = RunStateOnByteUnlocked(s, c);
1961
0
      if (ns == NULL) {
1962
0
        oom = true;
1963
0
        break;
1964
0
      }
1965
0
      if (ns == DeadState) {
1966
0
        output[ByteMap(c)] = -1;
1967
0
        continue;
1968
0
      }
1969
0
      if (m.find(ns) == m.end()) {
1970
0
        m.emplace(ns, static_cast<int>(m.size()));
1971
0
        q.push_back(ns);
1972
0
      }
1973
0
      output[ByteMap(c)] = m[ns];
1974
0
    }
1975
0
    if (cb)
1976
0
      cb(oom ? NULL : output.data(),
1977
0
         s == FullMatchState || s->IsMatch());
1978
0
    if (oom)
1979
0
      break;
1980
0
  }
1981
1982
0
  return static_cast<int>(m.size());
1983
0
}
1984
1985
// Build out all states in DFA for kind.  Returns number of states.
1986
0
int Prog::BuildEntireDFA(MatchKind kind, const DFAStateCallback& cb) {
1987
0
  return GetDFA(kind)->BuildAllStates(cb);
1988
0
}
1989
1990
// Computes min and max for matching string.
1991
// Won't return strings bigger than maxlen.
1992
0
bool DFA::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
1993
0
  if (!ok())
1994
0
    return false;
1995
1996
  // NOTE: if future users of PossibleMatchRange want more precision when
1997
  // presented with infinitely repeated elements, consider making this a
1998
  // parameter to PossibleMatchRange.
1999
0
  static int kMaxEltRepetitions = 0;
2000
2001
  // Keep track of the number of times we've visited states previously. We only
2002
  // revisit a given state if it's part of a repeated group, so if the value
2003
  // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
2004
  // |*max| to |PrefixSuccessor(*max)|.
2005
  //
2006
  // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
2007
  // tradition, implicitly insert a '0' value at first use. We take advantage
2008
  // of that property below.
2009
0
  absl::flat_hash_map<State*, int> previously_visited_states;
2010
2011
  // Pick out start state for anchored search at beginning of text.
2012
0
  RWLocker l(&cache_mutex_);
2013
0
  SearchParams params(absl::string_view(), absl::string_view(), &l);
2014
0
  params.anchored = true;
2015
0
  if (!AnalyzeSearch(&params))
2016
0
    return false;
2017
0
  if (params.start == DeadState) {  // No matching strings
2018
0
    *min = "";
2019
0
    *max = "";
2020
0
    return true;
2021
0
  }
2022
0
  if (params.start == FullMatchState)  // Every string matches: no max
2023
0
    return false;
2024
2025
  // The DFA is essentially a big graph rooted at params.start,
2026
  // and paths in the graph correspond to accepted strings.
2027
  // Each node in the graph has potentially 256+1 arrows
2028
  // coming out, one for each byte plus the magic end of
2029
  // text character kByteEndText.
2030
2031
  // To find the smallest possible prefix of an accepted
2032
  // string, we just walk the graph preferring to follow
2033
  // arrows with the lowest bytes possible.  To find the
2034
  // largest possible prefix, we follow the largest bytes
2035
  // possible.
2036
2037
  // The test for whether there is an arrow from s on byte j is
2038
  //    ns = RunStateOnByteUnlocked(s, j);
2039
  //    if (ns == NULL)
2040
  //      return false;
2041
  //    if (ns != DeadState && ns->ninst > 0)
2042
  // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
2043
  // It returns NULL only if the DFA has run out of memory,
2044
  // in which case we can't be sure of anything.
2045
  // The second check sees whether there was graph built
2046
  // and whether it is interesting graph.  Nodes might have
2047
  // ns->ninst == 0 if they exist only to represent the fact
2048
  // that a match was found on the previous byte.
2049
2050
  // Build minimum prefix.
2051
0
  State* s = params.start;
2052
0
  min->clear();
2053
0
  absl::MutexLock lock(&mutex_);
2054
0
  for (int i = 0; i < maxlen; i++) {
2055
0
    if (previously_visited_states[s] > kMaxEltRepetitions)
2056
0
      break;
2057
0
    previously_visited_states[s]++;
2058
2059
    // Stop if min is a match.
2060
0
    State* ns = RunStateOnByte(s, kByteEndText);
2061
0
    if (ns == NULL)  // DFA out of memory
2062
0
      return false;
2063
0
    if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
2064
0
      break;
2065
2066
    // Try to extend the string with low bytes.
2067
0
    bool extended = false;
2068
0
    for (int j = 0; j < 256; j++) {
2069
0
      ns = RunStateOnByte(s, j);
2070
0
      if (ns == NULL)  // DFA out of memory
2071
0
        return false;
2072
0
      if (ns == FullMatchState ||
2073
0
          (ns > SpecialStateMax && ns->ninst_ > 0)) {
2074
0
        extended = true;
2075
0
        min->append(1, static_cast<char>(j));
2076
0
        s = ns;
2077
0
        break;
2078
0
      }
2079
0
    }
2080
0
    if (!extended)
2081
0
      break;
2082
0
  }
2083
2084
  // Build maximum prefix.
2085
0
  previously_visited_states.clear();
2086
0
  s = params.start;
2087
0
  max->clear();
2088
0
  for (int i = 0; i < maxlen; i++) {
2089
0
    if (previously_visited_states[s] > kMaxEltRepetitions)
2090
0
      break;
2091
0
    previously_visited_states[s] += 1;
2092
2093
    // Try to extend the string with high bytes.
2094
0
    bool extended = false;
2095
0
    for (int j = 255; j >= 0; j--) {
2096
0
      State* ns = RunStateOnByte(s, j);
2097
0
      if (ns == NULL)
2098
0
        return false;
2099
0
      if (ns == FullMatchState ||
2100
0
          (ns > SpecialStateMax && ns->ninst_ > 0)) {
2101
0
        extended = true;
2102
0
        max->append(1, static_cast<char>(j));
2103
0
        s = ns;
2104
0
        break;
2105
0
      }
2106
0
    }
2107
0
    if (!extended) {
2108
      // Done, no need for PrefixSuccessor.
2109
0
      return true;
2110
0
    }
2111
0
  }
2112
2113
  // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
2114
0
  PrefixSuccessor(max);
2115
2116
  // If there are no bytes left, we have no way to say "there is no maximum
2117
  // string".  We could make the interface more complicated and be able to
2118
  // return "there is no maximum but here is a minimum", but that seems like
2119
  // overkill -- the most common no-max case is all possible strings, so not
2120
  // telling the caller that the empty string is the minimum match isn't a
2121
  // great loss.
2122
0
  if (max->empty())
2123
0
    return false;
2124
2125
0
  return true;
2126
0
}
2127
2128
// PossibleMatchRange for a Prog.
2129
0
bool Prog::PossibleMatchRange(std::string* min, std::string* max, int maxlen) {
2130
  // Have to use dfa_longest_ to get all strings for full matches.
2131
  // For example, (a|aa) never matches aa in first-match mode.
2132
0
  return GetDFA(kLongestMatch)->PossibleMatchRange(min, max, maxlen);
2133
0
}
2134
2135
}  // namespace re2