Coverage Report

Created: 2026-05-27 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/external/re2~/re2/bitstate.cc
Line
Count
Source
1
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7
// Prog::SearchBitState is a regular expression search with submatch
8
// tracking for small regular expressions and texts.  Similarly to
9
// testing/backtrack.cc, it allocates a bitmap with (count of
10
// lists) * (length of text) bits to make sure it never explores the
11
// same (instruction list, character position) multiple times.  This
12
// limits the search to run in time linear in the length of the text.
13
//
14
// Unlike testing/backtrack.cc, SearchBitState is not recursive
15
// on the text.
16
//
17
// SearchBitState is a fast replacement for the NFA code on small
18
// regexps and texts when SearchOnePass cannot be used.
19
20
#include <stddef.h>
21
#include <stdint.h>
22
#include <string.h>
23
24
#include <limits>
25
#include <utility>
26
27
#include "absl/log/absl_check.h"
28
#include "absl/log/absl_log.h"
29
#include "absl/strings/string_view.h"
30
#include "re2/pod_array.h"
31
#include "re2/prog.h"
32
#include "re2/regexp.h"
33
34
namespace re2 {
35
36
struct Job {
37
  int id;
38
  int rle;  // run length encoding
39
  const char* p;
40
};
41
42
class BitState {
43
 public:
44
  explicit BitState(Prog* prog);
45
46
  // The usual Search prototype.
47
  // Can only call Search once per BitState.
48
  bool Search(absl::string_view text, absl::string_view context, bool anchored,
49
              bool longest, absl::string_view* submatch, int nsubmatch);
50
51
 private:
52
  static inline bool ShouldVisit(absl::string_view text, uint64_t* visited, uint16_t id, const char* p);
53
  void Push(int id, const char* p);
54
  void GrowStack();
55
  bool TrySearch(int id, const char* p);
56
57
  // Search parameters
58
  Prog* prog_;                   // program being run
59
  absl::string_view text_;       // text being searched
60
  absl::string_view context_;    // greater context of text being searched
61
  bool anchored_;                // whether search is anchored at text.begin()
62
  bool longest_;                 // whether search wants leftmost-longest match
63
  bool endmatch_;                // whether match must end at text.end()
64
  absl::string_view* submatch_;  // submatches to fill in
65
  int nsubmatch_;                //   # of submatches to fill in
66
67
  // Search state
68
  static constexpr int kVisitedBits = 64;
69
  PODArray<uint64_t> visited_;  // bitmap: (list ID, char*) pairs visited
70
  PODArray<const char*> cap_;   // capture registers
71
  PODArray<Job> job_;           // stack of text positions to explore
72
  int njob_;                    // stack size
73
74
  BitState(const BitState&) = delete;
75
  BitState& operator=(const BitState&) = delete;
76
};
77
78
BitState::BitState(Prog* prog)
79
0
  : prog_(prog),
80
0
    anchored_(false),
81
0
    longest_(false),
82
0
    endmatch_(false),
83
0
    submatch_(NULL),
84
0
    nsubmatch_(0),
85
0
    njob_(0) {
86
0
}
87
88
// Given the text being searched and current visited state,
89
// as well as a list ID, should the search visit the (list ID, p) pair?
90
// If so, remember that it was visited so that the next time,
91
// we don't repeat the visit.
92
// We pass text and visited to this as a static method so that the
93
// caller can do those loads once instead of this code dereferencing
94
// them multiple times.
95
0
bool BitState::ShouldVisit(absl::string_view text, uint64_t* visited, uint16_t list_id, const char* p) {
96
0
  int n = list_id * static_cast<int>(text.size()+1) +
97
0
          static_cast<int>(p-text.data());
98
0
  if (visited[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
99
0
    return false;
100
0
  visited[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
101
0
  return true;
102
0
}
103
104
// Grow the stack.
105
0
void BitState::GrowStack() {
106
0
  PODArray<Job> tmp(2*job_.size());
107
0
  memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
108
0
  job_ = std::move(tmp);
109
0
}
110
111
// Push (id, p) onto the stack, growing it if necessary.
112
0
void BitState::Push(int id, const char* p) {
113
0
  if (njob_ >= job_.size()) {
114
0
    GrowStack();
115
0
    if (njob_ >= job_.size()) {
116
0
      ABSL_LOG(DFATAL) << "GrowStack() failed: "
117
0
                       << "njob_ = " << njob_ << ", "
118
0
                       << "job_.size() = " << job_.size();
119
0
      return;
120
0
    }
121
0
  }
122
123
  // If id < 0, it's undoing a Capture,
124
  // so we mustn't interfere with that.
125
0
  if (id >= 0 && njob_ > 0) {
126
0
    Job* top = &job_[njob_-1];
127
0
    if (id == top->id &&
128
0
        p == top->p + top->rle + 1 &&
129
0
        top->rle < std::numeric_limits<int>::max()) {
130
0
      ++top->rle;
131
0
      return;
132
0
    }
133
0
  }
134
135
0
  Job* top = &job_[njob_++];
136
0
  top->id = id;
137
0
  top->rle = 0;
138
0
  top->p = p;
139
0
}
140
141
// Try a search from instruction id0 in state p0.
142
// Return whether it succeeded.
143
0
bool BitState::TrySearch(int id0, const char* p0) {
144
0
  bool matched = false;
145
0
  const char* end = text_.data() + text_.size();
146
0
  uint16_t* list_heads = prog_->list_heads();
147
0
  uint64_t* visited = visited_.data();
148
0
  njob_ = 0;
149
  // Push() no longer checks ShouldVisit(),
150
  // so we must perform the check ourselves.
151
0
  if (ShouldVisit(text_, visited, list_heads[id0], p0))
152
0
    Push(id0, p0);
153
0
  while (njob_ > 0) {
154
    // Pop job off stack.
155
0
    --njob_;
156
0
    int id = job_[njob_].id;
157
0
    int& rle = job_[njob_].rle;
158
0
    const char* p = job_[njob_].p;
159
160
0
    if (id < 0) {
161
      // Undo the Capture.
162
0
      cap_[prog_->inst(-id)->cap()] = p;
163
0
      continue;
164
0
    }
165
166
0
    if (rle > 0) {
167
0
      p += rle;
168
      // Revivify job on stack.
169
0
      --rle;
170
0
      ++njob_;
171
0
    }
172
173
0
  Loop:
174
    // Visit id, p.
175
0
    Prog::Inst* ip = prog_->inst(id);
176
0
    switch (ip->opcode()) {
177
0
      default:
178
0
        ABSL_LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
179
0
        return false;
180
181
0
      case kInstFail:
182
0
        break;
183
184
0
      case kInstAltMatch:
185
0
        if (ip->greedy(prog_)) {
186
          // out1 is the Match instruction.
187
0
          id = ip->out1();
188
0
          p = end;
189
0
          goto Loop;
190
0
        }
191
0
        if (longest_) {
192
          // ip must be non-greedy...
193
          // out is the Match instruction.
194
0
          id = ip->out();
195
0
          p = end;
196
0
          goto Loop;
197
0
        }
198
0
        goto Next;
199
200
0
      case kInstByteRange: {
201
0
        int c = -1;
202
0
        if (p < end)
203
0
          c = *p & 0xFF;
204
0
        if (!ip->Matches(c))
205
0
          goto Next;
206
207
0
        if (ip->hint() != 0)
208
0
          Push(id+ip->hint(), p);  // try the next when we're done
209
0
        id = ip->out();
210
0
        p++;
211
0
        goto CheckAndLoop;
212
0
      }
213
214
0
      case kInstCapture:
215
0
        if (!ip->last())
216
0
          Push(id+1, p);  // try the next when we're done
217
218
0
        if (0 <= ip->cap() && ip->cap() < cap_.size()) {
219
          // Capture p to register, but save old value first.
220
0
          Push(-id, cap_[ip->cap()]);  // undo when we're done
221
0
          cap_[ip->cap()] = p;
222
0
        }
223
224
0
        id = ip->out();
225
0
        goto CheckAndLoop;
226
227
0
      case kInstEmptyWidth:
228
0
        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
229
0
          goto Next;
230
231
0
        if (!ip->last())
232
0
          Push(id+1, p);  // try the next when we're done
233
0
        id = ip->out();
234
0
        goto CheckAndLoop;
235
236
0
      case kInstNop:
237
0
        if (!ip->last())
238
0
          Push(id+1, p);  // try the next when we're done
239
0
        id = ip->out();
240
241
0
      CheckAndLoop:
242
        // Sanity check: id is the head of its list, which must
243
        // be the case if id-1 is the last of *its* list. :)
244
0
        ABSL_DCHECK(id == 0 || prog_->inst(id-1)->last());
245
0
        if (ShouldVisit(text_, visited, list_heads[id], p))
246
0
          goto Loop;
247
0
        break;
248
249
0
      case kInstMatch: {
250
0
        if (endmatch_ && p != end)
251
0
          goto Next;
252
253
        // We found a match.  If the caller doesn't care
254
        // where the match is, no point going further.
255
0
        if (nsubmatch_ == 0)
256
0
          return true;
257
258
        // Record best match so far.
259
        // Only need to check end point, because this entire
260
        // call is only considering one start position.
261
0
        matched = true;
262
0
        cap_[1] = p;
263
0
        if (submatch_[0].data() == NULL ||
264
0
            (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
265
0
          for (int i = 0; i < nsubmatch_; i++)
266
0
            submatch_[i] = absl::string_view(
267
0
                cap_[2 * i],
268
0
                static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
269
0
        }
270
271
        // If going for first match, we're done.
272
0
        if (!longest_)
273
0
          return true;
274
275
        // If we used the entire text, no longer match is possible.
276
0
        if (p == end)
277
0
          return true;
278
279
        // Otherwise, continue on in hope of a longer match.
280
        // Note the absence of the ShouldVisit() check here
281
        // due to execution remaining in the same list.
282
0
      Next:
283
0
        if (!ip->last()) {
284
0
          id++;
285
0
          goto Loop;
286
0
        }
287
0
        break;
288
0
      }
289
0
    }
290
0
  }
291
0
  return matched;
292
0
}
293
294
// Search text (within context) for prog_.
295
bool BitState::Search(absl::string_view text, absl::string_view context,
296
                      bool anchored, bool longest, absl::string_view* submatch,
297
0
                      int nsubmatch) {
298
  // Search parameters.
299
0
  text_ = text;
300
0
  context_ = context;
301
0
  if (context_.data() == NULL)
302
0
    context_ = text;
303
0
  if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
304
0
    return false;
305
0
  if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
306
0
    return false;
307
0
  anchored_ = anchored || prog_->anchor_start();
308
0
  longest_ = longest || prog_->anchor_end();
309
0
  endmatch_ = prog_->anchor_end();
310
0
  submatch_ = submatch;
311
0
  nsubmatch_ = nsubmatch;
312
0
  for (int i = 0; i < nsubmatch_; i++)
313
0
    submatch_[i] = absl::string_view();
314
315
  // Allocate scratch space.
316
0
  int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
317
0
  nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
318
0
  visited_ = PODArray<uint64_t>(nvisited);
319
0
  memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
320
321
0
  int ncap = 2*nsubmatch;
322
0
  if (ncap < 2)
323
0
    ncap = 2;
324
0
  cap_ = PODArray<const char*>(ncap);
325
0
  memset(cap_.data(), 0, ncap*sizeof cap_[0]);
326
327
  // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
328
0
  job_ = PODArray<Job>(64);
329
330
  // Anchored search must start at text.begin().
331
0
  if (anchored_) {
332
0
    cap_[0] = text.data();
333
0
    return TrySearch(prog_->start(), text.data());
334
0
  }
335
336
  // Unanchored search, starting from each possible text position.
337
  // Notice that we have to try the empty string at the end of
338
  // the text, so the loop condition is p <= text.end(), not p < text.end().
339
  // This looks like it's quadratic in the size of the text,
340
  // but we are not clearing visited_ between calls to TrySearch,
341
  // so no work is duplicated and it ends up still being linear.
342
0
  const char* etext = text.data() + text.size();
343
0
  for (const char* p = text.data(); p <= etext; p++) {
344
    // Try to use prefix accel (e.g. memchr) to skip ahead.
345
0
    if (p < etext && prog_->can_prefix_accel()) {
346
0
      p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
347
0
      if (p == NULL)
348
0
        p = etext;
349
0
    }
350
351
0
    cap_[0] = p;
352
0
    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
353
0
      return true;
354
    // Avoid invoking undefined behavior (arithmetic on a null pointer)
355
    // by simply not continuing the loop.
356
0
    if (p == NULL)
357
0
      break;
358
0
  }
359
0
  return false;
360
0
}
361
362
// Bit-state search.
363
bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
364
                          Anchor anchor, MatchKind kind,
365
0
                          absl::string_view* match, int nmatch) {
366
  // If full match, we ask for an anchored longest match
367
  // and then check that match[0] == text.
368
  // So make sure match[0] exists.
369
0
  absl::string_view sp0;
370
0
  if (kind == kFullMatch) {
371
0
    anchor = kAnchored;
372
0
    if (nmatch < 1) {
373
0
      match = &sp0;
374
0
      nmatch = 1;
375
0
    }
376
0
  }
377
378
  // Run the search.
379
0
  BitState b(this);
380
0
  bool anchored = anchor == kAnchored;
381
0
  bool longest = kind != kFirstMatch;
382
0
  if (!b.Search(text, context, anchored, longest, match, nmatch))
383
0
    return false;
384
0
  if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
385
0
    return false;
386
0
  return true;
387
0
}
388
389
}  // namespace re2