Coverage Report

Created: 2025-07-17 06:14

/src/re2/re2/bitstate.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7
// Prog::SearchBitState is a regular expression search with submatch
8
// tracking for small regular expressions and texts.  Similarly to
9
// testing/backtrack.cc, it allocates a bitmap with (count of
10
// lists) * (length of text) bits to make sure it never explores the
11
// same (instruction list, character position) multiple times.  This
12
// limits the search to run in time linear in the length of the text.
13
//
14
// Unlike testing/backtrack.cc, SearchBitState is not recursive
15
// on the text.
16
//
17
// SearchBitState is a fast replacement for the NFA code on small
18
// regexps and texts when SearchOnePass cannot be used.
19
20
#include <stddef.h>
21
#include <stdint.h>
22
#include <string.h>
23
24
#include <limits>
25
#include <utility>
26
27
#include "absl/log/absl_check.h"
28
#include "absl/log/absl_log.h"
29
#include "absl/strings/string_view.h"
30
#include "re2/pod_array.h"
31
#include "re2/prog.h"
32
#include "re2/regexp.h"
33
34
namespace re2 {
35
36
struct Job {
37
  int id;
38
  int rle;  // run length encoding
39
  const char* p;
40
};
41
42
class BitState {
43
 public:
44
  explicit BitState(Prog* prog);
45
46
  // The usual Search prototype.
47
  // Can only call Search once per BitState.
48
  bool Search(absl::string_view text, absl::string_view context, bool anchored,
49
              bool longest, absl::string_view* submatch, int nsubmatch);
50
51
 private:
52
  inline bool ShouldVisit(int id, const char* p);
53
  void Push(int id, const char* p);
54
  void GrowStack();
55
  bool TrySearch(int id, const char* p);
56
57
  // Search parameters
58
  Prog* prog_;                   // program being run
59
  absl::string_view text_;       // text being searched
60
  absl::string_view context_;    // greater context of text being searched
61
  bool anchored_;                // whether search is anchored at text.begin()
62
  bool longest_;                 // whether search wants leftmost-longest match
63
  bool endmatch_;                // whether match must end at text.end()
64
  absl::string_view* submatch_;  // submatches to fill in
65
  int nsubmatch_;                //   # of submatches to fill in
66
67
  // Search state
68
  static constexpr int kVisitedBits = 64;
69
  PODArray<uint64_t> visited_;  // bitmap: (list ID, char*) pairs visited
70
  PODArray<const char*> cap_;   // capture registers
71
  PODArray<Job> job_;           // stack of text positions to explore
72
  int njob_;                    // stack size
73
74
  BitState(const BitState&) = delete;
75
  BitState& operator=(const BitState&) = delete;
76
};
77
78
BitState::BitState(Prog* prog)
79
3.40k
  : prog_(prog),
80
3.40k
    anchored_(false),
81
3.40k
    longest_(false),
82
3.40k
    endmatch_(false),
83
    submatch_(NULL),
84
3.40k
    nsubmatch_(0),
85
3.40k
    njob_(0) {
86
3.40k
}
87
88
// Given id, which *must* be a list head, we can look up its list ID.
89
// Then the question is: Should the search visit the (list ID, p) pair?
90
// If so, remember that it was visited so that the next time,
91
// we don't repeat the visit.
92
1.34M
bool BitState::ShouldVisit(int id, const char* p) {
93
1.34M
  int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
94
1.34M
          static_cast<int>(p-text_.data());
95
1.34M
  if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
96
17.3k
    return false;
97
1.32M
  visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
98
1.32M
  return true;
99
1.34M
}
100
101
// Grow the stack.
102
388
void BitState::GrowStack() {
103
388
  PODArray<Job> tmp(2*job_.size());
104
388
  memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
105
388
  job_ = std::move(tmp);
106
388
}
107
108
// Push (id, p) onto the stack, growing it if necessary.
109
1.16M
void BitState::Push(int id, const char* p) {
110
1.16M
  if (njob_ >= job_.size()) {
111
388
    GrowStack();
112
388
    if (njob_ >= job_.size()) {
113
0
      ABSL_LOG(DFATAL) << "GrowStack() failed: "
114
0
                       << "njob_ = " << njob_ << ", "
115
0
                       << "job_.size() = " << job_.size();
116
0
      return;
117
0
    }
118
388
  }
119
120
  // If id < 0, it's undoing a Capture,
121
  // so we mustn't interfere with that.
122
1.16M
  if (id >= 0 && njob_ > 0) {
123
1.08M
    Job* top = &job_[njob_-1];
124
1.08M
    if (id == top->id &&
125
1.08M
        p == top->p + top->rle + 1 &&
126
1.08M
        top->rle < std::numeric_limits<int>::max()) {
127
9.96k
      ++top->rle;
128
9.96k
      return;
129
9.96k
    }
130
1.08M
  }
131
132
1.15M
  Job* top = &job_[njob_++];
133
1.15M
  top->id = id;
134
1.15M
  top->rle = 0;
135
1.15M
  top->p = p;
136
1.15M
}
137
138
// Try a search from instruction id0 in state p0.
139
// Return whether it succeeded.
140
3.40k
bool BitState::TrySearch(int id0, const char* p0) {
141
3.40k
  bool matched = false;
142
3.40k
  const char* end = text_.data() + text_.size();
143
3.40k
  njob_ = 0;
144
  // Push() no longer checks ShouldVisit(),
145
  // so we must perform the check ourselves.
146
3.40k
  if (ShouldVisit(id0, p0))
147
3.40k
    Push(id0, p0);
148
74.3k
  while (njob_ > 0) {
149
    // Pop job off stack.
150
73.1k
    --njob_;
151
73.1k
    int id = job_[njob_].id;
152
73.1k
    int& rle = job_[njob_].rle;
153
73.1k
    const char* p = job_[njob_].p;
154
155
73.1k
    if (id < 0) {
156
      // Undo the Capture.
157
33.6k
      cap_[prog_->inst(-id)->cap()] = p;
158
33.6k
      continue;
159
33.6k
    }
160
161
39.5k
    if (rle > 0) {
162
4.39k
      p += rle;
163
      // Revivify job on stack.
164
4.39k
      --rle;
165
4.39k
      ++njob_;
166
4.39k
    }
167
168
1.45M
  Loop:
169
    // Visit id, p.
170
1.45M
    Prog::Inst* ip = prog_->inst(id);
171
1.45M
    switch (ip->opcode()) {
172
0
      default:
173
0
        ABSL_LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
174
0
        return false;
175
176
352
      case kInstFail:
177
352
        break;
178
179
161
      case kInstAltMatch:
180
161
        if (ip->greedy(prog_)) {
181
          // out1 is the Match instruction.
182
130
          id = ip->out1();
183
130
          p = end;
184
130
          goto Loop;
185
130
        }
186
31
        if (longest_) {
187
          // ip must be non-greedy...
188
          // out is the Match instruction.
189
26
          id = ip->out();
190
26
          p = end;
191
26
          goto Loop;
192
26
        }
193
5
        goto Next;
194
195
203k
      case kInstByteRange: {
196
203k
        int c = -1;
197
203k
        if (p < end)
198
183k
          c = *p & 0xFF;
199
203k
        if (!ip->Matches(c))
200
102k
          goto Next;
201
202
100k
        if (ip->hint() != 0)
203
17.4k
          Push(id+ip->hint(), p);  // try the next when we're done
204
100k
        id = ip->out();
205
100k
        p++;
206
100k
        goto CheckAndLoop;
207
203k
      }
208
209
216k
      case kInstCapture:
210
216k
        if (!ip->last())
211
81.2k
          Push(id+1, p);  // try the next when we're done
212
213
216k
        if (0 <= ip->cap() && ip->cap() < cap_.size()) {
214
          // Capture p to register, but save old value first.
215
74.4k
          Push(-id, cap_[ip->cap()]);  // undo when we're done
216
74.4k
          cap_[ip->cap()] = p;
217
74.4k
        }
218
219
216k
        id = ip->out();
220
216k
        goto CheckAndLoop;
221
222
21.1k
      case kInstEmptyWidth:
223
21.1k
        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
224
7.79k
          goto Next;
225
226
13.3k
        if (!ip->last())
227
992
          Push(id+1, p);  // try the next when we're done
228
13.3k
        id = ip->out();
229
13.3k
        goto CheckAndLoop;
230
231
1.00M
      case kInstNop:
232
1.00M
        if (!ip->last())
233
987k
          Push(id+1, p);  // try the next when we're done
234
1.00M
        id = ip->out();
235
236
1.33M
      CheckAndLoop:
237
        // Sanity check: id is the head of its list, which must
238
        // be the case if id-1 is the last of *its* list. :)
239
1.33M
        ABSL_DCHECK(id == 0 || prog_->inst(id-1)->last());
240
1.33M
        if (ShouldVisit(id, p))
241
1.32M
          goto Loop;
242
17.3k
        break;
243
244
17.3k
      case kInstMatch: {
245
6.27k
        if (endmatch_ && p != end)
246
460
          goto Next;
247
248
        // We found a match.  If the caller doesn't care
249
        // where the match is, no point going further.
250
5.81k
        if (nsubmatch_ == 0)
251
0
          return true;
252
253
        // Record best match so far.
254
        // Only need to check end point, because this entire
255
        // call is only considering one start position.
256
5.81k
        matched = true;
257
5.81k
        cap_[1] = p;
258
5.81k
        if (submatch_[0].data() == NULL ||
259
5.81k
            (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
260
13.8k
          for (int i = 0; i < nsubmatch_; i++)
261
9.23k
            submatch_[i] = absl::string_view(
262
9.23k
                cap_[2 * i],
263
9.23k
                static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
264
4.61k
        }
265
266
        // If going for first match, we're done.
267
5.81k
        if (!longest_)
268
434
          return true;
269
270
        // If we used the entire text, no longer match is possible.
271
5.37k
        if (p == end)
272
1.80k
          return true;
273
274
        // Otherwise, continue on in hope of a longer match.
275
        // Note the absence of the ShouldVisit() check here
276
        // due to execution remaining in the same list.
277
113k
      Next:
278
113k
        if (!ip->last()) {
279
94.3k
          id++;
280
94.3k
          goto Loop;
281
94.3k
        }
282
19.6k
        break;
283
113k
      }
284
1.45M
    }
285
1.45M
  }
286
1.16k
  return matched;
287
3.40k
}
288
289
// Search text (within context) for prog_.
290
bool BitState::Search(absl::string_view text, absl::string_view context,
291
                      bool anchored, bool longest, absl::string_view* submatch,
292
3.40k
                      int nsubmatch) {
293
  // Search parameters.
294
3.40k
  text_ = text;
295
3.40k
  context_ = context;
296
3.40k
  if (context_.data() == NULL)
297
0
    context_ = text;
298
3.40k
  if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
299
4
    return false;
300
3.40k
  if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
301
0
    return false;
302
3.40k
  anchored_ = anchored || prog_->anchor_start();
303
3.40k
  longest_ = longest || prog_->anchor_end();
304
3.40k
  endmatch_ = prog_->anchor_end();
305
3.40k
  submatch_ = submatch;
306
3.40k
  nsubmatch_ = nsubmatch;
307
10.2k
  for (int i = 0; i < nsubmatch_; i++)
308
6.80k
    submatch_[i] = absl::string_view();
309
310
  // Allocate scratch space.
311
3.40k
  int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
312
3.40k
  nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
313
3.40k
  visited_ = PODArray<uint64_t>(nvisited);
314
3.40k
  memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
315
316
3.40k
  int ncap = 2*nsubmatch;
317
3.40k
  if (ncap < 2)
318
0
    ncap = 2;
319
3.40k
  cap_ = PODArray<const char*>(ncap);
320
3.40k
  memset(cap_.data(), 0, ncap*sizeof cap_[0]);
321
322
  // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
323
3.40k
  job_ = PODArray<Job>(64);
324
325
  // Anchored search must start at text.begin().
326
3.40k
  if (anchored_) {
327
3.40k
    cap_[0] = text.data();
328
3.40k
    return TrySearch(prog_->start(), text.data());
329
3.40k
  }
330
331
  // Unanchored search, starting from each possible text position.
332
  // Notice that we have to try the empty string at the end of
333
  // the text, so the loop condition is p <= text.end(), not p < text.end().
334
  // This looks like it's quadratic in the size of the text,
335
  // but we are not clearing visited_ between calls to TrySearch,
336
  // so no work is duplicated and it ends up still being linear.
337
0
  const char* etext = text.data() + text.size();
338
0
  for (const char* p = text.data(); p <= etext; p++) {
339
    // Try to use prefix accel (e.g. memchr) to skip ahead.
340
0
    if (p < etext && prog_->can_prefix_accel()) {
341
0
      p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
342
0
      if (p == NULL)
343
0
        p = etext;
344
0
    }
345
346
0
    cap_[0] = p;
347
0
    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
348
0
      return true;
349
    // Avoid invoking undefined behavior (arithmetic on a null pointer)
350
    // by simply not continuing the loop.
351
0
    if (p == NULL)
352
0
      break;
353
0
  }
354
0
  return false;
355
0
}
356
357
// Bit-state search.
358
bool Prog::SearchBitState(absl::string_view text, absl::string_view context,
359
                          Anchor anchor, MatchKind kind,
360
3.40k
                          absl::string_view* match, int nmatch) {
361
  // If full match, we ask for an anchored longest match
362
  // and then check that match[0] == text.
363
  // So make sure match[0] exists.
364
3.40k
  absl::string_view sp0;
365
3.40k
  if (kind == kFullMatch) {
366
2.17k
    anchor = kAnchored;
367
2.17k
    if (nmatch < 1) {
368
0
      match = &sp0;
369
0
      nmatch = 1;
370
0
    }
371
2.17k
  }
372
373
  // Run the search.
374
3.40k
  BitState b(this);
375
3.40k
  bool anchored = anchor == kAnchored;
376
3.40k
  bool longest = kind != kFirstMatch;
377
3.40k
  if (!b.Search(text, context, anchored, longest, match, nmatch))
378
1.02k
    return false;
379
2.38k
  if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
380
93
    return false;
381
2.28k
  return true;
382
2.38k
}
383
384
}  // namespace re2