Coverage Report

Created: 2023-09-28 22:23

/src/re2/re2/bitstate.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
6
7
// Prog::SearchBitState is a regular expression search with submatch
8
// tracking for small regular expressions and texts.  Similarly to
9
// testing/backtrack.cc, it allocates a bitmap with (count of
10
// lists) * (length of text) bits to make sure it never explores the
11
// same (instruction list, character position) multiple times.  This
12
// limits the search to run in time linear in the length of the text.
13
//
14
// Unlike testing/backtrack.cc, SearchBitState is not recursive
15
// on the text.
16
//
17
// SearchBitState is a fast replacement for the NFA code on small
18
// regexps and texts when SearchOnePass cannot be used.
19
20
#include <stddef.h>
21
#include <stdint.h>
22
#include <string.h>
23
#include <limits>
24
#include <utility>
25
26
#include "util/logging.h"
27
#include "re2/pod_array.h"
28
#include "re2/prog.h"
29
#include "re2/regexp.h"
30
31
namespace re2 {
32
33
struct Job {
34
  int id;
35
  int rle;  // run length encoding
36
  const char* p;
37
};
38
39
class BitState {
40
 public:
41
  explicit BitState(Prog* prog);
42
43
  // The usual Search prototype.
44
  // Can only call Search once per BitState.
45
  bool Search(const StringPiece& text, const StringPiece& context,
46
              bool anchored, bool longest,
47
              StringPiece* submatch, int nsubmatch);
48
49
 private:
50
  inline bool ShouldVisit(int id, const char* p);
51
  void Push(int id, const char* p);
52
  void GrowStack();
53
  bool TrySearch(int id, const char* p);
54
55
  // Search parameters
56
  Prog* prog_;              // program being run
57
  StringPiece text_;        // text being searched
58
  StringPiece context_;     // greater context of text being searched
59
  bool anchored_;           // whether search is anchored at text.begin()
60
  bool longest_;            // whether search wants leftmost-longest match
61
  bool endmatch_;           // whether match must end at text.end()
62
  StringPiece* submatch_;   // submatches to fill in
63
  int nsubmatch_;           //   # of submatches to fill in
64
65
  // Search state
66
  static constexpr int kVisitedBits = 64;
67
  PODArray<uint64_t> visited_;  // bitmap: (list ID, char*) pairs visited
68
  PODArray<const char*> cap_;   // capture registers
69
  PODArray<Job> job_;           // stack of text positions to explore
70
  int njob_;                    // stack size
71
72
  BitState(const BitState&) = delete;
73
  BitState& operator=(const BitState&) = delete;
74
};
75
76
BitState::BitState(Prog* prog)
77
  : prog_(prog),
78
    anchored_(false),
79
    longest_(false),
80
    endmatch_(false),
81
    submatch_(NULL),
82
    nsubmatch_(0),
83
0
    njob_(0) {
84
0
}
85
86
// Given id, which *must* be a list head, we can look up its list ID.
87
// Then the question is: Should the search visit the (list ID, p) pair?
88
// If so, remember that it was visited so that the next time,
89
// we don't repeat the visit.
90
0
bool BitState::ShouldVisit(int id, const char* p) {
91
0
  int n = prog_->list_heads()[id] * static_cast<int>(text_.size()+1) +
92
0
          static_cast<int>(p-text_.data());
93
0
  if (visited_[n/kVisitedBits] & (uint64_t{1} << (n & (kVisitedBits-1))))
94
0
    return false;
95
0
  visited_[n/kVisitedBits] |= uint64_t{1} << (n & (kVisitedBits-1));
96
0
  return true;
97
0
}
98
99
// Grow the stack.
100
0
void BitState::GrowStack() {
101
0
  PODArray<Job> tmp(2*job_.size());
102
0
  memmove(tmp.data(), job_.data(), njob_*sizeof job_[0]);
103
0
  job_ = std::move(tmp);
104
0
}
105
106
// Push (id, p) onto the stack, growing it if necessary.
107
0
void BitState::Push(int id, const char* p) {
108
0
  if (njob_ >= job_.size()) {
109
0
    GrowStack();
110
0
    if (njob_ >= job_.size()) {
111
0
      LOG(DFATAL) << "GrowStack() failed: "
112
0
                  << "njob_ = " << njob_ << ", "
113
0
                  << "job_.size() = " << job_.size();
114
0
      return;
115
0
    }
116
0
  }
117
118
  // If id < 0, it's undoing a Capture,
119
  // so we mustn't interfere with that.
120
0
  if (id >= 0 && njob_ > 0) {
121
0
    Job* top = &job_[njob_-1];
122
0
    if (id == top->id &&
123
0
        p == top->p + top->rle + 1 &&
124
0
        top->rle < std::numeric_limits<int>::max()) {
125
0
      ++top->rle;
126
0
      return;
127
0
    }
128
0
  }
129
130
0
  Job* top = &job_[njob_++];
131
0
  top->id = id;
132
0
  top->rle = 0;
133
0
  top->p = p;
134
0
}
135
136
// Try a search from instruction id0 in state p0.
137
// Return whether it succeeded.
138
0
bool BitState::TrySearch(int id0, const char* p0) {
139
0
  bool matched = false;
140
0
  const char* end = text_.data() + text_.size();
141
0
  njob_ = 0;
142
  // Push() no longer checks ShouldVisit(),
143
  // so we must perform the check ourselves.
144
0
  if (ShouldVisit(id0, p0))
145
0
    Push(id0, p0);
146
0
  while (njob_ > 0) {
147
    // Pop job off stack.
148
0
    --njob_;
149
0
    int id = job_[njob_].id;
150
0
    int& rle = job_[njob_].rle;
151
0
    const char* p = job_[njob_].p;
152
153
0
    if (id < 0) {
154
      // Undo the Capture.
155
0
      cap_[prog_->inst(-id)->cap()] = p;
156
0
      continue;
157
0
    }
158
159
0
    if (rle > 0) {
160
0
      p += rle;
161
      // Revivify job on stack.
162
0
      --rle;
163
0
      ++njob_;
164
0
    }
165
166
0
  Loop:
167
    // Visit id, p.
168
0
    Prog::Inst* ip = prog_->inst(id);
169
0
    switch (ip->opcode()) {
170
0
      default:
171
0
        LOG(DFATAL) << "Unexpected opcode: " << ip->opcode();
172
0
        return false;
173
174
0
      case kInstFail:
175
0
        break;
176
177
0
      case kInstAltMatch:
178
0
        if (ip->greedy(prog_)) {
179
          // out1 is the Match instruction.
180
0
          id = ip->out1();
181
0
          p = end;
182
0
          goto Loop;
183
0
        }
184
0
        if (longest_) {
185
          // ip must be non-greedy...
186
          // out is the Match instruction.
187
0
          id = ip->out();
188
0
          p = end;
189
0
          goto Loop;
190
0
        }
191
0
        goto Next;
192
193
0
      case kInstByteRange: {
194
0
        int c = -1;
195
0
        if (p < end)
196
0
          c = *p & 0xFF;
197
0
        if (!ip->Matches(c))
198
0
          goto Next;
199
200
0
        if (ip->hint() != 0)
201
0
          Push(id+ip->hint(), p);  // try the next when we're done
202
0
        id = ip->out();
203
0
        p++;
204
0
        goto CheckAndLoop;
205
0
      }
206
207
0
      case kInstCapture:
208
0
        if (!ip->last())
209
0
          Push(id+1, p);  // try the next when we're done
210
211
0
        if (0 <= ip->cap() && ip->cap() < cap_.size()) {
212
          // Capture p to register, but save old value first.
213
0
          Push(-id, cap_[ip->cap()]);  // undo when we're done
214
0
          cap_[ip->cap()] = p;
215
0
        }
216
217
0
        id = ip->out();
218
0
        goto CheckAndLoop;
219
220
0
      case kInstEmptyWidth:
221
0
        if (ip->empty() & ~Prog::EmptyFlags(context_, p))
222
0
          goto Next;
223
224
0
        if (!ip->last())
225
0
          Push(id+1, p);  // try the next when we're done
226
0
        id = ip->out();
227
0
        goto CheckAndLoop;
228
229
0
      case kInstNop:
230
0
        if (!ip->last())
231
0
          Push(id+1, p);  // try the next when we're done
232
0
        id = ip->out();
233
234
0
      CheckAndLoop:
235
        // Sanity check: id is the head of its list, which must
236
        // be the case if id-1 is the last of *its* list. :)
237
0
        DCHECK(id == 0 || prog_->inst(id-1)->last());
238
0
        if (ShouldVisit(id, p))
239
0
          goto Loop;
240
0
        break;
241
242
0
      case kInstMatch: {
243
0
        if (endmatch_ && p != end)
244
0
          goto Next;
245
246
        // We found a match.  If the caller doesn't care
247
        // where the match is, no point going further.
248
0
        if (nsubmatch_ == 0)
249
0
          return true;
250
251
        // Record best match so far.
252
        // Only need to check end point, because this entire
253
        // call is only considering one start position.
254
0
        matched = true;
255
0
        cap_[1] = p;
256
0
        if (submatch_[0].data() == NULL ||
257
0
            (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
258
0
          for (int i = 0; i < nsubmatch_; i++)
259
0
            submatch_[i] =
260
0
                StringPiece(cap_[2 * i],
261
0
                            static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
262
0
        }
263
264
        // If going for first match, we're done.
265
0
        if (!longest_)
266
0
          return true;
267
268
        // If we used the entire text, no longer match is possible.
269
0
        if (p == end)
270
0
          return true;
271
272
        // Otherwise, continue on in hope of a longer match.
273
        // Note the absence of the ShouldVisit() check here
274
        // due to execution remaining in the same list.
275
0
      Next:
276
0
        if (!ip->last()) {
277
0
          id++;
278
0
          goto Loop;
279
0
        }
280
0
        break;
281
0
      }
282
0
    }
283
0
  }
284
0
  return matched;
285
0
}
286
287
// Search text (within context) for prog_.
288
bool BitState::Search(const StringPiece& text, const StringPiece& context,
289
                      bool anchored, bool longest,
290
0
                      StringPiece* submatch, int nsubmatch) {
291
  // Search parameters.
292
0
  text_ = text;
293
0
  context_ = context;
294
0
  if (context_.data() == NULL)
295
0
    context_ = text;
296
0
  if (prog_->anchor_start() && BeginPtr(context_) != BeginPtr(text))
297
0
    return false;
298
0
  if (prog_->anchor_end() && EndPtr(context_) != EndPtr(text))
299
0
    return false;
300
0
  anchored_ = anchored || prog_->anchor_start();
301
0
  longest_ = longest || prog_->anchor_end();
302
0
  endmatch_ = prog_->anchor_end();
303
0
  submatch_ = submatch;
304
0
  nsubmatch_ = nsubmatch;
305
0
  for (int i = 0; i < nsubmatch_; i++)
306
0
    submatch_[i] = StringPiece();
307
308
  // Allocate scratch space.
309
0
  int nvisited = prog_->list_count() * static_cast<int>(text.size()+1);
310
0
  nvisited = (nvisited + kVisitedBits-1) / kVisitedBits;
311
0
  visited_ = PODArray<uint64_t>(nvisited);
312
0
  memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
313
314
0
  int ncap = 2*nsubmatch;
315
0
  if (ncap < 2)
316
0
    ncap = 2;
317
0
  cap_ = PODArray<const char*>(ncap);
318
0
  memset(cap_.data(), 0, ncap*sizeof cap_[0]);
319
320
  // When sizeof(Job) == 16, we start with a nice round 1KiB. :)
321
0
  job_ = PODArray<Job>(64);
322
323
  // Anchored search must start at text.begin().
324
0
  if (anchored_) {
325
0
    cap_[0] = text.data();
326
0
    return TrySearch(prog_->start(), text.data());
327
0
  }
328
329
  // Unanchored search, starting from each possible text position.
330
  // Notice that we have to try the empty string at the end of
331
  // the text, so the loop condition is p <= text.end(), not p < text.end().
332
  // This looks like it's quadratic in the size of the text,
333
  // but we are not clearing visited_ between calls to TrySearch,
334
  // so no work is duplicated and it ends up still being linear.
335
0
  const char* etext = text.data() + text.size();
336
0
  for (const char* p = text.data(); p <= etext; p++) {
337
    // Try to use prefix accel (e.g. memchr) to skip ahead.
338
0
    if (p < etext && prog_->can_prefix_accel()) {
339
0
      p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext - p));
340
0
      if (p == NULL)
341
0
        p = etext;
342
0
    }
343
344
0
    cap_[0] = p;
345
0
    if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
346
0
      return true;
347
    // Avoid invoking undefined behavior (arithmetic on a null pointer)
348
    // by simply not continuing the loop.
349
0
    if (p == NULL)
350
0
      break;
351
0
  }
352
0
  return false;
353
0
}
354
355
// Bit-state search.
356
bool Prog::SearchBitState(const StringPiece& text,
357
                          const StringPiece& context,
358
                          Anchor anchor,
359
                          MatchKind kind,
360
                          StringPiece* match,
361
0
                          int nmatch) {
362
  // If full match, we ask for an anchored longest match
363
  // and then check that match[0] == text.
364
  // So make sure match[0] exists.
365
0
  StringPiece sp0;
366
0
  if (kind == kFullMatch) {
367
0
    anchor = kAnchored;
368
0
    if (nmatch < 1) {
369
0
      match = &sp0;
370
0
      nmatch = 1;
371
0
    }
372
0
  }
373
374
  // Run the search.
375
0
  BitState b(this);
376
0
  bool anchored = anchor == kAnchored;
377
0
  bool longest = kind != kFirstMatch;
378
0
  if (!b.Search(text, context, anchored, longest, match, nmatch))
379
0
    return false;
380
0
  if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text))
381
0
    return false;
382
0
  return true;
383
0
}
384
385
}  // namespace re2