Coverage Report

Created: 2024-06-09 06:56

/src/re2/re2/onepass.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Tested by search_test.cc.
6
//
7
// Prog::SearchOnePass is an efficient implementation of
8
// regular expression search with submatch tracking for
9
// what I call "one-pass regular expressions".  (An alternate
10
// name might be "backtracking-free regular expressions".)
11
//
12
// One-pass regular expressions have the property that
13
// at each input byte during an anchored match, there may be
14
// multiple alternatives but only one can proceed for any
15
// given input byte.
16
//
17
// For example, the regexp /x*yx*/ is one-pass: you read
18
// x's until a y, then you read the y, then you keep reading x's.
19
// At no point do you have to guess what to do or back up
20
// and try a different guess.
21
//
22
// On the other hand, /x*x/ is not one-pass: when you're
23
// looking at an input "x", it's not clear whether you should
24
// use it to extend the x* or as the final x.
25
//
26
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
27
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
28
//
29
// A simple intuition for identifying one-pass regular expressions
30
// is that it's always immediately obvious when a repetition ends.
31
// It must also be immediately obvious which branch of an | to take:
32
//
33
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
34
//
35
// The NFA-based search in nfa.cc does some bookkeeping to
36
// avoid the need for backtracking and its associated exponential blowup.
37
// But if we have a one-pass regular expression, there is no
38
// possibility of backtracking, so there is no need for the
39
// extra bookkeeping.  Hence, this code.
40
//
41
// On a one-pass regular expression, the NFA code in nfa.cc
42
// runs at about 1/20 of the backtracking-based PCRE speed.
43
// In contrast, the code in this file runs at about the same
44
// speed as PCRE.
45
//
46
// One-pass regular expressions get used a lot when RE is
47
// used for parsing simple strings, so it pays off to
48
// notice them and handle them efficiently.
49
//
50
// See also Anne Brüggemann-Klein and Derick Wood,
51
// "One-unambiguous regular languages", Information and Computation 142(2).
52
53
#include <stdint.h>
54
#include <string.h>
55
#include <algorithm>
56
#include <map>
57
#include <string>
58
#include <vector>
59
60
#include "util/util.h"
61
#include "util/logging.h"
62
#include "util/strutil.h"
63
#include "util/utf.h"
64
#include "re2/pod_array.h"
65
#include "re2/prog.h"
66
#include "re2/sparse_set.h"
67
#include "re2/stringpiece.h"
68
69
// Silence "zero-sized array in struct/union" warning for OneState::action.
70
#ifdef _MSC_VER
71
#pragma warning(disable: 4200)
72
#endif
73
74
namespace re2 {
75
76
static const bool ExtraDebug = false;
77
78
// The key insight behind this implementation is that the
79
// non-determinism in an NFA for a one-pass regular expression
80
// is contained.  To explain what that means, first a
81
// refresher about what regular expression programs look like
82
// and how the usual NFA execution runs.
83
//
84
// In a regular expression program, only the kInstByteRange
85
// instruction processes an input byte c and moves on to the
86
// next byte in the string (it does so if c is in the given range).
87
// The kInstByteRange instructions correspond to literal characters
88
// and character classes in the regular expression.
89
//
90
// The kInstAlt instructions are used as wiring to connect the
91
// kInstByteRange instructions together in interesting ways when
92
// implementing | + and *.
93
// The kInstAlt instruction forks execution, like a goto that
94
// jumps to ip->out() and ip->out1() in parallel.  Each of the
95
// resulting computation paths is called a thread.
96
//
97
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
98
// are interesting in their own right but like kInstAlt they don't
99
// advance the input pointer.  Only kInstByteRange does.
100
//
101
// The automaton execution in nfa.cc runs all the possible
102
// threads of execution in lock-step over the input.  To process
103
// a particular byte, each thread gets run until it either dies
104
// or finds a kInstByteRange instruction matching the byte.
105
// If the latter happens, the thread stops just past the
106
// kInstByteRange instruction (at ip->out()) and waits for
107
// the other threads to finish processing the input byte.
108
// Then, once all the threads have processed that input byte,
109
// the whole process repeats.  The kInstAlt state instruction
110
// might create new threads during input processing, but no
111
// matter what, all the threads stop after a kInstByteRange
112
// and wait for the other threads to "catch up".
113
// Running in lock step like this ensures that the NFA reads
114
// the input string only once.
115
//
116
// Each thread maintains its own set of capture registers
117
// (the string positions at which it executed the kInstCapture
118
// instructions corresponding to capturing parentheses in the
119
// regular expression).  Repeated copying of the capture registers
120
// is the main performance bottleneck in the NFA implementation.
121
//
122
// A regular expression program is "one-pass" if, no matter what
123
// the input string, there is only one thread that makes it
124
// past a kInstByteRange instruction at each input byte.  This means
125
// that there is in some sense only one active thread throughout
126
// the execution.  Other threads might be created during the
127
// processing of an input byte, but they are ephemeral: only one
128
// thread is left to start processing the next input byte.
129
// This is what I meant above when I said the non-determinism
130
// was "contained".
131
//
132
// To execute a one-pass regular expression program, we can build
133
// a DFA (no non-determinism) that has at most as many states as
134
// the NFA (compare this to the possibly exponential number of states
135
// in the general case).  Each state records, for each possible
136
// input byte, the next state along with the conditions required
137
// before entering that state -- empty-width flags that must be true
138
// and capture operations that must be performed.  It also records
139
// whether a set of conditions required to finish a match at that
140
// point in the input rather than process the next byte.
141
142
// A state in the one-pass NFA - just an array of actions indexed
143
// by the bytemap_[] of the next input byte.  (The bytemap
144
// maps next input bytes into equivalence classes, to reduce
145
// the memory footprint.)
146
struct OneState {
147
  uint32_t matchcond;   // conditions to match right now.
148
  uint32_t action[];
149
};
150
151
// The uint32_t conditions in the action are a combination of
152
// condition and capture bits and the next state.  The bottom 16 bits
153
// are the condition and capture bits, and the top 16 are the index of
154
// the next state.
155
//
156
// Bits 0-5 are the empty-width flags from prog.h.
157
// Bit 6 is kMatchWins, which means the match takes
158
// priority over moving to next in a first-match search.
159
// The remaining bits mark capture registers that should
160
// be set to the current input position.  The capture bits
161
// start at index 2, since the search loop can take care of
162
// cap[0], cap[1] (the overall match position).
163
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
164
// No input position can satisfy both kEmptyWordBoundary
165
// and kEmptyNonWordBoundary, so we can use that as a sentinel
166
// instead of needing an extra bit.
167
168
static const int    kIndexShift   = 16;  // number of bits below index
169
static const int    kEmptyShift   = 6;   // number of empty flags in prog.h
170
static const int    kRealCapShift = kEmptyShift + 1;
171
static const int    kRealMaxCap   = (kIndexShift - kRealCapShift) / 2 * 2;
172
173
// Parameters used to skip over cap[0], cap[1].
174
static const int    kCapShift     = kRealCapShift - 2;
175
static const int    kMaxCap       = kRealMaxCap + 2;
176
177
static const uint32_t kMatchWins  = 1 << kEmptyShift;
178
static const uint32_t kCapMask    = ((1 << kRealMaxCap) - 1) << kRealCapShift;
179
180
static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
181
182
// Check, at compile time, that prog.h agrees with math above.
183
// This function is never called.
184
0
void OnePass_Checks() {
185
0
  static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,
186
0
                "kEmptyShift disagrees with kEmptyAllFlags");
187
  // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
188
0
  static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,
189
0
                "kMaxCap disagrees with kMaxOnePassCapture");
190
0
}
191
192
12.3k
static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) {
193
12.3k
  uint32_t satisfied = Prog::EmptyFlags(context, p);
194
12.3k
  if (cond & kEmptyAllFlags & ~satisfied)
195
11.7k
    return false;
196
611
  return true;
197
12.3k
}
198
199
// Apply the capture bits in cond, saving p to the appropriate
200
// locations in cap[].
201
static void ApplyCaptures(uint32_t cond, const char* p,
202
0
                          const char** cap, int ncap) {
203
0
  for (int i = 2; i < ncap; i++)
204
0
    if (cond & (1 << kCapShift << i))
205
0
      cap[i] = p;
206
0
}
207
208
// Computes the OneState* for the given nodeindex.
209
static inline OneState* IndexToNode(uint8_t* nodes, int statesize,
210
27.5M
                                    int nodeindex) {
211
27.5M
  return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);
212
27.5M
}
213
214
bool Prog::SearchOnePass(const StringPiece& text,
215
                         const StringPiece& const_context,
216
                         Anchor anchor, MatchKind kind,
217
15.4k
                         StringPiece* match, int nmatch) {
218
15.4k
  if (anchor != kAnchored && kind != kFullMatch) {
219
0
    LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
220
0
    return false;
221
0
  }
222
223
  // Make sure we have at least cap[1],
224
  // because we use it to tell if we matched.
225
15.4k
  int ncap = 2*nmatch;
226
15.4k
  if (ncap < 2)
227
15.4k
    ncap = 2;
228
229
15.4k
  const char* cap[kMaxCap];
230
46.3k
  for (int i = 0; i < ncap; i++)
231
30.9k
    cap[i] = NULL;
232
233
15.4k
  const char* matchcap[kMaxCap];
234
46.3k
  for (int i = 0; i < ncap; i++)
235
30.9k
    matchcap[i] = NULL;
236
237
15.4k
  StringPiece context = const_context;
238
15.4k
  if (context.data() == NULL)
239
0
    context = text;
240
15.4k
  if (anchor_start() && BeginPtr(context) != BeginPtr(text))
241
29
    return false;
242
15.4k
  if (anchor_end() && EndPtr(context) != EndPtr(text))
243
0
    return false;
244
15.4k
  if (anchor_end())
245
452
    kind = kFullMatch;
246
247
15.4k
  uint8_t* nodes = onepass_nodes_.data();
248
15.4k
  int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
249
  // start() is always mapped to the zeroth OneState.
250
15.4k
  OneState* state = IndexToNode(nodes, statesize, 0);
251
15.4k
  uint8_t* bytemap = bytemap_;
252
15.4k
  const char* bp = text.data();
253
15.4k
  const char* ep = text.data() + text.size();
254
15.4k
  const char* p;
255
15.4k
  bool matched = false;
256
15.4k
  matchcap[0] = bp;
257
15.4k
  cap[0] = bp;
258
15.4k
  uint32_t nextmatchcond = state->matchcond;
259
60.8k
  for (p = bp; p < ep; p++) {
260
57.0k
    int c = bytemap[*p & 0xFF];
261
57.0k
    uint32_t matchcond = nextmatchcond;
262
57.0k
    uint32_t cond = state->action[c];
263
264
    // Determine whether we can reach act->next.
265
    // If so, advance state and nextmatchcond.
266
57.0k
    if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
267
45.3k
      uint32_t nextindex = cond >> kIndexShift;
268
45.3k
      state = IndexToNode(nodes, statesize, nextindex);
269
45.3k
      nextmatchcond = state->matchcond;
270
45.3k
    } else {
271
11.6k
      state = NULL;
272
11.6k
      nextmatchcond = kImpossible;
273
11.6k
    }
274
275
    // This code section is carefully tuned.
276
    // The goto sequence is about 10% faster than the
277
    // obvious rewrite as a large if statement in the
278
    // ASCIIMatchRE2 and DotMatchRE2 benchmarks.
279
280
    // Saving the match capture registers is expensive.
281
    // Is this intermediate match worth thinking about?
282
283
    // Not if we want a full match.
284
57.0k
    if (kind == kFullMatch)
285
57.0k
      goto skipmatch;
286
287
    // Not if it's impossible.
288
0
    if (matchcond == kImpossible)
289
0
      goto skipmatch;
290
291
    // Not if the possible match is beaten by the certain
292
    // match at the next byte.  When this test is useless
293
    // (e.g., HTTPPartialMatchRE2) it slows the loop by
294
    // about 10%, but when it avoids work (e.g., DotMatchRE2),
295
    // it cuts the loop execution by about 45%.
296
0
    if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
297
0
      goto skipmatch;
298
299
    // Finally, the match conditions must be satisfied.
300
0
    if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
301
0
      for (int i = 2; i < 2*nmatch; i++)
302
0
        matchcap[i] = cap[i];
303
0
      if (nmatch > 1 && (matchcond & kCapMask))
304
0
        ApplyCaptures(matchcond, p, matchcap, ncap);
305
0
      matchcap[1] = p;
306
0
      matched = true;
307
308
      // If we're in longest match mode, we have to keep
309
      // going and see if we find a longer match.
310
      // In first match mode, we can stop if the match
311
      // takes priority over the next state for this input byte.
312
      // That bit is per-input byte and thus in cond, not matchcond.
313
0
      if (kind == kFirstMatch && (cond & kMatchWins))
314
0
        goto done;
315
0
    }
316
317
57.0k
  skipmatch:
318
57.0k
    if (state == NULL)
319
11.6k
      goto done;
320
45.3k
    if ((cond & kCapMask) && nmatch > 1)
321
0
      ApplyCaptures(cond, p, cap, ncap);
322
45.3k
  }
323
324
  // Look for match at end of input.
325
3.76k
  {
326
3.76k
    uint32_t matchcond = state->matchcond;
327
3.76k
    if (matchcond != kImpossible &&
328
3.76k
        ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
329
3.61k
      if (nmatch > 1 && (matchcond & kCapMask))
330
0
        ApplyCaptures(matchcond, p, cap, ncap);
331
3.61k
      for (int i = 2; i < ncap; i++)
332
0
        matchcap[i] = cap[i];
333
3.61k
      matchcap[1] = p;
334
3.61k
      matched = true;
335
3.61k
    }
336
3.76k
  }
337
338
15.4k
done:
339
15.4k
  if (!matched)
340
11.8k
    return false;
341
3.61k
  for (int i = 0; i < nmatch; i++)
342
0
    match[i] =
343
0
        StringPiece(matchcap[2 * i],
344
0
                    static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));
345
3.61k
  return true;
346
15.4k
}
347
348
349
// Analysis to determine whether a given regexp program is one-pass.
350
351
// If ip is not on workq, adds ip to work queue and returns true.
352
// If ip is already on work queue, does nothing and returns false.
353
// If ip is NULL, does nothing and returns true (pretends to add it).
354
typedef SparseSet Instq;
355
22.9M
static bool AddQ(Instq *q, int id) {
356
22.9M
  if (id == 0)
357
19.7k
    return true;
358
22.9M
  if (q->contains(id))
359
3.44k
    return false;
360
22.9M
  q->insert(id);
361
22.9M
  return true;
362
22.9M
}
363
364
struct InstCond {
365
  int id;
366
  uint32_t cond;
367
};
368
369
// Returns whether this is a one-pass program; that is,
370
// returns whether it is safe to use SearchOnePass on this program.
371
// These conditions must be true for any instruction ip:
372
//
373
//   (1) for any other Inst nip, there is at most one input-free
374
//       path from ip to nip.
375
//   (2) there is at most one kInstByte instruction reachable from
376
//       ip that matches any particular byte c.
377
//   (3) there is at most one input-free path from ip to a kInstMatch
378
//       instruction.
379
//
380
// This is actually just a conservative approximation: it might
381
// return false when the answer is true, when kInstEmptyWidth
382
// instructions are involved.
383
// Constructs and saves corresponding one-pass NFA on success.
384
67.2k
bool Prog::IsOnePass() {
385
67.2k
  if (did_onepass_)
386
0
    return onepass_nodes_.data() != NULL;
387
67.2k
  did_onepass_ = true;
388
389
67.2k
  if (start() == 0)  // no match
390
1.94k
    return false;
391
392
  // Steal memory for the one-pass NFA from the overall DFA budget.
393
  // Willing to use at most 1/4 of the DFA budget (heuristic).
394
  // Limit max node count to 65000 as a conservative estimate to
395
  // avoid overflowing 16-bit node index in encoding.
396
65.3k
  int maxnodes = 2 + inst_count(kInstByteRange);
397
65.3k
  int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);
398
65.3k
  if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
399
3.65k
    return false;
400
401
  // Flood the graph starting at the start state, and check
402
  // that in each reachable state, each possible byte leads
403
  // to a unique next state.
404
61.6k
  int stacksize = inst_count(kInstCapture) +
405
61.6k
                  inst_count(kInstEmptyWidth) +
406
61.6k
                  inst_count(kInstNop) + 1;  // + 1 for start inst
407
61.6k
  PODArray<InstCond> stack(stacksize);
408
409
61.6k
  int size = this->size();
410
61.6k
  PODArray<int> nodebyid(size);  // indexed by ip
411
61.6k
  memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]);
412
413
  // Originally, nodes was a uint8_t[maxnodes*statesize], but that was
414
  // unnecessarily optimistic: why allocate a large amount of memory
415
  // upfront for a large program when it is unlikely to be one-pass?
416
61.6k
  std::vector<uint8_t> nodes;
417
418
61.6k
  Instq tovisit(size), workq(size);
419
61.6k
  AddQ(&tovisit, start());
420
61.6k
  nodebyid[start()] = 0;
421
61.6k
  int nalloc = 1;
422
61.6k
  nodes.insert(nodes.end(), statesize, 0);
423
13.8M
  for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
424
13.7M
    int id = *it;
425
13.7M
    int nodeindex = nodebyid[id];
426
13.7M
    OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
427
428
    // Flood graph using manual stack, filling in actions as found.
429
    // Default is none.
430
208M
    for (int b = 0; b < bytemap_range_; b++)
431
194M
      node->action[b] = kImpossible;
432
13.7M
    node->matchcond = kImpossible;
433
434
13.7M
    workq.clear();
435
13.7M
    bool matched = false;
436
13.7M
    int nstack = 0;
437
13.7M
    stack[nstack].id = id;
438
13.7M
    stack[nstack++].cond = 0;
439
27.9M
    while (nstack > 0) {
440
14.2M
      int id = stack[--nstack].id;
441
14.2M
      uint32_t cond = stack[nstack].cond;
442
443
22.5M
    Loop:
444
22.5M
      Prog::Inst* ip = inst(id);
445
22.5M
      switch (ip->opcode()) {
446
0
        default:
447
0
          LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
448
0
          break;
449
450
1.20k
        case kInstAltMatch:
451
          // TODO(rsc): Ignoring kInstAltMatch optimization.
452
          // Should implement it in this engine, but it's subtle.
453
1.20k
          DCHECK(!ip->last());
454
          // If already on work queue, (1) is violated: bail out.
455
1.20k
          if (!AddQ(&workq, id+1))
456
0
            goto fail;
457
1.20k
          id = id+1;
458
1.20k
          goto Loop;
459
460
17.6M
        case kInstByteRange: {
461
17.6M
          int nextindex = nodebyid[ip->out()];
462
17.6M
          if (nextindex == -1) {
463
13.7M
            if (nalloc >= maxnodes) {
464
0
              if (ExtraDebug)
465
0
                LOG(ERROR) << StringPrintf(
466
0
                    "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);
467
0
              goto fail;
468
0
            }
469
13.7M
            nextindex = nalloc;
470
13.7M
            AddQ(&tovisit, ip->out());
471
13.7M
            nodebyid[ip->out()] = nalloc;
472
13.7M
            nalloc++;
473
13.7M
            nodes.insert(nodes.end(), statesize, 0);
474
            // Update node because it might have been invalidated.
475
13.7M
            node = IndexToNode(nodes.data(), statesize, nodeindex);
476
13.7M
          }
477
43.9M
          for (int c = ip->lo(); c <= ip->hi(); c++) {
478
26.3M
            int b = bytemap_[c];
479
            // Skip any bytes immediately after c that are also in b.
480
80.8M
            while (c < 256-1 && bytemap_[c+1] == b)
481
54.4M
              c++;
482
26.3M
            uint32_t act = node->action[b];
483
26.3M
            uint32_t newact = (nextindex << kIndexShift) | cond;
484
26.3M
            if (matched)
485
151k
              newact |= kMatchWins;
486
26.3M
            if ((act & kImpossible) == kImpossible) {
487
22.3M
              node->action[b] = newact;
488
22.3M
            } else if (act != newact) {
489
20.8k
              if (ExtraDebug)
490
0
                LOG(ERROR) << StringPrintf(
491
0
                    "Not OnePass: conflict on byte %#x at state %d", c, *it);
492
20.8k
              goto fail;
493
20.8k
            }
494
26.3M
          }
495
17.5M
          if (ip->foldcase()) {
496
11.4M
            Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';
497
11.4M
            Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';
498
15.8M
            for (int c = lo; c <= hi; c++) {
499
4.30M
              int b = bytemap_[c];
500
              // Skip any bytes immediately after c that are also in b.
501
5.89M
              while (c < 256-1 && bytemap_[c+1] == b)
502
1.58M
                c++;
503
4.30M
              uint32_t act = node->action[b];
504
4.30M
              uint32_t newact = (nextindex << kIndexShift) | cond;
505
4.30M
              if (matched)
506
40.8k
                newact |= kMatchWins;
507
4.30M
              if ((act & kImpossible) == kImpossible) {
508
13.2k
                node->action[b] = newact;
509
4.29M
              } else if (act != newact) {
510
25
                if (ExtraDebug)
511
0
                  LOG(ERROR) << StringPrintf(
512
0
                      "Not OnePass: conflict on byte %#x at state %d", c, *it);
513
25
                goto fail;
514
25
              }
515
4.30M
            }
516
11.4M
          }
517
518
17.5M
          if (ip->last())
519
14.0M
            break;
520
          // If already on work queue, (1) is violated: bail out.
521
3.53M
          if (!AddQ(&workq, id+1))
522
50
            goto fail;
523
3.53M
          id = id+1;
524
3.53M
          goto Loop;
525
3.53M
        }
526
527
2.85M
        case kInstCapture:
528
3.31M
        case kInstEmptyWidth:
529
4.78M
        case kInstNop:
530
4.78M
          if (!ip->last()) {
531
            // If already on work queue, (1) is violated: bail out.
532
823k
            if (!AddQ(&workq, id+1))
533
41
              goto fail;
534
823k
            stack[nstack].id = id+1;
535
823k
            stack[nstack++].cond = cond;
536
823k
          }
537
538
4.78M
          if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)
539
2.18M
            cond |= (1 << kCapShift) << ip->cap();
540
4.78M
          if (ip->opcode() == kInstEmptyWidth)
541
458k
            cond |= ip->empty();
542
543
          // kInstCapture and kInstNop always proceed to ip->out().
544
          // kInstEmptyWidth only sometimes proceeds to ip->out(),
545
          // but as a conservative approximation we assume it always does.
546
          // We could be a little more precise by looking at what c
547
          // is, but that seems like overkill.
548
549
          // If already on work queue, (1) is violated: bail out.
550
4.78M
          if (!AddQ(&workq, ip->out())) {
551
3.35k
            if (ExtraDebug)
552
0
              LOG(ERROR) << StringPrintf(
553
0
                  "Not OnePass: multiple paths %d -> %d", *it, ip->out());
554
3.35k
            goto fail;
555
3.35k
          }
556
4.78M
          id = ip->out();
557
4.78M
          goto Loop;
558
559
144k
        case kInstMatch:
560
144k
          if (matched) {
561
            // (3) is violated
562
31
            if (ExtraDebug)
563
0
              LOG(ERROR) << StringPrintf(
564
0
                  "Not OnePass: multiple matches from %d", *it);
565
31
            goto fail;
566
31
          }
567
143k
          matched = true;
568
143k
          node->matchcond = cond;
569
570
143k
          if (ip->last())
571
116k
            break;
572
          // If already on work queue, (1) is violated: bail out.
573
27.9k
          if (!AddQ(&workq, id+1))
574
0
            goto fail;
575
27.9k
          id = id+1;
576
27.9k
          goto Loop;
577
578
19.7k
        case kInstFail:
579
19.7k
          break;
580
22.5M
      }
581
22.5M
    }
582
13.7M
  }
583
584
37.3k
  if (ExtraDebug) {  // For debugging, dump one-pass NFA to LOG(ERROR).
585
0
    LOG(ERROR) << "bytemap:\n" << DumpByteMap();
586
0
    LOG(ERROR) << "prog:\n" << Dump();
587
588
0
    std::map<int, int> idmap;
589
0
    for (int i = 0; i < size; i++)
590
0
      if (nodebyid[i] != -1)
591
0
        idmap[nodebyid[i]] = i;
592
593
0
    std::string dump;
594
0
    for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
595
0
      int id = *it;
596
0
      int nodeindex = nodebyid[id];
597
0
      if (nodeindex == -1)
598
0
        continue;
599
0
      OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);
600
0
      dump += StringPrintf("node %d id=%d: matchcond=%#x\n",
601
0
                           nodeindex, id, node->matchcond);
602
0
      for (int i = 0; i < bytemap_range_; i++) {
603
0
        if ((node->action[i] & kImpossible) == kImpossible)
604
0
          continue;
605
0
        dump += StringPrintf("  %d cond %#x -> %d id=%d\n",
606
0
                             i, node->action[i] & 0xFFFF,
607
0
                             node->action[i] >> kIndexShift,
608
0
                             idmap[node->action[i] >> kIndexShift]);
609
0
      }
610
0
    }
611
0
    LOG(ERROR) << "nodes:\n" << dump;
612
0
  }
613
614
37.3k
  dfa_mem_ -= nalloc*statesize;
615
37.3k
  onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);
616
37.3k
  memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);
617
37.3k
  return true;
618
619
24.3k
fail:
620
24.3k
  return false;
621
61.6k
}
622
623
}  // namespace re2