Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Tested by search_test.cc. |
6 | | // |
7 | | // Prog::SearchOnePass is an efficient implementation of |
8 | | // regular expression search with submatch tracking for |
9 | | // what I call "one-pass regular expressions". (An alternate |
10 | | // name might be "backtracking-free regular expressions".) |
11 | | // |
12 | | // One-pass regular expressions have the property that |
13 | | // at each input byte during an anchored match, there may be |
14 | | // multiple alternatives but only one can proceed for any |
15 | | // given input byte. |
16 | | // |
17 | | // For example, the regexp /x*yx*/ is one-pass: you read |
18 | | // x's until a y, then you read the y, then you keep reading x's. |
19 | | // At no point do you have to guess what to do or back up |
20 | | // and try a different guess. |
21 | | // |
22 | | // On the other hand, /x*x/ is not one-pass: when you're |
23 | | // looking at an input "x", it's not clear whether you should |
24 | | // use it to extend the x* or as the final x. |
25 | | // |
26 | | // More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. |
27 | | // /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. |
28 | | // |
29 | | // A simple intuition for identifying one-pass regular expressions |
30 | | // is that it's always immediately obvious when a repetition ends. |
31 | | // It must also be immediately obvious which branch of an | to take: |
32 | | // |
33 | | // /x(y|z)/ is one-pass, but /(xy|xz)/ is not. |
34 | | // |
35 | | // The NFA-based search in nfa.cc does some bookkeeping to |
36 | | // avoid the need for backtracking and its associated exponential blowup. |
37 | | // But if we have a one-pass regular expression, there is no |
38 | | // possibility of backtracking, so there is no need for the |
39 | | // extra bookkeeping. Hence, this code. |
40 | | // |
41 | | // On a one-pass regular expression, the NFA code in nfa.cc |
42 | | // runs at about 1/20 of the backtracking-based PCRE speed. |
43 | | // In contrast, the code in this file runs at about the same |
44 | | // speed as PCRE. |
45 | | // |
46 | | // One-pass regular expressions get used a lot when RE is |
47 | | // used for parsing simple strings, so it pays off to |
48 | | // notice them and handle them efficiently. |
49 | | // |
50 | | // See also Anne Brüggemann-Klein and Derick Wood, |
51 | | // "One-unambiguous regular languages", Information and Computation 142(2). |
52 | | |
53 | | #include <stdint.h> |
54 | | #include <string.h> |
55 | | #include <algorithm> |
56 | | #include <map> |
57 | | #include <string> |
58 | | #include <vector> |
59 | | |
60 | | #include "util/util.h" |
61 | | #include "util/logging.h" |
62 | | #include "util/strutil.h" |
63 | | #include "util/utf.h" |
64 | | #include "re2/pod_array.h" |
65 | | #include "re2/prog.h" |
66 | | #include "re2/sparse_set.h" |
67 | | #include "re2/stringpiece.h" |
68 | | |
69 | | // Silence "zero-sized array in struct/union" warning for OneState::action. |
70 | | #ifdef _MSC_VER |
71 | | #pragma warning(disable: 4200) |
72 | | #endif |
73 | | |
74 | | namespace re2 { |
75 | | |
76 | | static const bool ExtraDebug = false; |
77 | | |
78 | | // The key insight behind this implementation is that the |
79 | | // non-determinism in an NFA for a one-pass regular expression |
80 | | // is contained. To explain what that means, first a |
81 | | // refresher about what regular expression programs look like |
82 | | // and how the usual NFA execution runs. |
83 | | // |
84 | | // In a regular expression program, only the kInstByteRange |
85 | | // instruction processes an input byte c and moves on to the |
86 | | // next byte in the string (it does so if c is in the given range). |
87 | | // The kInstByteRange instructions correspond to literal characters |
88 | | // and character classes in the regular expression. |
89 | | // |
90 | | // The kInstAlt instructions are used as wiring to connect the |
91 | | // kInstByteRange instructions together in interesting ways when |
92 | | // implementing | + and *. |
93 | | // The kInstAlt instruction forks execution, like a goto that |
94 | | // jumps to ip->out() and ip->out1() in parallel. Each of the |
95 | | // resulting computation paths is called a thread. |
96 | | // |
97 | | // The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- |
98 | | // are interesting in their own right but like kInstAlt they don't |
99 | | // advance the input pointer. Only kInstByteRange does. |
100 | | // |
101 | | // The automaton execution in nfa.cc runs all the possible |
102 | | // threads of execution in lock-step over the input. To process |
103 | | // a particular byte, each thread gets run until it either dies |
104 | | // or finds a kInstByteRange instruction matching the byte. |
105 | | // If the latter happens, the thread stops just past the |
106 | | // kInstByteRange instruction (at ip->out()) and waits for |
107 | | // the other threads to finish processing the input byte. |
108 | | // Then, once all the threads have processed that input byte, |
109 | | // the whole process repeats. The kInstAlt state instruction |
110 | | // might create new threads during input processing, but no |
111 | | // matter what, all the threads stop after a kInstByteRange |
112 | | // and wait for the other threads to "catch up". |
113 | | // Running in lock step like this ensures that the NFA reads |
114 | | // the input string only once. |
115 | | // |
116 | | // Each thread maintains its own set of capture registers |
117 | | // (the string positions at which it executed the kInstCapture |
118 | | // instructions corresponding to capturing parentheses in the |
119 | | // regular expression). Repeated copying of the capture registers |
120 | | // is the main performance bottleneck in the NFA implementation. |
121 | | // |
122 | | // A regular expression program is "one-pass" if, no matter what |
123 | | // the input string, there is only one thread that makes it |
124 | | // past a kInstByteRange instruction at each input byte. This means |
125 | | // that there is in some sense only one active thread throughout |
126 | | // the execution. Other threads might be created during the |
127 | | // processing of an input byte, but they are ephemeral: only one |
128 | | // thread is left to start processing the next input byte. |
129 | | // This is what I meant above when I said the non-determinism |
130 | | // was "contained". |
131 | | // |
132 | | // To execute a one-pass regular expression program, we can build |
133 | | // a DFA (no non-determinism) that has at most as many states as |
134 | | // the NFA (compare this to the possibly exponential number of states |
135 | | // in the general case). Each state records, for each possible |
136 | | // input byte, the next state along with the conditions required |
137 | | // before entering that state -- empty-width flags that must be true |
138 | | // and capture operations that must be performed. It also records |
139 | | // whether a set of conditions required to finish a match at that |
140 | | // point in the input rather than process the next byte. |
141 | | |
142 | | // A state in the one-pass NFA - just an array of actions indexed |
143 | | // by the bytemap_[] of the next input byte. (The bytemap |
144 | | // maps next input bytes into equivalence classes, to reduce |
145 | | // the memory footprint.) |
146 | | struct OneState { |
147 | | uint32_t matchcond; // conditions to match right now. |
148 | | uint32_t action[]; |
149 | | }; |
150 | | |
151 | | // The uint32_t conditions in the action are a combination of |
152 | | // condition and capture bits and the next state. The bottom 16 bits |
153 | | // are the condition and capture bits, and the top 16 are the index of |
154 | | // the next state. |
155 | | // |
156 | | // Bits 0-5 are the empty-width flags from prog.h. |
157 | | // Bit 6 is kMatchWins, which means the match takes |
158 | | // priority over moving to next in a first-match search. |
159 | | // The remaining bits mark capture registers that should |
160 | | // be set to the current input position. The capture bits |
161 | | // start at index 2, since the search loop can take care of |
162 | | // cap[0], cap[1] (the overall match position). |
163 | | // That means we can handle up to 5 capturing parens: $1 through $4, plus $0. |
164 | | // No input position can satisfy both kEmptyWordBoundary |
165 | | // and kEmptyNonWordBoundary, so we can use that as a sentinel |
166 | | // instead of needing an extra bit. |
167 | | |
168 | | static const int kIndexShift = 16; // number of bits below index |
169 | | static const int kEmptyShift = 6; // number of empty flags in prog.h |
170 | | static const int kRealCapShift = kEmptyShift + 1; |
171 | | static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; |
172 | | |
173 | | // Parameters used to skip over cap[0], cap[1]. |
174 | | static const int kCapShift = kRealCapShift - 2; |
175 | | static const int kMaxCap = kRealMaxCap + 2; |
176 | | |
177 | | static const uint32_t kMatchWins = 1 << kEmptyShift; |
178 | | static const uint32_t kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; |
179 | | |
180 | | static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; |
181 | | |
182 | | // Check, at compile time, that prog.h agrees with math above. |
183 | | // This function is never called. |
184 | 0 | void OnePass_Checks() { |
185 | 0 | static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags, |
186 | 0 | "kEmptyShift disagrees with kEmptyAllFlags"); |
187 | | // kMaxCap counts pointers, kMaxOnePassCapture counts pairs. |
188 | 0 | static_assert(kMaxCap == Prog::kMaxOnePassCapture*2, |
189 | 0 | "kMaxCap disagrees with kMaxOnePassCapture"); |
190 | 0 | } |
191 | | |
192 | 12.3k | static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) { |
193 | 12.3k | uint32_t satisfied = Prog::EmptyFlags(context, p); |
194 | 12.3k | if (cond & kEmptyAllFlags & ~satisfied) |
195 | 11.7k | return false; |
196 | 611 | return true; |
197 | 12.3k | } |
198 | | |
199 | | // Apply the capture bits in cond, saving p to the appropriate |
200 | | // locations in cap[]. |
201 | | static void ApplyCaptures(uint32_t cond, const char* p, |
202 | 0 | const char** cap, int ncap) { |
203 | 0 | for (int i = 2; i < ncap; i++) |
204 | 0 | if (cond & (1 << kCapShift << i)) |
205 | 0 | cap[i] = p; |
206 | 0 | } |
207 | | |
208 | | // Computes the OneState* for the given nodeindex. |
209 | | static inline OneState* IndexToNode(uint8_t* nodes, int statesize, |
210 | 27.5M | int nodeindex) { |
211 | 27.5M | return reinterpret_cast<OneState*>(nodes + statesize*nodeindex); |
212 | 27.5M | } |
213 | | |
214 | | bool Prog::SearchOnePass(const StringPiece& text, |
215 | | const StringPiece& const_context, |
216 | | Anchor anchor, MatchKind kind, |
217 | 15.4k | StringPiece* match, int nmatch) { |
218 | 15.4k | if (anchor != kAnchored && kind != kFullMatch) { |
219 | 0 | LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; |
220 | 0 | return false; |
221 | 0 | } |
222 | | |
223 | | // Make sure we have at least cap[1], |
224 | | // because we use it to tell if we matched. |
225 | 15.4k | int ncap = 2*nmatch; |
226 | 15.4k | if (ncap < 2) |
227 | 15.4k | ncap = 2; |
228 | | |
229 | 15.4k | const char* cap[kMaxCap]; |
230 | 46.3k | for (int i = 0; i < ncap; i++) |
231 | 30.9k | cap[i] = NULL; |
232 | | |
233 | 15.4k | const char* matchcap[kMaxCap]; |
234 | 46.3k | for (int i = 0; i < ncap; i++) |
235 | 30.9k | matchcap[i] = NULL; |
236 | | |
237 | 15.4k | StringPiece context = const_context; |
238 | 15.4k | if (context.data() == NULL) |
239 | 0 | context = text; |
240 | 15.4k | if (anchor_start() && BeginPtr(context) != BeginPtr(text)) |
241 | 29 | return false; |
242 | 15.4k | if (anchor_end() && EndPtr(context) != EndPtr(text)) |
243 | 0 | return false; |
244 | 15.4k | if (anchor_end()) |
245 | 452 | kind = kFullMatch; |
246 | | |
247 | 15.4k | uint8_t* nodes = onepass_nodes_.data(); |
248 | 15.4k | int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); |
249 | | // start() is always mapped to the zeroth OneState. |
250 | 15.4k | OneState* state = IndexToNode(nodes, statesize, 0); |
251 | 15.4k | uint8_t* bytemap = bytemap_; |
252 | 15.4k | const char* bp = text.data(); |
253 | 15.4k | const char* ep = text.data() + text.size(); |
254 | 15.4k | const char* p; |
255 | 15.4k | bool matched = false; |
256 | 15.4k | matchcap[0] = bp; |
257 | 15.4k | cap[0] = bp; |
258 | 15.4k | uint32_t nextmatchcond = state->matchcond; |
259 | 60.8k | for (p = bp; p < ep; p++) { |
260 | 57.0k | int c = bytemap[*p & 0xFF]; |
261 | 57.0k | uint32_t matchcond = nextmatchcond; |
262 | 57.0k | uint32_t cond = state->action[c]; |
263 | | |
264 | | // Determine whether we can reach act->next. |
265 | | // If so, advance state and nextmatchcond. |
266 | 57.0k | if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { |
267 | 45.3k | uint32_t nextindex = cond >> kIndexShift; |
268 | 45.3k | state = IndexToNode(nodes, statesize, nextindex); |
269 | 45.3k | nextmatchcond = state->matchcond; |
270 | 45.3k | } else { |
271 | 11.6k | state = NULL; |
272 | 11.6k | nextmatchcond = kImpossible; |
273 | 11.6k | } |
274 | | |
275 | | // This code section is carefully tuned. |
276 | | // The goto sequence is about 10% faster than the |
277 | | // obvious rewrite as a large if statement in the |
278 | | // ASCIIMatchRE2 and DotMatchRE2 benchmarks. |
279 | | |
280 | | // Saving the match capture registers is expensive. |
281 | | // Is this intermediate match worth thinking about? |
282 | | |
283 | | // Not if we want a full match. |
284 | 57.0k | if (kind == kFullMatch) |
285 | 57.0k | goto skipmatch; |
286 | | |
287 | | // Not if it's impossible. |
288 | 0 | if (matchcond == kImpossible) |
289 | 0 | goto skipmatch; |
290 | | |
291 | | // Not if the possible match is beaten by the certain |
292 | | // match at the next byte. When this test is useless |
293 | | // (e.g., HTTPPartialMatchRE2) it slows the loop by |
294 | | // about 10%, but when it avoids work (e.g., DotMatchRE2), |
295 | | // it cuts the loop execution by about 45%. |
296 | 0 | if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) |
297 | 0 | goto skipmatch; |
298 | | |
299 | | // Finally, the match conditions must be satisfied. |
300 | 0 | if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { |
301 | 0 | for (int i = 2; i < 2*nmatch; i++) |
302 | 0 | matchcap[i] = cap[i]; |
303 | 0 | if (nmatch > 1 && (matchcond & kCapMask)) |
304 | 0 | ApplyCaptures(matchcond, p, matchcap, ncap); |
305 | 0 | matchcap[1] = p; |
306 | 0 | matched = true; |
307 | | |
308 | | // If we're in longest match mode, we have to keep |
309 | | // going and see if we find a longer match. |
310 | | // In first match mode, we can stop if the match |
311 | | // takes priority over the next state for this input byte. |
312 | | // That bit is per-input byte and thus in cond, not matchcond. |
313 | 0 | if (kind == kFirstMatch && (cond & kMatchWins)) |
314 | 0 | goto done; |
315 | 0 | } |
316 | | |
317 | 57.0k | skipmatch: |
318 | 57.0k | if (state == NULL) |
319 | 11.6k | goto done; |
320 | 45.3k | if ((cond & kCapMask) && nmatch > 1) |
321 | 0 | ApplyCaptures(cond, p, cap, ncap); |
322 | 45.3k | } |
323 | | |
324 | | // Look for match at end of input. |
325 | 3.76k | { |
326 | 3.76k | uint32_t matchcond = state->matchcond; |
327 | 3.76k | if (matchcond != kImpossible && |
328 | 3.76k | ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { |
329 | 3.61k | if (nmatch > 1 && (matchcond & kCapMask)) |
330 | 0 | ApplyCaptures(matchcond, p, cap, ncap); |
331 | 3.61k | for (int i = 2; i < ncap; i++) |
332 | 0 | matchcap[i] = cap[i]; |
333 | 3.61k | matchcap[1] = p; |
334 | 3.61k | matched = true; |
335 | 3.61k | } |
336 | 3.76k | } |
337 | | |
338 | 15.4k | done: |
339 | 15.4k | if (!matched) |
340 | 11.8k | return false; |
341 | 3.61k | for (int i = 0; i < nmatch; i++) |
342 | 0 | match[i] = |
343 | 0 | StringPiece(matchcap[2 * i], |
344 | 0 | static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i])); |
345 | 3.61k | return true; |
346 | 15.4k | } |
347 | | |
348 | | |
349 | | // Analysis to determine whether a given regexp program is one-pass. |
350 | | |
351 | | // If ip is not on workq, adds ip to work queue and returns true. |
352 | | // If ip is already on work queue, does nothing and returns false. |
353 | | // If ip is NULL, does nothing and returns true (pretends to add it). |
354 | | typedef SparseSet Instq; |
355 | 22.9M | static bool AddQ(Instq *q, int id) { |
356 | 22.9M | if (id == 0) |
357 | 19.7k | return true; |
358 | 22.9M | if (q->contains(id)) |
359 | 3.44k | return false; |
360 | 22.9M | q->insert(id); |
361 | 22.9M | return true; |
362 | 22.9M | } |
363 | | |
364 | | struct InstCond { |
365 | | int id; |
366 | | uint32_t cond; |
367 | | }; |
368 | | |
369 | | // Returns whether this is a one-pass program; that is, |
370 | | // returns whether it is safe to use SearchOnePass on this program. |
371 | | // These conditions must be true for any instruction ip: |
372 | | // |
373 | | // (1) for any other Inst nip, there is at most one input-free |
374 | | // path from ip to nip. |
375 | | // (2) there is at most one kInstByte instruction reachable from |
376 | | // ip that matches any particular byte c. |
377 | | // (3) there is at most one input-free path from ip to a kInstMatch |
378 | | // instruction. |
379 | | // |
380 | | // This is actually just a conservative approximation: it might |
381 | | // return false when the answer is true, when kInstEmptyWidth |
382 | | // instructions are involved. |
383 | | // Constructs and saves corresponding one-pass NFA on success. |
384 | 67.2k | bool Prog::IsOnePass() { |
385 | 67.2k | if (did_onepass_) |
386 | 0 | return onepass_nodes_.data() != NULL; |
387 | 67.2k | did_onepass_ = true; |
388 | | |
389 | 67.2k | if (start() == 0) // no match |
390 | 1.94k | return false; |
391 | | |
392 | | // Steal memory for the one-pass NFA from the overall DFA budget. |
393 | | // Willing to use at most 1/4 of the DFA budget (heuristic). |
394 | | // Limit max node count to 65000 as a conservative estimate to |
395 | | // avoid overflowing 16-bit node index in encoding. |
396 | 65.3k | int maxnodes = 2 + inst_count(kInstByteRange); |
397 | 65.3k | int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t); |
398 | 65.3k | if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) |
399 | 3.65k | return false; |
400 | | |
401 | | // Flood the graph starting at the start state, and check |
402 | | // that in each reachable state, each possible byte leads |
403 | | // to a unique next state. |
404 | 61.6k | int stacksize = inst_count(kInstCapture) + |
405 | 61.6k | inst_count(kInstEmptyWidth) + |
406 | 61.6k | inst_count(kInstNop) + 1; // + 1 for start inst |
407 | 61.6k | PODArray<InstCond> stack(stacksize); |
408 | | |
409 | 61.6k | int size = this->size(); |
410 | 61.6k | PODArray<int> nodebyid(size); // indexed by ip |
411 | 61.6k | memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]); |
412 | | |
413 | | // Originally, nodes was a uint8_t[maxnodes*statesize], but that was |
414 | | // unnecessarily optimistic: why allocate a large amount of memory |
415 | | // upfront for a large program when it is unlikely to be one-pass? |
416 | 61.6k | std::vector<uint8_t> nodes; |
417 | | |
418 | 61.6k | Instq tovisit(size), workq(size); |
419 | 61.6k | AddQ(&tovisit, start()); |
420 | 61.6k | nodebyid[start()] = 0; |
421 | 61.6k | int nalloc = 1; |
422 | 61.6k | nodes.insert(nodes.end(), statesize, 0); |
423 | 13.8M | for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { |
424 | 13.7M | int id = *it; |
425 | 13.7M | int nodeindex = nodebyid[id]; |
426 | 13.7M | OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); |
427 | | |
428 | | // Flood graph using manual stack, filling in actions as found. |
429 | | // Default is none. |
430 | 208M | for (int b = 0; b < bytemap_range_; b++) |
431 | 194M | node->action[b] = kImpossible; |
432 | 13.7M | node->matchcond = kImpossible; |
433 | | |
434 | 13.7M | workq.clear(); |
435 | 13.7M | bool matched = false; |
436 | 13.7M | int nstack = 0; |
437 | 13.7M | stack[nstack].id = id; |
438 | 13.7M | stack[nstack++].cond = 0; |
439 | 27.9M | while (nstack > 0) { |
440 | 14.2M | int id = stack[--nstack].id; |
441 | 14.2M | uint32_t cond = stack[nstack].cond; |
442 | | |
443 | 22.5M | Loop: |
444 | 22.5M | Prog::Inst* ip = inst(id); |
445 | 22.5M | switch (ip->opcode()) { |
446 | 0 | default: |
447 | 0 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
448 | 0 | break; |
449 | | |
450 | 1.20k | case kInstAltMatch: |
451 | | // TODO(rsc): Ignoring kInstAltMatch optimization. |
452 | | // Should implement it in this engine, but it's subtle. |
453 | 1.20k | DCHECK(!ip->last()); |
454 | | // If already on work queue, (1) is violated: bail out. |
455 | 1.20k | if (!AddQ(&workq, id+1)) |
456 | 0 | goto fail; |
457 | 1.20k | id = id+1; |
458 | 1.20k | goto Loop; |
459 | | |
460 | 17.6M | case kInstByteRange: { |
461 | 17.6M | int nextindex = nodebyid[ip->out()]; |
462 | 17.6M | if (nextindex == -1) { |
463 | 13.7M | if (nalloc >= maxnodes) { |
464 | 0 | if (ExtraDebug) |
465 | 0 | LOG(ERROR) << StringPrintf( |
466 | 0 | "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes); |
467 | 0 | goto fail; |
468 | 0 | } |
469 | 13.7M | nextindex = nalloc; |
470 | 13.7M | AddQ(&tovisit, ip->out()); |
471 | 13.7M | nodebyid[ip->out()] = nalloc; |
472 | 13.7M | nalloc++; |
473 | 13.7M | nodes.insert(nodes.end(), statesize, 0); |
474 | | // Update node because it might have been invalidated. |
475 | 13.7M | node = IndexToNode(nodes.data(), statesize, nodeindex); |
476 | 13.7M | } |
477 | 43.9M | for (int c = ip->lo(); c <= ip->hi(); c++) { |
478 | 26.3M | int b = bytemap_[c]; |
479 | | // Skip any bytes immediately after c that are also in b. |
480 | 80.8M | while (c < 256-1 && bytemap_[c+1] == b) |
481 | 54.4M | c++; |
482 | 26.3M | uint32_t act = node->action[b]; |
483 | 26.3M | uint32_t newact = (nextindex << kIndexShift) | cond; |
484 | 26.3M | if (matched) |
485 | 151k | newact |= kMatchWins; |
486 | 26.3M | if ((act & kImpossible) == kImpossible) { |
487 | 22.3M | node->action[b] = newact; |
488 | 22.3M | } else if (act != newact) { |
489 | 20.8k | if (ExtraDebug) |
490 | 0 | LOG(ERROR) << StringPrintf( |
491 | 0 | "Not OnePass: conflict on byte %#x at state %d", c, *it); |
492 | 20.8k | goto fail; |
493 | 20.8k | } |
494 | 26.3M | } |
495 | 17.5M | if (ip->foldcase()) { |
496 | 11.4M | Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a'; |
497 | 11.4M | Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a'; |
498 | 15.8M | for (int c = lo; c <= hi; c++) { |
499 | 4.30M | int b = bytemap_[c]; |
500 | | // Skip any bytes immediately after c that are also in b. |
501 | 5.89M | while (c < 256-1 && bytemap_[c+1] == b) |
502 | 1.58M | c++; |
503 | 4.30M | uint32_t act = node->action[b]; |
504 | 4.30M | uint32_t newact = (nextindex << kIndexShift) | cond; |
505 | 4.30M | if (matched) |
506 | 40.8k | newact |= kMatchWins; |
507 | 4.30M | if ((act & kImpossible) == kImpossible) { |
508 | 13.2k | node->action[b] = newact; |
509 | 4.29M | } else if (act != newact) { |
510 | 25 | if (ExtraDebug) |
511 | 0 | LOG(ERROR) << StringPrintf( |
512 | 0 | "Not OnePass: conflict on byte %#x at state %d", c, *it); |
513 | 25 | goto fail; |
514 | 25 | } |
515 | 4.30M | } |
516 | 11.4M | } |
517 | | |
518 | 17.5M | if (ip->last()) |
519 | 14.0M | break; |
520 | | // If already on work queue, (1) is violated: bail out. |
521 | 3.53M | if (!AddQ(&workq, id+1)) |
522 | 50 | goto fail; |
523 | 3.53M | id = id+1; |
524 | 3.53M | goto Loop; |
525 | 3.53M | } |
526 | | |
527 | 2.85M | case kInstCapture: |
528 | 3.31M | case kInstEmptyWidth: |
529 | 4.78M | case kInstNop: |
530 | 4.78M | if (!ip->last()) { |
531 | | // If already on work queue, (1) is violated: bail out. |
532 | 823k | if (!AddQ(&workq, id+1)) |
533 | 41 | goto fail; |
534 | 823k | stack[nstack].id = id+1; |
535 | 823k | stack[nstack++].cond = cond; |
536 | 823k | } |
537 | | |
538 | 4.78M | if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap) |
539 | 2.18M | cond |= (1 << kCapShift) << ip->cap(); |
540 | 4.78M | if (ip->opcode() == kInstEmptyWidth) |
541 | 458k | cond |= ip->empty(); |
542 | | |
543 | | // kInstCapture and kInstNop always proceed to ip->out(). |
544 | | // kInstEmptyWidth only sometimes proceeds to ip->out(), |
545 | | // but as a conservative approximation we assume it always does. |
546 | | // We could be a little more precise by looking at what c |
547 | | // is, but that seems like overkill. |
548 | | |
549 | | // If already on work queue, (1) is violated: bail out. |
550 | 4.78M | if (!AddQ(&workq, ip->out())) { |
551 | 3.35k | if (ExtraDebug) |
552 | 0 | LOG(ERROR) << StringPrintf( |
553 | 0 | "Not OnePass: multiple paths %d -> %d", *it, ip->out()); |
554 | 3.35k | goto fail; |
555 | 3.35k | } |
556 | 4.78M | id = ip->out(); |
557 | 4.78M | goto Loop; |
558 | | |
559 | 144k | case kInstMatch: |
560 | 144k | if (matched) { |
561 | | // (3) is violated |
562 | 31 | if (ExtraDebug) |
563 | 0 | LOG(ERROR) << StringPrintf( |
564 | 0 | "Not OnePass: multiple matches from %d", *it); |
565 | 31 | goto fail; |
566 | 31 | } |
567 | 143k | matched = true; |
568 | 143k | node->matchcond = cond; |
569 | | |
570 | 143k | if (ip->last()) |
571 | 116k | break; |
572 | | // If already on work queue, (1) is violated: bail out. |
573 | 27.9k | if (!AddQ(&workq, id+1)) |
574 | 0 | goto fail; |
575 | 27.9k | id = id+1; |
576 | 27.9k | goto Loop; |
577 | | |
578 | 19.7k | case kInstFail: |
579 | 19.7k | break; |
580 | 22.5M | } |
581 | 22.5M | } |
582 | 13.7M | } |
583 | | |
584 | 37.3k | if (ExtraDebug) { // For debugging, dump one-pass NFA to LOG(ERROR). |
585 | 0 | LOG(ERROR) << "bytemap:\n" << DumpByteMap(); |
586 | 0 | LOG(ERROR) << "prog:\n" << Dump(); |
587 | |
|
588 | 0 | std::map<int, int> idmap; |
589 | 0 | for (int i = 0; i < size; i++) |
590 | 0 | if (nodebyid[i] != -1) |
591 | 0 | idmap[nodebyid[i]] = i; |
592 | |
|
593 | 0 | std::string dump; |
594 | 0 | for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { |
595 | 0 | int id = *it; |
596 | 0 | int nodeindex = nodebyid[id]; |
597 | 0 | if (nodeindex == -1) |
598 | 0 | continue; |
599 | 0 | OneState* node = IndexToNode(nodes.data(), statesize, nodeindex); |
600 | 0 | dump += StringPrintf("node %d id=%d: matchcond=%#x\n", |
601 | 0 | nodeindex, id, node->matchcond); |
602 | 0 | for (int i = 0; i < bytemap_range_; i++) { |
603 | 0 | if ((node->action[i] & kImpossible) == kImpossible) |
604 | 0 | continue; |
605 | 0 | dump += StringPrintf(" %d cond %#x -> %d id=%d\n", |
606 | 0 | i, node->action[i] & 0xFFFF, |
607 | 0 | node->action[i] >> kIndexShift, |
608 | 0 | idmap[node->action[i] >> kIndexShift]); |
609 | 0 | } |
610 | 0 | } |
611 | 0 | LOG(ERROR) << "nodes:\n" << dump; |
612 | 0 | } |
613 | | |
614 | 37.3k | dfa_mem_ -= nalloc*statesize; |
615 | 37.3k | onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize); |
616 | 37.3k | memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize); |
617 | 37.3k | return true; |
618 | | |
619 | 24.3k | fail: |
620 | 24.3k | return false; |
621 | 61.6k | } |
622 | | |
623 | | } // namespace re2 |