Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Tested by search_test.cc. |
6 | | // |
7 | | // Prog::SearchNFA, an NFA search. |
8 | | // This is an actual NFA like the theorists talk about, |
9 | | // not the pseudo-NFA found in backtracking regexp implementations. |
10 | | // |
11 | | // IMPLEMENTATION |
12 | | // |
13 | | // This algorithm is a variant of one that appeared in Rob Pike's sam editor, |
14 | | // which is a variant of the one described in Thompson's 1968 CACM paper. |
15 | | // See http://swtch.com/~rsc/regexp/ for various history. The main feature |
16 | | // over the DFA implementation is that it tracks submatch boundaries. |
17 | | // |
18 | | // When the choice of submatch boundaries is ambiguous, this particular |
19 | | // implementation makes the same choices that traditional backtracking |
20 | | // implementations (in particular, Perl and PCRE) do. |
21 | | // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential |
22 | | // time in the length of the input. |
23 | | // |
24 | | // Like Thompson's original machine and like the DFA implementation, this |
25 | | // implementation notices a match only once it is one byte past it. |
26 | | |
27 | | #include <stdio.h> |
28 | | #include <string.h> |
29 | | #include <algorithm> |
30 | | #include <deque> |
31 | | #include <string> |
32 | | #include <utility> |
33 | | #include <vector> |
34 | | |
35 | | #include "util/logging.h" |
36 | | #include "util/strutil.h" |
37 | | #include "re2/pod_array.h" |
38 | | #include "re2/prog.h" |
39 | | #include "re2/regexp.h" |
40 | | #include "re2/sparse_array.h" |
41 | | #include "re2/sparse_set.h" |
42 | | |
43 | | namespace re2 { |
44 | | |
45 | | static const bool ExtraDebug = false; |
46 | | |
47 | | class NFA { |
48 | | public: |
49 | | NFA(Prog* prog); |
50 | | ~NFA(); |
51 | | |
52 | | // Searches for a matching string. |
53 | | // * If anchored is true, only considers matches starting at offset. |
54 | | // Otherwise finds lefmost match at or after offset. |
55 | | // * If longest is true, returns the longest match starting |
56 | | // at the chosen start point. Otherwise returns the so-called |
57 | | // left-biased match, the one traditional backtracking engines |
58 | | // (like Perl and PCRE) find. |
59 | | // Records submatch boundaries in submatch[1..nsubmatch-1]. |
60 | | // Submatch[0] is the entire match. When there is a choice in |
61 | | // which text matches each subexpression, the submatch boundaries |
62 | | // are chosen to match what a backtracking implementation would choose. |
63 | | bool Search(const StringPiece& text, const StringPiece& context, |
64 | | bool anchored, bool longest, |
65 | | StringPiece* submatch, int nsubmatch); |
66 | | |
67 | | private: |
68 | | struct Thread { |
69 | | union { |
70 | | int ref; |
71 | | Thread* next; // when on free list |
72 | | }; |
73 | | const char** capture; |
74 | | }; |
75 | | |
76 | | // State for explicit stack in AddToThreadq. |
77 | | struct AddState { |
78 | | int id; // Inst to process |
79 | | Thread* t; // if not null, set t0 = t before processing id |
80 | | }; |
81 | | |
82 | | // Threadq is a list of threads. The list is sorted by the order |
83 | | // in which Perl would explore that particular state -- the earlier |
84 | | // choices appear earlier in the list. |
85 | | typedef SparseArray<Thread*> Threadq; |
86 | | |
87 | | inline Thread* AllocThread(); |
88 | | inline Thread* Incref(Thread* t); |
89 | | inline void Decref(Thread* t); |
90 | | |
91 | | // Follows all empty arrows from id0 and enqueues all the states reached. |
92 | | // Enqueues only the ByteRange instructions that match byte c. |
93 | | // context is used (with p) for evaluating empty-width specials. |
94 | | // p is the current input position, and t0 is the current thread. |
95 | | void AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, |
96 | | const char* p, Thread* t0); |
97 | | |
98 | | // Run runq on byte c, appending new states to nextq. |
99 | | // Updates matched_ and match_ as new, better matches are found. |
100 | | // context is used (with p) for evaluating empty-width specials. |
101 | | // p is the position of byte c in the input string for AddToThreadq; |
102 | | // p-1 will be used when processing Match instructions. |
103 | | // Frees all the threads on runq. |
104 | | // If there is a shortcut to the end, returns that shortcut. |
105 | | int Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, |
106 | | const char* p); |
107 | | |
108 | | // Returns text version of capture information, for debugging. |
109 | | std::string FormatCapture(const char** capture); |
110 | | |
111 | 67.1k | void CopyCapture(const char** dst, const char** src) { |
112 | 67.1k | memmove(dst, src, ncapture_*sizeof src[0]); |
113 | 67.1k | } |
114 | | |
115 | | Prog* prog_; // underlying program |
116 | | int start_; // start instruction in program |
117 | | int ncapture_; // number of submatches to track |
118 | | bool longest_; // whether searching for longest match |
119 | | bool endmatch_; // whether match must end at text.end() |
120 | | const char* btext_; // beginning of text (for FormatSubmatch) |
121 | | const char* etext_; // end of text (for endmatch_) |
122 | | Threadq q0_, q1_; // pre-allocated for Search. |
123 | | PODArray<AddState> stack_; // pre-allocated for AddToThreadq |
124 | | std::deque<Thread> arena_; // thread arena |
125 | | Thread* freelist_; // thread freelist |
126 | | const char** match_; // best match so far |
127 | | bool matched_; // any match so far? |
128 | | |
129 | | NFA(const NFA&) = delete; |
130 | | NFA& operator=(const NFA&) = delete; |
131 | | }; |
132 | | |
133 | 17.5k | NFA::NFA(Prog* prog) { |
134 | 17.5k | prog_ = prog; |
135 | 17.5k | start_ = prog_->start(); |
136 | 17.5k | ncapture_ = 0; |
137 | 17.5k | longest_ = false; |
138 | 17.5k | endmatch_ = false; |
139 | 17.5k | btext_ = NULL; |
140 | 17.5k | etext_ = NULL; |
141 | 17.5k | q0_.resize(prog_->size()); |
142 | 17.5k | q1_.resize(prog_->size()); |
143 | | // See NFA::AddToThreadq() for why this is so. |
144 | 17.5k | int nstack = 2*prog_->inst_count(kInstCapture) + |
145 | 17.5k | prog_->inst_count(kInstEmptyWidth) + |
146 | 17.5k | prog_->inst_count(kInstNop) + 1; // + 1 for start inst |
147 | 17.5k | stack_ = PODArray<AddState>(nstack); |
148 | 17.5k | freelist_ = NULL; |
149 | 17.5k | match_ = NULL; |
150 | 17.5k | matched_ = false; |
151 | 17.5k | } |
152 | | |
153 | 17.5k | NFA::~NFA() { |
154 | 17.5k | delete[] match_; |
155 | 17.5k | for (const Thread& t : arena_) |
156 | 17.5k | delete[] t.capture; |
157 | 17.5k | } |
158 | | |
159 | 17.5k | NFA::Thread* NFA::AllocThread() { |
160 | 17.5k | Thread* t = freelist_; |
161 | 17.5k | if (t != NULL) { |
162 | 0 | freelist_ = t->next; |
163 | 0 | t->ref = 1; |
164 | | // We don't need to touch t->capture because |
165 | | // the caller will immediately overwrite it. |
166 | 0 | return t; |
167 | 0 | } |
168 | 17.5k | arena_.emplace_back(); |
169 | 17.5k | t = &arena_.back(); |
170 | 17.5k | t->ref = 1; |
171 | 17.5k | t->capture = new const char*[ncapture_]; |
172 | 17.5k | return t; |
173 | 17.5k | } |
174 | | |
175 | 62.2M | NFA::Thread* NFA::Incref(Thread* t) { |
176 | 62.2M | DCHECK(t != NULL); |
177 | 62.2M | t->ref++; |
178 | 62.2M | return t; |
179 | 62.2M | } |
180 | | |
181 | 62.2M | void NFA::Decref(Thread* t) { |
182 | 62.2M | DCHECK(t != NULL); |
183 | 62.2M | t->ref--; |
184 | 62.2M | if (t->ref > 0) |
185 | 62.2M | return; |
186 | 17.5k | DCHECK_EQ(t->ref, 0); |
187 | 17.5k | t->next = freelist_; |
188 | 17.5k | freelist_ = t; |
189 | 17.5k | } |
190 | | |
191 | | // Follows all empty arrows from id0 and enqueues all the states reached. |
192 | | // Enqueues only the ByteRange instructions that match byte c. |
193 | | // context is used (with p) for evaluating empty-width specials. |
194 | | // p is the current input position, and t0 is the current thread. |
195 | | void NFA::AddToThreadq(Threadq* q, int id0, int c, const StringPiece& context, |
196 | 61.9M | const char* p, Thread* t0) { |
197 | 61.9M | if (id0 == 0) |
198 | 0 | return; |
199 | | |
200 | | // Use stack_ to hold our stack of instructions yet to process. |
201 | | // It was preallocated as follows: |
202 | | // two entries per Capture; |
203 | | // one entry per EmptyWidth; and |
204 | | // one entry per Nop. |
205 | | // This reflects the maximum number of stack pushes that each can |
206 | | // perform. (Each instruction can be processed at most once.) |
207 | 61.9M | AddState* stk = stack_.data(); |
208 | 61.9M | int nstk = 0; |
209 | | |
210 | 61.9M | stk[nstk++] = {id0, NULL}; |
211 | 236M | while (nstk > 0) { |
212 | 174M | DCHECK_LE(nstk, stack_.size()); |
213 | 174M | AddState a = stk[--nstk]; |
214 | | |
215 | 441M | Loop: |
216 | 441M | if (a.t != NULL) { |
217 | | // t0 was a thread that we allocated and copied in order to |
218 | | // record the capture, so we must now decref it. |
219 | 0 | Decref(t0); |
220 | 0 | t0 = a.t; |
221 | 0 | } |
222 | | |
223 | 441M | int id = a.id; |
224 | 441M | if (id == 0) |
225 | 1.05k | continue; |
226 | 441M | if (q->has_index(id)) { |
227 | 106M | if (ExtraDebug) |
228 | 0 | fprintf(stderr, " [%d%s]\n", id, FormatCapture(t0->capture).c_str()); |
229 | 106M | continue; |
230 | 106M | } |
231 | | |
232 | | // Create entry in q no matter what. We might fill it in below, |
233 | | // or we might not. Even if not, it is necessary to have it, |
234 | | // so that we don't revisit id0 during the recursion. |
235 | 334M | q->set_new(id, NULL); |
236 | 334M | Thread** tp = &q->get_existing(id); |
237 | 334M | int j; |
238 | 334M | Thread* t; |
239 | 334M | Prog::Inst* ip = prog_->inst(id); |
240 | 334M | switch (ip->opcode()) { |
241 | 0 | default: |
242 | 0 | LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; |
243 | 0 | break; |
244 | | |
245 | 0 | case kInstFail: |
246 | 0 | break; |
247 | | |
248 | 24.6k | case kInstAltMatch: |
249 | | // Save state; will pick up at next byte. |
250 | 24.6k | t = Incref(t0); |
251 | 24.6k | *tp = t; |
252 | | |
253 | 24.6k | DCHECK(!ip->last()); |
254 | 24.6k | a = {id+1, NULL}; |
255 | 24.6k | goto Loop; |
256 | | |
257 | 121M | case kInstNop: |
258 | 121M | if (!ip->last()) |
259 | 105M | stk[nstk++] = {id+1, NULL}; |
260 | | |
261 | | // Continue on. |
262 | 121M | a = {ip->out(), NULL}; |
263 | 121M | goto Loop; |
264 | | |
265 | 43.4M | case kInstCapture: |
266 | 43.4M | if (!ip->last()) |
267 | 2.07M | stk[nstk++] = {id+1, NULL}; |
268 | | |
269 | 43.4M | if ((j=ip->cap()) < ncapture_) { |
270 | | // Push a dummy whose only job is to restore t0 |
271 | | // once we finish exploring this possibility. |
272 | 0 | stk[nstk++] = {0, t0}; |
273 | | |
274 | | // Record capture. |
275 | 0 | t = AllocThread(); |
276 | 0 | CopyCapture(t->capture, t0->capture); |
277 | 0 | t->capture[j] = p; |
278 | 0 | t0 = t; |
279 | 0 | } |
280 | 43.4M | a = {ip->out(), NULL}; |
281 | 43.4M | goto Loop; |
282 | | |
283 | 162M | case kInstByteRange: |
284 | 162M | if (!ip->Matches(c)) |
285 | 100M | goto Next; |
286 | | |
287 | | // Save state; will pick up at next byte. |
288 | 61.9M | t = Incref(t0); |
289 | 61.9M | *tp = t; |
290 | 61.9M | if (ExtraDebug) |
291 | 0 | fprintf(stderr, " + %d%s\n", id, FormatCapture(t0->capture).c_str()); |
292 | | |
293 | 61.9M | if (ip->hint() == 0) |
294 | 28.7M | break; |
295 | 33.2M | a = {id+ip->hint(), NULL}; |
296 | 33.2M | goto Loop; |
297 | | |
298 | 188k | case kInstMatch: |
299 | | // Save state; will pick up at next byte. |
300 | 188k | t = Incref(t0); |
301 | 188k | *tp = t; |
302 | 188k | if (ExtraDebug) |
303 | 0 | fprintf(stderr, " ! %d%s\n", id, FormatCapture(t0->capture).c_str()); |
304 | | |
305 | 100M | Next: |
306 | 100M | if (ip->last()) |
307 | 34.3M | break; |
308 | 66.5M | a = {id+1, NULL}; |
309 | 66.5M | goto Loop; |
310 | | |
311 | 6.69M | case kInstEmptyWidth: |
312 | 6.69M | if (!ip->last()) |
313 | 5.20M | stk[nstk++] = {id+1, NULL}; |
314 | | |
315 | | // Continue on if we have all the right flag bits. |
316 | 6.69M | if (ip->empty() & ~Prog::EmptyFlags(context, p)) |
317 | 5.04M | break; |
318 | 1.64M | a = {ip->out(), NULL}; |
319 | 1.64M | goto Loop; |
320 | 334M | } |
321 | 334M | } |
322 | 61.9M | } |
323 | | |
324 | | // Run runq on byte c, appending new states to nextq. |
325 | | // Updates matched_ and match_ as new, better matches are found. |
326 | | // context is used (with p) for evaluating empty-width specials. |
327 | | // p is the position of byte c in the input string for AddToThreadq; |
328 | | // p-1 will be used when processing Match instructions. |
329 | | // Frees all the threads on runq. |
330 | | // If there is a shortcut to the end, returns that shortcut. |
331 | | int NFA::Step(Threadq* runq, Threadq* nextq, int c, const StringPiece& context, |
332 | 274k | const char* p) { |
333 | 274k | nextq->clear(); |
334 | | |
335 | 333M | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
336 | 333M | Thread* t = i->value(); |
337 | 333M | if (t == NULL) |
338 | 271M | continue; |
339 | | |
340 | 62.1M | if (longest_) { |
341 | | // Can skip any threads started after our current best match. |
342 | 62.1M | if (matched_ && match_[0] < t->capture[0]) { |
343 | 0 | Decref(t); |
344 | 0 | continue; |
345 | 0 | } |
346 | 62.1M | } |
347 | | |
348 | 62.1M | int id = i->index(); |
349 | 62.1M | Prog::Inst* ip = prog_->inst(id); |
350 | | |
351 | 62.1M | switch (ip->opcode()) { |
352 | 0 | default: |
353 | | // Should only see the values handled below. |
354 | 0 | LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; |
355 | 0 | break; |
356 | | |
357 | 61.9M | case kInstByteRange: |
358 | 61.9M | AddToThreadq(nextq, ip->out(), c, context, p, t); |
359 | 61.9M | break; |
360 | | |
361 | 24.2k | case kInstAltMatch: |
362 | 24.2k | if (i != runq->begin()) |
363 | 23.1k | break; |
364 | | // The match is ours if we want it. |
365 | 1.08k | if (ip->greedy(prog_) || longest_) { |
366 | 1.08k | CopyCapture(match_, t->capture); |
367 | 1.08k | matched_ = true; |
368 | | |
369 | 1.08k | Decref(t); |
370 | 871k | for (++i; i != runq->end(); ++i) { |
371 | 870k | if (i->value() != NULL) |
372 | 76.3k | Decref(i->value()); |
373 | 870k | } |
374 | 1.08k | runq->clear(); |
375 | 1.08k | if (ip->greedy(prog_)) |
376 | 821 | return ip->out1(); |
377 | 260 | return ip->out(); |
378 | 1.08k | } |
379 | 0 | break; |
380 | | |
381 | 187k | case kInstMatch: { |
382 | | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
383 | | // by storing p instead of p-1. (What would the latter even mean?!) |
384 | | // This complements the special case in NFA::Search(). |
385 | 187k | if (p == NULL) { |
386 | 0 | CopyCapture(match_, t->capture); |
387 | 0 | match_[1] = p; |
388 | 0 | matched_ = true; |
389 | 0 | break; |
390 | 0 | } |
391 | | |
392 | 187k | if (endmatch_ && p-1 != etext_) |
393 | 56 | break; |
394 | | |
395 | 187k | if (longest_) { |
396 | | // Leftmost-longest mode: save this match only if |
397 | | // it is either farther to the left or at the same |
398 | | // point but longer than an existing match. |
399 | 187k | if (!matched_ || t->capture[0] < match_[0] || |
400 | 187k | (t->capture[0] == match_[0] && p-1 > match_[1])) { |
401 | 48.5k | CopyCapture(match_, t->capture); |
402 | 48.5k | match_[1] = p-1; |
403 | 48.5k | matched_ = true; |
404 | 48.5k | } |
405 | 187k | } else { |
406 | | // Leftmost-biased mode: this match is by definition |
407 | | // better than what we've already found (see next line). |
408 | 0 | CopyCapture(match_, t->capture); |
409 | 0 | match_[1] = p-1; |
410 | 0 | matched_ = true; |
411 | | |
412 | | // Cut off the threads that can only find matches |
413 | | // worse than the one we just found: don't run the |
414 | | // rest of the current Threadq. |
415 | 0 | Decref(t); |
416 | 0 | for (++i; i != runq->end(); ++i) { |
417 | 0 | if (i->value() != NULL) |
418 | 0 | Decref(i->value()); |
419 | 0 | } |
420 | 0 | runq->clear(); |
421 | 0 | return 0; |
422 | 0 | } |
423 | 187k | break; |
424 | 187k | } |
425 | 62.1M | } |
426 | 62.1M | Decref(t); |
427 | 62.1M | } |
428 | 273k | runq->clear(); |
429 | 273k | return 0; |
430 | 274k | } |
431 | | |
432 | 0 | std::string NFA::FormatCapture(const char** capture) { |
433 | 0 | std::string s; |
434 | 0 | for (int i = 0; i < ncapture_; i+=2) { |
435 | 0 | if (capture[i] == NULL) |
436 | 0 | s += "(?,?)"; |
437 | 0 | else if (capture[i+1] == NULL) |
438 | 0 | s += StringPrintf("(%td,?)", |
439 | 0 | capture[i] - btext_); |
440 | 0 | else |
441 | 0 | s += StringPrintf("(%td,%td)", |
442 | 0 | capture[i] - btext_, |
443 | 0 | capture[i+1] - btext_); |
444 | 0 | } |
445 | 0 | return s; |
446 | 0 | } |
447 | | |
448 | | bool NFA::Search(const StringPiece& text, const StringPiece& const_context, |
449 | | bool anchored, bool longest, |
450 | 17.5k | StringPiece* submatch, int nsubmatch) { |
451 | 17.5k | if (start_ == 0) |
452 | 0 | return false; |
453 | | |
454 | 17.5k | StringPiece context = const_context; |
455 | 17.5k | if (context.data() == NULL) |
456 | 0 | context = text; |
457 | | |
458 | | // Sanity check: make sure that text lies within context. |
459 | 17.5k | if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { |
460 | 0 | LOG(DFATAL) << "context does not contain text"; |
461 | 0 | return false; |
462 | 0 | } |
463 | | |
464 | 17.5k | if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text)) |
465 | 0 | return false; |
466 | 17.5k | if (prog_->anchor_end() && EndPtr(context) != EndPtr(text)) |
467 | 0 | return false; |
468 | 17.5k | anchored |= prog_->anchor_start(); |
469 | 17.5k | if (prog_->anchor_end()) { |
470 | 626 | longest = true; |
471 | 626 | endmatch_ = true; |
472 | 626 | } |
473 | | |
474 | 17.5k | if (nsubmatch < 0) { |
475 | 0 | LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; |
476 | 0 | return false; |
477 | 0 | } |
478 | | |
479 | | // Save search parameters. |
480 | 17.5k | ncapture_ = 2*nsubmatch; |
481 | 17.5k | longest_ = longest; |
482 | | |
483 | 17.5k | if (nsubmatch == 0) { |
484 | | // We need to maintain match[0], both to distinguish the |
485 | | // longest match (if longest is true) and also to tell |
486 | | // whether we've seen any matches at all. |
487 | 0 | ncapture_ = 2; |
488 | 0 | } |
489 | | |
490 | 17.5k | match_ = new const char*[ncapture_]; |
491 | 17.5k | memset(match_, 0, ncapture_*sizeof match_[0]); |
492 | 17.5k | matched_ = false; |
493 | | |
494 | | // For debugging prints. |
495 | 17.5k | btext_ = context.data(); |
496 | | // For convenience. |
497 | 17.5k | etext_ = text.data() + text.size(); |
498 | | |
499 | 17.5k | if (ExtraDebug) |
500 | 0 | fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", |
501 | 0 | std::string(text).c_str(), std::string(context).c_str(), anchored, longest); |
502 | | |
503 | | // Set up search. |
504 | 17.5k | Threadq* runq = &q0_; |
505 | 17.5k | Threadq* nextq = &q1_; |
506 | 17.5k | runq->clear(); |
507 | 17.5k | nextq->clear(); |
508 | | |
509 | | // Loop over the text, stepping the machine. |
510 | 274k | for (const char* p = text.data();; p++) { |
511 | 274k | if (ExtraDebug) { |
512 | 0 | int c = 0; |
513 | 0 | if (p == btext_) |
514 | 0 | c = '^'; |
515 | 0 | else if (p > etext_) |
516 | 0 | c = '$'; |
517 | 0 | else if (p < etext_) |
518 | 0 | c = p[0] & 0xFF; |
519 | |
|
520 | 0 | fprintf(stderr, "%c:", c); |
521 | 0 | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
522 | 0 | Thread* t = i->value(); |
523 | 0 | if (t == NULL) |
524 | 0 | continue; |
525 | 0 | fprintf(stderr, " %d%s", i->index(), FormatCapture(t->capture).c_str()); |
526 | 0 | } |
527 | 0 | fprintf(stderr, "\n"); |
528 | 0 | } |
529 | | |
530 | | // This is a no-op the first time around the loop because runq is empty. |
531 | 274k | int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); |
532 | 274k | DCHECK_EQ(runq->size(), 0); |
533 | 274k | using std::swap; |
534 | 274k | swap(nextq, runq); |
535 | 274k | nextq->clear(); |
536 | 274k | if (id != 0) { |
537 | | // We're done: full match ahead. |
538 | 1.08k | p = etext_; |
539 | 2.13k | for (;;) { |
540 | 2.13k | Prog::Inst* ip = prog_->inst(id); |
541 | 2.13k | switch (ip->opcode()) { |
542 | 0 | default: |
543 | 0 | LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(); |
544 | 0 | break; |
545 | | |
546 | 87 | case kInstCapture: |
547 | 87 | if (ip->cap() < ncapture_) |
548 | 0 | match_[ip->cap()] = p; |
549 | 87 | id = ip->out(); |
550 | 87 | continue; |
551 | | |
552 | 962 | case kInstNop: |
553 | 962 | id = ip->out(); |
554 | 962 | continue; |
555 | | |
556 | 1.08k | case kInstMatch: |
557 | 1.08k | match_[1] = p; |
558 | 1.08k | matched_ = true; |
559 | 1.08k | break; |
560 | 2.13k | } |
561 | 1.08k | break; |
562 | 2.13k | } |
563 | 1.08k | break; |
564 | 1.08k | } |
565 | | |
566 | 273k | if (p > etext_) |
567 | 3.37k | break; |
568 | | |
569 | | // Start a new thread if there have not been any matches. |
570 | | // (No point in starting a new thread if there have been |
571 | | // matches, since it would be to the right of the match |
572 | | // we already found.) |
573 | 269k | if (!matched_ && (!anchored || p == text.data())) { |
574 | | // Try to use prefix accel (e.g. memchr) to skip ahead. |
575 | | // The search must be unanchored and there must be zero |
576 | | // possible matches already. |
577 | 17.5k | if (!anchored && runq->size() == 0 && |
578 | 17.5k | p < etext_ && prog_->can_prefix_accel()) { |
579 | 0 | p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); |
580 | 0 | if (p == NULL) |
581 | 0 | p = etext_; |
582 | 0 | } |
583 | | |
584 | 17.5k | Thread* t = AllocThread(); |
585 | 17.5k | CopyCapture(t->capture, match_); |
586 | 17.5k | t->capture[0] = p; |
587 | 17.5k | AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, |
588 | 17.5k | t); |
589 | 17.5k | Decref(t); |
590 | 17.5k | } |
591 | | |
592 | | // If all the threads have died, stop early. |
593 | 269k | if (runq->size() == 0) { |
594 | 13.0k | if (ExtraDebug) |
595 | 0 | fprintf(stderr, "dead\n"); |
596 | 13.0k | break; |
597 | 13.0k | } |
598 | | |
599 | | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
600 | | // by simply not continuing the loop. |
601 | | // This complements the special case in NFA::Step(). |
602 | 256k | if (p == NULL) { |
603 | 0 | (void) Step(runq, nextq, -1, context, p); |
604 | 0 | DCHECK_EQ(runq->size(), 0); |
605 | 0 | using std::swap; |
606 | 0 | swap(nextq, runq); |
607 | 0 | nextq->clear(); |
608 | 0 | break; |
609 | 0 | } |
610 | 256k | } |
611 | | |
612 | 17.5k | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
613 | 0 | if (i->value() != NULL) |
614 | 0 | Decref(i->value()); |
615 | 0 | } |
616 | | |
617 | 17.5k | if (matched_) { |
618 | 15.8k | for (int i = 0; i < nsubmatch; i++) |
619 | 7.94k | submatch[i] = |
620 | 7.94k | StringPiece(match_[2 * i], |
621 | 7.94k | static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); |
622 | 7.94k | if (ExtraDebug) |
623 | 0 | fprintf(stderr, "match (%td,%td)\n", |
624 | 0 | match_[0] - btext_, |
625 | 0 | match_[1] - btext_); |
626 | 7.94k | return true; |
627 | 7.94k | } |
628 | 9.56k | return false; |
629 | 17.5k | } |
630 | | |
631 | | bool |
632 | | Prog::SearchNFA(const StringPiece& text, const StringPiece& context, |
633 | | Anchor anchor, MatchKind kind, |
634 | 17.5k | StringPiece* match, int nmatch) { |
635 | 17.5k | if (ExtraDebug) |
636 | 0 | Dump(); |
637 | | |
638 | 17.5k | NFA nfa(this); |
639 | 17.5k | StringPiece sp; |
640 | 17.5k | if (kind == kFullMatch) { |
641 | 17.5k | anchor = kAnchored; |
642 | 17.5k | if (nmatch == 0) { |
643 | 17.5k | match = &sp; |
644 | 17.5k | nmatch = 1; |
645 | 17.5k | } |
646 | 17.5k | } |
647 | 17.5k | if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) |
648 | 9.56k | return false; |
649 | 7.94k | if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) |
650 | 5.60k | return false; |
651 | 2.33k | return true; |
652 | 7.94k | } |
653 | | |
654 | | // For each instruction i in the program reachable from the start, compute the |
655 | | // number of instructions reachable from i by following only empty transitions |
656 | | // and record that count as fanout[i]. |
657 | | // |
658 | | // fanout holds the results and is also the work queue for the outer iteration. |
659 | | // reachable holds the reached nodes for the inner iteration. |
660 | 0 | void Prog::Fanout(SparseArray<int>* fanout) { |
661 | 0 | DCHECK_EQ(fanout->max_size(), size()); |
662 | 0 | SparseSet reachable(size()); |
663 | 0 | fanout->clear(); |
664 | 0 | fanout->set_new(start(), 0); |
665 | 0 | for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) { |
666 | 0 | int* count = &i->value(); |
667 | 0 | reachable.clear(); |
668 | 0 | reachable.insert(i->index()); |
669 | 0 | for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { |
670 | 0 | int id = *j; |
671 | 0 | Prog::Inst* ip = inst(id); |
672 | 0 | switch (ip->opcode()) { |
673 | 0 | default: |
674 | 0 | LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()"; |
675 | 0 | break; |
676 | | |
677 | 0 | case kInstByteRange: |
678 | 0 | if (!ip->last()) |
679 | 0 | reachable.insert(id+1); |
680 | |
|
681 | 0 | (*count)++; |
682 | 0 | if (!fanout->has_index(ip->out())) { |
683 | 0 | fanout->set_new(ip->out(), 0); |
684 | 0 | } |
685 | 0 | break; |
686 | | |
687 | 0 | case kInstAltMatch: |
688 | 0 | DCHECK(!ip->last()); |
689 | 0 | reachable.insert(id+1); |
690 | 0 | break; |
691 | | |
692 | 0 | case kInstCapture: |
693 | 0 | case kInstEmptyWidth: |
694 | 0 | case kInstNop: |
695 | 0 | if (!ip->last()) |
696 | 0 | reachable.insert(id+1); |
697 | |
|
698 | 0 | reachable.insert(ip->out()); |
699 | 0 | break; |
700 | | |
701 | 0 | case kInstMatch: |
702 | 0 | if (!ip->last()) |
703 | 0 | reachable.insert(id+1); |
704 | 0 | break; |
705 | | |
706 | 0 | case kInstFail: |
707 | 0 | break; |
708 | 0 | } |
709 | 0 | } |
710 | 0 | } |
711 | 0 | } |
712 | | |
713 | | } // namespace re2 |