Line | Count | Source |
1 | | // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Tested by search_test.cc. |
6 | | // |
7 | | // Prog::SearchNFA, an NFA search. |
8 | | // This is an actual NFA like the theorists talk about, |
9 | | // not the pseudo-NFA found in backtracking regexp implementations. |
10 | | // |
11 | | // IMPLEMENTATION |
12 | | // |
13 | | // This algorithm is a variant of one that appeared in Rob Pike's sam editor, |
14 | | // which is a variant of the one described in Thompson's 1968 CACM paper. |
15 | | // See http://swtch.com/~rsc/regexp/ for various history. The main feature |
16 | | // over the DFA implementation is that it tracks submatch boundaries. |
17 | | // |
18 | | // When the choice of submatch boundaries is ambiguous, this particular |
19 | | // implementation makes the same choices that traditional backtracking |
20 | | // implementations (in particular, Perl and PCRE) do. |
21 | | // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential |
22 | | // time in the length of the input. |
23 | | // |
24 | | // Like Thompson's original machine and like the DFA implementation, this |
25 | | // implementation notices a match only once it is one byte past it. |
26 | | |
27 | | #include <stdio.h> |
28 | | #include <string.h> |
29 | | |
30 | | #include <algorithm> |
31 | | #include <deque> |
32 | | #include <string> |
33 | | #include <utility> |
34 | | |
35 | | #include "absl/log/absl_check.h" |
36 | | #include "absl/log/absl_log.h" |
37 | | #include "absl/strings/str_format.h" |
38 | | #include "absl/strings/string_view.h" |
39 | | #include "re2/pod_array.h" |
40 | | #include "re2/prog.h" |
41 | | #include "re2/regexp.h" |
42 | | #include "re2/sparse_array.h" |
43 | | #include "re2/sparse_set.h" |
44 | | |
45 | | namespace re2 { |
46 | | |
47 | | static const bool ExtraDebug = false; |
48 | | |
49 | | class NFA { |
50 | | public: |
51 | | NFA(Prog* prog); |
52 | | ~NFA(); |
53 | | |
54 | | // Searches for a matching string. |
55 | | // * If anchored is true, only considers matches starting at offset. |
56 | | // Otherwise finds lefmost match at or after offset. |
57 | | // * If longest is true, returns the longest match starting |
58 | | // at the chosen start point. Otherwise returns the so-called |
59 | | // left-biased match, the one traditional backtracking engines |
60 | | // (like Perl and PCRE) find. |
61 | | // Records submatch boundaries in submatch[1..nsubmatch-1]. |
62 | | // Submatch[0] is the entire match. When there is a choice in |
63 | | // which text matches each subexpression, the submatch boundaries |
64 | | // are chosen to match what a backtracking implementation would choose. |
65 | | bool Search(absl::string_view text, absl::string_view context, bool anchored, |
66 | | bool longest, absl::string_view* submatch, int nsubmatch); |
67 | | |
68 | | private: |
69 | | struct Thread { |
70 | | union { |
71 | | int ref; |
72 | | Thread* next; // when on free list |
73 | | }; |
74 | | const char** capture; |
75 | | }; |
76 | | |
77 | | // State for explicit stack in AddToThreadq. |
78 | | struct AddState { |
79 | | int id; // Inst to process |
80 | | Thread* t; // if not null, set t0 = t before processing id |
81 | | }; |
82 | | |
83 | | // Threadq is a list of threads. The list is sorted by the order |
84 | | // in which Perl would explore that particular state -- the earlier |
85 | | // choices appear earlier in the list. |
86 | | typedef SparseArray<Thread*> Threadq; |
87 | | |
88 | | inline Thread* AllocThread(); |
89 | | inline Thread* Incref(Thread* t); |
90 | | inline void Decref(Thread* t); |
91 | | |
92 | | // Follows all empty arrows from id0 and enqueues all the states reached. |
93 | | // Enqueues only the ByteRange instructions that match byte c. |
94 | | // context is used (with p) for evaluating empty-width specials. |
95 | | // p is the current input position, and t0 is the current thread. |
96 | | void AddToThreadq(Threadq* q, int id0, int c, absl::string_view context, |
97 | | const char* p, Thread* t0); |
98 | | |
99 | | // Run runq on byte c, appending new states to nextq. |
100 | | // Updates matched_ and match_ as new, better matches are found. |
101 | | // context is used (with p) for evaluating empty-width specials. |
102 | | // p is the position of byte c in the input string for AddToThreadq; |
103 | | // p-1 will be used when processing Match instructions. |
104 | | // Frees all the threads on runq. |
105 | | // If there is a shortcut to the end, returns that shortcut. |
106 | | int Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context, |
107 | | const char* p); |
108 | | |
109 | | // Returns text version of capture information, for debugging. |
110 | | std::string FormatCapture(const char** capture); |
111 | | |
112 | 340k | void CopyCapture(const char** dst, const char** src) { |
113 | 340k | memmove(dst, src, ncapture_*sizeof src[0]); |
114 | 340k | } |
115 | | |
116 | | Prog* prog_; // underlying program |
117 | | int start_; // start instruction in program |
118 | | int ncapture_; // number of submatches to track |
119 | | bool longest_; // whether searching for longest match |
120 | | bool endmatch_; // whether match must end at text.end() |
121 | | const char* btext_; // beginning of text (for FormatSubmatch) |
122 | | const char* etext_; // end of text (for endmatch_) |
123 | | Threadq q0_, q1_; // pre-allocated for Search. |
124 | | PODArray<AddState> stack_; // pre-allocated for AddToThreadq |
125 | | std::deque<Thread> arena_; // thread arena |
126 | | Thread* freelist_; // thread freelist |
127 | | const char** match_; // best match so far |
128 | | bool matched_; // any match so far? |
129 | | |
130 | | NFA(const NFA&) = delete; |
131 | | NFA& operator=(const NFA&) = delete; |
132 | | }; |
133 | | |
134 | 1.46k | NFA::NFA(Prog* prog) { |
135 | 1.46k | prog_ = prog; |
136 | 1.46k | start_ = prog_->start(); |
137 | 1.46k | ncapture_ = 0; |
138 | 1.46k | longest_ = false; |
139 | 1.46k | endmatch_ = false; |
140 | 1.46k | btext_ = NULL; |
141 | 1.46k | etext_ = NULL; |
142 | 1.46k | q0_.resize(prog_->size()); |
143 | 1.46k | q1_.resize(prog_->size()); |
144 | | // See NFA::AddToThreadq() for why this is so. |
145 | 1.46k | int nstack = 2*prog_->inst_count(kInstCapture) + |
146 | 1.46k | prog_->inst_count(kInstEmptyWidth) + |
147 | 1.46k | prog_->inst_count(kInstNop) + 1; // + 1 for start inst |
148 | 1.46k | stack_ = PODArray<AddState>(nstack); |
149 | 1.46k | freelist_ = NULL; |
150 | 1.46k | match_ = NULL; |
151 | 1.46k | matched_ = false; |
152 | 1.46k | } |
153 | | |
154 | 1.46k | NFA::~NFA() { |
155 | 1.46k | delete[] match_; |
156 | 1.46k | for (const Thread& t : arena_) |
157 | 121k | delete[] t.capture; |
158 | 1.46k | } |
159 | | |
160 | 325k | NFA::Thread* NFA::AllocThread() { |
161 | 325k | Thread* t = freelist_; |
162 | 325k | if (t != NULL) { |
163 | 204k | freelist_ = t->next; |
164 | 204k | t->ref = 1; |
165 | | // We don't need to touch t->capture because |
166 | | // the caller will immediately overwrite it. |
167 | 204k | return t; |
168 | 204k | } |
169 | 121k | arena_.emplace_back(); |
170 | 121k | t = &arena_.back(); |
171 | 121k | t->ref = 1; |
172 | 121k | t->capture = new const char*[ncapture_]; |
173 | 121k | return t; |
174 | 325k | } |
175 | | |
176 | 10.9M | NFA::Thread* NFA::Incref(Thread* t) { |
177 | 10.9M | ABSL_DCHECK(t != NULL); |
178 | 10.9M | t->ref++; |
179 | 10.9M | return t; |
180 | 10.9M | } |
181 | | |
182 | 11.2M | void NFA::Decref(Thread* t) { |
183 | 11.2M | ABSL_DCHECK(t != NULL); |
184 | 11.2M | t->ref--; |
185 | 11.2M | if (t->ref > 0) |
186 | 10.9M | return; |
187 | 11.2M | ABSL_DCHECK_EQ(t->ref, 0); |
188 | 325k | t->next = freelist_; |
189 | 325k | freelist_ = t; |
190 | 325k | } |
191 | | |
192 | | // Follows all empty arrows from id0 and enqueues all the states reached. |
193 | | // Enqueues only the ByteRange instructions that match byte c. |
194 | | // context is used (with p) for evaluating empty-width specials. |
195 | | // p is the current input position, and t0 is the current thread. |
196 | | void NFA::AddToThreadq(Threadq* q, int id0, int c, absl::string_view context, |
197 | 10.8M | const char* p, Thread* t0) { |
198 | 10.8M | if (id0 == 0) |
199 | 0 | return; |
200 | | |
201 | | // Use stack_ to hold our stack of instructions yet to process. |
202 | | // It was preallocated as follows: |
203 | | // two entries per Capture; |
204 | | // one entry per EmptyWidth; and |
205 | | // one entry per Nop. |
206 | | // This reflects the maximum number of stack pushes that each can |
207 | | // perform. (Each instruction can be processed at most once.) |
208 | 10.8M | AddState* stk = stack_.data(); |
209 | 10.8M | int nstk = 0; |
210 | | |
211 | 10.8M | stk[nstk++] = {id0, NULL}; |
212 | 22.4M | while (nstk > 0) { |
213 | 11.5M | ABSL_DCHECK_LE(nstk, stack_.size()); |
214 | 11.5M | AddState a = stk[--nstk]; |
215 | | |
216 | 24.1M | Loop: |
217 | 24.1M | if (a.t != NULL) { |
218 | | // t0 was a thread that we allocated and copied in order to |
219 | | // record the capture, so we must now decref it. |
220 | 324k | Decref(t0); |
221 | 324k | t0 = a.t; |
222 | 324k | } |
223 | | |
224 | 24.1M | int id = a.id; |
225 | 24.1M | if (id == 0) |
226 | 324k | continue; |
227 | 23.8M | if (q->has_index(id)) { |
228 | 5.39M | if (ExtraDebug) |
229 | 0 | absl::FPrintF(stderr, " [%d%s]\n", id, FormatCapture(t0->capture)); |
230 | 5.39M | continue; |
231 | 5.39M | } |
232 | | |
233 | | // Create entry in q no matter what. We might fill it in below, |
234 | | // or we might not. Even if not, it is necessary to have it, |
235 | | // so that we don't revisit id0 during the recursion. |
236 | 18.4M | q->set_new(id, NULL); |
237 | 18.4M | Thread** tp = &q->get_existing(id); |
238 | 18.4M | int j; |
239 | 18.4M | Thread* t; |
240 | 18.4M | Prog::Inst* ip = prog_->inst(id); |
241 | 18.4M | switch (ip->opcode()) { |
242 | 0 | default: |
243 | 0 | ABSL_LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; |
244 | 0 | break; |
245 | | |
246 | 0 | case kInstFail: |
247 | 0 | break; |
248 | | |
249 | 2.47k | case kInstAltMatch: |
250 | | // Save state; will pick up at next byte. |
251 | 2.47k | t = Incref(t0); |
252 | 2.47k | *tp = t; |
253 | | |
254 | 2.47k | ABSL_DCHECK(!ip->last()); |
255 | 2.47k | a = {id+1, NULL}; |
256 | 2.47k | goto Loop; |
257 | | |
258 | 5.95M | case kInstNop: |
259 | 5.95M | if (!ip->last()) |
260 | 251k | stk[nstk++] = {id+1, NULL}; |
261 | | |
262 | | // Continue on. |
263 | 5.95M | a = {ip->out(), NULL}; |
264 | 5.95M | goto Loop; |
265 | | |
266 | 408k | case kInstCapture: |
267 | 408k | if (!ip->last()) |
268 | 5.28k | stk[nstk++] = {id+1, NULL}; |
269 | | |
270 | 408k | if ((j=ip->cap()) < ncapture_) { |
271 | | // Push a dummy whose only job is to restore t0 |
272 | | // once we finish exploring this possibility. |
273 | 324k | stk[nstk++] = {0, t0}; |
274 | | |
275 | | // Record capture. |
276 | 324k | t = AllocThread(); |
277 | 324k | CopyCapture(t->capture, t0->capture); |
278 | 324k | t->capture[j] = p; |
279 | 324k | t0 = t; |
280 | 324k | } |
281 | 408k | a = {ip->out(), NULL}; |
282 | 408k | goto Loop; |
283 | | |
284 | 11.9M | case kInstByteRange: |
285 | 11.9M | if (!ip->Matches(c)) |
286 | 1.09M | goto Next; |
287 | | |
288 | | // Save state; will pick up at next byte. |
289 | 10.8M | t = Incref(t0); |
290 | 10.8M | *tp = t; |
291 | 10.8M | if (ExtraDebug) |
292 | 0 | absl::FPrintF(stderr, " + %d%s\n", id, FormatCapture(t0->capture)); |
293 | | |
294 | 10.8M | if (ip->hint() == 0) |
295 | 5.54M | break; |
296 | 5.33M | a = {id+ip->hint(), NULL}; |
297 | 5.33M | goto Loop; |
298 | | |
299 | 26.1k | case kInstMatch: |
300 | | // Save state; will pick up at next byte. |
301 | 26.1k | t = Incref(t0); |
302 | 26.1k | *tp = t; |
303 | 26.1k | if (ExtraDebug) |
304 | 0 | absl::FPrintF(stderr, " ! %d%s\n", id, FormatCapture(t0->capture)); |
305 | | |
306 | 1.11M | Next: |
307 | 1.11M | if (ip->last()) |
308 | 193k | break; |
309 | 923k | a = {id+1, NULL}; |
310 | 923k | goto Loop; |
311 | | |
312 | 79.9k | case kInstEmptyWidth: |
313 | 79.9k | if (!ip->last()) |
314 | 67.3k | stk[nstk++] = {id+1, NULL}; |
315 | | |
316 | | // Continue on if we have all the right flag bits. |
317 | 79.9k | if (ip->empty() & ~Prog::EmptyFlags(context, p)) |
318 | 72.5k | break; |
319 | 7.37k | a = {ip->out(), NULL}; |
320 | 7.37k | goto Loop; |
321 | 18.4M | } |
322 | 18.4M | } |
323 | 10.8M | } |
324 | | |
325 | | // Run runq on byte c, appending new states to nextq. |
326 | | // Updates matched_ and match_ as new, better matches are found. |
327 | | // context is used (with p) for evaluating empty-width specials. |
328 | | // p is the position of byte c in the input string for AddToThreadq; |
329 | | // p-1 will be used when processing Match instructions. |
330 | | // Frees all the threads on runq. |
331 | | // If there is a shortcut to the end, returns that shortcut. |
332 | | int NFA::Step(Threadq* runq, Threadq* nextq, int c, absl::string_view context, |
333 | 38.3k | const char* p) { |
334 | 38.3k | nextq->clear(); |
335 | | |
336 | 18.4M | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
337 | 18.4M | Thread* t = i->value(); |
338 | 18.4M | if (t == NULL) |
339 | 7.52M | continue; |
340 | | |
341 | 10.9M | if (longest_) { |
342 | | // Can skip any threads started after our current best match. |
343 | 10.9M | if (matched_ && match_[0] < t->capture[0]) { |
344 | 0 | Decref(t); |
345 | 0 | continue; |
346 | 0 | } |
347 | 10.9M | } |
348 | | |
349 | 10.9M | int id = i->index(); |
350 | 10.9M | Prog::Inst* ip = prog_->inst(id); |
351 | | |
352 | 10.9M | switch (ip->opcode()) { |
353 | 0 | default: |
354 | | // Should only see the values handled below. |
355 | 0 | ABSL_LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; |
356 | 0 | break; |
357 | | |
358 | 10.8M | case kInstByteRange: |
359 | 10.8M | AddToThreadq(nextq, ip->out(), c, context, p, t); |
360 | 10.8M | break; |
361 | | |
362 | 2.44k | case kInstAltMatch: |
363 | 2.44k | if (i != runq->begin()) |
364 | 2.26k | break; |
365 | | // The match is ours if we want it. |
366 | 189 | if (ip->greedy(prog_) || longest_) { |
367 | 189 | CopyCapture(match_, t->capture); |
368 | 189 | matched_ = true; |
369 | | |
370 | 189 | Decref(t); |
371 | 6.91k | for (++i; i != runq->end(); ++i) { |
372 | 6.73k | if (i->value() != NULL) |
373 | 1.87k | Decref(i->value()); |
374 | 6.73k | } |
375 | 189 | runq->clear(); |
376 | 189 | if (ip->greedy(prog_)) |
377 | 151 | return ip->out1(); |
378 | 38 | return ip->out(); |
379 | 189 | } |
380 | 0 | break; |
381 | | |
382 | 25.9k | case kInstMatch: { |
383 | | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
384 | | // by storing p instead of p-1. (What would the latter even mean?!) |
385 | | // This complements the special case in NFA::Search(). |
386 | 25.9k | if (p == NULL) { |
387 | 0 | CopyCapture(match_, t->capture); |
388 | 0 | match_[1] = p; |
389 | 0 | matched_ = true; |
390 | 0 | break; |
391 | 0 | } |
392 | | |
393 | 25.9k | if (endmatch_ && p-1 != etext_) |
394 | 8.89k | break; |
395 | | |
396 | 17.0k | if (longest_) { |
397 | | // Leftmost-longest mode: save this match only if |
398 | | // it is either farther to the left or at the same |
399 | | // point but longer than an existing match. |
400 | 17.0k | if (!matched_ || t->capture[0] < match_[0] || |
401 | 15.6k | (t->capture[0] == match_[0] && p-1 > match_[1])) { |
402 | 14.0k | CopyCapture(match_, t->capture); |
403 | 14.0k | match_[1] = p-1; |
404 | 14.0k | matched_ = true; |
405 | 14.0k | } |
406 | 17.0k | } else { |
407 | | // Leftmost-biased mode: this match is by definition |
408 | | // better than what we've already found (see next line). |
409 | 0 | CopyCapture(match_, t->capture); |
410 | 0 | match_[1] = p-1; |
411 | 0 | matched_ = true; |
412 | | |
413 | | // Cut off the threads that can only find matches |
414 | | // worse than the one we just found: don't run the |
415 | | // rest of the current Threadq. |
416 | 0 | Decref(t); |
417 | 0 | for (++i; i != runq->end(); ++i) { |
418 | 0 | if (i->value() != NULL) |
419 | 0 | Decref(i->value()); |
420 | 0 | } |
421 | 0 | runq->clear(); |
422 | 0 | return 0; |
423 | 0 | } |
424 | 17.0k | break; |
425 | 17.0k | } |
426 | 10.9M | } |
427 | 10.9M | Decref(t); |
428 | 10.9M | } |
429 | 38.1k | runq->clear(); |
430 | 38.1k | return 0; |
431 | 38.3k | } |
432 | | |
433 | 0 | std::string NFA::FormatCapture(const char** capture) { |
434 | 0 | std::string s; |
435 | 0 | for (int i = 0; i < ncapture_; i+=2) { |
436 | 0 | if (capture[i] == NULL) |
437 | 0 | s += "(?,?)"; |
438 | 0 | else if (capture[i+1] == NULL) |
439 | 0 | s += absl::StrFormat("(%d,?)", |
440 | 0 | capture[i] - btext_); |
441 | 0 | else |
442 | 0 | s += absl::StrFormat("(%d,%d)", |
443 | 0 | capture[i] - btext_, |
444 | 0 | capture[i+1] - btext_); |
445 | 0 | } |
446 | 0 | return s; |
447 | 0 | } |
448 | | |
449 | | bool NFA::Search(absl::string_view text, absl::string_view context, |
450 | | bool anchored, bool longest, absl::string_view* submatch, |
451 | 1.46k | int nsubmatch) { |
452 | 1.46k | if (start_ == 0) |
453 | 0 | return false; |
454 | | |
455 | 1.46k | if (context.data() == NULL) |
456 | 0 | context = text; |
457 | | |
458 | | // Sanity check: make sure that text lies within context. |
459 | 1.46k | if (BeginPtr(text) < BeginPtr(context) || EndPtr(text) > EndPtr(context)) { |
460 | 0 | ABSL_LOG(DFATAL) << "context does not contain text"; |
461 | 0 | return false; |
462 | 0 | } |
463 | | |
464 | 1.46k | if (prog_->anchor_start() && BeginPtr(context) != BeginPtr(text)) |
465 | 0 | return false; |
466 | 1.46k | if (prog_->anchor_end() && EndPtr(context) != EndPtr(text)) |
467 | 0 | return false; |
468 | 1.46k | anchored |= prog_->anchor_start(); |
469 | 1.46k | if (prog_->anchor_end()) { |
470 | 152 | longest = true; |
471 | 152 | endmatch_ = true; |
472 | 152 | } |
473 | | |
474 | 1.46k | if (nsubmatch < 0) { |
475 | 0 | ABSL_LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; |
476 | 0 | return false; |
477 | 0 | } |
478 | | |
479 | | // Save search parameters. |
480 | 1.46k | ncapture_ = 2*nsubmatch; |
481 | 1.46k | longest_ = longest; |
482 | | |
483 | 1.46k | if (nsubmatch == 0) { |
484 | | // We need to maintain match[0], both to distinguish the |
485 | | // longest match (if longest is true) and also to tell |
486 | | // whether we've seen any matches at all. |
487 | 0 | ncapture_ = 2; |
488 | 0 | } |
489 | | |
490 | 1.46k | match_ = new const char*[ncapture_]; |
491 | 1.46k | memset(match_, 0, ncapture_*sizeof match_[0]); |
492 | 1.46k | matched_ = false; |
493 | | |
494 | | // For debugging prints. |
495 | 1.46k | btext_ = context.data(); |
496 | | // For convenience. |
497 | 1.46k | etext_ = text.data() + text.size(); |
498 | | |
499 | 1.46k | if (ExtraDebug) |
500 | 0 | absl::FPrintF(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", |
501 | 0 | text, context, anchored, longest); |
502 | | |
503 | | // Set up search. |
504 | 1.46k | Threadq* runq = &q0_; |
505 | 1.46k | Threadq* nextq = &q1_; |
506 | 1.46k | runq->clear(); |
507 | 1.46k | nextq->clear(); |
508 | | |
509 | | // Loop over the text, stepping the machine. |
510 | 38.3k | for (const char* p = text.data();; p++) { |
511 | 38.3k | if (ExtraDebug) { |
512 | 0 | int c = 0; |
513 | 0 | if (p == btext_) |
514 | 0 | c = '^'; |
515 | 0 | else if (p > etext_) |
516 | 0 | c = '$'; |
517 | 0 | else if (p < etext_) |
518 | 0 | c = p[0] & 0xFF; |
519 | |
|
520 | 0 | absl::FPrintF(stderr, "%c:", c); |
521 | 0 | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
522 | 0 | Thread* t = i->value(); |
523 | 0 | if (t == NULL) |
524 | 0 | continue; |
525 | 0 | absl::FPrintF(stderr, " %d%s", i->index(), FormatCapture(t->capture)); |
526 | 0 | } |
527 | 0 | absl::FPrintF(stderr, "\n"); |
528 | 0 | } |
529 | | |
530 | | // This is a no-op the first time around the loop because runq is empty. |
531 | 38.3k | int id = Step(runq, nextq, p < etext_ ? p[0] & 0xFF : -1, context, p); |
532 | 38.3k | ABSL_DCHECK_EQ(runq->size(), 0); |
533 | 38.3k | using std::swap; |
534 | 38.3k | swap(nextq, runq); |
535 | 38.3k | nextq->clear(); |
536 | 38.3k | if (id != 0) { |
537 | | // We're done: full match ahead. |
538 | 189 | p = etext_; |
539 | 775 | for (;;) { |
540 | 775 | Prog::Inst* ip = prog_->inst(id); |
541 | 775 | switch (ip->opcode()) { |
542 | 0 | default: |
543 | 0 | ABSL_LOG(DFATAL) << "Unexpected opcode in short circuit: " |
544 | 0 | << ip->opcode(); |
545 | 0 | break; |
546 | | |
547 | 468 | case kInstCapture: |
548 | 468 | if (ip->cap() < ncapture_) |
549 | 242 | match_[ip->cap()] = p; |
550 | 468 | id = ip->out(); |
551 | 468 | continue; |
552 | | |
553 | 118 | case kInstNop: |
554 | 118 | id = ip->out(); |
555 | 118 | continue; |
556 | | |
557 | 189 | case kInstMatch: |
558 | 189 | match_[1] = p; |
559 | 189 | matched_ = true; |
560 | 189 | break; |
561 | 775 | } |
562 | 189 | break; |
563 | 775 | } |
564 | 189 | break; |
565 | 189 | } |
566 | | |
567 | 38.1k | if (p > etext_) |
568 | 1.27k | break; |
569 | | |
570 | | // Start a new thread if there have not been any matches. |
571 | | // (No point in starting a new thread if there have been |
572 | | // matches, since it would be to the right of the match |
573 | | // we already found.) |
574 | 36.8k | if (!matched_ && (!anchored || p == text.data())) { |
575 | | // Try to use prefix accel (e.g. memchr) to skip ahead. |
576 | | // The search must be unanchored and there must be zero |
577 | | // possible matches already. |
578 | 1.46k | if (!anchored && runq->size() == 0 && |
579 | 0 | p < etext_ && prog_->can_prefix_accel()) { |
580 | 0 | p = reinterpret_cast<const char*>(prog_->PrefixAccel(p, etext_ - p)); |
581 | 0 | if (p == NULL) |
582 | 0 | p = etext_; |
583 | 0 | } |
584 | | |
585 | 1.46k | Thread* t = AllocThread(); |
586 | 1.46k | CopyCapture(t->capture, match_); |
587 | 1.46k | t->capture[0] = p; |
588 | 1.46k | AddToThreadq(runq, start_, p < etext_ ? p[0] & 0xFF : -1, context, p, |
589 | 1.46k | t); |
590 | 1.46k | Decref(t); |
591 | 1.46k | } |
592 | | |
593 | | // If all the threads have died, stop early. |
594 | 36.8k | if (runq->size() == 0) { |
595 | 0 | if (ExtraDebug) |
596 | 0 | absl::FPrintF(stderr, "dead\n"); |
597 | 0 | break; |
598 | 0 | } |
599 | | |
600 | | // Avoid invoking undefined behavior (arithmetic on a null pointer) |
601 | | // by simply not continuing the loop. |
602 | | // This complements the special case in NFA::Step(). |
603 | 36.8k | if (p == NULL) { |
604 | 0 | (void) Step(runq, nextq, -1, context, p); |
605 | 0 | ABSL_DCHECK_EQ(runq->size(), 0); |
606 | 0 | using std::swap; |
607 | 0 | swap(nextq, runq); |
608 | 0 | nextq->clear(); |
609 | 0 | break; |
610 | 0 | } |
611 | 36.8k | } |
612 | | |
613 | 1.46k | for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
614 | 0 | if (i->value() != NULL) |
615 | 0 | Decref(i->value()); |
616 | 0 | } |
617 | | |
618 | 1.46k | if (matched_) { |
619 | 4.39k | for (int i = 0; i < nsubmatch; i++) |
620 | 2.93k | submatch[i] = absl::string_view( |
621 | 2.93k | match_[2 * i], |
622 | 2.93k | static_cast<size_t>(match_[2 * i + 1] - match_[2 * i])); |
623 | 1.46k | if (ExtraDebug) |
624 | 0 | absl::FPrintF(stderr, "match (%d,%d)\n", |
625 | 0 | match_[0] - btext_, |
626 | 0 | match_[1] - btext_); |
627 | 1.46k | return true; |
628 | 1.46k | } |
629 | 0 | return false; |
630 | 1.46k | } |
631 | | |
632 | | bool Prog::SearchNFA(absl::string_view text, absl::string_view context, |
633 | | Anchor anchor, MatchKind kind, absl::string_view* match, |
634 | 1.46k | int nmatch) { |
635 | 1.46k | if (ExtraDebug) |
636 | 0 | Dump(); |
637 | | |
638 | 1.46k | NFA nfa(this); |
639 | 1.46k | absl::string_view sp; |
640 | 1.46k | if (kind == kFullMatch) { |
641 | 1.46k | anchor = kAnchored; |
642 | 1.46k | if (nmatch == 0) { |
643 | 0 | match = &sp; |
644 | 0 | nmatch = 1; |
645 | 0 | } |
646 | 1.46k | } |
647 | 1.46k | if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch)) |
648 | 0 | return false; |
649 | 1.46k | if (kind == kFullMatch && EndPtr(match[0]) != EndPtr(text)) |
650 | 0 | return false; |
651 | 1.46k | return true; |
652 | 1.46k | } |
653 | | |
654 | | // For each instruction i in the program reachable from the start, compute the |
655 | | // number of instructions reachable from i by following only empty transitions |
656 | | // and record that count as fanout[i]. |
657 | | // |
658 | | // fanout holds the results and is also the work queue for the outer iteration. |
659 | | // reachable holds the reached nodes for the inner iteration. |
660 | 18.8k | void Prog::Fanout(SparseArray<int>* fanout) { |
661 | 18.8k | ABSL_DCHECK_EQ(fanout->max_size(), size()); |
662 | 18.8k | SparseSet reachable(size()); |
663 | 18.8k | fanout->clear(); |
664 | 18.8k | fanout->set_new(start(), 0); |
665 | 4.10M | for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) { |
666 | 4.08M | int* count = &i->value(); |
667 | 4.08M | reachable.clear(); |
668 | 4.08M | reachable.insert(i->index()); |
669 | 66.9M | for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { |
670 | 62.8M | int id = *j; |
671 | 62.8M | Prog::Inst* ip = inst(id); |
672 | 62.8M | switch (ip->opcode()) { |
673 | 0 | default: |
674 | 0 | ABSL_LOG(DFATAL) << "unhandled " << ip->opcode() |
675 | 0 | << " in Prog::Fanout()"; |
676 | 0 | break; |
677 | | |
678 | 26.0M | case kInstByteRange: |
679 | 26.0M | if (!ip->last()) |
680 | 19.1M | reachable.insert(id+1); |
681 | | |
682 | 26.0M | (*count)++; |
683 | 26.0M | if (!fanout->has_index(ip->out())) { |
684 | 4.06M | fanout->set_new(ip->out(), 0); |
685 | 4.06M | } |
686 | 26.0M | break; |
687 | | |
688 | 4.75k | case kInstAltMatch: |
689 | 4.75k | ABSL_DCHECK(!ip->last()); |
690 | 4.75k | reachable.insert(id+1); |
691 | 4.75k | break; |
692 | | |
693 | 1.79M | case kInstCapture: |
694 | 2.61M | case kInstEmptyWidth: |
695 | 36.7M | case kInstNop: |
696 | 36.7M | if (!ip->last()) |
697 | 19.9M | reachable.insert(id+1); |
698 | | |
699 | 36.7M | reachable.insert(ip->out()); |
700 | 36.7M | break; |
701 | | |
702 | 107k | case kInstMatch: |
703 | 107k | if (!ip->last()) |
704 | 34.4k | reachable.insert(id+1); |
705 | 107k | break; |
706 | | |
707 | 3.70k | case kInstFail: |
708 | 3.70k | break; |
709 | 62.8M | } |
710 | 62.8M | } |
711 | 4.08M | } |
712 | 18.8k | } |
713 | | |
714 | | } // namespace re2 |