Line  | Count  | Source (jump to first uncovered line)  | 
1  |  | // Copyright 2008 The RE2 Authors.  All Rights Reserved.  | 
2  |  | // Use of this source code is governed by a BSD-style  | 
3  |  | // license that can be found in the LICENSE file.  | 
4  |  |  | 
5  |  | // Tested by search_test.cc.  | 
6  |  | //  | 
7  |  | // Prog::SearchOnePass is an efficient implementation of  | 
8  |  | // regular expression search with submatch tracking for  | 
9  |  | // what I call "one-pass regular expressions".  (An alternate  | 
10  |  | // name might be "backtracking-free regular expressions".)  | 
11  |  | //  | 
12  |  | // One-pass regular expressions have the property that  | 
13  |  | // at each input byte during an anchored match, there may be  | 
14  |  | // multiple alternatives but only one can proceed for any  | 
15  |  | // given input byte.  | 
16  |  | //  | 
17  |  | // For example, the regexp /x*yx*/ is one-pass: you read  | 
18  |  | // x's until a y, then you read the y, then you keep reading x's.  | 
19  |  | // At no point do you have to guess what to do or back up  | 
20  |  | // and try a different guess.  | 
21  |  | //  | 
22  |  | // On the other hand, /x*x/ is not one-pass: when you're  | 
23  |  | // looking at an input "x", it's not clear whether you should  | 
24  |  | // use it to extend the x* or as the final x.  | 
25  |  | //  | 
26  |  | // More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.  | 
27  |  | // /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.  | 
28  |  | //  | 
29  |  | // A simple intuition for identifying one-pass regular expressions  | 
30  |  | // is that it's always immediately obvious when a repetition ends.  | 
31  |  | // It must also be immediately obvious which branch of an | to take:  | 
32  |  | //  | 
33  |  | // /x(y|z)/ is one-pass, but /(xy|xz)/ is not.  | 
34  |  | //  | 
35  |  | // The NFA-based search in nfa.cc does some bookkeeping to  | 
36  |  | // avoid the need for backtracking and its associated exponential blowup.  | 
37  |  | // But if we have a one-pass regular expression, there is no  | 
38  |  | // possibility of backtracking, so there is no need for the  | 
39  |  | // extra bookkeeping.  Hence, this code.  | 
40  |  | //  | 
41  |  | // On a one-pass regular expression, the NFA code in nfa.cc  | 
42  |  | // runs at about 1/20 of the backtracking-based PCRE speed.  | 
43  |  | // In contrast, the code in this file runs at about the same  | 
44  |  | // speed as PCRE.  | 
45  |  | //  | 
46  |  | // One-pass regular expressions get used a lot when RE is  | 
47  |  | // used for parsing simple strings, so it pays off to  | 
48  |  | // notice them and handle them efficiently.  | 
49  |  | //  | 
50  |  | // See also Anne Brüggemann-Klein and Derick Wood,  | 
51  |  | // "One-unambiguous regular languages", Information and Computation 142(2).  | 
52  |  |  | 
53  |  | #include <stdint.h>  | 
54  |  | #include <string.h>  | 
55  |  | #include <algorithm>  | 
56  |  | #include <map>  | 
57  |  | #include <string>  | 
58  |  | #include <vector>  | 
59  |  |  | 
60  |  | #include "util/util.h"  | 
61  |  | #include "util/logging.h"  | 
62  |  | #include "util/strutil.h"  | 
63  |  | #include "util/utf.h"  | 
64  |  | #include "re2/pod_array.h"  | 
65  |  | #include "re2/prog.h"  | 
66  |  | #include "re2/sparse_set.h"  | 
67  |  | #include "re2/stringpiece.h"  | 
68  |  |  | 
69  |  | // Silence "zero-sized array in struct/union" warning for OneState::action.  | 
70  |  | #ifdef _MSC_VER  | 
71  |  | #pragma warning(disable: 4200)  | 
72  |  | #endif  | 
73  |  |  | 
74  |  | namespace re2 { | 
75  |  |  | 
76  |  | static const bool ExtraDebug = false;  | 
77  |  |  | 
78  |  | // The key insight behind this implementation is that the  | 
79  |  | // non-determinism in an NFA for a one-pass regular expression  | 
80  |  | // is contained.  To explain what that means, first a  | 
81  |  | // refresher about what regular expression programs look like  | 
82  |  | // and how the usual NFA execution runs.  | 
83  |  | //  | 
84  |  | // In a regular expression program, only the kInstByteRange  | 
85  |  | // instruction processes an input byte c and moves on to the  | 
86  |  | // next byte in the string (it does so if c is in the given range).  | 
87  |  | // The kInstByteRange instructions correspond to literal characters  | 
88  |  | // and character classes in the regular expression.  | 
89  |  | //  | 
90  |  | // The kInstAlt instructions are used as wiring to connect the  | 
91  |  | // kInstByteRange instructions together in interesting ways when  | 
92  |  | // implementing | + and *.  | 
93  |  | // The kInstAlt instruction forks execution, like a goto that  | 
94  |  | // jumps to ip->out() and ip->out1() in parallel.  Each of the  | 
95  |  | // resulting computation paths is called a thread.  | 
96  |  | //  | 
97  |  | // The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --  | 
98  |  | // are interesting in their own right but like kInstAlt they don't  | 
99  |  | // advance the input pointer.  Only kInstByteRange does.  | 
100  |  | //  | 
101  |  | // The automaton execution in nfa.cc runs all the possible  | 
102  |  | // threads of execution in lock-step over the input.  To process  | 
103  |  | // a particular byte, each thread gets run until it either dies  | 
104  |  | // or finds a kInstByteRange instruction matching the byte.  | 
105  |  | // If the latter happens, the thread stops just past the  | 
106  |  | // kInstByteRange instruction (at ip->out()) and waits for  | 
107  |  | // the other threads to finish processing the input byte.  | 
108  |  | // Then, once all the threads have processed that input byte,  | 
109  |  | // the whole process repeats.  The kInstAlt state instruction  | 
110  |  | // might create new threads during input processing, but no  | 
111  |  | // matter what, all the threads stop after a kInstByteRange  | 
112  |  | // and wait for the other threads to "catch up".  | 
113  |  | // Running in lock step like this ensures that the NFA reads  | 
114  |  | // the input string only once.  | 
115  |  | //  | 
116  |  | // Each thread maintains its own set of capture registers  | 
117  |  | // (the string positions at which it executed the kInstCapture  | 
118  |  | // instructions corresponding to capturing parentheses in the  | 
119  |  | // regular expression).  Repeated copying of the capture registers  | 
120  |  | // is the main performance bottleneck in the NFA implementation.  | 
121  |  | //  | 
122  |  | // A regular expression program is "one-pass" if, no matter what  | 
123  |  | // the input string, there is only one thread that makes it  | 
124  |  | // past a kInstByteRange instruction at each input byte.  This means  | 
125  |  | // that there is in some sense only one active thread throughout  | 
126  |  | // the execution.  Other threads might be created during the  | 
127  |  | // processing of an input byte, but they are ephemeral: only one  | 
128  |  | // thread is left to start processing the next input byte.  | 
129  |  | // This is what I meant above when I said the non-determinism  | 
130  |  | // was "contained".  | 
131  |  | //  | 
132  |  | // To execute a one-pass regular expression program, we can build  | 
133  |  | // a DFA (no non-determinism) that has at most as many states as  | 
134  |  | // the NFA (compare this to the possibly exponential number of states  | 
135  |  | // in the general case).  Each state records, for each possible  | 
136  |  | // input byte, the next state along with the conditions required  | 
137  |  | // before entering that state -- empty-width flags that must be true  | 
138  |  | // and capture operations that must be performed.  It also records  | 
139  |  | // whether a set of conditions required to finish a match at that  | 
140  |  | // point in the input rather than process the next byte.  | 
141  |  |  | 
142  |  | // A state in the one-pass NFA - just an array of actions indexed  | 
143  |  | // by the bytemap_[] of the next input byte.  (The bytemap  | 
144  |  | // maps next input bytes into equivalence classes, to reduce  | 
145  |  | // the memory footprint.)  | 
146  |  | struct OneState { | 
147  |  |   uint32_t matchcond;   // conditions to match right now.  | 
148  |  |   uint32_t action[];  | 
149  |  | };  | 
150  |  |  | 
151  |  | // The uint32_t conditions in the action are a combination of  | 
152  |  | // condition and capture bits and the next state.  The bottom 16 bits  | 
153  |  | // are the condition and capture bits, and the top 16 are the index of  | 
154  |  | // the next state.  | 
155  |  | //  | 
156  |  | // Bits 0-5 are the empty-width flags from prog.h.  | 
157  |  | // Bit 6 is kMatchWins, which means the match takes  | 
158  |  | // priority over moving to next in a first-match search.  | 
159  |  | // The remaining bits mark capture registers that should  | 
160  |  | // be set to the current input position.  The capture bits  | 
161  |  | // start at index 2, since the search loop can take care of  | 
162  |  | // cap[0], cap[1] (the overall match position).  | 
163  |  | // That means we can handle up to 5 capturing parens: $1 through $4, plus $0.  | 
164  |  | // No input position can satisfy both kEmptyWordBoundary  | 
165  |  | // and kEmptyNonWordBoundary, so we can use that as a sentinel  | 
166  |  | // instead of needing an extra bit.  | 
167  |  |  | 
168  |  | static const int    kIndexShift   = 16;  // number of bits below index  | 
169  |  | static const int    kEmptyShift   = 6;   // number of empty flags in prog.h  | 
170  |  | static const int    kRealCapShift = kEmptyShift + 1;  | 
171  |  | static const int    kRealMaxCap   = (kIndexShift - kRealCapShift) / 2 * 2;  | 
172  |  |  | 
173  |  | // Parameters used to skip over cap[0], cap[1].  | 
174  |  | static const int    kCapShift     = kRealCapShift - 2;  | 
175  |  | static const int    kMaxCap       = kRealMaxCap + 2;  | 
176  |  |  | 
177  |  | static const uint32_t kMatchWins  = 1 << kEmptyShift;  | 
178  |  | static const uint32_t kCapMask    = ((1 << kRealMaxCap) - 1) << kRealCapShift;  | 
179  |  |  | 
180  |  | static const uint32_t kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;  | 
181  |  |  | 
182  |  | // Check, at compile time, that prog.h agrees with math above.  | 
183  |  | // This function is never called.  | 
184  | 0  | void OnePass_Checks() { | 
185  | 0  |   static_assert((1<<kEmptyShift)-1 == kEmptyAllFlags,  | 
186  | 0  |                 "kEmptyShift disagrees with kEmptyAllFlags");  | 
187  |  |   // kMaxCap counts pointers, kMaxOnePassCapture counts pairs.  | 
188  | 0  |   static_assert(kMaxCap == Prog::kMaxOnePassCapture*2,  | 
189  | 0  |                 "kMaxCap disagrees with kMaxOnePassCapture");  | 
190  | 0  | }  | 
191  |  |  | 
192  | 2.96k  | static bool Satisfy(uint32_t cond, const StringPiece& context, const char* p) { | 
193  | 2.96k  |   uint32_t satisfied = Prog::EmptyFlags(context, p);  | 
194  | 2.96k  |   if (cond & kEmptyAllFlags & ~satisfied)  | 
195  | 2.93k  |     return false;  | 
196  | 34  |   return true;  | 
197  | 2.96k  | }  | 
198  |  |  | 
199  |  | // Apply the capture bits in cond, saving p to the appropriate  | 
200  |  | // locations in cap[].  | 
201  |  | static void ApplyCaptures(uint32_t cond, const char* p,  | 
202  | 0  |                           const char** cap, int ncap) { | 
203  | 0  |   for (int i = 2; i < ncap; i++)  | 
204  | 0  |     if (cond & (1 << kCapShift << i))  | 
205  | 0  |       cap[i] = p;  | 
206  | 0  | }  | 
207  |  |  | 
208  |  | // Computes the OneState* for the given nodeindex.  | 
209  |  | static inline OneState* IndexToNode(uint8_t* nodes, int statesize,  | 
210  | 5.72M  |                                     int nodeindex) { | 
211  | 5.72M  |   return reinterpret_cast<OneState*>(nodes + statesize*nodeindex);  | 
212  | 5.72M  | }  | 
213  |  |  | 
214  |  | bool Prog::SearchOnePass(const StringPiece& text,  | 
215  |  |                          const StringPiece& const_context,  | 
216  |  |                          Anchor anchor, MatchKind kind,  | 
217  | 3.66k  |                          StringPiece* match, int nmatch) { | 
218  | 3.66k  |   if (anchor != kAnchored && kind != kFullMatch) { | 
219  | 0  |     LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";  | 
220  | 0  |     return false;  | 
221  | 0  |   }  | 
222  |  |  | 
223  |  |   // Make sure we have at least cap[1],  | 
224  |  |   // because we use it to tell if we matched.  | 
225  | 3.66k  |   int ncap = 2*nmatch;  | 
226  | 3.66k  |   if (ncap < 2)  | 
227  | 3.66k  |     ncap = 2;  | 
228  |  |  | 
229  | 3.66k  |   const char* cap[kMaxCap];  | 
230  | 10.9k  |   for (int i = 0; i < ncap; i++)  | 
231  | 7.33k  |     cap[i] = NULL;  | 
232  |  |  | 
233  | 3.66k  |   const char* matchcap[kMaxCap];  | 
234  | 10.9k  |   for (int i = 0; i < ncap; i++)  | 
235  | 7.33k  |     matchcap[i] = NULL;  | 
236  |  |  | 
237  | 3.66k  |   StringPiece context = const_context;  | 
238  | 3.66k  |   if (context.data() == NULL)  | 
239  | 0  |     context = text;  | 
240  | 3.66k  |   if (anchor_start() && BeginPtr(context) != BeginPtr(text))  | 
241  | 5  |     return false;  | 
242  | 3.66k  |   if (anchor_end() && EndPtr(context) != EndPtr(text))  | 
243  | 0  |     return false;  | 
244  | 3.66k  |   if (anchor_end())  | 
245  | 85  |     kind = kFullMatch;  | 
246  |  |  | 
247  | 3.66k  |   uint8_t* nodes = onepass_nodes_.data();  | 
248  | 3.66k  |   int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);  | 
249  |  |   // start() is always mapped to the zeroth OneState.  | 
250  | 3.66k  |   OneState* state = IndexToNode(nodes, statesize, 0);  | 
251  | 3.66k  |   uint8_t* bytemap = bytemap_;  | 
252  | 3.66k  |   const char* bp = text.data();  | 
253  | 3.66k  |   const char* ep = text.data() + text.size();  | 
254  | 3.66k  |   const char* p;  | 
255  | 3.66k  |   bool matched = false;  | 
256  | 3.66k  |   matchcap[0] = bp;  | 
257  | 3.66k  |   cap[0] = bp;  | 
258  | 3.66k  |   uint32_t nextmatchcond = state->matchcond;  | 
259  | 12.8k  |   for (p = bp; p < ep; p++) { | 
260  | 12.0k  |     int c = bytemap[*p & 0xFF];  | 
261  | 12.0k  |     uint32_t matchcond = nextmatchcond;  | 
262  | 12.0k  |     uint32_t cond = state->action[c];  | 
263  |  |  | 
264  |  |     // Determine whether we can reach act->next.  | 
265  |  |     // If so, advance state and nextmatchcond.  | 
266  | 12.0k  |     if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { | 
267  | 9.13k  |       uint32_t nextindex = cond >> kIndexShift;  | 
268  | 9.13k  |       state = IndexToNode(nodes, statesize, nextindex);  | 
269  | 9.13k  |       nextmatchcond = state->matchcond;  | 
270  | 9.13k  |     } else { | 
271  | 2.92k  |       state = NULL;  | 
272  | 2.92k  |       nextmatchcond = kImpossible;  | 
273  | 2.92k  |     }  | 
274  |  |  | 
275  |  |     // This code section is carefully tuned.  | 
276  |  |     // The goto sequence is about 10% faster than the  | 
277  |  |     // obvious rewrite as a large if statement in the  | 
278  |  |     // ASCIIMatchRE2 and DotMatchRE2 benchmarks.  | 
279  |  |  | 
280  |  |     // Saving the match capture registers is expensive.  | 
281  |  |     // Is this intermediate match worth thinking about?  | 
282  |  |  | 
283  |  |     // Not if we want a full match.  | 
284  | 12.0k  |     if (kind == kFullMatch)  | 
285  | 12.0k  |       goto skipmatch;  | 
286  |  |  | 
287  |  |     // Not if it's impossible.  | 
288  | 0  |     if (matchcond == kImpossible)  | 
289  | 0  |       goto skipmatch;  | 
290  |  |  | 
291  |  |     // Not if the possible match is beaten by the certain  | 
292  |  |     // match at the next byte.  When this test is useless  | 
293  |  |     // (e.g., HTTPPartialMatchRE2) it slows the loop by  | 
294  |  |     // about 10%, but when it avoids work (e.g., DotMatchRE2),  | 
295  |  |     // it cuts the loop execution by about 45%.  | 
296  | 0  |     if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)  | 
297  | 0  |       goto skipmatch;  | 
298  |  |  | 
299  |  |     // Finally, the match conditions must be satisfied.  | 
300  | 0  |     if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { | 
301  | 0  |       for (int i = 2; i < 2*nmatch; i++)  | 
302  | 0  |         matchcap[i] = cap[i];  | 
303  | 0  |       if (nmatch > 1 && (matchcond & kCapMask))  | 
304  | 0  |         ApplyCaptures(matchcond, p, matchcap, ncap);  | 
305  | 0  |       matchcap[1] = p;  | 
306  | 0  |       matched = true;  | 
307  |  |  | 
308  |  |       // If we're in longest match mode, we have to keep  | 
309  |  |       // going and see if we find a longer match.  | 
310  |  |       // In first match mode, we can stop if the match  | 
311  |  |       // takes priority over the next state for this input byte.  | 
312  |  |       // That bit is per-input byte and thus in cond, not matchcond.  | 
313  | 0  |       if (kind == kFirstMatch && (cond & kMatchWins))  | 
314  | 0  |         goto done;  | 
315  | 0  |     }  | 
316  |  |  | 
317  | 12.0k  |   skipmatch:  | 
318  | 12.0k  |     if (state == NULL)  | 
319  | 2.92k  |       goto done;  | 
320  | 9.13k  |     if ((cond & kCapMask) && nmatch > 1)  | 
321  | 0  |       ApplyCaptures(cond, p, cap, ncap);  | 
322  | 9.13k  |   }  | 
323  |  |  | 
324  |  |   // Look for match at end of input.  | 
325  | 736  |   { | 
326  | 736  |     uint32_t matchcond = state->matchcond;  | 
327  | 736  |     if (matchcond != kImpossible &&  | 
328  | 736  |         ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { | 
329  | 726  |       if (nmatch > 1 && (matchcond & kCapMask))  | 
330  | 0  |         ApplyCaptures(matchcond, p, cap, ncap);  | 
331  | 726  |       for (int i = 2; i < ncap; i++)  | 
332  | 0  |         matchcap[i] = cap[i];  | 
333  | 726  |       matchcap[1] = p;  | 
334  | 726  |       matched = true;  | 
335  | 726  |     }  | 
336  | 736  |   }  | 
337  |  |  | 
338  | 3.66k  | done:  | 
339  | 3.66k  |   if (!matched)  | 
340  | 2.93k  |     return false;  | 
341  | 726  |   for (int i = 0; i < nmatch; i++)  | 
342  | 0  |     match[i] =  | 
343  | 0  |         StringPiece(matchcap[2 * i],  | 
344  | 0  |                     static_cast<size_t>(matchcap[2 * i + 1] - matchcap[2 * i]));  | 
345  | 726  |   return true;  | 
346  | 3.66k  | }  | 
347  |  |  | 
348  |  |  | 
349  |  | // Analysis to determine whether a given regexp program is one-pass.  | 
350  |  |  | 
351  |  | // If ip is not on workq, adds ip to work queue and returns true.  | 
352  |  | // If ip is already on work queue, does nothing and returns false.  | 
353  |  | // If ip is NULL, does nothing and returns true (pretends to add it).  | 
354  |  | typedef SparseSet Instq;  | 
355  | 4.62M  | static bool AddQ(Instq *q, int id) { | 
356  | 4.62M  |   if (id == 0)  | 
357  | 3.64k  |     return true;  | 
358  | 4.62M  |   if (q->contains(id))  | 
359  | 480  |     return false;  | 
360  | 4.62M  |   q->insert(id);  | 
361  | 4.62M  |   return true;  | 
362  | 4.62M  | }  | 
363  |  |  | 
364  |  | struct InstCond { | 
365  |  |   int id;  | 
366  |  |   uint32_t cond;  | 
367  |  | };  | 
368  |  |  | 
369  |  | // Returns whether this is a one-pass program; that is,  | 
370  |  | // returns whether it is safe to use SearchOnePass on this program.  | 
371  |  | // These conditions must be true for any instruction ip:  | 
372  |  | //  | 
373  |  | //   (1) for any other Inst nip, there is at most one input-free  | 
374  |  | //       path from ip to nip.  | 
375  |  | //   (2) there is at most one kInstByte instruction reachable from  | 
376  |  | //       ip that matches any particular byte c.  | 
377  |  | //   (3) there is at most one input-free path from ip to a kInstMatch  | 
378  |  | //       instruction.  | 
379  |  | //  | 
380  |  | // This is actually just a conservative approximation: it might  | 
381  |  | // return false when the answer is true, when kInstEmptyWidth  | 
382  |  | // instructions are involved.  | 
383  |  | // Constructs and saves corresponding one-pass NFA on success.  | 
384  | 10.9k  | bool Prog::IsOnePass() { | 
385  | 10.9k  |   if (did_onepass_)  | 
386  | 0  |     return onepass_nodes_.data() != NULL;  | 
387  | 10.9k  |   did_onepass_ = true;  | 
388  |  |  | 
389  | 10.9k  |   if (start() == 0)  // no match  | 
390  | 229  |     return false;  | 
391  |  |  | 
392  |  |   // Steal memory for the one-pass NFA from the overall DFA budget.  | 
393  |  |   // Willing to use at most 1/4 of the DFA budget (heuristic).  | 
394  |  |   // Limit max node count to 65000 as a conservative estimate to  | 
395  |  |   // avoid overflowing 16-bit node index in encoding.  | 
396  | 10.7k  |   int maxnodes = 2 + inst_count(kInstByteRange);  | 
397  | 10.7k  |   int statesize = sizeof(OneState) + bytemap_range()*sizeof(uint32_t);  | 
398  | 10.7k  |   if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)  | 
399  | 587  |     return false;  | 
400  |  |  | 
401  |  |   // Flood the graph starting at the start state, and check  | 
402  |  |   // that in each reachable state, each possible byte leads  | 
403  |  |   // to a unique next state.  | 
404  | 10.1k  |   int stacksize = inst_count(kInstCapture) +  | 
405  | 10.1k  |                   inst_count(kInstEmptyWidth) +  | 
406  | 10.1k  |                   inst_count(kInstNop) + 1;  // + 1 for start inst  | 
407  | 10.1k  |   PODArray<InstCond> stack(stacksize);  | 
408  |  |  | 
409  | 10.1k  |   int size = this->size();  | 
410  | 10.1k  |   PODArray<int> nodebyid(size);  // indexed by ip  | 
411  | 10.1k  |   memset(nodebyid.data(), 0xFF, size*sizeof nodebyid[0]);  | 
412  |  |  | 
413  |  |   // Originally, nodes was a uint8_t[maxnodes*statesize], but that was  | 
414  |  |   // unnecessarily optimistic: why allocate a large amount of memory  | 
415  |  |   // upfront for a large program when it is unlikely to be one-pass?  | 
416  | 10.1k  |   std::vector<uint8_t> nodes;  | 
417  |  |  | 
418  | 10.1k  |   Instq tovisit(size), workq(size);  | 
419  | 10.1k  |   AddQ(&tovisit, start());  | 
420  | 10.1k  |   nodebyid[start()] = 0;  | 
421  | 10.1k  |   int nalloc = 1;  | 
422  | 10.1k  |   nodes.insert(nodes.end(), statesize, 0);  | 
423  | 2.86M  |   for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { | 
424  | 2.85M  |     int id = *it;  | 
425  | 2.85M  |     int nodeindex = nodebyid[id];  | 
426  | 2.85M  |     OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);  | 
427  |  |  | 
428  |  |     // Flood graph using manual stack, filling in actions as found.  | 
429  |  |     // Default is none.  | 
430  | 46.7M  |     for (int b = 0; b < bytemap_range_; b++)  | 
431  | 43.9M  |       node->action[b] = kImpossible;  | 
432  | 2.85M  |     node->matchcond = kImpossible;  | 
433  |  |  | 
434  | 2.85M  |     workq.clear();  | 
435  | 2.85M  |     bool matched = false;  | 
436  | 2.85M  |     int nstack = 0;  | 
437  | 2.85M  |     stack[nstack].id = id;  | 
438  | 2.85M  |     stack[nstack++].cond = 0;  | 
439  | 5.81M  |     while (nstack > 0) { | 
440  | 2.95M  |       int id = stack[--nstack].id;  | 
441  | 2.95M  |       uint32_t cond = stack[nstack].cond;  | 
442  |  |  | 
443  | 4.58M  |     Loop:  | 
444  | 4.58M  |       Prog::Inst* ip = inst(id);  | 
445  | 4.58M  |       switch (ip->opcode()) { | 
446  | 0  |         default:  | 
447  | 0  |           LOG(DFATAL) << "unhandled opcode: " << ip->opcode();  | 
448  | 0  |           break;  | 
449  |  |  | 
450  | 63  |         case kInstAltMatch:  | 
451  |  |           // TODO(rsc): Ignoring kInstAltMatch optimization.  | 
452  |  |           // Should implement it in this engine, but it's subtle.  | 
453  | 63  |           DCHECK(!ip->last());  | 
454  |  |           // If already on work queue, (1) is violated: bail out.  | 
455  | 63  |           if (!AddQ(&workq, id+1))  | 
456  | 0  |             goto fail;  | 
457  | 63  |           id = id+1;  | 
458  | 63  |           goto Loop;  | 
459  |  |  | 
460  | 3.87M  |         case kInstByteRange: { | 
461  | 3.87M  |           int nextindex = nodebyid[ip->out()];  | 
462  | 3.87M  |           if (nextindex == -1) { | 
463  | 2.85M  |             if (nalloc >= maxnodes) { | 
464  | 0  |               if (ExtraDebug)  | 
465  | 0  |                 LOG(ERROR) << StringPrintf(  | 
466  | 0  |                     "Not OnePass: hit node limit %d >= %d", nalloc, maxnodes);  | 
467  | 0  |               goto fail;  | 
468  | 0  |             }  | 
469  | 2.85M  |             nextindex = nalloc;  | 
470  | 2.85M  |             AddQ(&tovisit, ip->out());  | 
471  | 2.85M  |             nodebyid[ip->out()] = nalloc;  | 
472  | 2.85M  |             nalloc++;  | 
473  | 2.85M  |             nodes.insert(nodes.end(), statesize, 0);  | 
474  |  |             // Update node because it might have been invalidated.  | 
475  | 2.85M  |             node = IndexToNode(nodes.data(), statesize, nodeindex);  | 
476  | 2.85M  |           }  | 
477  | 11.5M  |           for (int c = ip->lo(); c <= ip->hi(); c++) { | 
478  | 7.65M  |             int b = bytemap_[c];  | 
479  |  |             // Skip any bytes immediately after c that are also in b.  | 
480  | 26.7M  |             while (c < 256-1 && bytemap_[c+1] == b)  | 
481  | 19.0M  |               c++;  | 
482  | 7.65M  |             uint32_t act = node->action[b];  | 
483  | 7.65M  |             uint32_t newact = (nextindex << kIndexShift) | cond;  | 
484  | 7.65M  |             if (matched)  | 
485  | 21.9k  |               newact |= kMatchWins;  | 
486  | 7.65M  |             if ((act & kImpossible) == kImpossible) { | 
487  | 6.61M  |               node->action[b] = newact;  | 
488  | 6.61M  |             } else if (act != newact) { | 
489  | 3.29k  |               if (ExtraDebug)  | 
490  | 0  |                 LOG(ERROR) << StringPrintf(  | 
491  | 0  |                     "Not OnePass: conflict on byte %#x at state %d", c, *it);  | 
492  | 3.29k  |               goto fail;  | 
493  | 3.29k  |             }  | 
494  | 7.65M  |           }  | 
495  | 3.87M  |           if (ip->foldcase()) { | 
496  | 2.20M  |             Rune lo = std::max<Rune>(ip->lo(), 'a') + 'A' - 'a';  | 
497  | 2.20M  |             Rune hi = std::min<Rune>(ip->hi(), 'z') + 'A' - 'a';  | 
498  | 3.28M  |             for (int c = lo; c <= hi; c++) { | 
499  | 1.07M  |               int b = bytemap_[c];  | 
500  |  |               // Skip any bytes immediately after c that are also in b.  | 
501  | 2.03M  |               while (c < 256-1 && bytemap_[c+1] == b)  | 
502  | 956k  |                 c++;  | 
503  | 1.07M  |               uint32_t act = node->action[b];  | 
504  | 1.07M  |               uint32_t newact = (nextindex << kIndexShift) | cond;  | 
505  | 1.07M  |               if (matched)  | 
506  | 8.03k  |                 newact |= kMatchWins;  | 
507  | 1.07M  |               if ((act & kImpossible) == kImpossible) { | 
508  | 8.45k  |                 node->action[b] = newact;  | 
509  | 1.06M  |               } else if (act != newact) { | 
510  | 4  |                 if (ExtraDebug)  | 
511  | 0  |                   LOG(ERROR) << StringPrintf(  | 
512  | 0  |                       "Not OnePass: conflict on byte %#x at state %d", c, *it);  | 
513  | 4  |                 goto fail;  | 
514  | 4  |               }  | 
515  | 1.07M  |             }  | 
516  | 2.20M  |           }  | 
517  |  |  | 
518  | 3.87M  |           if (ip->last())  | 
519  | 2.92M  |             break;  | 
520  |  |           // If already on work queue, (1) is violated: bail out.  | 
521  | 949k  |           if (!AddQ(&workq, id+1))  | 
522  | 6  |             goto fail;  | 
523  | 948k  |           id = id+1;  | 
524  | 948k  |           goto Loop;  | 
525  | 949k  |         }  | 
526  |  |  | 
527  | 308k  |         case kInstCapture:  | 
528  | 431k  |         case kInstEmptyWidth:  | 
529  | 681k  |         case kInstNop:  | 
530  | 681k  |           if (!ip->last()) { | 
531  |  |             // If already on work queue, (1) is violated: bail out.  | 
532  | 129k  |             if (!AddQ(&workq, id+1))  | 
533  | 4  |               goto fail;  | 
534  | 129k  |             stack[nstack].id = id+1;  | 
535  | 129k  |             stack[nstack++].cond = cond;  | 
536  | 129k  |           }  | 
537  |  |  | 
538  | 681k  |           if (ip->opcode() == kInstCapture && ip->cap() < kMaxCap)  | 
539  | 293k  |             cond |= (1 << kCapShift) << ip->cap();  | 
540  | 681k  |           if (ip->opcode() == kInstEmptyWidth)  | 
541  | 123k  |             cond |= ip->empty();  | 
542  |  |  | 
543  |  |           // kInstCapture and kInstNop always proceed to ip->out().  | 
544  |  |           // kInstEmptyWidth only sometimes proceeds to ip->out(),  | 
545  |  |           // but as a conservative approximation we assume it always does.  | 
546  |  |           // We could be a little more precise by looking at what c  | 
547  |  |           // is, but that seems like overkill.  | 
548  |  |  | 
549  |  |           // If already on work queue, (1) is violated: bail out.  | 
550  | 681k  |           if (!AddQ(&workq, ip->out())) { | 
551  | 470  |             if (ExtraDebug)  | 
552  | 0  |               LOG(ERROR) << StringPrintf(  | 
553  | 0  |                   "Not OnePass: multiple paths %d -> %d", *it, ip->out());  | 
554  | 470  |             goto fail;  | 
555  | 470  |           }  | 
556  | 681k  |           id = ip->out();  | 
557  | 681k  |           goto Loop;  | 
558  |  |  | 
559  | 24.9k  |         case kInstMatch:  | 
560  | 24.9k  |           if (matched) { | 
561  |  |             // (3) is violated  | 
562  | 1  |             if (ExtraDebug)  | 
563  | 0  |               LOG(ERROR) << StringPrintf(  | 
564  | 0  |                   "Not OnePass: multiple matches from %d", *it);  | 
565  | 1  |             goto fail;  | 
566  | 1  |           }  | 
567  | 24.9k  |           matched = true;  | 
568  | 24.9k  |           node->matchcond = cond;  | 
569  |  |  | 
570  | 24.9k  |           if (ip->last())  | 
571  | 24.8k  |             break;  | 
572  |  |           // If already on work queue, (1) is violated: bail out.  | 
573  | 73  |           if (!AddQ(&workq, id+1))  | 
574  | 0  |             goto fail;  | 
575  | 73  |           id = id+1;  | 
576  | 73  |           goto Loop;  | 
577  |  |  | 
578  | 3.64k  |         case kInstFail:  | 
579  | 3.64k  |           break;  | 
580  | 4.58M  |       }  | 
581  | 4.58M  |     }  | 
582  | 2.85M  |   }  | 
583  |  |  | 
584  | 6.34k  |   if (ExtraDebug) {  // For debugging, dump one-pass NFA to LOG(ERROR). | 
585  | 0  |     LOG(ERROR) << "bytemap:\n" << DumpByteMap();  | 
586  | 0  |     LOG(ERROR) << "prog:\n" << Dump();  | 
587  |  | 
  | 
588  | 0  |     std::map<int, int> idmap;  | 
589  | 0  |     for (int i = 0; i < size; i++)  | 
590  | 0  |       if (nodebyid[i] != -1)  | 
591  | 0  |         idmap[nodebyid[i]] = i;  | 
592  |  | 
  | 
593  | 0  |     std::string dump;  | 
594  | 0  |     for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { | 
595  | 0  |       int id = *it;  | 
596  | 0  |       int nodeindex = nodebyid[id];  | 
597  | 0  |       if (nodeindex == -1)  | 
598  | 0  |         continue;  | 
599  | 0  |       OneState* node = IndexToNode(nodes.data(), statesize, nodeindex);  | 
600  | 0  |       dump += StringPrintf("node %d id=%d: matchcond=%#x\n", | 
601  | 0  |                            nodeindex, id, node->matchcond);  | 
602  | 0  |       for (int i = 0; i < bytemap_range_; i++) { | 
603  | 0  |         if ((node->action[i] & kImpossible) == kImpossible)  | 
604  | 0  |           continue;  | 
605  | 0  |         dump += StringPrintf("  %d cond %#x -> %d id=%d\n", | 
606  | 0  |                              i, node->action[i] & 0xFFFF,  | 
607  | 0  |                              node->action[i] >> kIndexShift,  | 
608  | 0  |                              idmap[node->action[i] >> kIndexShift]);  | 
609  | 0  |       }  | 
610  | 0  |     }  | 
611  | 0  |     LOG(ERROR) << "nodes:\n" << dump;  | 
612  | 0  |   }  | 
613  |  |  | 
614  | 6.34k  |   dfa_mem_ -= nalloc*statesize;  | 
615  | 6.34k  |   onepass_nodes_ = PODArray<uint8_t>(nalloc*statesize);  | 
616  | 6.34k  |   memmove(onepass_nodes_.data(), nodes.data(), nalloc*statesize);  | 
617  | 6.34k  |   return true;  | 
618  |  |  | 
619  | 3.78k  | fail:  | 
620  | 3.78k  |   return false;  | 
621  | 10.1k  | }  | 
622  |  |  | 
623  |  | }  // namespace re2  |