Line | Count | Source |
1 | | // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Compile regular expression to Prog. |
6 | | // |
7 | | // Prog and Inst are defined in prog.h. |
8 | | // This file's external interface is just Regexp::CompileToProg. |
9 | | // The Compiler class defined in this file is private. |
10 | | |
11 | | #include <stdint.h> |
12 | | #include <string.h> |
13 | | |
14 | | #include <string> |
15 | | #include <utility> |
16 | | |
17 | | #include "absl/container/flat_hash_map.h" |
18 | | #include "absl/log/absl_check.h" |
19 | | #include "absl/log/absl_log.h" |
20 | | #include "absl/strings/string_view.h" |
21 | | #include "re2/pod_array.h" |
22 | | #include "re2/prog.h" |
23 | | #include "re2/re2.h" |
24 | | #include "re2/regexp.h" |
25 | | #include "re2/walker-inl.h" |
26 | | #include "util/utf.h" |
27 | | |
28 | | namespace re2 { |
29 | | |
30 | | // List of pointers to Inst* that need to be filled in (patched). |
31 | | // Because the Inst* haven't been filled in yet, |
32 | | // we can use the Inst* word to hold the list's "next" pointer. |
33 | | // It's kind of sleazy, but it works well in practice. |
34 | | // See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. |
35 | | // |
36 | | // Because the out and out1 fields in Inst are no longer pointers, |
37 | | // we can't use pointers directly here either. Instead, head refers |
38 | | // to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1). |
39 | | // head == 0 represents the NULL list. This is okay because instruction #0 |
40 | | // is always the fail instruction, which never appears on a list. |
41 | | struct PatchList { |
42 | | // Returns patch list containing just p. |
43 | 60.1M | static PatchList Mk(uint32_t p) { |
44 | 60.1M | return {p, p}; |
45 | 60.1M | } |
46 | | |
47 | | // Patches all the entries on l to have value p. |
48 | | // Caller must not ever use patch list again. |
49 | 34.2M | static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) { |
50 | 74.8M | while (l.head != 0) { |
51 | 40.6M | Prog::Inst* ip = &inst0[l.head>>1]; |
52 | 40.6M | if (l.head&1) { |
53 | 1.01M | l.head = ip->out1(); |
54 | 1.01M | ip->out1_ = p; |
55 | 39.6M | } else { |
56 | 39.6M | l.head = ip->out(); |
57 | 39.6M | ip->set_out(p); |
58 | 39.6M | } |
59 | 40.6M | } |
60 | 34.2M | } |
61 | | |
62 | | // Appends two patch lists and returns result. |
63 | 7.71M | static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { |
64 | 7.71M | if (l1.head == 0) |
65 | 487k | return l2; |
66 | 7.22M | if (l2.head == 0) |
67 | 0 | return l1; |
68 | 7.22M | Prog::Inst* ip = &inst0[l1.tail>>1]; |
69 | 7.22M | if (l1.tail&1) |
70 | 923k | ip->out1_ = l2.head; |
71 | 6.30M | else |
72 | 6.30M | ip->set_out(l2.head); |
73 | 7.22M | return {l1.head, l2.tail}; |
74 | 7.22M | } |
75 | | |
76 | | uint32_t head; |
77 | | uint32_t tail; // for constant-time append |
78 | | }; |
79 | | |
80 | | static const PatchList kNullPatchList = {0, 0}; |
81 | | |
82 | | // Compiled program fragment. |
83 | | struct Frag { |
84 | | uint32_t begin; |
85 | | PatchList end; |
86 | | bool nullable; |
87 | | |
88 | 47.2M | Frag() : begin(0), end(kNullPatchList), nullable(false) {} |
89 | | Frag(uint32_t begin, PatchList end, bool nullable) |
90 | 66.1M | : begin(begin), end(end), nullable(nullable) {} |
91 | | }; |
92 | | |
93 | | // Input encodings. |
94 | | enum Encoding { |
95 | | kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) |
96 | | kEncodingLatin1, // Latin-1 (0-FF) |
97 | | }; |
98 | | |
99 | | class Compiler : public Regexp::Walker<Frag> { |
100 | | public: |
101 | | explicit Compiler(); |
102 | | ~Compiler(); |
103 | | |
104 | | // Compiles Regexp to a new Prog. |
105 | | // Caller is responsible for deleting Prog when finished with it. |
106 | | // If reversed is true, compiles for walking over the input |
107 | | // string backward (reverses all concatenations). |
108 | | static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem); |
109 | | |
110 | | // Compiles alternation of all the re to a new Prog. |
111 | | // Each re has a match with an id equal to its index in the vector. |
112 | | static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem); |
113 | | |
114 | | // Interface for Regexp::Walker, which helps traverse the Regexp. |
115 | | // The walk is purely post-recursive: given the machines for the |
116 | | // children, PostVisit combines them to create the machine for |
117 | | // the current node. The child_args are Frags. |
118 | | // The Compiler traverses the Regexp parse tree, visiting |
119 | | // each node in depth-first order. It invokes PreVisit before |
120 | | // visiting the node's children and PostVisit after visiting |
121 | | // the children. |
122 | | Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); |
123 | | Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, |
124 | | int nchild_args); |
125 | | Frag ShortVisit(Regexp* re, Frag parent_arg); |
126 | | Frag Copy(Frag arg); |
127 | | |
128 | | // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? |
129 | | Frag Plus(Frag a, bool nongreedy); |
130 | | Frag Star(Frag a, bool nongreedy); |
131 | | Frag Quest(Frag a, bool nongreedy); |
132 | | |
133 | | // Given fragment a, returns (a) capturing as \n. |
134 | | Frag Capture(Frag a, int n); |
135 | | |
136 | | // Given fragments a and b, returns ab; a|b |
137 | | Frag Cat(Frag a, Frag b); |
138 | | Frag Alt(Frag a, Frag b); |
139 | | |
140 | | // Returns a fragment that can't match anything. |
141 | | Frag NoMatch(); |
142 | | |
143 | | // Returns a fragment that matches the empty string. |
144 | | Frag Match(int32_t id); |
145 | | |
146 | | // Returns a no-op fragment. |
147 | | Frag Nop(); |
148 | | |
149 | | // Returns a fragment matching the byte range lo-hi. |
150 | | Frag ByteRange(int lo, int hi, bool foldcase); |
151 | | |
152 | | // Returns a fragment matching an empty-width special op. |
153 | | Frag EmptyWidth(EmptyOp op); |
154 | | |
155 | | // Adds n instructions to the program. |
156 | | // Returns the index of the first one. |
157 | | // Returns -1 if no more instructions are available. |
158 | | int AllocInst(int n); |
159 | | |
160 | | // Rune range compiler. |
161 | | |
162 | | // Begins a new alternation. |
163 | | void BeginRange(); |
164 | | |
165 | | // Adds a fragment matching the rune range lo-hi. |
166 | | void AddRuneRange(Rune lo, Rune hi, bool foldcase); |
167 | | void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); |
168 | | void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); |
169 | | void Add_80_10ffff(); |
170 | | |
171 | | // New suffix that matches the byte range lo-hi, then goes to next. |
172 | | int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); |
173 | | int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next); |
174 | | |
175 | | // Returns true iff the suffix is cached. |
176 | | bool IsCachedRuneByteSuffix(int id); |
177 | | |
178 | | // Adds a suffix to alternation. |
179 | | void AddSuffix(int id); |
180 | | |
181 | | // Adds a suffix to the trie starting from the given root node. |
182 | | // Returns zero iff allocating an instruction fails. Otherwise, returns |
183 | | // the current root node, which might be different from what was given. |
184 | | int AddSuffixRecursive(int root, int id); |
185 | | |
186 | | // Finds the trie node for the given suffix. Returns a Frag in order to |
187 | | // distinguish between pointing at the root node directly (end.head == 0) |
188 | | // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively). |
189 | | Frag FindByteRange(int root, int id); |
190 | | |
191 | | // Compares two ByteRanges and returns true iff they are equal. |
192 | | bool ByteRangeEqual(int id1, int id2); |
193 | | |
194 | | // Returns the alternation of all the added suffixes. |
195 | | Frag EndRange(); |
196 | | |
197 | | // Single rune. |
198 | | Frag Literal(Rune r, bool foldcase); |
199 | | |
200 | | void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor); |
201 | | Prog* Finish(Regexp* re); |
202 | | |
203 | | // Returns .* where dot = any byte |
204 | | Frag DotStar(); |
205 | | |
206 | | private: |
207 | | Prog* prog_; // Program being built. |
208 | | bool failed_; // Did we give up compiling? |
209 | | Encoding encoding_; // Input encoding |
210 | | bool reversed_; // Should program run backward over text? |
211 | | |
212 | | PODArray<Prog::Inst> inst_; |
213 | | int ninst_; // Number of instructions used. |
214 | | int max_ninst_; // Maximum number of instructions. |
215 | | |
216 | | int64_t max_mem_; // Total memory budget. |
217 | | |
218 | | absl::flat_hash_map<uint64_t, int> rune_cache_; |
219 | | Frag rune_range_; |
220 | | |
221 | | RE2::Anchor anchor_; // anchor mode for RE2::Set |
222 | | |
223 | | Compiler(const Compiler&) = delete; |
224 | | Compiler& operator=(const Compiler&) = delete; |
225 | | }; |
226 | | |
227 | 35.0k | Compiler::Compiler() { |
228 | 35.0k | prog_ = new Prog(); |
229 | 35.0k | failed_ = false; |
230 | 35.0k | encoding_ = kEncodingUTF8; |
231 | 35.0k | reversed_ = false; |
232 | 35.0k | ninst_ = 0; |
233 | 35.0k | max_ninst_ = 1; // make AllocInst for fail instruction okay |
234 | 35.0k | max_mem_ = 0; |
235 | 35.0k | int fail = AllocInst(1); |
236 | 35.0k | inst_[fail].InitFail(); |
237 | 35.0k | max_ninst_ = 0; // Caller must change |
238 | 35.0k | } |
239 | | |
240 | 35.0k | Compiler::~Compiler() { |
241 | 35.0k | delete prog_; |
242 | 35.0k | } |
243 | | |
244 | 56.8M | int Compiler::AllocInst(int n) { |
245 | 56.8M | if (failed_ || ninst_ + n > max_ninst_) { |
246 | 0 | failed_ = true; |
247 | 0 | return -1; |
248 | 0 | } |
249 | | |
250 | 56.8M | if (ninst_ + n > inst_.size()) { |
251 | 146k | int cap = inst_.size(); |
252 | 146k | if (cap == 0) |
253 | 35.0k | cap = 8; |
254 | 257k | while (ninst_ + n > cap) |
255 | 111k | cap *= 2; |
256 | 146k | PODArray<Prog::Inst> inst(cap); |
257 | 146k | if (inst_.data() != NULL) |
258 | 111k | memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]); |
259 | 146k | memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]); |
260 | 146k | inst_ = std::move(inst); |
261 | 146k | } |
262 | 56.8M | int id = ninst_; |
263 | 56.8M | ninst_ += n; |
264 | 56.8M | return id; |
265 | 56.8M | } |
266 | | |
267 | | // These routines are somewhat hard to visualize in text -- |
268 | | // see http://swtch.com/~rsc/regexp/regexp1.html for |
269 | | // pictures explaining what is going on here. |
270 | | |
271 | | // Returns an unmatchable fragment. |
272 | 14.6M | Frag Compiler::NoMatch() { |
273 | 14.6M | return Frag(); |
274 | 14.6M | } |
275 | | |
276 | | // Is a an unmatchable fragment? |
277 | 45.1M | static bool IsNoMatch(Frag a) { |
278 | 45.1M | return a.begin == 0; |
279 | 45.1M | } |
280 | | |
281 | | // Given fragments a and b, returns fragment for ab. |
282 | 4.48M | Frag Compiler::Cat(Frag a, Frag b) { |
283 | 4.48M | if (IsNoMatch(a) || IsNoMatch(b)) |
284 | 47.8k | return NoMatch(); |
285 | | |
286 | | // Elide no-op. |
287 | 4.43M | Prog::Inst* begin = &inst_[a.begin]; |
288 | 4.43M | if (begin->opcode() == kInstNop && |
289 | 6.40k | a.end.head == (a.begin << 1) && |
290 | 4.47k | begin->out() == 0) { |
291 | | // in case refs to a somewhere |
292 | 4.47k | PatchList::Patch(inst_.data(), a.end, b.begin); |
293 | 4.47k | return b; |
294 | 4.47k | } |
295 | | |
296 | | // To run backward over string, reverse all concatenations. |
297 | 4.43M | if (reversed_) { |
298 | 1.07M | PatchList::Patch(inst_.data(), b.end, a.begin); |
299 | 1.07M | return Frag(b.begin, a.end, b.nullable && a.nullable); |
300 | 1.07M | } |
301 | | |
302 | 3.35M | PatchList::Patch(inst_.data(), a.end, b.begin); |
303 | 3.35M | return Frag(a.begin, b.end, a.nullable && b.nullable); |
304 | 4.43M | } |
305 | | |
306 | | // Given fragments for a and b, returns fragment for a|b. |
307 | 540k | Frag Compiler::Alt(Frag a, Frag b) { |
308 | | // Special case for convenience in loops. |
309 | 540k | if (IsNoMatch(a)) |
310 | 1.13k | return b; |
311 | 538k | if (IsNoMatch(b)) |
312 | 2.78k | return a; |
313 | | |
314 | 536k | int id = AllocInst(1); |
315 | 536k | if (id < 0) |
316 | 0 | return NoMatch(); |
317 | | |
318 | 536k | inst_[id].InitAlt(a.begin, b.begin); |
319 | 536k | return Frag(id, PatchList::Append(inst_.data(), a.end, b.end), |
320 | 536k | a.nullable || b.nullable); |
321 | 536k | } |
322 | | |
323 | | // When capturing submatches in like-Perl mode, a kOpAlt Inst |
324 | | // treats out_ as the first choice, out1_ as the second. |
325 | | // |
326 | | // For *, +, and ?, if out_ causes another repetition, |
327 | | // then the operator is greedy. If out1_ is the repetition |
328 | | // (and out_ moves forward), then the operator is non-greedy. |
329 | | |
330 | | // Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) |
331 | 88.9k | Frag Compiler::Plus(Frag a, bool nongreedy) { |
332 | 88.9k | int id = AllocInst(1); |
333 | 88.9k | if (id < 0) |
334 | 0 | return NoMatch(); |
335 | 88.9k | PatchList pl; |
336 | 88.9k | if (nongreedy) { |
337 | 2.72k | inst_[id].InitAlt(0, a.begin); |
338 | 2.72k | pl = PatchList::Mk(id << 1); |
339 | 86.2k | } else { |
340 | 86.2k | inst_[id].InitAlt(a.begin, 0); |
341 | 86.2k | pl = PatchList::Mk((id << 1) | 1); |
342 | 86.2k | } |
343 | 88.9k | PatchList::Patch(inst_.data(), a.end, id); |
344 | 88.9k | return Frag(a.begin, pl, a.nullable); |
345 | 88.9k | } |
346 | | |
347 | | // Given a fragment for a, returns a fragment for a* or a*? (if nongreedy) |
348 | 327k | Frag Compiler::Star(Frag a, bool nongreedy) { |
349 | | // When the subexpression is nullable, one Alt isn't enough to guarantee |
350 | | // correct priority ordering within the transitive closure. The simplest |
351 | | // solution is to handle it as (a+)? instead, which adds the second Alt. |
352 | 327k | if (a.nullable) |
353 | 29.6k | return Quest(Plus(a, nongreedy), nongreedy); |
354 | | |
355 | 298k | int id = AllocInst(1); |
356 | 298k | if (id < 0) |
357 | 0 | return NoMatch(); |
358 | 298k | PatchList pl; |
359 | 298k | if (nongreedy) { |
360 | 34.8k | inst_[id].InitAlt(0, a.begin); |
361 | 34.8k | pl = PatchList::Mk(id << 1); |
362 | 263k | } else { |
363 | 263k | inst_[id].InitAlt(a.begin, 0); |
364 | 263k | pl = PatchList::Mk((id << 1) | 1); |
365 | 263k | } |
366 | 298k | PatchList::Patch(inst_.data(), a.end, id); |
367 | 298k | return Frag(id, pl, true); |
368 | 298k | } |
369 | | |
370 | | // Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) |
371 | 691k | Frag Compiler::Quest(Frag a, bool nongreedy) { |
372 | 691k | if (IsNoMatch(a)) |
373 | 363 | return Nop(); |
374 | 691k | int id = AllocInst(1); |
375 | 691k | if (id < 0) |
376 | 0 | return NoMatch(); |
377 | 691k | PatchList pl; |
378 | 691k | if (nongreedy) { |
379 | 5.39k | inst_[id].InitAlt(0, a.begin); |
380 | 5.39k | pl = PatchList::Mk(id << 1); |
381 | 685k | } else { |
382 | 685k | inst_[id].InitAlt(a.begin, 0); |
383 | 685k | pl = PatchList::Mk((id << 1) | 1); |
384 | 685k | } |
385 | 691k | return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true); |
386 | 691k | } |
387 | | |
388 | | // Returns a fragment for the byte range lo-hi. |
389 | 39.6M | Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { |
390 | 39.6M | int id = AllocInst(1); |
391 | 39.6M | if (id < 0) |
392 | 0 | return NoMatch(); |
393 | 39.6M | inst_[id].InitByteRange(lo, hi, foldcase, 0); |
394 | 39.6M | return Frag(id, PatchList::Mk(id << 1), false); |
395 | 39.6M | } |
396 | | |
397 | | // Returns a no-op fragment. Sometimes unavoidable. |
398 | 160k | Frag Compiler::Nop() { |
399 | 160k | int id = AllocInst(1); |
400 | 160k | if (id < 0) |
401 | 0 | return NoMatch(); |
402 | 160k | inst_[id].InitNop(0); |
403 | 160k | return Frag(id, PatchList::Mk(id << 1), true); |
404 | 160k | } |
405 | | |
406 | | // Returns a fragment that signals a match. |
407 | 35.0k | Frag Compiler::Match(int32_t match_id) { |
408 | 35.0k | int id = AllocInst(1); |
409 | 35.0k | if (id < 0) |
410 | 0 | return NoMatch(); |
411 | 35.0k | inst_[id].InitMatch(match_id); |
412 | 35.0k | return Frag(id, kNullPatchList, false); |
413 | 35.0k | } |
414 | | |
415 | | // Returns a fragment matching a particular empty-width op (like ^ or $) |
416 | 351k | Frag Compiler::EmptyWidth(EmptyOp empty) { |
417 | 351k | int id = AllocInst(1); |
418 | 351k | if (id < 0) |
419 | 0 | return NoMatch(); |
420 | 351k | inst_[id].InitEmptyWidth(empty, 0); |
421 | 351k | return Frag(id, PatchList::Mk(id << 1), true); |
422 | 351k | } |
423 | | |
424 | | // Given a fragment a, returns a fragment with capturing parens around a. |
425 | 206k | Frag Compiler::Capture(Frag a, int n) { |
426 | 206k | if (IsNoMatch(a)) |
427 | 1.01k | return NoMatch(); |
428 | 205k | int id = AllocInst(2); |
429 | 205k | if (id < 0) |
430 | 0 | return NoMatch(); |
431 | 205k | inst_[id].InitCapture(2*n, a.begin); |
432 | 205k | inst_[id+1].InitCapture(2*n+1, 0); |
433 | 205k | PatchList::Patch(inst_.data(), a.end, id+1); |
434 | | |
435 | 205k | return Frag(id, PatchList::Mk((id+1) << 1), a.nullable); |
436 | 205k | } |
437 | | |
438 | | // A Rune is a name for a Unicode code point. |
439 | | // Returns maximum rune encoded by UTF-8 sequence of length len. |
440 | 54.6M | static int MaxRune(int len) { |
441 | 54.6M | int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) |
442 | 54.6M | if (len == 1) |
443 | 18.3M | b = 7; |
444 | 36.3M | else |
445 | 36.3M | b = 8-(len+1) + 6*(len-1); |
446 | 54.6M | return (1<<b) - 1; // maximum Rune for b bits. |
447 | 54.6M | } |
448 | | |
449 | | // The rune range compiler caches common suffix fragments, |
450 | | // which are very common in UTF-8 (e.g., [80-bf]). |
451 | | // The fragment suffixes are identified by their start |
452 | | // instructions. NULL denotes the eventual end match. |
453 | | // The Frag accumulates in rune_range_. Caching common |
454 | | // suffixes reduces the UTF-8 "." from 32 to 24 instructions, |
455 | | // and it reduces the corresponding one-pass NFA from 16 nodes to 8. |
456 | | |
457 | 487k | void Compiler::BeginRange() { |
458 | 487k | rune_cache_.clear(); |
459 | 487k | rune_range_.begin = 0; |
460 | 487k | rune_range_.end = kNullPatchList; |
461 | 487k | } |
462 | | |
463 | | int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, |
464 | 35.6M | int next) { |
465 | 35.6M | Frag f = ByteRange(lo, hi, foldcase); |
466 | 35.6M | if (next != 0) { |
467 | 29.1M | PatchList::Patch(inst_.data(), f.end, next); |
468 | 29.1M | } else { |
469 | 6.48M | rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end); |
470 | 6.48M | } |
471 | 35.6M | return f.begin; |
472 | 35.6M | } |
473 | | |
474 | | static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase, |
475 | 59.5M | int next) { |
476 | 59.5M | return (uint64_t)next << 17 | |
477 | 59.5M | (uint64_t)lo << 9 | |
478 | 59.5M | (uint64_t)hi << 1 | |
479 | 59.5M | (uint64_t)foldcase; |
480 | 59.5M | } |
481 | | |
482 | | int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, |
483 | 20.1M | int next) { |
484 | 20.1M | uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); |
485 | 20.1M | absl::flat_hash_map<uint64_t, int>::const_iterator it = rune_cache_.find(key); |
486 | 20.1M | if (it != rune_cache_.end()) |
487 | 12.0M | return it->second; |
488 | 8.08M | int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); |
489 | 8.08M | rune_cache_[key] = id; |
490 | 8.08M | return id; |
491 | 20.1M | } |
492 | | |
493 | 39.3M | bool Compiler::IsCachedRuneByteSuffix(int id) { |
494 | 39.3M | uint8_t lo = inst_[id].lo_; |
495 | 39.3M | uint8_t hi = inst_[id].hi_; |
496 | 39.3M | bool foldcase = inst_[id].foldcase() != 0; |
497 | 39.3M | int next = inst_[id].out(); |
498 | | |
499 | 39.3M | uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next); |
500 | 39.3M | return rune_cache_.find(key) != rune_cache_.end(); |
501 | 39.3M | } |
502 | | |
503 | 15.2M | void Compiler::AddSuffix(int id) { |
504 | 15.2M | if (failed_) |
505 | 0 | return; |
506 | | |
507 | 15.2M | if (rune_range_.begin == 0) { |
508 | 487k | rune_range_.begin = id; |
509 | 487k | return; |
510 | 487k | } |
511 | | |
512 | 14.7M | if (encoding_ == kEncodingUTF8) { |
513 | | // Build a trie in order to reduce fanout. |
514 | 14.5M | rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id); |
515 | 14.5M | return; |
516 | 14.5M | } |
517 | | |
518 | 148k | int alt = AllocInst(1); |
519 | 148k | if (alt < 0) { |
520 | 0 | rune_range_.begin = 0; |
521 | 0 | return; |
522 | 0 | } |
523 | 148k | inst_[alt].InitAlt(rune_range_.begin, id); |
524 | 148k | rune_range_.begin = alt; |
525 | 148k | } |
526 | | |
527 | 34.2M | int Compiler::AddSuffixRecursive(int root, int id) { |
528 | 34.2M | ABSL_DCHECK(inst_[root].opcode() == kInstAlt || |
529 | 34.2M | inst_[root].opcode() == kInstByteRange); |
530 | | |
531 | 34.2M | Frag f = FindByteRange(root, id); |
532 | 34.2M | if (IsNoMatch(f)) { |
533 | 14.5M | int alt = AllocInst(1); |
534 | 14.5M | if (alt < 0) |
535 | 0 | return 0; |
536 | 14.5M | inst_[alt].InitAlt(root, id); |
537 | 14.5M | return alt; |
538 | 14.5M | } |
539 | | |
540 | 19.6M | int br; |
541 | 19.6M | if (f.end.head == 0) |
542 | 1.06M | br = root; |
543 | 18.6M | else if (f.end.head&1) |
544 | 18.6M | br = inst_[f.begin].out1(); |
545 | 10.9k | else |
546 | 10.9k | br = inst_[f.begin].out(); |
547 | | |
548 | 19.6M | if (IsCachedRuneByteSuffix(br)) { |
549 | | // We can't fiddle with cached suffixes, so make a clone of the head. |
550 | 69.8k | int byterange = AllocInst(1); |
551 | 69.8k | if (byterange < 0) |
552 | 0 | return 0; |
553 | 69.8k | inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(), |
554 | 69.8k | inst_[br].foldcase(), inst_[br].out()); |
555 | | |
556 | | // Ensure that the parent points to the clone, not to the original. |
557 | | // Note that this could leave the head unreachable except via the cache. |
558 | 69.8k | br = byterange; |
559 | 69.8k | if (f.end.head == 0) |
560 | 12.5k | root = br; |
561 | 57.3k | else if (f.end.head&1) |
562 | 49.2k | inst_[f.begin].out1_ = br; |
563 | 8.05k | else |
564 | 8.05k | inst_[f.begin].set_out(br); |
565 | 69.8k | } |
566 | | |
567 | 19.6M | int out = inst_[id].out(); |
568 | 19.6M | if (!IsCachedRuneByteSuffix(id)) { |
569 | | // The head should be the instruction most recently allocated, so free it |
570 | | // instead of leaving it unreachable. |
571 | 19.6M | ABSL_DCHECK_EQ(id, ninst_-1); |
572 | 19.6M | inst_[id].out_opcode_ = 0; |
573 | 19.6M | inst_[id].out1_ = 0; |
574 | 19.6M | ninst_--; |
575 | 19.6M | } |
576 | | |
577 | 19.6M | out = AddSuffixRecursive(inst_[br].out(), out); |
578 | 19.6M | if (out == 0) |
579 | 0 | return 0; |
580 | | |
581 | 19.6M | inst_[br].set_out(out); |
582 | 19.6M | return root; |
583 | 19.6M | } |
584 | | |
585 | 522M | bool Compiler::ByteRangeEqual(int id1, int id2) { |
586 | 522M | return inst_[id1].lo() == inst_[id2].lo() && |
587 | 49.3M | inst_[id1].hi() == inst_[id2].hi() && |
588 | 19.6M | inst_[id1].foldcase() == inst_[id2].foldcase(); |
589 | 522M | } |
590 | | |
591 | 34.2M | Frag Compiler::FindByteRange(int root, int id) { |
592 | 34.2M | if (inst_[root].opcode() == kInstByteRange) { |
593 | 4.82M | if (ByteRangeEqual(root, id)) |
594 | 1.06M | return Frag(root, kNullPatchList, false); |
595 | 3.75M | else |
596 | 3.75M | return NoMatch(); |
597 | 4.82M | } |
598 | | |
599 | 514M | while (inst_[root].opcode() == kInstAlt) { |
600 | 514M | int out1 = inst_[root].out1(); |
601 | 514M | if (ByteRangeEqual(out1, id)) |
602 | 18.6M | return Frag(root, PatchList::Mk((root << 1) | 1), false); |
603 | | |
604 | | // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't |
605 | | // what we're looking for, then we can stop immediately. Unfortunately, we |
606 | | // can't short-circuit the search in reverse mode. |
607 | 495M | if (!reversed_) |
608 | 7.54M | return NoMatch(); |
609 | | |
610 | 488M | int out = inst_[root].out(); |
611 | 488M | if (inst_[out].opcode() == kInstAlt) |
612 | 484M | root = out; |
613 | 3.27M | else if (ByteRangeEqual(out, id)) |
614 | 10.9k | return Frag(root, PatchList::Mk(root << 1), false); |
615 | 3.26M | else |
616 | 3.26M | return NoMatch(); |
617 | 488M | } |
618 | | |
619 | 29.4M | ABSL_LOG(DFATAL) << "should never happen"; |
620 | 0 | return NoMatch(); |
621 | 0 | } |
622 | | |
623 | 487k | Frag Compiler::EndRange() { |
624 | 487k | return rune_range_; |
625 | 487k | } |
626 | | |
627 | | // Converts rune range lo-hi into a fragment that recognizes |
628 | | // the bytes that would make up those runes in the current |
629 | | // encoding (Latin 1 or UTF-8). |
630 | | // This lets the machine work byte-by-byte even when |
631 | | // using multibyte encodings. |
632 | | |
633 | 11.6M | void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { |
634 | 11.6M | switch (encoding_) { |
635 | 0 | default: |
636 | 11.4M | case kEncodingUTF8: |
637 | 11.4M | AddRuneRangeUTF8(lo, hi, foldcase); |
638 | 11.4M | break; |
639 | 269k | case kEncodingLatin1: |
640 | 269k | AddRuneRangeLatin1(lo, hi, foldcase); |
641 | 269k | break; |
642 | 11.6M | } |
643 | 11.6M | } |
644 | | |
645 | 269k | void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { |
646 | | // Latin-1 is easy: runes *are* bytes. |
647 | 269k | if (lo > hi || lo > 0xFF) |
648 | 0 | return; |
649 | 269k | if (hi > 0xFF) |
650 | 15.4k | hi = 0xFF; |
651 | 269k | AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo), |
652 | 269k | static_cast<uint8_t>(hi), foldcase, 0)); |
653 | 269k | } |
654 | | |
655 | 31.4k | void Compiler::Add_80_10ffff() { |
656 | | // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough |
657 | | // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by |
658 | | // permitting overlong encodings in E0 and F0 sequences and code points |
659 | | // over 10FFFF in F4 sequences, the size of the bytecode and the number |
660 | | // of equivalence classes are reduced significantly. |
661 | 31.4k | int id; |
662 | 31.4k | if (reversed_) { |
663 | | // Prefix factoring matters, but we don't have to handle it here |
664 | | // because the rune range trie logic takes care of that already. |
665 | 8.72k | id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0); |
666 | 8.72k | id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); |
667 | 8.72k | AddSuffix(id); |
668 | | |
669 | 8.72k | id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0); |
670 | 8.72k | id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); |
671 | 8.72k | id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); |
672 | 8.72k | AddSuffix(id); |
673 | | |
674 | 8.72k | id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0); |
675 | 8.72k | id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); |
676 | 8.72k | id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); |
677 | 8.72k | id = UncachedRuneByteSuffix(0x80, 0xBF, false, id); |
678 | 8.72k | AddSuffix(id); |
679 | 22.7k | } else { |
680 | | // Suffix factoring matters - and we do have to handle it here. |
681 | 22.7k | int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0); |
682 | 22.7k | id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1); |
683 | 22.7k | AddSuffix(id); |
684 | | |
685 | 22.7k | int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1); |
686 | 22.7k | id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2); |
687 | 22.7k | AddSuffix(id); |
688 | | |
689 | 22.7k | int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2); |
690 | 22.7k | id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3); |
691 | 22.7k | AddSuffix(id); |
692 | 22.7k | } |
693 | 31.4k | } |
694 | | |
695 | 18.3M | void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { |
696 | 18.3M | if (lo > hi) |
697 | 0 | return; |
698 | | |
699 | | // Pick off 80-10FFFF as a common special case. |
700 | 18.3M | if (lo == 0x80 && hi == 0x10ffff) { |
701 | 31.4k | Add_80_10ffff(); |
702 | 31.4k | return; |
703 | 31.4k | } |
704 | | |
705 | | // Split range into same-length sized ranges. |
706 | 72.7M | for (int i = 1; i < UTFmax; i++) { |
707 | 54.6M | Rune max = MaxRune(i); |
708 | 54.6M | if (lo <= max && max < hi) { |
709 | 182k | AddRuneRangeUTF8(lo, max, foldcase); |
710 | 182k | AddRuneRangeUTF8(max+1, hi, foldcase); |
711 | 182k | return; |
712 | 182k | } |
713 | 54.6M | } |
714 | | |
715 | | // ASCII range is always a special case. |
716 | 18.1M | if (hi < Runeself) { |
717 | 341k | AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo), |
718 | 341k | static_cast<uint8_t>(hi), foldcase, 0)); |
719 | 341k | return; |
720 | 341k | } |
721 | | |
722 | | // Split range into sections that agree on leading bytes. |
723 | 61.7M | for (int i = 1; i < UTFmax; i++) { |
724 | 47.2M | uint32_t m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence |
725 | 47.2M | if ((lo & ~m) != (hi & ~m)) { |
726 | 5.37M | if ((lo & m) != 0) { |
727 | 2.25M | AddRuneRangeUTF8(lo, lo|m, foldcase); |
728 | 2.25M | AddRuneRangeUTF8((lo|m)+1, hi, foldcase); |
729 | 2.25M | return; |
730 | 2.25M | } |
731 | 3.11M | if ((hi & m) != m) { |
732 | 1.02M | AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); |
733 | 1.02M | AddRuneRangeUTF8(hi&~m, hi, foldcase); |
734 | 1.02M | return; |
735 | 1.02M | } |
736 | 3.11M | } |
737 | 47.2M | } |
738 | | |
739 | | // Finally. Generate byte matching equivalent for lo-hi. |
740 | 14.4M | uint8_t ulo[UTFmax], uhi[UTFmax]; |
741 | 14.4M | int n = runetochar(reinterpret_cast<char*>(ulo), &lo); |
742 | 14.4M | int m = runetochar(reinterpret_cast<char*>(uhi), &hi); |
743 | 14.4M | (void)m; // USED(m) |
744 | 14.4M | ABSL_DCHECK_EQ(n, m); |
745 | | |
746 | | // The logic below encodes this thinking: |
747 | | // |
748 | | // 1. When we have built the whole suffix, we know that it cannot |
749 | | // possibly be a suffix of anything longer: in forward mode, nothing |
750 | | // else can occur before the leading byte; in reverse mode, nothing |
751 | | // else can occur after the last continuation byte or else the leading |
752 | | // byte would have to change. Thus, there is no benefit to caching |
753 | | // the first byte of the suffix whereas there is a cost involved in |
754 | | // cloning it if it begins a common prefix, which is fairly likely. |
755 | | // |
756 | | // 2. Conversely, the last byte of the suffix cannot possibly be a |
757 | | // prefix of anything because next == 0, so we will never want to |
758 | | // clone it, but it is fairly likely to be a common suffix. Perhaps |
759 | | // more so in reverse mode than in forward mode because the former is |
760 | | // "converging" towards lower entropy, but caching is still worthwhile |
761 | | // for the latter in cases such as 80-BF. |
762 | | // |
763 | | // 3. Handling the bytes between the first and the last is less |
764 | | // straightforward and, again, the approach depends on whether we are |
765 | | // "converging" towards lower entropy: in forward mode, a single byte |
766 | | // is unlikely to be part of a common suffix whereas a byte range |
767 | | // is more likely so; in reverse mode, a byte range is unlikely to |
768 | | // be part of a common suffix whereas a single byte is more likely |
769 | | // so. The same benefit versus cost argument applies here. |
770 | 14.4M | int id = 0; |
771 | 14.4M | if (reversed_) { |
772 | 16.9M | for (int i = 0; i < n; i++) { |
773 | | // In reverse UTF-8 mode: cache the leading byte; don't cache the last |
774 | | // continuation byte; cache anything else iff it's a single byte (XX-XX). |
775 | 12.9M | if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) |
776 | 8.61M | id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
777 | 4.32M | else |
778 | 4.32M | id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
779 | 12.9M | } |
780 | 10.5M | } else { |
781 | 44.4M | for (int i = n-1; i >= 0; i--) { |
782 | | // In forward UTF-8 mode: don't cache the leading byte; cache the last |
783 | | // continuation byte; cache anything else iff it's a byte range (XX-YY). |
784 | 33.9M | if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) |
785 | 11.5M | id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
786 | 22.4M | else |
787 | 22.4M | id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id); |
788 | 33.9M | } |
789 | 10.5M | } |
790 | 14.4M | AddSuffix(id); |
791 | 14.4M | } |
792 | | |
793 | | // Should not be called. |
794 | 0 | Frag Compiler::Copy(Frag arg) { |
795 | | // We're using WalkExponential; there should be no copying. |
796 | 0 | failed_ = true; |
797 | 0 | ABSL_LOG(DFATAL) << "Compiler::Copy called!"; |
798 | 0 | return NoMatch(); |
799 | 0 | } |
800 | | |
801 | | // Visits a node quickly; called once WalkExponential has |
802 | | // decided to cut this walk short. |
803 | 0 | Frag Compiler::ShortVisit(Regexp* re, Frag) { |
804 | 0 | failed_ = true; |
805 | 0 | return NoMatch(); |
806 | 0 | } |
807 | | |
808 | | // Called before traversing a node's children during the walk. |
809 | 5.55M | Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { |
810 | | // Cut off walk if we've already failed. |
811 | 5.55M | if (failed_) |
812 | 0 | *stop = true; |
813 | | |
814 | 5.55M | return Frag(); // not used by caller |
815 | 5.55M | } |
816 | | |
817 | 3.88M | Frag Compiler::Literal(Rune r, bool foldcase) { |
818 | 3.88M | switch (encoding_) { |
819 | 0 | default: |
820 | 0 | return Frag(); |
821 | | |
822 | 3.50M | case kEncodingLatin1: |
823 | 3.50M | return ByteRange(r, r, foldcase); |
824 | | |
825 | 381k | case kEncodingUTF8: { |
826 | 381k | if (r < Runeself) // Make common case fast. |
827 | 324k | return ByteRange(r, r, foldcase); |
828 | 57.2k | uint8_t buf[UTFmax]; |
829 | 57.2k | int n = runetochar(reinterpret_cast<char*>(buf), &r); |
830 | 57.2k | Frag f = ByteRange((uint8_t)buf[0], buf[0], false); |
831 | 138k | for (int i = 1; i < n; i++) |
832 | 81.0k | f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false)); |
833 | 57.2k | return f; |
834 | 381k | } |
835 | 3.88M | } |
836 | 3.88M | } |
837 | | |
838 | | // Called after traversing the node's children during the walk. |
839 | | // Given their frags, build and return the frag for this re. |
840 | | Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, |
841 | 5.55M | int nchild_frags) { |
842 | | // If a child failed, don't bother going forward, especially |
843 | | // since the child_frags might contain Frags with NULLs in them. |
844 | 5.55M | if (failed_) |
845 | 0 | return NoMatch(); |
846 | | |
847 | | // Given the child fragments, return the fragment for this node. |
848 | 5.55M | switch (re->op()) { |
849 | 0 | case kRegexpRepeat: |
850 | | // Should not see; code at bottom of function will print error |
851 | 0 | break; |
852 | | |
853 | 21.8k | case kRegexpNoMatch: |
854 | 21.8k | return NoMatch(); |
855 | | |
856 | 159k | case kRegexpEmptyMatch: |
857 | 159k | return Nop(); |
858 | | |
859 | 8.36k | case kRegexpHaveMatch: { |
860 | 8.36k | Frag f = Match(re->match_id()); |
861 | 8.36k | if (anchor_ == RE2::ANCHOR_BOTH) { |
862 | | // Append \z or else the subexpression will effectively be unanchored. |
863 | | // Complemented by the UNANCHORED case in CompileSet(). |
864 | 2.13k | f = Cat(EmptyWidth(kEmptyEndText), f); |
865 | 2.13k | } |
866 | 8.36k | return f; |
867 | 0 | } |
868 | | |
869 | 883k | case kRegexpConcat: { |
870 | 883k | Frag f = child_frags[0]; |
871 | 3.55M | for (int i = 1; i < nchild_frags; i++) |
872 | 2.67M | f = Cat(f, child_frags[i]); |
873 | 883k | return f; |
874 | 0 | } |
875 | | |
876 | 195k | case kRegexpAlternate: { |
877 | 195k | Frag f = child_frags[0]; |
878 | 735k | for (int i = 1; i < nchild_frags; i++) |
879 | 540k | f = Alt(f, child_frags[i]); |
880 | 195k | return f; |
881 | 0 | } |
882 | | |
883 | 299k | case kRegexpStar: |
884 | 299k | return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
885 | | |
886 | 59.2k | case kRegexpPlus: |
887 | 59.2k | return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
888 | | |
889 | 662k | case kRegexpQuest: |
890 | 662k | return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0); |
891 | | |
892 | 1.70M | case kRegexpLiteral: |
893 | 1.70M | return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0); |
894 | | |
895 | 514k | case kRegexpLiteralString: { |
896 | | // Concatenation of literals. |
897 | 514k | if (re->nrunes() == 0) |
898 | 0 | return Nop(); |
899 | 514k | Frag f; |
900 | 2.70M | for (int i = 0; i < re->nrunes(); i++) { |
901 | 2.18M | Frag f1 = Literal(re->runes()[i], |
902 | 2.18M | (re->parse_flags()&Regexp::FoldCase) != 0); |
903 | 2.18M | if (i == 0) |
904 | 514k | f = f1; |
905 | 1.67M | else |
906 | 1.67M | f = Cat(f, f1); |
907 | 2.18M | } |
908 | 514k | return f; |
909 | 514k | } |
910 | | |
911 | 23.0k | case kRegexpAnyChar: |
912 | 23.0k | BeginRange(); |
913 | 23.0k | AddRuneRange(0, Runemax, false); |
914 | 23.0k | return EndRange(); |
915 | | |
916 | 7.28k | case kRegexpAnyByte: |
917 | 7.28k | return ByteRange(0x00, 0xFF, false); |
918 | | |
919 | 464k | case kRegexpCharClass: { |
920 | 464k | CharClass* cc = re->cc(); |
921 | 464k | if (cc->empty()) { |
922 | | // This can't happen. |
923 | 0 | failed_ = true; |
924 | 0 | ABSL_LOG(DFATAL) << "No ranges in char class"; |
925 | 0 | return NoMatch(); |
926 | 0 | } |
927 | | |
928 | | // ASCII case-folding optimization: if the char class |
929 | | // behaves the same on A-Z as it does on a-z, |
930 | | // discard any ranges wholly contained in A-Z |
931 | | // and mark the other ranges as foldascii. |
932 | | // This reduces the size of a program for |
933 | | // (?i)abc from 3 insts per letter to 1 per letter. |
934 | 464k | bool foldascii = cc->FoldsASCII(); |
935 | | |
936 | | // Character class is just a big OR of the different |
937 | | // character ranges in the class. |
938 | 464k | BeginRange(); |
939 | 12.1M | for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { |
940 | | // ASCII case-folding optimization (see above). |
941 | 11.6M | if (foldascii && 'A' <= i->lo && i->hi <= 'Z') |
942 | 39.5k | continue; |
943 | | |
944 | | // If this range contains all of A-Za-z or none of it, |
945 | | // the fold flag is unnecessary; don't bother. |
946 | 11.6M | bool fold = foldascii; |
947 | 11.6M | if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo || |
948 | 145k | ('Z' < i->lo && i->hi < 'a')) |
949 | 11.5M | fold = false; |
950 | | |
951 | 11.6M | AddRuneRange(i->lo, i->hi, fold); |
952 | 11.6M | } |
953 | 464k | return EndRange(); |
954 | 464k | } |
955 | | |
956 | 206k | case kRegexpCapture: |
957 | | // If this is a non-capturing parenthesis -- (?:foo) -- |
958 | | // just use the inner expression. |
959 | 206k | if (re->cap() < 0) |
960 | 0 | return child_frags[0]; |
961 | 206k | return Capture(child_frags[0], re->cap()); |
962 | | |
963 | 20.2k | case kRegexpBeginLine: |
964 | 20.2k | return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); |
965 | | |
966 | 44.9k | case kRegexpEndLine: |
967 | 44.9k | return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); |
968 | | |
969 | 45.0k | case kRegexpBeginText: |
970 | 45.0k | return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); |
971 | | |
972 | 233k | case kRegexpEndText: |
973 | 233k | return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); |
974 | | |
975 | 2.68k | case kRegexpWordBoundary: |
976 | 2.68k | return EmptyWidth(kEmptyWordBoundary); |
977 | | |
978 | 2.68k | case kRegexpNoWordBoundary: |
979 | 2.68k | return EmptyWidth(kEmptyNonWordBoundary); |
980 | 5.55M | } |
981 | 0 | failed_ = true; |
982 | 0 | ABSL_LOG(DFATAL) << "Missing case in Compiler: " << re->op(); |
983 | 0 | return NoMatch(); |
984 | 0 | } |
985 | | |
986 | | // Is this regexp required to start at the beginning of the text? |
987 | | // Only approximate; can return false for complicated regexps like (\Aa|\Ab), |
988 | | // but handles (\A(a|b)). Could use the Walker to write a more exact one. |
989 | 51.8k | static bool IsAnchorStart(Regexp** pre, int depth) { |
990 | 51.8k | Regexp* re = *pre; |
991 | 51.8k | Regexp* sub; |
992 | | // The depth limit makes sure that we don't overflow |
993 | | // the stack on a deeply nested regexp. As the comment |
994 | | // above says, IsAnchorStart is conservative, so returning |
995 | | // a false negative is okay. The exact limit is somewhat arbitrary. |
996 | 51.8k | if (re == NULL || depth >= 4) |
997 | 725 | return false; |
998 | 51.1k | switch (re->op()) { |
999 | 24.6k | default: |
1000 | 24.6k | break; |
1001 | 24.6k | case kRegexpConcat: |
1002 | 19.4k | if (re->nsub() > 0) { |
1003 | 19.4k | sub = re->sub()[0]->Incref(); |
1004 | 19.4k | if (IsAnchorStart(&sub, depth+1)) { |
1005 | 1.51k | PODArray<Regexp*> subcopy(re->nsub()); |
1006 | 1.51k | subcopy[0] = sub; // already have reference |
1007 | 9.64k | for (int i = 1; i < re->nsub(); i++) |
1008 | 8.12k | subcopy[i] = re->sub()[i]->Incref(); |
1009 | 1.51k | *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); |
1010 | 1.51k | re->Decref(); |
1011 | 1.51k | return true; |
1012 | 1.51k | } |
1013 | 17.9k | sub->Decref(); |
1014 | 17.9k | } |
1015 | 17.9k | break; |
1016 | 17.9k | case kRegexpCapture: |
1017 | 5.74k | sub = re->sub()[0]->Incref(); |
1018 | 5.74k | if (IsAnchorStart(&sub, depth+1)) { |
1019 | 334 | *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); |
1020 | 334 | re->Decref(); |
1021 | 334 | return true; |
1022 | 334 | } |
1023 | 5.40k | sub->Decref(); |
1024 | 5.40k | break; |
1025 | 1.24k | case kRegexpBeginText: |
1026 | 1.24k | *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
1027 | 1.24k | re->Decref(); |
1028 | 1.24k | return true; |
1029 | 51.1k | } |
1030 | 48.0k | return false; |
1031 | 51.1k | } |
1032 | | |
1033 | | // Is this regexp required to start at the end of the text? |
1034 | | // Only approximate; can return false for complicated regexps like (a\z|b\z), |
1035 | | // but handles ((a|b)\z). Could use the Walker to write a more exact one. |
1036 | 50.4k | static bool IsAnchorEnd(Regexp** pre, int depth) { |
1037 | 50.4k | Regexp* re = *pre; |
1038 | 50.4k | Regexp* sub; |
1039 | | // The depth limit makes sure that we don't overflow |
1040 | | // the stack on a deeply nested regexp. As the comment |
1041 | | // above says, IsAnchorEnd is conservative, so returning |
1042 | | // a false negative is okay. The exact limit is somewhat arbitrary. |
1043 | 50.4k | if (re == NULL || depth >= 4) |
1044 | 538 | return false; |
1045 | 49.9k | switch (re->op()) { |
1046 | 23.9k | default: |
1047 | 23.9k | break; |
1048 | 23.9k | case kRegexpConcat: |
1049 | 18.7k | if (re->nsub() > 0) { |
1050 | 18.7k | sub = re->sub()[re->nsub() - 1]->Incref(); |
1051 | 18.7k | if (IsAnchorEnd(&sub, depth+1)) { |
1052 | 2.37k | PODArray<Regexp*> subcopy(re->nsub()); |
1053 | 2.37k | subcopy[re->nsub() - 1] = sub; // already have reference |
1054 | 26.1k | for (int i = 0; i < re->nsub() - 1; i++) |
1055 | 23.7k | subcopy[i] = re->sub()[i]->Incref(); |
1056 | 2.37k | *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags()); |
1057 | 2.37k | re->Decref(); |
1058 | 2.37k | return true; |
1059 | 2.37k | } |
1060 | 16.4k | sub->Decref(); |
1061 | 16.4k | } |
1062 | 16.4k | break; |
1063 | 16.4k | case kRegexpCapture: |
1064 | 5.01k | sub = re->sub()[0]->Incref(); |
1065 | 5.01k | if (IsAnchorEnd(&sub, depth+1)) { |
1066 | 320 | *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); |
1067 | 320 | re->Decref(); |
1068 | 320 | return true; |
1069 | 320 | } |
1070 | 4.69k | sub->Decref(); |
1071 | 4.69k | break; |
1072 | 2.14k | case kRegexpEndText: |
1073 | 2.14k | *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); |
1074 | 2.14k | re->Decref(); |
1075 | 2.14k | return true; |
1076 | 49.9k | } |
1077 | 45.0k | return false; |
1078 | 49.9k | } |
1079 | | |
1080 | | void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem, |
1081 | 35.0k | RE2::Anchor anchor) { |
1082 | 35.0k | if (flags & Regexp::Latin1) |
1083 | 18.3k | encoding_ = kEncodingLatin1; |
1084 | 35.0k | max_mem_ = max_mem; |
1085 | 35.0k | if (max_mem <= 0) { |
1086 | 0 | max_ninst_ = 100000; // more than enough |
1087 | 35.0k | } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) { |
1088 | | // No room for anything. |
1089 | 0 | max_ninst_ = 0; |
1090 | 35.0k | } else { |
1091 | 35.0k | int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); |
1092 | | // Limit instruction count so that inst->id() fits nicely in an int. |
1093 | | // SparseArray also assumes that the indices (inst->id()) are ints. |
1094 | | // The call to WalkExponential uses 2*max_ninst_ below, |
1095 | | // and other places in the code use 2 or 3 * prog->size(). |
1096 | | // Limiting to 2^24 should avoid overflow in those places. |
1097 | | // (The point of allowing more than 32 bits of memory is to |
1098 | | // have plenty of room for the DFA states, not to use it up |
1099 | | // on the program.) |
1100 | 35.0k | if (m >= 1<<24) |
1101 | 0 | m = 1<<24; |
1102 | | // Inst imposes its own limit (currently bigger than 2^24 but be safe). |
1103 | 35.0k | if (m > Prog::Inst::kMaxInst) |
1104 | 0 | m = Prog::Inst::kMaxInst; |
1105 | 35.0k | max_ninst_ = static_cast<int>(m); |
1106 | 35.0k | } |
1107 | 35.0k | anchor_ = anchor; |
1108 | 35.0k | } |
1109 | | |
1110 | | // Compiles re, returning program. |
1111 | | // Caller is responsible for deleting prog_. |
1112 | | // If reversed is true, compiles a program that expects |
1113 | | // to run over the input string backward (reverses all concatenations). |
1114 | | // The reversed flag is also recorded in the returned program. |
1115 | 26.6k | Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) { |
1116 | 26.6k | Compiler c; |
1117 | 26.6k | c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */); |
1118 | 26.6k | c.reversed_ = reversed; |
1119 | | |
1120 | | // Simplify to remove things like counted repetitions |
1121 | | // and character classes like \d. |
1122 | 26.6k | Regexp* sre = re->Simplify(); |
1123 | 26.6k | if (sre == NULL) |
1124 | 0 | return NULL; |
1125 | | |
1126 | | // Record whether prog is anchored, removing the anchors. |
1127 | | // (They get in the way of other optimizations.) |
1128 | 26.6k | bool is_anchor_start = IsAnchorStart(&sre, 0); |
1129 | 26.6k | bool is_anchor_end = IsAnchorEnd(&sre, 0); |
1130 | | |
1131 | | // Generate fragment for entire regexp. |
1132 | 26.6k | Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); |
1133 | 26.6k | sre->Decref(); |
1134 | 26.6k | if (c.failed_) |
1135 | 0 | return NULL; |
1136 | | |
1137 | | // Success! Finish by putting Match node at end, and record start. |
1138 | | // Turn off c.reversed_ (if it is set) to force the remaining concatenations |
1139 | | // to behave normally. |
1140 | 26.6k | c.reversed_ = false; |
1141 | 26.6k | all = c.Cat(all, c.Match(0)); |
1142 | | |
1143 | 26.6k | c.prog_->set_reversed(reversed); |
1144 | 26.6k | if (c.prog_->reversed()) { |
1145 | 8.95k | c.prog_->set_anchor_start(is_anchor_end); |
1146 | 8.95k | c.prog_->set_anchor_end(is_anchor_start); |
1147 | 17.7k | } else { |
1148 | 17.7k | c.prog_->set_anchor_start(is_anchor_start); |
1149 | 17.7k | c.prog_->set_anchor_end(is_anchor_end); |
1150 | 17.7k | } |
1151 | | |
1152 | 26.6k | c.prog_->set_start(all.begin); |
1153 | 26.6k | if (!c.prog_->anchor_start()) { |
1154 | | // Also create unanchored version, which starts with a .*? loop. |
1155 | 24.7k | all = c.Cat(c.DotStar(), all); |
1156 | 24.7k | } |
1157 | 26.6k | c.prog_->set_start_unanchored(all.begin); |
1158 | | |
1159 | | // Hand ownership of prog_ to caller. |
1160 | 26.6k | return c.Finish(re); |
1161 | 26.6k | } |
1162 | | |
1163 | 35.0k | Prog* Compiler::Finish(Regexp* re) { |
1164 | 35.0k | if (failed_) |
1165 | 0 | return NULL; |
1166 | | |
1167 | 35.0k | if (prog_->start() == 0 && prog_->start_unanchored() == 0) { |
1168 | | // No possible matches; keep Fail instruction only. |
1169 | 662 | ninst_ = 1; |
1170 | 662 | } |
1171 | | |
1172 | | // Hand off the array to Prog. |
1173 | 35.0k | prog_->inst_ = std::move(inst_); |
1174 | 35.0k | prog_->size_ = ninst_; |
1175 | | |
1176 | 35.0k | prog_->Optimize(); |
1177 | 35.0k | prog_->Flatten(); |
1178 | 35.0k | prog_->ComputeByteMap(); |
1179 | | |
1180 | 35.0k | if (!prog_->reversed()) { |
1181 | 26.0k | std::string prefix; |
1182 | 26.0k | bool prefix_foldcase; |
1183 | 26.0k | if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase)) |
1184 | 5.85k | prog_->ConfigurePrefixAccel(prefix, prefix_foldcase); |
1185 | 26.0k | } |
1186 | | |
1187 | | // Record remaining memory for DFA. |
1188 | 35.0k | if (max_mem_ <= 0) { |
1189 | 0 | prog_->set_dfa_mem(1<<20); |
1190 | 35.0k | } else { |
1191 | 35.0k | int64_t m = max_mem_ - sizeof(Prog); |
1192 | 35.0k | m -= prog_->size_*sizeof(Prog::Inst); // account for inst_ |
1193 | 35.0k | if (prog_->CanBitState()) |
1194 | 28.6k | m -= prog_->size_*sizeof(uint16_t); // account for list_heads_ |
1195 | 35.0k | if (m < 0) |
1196 | 21 | m = 0; |
1197 | 35.0k | prog_->set_dfa_mem(m); |
1198 | 35.0k | } |
1199 | | |
1200 | 35.0k | Prog* p = prog_; |
1201 | 35.0k | prog_ = NULL; |
1202 | 35.0k | return p; |
1203 | 35.0k | } |
1204 | | |
1205 | | // Converts Regexp to Prog. |
1206 | 17.7k | Prog* Regexp::CompileToProg(int64_t max_mem) { |
1207 | 17.7k | return Compiler::Compile(this, false, max_mem); |
1208 | 17.7k | } |
1209 | | |
1210 | 8.95k | Prog* Regexp::CompileToReverseProg(int64_t max_mem) { |
1211 | 8.95k | return Compiler::Compile(this, true, max_mem); |
1212 | 8.95k | } |
1213 | | |
1214 | 28.6k | Frag Compiler::DotStar() { |
1215 | 28.6k | return Star(ByteRange(0x00, 0xff, false), true); |
1216 | 28.6k | } |
1217 | | |
1218 | | // Compiles RE set to Prog. |
1219 | 8.36k | Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { |
1220 | 8.36k | Compiler c; |
1221 | 8.36k | c.Setup(re->parse_flags(), max_mem, anchor); |
1222 | | |
1223 | 8.36k | Regexp* sre = re->Simplify(); |
1224 | 8.36k | if (sre == NULL) |
1225 | 0 | return NULL; |
1226 | | |
1227 | 8.36k | Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_); |
1228 | 8.36k | sre->Decref(); |
1229 | 8.36k | if (c.failed_) |
1230 | 0 | return NULL; |
1231 | | |
1232 | 8.36k | c.prog_->set_anchor_start(true); |
1233 | 8.36k | c.prog_->set_anchor_end(true); |
1234 | | |
1235 | 8.36k | if (anchor == RE2::UNANCHORED) { |
1236 | | // Prepend .* or else the expression will effectively be anchored. |
1237 | | // Complemented by the ANCHOR_BOTH case in PostVisit(). |
1238 | 3.88k | all = c.Cat(c.DotStar(), all); |
1239 | 3.88k | } |
1240 | 8.36k | c.prog_->set_start(all.begin); |
1241 | 8.36k | c.prog_->set_start_unanchored(all.begin); |
1242 | | |
1243 | 8.36k | Prog* prog = c.Finish(re); |
1244 | 8.36k | if (prog == NULL) |
1245 | 0 | return NULL; |
1246 | | |
1247 | | // Make sure DFA has enough memory to operate, |
1248 | | // since we're not going to fall back to the NFA. |
1249 | 8.36k | bool dfa_failed = false; |
1250 | 8.36k | absl::string_view sp = "hello, world"; |
1251 | 8.36k | prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, |
1252 | 8.36k | NULL, &dfa_failed, NULL); |
1253 | 8.36k | if (dfa_failed) { |
1254 | 0 | delete prog; |
1255 | 0 | return NULL; |
1256 | 0 | } |
1257 | | |
1258 | 8.36k | return prog; |
1259 | 8.36k | } |
1260 | | |
1261 | 8.36k | Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) { |
1262 | 8.36k | return Compiler::CompileSet(re, anchor, max_mem); |
1263 | 8.36k | } |
1264 | | |
1265 | | } // namespace re2 |