Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2007 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Compiled regular expression representation. |
6 | | // Tested by compile_test.cc |
7 | | |
8 | | #include "re2/prog.h" |
9 | | |
10 | | #if defined(__AVX2__) |
11 | | #include <immintrin.h> |
12 | | #ifdef _MSC_VER |
13 | | #include <intrin.h> |
14 | | #endif |
15 | | #endif |
16 | | #include <stdint.h> |
17 | | #include <string.h> |
18 | | #include <algorithm> |
19 | | #include <memory> |
20 | | #include <utility> |
21 | | |
22 | | #include "util/util.h" |
23 | | #include "util/logging.h" |
24 | | #include "util/strutil.h" |
25 | | #include "re2/bitmap256.h" |
26 | | #include "re2/stringpiece.h" |
27 | | |
28 | | namespace re2 { |
29 | | |
30 | | // Constructors per Inst opcode |
31 | | |
32 | 224M | void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) { |
33 | 224M | DCHECK_EQ(out_opcode_, 0); |
34 | 224M | set_out_opcode(out, kInstAlt); |
35 | 224M | out1_ = out1; |
36 | 224M | } |
37 | | |
38 | 648M | void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) { |
39 | 648M | DCHECK_EQ(out_opcode_, 0); |
40 | 648M | set_out_opcode(out, kInstByteRange); |
41 | 648M | lo_ = lo & 0xFF; |
42 | 648M | hi_ = hi & 0xFF; |
43 | 648M | hint_foldcase_ = foldcase&1; |
44 | 648M | } |
45 | | |
46 | 7.97M | void Prog::Inst::InitCapture(int cap, uint32_t out) { |
47 | 7.97M | DCHECK_EQ(out_opcode_, 0); |
48 | 7.97M | set_out_opcode(out, kInstCapture); |
49 | 7.97M | cap_ = cap; |
50 | 7.97M | } |
51 | | |
52 | 5.43M | void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) { |
53 | 5.43M | DCHECK_EQ(out_opcode_, 0); |
54 | 5.43M | set_out_opcode(out, kInstEmptyWidth); |
55 | 5.43M | empty_ = empty; |
56 | 5.43M | } |
57 | | |
58 | 44.5k | void Prog::Inst::InitMatch(int32_t id) { |
59 | 44.5k | DCHECK_EQ(out_opcode_, 0); |
60 | 44.5k | set_opcode(kInstMatch); |
61 | 44.5k | match_id_ = id; |
62 | 44.5k | } |
63 | | |
64 | 2.94M | void Prog::Inst::InitNop(uint32_t out) { |
65 | 2.94M | DCHECK_EQ(out_opcode_, 0); |
66 | 2.94M | set_opcode(kInstNop); |
67 | 2.94M | } |
68 | | |
69 | 45.0k | void Prog::Inst::InitFail() { |
70 | 45.0k | DCHECK_EQ(out_opcode_, 0); |
71 | 45.0k | set_opcode(kInstFail); |
72 | 45.0k | } |
73 | | |
74 | 0 | std::string Prog::Inst::Dump() { |
75 | 0 | switch (opcode()) { |
76 | 0 | default: |
77 | 0 | return StringPrintf("opcode %d", static_cast<int>(opcode())); |
78 | | |
79 | 0 | case kInstAlt: |
80 | 0 | return StringPrintf("alt -> %d | %d", out(), out1_); |
81 | | |
82 | 0 | case kInstAltMatch: |
83 | 0 | return StringPrintf("altmatch -> %d | %d", out(), out1_); |
84 | | |
85 | 0 | case kInstByteRange: |
86 | 0 | return StringPrintf("byte%s [%02x-%02x] %d -> %d", |
87 | 0 | foldcase() ? "/i" : "", |
88 | 0 | lo_, hi_, hint(), out()); |
89 | | |
90 | 0 | case kInstCapture: |
91 | 0 | return StringPrintf("capture %d -> %d", cap_, out()); |
92 | | |
93 | 0 | case kInstEmptyWidth: |
94 | 0 | return StringPrintf("emptywidth %#x -> %d", |
95 | 0 | static_cast<int>(empty_), out()); |
96 | | |
97 | 0 | case kInstMatch: |
98 | 0 | return StringPrintf("match! %d", match_id()); |
99 | | |
100 | 0 | case kInstNop: |
101 | 0 | return StringPrintf("nop -> %d", out()); |
102 | | |
103 | 0 | case kInstFail: |
104 | 0 | return StringPrintf("fail"); |
105 | 0 | } |
106 | 0 | } |
107 | | |
108 | | Prog::Prog() |
109 | | : anchor_start_(false), |
110 | | anchor_end_(false), |
111 | | reversed_(false), |
112 | | did_flatten_(false), |
113 | | did_onepass_(false), |
114 | | start_(0), |
115 | | start_unanchored_(0), |
116 | | size_(0), |
117 | | bytemap_range_(0), |
118 | | prefix_foldcase_(false), |
119 | | prefix_size_(0), |
120 | | list_count_(0), |
121 | | bit_state_text_max_size_(0), |
122 | | dfa_mem_(0), |
123 | | dfa_first_(NULL), |
124 | 45.0k | dfa_longest_(NULL) { |
125 | 45.0k | } |
126 | | |
127 | 45.0k | Prog::~Prog() { |
128 | 45.0k | DeleteDFA(dfa_longest_); |
129 | 45.0k | DeleteDFA(dfa_first_); |
130 | 45.0k | if (prefix_foldcase_) |
131 | 13.2k | delete[] prefix_dfa_; |
132 | 45.0k | } |
133 | | |
134 | | typedef SparseSet Workq; |
135 | | |
136 | 637M | static inline void AddToQueue(Workq* q, int id) { |
137 | 637M | if (id != 0) |
138 | 636M | q->insert(id); |
139 | 637M | } |
140 | | |
141 | 0 | static std::string ProgToString(Prog* prog, Workq* q) { |
142 | 0 | std::string s; |
143 | 0 | for (Workq::iterator i = q->begin(); i != q->end(); ++i) { |
144 | 0 | int id = *i; |
145 | 0 | Prog::Inst* ip = prog->inst(id); |
146 | 0 | s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); |
147 | 0 | AddToQueue(q, ip->out()); |
148 | 0 | if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch) |
149 | 0 | AddToQueue(q, ip->out1()); |
150 | 0 | } |
151 | 0 | return s; |
152 | 0 | } |
153 | | |
154 | 0 | static std::string FlattenedProgToString(Prog* prog, int start) { |
155 | 0 | std::string s; |
156 | 0 | for (int id = start; id < prog->size(); id++) { |
157 | 0 | Prog::Inst* ip = prog->inst(id); |
158 | 0 | if (ip->last()) |
159 | 0 | s += StringPrintf("%d. %s\n", id, ip->Dump().c_str()); |
160 | 0 | else |
161 | 0 | s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str()); |
162 | 0 | } |
163 | 0 | return s; |
164 | 0 | } |
165 | | |
166 | 0 | std::string Prog::Dump() { |
167 | 0 | if (did_flatten_) |
168 | 0 | return FlattenedProgToString(this, start_); |
169 | | |
170 | 0 | Workq q(size_); |
171 | 0 | AddToQueue(&q, start_); |
172 | 0 | return ProgToString(this, &q); |
173 | 0 | } |
174 | | |
175 | 0 | std::string Prog::DumpUnanchored() { |
176 | 0 | if (did_flatten_) |
177 | 0 | return FlattenedProgToString(this, start_unanchored_); |
178 | | |
179 | 0 | Workq q(size_); |
180 | 0 | AddToQueue(&q, start_unanchored_); |
181 | 0 | return ProgToString(this, &q); |
182 | 0 | } |
183 | | |
184 | 0 | std::string Prog::DumpByteMap() { |
185 | 0 | std::string map; |
186 | 0 | for (int c = 0; c < 256; c++) { |
187 | 0 | int b = bytemap_[c]; |
188 | 0 | int lo = c; |
189 | 0 | while (c < 256-1 && bytemap_[c+1] == b) |
190 | 0 | c++; |
191 | 0 | int hi = c; |
192 | 0 | map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b); |
193 | 0 | } |
194 | 0 | return map; |
195 | 0 | } |
196 | | |
197 | | // Is ip a guaranteed match at end of text, perhaps after some capturing? |
198 | 72.3M | static bool IsMatch(Prog* prog, Prog::Inst* ip) { |
199 | 74.0M | for (;;) { |
200 | 74.0M | switch (ip->opcode()) { |
201 | 0 | default: |
202 | 0 | LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode(); |
203 | 0 | return false; |
204 | | |
205 | 39.8M | case kInstAlt: |
206 | 39.8M | case kInstAltMatch: |
207 | 71.4M | case kInstByteRange: |
208 | 71.5M | case kInstFail: |
209 | 72.3M | case kInstEmptyWidth: |
210 | 72.3M | return false; |
211 | | |
212 | 1.65M | case kInstCapture: |
213 | 1.65M | case kInstNop: |
214 | 1.65M | ip = prog->inst(ip->out()); |
215 | 1.65M | break; |
216 | | |
217 | 30.3k | case kInstMatch: |
218 | 30.3k | return true; |
219 | 74.0M | } |
220 | 74.0M | } |
221 | 72.3M | } |
222 | | |
223 | | // Peep-hole optimizer. |
224 | 44.5k | void Prog::Optimize() { |
225 | 44.5k | Workq q(size_); |
226 | | |
227 | | // Eliminate nops. Most are taken out during compilation |
228 | | // but a few are hard to avoid. |
229 | 44.5k | q.clear(); |
230 | 44.5k | AddToQueue(&q, start_); |
231 | 246M | for (Workq::iterator i = q.begin(); i != q.end(); ++i) { |
232 | 246M | int id = *i; |
233 | | |
234 | 246M | Inst* ip = inst(id); |
235 | 246M | int j = ip->out(); |
236 | 246M | Inst* jp; |
237 | 247M | while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { |
238 | 551k | j = jp->out(); |
239 | 551k | } |
240 | 246M | ip->set_out(j); |
241 | 246M | AddToQueue(&q, ip->out()); |
242 | | |
243 | 246M | if (ip->opcode() == kInstAlt) { |
244 | 71.7M | j = ip->out1(); |
245 | 74.1M | while (j != 0 && (jp=inst(j))->opcode() == kInstNop) { |
246 | 2.37M | j = jp->out(); |
247 | 2.37M | } |
248 | 71.7M | ip->out1_ = j; |
249 | 71.7M | AddToQueue(&q, ip->out1()); |
250 | 71.7M | } |
251 | 246M | } |
252 | | |
253 | | // Insert kInstAltMatch instructions |
254 | | // Look for |
255 | | // ip: Alt -> j | k |
256 | | // j: ByteRange [00-FF] -> ip |
257 | | // k: Match |
258 | | // or the reverse (the above is the greedy one). |
259 | | // Rewrite Alt to AltMatch. |
260 | 44.5k | q.clear(); |
261 | 44.5k | AddToQueue(&q, start_); |
262 | 246M | for (Workq::iterator i = q.begin(); i != q.end(); ++i) { |
263 | 246M | int id = *i; |
264 | 246M | Inst* ip = inst(id); |
265 | 246M | AddToQueue(&q, ip->out()); |
266 | 246M | if (ip->opcode() == kInstAlt) |
267 | 71.7M | AddToQueue(&q, ip->out1()); |
268 | | |
269 | 246M | if (ip->opcode() == kInstAlt) { |
270 | 71.7M | Inst* j = inst(ip->out()); |
271 | 71.7M | Inst* k = inst(ip->out1()); |
272 | 71.7M | if (j->opcode() == kInstByteRange && j->out() == id && |
273 | 71.7M | j->lo() == 0x00 && j->hi() == 0xFF && |
274 | 71.7M | IsMatch(this, k)) { |
275 | 2.56k | ip->set_opcode(kInstAltMatch); |
276 | 2.56k | continue; |
277 | 2.56k | } |
278 | 71.7M | if (IsMatch(this, j) && |
279 | 71.7M | k->opcode() == kInstByteRange && k->out() == id && |
280 | 71.7M | k->lo() == 0x00 && k->hi() == 0xFF) { |
281 | 399 | ip->set_opcode(kInstAltMatch); |
282 | 399 | } |
283 | 71.7M | } |
284 | 246M | } |
285 | 44.5k | } |
286 | | |
287 | 8.43M | uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) { |
288 | 8.43M | int flags = 0; |
289 | | |
290 | | // ^ and \A |
291 | 8.43M | if (p == text.data()) |
292 | 216k | flags |= kEmptyBeginText | kEmptyBeginLine; |
293 | 8.22M | else if (p[-1] == '\n') |
294 | 8.16k | flags |= kEmptyBeginLine; |
295 | | |
296 | | // $ and \z |
297 | 8.43M | if (p == text.data() + text.size()) |
298 | 171k | flags |= kEmptyEndText | kEmptyEndLine; |
299 | 8.26M | else if (p < text.data() + text.size() && p[0] == '\n') |
300 | 11.6k | flags |= kEmptyEndLine; |
301 | | |
302 | | // \b and \B |
303 | 8.43M | if (p == text.data() && p == text.data() + text.size()) { |
304 | | // no word boundary here |
305 | 8.43M | } else if (p == text.data()) { |
306 | 216k | if (IsWordChar(p[0])) |
307 | 4.22k | flags |= kEmptyWordBoundary; |
308 | 8.22M | } else if (p == text.data() + text.size()) { |
309 | 171k | if (IsWordChar(p[-1])) |
310 | 45.2k | flags |= kEmptyWordBoundary; |
311 | 8.05M | } else { |
312 | 8.05M | if (IsWordChar(p[-1]) != IsWordChar(p[0])) |
313 | 2.46M | flags |= kEmptyWordBoundary; |
314 | 8.05M | } |
315 | 8.43M | if (!(flags & kEmptyWordBoundary)) |
316 | 5.92M | flags |= kEmptyNonWordBoundary; |
317 | | |
318 | 8.43M | return flags; |
319 | 8.43M | } |
320 | | |
321 | | // ByteMapBuilder implements a coloring algorithm. |
322 | | // |
323 | | // The first phase is a series of "mark and merge" batches: we mark one or more |
324 | | // [lo-hi] ranges, then merge them into our internal state. Batching is not for |
325 | | // performance; rather, it means that the ranges are treated indistinguishably. |
326 | | // |
327 | | // Internally, the ranges are represented using a bitmap that stores the splits |
328 | | // and a vector that stores the colors; both of them are indexed by the ranges' |
329 | | // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at |
330 | | // hi (if not already split), then recolor each range in between. The color map |
331 | | // (i.e. from the old color to the new color) is maintained for the lifetime of |
332 | | // the batch and so underpins this somewhat obscure approach to set operations. |
333 | | // |
334 | | // The second phase builds the bytemap from our internal state: we recolor each |
335 | | // range, then store the new color (which is now the byte class) in each of the |
336 | | // corresponding array elements. Finally, we output the number of byte classes. |
337 | | class ByteMapBuilder { |
338 | | public: |
339 | 44.5k | ByteMapBuilder() { |
340 | | // Initial state: the [0-255] range has color 256. |
341 | | // This will avoid problems during the second phase, |
342 | | // in which we assign byte classes numbered from 0. |
343 | 44.5k | splits_.Set(255); |
344 | 44.5k | colors_[255] = 256; |
345 | 44.5k | nextcolor_ = 257; |
346 | 44.5k | } |
347 | | |
348 | | void Mark(int lo, int hi); |
349 | | void Merge(); |
350 | | void Build(uint8_t* bytemap, int* bytemap_range); |
351 | | |
352 | | private: |
353 | | int Recolor(int oldcolor); |
354 | | |
355 | | Bitmap256 splits_; |
356 | | int colors_[256]; |
357 | | int nextcolor_; |
358 | | std::vector<std::pair<int, int>> colormap_; |
359 | | std::vector<std::pair<int, int>> ranges_; |
360 | | |
361 | | ByteMapBuilder(const ByteMapBuilder&) = delete; |
362 | | ByteMapBuilder& operator=(const ByteMapBuilder&) = delete; |
363 | | }; |
364 | | |
365 | 181M | void ByteMapBuilder::Mark(int lo, int hi) { |
366 | 181M | DCHECK_GE(lo, 0); |
367 | 181M | DCHECK_GE(hi, 0); |
368 | 181M | DCHECK_LE(lo, 255); |
369 | 181M | DCHECK_LE(hi, 255); |
370 | 181M | DCHECK_LE(lo, hi); |
371 | | |
372 | | // Ignore any [0-255] ranges. They cause us to recolor every range, which |
373 | | // has no effect on the eventual result and is therefore a waste of time. |
374 | 181M | if (lo == 0 && hi == 255) |
375 | 2.12M | return; |
376 | | |
377 | 179M | ranges_.emplace_back(lo, hi); |
378 | 179M | } |
379 | | |
380 | 144M | void ByteMapBuilder::Merge() { |
381 | 144M | for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin(); |
382 | 323M | it != ranges_.end(); |
383 | 179M | ++it) { |
384 | 179M | int lo = it->first-1; |
385 | 179M | int hi = it->second; |
386 | | |
387 | 179M | if (0 <= lo && !splits_.Test(lo)) { |
388 | 690k | splits_.Set(lo); |
389 | 690k | int next = splits_.FindNextSetBit(lo+1); |
390 | 690k | colors_[lo] = colors_[next]; |
391 | 690k | } |
392 | 179M | if (!splits_.Test(hi)) { |
393 | 768k | splits_.Set(hi); |
394 | 768k | int next = splits_.FindNextSetBit(hi+1); |
395 | 768k | colors_[hi] = colors_[next]; |
396 | 768k | } |
397 | | |
398 | 179M | int c = lo+1; |
399 | 659M | while (c < 256) { |
400 | 659M | int next = splits_.FindNextSetBit(c); |
401 | 659M | colors_[next] = Recolor(colors_[next]); |
402 | 659M | if (next == hi) |
403 | 179M | break; |
404 | 480M | c = next+1; |
405 | 480M | } |
406 | 179M | } |
407 | 144M | colormap_.clear(); |
408 | 144M | ranges_.clear(); |
409 | 144M | } |
410 | | |
411 | 44.5k | void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) { |
412 | | // Assign byte classes numbered from 0. |
413 | 44.5k | nextcolor_ = 0; |
414 | | |
415 | 44.5k | int c = 0; |
416 | 1.54M | while (c < 256) { |
417 | 1.50M | int next = splits_.FindNextSetBit(c); |
418 | 1.50M | uint8_t b = static_cast<uint8_t>(Recolor(colors_[next])); |
419 | 12.9M | while (c <= next) { |
420 | 11.4M | bytemap[c] = b; |
421 | 11.4M | c++; |
422 | 11.4M | } |
423 | 1.50M | } |
424 | | |
425 | 44.5k | *bytemap_range = nextcolor_; |
426 | 44.5k | } |
427 | | |
428 | 661M | int ByteMapBuilder::Recolor(int oldcolor) { |
429 | | // Yes, this is a linear search. There can be at most 256 |
430 | | // colors and there will typically be far fewer than that. |
431 | | // Also, we need to consider keys *and* values in order to |
432 | | // avoid recoloring a given range more than once per batch. |
433 | 661M | std::vector<std::pair<int, int>>::const_iterator it = |
434 | 661M | std::find_if(colormap_.begin(), colormap_.end(), |
435 | 9.15G | [=](const std::pair<int, int>& kv) -> bool { |
436 | 9.15G | return kv.first == oldcolor || kv.second == oldcolor; |
437 | 9.15G | }); |
438 | 661M | if (it != colormap_.end()) |
439 | 71.2M | return it->second; |
440 | 590M | int newcolor = nextcolor_; |
441 | 590M | nextcolor_++; |
442 | 590M | colormap_.emplace_back(oldcolor, newcolor); |
443 | 590M | return newcolor; |
444 | 661M | } |
445 | | |
446 | 44.5k | void Prog::ComputeByteMap() { |
447 | | // Fill in bytemap with byte classes for the program. |
448 | | // Ranges of bytes that are treated indistinguishably |
449 | | // will be mapped to a single byte class. |
450 | 44.5k | ByteMapBuilder builder; |
451 | | |
452 | | // Don't repeat the work for ^ and $. |
453 | 44.5k | bool marked_line_boundaries = false; |
454 | | // Don't repeat the work for \b and \B. |
455 | 44.5k | bool marked_word_boundaries = false; |
456 | | |
457 | 215M | for (int id = 0; id < size(); id++) { |
458 | 215M | Inst* ip = inst(id); |
459 | 215M | if (ip->opcode() == kInstByteRange) { |
460 | 163M | int lo = ip->lo(); |
461 | 163M | int hi = ip->hi(); |
462 | 163M | builder.Mark(lo, hi); |
463 | 163M | if (ip->foldcase() && lo <= 'z' && hi >= 'a') { |
464 | 18.2M | int foldlo = lo; |
465 | 18.2M | int foldhi = hi; |
466 | 18.2M | if (foldlo < 'a') |
467 | 32.0k | foldlo = 'a'; |
468 | 18.2M | if (foldhi > 'z') |
469 | 67.4k | foldhi = 'z'; |
470 | 18.2M | if (foldlo <= foldhi) { |
471 | 18.2M | foldlo += 'A' - 'a'; |
472 | 18.2M | foldhi += 'A' - 'a'; |
473 | 18.2M | builder.Mark(foldlo, foldhi); |
474 | 18.2M | } |
475 | 18.2M | } |
476 | | // If this Inst is not the last Inst in its list AND the next Inst is |
477 | | // also a ByteRange AND the Insts have the same out, defer the merge. |
478 | 163M | if (!ip->last() && |
479 | 163M | inst(id+1)->opcode() == kInstByteRange && |
480 | 163M | ip->out() == inst(id+1)->out()) |
481 | 18.8M | continue; |
482 | 144M | builder.Merge(); |
483 | 144M | } else if (ip->opcode() == kInstEmptyWidth) { |
484 | 5.27M | if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) && |
485 | 5.27M | !marked_line_boundaries) { |
486 | 2.27k | builder.Mark('\n', '\n'); |
487 | 2.27k | builder.Merge(); |
488 | 2.27k | marked_line_boundaries = true; |
489 | 2.27k | } |
490 | 5.27M | if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) && |
491 | 5.27M | !marked_word_boundaries) { |
492 | | // We require two batches here: the first for ranges that are word |
493 | | // characters, the second for ranges that are not word characters. |
494 | 4.15k | for (bool isword : {true, false}) { |
495 | 4.15k | int j; |
496 | 41.5k | for (int i = 0; i < 256; i = j) { |
497 | 1.06M | for (j = i + 1; j < 256 && |
498 | 1.06M | Prog::IsWordChar(static_cast<uint8_t>(i)) == |
499 | 1.05M | Prog::IsWordChar(static_cast<uint8_t>(j)); |
500 | 1.02M | j++) |
501 | 1.02M | ; |
502 | 37.4k | if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword) |
503 | 18.7k | builder.Mark(i, j - 1); |
504 | 37.4k | } |
505 | 4.15k | builder.Merge(); |
506 | 4.15k | } |
507 | 2.07k | marked_word_boundaries = true; |
508 | 2.07k | } |
509 | 5.27M | } |
510 | 215M | } |
511 | | |
512 | 44.5k | builder.Build(bytemap_, &bytemap_range_); |
513 | | |
514 | 44.5k | if ((0)) { // For debugging, use trivial bytemap. |
515 | 0 | LOG(ERROR) << "Using trivial bytemap."; |
516 | 0 | for (int i = 0; i < 256; i++) |
517 | 0 | bytemap_[i] = static_cast<uint8_t>(i); |
518 | 0 | bytemap_range_ = 256; |
519 | 0 | } |
520 | 44.5k | } |
521 | | |
522 | | // Prog::Flatten() implements a graph rewriting algorithm. |
523 | | // |
524 | | // The overall process is similar to epsilon removal, but retains some epsilon |
525 | | // transitions: those from Capture and EmptyWidth instructions; and those from |
526 | | // nullable subexpressions. (The latter avoids quadratic blowup in transitions |
527 | | // in the worst case.) It might be best thought of as Alt instruction elision. |
528 | | // |
529 | | // In conceptual terms, it divides the Prog into "trees" of instructions, then |
530 | | // traverses the "trees" in order to produce "lists" of instructions. A "tree" |
531 | | // is one or more instructions that grow from one "root" instruction to one or |
532 | | // more "leaf" instructions; if a "tree" has exactly one instruction, then the |
533 | | // "root" is also the "leaf". In most cases, a "root" is the successor of some |
534 | | // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction) |
535 | | // and is considered a "successor root". A "leaf" can be a ByteRange, Capture, |
536 | | // EmptyWidth or Match instruction. However, this is insufficient for handling |
537 | | // nested nullable subexpressions correctly, so in some cases, a "root" is the |
538 | | // dominator of the instructions reachable from some "successor root" (i.e. it |
539 | | // has an unreachable predecessor) and is considered a "dominator root". Since |
540 | | // only Alt instructions can be "dominator roots" (other instructions would be |
541 | | // "leaves"), only Alt instructions are required to be marked as predecessors. |
542 | | // |
543 | | // Dividing the Prog into "trees" comprises two passes: marking the "successor |
544 | | // roots" and the predecessors; and marking the "dominator roots". Sorting the |
545 | | // "successor roots" by their bytecode offsets enables iteration in order from |
546 | | // greatest to least during the second pass; by working backwards in this case |
547 | | // and flooding the graph no further than "leaves" and already marked "roots", |
548 | | // it becomes possible to mark "dominator roots" without doing excessive work. |
549 | | // |
550 | | // Traversing the "trees" is just iterating over the "roots" in order of their |
551 | | // marking and flooding the graph no further than "leaves" and "roots". When a |
552 | | // "leaf" is reached, the instruction is copied with its successor remapped to |
553 | | // its "root" number. When a "root" is reached, a Nop instruction is generated |
554 | | // with its successor remapped similarly. As each "list" is produced, its last |
555 | | // instruction is marked as such. After all of the "lists" have been produced, |
556 | | // a pass over their instructions remaps their successors to bytecode offsets. |
557 | 44.5k | void Prog::Flatten() { |
558 | 44.5k | if (did_flatten_) |
559 | 0 | return; |
560 | 44.5k | did_flatten_ = true; |
561 | | |
562 | | // Scratch structures. It's important that these are reused by functions |
563 | | // that we call in loops because they would thrash the heap otherwise. |
564 | 44.5k | SparseSet reachable(size()); |
565 | 44.5k | std::vector<int> stk; |
566 | 44.5k | stk.reserve(size()); |
567 | | |
568 | | // First pass: Marks "successor roots" and predecessors. |
569 | | // Builds the mapping from inst-ids to root-ids. |
570 | 44.5k | SparseArray<int> rootmap(size()); |
571 | 44.5k | SparseArray<int> predmap(size()); |
572 | 44.5k | std::vector<std::vector<int>> predvec; |
573 | 44.5k | MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk); |
574 | | |
575 | | // Second pass: Marks "dominator roots". |
576 | 44.5k | SparseArray<int> sorted(rootmap); |
577 | 44.5k | std::sort(sorted.begin(), sorted.end(), sorted.less); |
578 | 44.5k | for (SparseArray<int>::const_iterator i = sorted.end() - 1; |
579 | 125M | i != sorted.begin(); |
580 | 125M | --i) { |
581 | 125M | if (i->index() != start_unanchored() && i->index() != start()) |
582 | 124M | MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk); |
583 | 125M | } |
584 | | |
585 | | // Third pass: Emits "lists". Remaps outs to root-ids. |
586 | | // Builds the mapping from root-ids to flat-ids. |
587 | 44.5k | std::vector<int> flatmap(rootmap.size()); |
588 | 44.5k | std::vector<Inst> flat; |
589 | 44.5k | flat.reserve(size()); |
590 | 44.5k | for (SparseArray<int>::const_iterator i = rootmap.begin(); |
591 | 130M | i != rootmap.end(); |
592 | 130M | ++i) { |
593 | 130M | flatmap[i->value()] = static_cast<int>(flat.size()); |
594 | 130M | EmitList(i->index(), &rootmap, &flat, &reachable, &stk); |
595 | 130M | flat.back().set_last(); |
596 | | // We have the bounds of the "list", so this is the |
597 | | // most convenient point at which to compute hints. |
598 | 130M | ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size())); |
599 | 130M | } |
600 | | |
601 | 44.5k | list_count_ = static_cast<int>(flatmap.size()); |
602 | 401k | for (int i = 0; i < kNumInst; i++) |
603 | 356k | inst_count_[i] = 0; |
604 | | |
605 | | // Fourth pass: Remaps outs to flat-ids. |
606 | | // Counts instructions by opcode. |
607 | 215M | for (int id = 0; id < static_cast<int>(flat.size()); id++) { |
608 | 215M | Inst* ip = &flat[id]; |
609 | 215M | if (ip->opcode() != kInstAltMatch) // handled in EmitList() |
610 | 215M | ip->set_out(flatmap[ip->out()]); |
611 | 215M | inst_count_[ip->opcode()]++; |
612 | 215M | } |
613 | | |
614 | | #if !defined(NDEBUG) |
615 | | // Address a `-Wunused-but-set-variable' warning from Clang 13.x. |
616 | | size_t total = 0; |
617 | | for (int i = 0; i < kNumInst; i++) |
618 | | total += inst_count_[i]; |
619 | | CHECK_EQ(total, flat.size()); |
620 | | #endif |
621 | | |
622 | | // Remap start_unanchored and start. |
623 | 44.5k | if (start_unanchored() == 0) { |
624 | 1.06k | DCHECK_EQ(start(), 0); |
625 | 43.5k | } else if (start_unanchored() == start()) { |
626 | 1.03k | set_start_unanchored(flatmap[1]); |
627 | 1.03k | set_start(flatmap[1]); |
628 | 42.4k | } else { |
629 | 42.4k | set_start_unanchored(flatmap[1]); |
630 | 42.4k | set_start(flatmap[2]); |
631 | 42.4k | } |
632 | | |
633 | | // Finally, replace the old instructions with the new instructions. |
634 | 44.5k | size_ = static_cast<int>(flat.size()); |
635 | 44.5k | inst_ = PODArray<Inst>(size_); |
636 | 44.5k | memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]); |
637 | | |
638 | | // Populate the list heads for BitState. |
639 | | // 512 instructions limits the memory footprint to 1KiB. |
640 | 44.5k | if (size_ <= 512) { |
641 | 35.0k | list_heads_ = PODArray<uint16_t>(size_); |
642 | | // 0xFF makes it more obvious if we try to look up a non-head. |
643 | 35.0k | memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]); |
644 | 1.51M | for (int i = 0; i < list_count_; ++i) |
645 | 1.47M | list_heads_[flatmap[i]] = i; |
646 | 35.0k | } |
647 | | |
648 | | // BitState allocates a bitmap of size list_count_ * (text.size()+1) |
649 | | // for tracking pairs of possibilities that it has already explored. |
650 | 44.5k | const size_t kBitStateBitmapMaxSize = 256*1024; // max size in bits |
651 | 44.5k | bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1; |
652 | 44.5k | } |
653 | | |
654 | | void Prog::MarkSuccessors(SparseArray<int>* rootmap, |
655 | | SparseArray<int>* predmap, |
656 | | std::vector<std::vector<int>>* predvec, |
657 | 44.5k | SparseSet* reachable, std::vector<int>* stk) { |
658 | | // Mark the kInstFail instruction. |
659 | 44.5k | rootmap->set_new(0, rootmap->size()); |
660 | | |
661 | | // Mark the start_unanchored and start instructions. |
662 | 44.5k | if (!rootmap->has_index(start_unanchored())) |
663 | 43.5k | rootmap->set_new(start_unanchored(), rootmap->size()); |
664 | 44.5k | if (!rootmap->has_index(start())) |
665 | 42.4k | rootmap->set_new(start(), rootmap->size()); |
666 | | |
667 | 44.5k | reachable->clear(); |
668 | 44.5k | stk->clear(); |
669 | 44.5k | stk->push_back(start_unanchored()); |
670 | 71.9M | while (!stk->empty()) { |
671 | 71.8M | int id = stk->back(); |
672 | 71.8M | stk->pop_back(); |
673 | 318M | Loop: |
674 | 318M | if (reachable->contains(id)) |
675 | 71.8M | continue; |
676 | 246M | reachable->insert_new(id); |
677 | | |
678 | 246M | Inst* ip = inst(id); |
679 | 246M | switch (ip->opcode()) { |
680 | 0 | default: |
681 | 0 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
682 | 0 | break; |
683 | | |
684 | 2.96k | case kInstAltMatch: |
685 | 71.8M | case kInstAlt: |
686 | | // Mark this instruction as a predecessor of each out. |
687 | 143M | for (int out : {ip->out(), ip->out1()}) { |
688 | 143M | if (!predmap->has_index(out)) { |
689 | 128M | predmap->set_new(out, static_cast<int>(predvec->size())); |
690 | 128M | predvec->emplace_back(); |
691 | 128M | } |
692 | 143M | (*predvec)[predmap->get_existing(out)].emplace_back(id); |
693 | 143M | } |
694 | 71.8M | stk->push_back(ip->out1()); |
695 | 71.8M | id = ip->out(); |
696 | 71.8M | goto Loop; |
697 | | |
698 | 161M | case kInstByteRange: |
699 | 169M | case kInstCapture: |
700 | 174M | case kInstEmptyWidth: |
701 | | // Mark the out of this instruction as a "root". |
702 | 174M | if (!rootmap->has_index(ip->out())) |
703 | 124M | rootmap->set_new(ip->out(), rootmap->size()); |
704 | 174M | id = ip->out(); |
705 | 174M | goto Loop; |
706 | | |
707 | 66 | case kInstNop: |
708 | 66 | id = ip->out(); |
709 | 66 | goto Loop; |
710 | | |
711 | 43.5k | case kInstMatch: |
712 | 45.3k | case kInstFail: |
713 | 45.3k | break; |
714 | 246M | } |
715 | 246M | } |
716 | 44.5k | } |
717 | | |
718 | | void Prog::MarkDominator(int root, SparseArray<int>* rootmap, |
719 | | SparseArray<int>* predmap, |
720 | | std::vector<std::vector<int>>* predvec, |
721 | 124M | SparseSet* reachable, std::vector<int>* stk) { |
722 | 124M | reachable->clear(); |
723 | 124M | stk->clear(); |
724 | 124M | stk->push_back(root); |
725 | 329M | while (!stk->empty()) { |
726 | 204M | int id = stk->back(); |
727 | 204M | stk->pop_back(); |
728 | 283M | Loop: |
729 | 283M | if (reachable->contains(id)) |
730 | 4.41M | continue; |
731 | 279M | reachable->insert_new(id); |
732 | | |
733 | 279M | if (id != root && rootmap->has_index(id)) { |
734 | | // We reached another "tree" via epsilon transition. |
735 | 23.8M | continue; |
736 | 23.8M | } |
737 | | |
738 | 255M | Inst* ip = inst(id); |
739 | 255M | switch (ip->opcode()) { |
740 | 0 | default: |
741 | 0 | LOG(DFATAL) << "unhandled opcode: " << ip->opcode(); |
742 | 0 | break; |
743 | | |
744 | 2.96k | case kInstAltMatch: |
745 | 79.3M | case kInstAlt: |
746 | 79.3M | stk->push_back(ip->out1()); |
747 | 79.3M | id = ip->out(); |
748 | 79.3M | goto Loop; |
749 | | |
750 | 162M | case kInstByteRange: |
751 | 170M | case kInstCapture: |
752 | 176M | case kInstEmptyWidth: |
753 | 176M | break; |
754 | | |
755 | 0 | case kInstNop: |
756 | 0 | id = ip->out(); |
757 | 0 | goto Loop; |
758 | | |
759 | 48.8k | case kInstMatch: |
760 | 48.8k | case kInstFail: |
761 | 48.8k | break; |
762 | 255M | } |
763 | 255M | } |
764 | | |
765 | 124M | for (SparseSet::const_iterator i = reachable->begin(); |
766 | 404M | i != reachable->end(); |
767 | 279M | ++i) { |
768 | 279M | int id = *i; |
769 | 279M | if (predmap->has_index(id)) { |
770 | 1.75G | for (int pred : (*predvec)[predmap->get_existing(id)]) { |
771 | 1.75G | if (!reachable->contains(pred)) { |
772 | | // id has a predecessor that cannot be reached from root! |
773 | | // Therefore, id must be a "root" too - mark it as such. |
774 | 1.59G | if (!rootmap->has_index(id)) |
775 | 5.30M | rootmap->set_new(id, rootmap->size()); |
776 | 1.59G | } |
777 | 1.75G | } |
778 | 161M | } |
779 | 279M | } |
780 | 124M | } |
781 | | |
// Flattens the instruction subgraph rooted at root, appending one copy of
// each reachable instruction to flat. An epsilon transition into another
// "tree" (an id present in rootmap) is emitted as a kInstNop that targets
// that tree's root index, keeping the output linear in size overall.
// reachable and stk are caller-provided scratch space so that repeated
// calls can reuse their allocations.
void Prog::EmitList(int root, SparseArray<int>* rootmap,
                    std::vector<Inst>* flat,
                    SparseSet* reachable, std::vector<int>* stk) {
  reachable->clear();
  stk->clear();
  stk->push_back(root);
  // Iterative DFS. The Loop label lets opcodes with a single interesting
  // successor chase out() directly instead of round-tripping via the stack.
  while (!stk->empty()) {
    int id = stk->back();
    stk->pop_back();
  Loop:
    if (reachable->contains(id))
      continue;
    reachable->insert_new(id);

    if (id != root && rootmap->has_index(id)) {
      // We reached another "tree" via epsilon transition. Emit a kInstNop
      // instruction so that the Prog does not become quadratically larger.
      flat->emplace_back();
      flat->back().set_opcode(kInstNop);
      flat->back().set_out(rootmap->get_existing(id));
      continue;
    }

    Inst* ip = inst(id);
    switch (ip->opcode()) {
      default:
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
        break;

      case kInstAltMatch:
        // Emit the kInstAltMatch with out/out1 set to the next two indices
        // of flat, then fall through to the kInstAlt logic to visit both
        // successors of the original instruction.
        flat->emplace_back();
        flat->back().set_opcode(kInstAltMatch);
        flat->back().set_out(static_cast<int>(flat->size()));
        flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
        FALLTHROUGH_INTENDED;

      case kInstAlt:
        // Alt emits nothing itself: defer out1() to the stack and chase
        // out() immediately.
        stk->push_back(ip->out1());
        id = ip->out();
        goto Loop;

      case kInstByteRange:
      case kInstCapture:
      case kInstEmptyWidth:
        // Copy the instruction wholesale (memmove copies all union fields),
        // then retarget out() from the old instruction id to the flattened
        // root index recorded in rootmap.
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        flat->back().set_out(rootmap->get_existing(ip->out()));
        break;

      case kInstNop:
        // Nops in the input are elided; just follow the transition.
        id = ip->out();
        goto Loop;

      case kInstMatch:
      case kInstFail:
        // No out() to rewrite for these; copy as-is.
        flat->emplace_back();
        memmove(&flat->back(), ip, sizeof *ip);
        break;
    }
  }
}
843 | | |
844 | | // For each ByteRange instruction in [begin, end), computes a hint to execution |
845 | | // engines: the delta to the next instruction (in flat) worth exploring iff the |
846 | | // current instruction matched. |
847 | | // |
848 | | // Implements a coloring algorithm related to ByteMapBuilder, but in this case, |
849 | | // colors are instructions and recoloring ranges precisely identifies conflicts |
850 | | // between instructions. Iterating backwards over [begin, end) is guaranteed to |
851 | | // identify the nearest conflict (if any) with only linear complexity. |
852 | 130M | void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) { |
853 | 130M | Bitmap256 splits; |
854 | 130M | int colors[256]; |
855 | | |
856 | 130M | bool dirty = false; |
857 | 476M | for (int id = end; id >= begin; --id) { |
858 | 345M | if (id == end || |
859 | 345M | (*flat)[id].opcode() != kInstByteRange) { |
860 | 182M | if (dirty) { |
861 | 8.06M | dirty = false; |
862 | 8.06M | splits.Clear(); |
863 | 8.06M | } |
864 | 182M | splits.Set(255); |
865 | 182M | colors[255] = id; |
866 | | // At this point, the [0-255] range is colored with id. |
867 | | // Thus, hints cannot point beyond id; and if id == end, |
868 | | // hints that would have pointed to id will be 0 instead. |
869 | 182M | continue; |
870 | 182M | } |
871 | 163M | dirty = true; |
872 | | |
873 | | // We recolor the [lo-hi] range with id. Note that first ratchets backwards |
874 | | // from end to the nearest conflict (if any) during recoloring. |
875 | 163M | int first = end; |
876 | 181M | auto Recolor = [&](int lo, int hi) { |
877 | | // Like ByteMapBuilder, we split at lo-1 and at hi. |
878 | 181M | --lo; |
879 | | |
880 | 181M | if (0 <= lo && !splits.Test(lo)) { |
881 | 169M | splits.Set(lo); |
882 | 169M | int next = splits.FindNextSetBit(lo+1); |
883 | 169M | colors[lo] = colors[next]; |
884 | 169M | } |
885 | 181M | if (!splits.Test(hi)) { |
886 | 153M | splits.Set(hi); |
887 | 153M | int next = splits.FindNextSetBit(hi+1); |
888 | 153M | colors[hi] = colors[next]; |
889 | 153M | } |
890 | | |
891 | 181M | int c = lo+1; |
892 | 183M | while (c < 256) { |
893 | 183M | int next = splits.FindNextSetBit(c); |
894 | | // Ratchet backwards... |
895 | 183M | first = std::min(first, colors[next]); |
896 | | // Recolor with id - because it's the new nearest conflict! |
897 | 183M | colors[next] = id; |
898 | 183M | if (next == hi) |
899 | 181M | break; |
900 | 1.88M | c = next+1; |
901 | 1.88M | } |
902 | 181M | }; |
903 | | |
904 | 163M | Inst* ip = &(*flat)[id]; |
905 | 163M | int lo = ip->lo(); |
906 | 163M | int hi = ip->hi(); |
907 | 163M | Recolor(lo, hi); |
908 | 163M | if (ip->foldcase() && lo <= 'z' && hi >= 'a') { |
909 | 18.2M | int foldlo = lo; |
910 | 18.2M | int foldhi = hi; |
911 | 18.2M | if (foldlo < 'a') |
912 | 32.0k | foldlo = 'a'; |
913 | 18.2M | if (foldhi > 'z') |
914 | 67.4k | foldhi = 'z'; |
915 | 18.2M | if (foldlo <= foldhi) { |
916 | 18.2M | foldlo += 'A' - 'a'; |
917 | 18.2M | foldhi += 'A' - 'a'; |
918 | 18.2M | Recolor(foldlo, foldhi); |
919 | 18.2M | } |
920 | 18.2M | } |
921 | | |
922 | 163M | if (first != end) { |
923 | 17.0M | uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767)); |
924 | 17.0M | ip->hint_foldcase_ |= hint<<1; |
925 | 17.0M | } |
926 | 163M | } |
927 | 130M | } |
928 | | |
// The final state will always be this, which frees up a register for the hot
// loop and thus avoids the spilling that can occur when building with Clang.
static const size_t kShiftDFAFinal = 9;

// Builds the 256-entry transition table for a "Shift DFA" that performs an
// unanchored, ASCII-case-insensitive search for prefix. Takes the prefix by
// value (rather than const std::string&) because it scribbles over it while
// deduplicating its bytes, so handing over a temporary is convenient. The
// returned table is allocated with new[] and owned by the caller.
static uint64_t* BuildShiftDFA(std::string prefix) {
  // Remember the original length now; the prefix gets clobbered below, but
  // we still need to know how long it was initially.
  const size_t len = prefix.size();

  // NFA construction (credits: the Hyperscan paper by Geoff Langdale et
  // al.): reach[b] is the bitfield of NFA states enterable on input byte b.
  // For this specific purpose, the successor candidates of a current-state
  // bitfield ncurr are always ((ncurr << 1) | 1); intersecting with reach[b]
  // yields the bitfield of states actually reached by stepping over b.
  uint16_t reach[256]{};
  for (size_t i = 0; i < len; ++i)
    reach[static_cast<uint8_t>(prefix[i])] |= 1 << (i+1);
  // Bit 0 is the `\C*?` self-loop that makes the search unanchored.
  for (int b = 0; b < 256; ++b)
    reach[b] |= 1;

  // dstates maps DFA state number -> NFA state bitfield; the reverse lookup
  // when recording transitions is a plain linear search. The "Shift DFA"
  // technique caps us at ten states per uint64_t, so allowing for the
  // initial state we use at most nine bytes of the prefix - which is also
  // why uint16_t suffices for the NFA bitfields.
  uint16_t dstates[kShiftDFAFinal+1]{};
  dstates[0] = 1;
  for (size_t d = 0; d < len; ++d) {
    uint8_t b = prefix[d];
    uint16_t stepped = reach[b] & ((dstates[d] << 1) | 1);
    size_t succ = d+1 == len ? kShiftDFAFinal : d+1;
    dstates[succ] = stepped;
  }

  // Deduplicate the prefix bytes so transition recording handles each byte
  // exactly once. This clobbers the prefix, but it's no longer needed.
  std::sort(prefix.begin(), prefix.end());
  prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());

  // DFA construction (credits: "Shift-based DFAs" on GitHub by Per
  // Vognsen): each dfa[b] is effectively a packed array of uint6_t
  // successors, each pre-multiplied by six so the hot loop needs no extra
  // masking or multiplication of its own.
  uint64_t* dfa = new uint64_t[256]{};
  // For every state, record where each distinct prefix byte leads; bytes
  // without a recorded transition implicitly return to the initial state.
  for (size_t d = 0; d < len; ++d) {
    for (uint8_t b : prefix) {
      uint16_t stepped = reach[b] & ((dstates[d] << 1) | 1);
      size_t succ = 0;
      while (dstates[succ] != stepped)
        ++succ;
      uint64_t packed = static_cast<uint64_t>(succ * 6) << (d * 6);
      dfa[b] |= packed;
      // Mirror each ASCII letter's transition onto its uppercase twin. The
      // parser normalises ASCII letters to lowercase, so only a-z can occur
      // here. #FunFact: 'k' and 's' match U+212A and U+017F, respectively,
      // so they won't occur here when using UTF-8 encoding because the
      // parser will emit character classes.
      if ('a' <= b && b <= 'z')
        dfa[b - ('a' - 'A')] |= packed;
    }
  }
  // Make the final state absorbing, which matters for performance: the hot
  // loop only tests for a match at the end of each iteration, so the match
  // signal must persist until we get around to checking it.
  for (int b = 0; b < 256; ++b)
    dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);

  return dfa;
}
1013 | | |
1014 | | void Prog::ConfigurePrefixAccel(const std::string& prefix, |
1015 | 19.1k | bool prefix_foldcase) { |
1016 | 19.1k | prefix_foldcase_ = prefix_foldcase; |
1017 | 19.1k | prefix_size_ = prefix.size(); |
1018 | 19.1k | if (prefix_foldcase_) { |
1019 | | // Use PrefixAccel_ShiftDFA(). |
1020 | | // ... and no more than nine bytes of the prefix. (See above for details.) |
1021 | 13.2k | prefix_size_ = std::min(prefix_size_, kShiftDFAFinal); |
1022 | 13.2k | prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_)); |
1023 | 13.2k | } else if (prefix_size_ != 1) { |
1024 | | // Use PrefixAccel_FrontAndBack(). |
1025 | 4.80k | prefix_front_ = prefix.front(); |
1026 | 4.80k | prefix_back_ = prefix.back(); |
1027 | 4.80k | } else { |
1028 | | // Use memchr(3). |
1029 | 1.15k | prefix_front_ = prefix.front(); |
1030 | 1.15k | } |
1031 | 19.1k | } |
1032 | | |
// Runs the precomputed "Shift DFA" (see BuildShiftDFA()) over data. curr
// holds the current state as a bit offset that is a multiple of six;
// prefix_dfa_[b] >> (curr & 63) brings that state's packed successor into
// the low six bits. Reaching offset kShiftDFAFinal*6 means the prefix just
// ended; returns a pointer to where the prefix began, or NULL if absent.
const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {
  if (size < prefix_size_)
    return NULL;

  uint64_t curr = 0;

  // At the time of writing, rough benchmarks on a Broadwell machine showed
  // that this unroll factor (i.e. eight) achieves a speedup factor of two.
  if (size >= 8) {
    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
    const uint8_t* endp = p + (size&~7);
    do {
      // Load the eight bytes and their DFA rows up front, so the serial
      // dependency chain below is just the eight shifts.
      uint8_t b0 = p[0];
      uint8_t b1 = p[1];
      uint8_t b2 = p[2];
      uint8_t b3 = p[3];
      uint8_t b4 = p[4];
      uint8_t b5 = p[5];
      uint8_t b6 = p[6];
      uint8_t b7 = p[7];

      uint64_t next0 = prefix_dfa_[b0];
      uint64_t next1 = prefix_dfa_[b1];
      uint64_t next2 = prefix_dfa_[b2];
      uint64_t next3 = prefix_dfa_[b3];
      uint64_t next4 = prefix_dfa_[b4];
      uint64_t next5 = prefix_dfa_[b5];
      uint64_t next6 = prefix_dfa_[b6];
      uint64_t next7 = prefix_dfa_[b7];

      // Each step shifts the next row by the current state's bit offset;
      // the low six bits of the result are the new state's offset.
      uint64_t curr0 = next0 >> (curr & 63);
      uint64_t curr1 = next1 >> (curr0 & 63);
      uint64_t curr2 = next2 >> (curr1 & 63);
      uint64_t curr3 = next3 >> (curr2 & 63);
      uint64_t curr4 = next4 >> (curr3 & 63);
      uint64_t curr5 = next5 >> (curr4 & 63);
      uint64_t curr6 = next6 >> (curr5 & 63);
      uint64_t curr7 = next7 >> (curr6 & 63);

      // The final state is absorbing (see BuildShiftDFA()), so checking
      // only curr7 catches a match reached at any of the eight steps; the
      // comparisons below then pinpoint which step first reached it.
      if ((curr7 & 63) == kShiftDFAFinal * 6) {
        // At the time of writing, using the same masking subexpressions from
        // the preceding lines caused Clang to clutter the hot loop computing
        // them - even though they aren't actually needed for shifting! Hence
        // these rewritten conditions, which achieve a speedup factor of two.
        if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;
        if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
        if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
        if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
        if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
        if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
        if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
        if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;
      }

      curr = curr7;
      p += 8;
    } while (p != endp);
    // Fall through to the scalar loop for the remaining (size % 8) bytes.
    data = p;
    size = size&7;
  }

  // Scalar tail: one DFA step per byte.
  const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
  const uint8_t* endp = p + size;
  while (p != endp) {
    uint8_t b = *p++;
    uint64_t next = prefix_dfa_[b];
    curr = next >> (curr & 63);
    if ((curr & 63) == kShiftDFAFinal * 6)
      return p-prefix_size_;
  }
  return NULL;
}
1105 | | |
#if defined(__AVX2__)
// Finds the least significant non-zero bit in n.
// Precondition: n != 0 (the result is meaningless otherwise).
static int FindLSBSet(uint32_t n) {
  DCHECK_NE(n, 0);
#if defined(__GNUC__)
  return __builtin_ctz(n);
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
  unsigned long c;
  _BitScanForward(&c, n);
  return static_cast<int>(c);
#else
  // Portable fallback: binary search for the lowest set bit by shifting
  // left. Shifting by `shift` discards the high bits, so a non-zero result
  // proves the lowest set bit lies within the low (32 - shift) bits and the
  // count can be reduced by `shift`. Invariant: c == 31 - (total shifted).
  int c = 31;
  for (int shift = 1 << 4; shift != 0; shift >>= 1) {
    uint32_t word = n << shift;
    if (word != 0) {
      n = word;
      c -= shift;
    }
  }
  return c;
#endif
}
#endif
1129 | | |
// Searches data for the first position where prefix_front_ occurs with
// prefix_back_ exactly prefix_size_-1 bytes later - a cheap necessary
// condition for a prefix match. Returns a pointer to that candidate
// position (the would-be start of the prefix), or NULL if there is none.
// Requires prefix_size_ >= 2 (set up by ConfigurePrefixAccel()).
const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
  DCHECK_GE(prefix_size_, 2);
  if (size < prefix_size_)
    return NULL;
  // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
  // This also means that probing for prefix_back_ doesn't go out of bounds.
  size -= prefix_size_-1;

#if defined(__AVX2__)
  // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
  // fp scans for the front byte; bp scans prefix_size_-1 bytes ahead for
  // the back byte; a lane where both comparisons hit is a candidate.
  if (size >= sizeof(__m256i)) {
    const __m256i* fp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data));
    const __m256i* bp = reinterpret_cast<const __m256i*>(
        reinterpret_cast<const char*>(data) + prefix_size_-1);
    const __m256i* endfp = fp + size/sizeof(__m256i);
    const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
    const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
    do {
      const __m256i f_loadu = _mm256_loadu_si256(fp++);
      const __m256i b_loadu = _mm256_loadu_si256(bp++);
      const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
      const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
      const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
      if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
        // Some lane matched both bytes; return the earliest such lane.
        const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
        const int fb_movemask = _mm256_movemask_epi8(fb_and);
        const int fb_ctz = FindLSBSet(fb_movemask);
        return reinterpret_cast<const char*>(fp-1) + fb_ctz;
      }
    } while (fp != endfp);
    // Fall through to the scalar loop for the remaining bytes.
    data = fp;
    size = size%sizeof(__m256i);
  }
#endif

  // Scalar tail: memchr(3) for the front byte, then probe the back byte.
  const char* p0 = reinterpret_cast<const char*>(data);
  for (const char* p = p0;; p++) {
    DCHECK_GE(size, static_cast<size_t>(p-p0));
    p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
    if (p == NULL || p[prefix_size_-1] == prefix_back_)
      return p;
  }
}
1174 | | |
1175 | | } // namespace re2 |