Coverage Report

Created: 2024-05-15 07:20

/src/re2/re2/prog.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Compiled regular expression representation.
6
// Tested by compile_test.cc
7
8
#include "re2/prog.h"
9
10
#if defined(__AVX2__)
11
#include <immintrin.h>
12
#ifdef _MSC_VER
13
#include <intrin.h>
14
#endif
15
#endif
16
#include <stdint.h>
17
#include <string.h>
18
#include <algorithm>
19
#include <memory>
20
#include <utility>
21
22
#include "util/util.h"
23
#include "util/logging.h"
24
#include "util/strutil.h"
25
#include "re2/bitmap256.h"
26
#include "re2/stringpiece.h"
27
28
namespace re2 {
29
30
// Constructors per Inst opcode
31
32
224M
void Prog::Inst::InitAlt(uint32_t out, uint32_t out1) {
33
224M
  DCHECK_EQ(out_opcode_, 0);
34
224M
  set_out_opcode(out, kInstAlt);
35
224M
  out1_ = out1;
36
224M
}
37
38
648M
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32_t out) {
39
648M
  DCHECK_EQ(out_opcode_, 0);
40
648M
  set_out_opcode(out, kInstByteRange);
41
648M
  lo_ = lo & 0xFF;
42
648M
  hi_ = hi & 0xFF;
43
648M
  hint_foldcase_ = foldcase&1;
44
648M
}
45
46
7.97M
void Prog::Inst::InitCapture(int cap, uint32_t out) {
47
7.97M
  DCHECK_EQ(out_opcode_, 0);
48
7.97M
  set_out_opcode(out, kInstCapture);
49
7.97M
  cap_ = cap;
50
7.97M
}
51
52
5.43M
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32_t out) {
53
5.43M
  DCHECK_EQ(out_opcode_, 0);
54
5.43M
  set_out_opcode(out, kInstEmptyWidth);
55
5.43M
  empty_ = empty;
56
5.43M
}
57
58
44.5k
void Prog::Inst::InitMatch(int32_t id) {
59
44.5k
  DCHECK_EQ(out_opcode_, 0);
60
44.5k
  set_opcode(kInstMatch);
61
44.5k
  match_id_ = id;
62
44.5k
}
63
64
2.94M
void Prog::Inst::InitNop(uint32_t out) {
65
2.94M
  DCHECK_EQ(out_opcode_, 0);
66
2.94M
  set_opcode(kInstNop);
67
2.94M
}
68
69
45.0k
void Prog::Inst::InitFail() {
70
45.0k
  DCHECK_EQ(out_opcode_, 0);
71
45.0k
  set_opcode(kInstFail);
72
45.0k
}
73
74
0
std::string Prog::Inst::Dump() {
75
0
  switch (opcode()) {
76
0
    default:
77
0
      return StringPrintf("opcode %d", static_cast<int>(opcode()));
78
79
0
    case kInstAlt:
80
0
      return StringPrintf("alt -> %d | %d", out(), out1_);
81
82
0
    case kInstAltMatch:
83
0
      return StringPrintf("altmatch -> %d | %d", out(), out1_);
84
85
0
    case kInstByteRange:
86
0
      return StringPrintf("byte%s [%02x-%02x] %d -> %d",
87
0
                          foldcase() ? "/i" : "",
88
0
                          lo_, hi_, hint(), out());
89
90
0
    case kInstCapture:
91
0
      return StringPrintf("capture %d -> %d", cap_, out());
92
93
0
    case kInstEmptyWidth:
94
0
      return StringPrintf("emptywidth %#x -> %d",
95
0
                          static_cast<int>(empty_), out());
96
97
0
    case kInstMatch:
98
0
      return StringPrintf("match! %d", match_id());
99
100
0
    case kInstNop:
101
0
      return StringPrintf("nop -> %d", out());
102
103
0
    case kInstFail:
104
0
      return StringPrintf("fail");
105
0
  }
106
0
}
107
108
Prog::Prog()
109
  : anchor_start_(false),
110
    anchor_end_(false),
111
    reversed_(false),
112
    did_flatten_(false),
113
    did_onepass_(false),
114
    start_(0),
115
    start_unanchored_(0),
116
    size_(0),
117
    bytemap_range_(0),
118
    prefix_foldcase_(false),
119
    prefix_size_(0),
120
    list_count_(0),
121
    bit_state_text_max_size_(0),
122
    dfa_mem_(0),
123
    dfa_first_(NULL),
124
45.0k
    dfa_longest_(NULL) {
125
45.0k
}
126
127
45.0k
Prog::~Prog() {
128
45.0k
  DeleteDFA(dfa_longest_);
129
45.0k
  DeleteDFA(dfa_first_);
130
45.0k
  if (prefix_foldcase_)
131
13.2k
    delete[] prefix_dfa_;
132
45.0k
}
133
134
typedef SparseSet Workq;
135
136
637M
static inline void AddToQueue(Workq* q, int id) {
137
637M
  if (id != 0)
138
636M
    q->insert(id);
139
637M
}
140
141
0
static std::string ProgToString(Prog* prog, Workq* q) {
142
0
  std::string s;
143
0
  for (Workq::iterator i = q->begin(); i != q->end(); ++i) {
144
0
    int id = *i;
145
0
    Prog::Inst* ip = prog->inst(id);
146
0
    s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
147
0
    AddToQueue(q, ip->out());
148
0
    if (ip->opcode() == kInstAlt || ip->opcode() == kInstAltMatch)
149
0
      AddToQueue(q, ip->out1());
150
0
  }
151
0
  return s;
152
0
}
153
154
0
static std::string FlattenedProgToString(Prog* prog, int start) {
155
0
  std::string s;
156
0
  for (int id = start; id < prog->size(); id++) {
157
0
    Prog::Inst* ip = prog->inst(id);
158
0
    if (ip->last())
159
0
      s += StringPrintf("%d. %s\n", id, ip->Dump().c_str());
160
0
    else
161
0
      s += StringPrintf("%d+ %s\n", id, ip->Dump().c_str());
162
0
  }
163
0
  return s;
164
0
}
165
166
0
std::string Prog::Dump() {
167
0
  if (did_flatten_)
168
0
    return FlattenedProgToString(this, start_);
169
170
0
  Workq q(size_);
171
0
  AddToQueue(&q, start_);
172
0
  return ProgToString(this, &q);
173
0
}
174
175
0
std::string Prog::DumpUnanchored() {
176
0
  if (did_flatten_)
177
0
    return FlattenedProgToString(this, start_unanchored_);
178
179
0
  Workq q(size_);
180
0
  AddToQueue(&q, start_unanchored_);
181
0
  return ProgToString(this, &q);
182
0
}
183
184
0
std::string Prog::DumpByteMap() {
185
0
  std::string map;
186
0
  for (int c = 0; c < 256; c++) {
187
0
    int b = bytemap_[c];
188
0
    int lo = c;
189
0
    while (c < 256-1 && bytemap_[c+1] == b)
190
0
      c++;
191
0
    int hi = c;
192
0
    map += StringPrintf("[%02x-%02x] -> %d\n", lo, hi, b);
193
0
  }
194
0
  return map;
195
0
}
196
197
// Is ip a guaranteed match at end of text, perhaps after some capturing?
198
72.3M
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
199
74.0M
  for (;;) {
200
74.0M
    switch (ip->opcode()) {
201
0
      default:
202
0
        LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
203
0
        return false;
204
205
39.8M
      case kInstAlt:
206
39.8M
      case kInstAltMatch:
207
71.4M
      case kInstByteRange:
208
71.5M
      case kInstFail:
209
72.3M
      case kInstEmptyWidth:
210
72.3M
        return false;
211
212
1.65M
      case kInstCapture:
213
1.65M
      case kInstNop:
214
1.65M
        ip = prog->inst(ip->out());
215
1.65M
        break;
216
217
30.3k
      case kInstMatch:
218
30.3k
        return true;
219
74.0M
    }
220
74.0M
  }
221
72.3M
}
222
223
// Peep-hole optimizer.
224
44.5k
void Prog::Optimize() {
225
44.5k
  Workq q(size_);
226
227
  // Eliminate nops.  Most are taken out during compilation
228
  // but a few are hard to avoid.
229
44.5k
  q.clear();
230
44.5k
  AddToQueue(&q, start_);
231
246M
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
232
246M
    int id = *i;
233
234
246M
    Inst* ip = inst(id);
235
246M
    int j = ip->out();
236
246M
    Inst* jp;
237
247M
    while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
238
551k
      j = jp->out();
239
551k
    }
240
246M
    ip->set_out(j);
241
246M
    AddToQueue(&q, ip->out());
242
243
246M
    if (ip->opcode() == kInstAlt) {
244
71.7M
      j = ip->out1();
245
74.1M
      while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
246
2.37M
        j = jp->out();
247
2.37M
      }
248
71.7M
      ip->out1_ = j;
249
71.7M
      AddToQueue(&q, ip->out1());
250
71.7M
    }
251
246M
  }
252
253
  // Insert kInstAltMatch instructions
254
  // Look for
255
  //   ip: Alt -> j | k
256
  //    j: ByteRange [00-FF] -> ip
257
  //    k: Match
258
  // or the reverse (the above is the greedy one).
259
  // Rewrite Alt to AltMatch.
260
44.5k
  q.clear();
261
44.5k
  AddToQueue(&q, start_);
262
246M
  for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
263
246M
    int id = *i;
264
246M
    Inst* ip = inst(id);
265
246M
    AddToQueue(&q, ip->out());
266
246M
    if (ip->opcode() == kInstAlt)
267
71.7M
      AddToQueue(&q, ip->out1());
268
269
246M
    if (ip->opcode() == kInstAlt) {
270
71.7M
      Inst* j = inst(ip->out());
271
71.7M
      Inst* k = inst(ip->out1());
272
71.7M
      if (j->opcode() == kInstByteRange && j->out() == id &&
273
71.7M
          j->lo() == 0x00 && j->hi() == 0xFF &&
274
71.7M
          IsMatch(this, k)) {
275
2.56k
        ip->set_opcode(kInstAltMatch);
276
2.56k
        continue;
277
2.56k
      }
278
71.7M
      if (IsMatch(this, j) &&
279
71.7M
          k->opcode() == kInstByteRange && k->out() == id &&
280
71.7M
          k->lo() == 0x00 && k->hi() == 0xFF) {
281
399
        ip->set_opcode(kInstAltMatch);
282
399
      }
283
71.7M
    }
284
246M
  }
285
44.5k
}
286
287
8.43M
uint32_t Prog::EmptyFlags(const StringPiece& text, const char* p) {
288
8.43M
  int flags = 0;
289
290
  // ^ and \A
291
8.43M
  if (p == text.data())
292
216k
    flags |= kEmptyBeginText | kEmptyBeginLine;
293
8.22M
  else if (p[-1] == '\n')
294
8.16k
    flags |= kEmptyBeginLine;
295
296
  // $ and \z
297
8.43M
  if (p == text.data() + text.size())
298
171k
    flags |= kEmptyEndText | kEmptyEndLine;
299
8.26M
  else if (p < text.data() + text.size() && p[0] == '\n')
300
11.6k
    flags |= kEmptyEndLine;
301
302
  // \b and \B
303
8.43M
  if (p == text.data() && p == text.data() + text.size()) {
304
    // no word boundary here
305
8.43M
  } else if (p == text.data()) {
306
216k
    if (IsWordChar(p[0]))
307
4.22k
      flags |= kEmptyWordBoundary;
308
8.22M
  } else if (p == text.data() + text.size()) {
309
171k
    if (IsWordChar(p[-1]))
310
45.2k
      flags |= kEmptyWordBoundary;
311
8.05M
  } else {
312
8.05M
    if (IsWordChar(p[-1]) != IsWordChar(p[0]))
313
2.46M
      flags |= kEmptyWordBoundary;
314
8.05M
  }
315
8.43M
  if (!(flags & kEmptyWordBoundary))
316
5.92M
    flags |= kEmptyNonWordBoundary;
317
318
8.43M
  return flags;
319
8.43M
}
320
321
// ByteMapBuilder implements a coloring algorithm.
322
//
323
// The first phase is a series of "mark and merge" batches: we mark one or more
324
// [lo-hi] ranges, then merge them into our internal state. Batching is not for
325
// performance; rather, it means that the ranges are treated indistinguishably.
326
//
327
// Internally, the ranges are represented using a bitmap that stores the splits
328
// and a vector that stores the colors; both of them are indexed by the ranges'
329
// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
330
// hi (if not already split), then recolor each range in between. The color map
331
// (i.e. from the old color to the new color) is maintained for the lifetime of
332
// the batch and so underpins this somewhat obscure approach to set operations.
333
//
334
// The second phase builds the bytemap from our internal state: we recolor each
335
// range, then store the new color (which is now the byte class) in each of the
336
// corresponding array elements. Finally, we output the number of byte classes.
337
class ByteMapBuilder {
338
 public:
339
44.5k
  ByteMapBuilder() {
340
    // Initial state: the [0-255] range has color 256.
341
    // This will avoid problems during the second phase,
342
    // in which we assign byte classes numbered from 0.
343
44.5k
    splits_.Set(255);
344
44.5k
    colors_[255] = 256;
345
44.5k
    nextcolor_ = 257;
346
44.5k
  }
347
348
  void Mark(int lo, int hi);
349
  void Merge();
350
  void Build(uint8_t* bytemap, int* bytemap_range);
351
352
 private:
353
  int Recolor(int oldcolor);
354
355
  Bitmap256 splits_;
356
  int colors_[256];
357
  int nextcolor_;
358
  std::vector<std::pair<int, int>> colormap_;
359
  std::vector<std::pair<int, int>> ranges_;
360
361
  ByteMapBuilder(const ByteMapBuilder&) = delete;
362
  ByteMapBuilder& operator=(const ByteMapBuilder&) = delete;
363
};
364
365
181M
void ByteMapBuilder::Mark(int lo, int hi) {
366
181M
  DCHECK_GE(lo, 0);
367
181M
  DCHECK_GE(hi, 0);
368
181M
  DCHECK_LE(lo, 255);
369
181M
  DCHECK_LE(hi, 255);
370
181M
  DCHECK_LE(lo, hi);
371
372
  // Ignore any [0-255] ranges. They cause us to recolor every range, which
373
  // has no effect on the eventual result and is therefore a waste of time.
374
181M
  if (lo == 0 && hi == 255)
375
2.12M
    return;
376
377
179M
  ranges_.emplace_back(lo, hi);
378
179M
}
379
380
144M
void ByteMapBuilder::Merge() {
381
144M
  for (std::vector<std::pair<int, int>>::const_iterator it = ranges_.begin();
382
323M
       it != ranges_.end();
383
179M
       ++it) {
384
179M
    int lo = it->first-1;
385
179M
    int hi = it->second;
386
387
179M
    if (0 <= lo && !splits_.Test(lo)) {
388
690k
      splits_.Set(lo);
389
690k
      int next = splits_.FindNextSetBit(lo+1);
390
690k
      colors_[lo] = colors_[next];
391
690k
    }
392
179M
    if (!splits_.Test(hi)) {
393
768k
      splits_.Set(hi);
394
768k
      int next = splits_.FindNextSetBit(hi+1);
395
768k
      colors_[hi] = colors_[next];
396
768k
    }
397
398
179M
    int c = lo+1;
399
659M
    while (c < 256) {
400
659M
      int next = splits_.FindNextSetBit(c);
401
659M
      colors_[next] = Recolor(colors_[next]);
402
659M
      if (next == hi)
403
179M
        break;
404
480M
      c = next+1;
405
480M
    }
406
179M
  }
407
144M
  colormap_.clear();
408
144M
  ranges_.clear();
409
144M
}
410
411
44.5k
void ByteMapBuilder::Build(uint8_t* bytemap, int* bytemap_range) {
412
  // Assign byte classes numbered from 0.
413
44.5k
  nextcolor_ = 0;
414
415
44.5k
  int c = 0;
416
1.54M
  while (c < 256) {
417
1.50M
    int next = splits_.FindNextSetBit(c);
418
1.50M
    uint8_t b = static_cast<uint8_t>(Recolor(colors_[next]));
419
12.9M
    while (c <= next) {
420
11.4M
      bytemap[c] = b;
421
11.4M
      c++;
422
11.4M
    }
423
1.50M
  }
424
425
44.5k
  *bytemap_range = nextcolor_;
426
44.5k
}
427
428
661M
int ByteMapBuilder::Recolor(int oldcolor) {
429
  // Yes, this is a linear search. There can be at most 256
430
  // colors and there will typically be far fewer than that.
431
  // Also, we need to consider keys *and* values in order to
432
  // avoid recoloring a given range more than once per batch.
433
661M
  std::vector<std::pair<int, int>>::const_iterator it =
434
661M
      std::find_if(colormap_.begin(), colormap_.end(),
435
9.15G
                   [=](const std::pair<int, int>& kv) -> bool {
436
9.15G
                     return kv.first == oldcolor || kv.second == oldcolor;
437
9.15G
                   });
438
661M
  if (it != colormap_.end())
439
71.2M
    return it->second;
440
590M
  int newcolor = nextcolor_;
441
590M
  nextcolor_++;
442
590M
  colormap_.emplace_back(oldcolor, newcolor);
443
590M
  return newcolor;
444
661M
}
445
446
44.5k
void Prog::ComputeByteMap() {
447
  // Fill in bytemap with byte classes for the program.
448
  // Ranges of bytes that are treated indistinguishably
449
  // will be mapped to a single byte class.
450
44.5k
  ByteMapBuilder builder;
451
452
  // Don't repeat the work for ^ and $.
453
44.5k
  bool marked_line_boundaries = false;
454
  // Don't repeat the work for \b and \B.
455
44.5k
  bool marked_word_boundaries = false;
456
457
215M
  for (int id = 0; id < size(); id++) {
458
215M
    Inst* ip = inst(id);
459
215M
    if (ip->opcode() == kInstByteRange) {
460
163M
      int lo = ip->lo();
461
163M
      int hi = ip->hi();
462
163M
      builder.Mark(lo, hi);
463
163M
      if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
464
18.2M
        int foldlo = lo;
465
18.2M
        int foldhi = hi;
466
18.2M
        if (foldlo < 'a')
467
32.0k
          foldlo = 'a';
468
18.2M
        if (foldhi > 'z')
469
67.4k
          foldhi = 'z';
470
18.2M
        if (foldlo <= foldhi) {
471
18.2M
          foldlo += 'A' - 'a';
472
18.2M
          foldhi += 'A' - 'a';
473
18.2M
          builder.Mark(foldlo, foldhi);
474
18.2M
        }
475
18.2M
      }
476
      // If this Inst is not the last Inst in its list AND the next Inst is
477
      // also a ByteRange AND the Insts have the same out, defer the merge.
478
163M
      if (!ip->last() &&
479
163M
          inst(id+1)->opcode() == kInstByteRange &&
480
163M
          ip->out() == inst(id+1)->out())
481
18.8M
        continue;
482
144M
      builder.Merge();
483
144M
    } else if (ip->opcode() == kInstEmptyWidth) {
484
5.27M
      if (ip->empty() & (kEmptyBeginLine|kEmptyEndLine) &&
485
5.27M
          !marked_line_boundaries) {
486
2.27k
        builder.Mark('\n', '\n');
487
2.27k
        builder.Merge();
488
2.27k
        marked_line_boundaries = true;
489
2.27k
      }
490
5.27M
      if (ip->empty() & (kEmptyWordBoundary|kEmptyNonWordBoundary) &&
491
5.27M
          !marked_word_boundaries) {
492
        // We require two batches here: the first for ranges that are word
493
        // characters, the second for ranges that are not word characters.
494
4.15k
        for (bool isword : {true, false}) {
495
4.15k
          int j;
496
41.5k
          for (int i = 0; i < 256; i = j) {
497
1.06M
            for (j = i + 1; j < 256 &&
498
1.06M
                            Prog::IsWordChar(static_cast<uint8_t>(i)) ==
499
1.05M
                                Prog::IsWordChar(static_cast<uint8_t>(j));
500
1.02M
                 j++)
501
1.02M
              ;
502
37.4k
            if (Prog::IsWordChar(static_cast<uint8_t>(i)) == isword)
503
18.7k
              builder.Mark(i, j - 1);
504
37.4k
          }
505
4.15k
          builder.Merge();
506
4.15k
        }
507
2.07k
        marked_word_boundaries = true;
508
2.07k
      }
509
5.27M
    }
510
215M
  }
511
512
44.5k
  builder.Build(bytemap_, &bytemap_range_);
513
514
44.5k
  if ((0)) {  // For debugging, use trivial bytemap.
515
0
    LOG(ERROR) << "Using trivial bytemap.";
516
0
    for (int i = 0; i < 256; i++)
517
0
      bytemap_[i] = static_cast<uint8_t>(i);
518
0
    bytemap_range_ = 256;
519
0
  }
520
44.5k
}
521
522
// Prog::Flatten() implements a graph rewriting algorithm.
523
//
524
// The overall process is similar to epsilon removal, but retains some epsilon
525
// transitions: those from Capture and EmptyWidth instructions; and those from
526
// nullable subexpressions. (The latter avoids quadratic blowup in transitions
527
// in the worst case.) It might be best thought of as Alt instruction elision.
528
//
529
// In conceptual terms, it divides the Prog into "trees" of instructions, then
530
// traverses the "trees" in order to produce "lists" of instructions. A "tree"
531
// is one or more instructions that grow from one "root" instruction to one or
532
// more "leaf" instructions; if a "tree" has exactly one instruction, then the
533
// "root" is also the "leaf". In most cases, a "root" is the successor of some
534
// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
535
// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
536
// EmptyWidth or Match instruction. However, this is insufficient for handling
537
// nested nullable subexpressions correctly, so in some cases, a "root" is the
538
// dominator of the instructions reachable from some "successor root" (i.e. it
539
// has an unreachable predecessor) and is considered a "dominator root". Since
540
// only Alt instructions can be "dominator roots" (other instructions would be
541
// "leaves"), only Alt instructions are required to be marked as predecessors.
542
//
543
// Dividing the Prog into "trees" comprises two passes: marking the "successor
544
// roots" and the predecessors; and marking the "dominator roots". Sorting the
545
// "successor roots" by their bytecode offsets enables iteration in order from
546
// greatest to least during the second pass; by working backwards in this case
547
// and flooding the graph no further than "leaves" and already marked "roots",
548
// it becomes possible to mark "dominator roots" without doing excessive work.
549
//
550
// Traversing the "trees" is just iterating over the "roots" in order of their
551
// marking and flooding the graph no further than "leaves" and "roots". When a
552
// "leaf" is reached, the instruction is copied with its successor remapped to
553
// its "root" number. When a "root" is reached, a Nop instruction is generated
554
// with its successor remapped similarly. As each "list" is produced, its last
555
// instruction is marked as such. After all of the "lists" have been produced,
556
// a pass over their instructions remaps their successors to bytecode offsets.
557
44.5k
void Prog::Flatten() {
558
44.5k
  if (did_flatten_)
559
0
    return;
560
44.5k
  did_flatten_ = true;
561
562
  // Scratch structures. It's important that these are reused by functions
563
  // that we call in loops because they would thrash the heap otherwise.
564
44.5k
  SparseSet reachable(size());
565
44.5k
  std::vector<int> stk;
566
44.5k
  stk.reserve(size());
567
568
  // First pass: Marks "successor roots" and predecessors.
569
  // Builds the mapping from inst-ids to root-ids.
570
44.5k
  SparseArray<int> rootmap(size());
571
44.5k
  SparseArray<int> predmap(size());
572
44.5k
  std::vector<std::vector<int>> predvec;
573
44.5k
  MarkSuccessors(&rootmap, &predmap, &predvec, &reachable, &stk);
574
575
  // Second pass: Marks "dominator roots".
576
44.5k
  SparseArray<int> sorted(rootmap);
577
44.5k
  std::sort(sorted.begin(), sorted.end(), sorted.less);
578
44.5k
  for (SparseArray<int>::const_iterator i = sorted.end() - 1;
579
125M
       i != sorted.begin();
580
125M
       --i) {
581
125M
    if (i->index() != start_unanchored() && i->index() != start())
582
124M
      MarkDominator(i->index(), &rootmap, &predmap, &predvec, &reachable, &stk);
583
125M
  }
584
585
  // Third pass: Emits "lists". Remaps outs to root-ids.
586
  // Builds the mapping from root-ids to flat-ids.
587
44.5k
  std::vector<int> flatmap(rootmap.size());
588
44.5k
  std::vector<Inst> flat;
589
44.5k
  flat.reserve(size());
590
44.5k
  for (SparseArray<int>::const_iterator i = rootmap.begin();
591
130M
       i != rootmap.end();
592
130M
       ++i) {
593
130M
    flatmap[i->value()] = static_cast<int>(flat.size());
594
130M
    EmitList(i->index(), &rootmap, &flat, &reachable, &stk);
595
130M
    flat.back().set_last();
596
    // We have the bounds of the "list", so this is the
597
    // most convenient point at which to compute hints.
598
130M
    ComputeHints(&flat, flatmap[i->value()], static_cast<int>(flat.size()));
599
130M
  }
600
601
44.5k
  list_count_ = static_cast<int>(flatmap.size());
602
401k
  for (int i = 0; i < kNumInst; i++)
603
356k
    inst_count_[i] = 0;
604
605
  // Fourth pass: Remaps outs to flat-ids.
606
  // Counts instructions by opcode.
607
215M
  for (int id = 0; id < static_cast<int>(flat.size()); id++) {
608
215M
    Inst* ip = &flat[id];
609
215M
    if (ip->opcode() != kInstAltMatch)  // handled in EmitList()
610
215M
      ip->set_out(flatmap[ip->out()]);
611
215M
    inst_count_[ip->opcode()]++;
612
215M
  }
613
614
#if !defined(NDEBUG)
615
  // Address a `-Wunused-but-set-variable' warning from Clang 13.x.
616
  size_t total = 0;
617
  for (int i = 0; i < kNumInst; i++)
618
    total += inst_count_[i];
619
  CHECK_EQ(total, flat.size());
620
#endif
621
622
  // Remap start_unanchored and start.
623
44.5k
  if (start_unanchored() == 0) {
624
1.06k
    DCHECK_EQ(start(), 0);
625
43.5k
  } else if (start_unanchored() == start()) {
626
1.03k
    set_start_unanchored(flatmap[1]);
627
1.03k
    set_start(flatmap[1]);
628
42.4k
  } else {
629
42.4k
    set_start_unanchored(flatmap[1]);
630
42.4k
    set_start(flatmap[2]);
631
42.4k
  }
632
633
  // Finally, replace the old instructions with the new instructions.
634
44.5k
  size_ = static_cast<int>(flat.size());
635
44.5k
  inst_ = PODArray<Inst>(size_);
636
44.5k
  memmove(inst_.data(), flat.data(), size_*sizeof inst_[0]);
637
638
  // Populate the list heads for BitState.
639
  // 512 instructions limits the memory footprint to 1KiB.
640
44.5k
  if (size_ <= 512) {
641
35.0k
    list_heads_ = PODArray<uint16_t>(size_);
642
    // 0xFF makes it more obvious if we try to look up a non-head.
643
35.0k
    memset(list_heads_.data(), 0xFF, size_*sizeof list_heads_[0]);
644
1.51M
    for (int i = 0; i < list_count_; ++i)
645
1.47M
      list_heads_[flatmap[i]] = i;
646
35.0k
  }
647
648
  // BitState allocates a bitmap of size list_count_ * (text.size()+1)
649
  // for tracking pairs of possibilities that it has already explored.
650
44.5k
  const size_t kBitStateBitmapMaxSize = 256*1024;  // max size in bits
651
44.5k
  bit_state_text_max_size_ = kBitStateBitmapMaxSize / list_count_ - 1;
652
44.5k
}
653
654
void Prog::MarkSuccessors(SparseArray<int>* rootmap,
655
                          SparseArray<int>* predmap,
656
                          std::vector<std::vector<int>>* predvec,
657
44.5k
                          SparseSet* reachable, std::vector<int>* stk) {
658
  // Mark the kInstFail instruction.
659
44.5k
  rootmap->set_new(0, rootmap->size());
660
661
  // Mark the start_unanchored and start instructions.
662
44.5k
  if (!rootmap->has_index(start_unanchored()))
663
43.5k
    rootmap->set_new(start_unanchored(), rootmap->size());
664
44.5k
  if (!rootmap->has_index(start()))
665
42.4k
    rootmap->set_new(start(), rootmap->size());
666
667
44.5k
  reachable->clear();
668
44.5k
  stk->clear();
669
44.5k
  stk->push_back(start_unanchored());
670
71.9M
  while (!stk->empty()) {
671
71.8M
    int id = stk->back();
672
71.8M
    stk->pop_back();
673
318M
  Loop:
674
318M
    if (reachable->contains(id))
675
71.8M
      continue;
676
246M
    reachable->insert_new(id);
677
678
246M
    Inst* ip = inst(id);
679
246M
    switch (ip->opcode()) {
680
0
      default:
681
0
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
682
0
        break;
683
684
2.96k
      case kInstAltMatch:
685
71.8M
      case kInstAlt:
686
        // Mark this instruction as a predecessor of each out.
687
143M
        for (int out : {ip->out(), ip->out1()}) {
688
143M
          if (!predmap->has_index(out)) {
689
128M
            predmap->set_new(out, static_cast<int>(predvec->size()));
690
128M
            predvec->emplace_back();
691
128M
          }
692
143M
          (*predvec)[predmap->get_existing(out)].emplace_back(id);
693
143M
        }
694
71.8M
        stk->push_back(ip->out1());
695
71.8M
        id = ip->out();
696
71.8M
        goto Loop;
697
698
161M
      case kInstByteRange:
699
169M
      case kInstCapture:
700
174M
      case kInstEmptyWidth:
701
        // Mark the out of this instruction as a "root".
702
174M
        if (!rootmap->has_index(ip->out()))
703
124M
          rootmap->set_new(ip->out(), rootmap->size());
704
174M
        id = ip->out();
705
174M
        goto Loop;
706
707
66
      case kInstNop:
708
66
        id = ip->out();
709
66
        goto Loop;
710
711
43.5k
      case kInstMatch:
712
45.3k
      case kInstFail:
713
45.3k
        break;
714
246M
    }
715
246M
  }
716
44.5k
}
717
718
void Prog::MarkDominator(int root, SparseArray<int>* rootmap,
719
                         SparseArray<int>* predmap,
720
                         std::vector<std::vector<int>>* predvec,
721
124M
                         SparseSet* reachable, std::vector<int>* stk) {
722
124M
  reachable->clear();
723
124M
  stk->clear();
724
124M
  stk->push_back(root);
725
329M
  while (!stk->empty()) {
726
204M
    int id = stk->back();
727
204M
    stk->pop_back();
728
283M
  Loop:
729
283M
    if (reachable->contains(id))
730
4.41M
      continue;
731
279M
    reachable->insert_new(id);
732
733
279M
    if (id != root && rootmap->has_index(id)) {
734
      // We reached another "tree" via epsilon transition.
735
23.8M
      continue;
736
23.8M
    }
737
738
255M
    Inst* ip = inst(id);
739
255M
    switch (ip->opcode()) {
740
0
      default:
741
0
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
742
0
        break;
743
744
2.96k
      case kInstAltMatch:
745
79.3M
      case kInstAlt:
746
79.3M
        stk->push_back(ip->out1());
747
79.3M
        id = ip->out();
748
79.3M
        goto Loop;
749
750
162M
      case kInstByteRange:
751
170M
      case kInstCapture:
752
176M
      case kInstEmptyWidth:
753
176M
        break;
754
755
0
      case kInstNop:
756
0
        id = ip->out();
757
0
        goto Loop;
758
759
48.8k
      case kInstMatch:
760
48.8k
      case kInstFail:
761
48.8k
        break;
762
255M
    }
763
255M
  }
764
765
124M
  for (SparseSet::const_iterator i = reachable->begin();
766
404M
       i != reachable->end();
767
279M
       ++i) {
768
279M
    int id = *i;
769
279M
    if (predmap->has_index(id)) {
770
1.75G
      for (int pred : (*predvec)[predmap->get_existing(id)]) {
771
1.75G
        if (!reachable->contains(pred)) {
772
          // id has a predecessor that cannot be reached from root!
773
          // Therefore, id must be a "root" too - mark it as such.
774
1.59G
          if (!rootmap->has_index(id))
775
5.30M
            rootmap->set_new(id, rootmap->size());
776
1.59G
        }
777
1.75G
      }
778
161M
    }
779
279M
  }
780
124M
}
781
782
void Prog::EmitList(int root, SparseArray<int>* rootmap,
783
                    std::vector<Inst>* flat,
784
130M
                    SparseSet* reachable, std::vector<int>* stk) {
785
130M
  reachable->clear();
786
130M
  stk->clear();
787
130M
  stk->push_back(root);
788
351M
  while (!stk->empty()) {
789
220M
    int id = stk->back();
790
220M
    stk->pop_back();
791
311M
  Loop:
792
311M
    if (reachable->contains(id))
793
5.56M
      continue;
794
305M
    reachable->insert_new(id);
795
796
305M
    if (id != root && rootmap->has_index(id)) {
797
      // We reached another "tree" via epsilon transition. Emit a kInstNop
798
      // instruction so that the Prog does not become quadratically larger.
799
38.9M
      flat->emplace_back();
800
38.9M
      flat->back().set_opcode(kInstNop);
801
38.9M
      flat->back().set_out(rootmap->get_existing(id));
802
38.9M
      continue;
803
38.9M
    }
804
805
266M
    Inst* ip = inst(id);
806
266M
    switch (ip->opcode()) {
807
0
      default:
808
0
        LOG(DFATAL) << "unhandled opcode: " << ip->opcode();
809
0
        break;
810
811
2.96k
      case kInstAltMatch:
812
2.96k
        flat->emplace_back();
813
2.96k
        flat->back().set_opcode(kInstAltMatch);
814
2.96k
        flat->back().set_out(static_cast<int>(flat->size()));
815
2.96k
        flat->back().out1_ = static_cast<uint32_t>(flat->size())+1;
816
2.96k
        FALLTHROUGH_INTENDED;
817
818
90.4M
      case kInstAlt:
819
90.4M
        stk->push_back(ip->out1());
820
90.4M
        id = ip->out();
821
90.4M
        goto Loop;
822
823
163M
      case kInstByteRange:
824
170M
      case kInstCapture:
825
176M
      case kInstEmptyWidth:
826
176M
        flat->emplace_back();
827
176M
        memmove(&flat->back(), ip, sizeof *ip);
828
176M
        flat->back().set_out(rootmap->get_existing(ip->out()));
829
176M
        break;
830
831
66
      case kInstNop:
832
66
        id = ip->out();
833
66
        goto Loop;
834
835
75.0k
      case kInstMatch:
836
119k
      case kInstFail:
837
119k
        flat->emplace_back();
838
119k
        memmove(&flat->back(), ip, sizeof *ip);
839
119k
        break;
840
266M
    }
841
266M
  }
842
130M
}
843
844
// For each ByteRange instruction in [begin, end), computes a hint to execution
// engines: the delta to the next instruction (in flat) worth exploring iff the
// current instruction matched.
//
// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
// colors are instructions and recoloring ranges precisely identifies conflicts
// between instructions. Iterating backwards over [begin, end) is guaranteed to
// identify the nearest conflict (if any) with only linear complexity.
void Prog::ComputeHints(std::vector<Inst>* flat, int begin, int end) {
  // splits marks the boundaries of the current coloring of [0-255];
  // colors[b] is the id of the instruction that "owns" byte b, recorded at
  // the split position (the high end of each colored run).
  Bitmap256 splits;
  int colors[256];

  // dirty means splits/colors carry state from ByteRange instructions seen
  // since the last non-ByteRange instruction (or since end).
  bool dirty = false;
  for (int id = end; id >= begin; --id) {
    if (id == end ||
        (*flat)[id].opcode() != kInstByteRange) {
      if (dirty) {
        dirty = false;
        splits.Clear();
      }
      splits.Set(255);
      colors[255] = id;
      // At this point, the [0-255] range is colored with id.
      // Thus, hints cannot point beyond id; and if id == end,
      // hints that would have pointed to id will be 0 instead.
      continue;
    }
    dirty = true;

    // We recolor the [lo-hi] range with id. Note that first ratchets backwards
    // from end to the nearest conflict (if any) during recoloring.
    int first = end;
    auto Recolor = [&](int lo, int hi) {
      // Like ByteMapBuilder, we split at lo-1 and at hi.
      --lo;

      if (0 <= lo && !splits.Test(lo)) {
        splits.Set(lo);
        int next = splits.FindNextSetBit(lo+1);
        colors[lo] = colors[next];
      }
      if (!splits.Test(hi)) {
        splits.Set(hi);
        int next = splits.FindNextSetBit(hi+1);
        colors[hi] = colors[next];
      }

      // Walk the split positions inside [lo+1, hi], harvesting the previous
      // owners before claiming each run for id.
      int c = lo+1;
      while (c < 256) {
        int next = splits.FindNextSetBit(c);
        // Ratchet backwards...
        first = std::min(first, colors[next]);
        // Recolor with id - because it's the new nearest conflict!
        colors[next] = id;
        if (next == hi)
          break;
        c = next+1;
      }
    };

    Inst* ip = &(*flat)[id];
    int lo = ip->lo();
    int hi = ip->hi();
    Recolor(lo, hi);
    // A case-folding ByteRange also matches the uppercase mirror of the
    // ASCII-letter part of [lo-hi], so recolor that range too.
    if (ip->foldcase() && lo <= 'z' && hi >= 'a') {
      int foldlo = lo;
      int foldhi = hi;
      if (foldlo < 'a')
        foldlo = 'a';
      if (foldhi > 'z')
        foldhi = 'z';
      if (foldlo <= foldhi) {
        foldlo += 'A' - 'a';
        foldhi += 'A' - 'a';
        Recolor(foldlo, foldhi);
      }
    }

    if (first != end) {
      // The hint shares storage with the foldcase flag: bit 0 is foldcase
      // (see InitByteRange()), bits 1-15 hold the delta to the nearest
      // conflicting instruction, saturated at 32767.
      uint16_t hint = static_cast<uint16_t>(std::min(first - id, 32767));
      ip->hint_foldcase_ |= hint<<1;
    }
  }
}
928
929
// The final state will always be this, which frees up a register for the hot
// loop and thus avoids the spilling that can occur when building with Clang.
static const size_t kShiftDFAFinal = 9;

// This function takes the prefix as std::string (i.e. not const std::string&
// as normal) because it's going to clobber it, so a temporary is convenient.
static uint64_t* BuildShiftDFA(std::string prefix) {
  // Remember the original length up front: the prefix gets clobbered below,
  // but we still need to know how many states the automaton has.
  const size_t len = prefix.size();

  // Build the NFA reachability table, indexed by input byte: for byte b,
  // bit i+1 is set iff state i steps to state i+1 over b. Given a bitfield
  // of current states, the bitfield of states reachable from those is - for
  // this specific purpose - always ((curr << 1) | 1), so intersecting the
  // two bitfields steps the NFA over the input byte.
  // Credits for this technique: the Hyperscan paper by Geoff Langdale et al.
  uint16_t reach[256]{};
  for (size_t i = 0; i < len; ++i)
    reach[static_cast<uint8_t>(prefix[i])] |= 1 << (i+1);
  // Bit 0 is the `\C*?` self-loop for unanchored search: always reachable.
  for (int b = 0; b < 256; ++b)
    reach[b] |= 1;

  // Map each DFA state to its NFA state bitfield; the reverse mapping is
  // done with plain old linear search when recording transitions. The
  // "Shift DFA" technique limits this to ten states when using uint64_t;
  // to allow for the initial state, we use at most nine bytes of the prefix.
  // That same limit is also why uint16_t suffices for the NFA bitfield.
  uint16_t dfa_to_nfa[kShiftDFAFinal+1]{};
  dfa_to_nfa[0] = 1;
  for (size_t curr = 0; curr < len; ++curr) {
    uint8_t b = prefix[curr];
    uint16_t stepped = reach[b] & ((dfa_to_nfa[curr] << 1) | 1);
    size_t next = (curr+1 == len) ? kShiftDFAFinal : curr+1;
    dfa_to_nfa[next] = stepped;
  }

  // Dedupe the prefix bytes so each distinct byte is handled exactly once
  // below. This clobbers the prefix, but it's no longer needed.
  std::sort(prefix.begin(), prefix.end());
  prefix.erase(std::unique(prefix.begin(), prefix.end()), prefix.end());

  // Build the DFA transition table, indexed by input byte: each element is
  // effectively a packed array of uint6_t, with every value premultiplied by
  // six to avoid multiplying and masking/shifting in the hot loop.
  // Credits for this technique: "Shift-based DFAs" on GitHub by Per Vognsen.
  uint64_t* dfa = new uint64_t[256]{};
  // Record a transition from each state for each byte of the prefix; all
  // other input bytes fall back to the initial state by default (zero).
  for (size_t curr = 0; curr < len; ++curr) {
    for (uint8_t b : prefix) {
      uint16_t stepped = reach[b] & ((dfa_to_nfa[curr] << 1) | 1);
      // Reverse-map the NFA bitfield to its DFA state by linear search.
      size_t next = 0;
      while (dfa_to_nfa[next] != stepped)
        ++next;
      uint64_t packed = static_cast<uint64_t>(next * 6) << (curr * 6);
      dfa[b] |= packed;
      // Record the same transition for the uppercase form of ASCII letters.
      // Note that ASCII letters are guaranteed to be lowercase at this point
      // because that's how the parser normalises them. #FunFact: 'k' and 's'
      // match U+212A and U+017F, respectively, so they won't occur here when
      // using UTF-8 encoding because the parser will emit character classes.
      if ('a' <= b && b <= 'z')
        dfa[static_cast<uint8_t>(b - ('a' - 'A'))] |= packed;
    }
  }
  // Let the final state "saturate", which will matter for performance: the
  // hot loop checks for a match only at the end of each iteration, so we
  // must keep signalling the match until we get around to checking it.
  for (int b = 0; b < 256; ++b)
    dfa[b] |= static_cast<uint64_t>(kShiftDFAFinal * 6) << (kShiftDFAFinal * 6);

  return dfa;
}
1013
1014
void Prog::ConfigurePrefixAccel(const std::string& prefix,
1015
19.1k
                                bool prefix_foldcase) {
1016
19.1k
  prefix_foldcase_ = prefix_foldcase;
1017
19.1k
  prefix_size_ = prefix.size();
1018
19.1k
  if (prefix_foldcase_) {
1019
    // Use PrefixAccel_ShiftDFA().
1020
    // ... and no more than nine bytes of the prefix. (See above for details.)
1021
13.2k
    prefix_size_ = std::min(prefix_size_, kShiftDFAFinal);
1022
13.2k
    prefix_dfa_ = BuildShiftDFA(prefix.substr(0, prefix_size_));
1023
13.2k
  } else if (prefix_size_ != 1) {
1024
    // Use PrefixAccel_FrontAndBack().
1025
4.80k
    prefix_front_ = prefix.front();
1026
4.80k
    prefix_back_ = prefix.back();
1027
4.80k
  } else {
1028
    // Use memchr(3).
1029
1.15k
    prefix_front_ = prefix.front();
1030
1.15k
  }
1031
19.1k
}
1032
1033
0
const void* Prog::PrefixAccel_ShiftDFA(const void* data, size_t size) {
1034
0
  if (size < prefix_size_)
1035
0
    return NULL;
1036
1037
0
  uint64_t curr = 0;
1038
1039
  // At the time of writing, rough benchmarks on a Broadwell machine showed
1040
  // that this unroll factor (i.e. eight) achieves a speedup factor of two.
1041
0
  if (size >= 8) {
1042
0
    const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
1043
0
    const uint8_t* endp = p + (size&~7);
1044
0
    do {
1045
0
      uint8_t b0 = p[0];
1046
0
      uint8_t b1 = p[1];
1047
0
      uint8_t b2 = p[2];
1048
0
      uint8_t b3 = p[3];
1049
0
      uint8_t b4 = p[4];
1050
0
      uint8_t b5 = p[5];
1051
0
      uint8_t b6 = p[6];
1052
0
      uint8_t b7 = p[7];
1053
1054
0
      uint64_t next0 = prefix_dfa_[b0];
1055
0
      uint64_t next1 = prefix_dfa_[b1];
1056
0
      uint64_t next2 = prefix_dfa_[b2];
1057
0
      uint64_t next3 = prefix_dfa_[b3];
1058
0
      uint64_t next4 = prefix_dfa_[b4];
1059
0
      uint64_t next5 = prefix_dfa_[b5];
1060
0
      uint64_t next6 = prefix_dfa_[b6];
1061
0
      uint64_t next7 = prefix_dfa_[b7];
1062
1063
0
      uint64_t curr0 = next0 >> (curr  & 63);
1064
0
      uint64_t curr1 = next1 >> (curr0 & 63);
1065
0
      uint64_t curr2 = next2 >> (curr1 & 63);
1066
0
      uint64_t curr3 = next3 >> (curr2 & 63);
1067
0
      uint64_t curr4 = next4 >> (curr3 & 63);
1068
0
      uint64_t curr5 = next5 >> (curr4 & 63);
1069
0
      uint64_t curr6 = next6 >> (curr5 & 63);
1070
0
      uint64_t curr7 = next7 >> (curr6 & 63);
1071
1072
0
      if ((curr7 & 63) == kShiftDFAFinal * 6) {
1073
        // At the time of writing, using the same masking subexpressions from
1074
        // the preceding lines caused Clang to clutter the hot loop computing
1075
        // them - even though they aren't actually needed for shifting! Hence
1076
        // these rewritten conditions, which achieve a speedup factor of two.
1077
0
        if (((curr7-curr0) & 63) == 0) return p+1-prefix_size_;
1078
0
        if (((curr7-curr1) & 63) == 0) return p+2-prefix_size_;
1079
0
        if (((curr7-curr2) & 63) == 0) return p+3-prefix_size_;
1080
0
        if (((curr7-curr3) & 63) == 0) return p+4-prefix_size_;
1081
0
        if (((curr7-curr4) & 63) == 0) return p+5-prefix_size_;
1082
0
        if (((curr7-curr5) & 63) == 0) return p+6-prefix_size_;
1083
0
        if (((curr7-curr6) & 63) == 0) return p+7-prefix_size_;
1084
0
        if (((curr7-curr7) & 63) == 0) return p+8-prefix_size_;
1085
0
      }
1086
1087
0
      curr = curr7;
1088
0
      p += 8;
1089
0
    } while (p != endp);
1090
0
    data = p;
1091
0
    size = size&7;
1092
0
  }
1093
1094
0
  const uint8_t* p = reinterpret_cast<const uint8_t*>(data);
1095
0
  const uint8_t* endp = p + size;
1096
0
  while (p != endp) {
1097
0
    uint8_t b = *p++;
1098
0
    uint64_t next = prefix_dfa_[b];
1099
0
    curr = next >> (curr & 63);
1100
0
    if ((curr & 63) == kShiftDFAFinal * 6)
1101
0
      return p-prefix_size_;
1102
0
  }
1103
0
  return NULL;
1104
0
}
1105
1106
#if defined(__AVX2__)
1107
// Finds the least significant non-zero bit in n.
1108
static int FindLSBSet(uint32_t n) {
1109
  DCHECK_NE(n, 0);
1110
#if defined(__GNUC__)
1111
  return __builtin_ctz(n);
1112
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
1113
  unsigned long c;
1114
  _BitScanForward(&c, n);
1115
  return static_cast<int>(c);
1116
#else
1117
  int c = 31;
1118
  for (int shift = 1 << 4; shift != 0; shift >>= 1) {
1119
    uint32_t word = n << shift;
1120
    if (word != 0) {
1121
      n = word;
1122
      c -= shift;
1123
    }
1124
  }
1125
  return c;
1126
#endif
1127
}
1128
#endif
1129
1130
0
const void* Prog::PrefixAccel_FrontAndBack(const void* data, size_t size) {
1131
0
  DCHECK_GE(prefix_size_, 2);
1132
0
  if (size < prefix_size_)
1133
0
    return NULL;
1134
  // Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
1135
  // This also means that probing for prefix_back_ doesn't go out of bounds.
1136
0
  size -= prefix_size_-1;
1137
1138
#if defined(__AVX2__)
1139
  // Use AVX2 to look for prefix_front_ and prefix_back_ 32 bytes at a time.
1140
  if (size >= sizeof(__m256i)) {
1141
    const __m256i* fp = reinterpret_cast<const __m256i*>(
1142
        reinterpret_cast<const char*>(data));
1143
    const __m256i* bp = reinterpret_cast<const __m256i*>(
1144
        reinterpret_cast<const char*>(data) + prefix_size_-1);
1145
    const __m256i* endfp = fp + size/sizeof(__m256i);
1146
    const __m256i f_set1 = _mm256_set1_epi8(prefix_front_);
1147
    const __m256i b_set1 = _mm256_set1_epi8(prefix_back_);
1148
    do {
1149
      const __m256i f_loadu = _mm256_loadu_si256(fp++);
1150
      const __m256i b_loadu = _mm256_loadu_si256(bp++);
1151
      const __m256i f_cmpeq = _mm256_cmpeq_epi8(f_set1, f_loadu);
1152
      const __m256i b_cmpeq = _mm256_cmpeq_epi8(b_set1, b_loadu);
1153
      const int fb_testz = _mm256_testz_si256(f_cmpeq, b_cmpeq);
1154
      if (fb_testz == 0) {  // ZF: 1 means zero, 0 means non-zero.
1155
        const __m256i fb_and = _mm256_and_si256(f_cmpeq, b_cmpeq);
1156
        const int fb_movemask = _mm256_movemask_epi8(fb_and);
1157
        const int fb_ctz = FindLSBSet(fb_movemask);
1158
        return reinterpret_cast<const char*>(fp-1) + fb_ctz;
1159
      }
1160
    } while (fp != endfp);
1161
    data = fp;
1162
    size = size%sizeof(__m256i);
1163
  }
1164
#endif
1165
1166
0
  const char* p0 = reinterpret_cast<const char*>(data);
1167
0
  for (const char* p = p0;; p++) {
1168
0
    DCHECK_GE(size, static_cast<size_t>(p-p0));
1169
0
    p = reinterpret_cast<const char*>(memchr(p, prefix_front_, size - (p-p0)));
1170
0
    if (p == NULL || p[prefix_size_-1] == prefix_back_)
1171
0
      return p;
1172
0
  }
1173
0
}
1174
1175
}  // namespace re2