Coverage Report

Created: 2024-08-17 06:49

/src/re2/re2/compile.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2007 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Compile regular expression to Prog.
6
//
7
// Prog and Inst are defined in prog.h.
8
// This file's external interface is just Regexp::CompileToProg.
9
// The Compiler class defined in this file is private.
10
11
#include <stdint.h>
12
#include <string.h>
13
#include <unordered_map>
14
#include <utility>
15
16
#include "util/logging.h"
17
#include "util/utf.h"
18
#include "re2/pod_array.h"
19
#include "re2/prog.h"
20
#include "re2/re2.h"
21
#include "re2/regexp.h"
22
#include "re2/walker-inl.h"
23
24
namespace re2 {
25
26
// List of pointers to Inst* that need to be filled in (patched).
27
// Because the Inst* haven't been filled in yet,
28
// we can use the Inst* word to hold the list's "next" pointer.
29
// It's kind of sleazy, but it works well in practice.
30
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
31
//
32
// Because the out and out1 fields in Inst are no longer pointers,
33
// we can't use pointers directly here either.  Instead, head refers
34
// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
35
// head == 0 represents the NULL list.  This is okay because instruction #0
36
// is always the fail instruction, which never appears on a list.
37
struct PatchList {
38
  // Returns patch list containing just p.
39
5.54M
  static PatchList Mk(uint32_t p) {
40
5.54M
    return {p, p};
41
5.54M
  }
42
43
  // Patches all the entries on l to have value p.
44
  // Caller must not ever use patch list again.
45
4.22M
  static void Patch(Prog::Inst* inst0, PatchList l, uint32_t p) {
46
9.39M
    while (l.head != 0) {
47
5.16M
      Prog::Inst* ip = &inst0[l.head>>1];
48
5.16M
      if (l.head&1) {
49
266k
        l.head = ip->out1();
50
266k
        ip->out1_ = p;
51
4.90M
      } else {
52
4.90M
        l.head = ip->out();
53
4.90M
        ip->set_out(p);
54
4.90M
      }
55
5.16M
    }
56
4.22M
  }
57
58
  // Appends two patch lists and returns result.
59
1.27M
  static PatchList Append(Prog::Inst* inst0, PatchList l1, PatchList l2) {
60
1.27M
    if (l1.head == 0)
61
331k
      return l2;
62
946k
    if (l2.head == 0)
63
0
      return l1;
64
946k
    Prog::Inst* ip = &inst0[l1.tail>>1];
65
946k
    if (l1.tail&1)
66
236k
      ip->out1_ = l2.head;
67
709k
    else
68
709k
      ip->set_out(l2.head);
69
946k
    return {l1.head, l2.tail};
70
946k
  }
71
72
  uint32_t head;
73
  uint32_t tail;  // for constant-time append
74
};
75
76
static const PatchList kNullPatchList = {0, 0};
77
78
// Compiled program fragment.
79
struct Frag {
80
  uint32_t begin;
81
  PatchList end;
82
  bool nullable;
83
84
13.8M
  Frag() : begin(0), end(kNullPatchList), nullable(false) {}
85
  Frag(uint32_t begin, PatchList end, bool nullable)
86
8.92M
      : begin(begin), end(end), nullable(nullable) {}
87
};
88
89
// Input encodings.
90
enum Encoding {
91
  kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
92
  kEncodingLatin1,    // Latin-1 (0-FF)
93
};
94
95
class Compiler : public Regexp::Walker<Frag> {
96
 public:
97
  explicit Compiler();
98
  ~Compiler();
99
100
  // Compiles Regexp to a new Prog.
101
  // Caller is responsible for deleting Prog when finished with it.
102
  // If reversed is true, compiles for walking over the input
103
  // string backward (reverses all concatenations).
104
  static Prog *Compile(Regexp* re, bool reversed, int64_t max_mem);
105
106
  // Compiles alternation of all the re to a new Prog.
107
  // Each re has a match with an id equal to its index in the vector.
108
  static Prog* CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem);
109
110
  // Interface for Regexp::Walker, which helps traverse the Regexp.
111
  // The walk is purely post-recursive: given the machines for the
112
  // children, PostVisit combines them to create the machine for
113
  // the current node.  The child_args are Frags.
114
  // The Compiler traverses the Regexp parse tree, visiting
115
  // each node in depth-first order.  It invokes PreVisit before
116
  // visiting the node's children and PostVisit after visiting
117
  // the children.
118
  Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop);
119
  Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args,
120
                 int nchild_args);
121
  Frag ShortVisit(Regexp* re, Frag parent_arg);
122
  Frag Copy(Frag arg);
123
124
  // Given fragment a, returns a+ or a+?; a* or a*?; a? or a??
125
  Frag Plus(Frag a, bool nongreedy);
126
  Frag Star(Frag a, bool nongreedy);
127
  Frag Quest(Frag a, bool nongreedy);
128
129
  // Given fragment a, returns (a) capturing as \n.
130
  Frag Capture(Frag a, int n);
131
132
  // Given fragments a and b, returns ab; a|b
133
  Frag Cat(Frag a, Frag b);
134
  Frag Alt(Frag a, Frag b);
135
136
  // Returns a fragment that can't match anything.
137
  Frag NoMatch();
138
139
  // Returns a fragment that matches the empty string.
140
  Frag Match(int32_t id);
141
142
  // Returns a no-op fragment.
143
  Frag Nop();
144
145
  // Returns a fragment matching the byte range lo-hi.
146
  Frag ByteRange(int lo, int hi, bool foldcase);
147
148
  // Returns a fragment matching an empty-width special op.
149
  Frag EmptyWidth(EmptyOp op);
150
151
  // Adds n instructions to the program.
152
  // Returns the index of the first one.
153
  // Returns -1 if no more instructions are available.
154
  int AllocInst(int n);
155
156
  // Rune range compiler.
157
158
  // Begins a new alternation.
159
  void BeginRange();
160
161
  // Adds a fragment matching the rune range lo-hi.
162
  void AddRuneRange(Rune lo, Rune hi, bool foldcase);
163
  void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
164
  void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
165
  void Add_80_10ffff();
166
167
  // New suffix that matches the byte range lo-hi, then goes to next.
168
  int UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
169
  int CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase, int next);
170
171
  // Returns true iff the suffix is cached.
172
  bool IsCachedRuneByteSuffix(int id);
173
174
  // Adds a suffix to alternation.
175
  void AddSuffix(int id);
176
177
  // Adds a suffix to the trie starting from the given root node.
178
  // Returns zero iff allocating an instruction fails. Otherwise, returns
179
  // the current root node, which might be different from what was given.
180
  int AddSuffixRecursive(int root, int id);
181
182
  // Finds the trie node for the given suffix. Returns a Frag in order to
183
  // distinguish between pointing at the root node directly (end.head == 0)
184
  // and pointing at an Alt's out1 or out (end.head&1 == 1 or 0, respectively).
185
  Frag FindByteRange(int root, int id);
186
187
  // Compares two ByteRanges and returns true iff they are equal.
188
  bool ByteRangeEqual(int id1, int id2);
189
190
  // Returns the alternation of all the added suffixes.
191
  Frag EndRange();
192
193
  // Single rune.
194
  Frag Literal(Rune r, bool foldcase);
195
196
  void Setup(Regexp::ParseFlags flags, int64_t max_mem, RE2::Anchor anchor);
197
  Prog* Finish(Regexp* re);
198
199
  // Returns .* where dot = any byte
200
  Frag DotStar();
201
202
 private:
203
  Prog* prog_;         // Program being built.
204
  bool failed_;        // Did we give up compiling?
205
  Encoding encoding_;  // Input encoding
206
  bool reversed_;      // Should program run backward over text?
207
208
  PODArray<Prog::Inst> inst_;
209
  int ninst_;          // Number of instructions used.
210
  int max_ninst_;      // Maximum number of instructions.
211
212
  int64_t max_mem_;    // Total memory budget.
213
214
  std::unordered_map<uint64_t, int> rune_cache_;
215
  Frag rune_range_;
216
217
  RE2::Anchor anchor_;  // anchor mode for RE2::Set
218
219
  Compiler(const Compiler&) = delete;
220
  Compiler& operator=(const Compiler&) = delete;
221
};
222
223
5.21k
Compiler::Compiler() {
224
5.21k
  prog_ = new Prog();
225
5.21k
  failed_ = false;
226
5.21k
  encoding_ = kEncodingUTF8;
227
5.21k
  reversed_ = false;
228
5.21k
  ninst_ = 0;
229
5.21k
  max_ninst_ = 1;  // make AllocInst for fail instruction okay
230
5.21k
  max_mem_ = 0;
231
5.21k
  int fail = AllocInst(1);
232
5.21k
  inst_[fail].InitFail();
233
5.21k
  max_ninst_ = 0;  // Caller must change
234
5.21k
}
235
236
5.21k
Compiler::~Compiler() {
237
5.21k
  delete prog_;
238
5.21k
}
239
240
6.05M
int Compiler::AllocInst(int n) {
241
6.05M
  if (failed_ || ninst_ + n > max_ninst_) {
242
0
    failed_ = true;
243
0
    return -1;
244
0
  }
245
246
6.05M
  if (ninst_ + n > inst_.size()) {
247
22.0k
    int cap = inst_.size();
248
22.0k
    if (cap == 0)
249
5.21k
      cap = 8;
250
38.8k
    while (ninst_ + n > cap)
251
16.7k
      cap *= 2;
252
22.0k
    PODArray<Prog::Inst> inst(cap);
253
22.0k
    if (inst_.data() != NULL)
254
16.7k
      memmove(inst.data(), inst_.data(), ninst_*sizeof inst_[0]);
255
22.0k
    memset(inst.data() + ninst_, 0, (cap - ninst_)*sizeof inst_[0]);
256
22.0k
    inst_ = std::move(inst);
257
22.0k
  }
258
6.05M
  int id = ninst_;
259
6.05M
  ninst_ += n;
260
6.05M
  return id;
261
6.05M
}
262
263
// These routines are somewhat hard to visualize in text --
264
// see http://swtch.com/~rsc/regexp/regexp1.html for
265
// pictures explaining what is going on here.
266
267
// Returns an unmatchable fragment.
268
493k
Frag Compiler::NoMatch() {
269
493k
  return Frag();
270
493k
}
271
272
// Is a an unmatchable fragment?
273
8.04M
static bool IsNoMatch(Frag a) {
274
8.04M
  return a.begin == 0;
275
8.04M
}
276
277
// Given fragments a and b, returns fragment for ab.
278
3.30M
Frag Compiler::Cat(Frag a, Frag b) {
279
3.30M
  if (IsNoMatch(a) || IsNoMatch(b))
280
35.4k
    return NoMatch();
281
282
  // Elide no-op.
283
3.26M
  Prog::Inst* begin = &inst_[a.begin];
284
3.26M
  if (begin->opcode() == kInstNop &&
285
3.26M
      a.end.head == (a.begin << 1) &&
286
3.26M
      begin->out() == 0) {
287
    // in case refs to a somewhere
288
5.25k
    PatchList::Patch(inst_.data(), a.end, b.begin);
289
5.25k
    return b;
290
5.25k
  }
291
292
  // To run backward over string, reverse all concatenations.
293
3.26M
  if (reversed_) {
294
0
    PatchList::Patch(inst_.data(), b.end, a.begin);
295
0
    return Frag(b.begin, a.end, b.nullable && a.nullable);
296
0
  }
297
298
3.26M
  PatchList::Patch(inst_.data(), a.end, b.begin);
299
3.26M
  return Frag(a.begin, b.end, a.nullable && b.nullable);
300
3.26M
}
301
302
// Given fragments for a and b, returns fragment for a|b.
303
85.5k
Frag Compiler::Alt(Frag a, Frag b) {
304
  // Special case for convenience in loops.
305
85.5k
  if (IsNoMatch(a))
306
1.00k
    return b;
307
84.5k
  if (IsNoMatch(b))
308
1.65k
    return a;
309
310
82.9k
  int id = AllocInst(1);
311
82.9k
  if (id < 0)
312
0
    return NoMatch();
313
314
82.9k
  inst_[id].InitAlt(a.begin, b.begin);
315
82.9k
  return Frag(id, PatchList::Append(inst_.data(), a.end, b.end),
316
82.9k
              a.nullable || b.nullable);
317
82.9k
}
318
319
// When capturing submatches in like-Perl mode, a kOpAlt Inst
320
// treats out_ as the first choice, out1_ as the second.
321
//
322
// For *, +, and ?, if out_ causes another repetition,
323
// then the operator is greedy.  If out1_ is the repetition
324
// (and out_ moves forward), then the operator is non-greedy.
325
326
// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
327
25.8k
Frag Compiler::Plus(Frag a, bool nongreedy) {
328
25.8k
  int id = AllocInst(1);
329
25.8k
  if (id < 0)
330
0
    return NoMatch();
331
25.8k
  PatchList pl;
332
25.8k
  if (nongreedy) {
333
5.06k
    inst_[id].InitAlt(0, a.begin);
334
5.06k
    pl = PatchList::Mk(id << 1);
335
20.7k
  } else {
336
20.7k
    inst_[id].InitAlt(a.begin, 0);
337
20.7k
    pl = PatchList::Mk((id << 1) | 1);
338
20.7k
  }
339
25.8k
  PatchList::Patch(inst_.data(), a.end, id);
340
25.8k
  return Frag(a.begin, pl, a.nullable);
341
25.8k
}
342
343
// Given a fragment for a, returns a fragment for a* or a*? (if nongreedy)
344
27.2k
Frag Compiler::Star(Frag a, bool nongreedy) {
345
  // When the subexpression is nullable, one Alt isn't enough to guarantee
346
  // correct priority ordering within the transitive closure. The simplest
347
  // solution is to handle it as (a+)? instead, which adds the second Alt.
348
27.2k
  if (a.nullable)
349
2.27k
    return Quest(Plus(a, nongreedy), nongreedy);
350
351
24.9k
  int id = AllocInst(1);
352
24.9k
  if (id < 0)
353
0
    return NoMatch();
354
24.9k
  PatchList pl;
355
24.9k
  if (nongreedy) {
356
8.21k
    inst_[id].InitAlt(0, a.begin);
357
8.21k
    pl = PatchList::Mk(id << 1);
358
16.7k
  } else {
359
16.7k
    inst_[id].InitAlt(a.begin, 0);
360
16.7k
    pl = PatchList::Mk((id << 1) | 1);
361
16.7k
  }
362
24.9k
  PatchList::Patch(inst_.data(), a.end, id);
363
24.9k
  return Frag(id, pl, true);
364
24.9k
}
365
366
// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
367
315k
Frag Compiler::Quest(Frag a, bool nongreedy) {
368
315k
  if (IsNoMatch(a))
369
9.58k
    return Nop();
370
305k
  int id = AllocInst(1);
371
305k
  if (id < 0)
372
0
    return NoMatch();
373
305k
  PatchList pl;
374
305k
  if (nongreedy) {
375
70.8k
    inst_[id].InitAlt(0, a.begin);
376
70.8k
    pl = PatchList::Mk(id << 1);
377
234k
  } else {
378
234k
    inst_[id].InitAlt(a.begin, 0);
379
234k
    pl = PatchList::Mk((id << 1) | 1);
380
234k
  }
381
305k
  return Frag(id, PatchList::Append(inst_.data(), pl, a.end), true);
382
305k
}
383
384
// Returns a fragment for the byte range lo-hi.
385
4.53M
Frag Compiler::ByteRange(int lo, int hi, bool foldcase) {
386
4.53M
  int id = AllocInst(1);
387
4.53M
  if (id < 0)
388
0
    return NoMatch();
389
4.53M
  inst_[id].InitByteRange(lo, hi, foldcase, 0);
390
4.53M
  return Frag(id, PatchList::Mk(id << 1), false);
391
4.53M
}
392
393
// Returns a no-op fragment.  Sometimes unavoidable.
394
42.5k
Frag Compiler::Nop() {
395
42.5k
  int id = AllocInst(1);
396
42.5k
  if (id < 0)
397
0
    return NoMatch();
398
42.5k
  inst_[id].InitNop(0);
399
42.5k
  return Frag(id, PatchList::Mk(id << 1), true);
400
42.5k
}
401
402
// Returns a fragment that signals a match.
403
5.21k
Frag Compiler::Match(int32_t match_id) {
404
5.21k
  int id = AllocInst(1);
405
5.21k
  if (id < 0)
406
0
    return NoMatch();
407
5.21k
  inst_[id].InitMatch(match_id);
408
5.21k
  return Frag(id, kNullPatchList, false);
409
5.21k
}
410
411
// Returns a fragment matching a particular empty-width op (like ^ or $)
412
86.5k
Frag Compiler::EmptyWidth(EmptyOp empty) {
413
86.5k
  int id = AllocInst(1);
414
86.5k
  if (id < 0)
415
0
    return NoMatch();
416
86.5k
  inst_[id].InitEmptyWidth(empty, 0);
417
86.5k
  return Frag(id, PatchList::Mk(id << 1), true);
418
86.5k
}
419
420
// Given a fragment a, returns a fragment with capturing parens around a.
421
179k
Frag Compiler::Capture(Frag a, int n) {
422
179k
  if (IsNoMatch(a))
423
2.62k
    return NoMatch();
424
176k
  int id = AllocInst(2);
425
176k
  if (id < 0)
426
0
    return NoMatch();
427
176k
  inst_[id].InitCapture(2*n, a.begin);
428
176k
  inst_[id+1].InitCapture(2*n+1, 0);
429
176k
  PatchList::Patch(inst_.data(), a.end, id+1);
430
431
176k
  return Frag(id, PatchList::Mk((id+1) << 1), a.nullable);
432
176k
}
433
434
// A Rune is a name for a Unicode code point.
435
// Returns maximum rune encoded by UTF-8 sequence of length len.
436
1.81M
static int MaxRune(int len) {
437
1.81M
  int b;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
438
1.81M
  if (len == 1)
439
620k
    b = 7;
440
1.19M
  else
441
1.19M
    b = 8-(len+1) + 6*(len-1);
442
1.81M
  return (1<<b) - 1;   // maximum Rune for b bits.
443
1.81M
}
444
445
// The rune range compiler caches common suffix fragments,
446
// which are very common in UTF-8 (e.g., [80-bf]).
447
// The fragment suffixes are identified by their start
448
// instructions.  NULL denotes the eventual end match.
449
// The Frag accumulates in rune_range_.  Caching common
450
// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
451
// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
452
453
331k
void Compiler::BeginRange() {
454
331k
  rune_cache_.clear();
455
331k
  rune_range_.begin = 0;
456
331k
  rune_range_.end = kNullPatchList;
457
331k
}
458
459
int Compiler::UncachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
460
1.62M
                                     int next) {
461
1.62M
  Frag f = ByteRange(lo, hi, foldcase);
462
1.62M
  if (next != 0) {
463
730k
    PatchList::Patch(inst_.data(), f.end, next);
464
889k
  } else {
465
889k
    rune_range_.end = PatchList::Append(inst_.data(), rune_range_.end, f.end);
466
889k
  }
467
1.62M
  return f.begin;
468
1.62M
}
469
470
static uint64_t MakeRuneCacheKey(uint8_t lo, uint8_t hi, bool foldcase,
471
1.18M
                                 int next) {
472
1.18M
  return (uint64_t)next << 17 |
473
1.18M
         (uint64_t)lo   <<  9 |
474
1.18M
         (uint64_t)hi   <<  1 |
475
1.18M
         (uint64_t)foldcase;
476
1.18M
}
477
478
int Compiler::CachedRuneByteSuffix(uint8_t lo, uint8_t hi, bool foldcase,
479
439k
                                   int next) {
480
439k
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
481
439k
  std::unordered_map<uint64_t, int>::const_iterator it = rune_cache_.find(key);
482
439k
  if (it != rune_cache_.end())
483
213k
    return it->second;
484
225k
  int id = UncachedRuneByteSuffix(lo, hi, foldcase, next);
485
225k
  rune_cache_[key] = id;
486
225k
  return id;
487
439k
}
488
489
749k
bool Compiler::IsCachedRuneByteSuffix(int id) {
490
749k
  uint8_t lo = inst_[id].lo_;
491
749k
  uint8_t hi = inst_[id].hi_;
492
749k
  bool foldcase = inst_[id].foldcase() != 0;
493
749k
  int next = inst_[id].out();
494
495
749k
  uint64_t key = MakeRuneCacheKey(lo, hi, foldcase, next);
496
749k
  return rune_cache_.find(key) != rune_cache_.end();
497
749k
}
498
499
1.09M
void Compiler::AddSuffix(int id) {
500
1.09M
  if (failed_)
501
0
    return;
502
503
1.09M
  if (rune_range_.begin == 0) {
504
331k
    rune_range_.begin = id;
505
331k
    return;
506
331k
  }
507
508
763k
  if (encoding_ == kEncodingUTF8) {
509
    // Build a trie in order to reduce fanout.
510
431k
    rune_range_.begin = AddSuffixRecursive(rune_range_.begin, id);
511
431k
    return;
512
431k
  }
513
514
332k
  int alt = AllocInst(1);
515
332k
  if (alt < 0) {
516
0
    rune_range_.begin = 0;
517
0
    return;
518
0
  }
519
332k
  inst_[alt].InitAlt(rune_range_.begin, id);
520
332k
  rune_range_.begin = alt;
521
332k
}
522
523
806k
int Compiler::AddSuffixRecursive(int root, int id) {
524
806k
  DCHECK(inst_[root].opcode() == kInstAlt ||
525
806k
         inst_[root].opcode() == kInstByteRange);
526
527
806k
  Frag f = FindByteRange(root, id);
528
806k
  if (IsNoMatch(f)) {
529
431k
    int alt = AllocInst(1);
530
431k
    if (alt < 0)
531
0
      return 0;
532
431k
    inst_[alt].InitAlt(root, id);
533
431k
    return alt;
534
431k
  }
535
536
374k
  int br;
537
374k
  if (f.end.head == 0)
538
24.2k
    br = root;
539
350k
  else if (f.end.head&1)
540
350k
    br = inst_[f.begin].out1();
541
0
  else
542
0
    br = inst_[f.begin].out();
543
544
374k
  if (IsCachedRuneByteSuffix(br)) {
545
    // We can't fiddle with cached suffixes, so make a clone of the head.
546
0
    int byterange = AllocInst(1);
547
0
    if (byterange < 0)
548
0
      return 0;
549
0
    inst_[byterange].InitByteRange(inst_[br].lo(), inst_[br].hi(),
550
0
                                   inst_[br].foldcase(), inst_[br].out());
551
552
    // Ensure that the parent points to the clone, not to the original.
553
    // Note that this could leave the head unreachable except via the cache.
554
0
    br = byterange;
555
0
    if (f.end.head == 0)
556
0
      root = br;
557
0
    else if (f.end.head&1)
558
0
      inst_[f.begin].out1_ = br;
559
0
    else
560
0
      inst_[f.begin].set_out(br);
561
0
  }
562
563
374k
  int out = inst_[id].out();
564
374k
  if (!IsCachedRuneByteSuffix(id)) {
565
    // The head should be the instruction most recently allocated, so free it
566
    // instead of leaving it unreachable.
567
374k
    DCHECK_EQ(id, ninst_-1);
568
374k
    inst_[id].out_opcode_ = 0;
569
374k
    inst_[id].out1_ = 0;
570
374k
    ninst_--;
571
374k
  }
572
573
374k
  out = AddSuffixRecursive(inst_[br].out(), out);
574
374k
  if (out == 0)
575
0
    return 0;
576
577
374k
  inst_[br].set_out(out);
578
374k
  return root;
579
374k
}
580
581
806k
bool Compiler::ByteRangeEqual(int id1, int id2) {
582
806k
  return inst_[id1].lo() == inst_[id2].lo() &&
583
806k
         inst_[id1].hi() == inst_[id2].hi() &&
584
806k
         inst_[id1].foldcase() == inst_[id2].foldcase();
585
806k
}
586
587
806k
Frag Compiler::FindByteRange(int root, int id) {
588
806k
  if (inst_[root].opcode() == kInstByteRange) {
589
145k
    if (ByteRangeEqual(root, id))
590
24.2k
      return Frag(root, kNullPatchList, false);
591
121k
    else
592
121k
      return NoMatch();
593
145k
  }
594
595
660k
  while (inst_[root].opcode() == kInstAlt) {
596
660k
    int out1 = inst_[root].out1();
597
660k
    if (ByteRangeEqual(out1, id))
598
350k
      return Frag(root, PatchList::Mk((root << 1) | 1), false);
599
600
    // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
601
    // what we're looking for, then we can stop immediately. Unfortunately, we
602
    // can't short-circuit the search in reverse mode.
603
309k
    if (!reversed_)
604
309k
      return NoMatch();
605
606
0
    int out = inst_[root].out();
607
0
    if (inst_[out].opcode() == kInstAlt)
608
0
      root = out;
609
0
    else if (ByteRangeEqual(out, id))
610
0
      return Frag(root, PatchList::Mk(root << 1), false);
611
0
    else
612
0
      return NoMatch();
613
0
  }
614
615
0
  LOG(DFATAL) << "should never happen";
616
0
  return NoMatch();
617
660k
}
618
619
331k
Frag Compiler::EndRange() {
620
331k
  return rune_range_;
621
331k
}
622
623
// Converts rune range lo-hi into a fragment that recognizes
624
// the bytes that would make up those runes in the current
625
// encoding (Latin 1 or UTF-8).
626
// This lets the machine work byte-by-byte even when
627
// using multibyte encodings.
628
629
884k
void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) {
630
884k
  switch (encoding_) {
631
0
    default:
632
260k
    case kEncodingUTF8:
633
260k
      AddRuneRangeUTF8(lo, hi, foldcase);
634
260k
      break;
635
623k
    case kEncodingLatin1:
636
623k
      AddRuneRangeLatin1(lo, hi, foldcase);
637
623k
      break;
638
884k
  }
639
884k
}
640
641
623k
void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) {
642
  // Latin-1 is easy: runes *are* bytes.
643
623k
  if (lo > hi || lo > 0xFF)
644
673
    return;
645
622k
  if (hi > 0xFF)
646
21.0k
    hi = 0xFF;
647
622k
  AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
648
622k
                                   static_cast<uint8_t>(hi), foldcase, 0));
649
622k
}
650
651
12.7k
void Compiler::Add_80_10ffff() {
652
  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
653
  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
654
  // permitting overlong encodings in E0 and F0 sequences and code points
655
  // over 10FFFF in F4 sequences, the size of the bytecode and the number
656
  // of equivalence classes are reduced significantly.
657
12.7k
  int id;
658
12.7k
  if (reversed_) {
659
    // Prefix factoring matters, but we don't have to handle it here
660
    // because the rune range trie logic takes care of that already.
661
0
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, 0);
662
0
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
663
0
    AddSuffix(id);
664
665
0
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, 0);
666
0
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
667
0
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
668
0
    AddSuffix(id);
669
670
0
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, 0);
671
0
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
672
0
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
673
0
    id = UncachedRuneByteSuffix(0x80, 0xBF, false, id);
674
0
    AddSuffix(id);
675
12.7k
  } else {
676
    // Suffix factoring matters - and we do have to handle it here.
677
12.7k
    int cont1 = UncachedRuneByteSuffix(0x80, 0xBF, false, 0);
678
12.7k
    id = UncachedRuneByteSuffix(0xC2, 0xDF, false, cont1);
679
12.7k
    AddSuffix(id);
680
681
12.7k
    int cont2 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont1);
682
12.7k
    id = UncachedRuneByteSuffix(0xE0, 0xEF, false, cont2);
683
12.7k
    AddSuffix(id);
684
685
12.7k
    int cont3 = UncachedRuneByteSuffix(0x80, 0xBF, false, cont2);
686
12.7k
    id = UncachedRuneByteSuffix(0xF0, 0xF4, false, cont3);
687
12.7k
    AddSuffix(id);
688
12.7k
  }
689
12.7k
}
690
691
632k
void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) {
692
632k
  if (lo > hi)
693
0
    return;
694
695
  // Pick off 80-10FFFF as a common special case.
696
632k
  if (lo == 0x80 && hi == 0x10ffff) {
697
12.7k
    Add_80_10ffff();
698
12.7k
    return;
699
12.7k
  }
700
701
  // Split range into same-length sized ranges.
702
2.39M
  for (int i = 1; i < UTFmax; i++) {
703
1.81M
    Rune max = MaxRune(i);
704
1.81M
    if (lo <= max && max < hi) {
705
34.8k
      AddRuneRangeUTF8(lo, max, foldcase);
706
34.8k
      AddRuneRangeUTF8(max+1, hi, foldcase);
707
34.8k
      return;
708
34.8k
    }
709
1.81M
  }
710
711
  // ASCII range is always a special case.
712
585k
  if (hi < Runeself) {
713
108k
    AddSuffix(UncachedRuneByteSuffix(static_cast<uint8_t>(lo),
714
108k
                                     static_cast<uint8_t>(hi), foldcase, 0));
715
108k
    return;
716
108k
  }
717
718
  // Split range into sections that agree on leading bytes.
719
1.51M
  for (int i = 1; i < UTFmax; i++) {
720
1.18M
    uint32_t m = (1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
721
1.18M
    if ((lo & ~m) != (hi & ~m)) {
722
354k
      if ((lo & m) != 0) {
723
82.9k
        AddRuneRangeUTF8(lo, lo|m, foldcase);
724
82.9k
        AddRuneRangeUTF8((lo|m)+1, hi, foldcase);
725
82.9k
        return;
726
82.9k
      }
727
271k
      if ((hi & m) != m) {
728
68.1k
        AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase);
729
68.1k
        AddRuneRangeUTF8(hi&~m, hi, foldcase);
730
68.1k
        return;
731
68.1k
      }
732
271k
    }
733
1.18M
  }
734
735
  // Finally.  Generate byte matching equivalent for lo-hi.
736
325k
  uint8_t ulo[UTFmax], uhi[UTFmax];
737
325k
  int n = runetochar(reinterpret_cast<char*>(ulo), &lo);
738
325k
  int m = runetochar(reinterpret_cast<char*>(uhi), &hi);
739
325k
  (void)m;  // USED(m)
740
325k
  DCHECK_EQ(n, m);
741
742
  // The logic below encodes this thinking:
743
  //
744
  // 1. When we have built the whole suffix, we know that it cannot
745
  // possibly be a suffix of anything longer: in forward mode, nothing
746
  // else can occur before the leading byte; in reverse mode, nothing
747
  // else can occur after the last continuation byte or else the leading
748
  // byte would have to change. Thus, there is no benefit to caching
749
  // the first byte of the suffix whereas there is a cost involved in
750
  // cloning it if it begins a common prefix, which is fairly likely.
751
  //
752
  // 2. Conversely, the last byte of the suffix cannot possibly be a
753
  // prefix of anything because next == 0, so we will never want to
754
  // clone it, but it is fairly likely to be a common suffix. Perhaps
755
  // more so in reverse mode than in forward mode because the former is
756
  // "converging" towards lower entropy, but caching is still worthwhile
757
  // for the latter in cases such as 80-BF.
758
  //
759
  // 3. Handling the bytes between the first and the last is less
760
  // straightforward and, again, the approach depends on whether we are
761
  // "converging" towards lower entropy: in forward mode, a single byte
762
  // is unlikely to be part of a common suffix whereas a byte range
763
  // is more likely so; in reverse mode, a byte range is unlikely to
764
  // be part of a common suffix whereas a single byte is more likely
765
  // so. The same benefit versus cost argument applies here.
766
325k
  int id = 0;
767
325k
  if (reversed_) {
768
0
    for (int i = 0; i < n; i++) {
769
      // In reverse UTF-8 mode: cache the leading byte; don't cache the last
770
      // continuation byte; cache anything else iff it's a single byte (XX-XX).
771
0
      if (i == 0 || (ulo[i] == uhi[i] && i != n-1))
772
0
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
773
0
      else
774
0
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
775
0
    }
776
325k
  } else {
777
1.35M
    for (int i = n-1; i >= 0; i--) {
778
      // In forward UTF-8 mode: don't cache the leading byte; cache the last
779
      // continuation byte; cache anything else iff it's a byte range (XX-YY).
780
1.02M
      if (i == n-1 || (ulo[i] < uhi[i] && i != 0))
781
439k
        id = CachedRuneByteSuffix(ulo[i], uhi[i], false, id);
782
587k
      else
783
587k
        id = UncachedRuneByteSuffix(ulo[i], uhi[i], false, id);
784
1.02M
    }
785
325k
  }
786
325k
  AddSuffix(id);
787
325k
}
788
789
// Should not be called.
790
0
Frag Compiler::Copy(Frag arg) {
791
  // We're using WalkExponential; there should be no copying.
792
0
  failed_ = true;
793
0
  LOG(DFATAL) << "Compiler::Copy called!";
794
0
  return NoMatch();
795
0
}
796
797
// Visits a node quickly; called once WalkExponential has
798
// decided to cut this walk short.
799
0
Frag Compiler::ShortVisit(Regexp* re, Frag) {
800
0
  failed_ = true;
801
0
  return NoMatch();
802
0
}
803
804
// Called before traversing a node's children during the walk.
805
2.25M
Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) {
806
  // Cut off walk if we've already failed.
807
2.25M
  if (failed_)
808
0
    *stop = true;
809
810
2.25M
  return Frag();  // not used by caller
811
2.25M
}
812
813
2.90M
Frag Compiler::Literal(Rune r, bool foldcase) {
814
2.90M
  switch (encoding_) {
815
0
    default:
816
0
      return Frag();
817
818
2.46M
    case kEncodingLatin1:
819
2.46M
      return ByteRange(r, r, foldcase);
820
821
435k
    case kEncodingUTF8: {
822
435k
      if (r < Runeself)  // Make common case fast.
823
434k
        return ByteRange(r, r, foldcase);
824
1.54k
      uint8_t buf[UTFmax];
825
1.54k
      int n = runetochar(reinterpret_cast<char*>(buf), &r);
826
1.54k
      Frag f = ByteRange((uint8_t)buf[0], buf[0], false);
827
4.63k
      for (int i = 1; i < n; i++)
828
3.09k
        f = Cat(f, ByteRange((uint8_t)buf[i], buf[i], false));
829
1.54k
      return f;
830
435k
    }
831
2.90M
  }
832
2.90M
}
833
834
// Called after traversing the node's children during the walk.
835
// Given their frags, build and return the frag for this re.
836
Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags,
837
2.25M
                         int nchild_frags) {
838
  // If a child failed, don't bother going forward, especially
839
  // since the child_frags might contain Frags with NULLs in them.
840
2.25M
  if (failed_)
841
0
    return NoMatch();
842
843
  // Given the child fragments, return the fragment for this node.
844
2.25M
  switch (re->op()) {
845
0
    case kRegexpRepeat:
846
      // Should not see; code at bottom of function will print error
847
0
      break;
848
849
23.6k
    case kRegexpNoMatch:
850
23.6k
      return NoMatch();
851
852
32.9k
    case kRegexpEmptyMatch:
853
32.9k
      return Nop();
854
855
0
    case kRegexpHaveMatch: {
856
0
      Frag f = Match(re->match_id());
857
0
      if (anchor_ == RE2::ANCHOR_BOTH) {
858
        // Append \z or else the subexpression will effectively be unanchored.
859
        // Complemented by the UNANCHORED case in CompileSet().
860
0
        f = Cat(EmptyWidth(kEmptyEndText), f);
861
0
      }
862
0
      return f;
863
0
    }
864
865
401k
    case kRegexpConcat: {
866
401k
      Frag f = child_frags[0];
867
1.58M
      for (int i = 1; i < nchild_frags; i++)
868
1.18M
        f = Cat(f, child_frags[i]);
869
401k
      return f;
870
0
    }
871
872
41.5k
    case kRegexpAlternate: {
873
41.5k
      Frag f = child_frags[0];
874
127k
      for (int i = 1; i < nchild_frags; i++)
875
85.5k
        f = Alt(f, child_frags[i]);
876
41.5k
      return f;
877
0
    }
878
879
22.0k
    case kRegexpStar:
880
22.0k
      return Star(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
881
882
23.5k
    case kRegexpPlus:
883
23.5k
      return Plus(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
884
885
312k
    case kRegexpQuest:
886
312k
      return Quest(child_frags[0], (re->parse_flags()&Regexp::NonGreedy) != 0);
887
888
411k
    case kRegexpLiteral:
889
411k
      return Literal(re->rune(), (re->parse_flags()&Regexp::FoldCase) != 0);
890
891
387k
    case kRegexpLiteralString: {
892
      // Concatenation of literals.
893
387k
      if (re->nrunes() == 0)
894
0
        return Nop();
895
387k
      Frag f;
896
2.88M
      for (int i = 0; i < re->nrunes(); i++) {
897
2.49M
        Frag f1 = Literal(re->runes()[i],
898
2.49M
                          (re->parse_flags()&Regexp::FoldCase) != 0);
899
2.49M
        if (i == 0)
900
387k
          f = f1;
901
2.10M
        else
902
2.10M
          f = Cat(f, f1);
903
2.49M
      }
904
387k
      return f;
905
387k
    }
906
907
21.2k
    case kRegexpAnyChar:
908
21.2k
      BeginRange();
909
21.2k
      AddRuneRange(0, Runemax, false);
910
21.2k
      return EndRange();
911
912
2.22k
    case kRegexpAnyByte:
913
2.22k
      return ByteRange(0x00, 0xFF, false);
914
915
310k
    case kRegexpCharClass: {
916
310k
      CharClass* cc = re->cc();
917
310k
      if (cc->empty()) {
918
        // This can't happen.
919
0
        failed_ = true;
920
0
        LOG(DFATAL) << "No ranges in char class";
921
0
        return NoMatch();
922
0
      }
923
924
      // ASCII case-folding optimization: if the char class
925
      // behaves the same on A-Z as it does on a-z,
926
      // discard any ranges wholly contained in A-Z
927
      // and mark the other ranges as foldascii.
928
      // This reduces the size of a program for
929
      // (?i)abc from 3 insts per letter to 1 per letter.
930
310k
      bool foldascii = cc->FoldsASCII();
931
932
      // Character class is just a big OR of the different
933
      // character ranges in the class.
934
310k
      BeginRange();
935
1.20M
      for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) {
936
        // ASCII case-folding optimization (see above).
937
890k
        if (foldascii && 'A' <= i->lo && i->hi <= 'Z')
938
27.5k
          continue;
939
940
        // If this range contains all of A-Za-z or none of it,
941
        // the fold flag is unnecessary; don't bother.
942
863k
        bool fold = foldascii;
943
863k
        if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo ||
944
863k
            ('Z' < i->lo && i->hi < 'a'))
945
826k
          fold = false;
946
947
863k
        AddRuneRange(i->lo, i->hi, fold);
948
863k
      }
949
310k
      return EndRange();
950
310k
    }
951
952
179k
    case kRegexpCapture:
953
      // If this is a non-capturing parenthesis -- (?:foo) --
954
      // just use the inner expression.
955
179k
      if (re->cap() < 0)
956
0
        return child_frags[0];
957
179k
      return Capture(child_frags[0], re->cap());
958
959
2.91k
    case kRegexpBeginLine:
960
2.91k
      return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine);
961
962
4.12k
    case kRegexpEndLine:
963
4.12k
      return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine);
964
965
14.6k
    case kRegexpBeginText:
966
14.6k
      return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText);
967
968
61.7k
    case kRegexpEndText:
969
61.7k
      return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText);
970
971
1.30k
    case kRegexpWordBoundary:
972
1.30k
      return EmptyWidth(kEmptyWordBoundary);
973
974
1.79k
    case kRegexpNoWordBoundary:
975
1.79k
      return EmptyWidth(kEmptyNonWordBoundary);
976
2.25M
  }
977
0
  failed_ = true;
978
0
  LOG(DFATAL) << "Missing case in Compiler: " << re->op();
979
0
  return NoMatch();
980
2.25M
}
981
982
// Is this regexp required to start at the beginning of the text?
983
// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
984
// but handles (\A(a|b)).  Could use the Walker to write a more exact one.
985
8.93k
static bool IsAnchorStart(Regexp** pre, int depth) {
986
8.93k
  Regexp* re = *pre;
987
8.93k
  Regexp* sub;
988
  // The depth limit makes sure that we don't overflow
989
  // the stack on a deeply nested regexp.  As the comment
990
  // above says, IsAnchorStart is conservative, so returning
991
  // a false negative is okay.  The exact limit is somewhat arbitrary.
992
8.93k
  if (re == NULL || depth >= 4)
993
79
    return false;
994
8.85k
  switch (re->op()) {
995
5.03k
    default:
996
5.03k
      break;
997
5.03k
    case kRegexpConcat:
998
3.53k
      if (re->nsub() > 0) {
999
3.53k
        sub = re->sub()[0]->Incref();
1000
3.53k
        if (IsAnchorStart(&sub, depth+1)) {
1001
109
          PODArray<Regexp*> subcopy(re->nsub());
1002
109
          subcopy[0] = sub;  // already have reference
1003
2.64k
          for (int i = 1; i < re->nsub(); i++)
1004
2.53k
            subcopy[i] = re->sub()[i]->Incref();
1005
109
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1006
109
          re->Decref();
1007
109
          return true;
1008
109
        }
1009
3.42k
        sub->Decref();
1010
3.42k
      }
1011
3.42k
      break;
1012
3.42k
    case kRegexpCapture:
1013
190
      sub = re->sub()[0]->Incref();
1014
190
      if (IsAnchorStart(&sub, depth+1)) {
1015
3
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1016
3
        re->Decref();
1017
3
        return true;
1018
3
      }
1019
187
      sub->Decref();
1020
187
      break;
1021
99
    case kRegexpBeginText:
1022
99
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1023
99
      re->Decref();
1024
99
      return true;
1025
8.85k
  }
1026
8.64k
  return false;
1027
8.85k
}
1028
1029
// Is this regexp required to start at the end of the text?
1030
// Only approximate; can return false for complicated regexps like (a\z|b\z),
1031
// but handles ((a|b)\z).  Could use the Walker to write a more exact one.
1032
8.75k
static bool IsAnchorEnd(Regexp** pre, int depth) {
1033
8.75k
  Regexp* re = *pre;
1034
8.75k
  Regexp* sub;
1035
  // The depth limit makes sure that we don't overflow
1036
  // the stack on a deeply nested regexp.  As the comment
1037
  // above says, IsAnchorEnd is conservative, so returning
1038
  // a false negative is okay.  The exact limit is somewhat arbitrary.
1039
8.75k
  if (re == NULL || depth >= 4)
1040
11
    return false;
1041
8.74k
  switch (re->op()) {
1042
5.13k
    default:
1043
5.13k
      break;
1044
5.13k
    case kRegexpConcat:
1045
3.45k
      if (re->nsub() > 0) {
1046
3.45k
        sub = re->sub()[re->nsub() - 1]->Incref();
1047
3.45k
        if (IsAnchorEnd(&sub, depth+1)) {
1048
66
          PODArray<Regexp*> subcopy(re->nsub());
1049
66
          subcopy[re->nsub() - 1] = sub;  // already have reference
1050
2.35k
          for (int i = 0; i < re->nsub() - 1; i++)
1051
2.29k
            subcopy[i] = re->sub()[i]->Incref();
1052
66
          *pre = Regexp::Concat(subcopy.data(), re->nsub(), re->parse_flags());
1053
66
          re->Decref();
1054
66
          return true;
1055
66
        }
1056
3.38k
        sub->Decref();
1057
3.38k
      }
1058
3.38k
      break;
1059
3.38k
    case kRegexpCapture:
1060
87
      sub = re->sub()[0]->Incref();
1061
87
      if (IsAnchorEnd(&sub, depth+1)) {
1062
4
        *pre = Regexp::Capture(sub, re->parse_flags(), re->cap());
1063
4
        re->Decref();
1064
4
        return true;
1065
4
      }
1066
83
      sub->Decref();
1067
83
      break;
1068
65
    case kRegexpEndText:
1069
65
      *pre = Regexp::LiteralString(NULL, 0, re->parse_flags());
1070
65
      re->Decref();
1071
65
      return true;
1072
8.74k
  }
1073
8.60k
  return false;
1074
8.74k
}
1075
1076
void Compiler::Setup(Regexp::ParseFlags flags, int64_t max_mem,
1077
5.21k
                     RE2::Anchor anchor) {
1078
5.21k
  if (flags & Regexp::Latin1)
1079
3.84k
    encoding_ = kEncodingLatin1;
1080
5.21k
  max_mem_ = max_mem;
1081
5.21k
  if (max_mem <= 0) {
1082
0
    max_ninst_ = 100000;  // more than enough
1083
5.21k
  } else if (static_cast<size_t>(max_mem) <= sizeof(Prog)) {
1084
    // No room for anything.
1085
0
    max_ninst_ = 0;
1086
5.21k
  } else {
1087
5.21k
    int64_t m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst);
1088
    // Limit instruction count so that inst->id() fits nicely in an int.
1089
    // SparseArray also assumes that the indices (inst->id()) are ints.
1090
    // The call to WalkExponential uses 2*max_ninst_ below,
1091
    // and other places in the code use 2 or 3 * prog->size().
1092
    // Limiting to 2^24 should avoid overflow in those places.
1093
    // (The point of allowing more than 32 bits of memory is to
1094
    // have plenty of room for the DFA states, not to use it up
1095
    // on the program.)
1096
5.21k
    if (m >= 1<<24)
1097
0
      m = 1<<24;
1098
    // Inst imposes its own limit (currently bigger than 2^24 but be safe).
1099
5.21k
    if (m > Prog::Inst::kMaxInst)
1100
0
      m = Prog::Inst::kMaxInst;
1101
5.21k
    max_ninst_ = static_cast<int>(m);
1102
5.21k
  }
1103
5.21k
  anchor_ = anchor;
1104
5.21k
}
1105
1106
// Compiles re, returning program.
1107
// Caller is responsible for deleting prog_.
1108
// If reversed is true, compiles a program that expects
1109
// to run over the input string backward (reverses all concatenations).
1110
// The reversed flag is also recorded in the returned program.
1111
5.21k
Prog* Compiler::Compile(Regexp* re, bool reversed, int64_t max_mem) {
1112
5.21k
  Compiler c;
1113
5.21k
  c.Setup(re->parse_flags(), max_mem, RE2::UNANCHORED /* unused */);
1114
5.21k
  c.reversed_ = reversed;
1115
1116
  // Simplify to remove things like counted repetitions
1117
  // and character classes like \d.
1118
5.21k
  Regexp* sre = re->Simplify();
1119
5.21k
  if (sre == NULL)
1120
0
    return NULL;
1121
1122
  // Record whether prog is anchored, removing the anchors.
1123
  // (They get in the way of other optimizations.)
1124
5.21k
  bool is_anchor_start = IsAnchorStart(&sre, 0);
1125
5.21k
  bool is_anchor_end = IsAnchorEnd(&sre, 0);
1126
1127
  // Generate fragment for entire regexp.
1128
5.21k
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1129
5.21k
  sre->Decref();
1130
5.21k
  if (c.failed_)
1131
0
    return NULL;
1132
1133
  // Success!  Finish by putting Match node at end, and record start.
1134
  // Turn off c.reversed_ (if it is set) to force the remaining concatenations
1135
  // to behave normally.
1136
5.21k
  c.reversed_ = false;
1137
5.21k
  all = c.Cat(all, c.Match(0));
1138
1139
5.21k
  c.prog_->set_reversed(reversed);
1140
5.21k
  if (c.prog_->reversed()) {
1141
0
    c.prog_->set_anchor_start(is_anchor_end);
1142
0
    c.prog_->set_anchor_end(is_anchor_start);
1143
5.21k
  } else {
1144
5.21k
    c.prog_->set_anchor_start(is_anchor_start);
1145
5.21k
    c.prog_->set_anchor_end(is_anchor_end);
1146
5.21k
  }
1147
1148
5.21k
  c.prog_->set_start(all.begin);
1149
5.21k
  if (!c.prog_->anchor_start()) {
1150
    // Also create unanchored version, which starts with a .*? loop.
1151
5.11k
    all = c.Cat(c.DotStar(), all);
1152
5.11k
  }
1153
5.21k
  c.prog_->set_start_unanchored(all.begin);
1154
1155
  // Hand ownership of prog_ to caller.
1156
5.21k
  return c.Finish(re);
1157
5.21k
}
1158
1159
5.21k
Prog* Compiler::Finish(Regexp* re) {
1160
5.21k
  if (failed_)
1161
0
    return NULL;
1162
1163
5.21k
  if (prog_->start() == 0 && prog_->start_unanchored() == 0) {
1164
    // No possible matches; keep Fail instruction only.
1165
161
    ninst_ = 1;
1166
161
  }
1167
1168
  // Hand off the array to Prog.
1169
5.21k
  prog_->inst_ = std::move(inst_);
1170
5.21k
  prog_->size_ = ninst_;
1171
1172
5.21k
  prog_->Optimize();
1173
5.21k
  prog_->Flatten();
1174
5.21k
  prog_->ComputeByteMap();
1175
1176
5.21k
  if (!prog_->reversed()) {
1177
5.21k
    std::string prefix;
1178
5.21k
    bool prefix_foldcase;
1179
5.21k
    if (re->RequiredPrefixForAccel(&prefix, &prefix_foldcase))
1180
2.36k
      prog_->ConfigurePrefixAccel(prefix, prefix_foldcase);
1181
5.21k
  }
1182
1183
  // Record remaining memory for DFA.
1184
5.21k
  if (max_mem_ <= 0) {
1185
0
    prog_->set_dfa_mem(1<<20);
1186
5.21k
  } else {
1187
5.21k
    int64_t m = max_mem_ - sizeof(Prog);
1188
5.21k
    m -= prog_->size_*sizeof(Prog::Inst);  // account for inst_
1189
5.21k
    if (prog_->CanBitState())
1190
4.66k
      m -= prog_->size_*sizeof(uint16_t);  // account for list_heads_
1191
5.21k
    if (m < 0)
1192
0
      m = 0;
1193
5.21k
    prog_->set_dfa_mem(m);
1194
5.21k
  }
1195
1196
5.21k
  Prog* p = prog_;
1197
5.21k
  prog_ = NULL;
1198
5.21k
  return p;
1199
5.21k
}
1200
1201
// Converts Regexp to Prog.
1202
5.21k
Prog* Regexp::CompileToProg(int64_t max_mem) {
1203
5.21k
  return Compiler::Compile(this, false, max_mem);
1204
5.21k
}
1205
1206
0
Prog* Regexp::CompileToReverseProg(int64_t max_mem) {
1207
0
  return Compiler::Compile(this, true, max_mem);
1208
0
}
1209
1210
5.11k
Frag Compiler::DotStar() {
1211
5.11k
  return Star(ByteRange(0x00, 0xff, false), true);
1212
5.11k
}
1213
1214
// Compiles RE set to Prog.
1215
0
Prog* Compiler::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1216
0
  Compiler c;
1217
0
  c.Setup(re->parse_flags(), max_mem, anchor);
1218
1219
0
  Regexp* sre = re->Simplify();
1220
0
  if (sre == NULL)
1221
0
    return NULL;
1222
1223
0
  Frag all = c.WalkExponential(sre, Frag(), 2*c.max_ninst_);
1224
0
  sre->Decref();
1225
0
  if (c.failed_)
1226
0
    return NULL;
1227
1228
0
  c.prog_->set_anchor_start(true);
1229
0
  c.prog_->set_anchor_end(true);
1230
1231
0
  if (anchor == RE2::UNANCHORED) {
1232
    // Prepend .* or else the expression will effectively be anchored.
1233
    // Complemented by the ANCHOR_BOTH case in PostVisit().
1234
0
    all = c.Cat(c.DotStar(), all);
1235
0
  }
1236
0
  c.prog_->set_start(all.begin);
1237
0
  c.prog_->set_start_unanchored(all.begin);
1238
1239
0
  Prog* prog = c.Finish(re);
1240
0
  if (prog == NULL)
1241
0
    return NULL;
1242
1243
  // Make sure DFA has enough memory to operate,
1244
  // since we're not going to fall back to the NFA.
1245
0
  bool dfa_failed = false;
1246
0
  StringPiece sp = "hello, world";
1247
0
  prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch,
1248
0
                  NULL, &dfa_failed, NULL);
1249
0
  if (dfa_failed) {
1250
0
    delete prog;
1251
0
    return NULL;
1252
0
  }
1253
1254
0
  return prog;
1255
0
}
1256
1257
0
Prog* Prog::CompileSet(Regexp* re, RE2::Anchor anchor, int64_t max_mem) {
1258
0
  return Compiler::CompileSet(re, anchor, max_mem);
1259
0
}
1260
1261
}  // namespace re2