Coverage Report

Created: 2026-05-27 07:00

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/proc/self/cwd/external/re2~/re2/regexp.cc
Line
Count
Source
1
// Copyright 2006 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
// Regular expression representation.
6
// Tested by parse_test.cc
7
8
#include "re2/regexp.h"
9
10
#include <stddef.h>
11
#include <stdint.h>
12
#include <string.h>
13
14
#include <algorithm>
15
#include <map>
16
#include <string>
17
#include <vector>
18
19
#include "absl/base/call_once.h"
20
#include "absl/base/macros.h"
21
#include "absl/container/flat_hash_map.h"
22
#include "absl/log/absl_check.h"
23
#include "absl/log/absl_log.h"
24
#include "absl/synchronization/mutex.h"
25
#include "re2/pod_array.h"
26
#include "re2/walker-inl.h"
27
#include "util/utf.h"
28
29
namespace re2 {
30
31
// Constructor.  Allocates vectors as appropriate for operator.
32
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
33
0
  : op_(static_cast<uint8_t>(op)),
34
0
    simple_(false),
35
0
    parse_flags_(static_cast<uint16_t>(parse_flags)),
36
0
    ref_(1),
37
0
    nsub_(0),
38
0
    down_(NULL) {
39
0
  subone_ = NULL;
40
0
  memset(the_union_, 0, sizeof the_union_);
41
0
}
42
43
// Destructor.  Assumes already cleaned up children.
44
// Private: use Decref() instead of delete to destroy Regexps.
45
// Can't call Decref on the sub-Regexps here because
46
// that could cause arbitrarily deep recursion, so
47
// required Decref() to have handled them for us.
48
0
Regexp::~Regexp() {
49
0
  if (nsub_ > 0)
50
0
    ABSL_LOG(DFATAL) << "Regexp not destroyed.";
51
52
0
  switch (op_) {
53
0
    default:
54
0
      break;
55
0
    case kRegexpCapture:
56
0
      delete name_;
57
0
      break;
58
0
    case kRegexpLiteralString:
59
0
      delete[] runes_;
60
0
      break;
61
0
    case kRegexpCharClass:
62
0
      if (cc_)
63
0
        cc_->Delete();
64
0
      delete ccb_;
65
0
      break;
66
0
  }
67
0
}
68
69
// If it's possible to destroy this regexp without recurring,
70
// do so and return true.  Else return false.
71
0
bool Regexp::QuickDestroy() {
72
0
  if (nsub_ == 0) {
73
0
    delete this;
74
0
    return true;
75
0
  }
76
0
  return false;
77
0
}
78
79
// Similar to EmptyStorage in re2.cc.
80
struct RefStorage {
81
  absl::Mutex ref_mutex;
82
  absl::flat_hash_map<Regexp*, int> ref_map;
83
};
84
alignas(RefStorage) static char ref_storage[sizeof(RefStorage)];
85
86
0
static inline absl::Mutex* ref_mutex() {
87
0
  return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex;
88
0
}
89
90
0
static inline absl::flat_hash_map<Regexp*, int>* ref_map() {
91
0
  return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map;
92
0
}
93
94
0
int Regexp::Ref() {
95
0
  if (ref_ < kMaxRef)
96
0
    return ref_;
97
98
0
  absl::MutexLock l(ref_mutex());
99
0
  return (*ref_map())[this];
100
0
}
101
102
// Increments reference count, returns object as convenience.
103
0
Regexp* Regexp::Incref() {
104
0
  if (ref_ >= kMaxRef-1) {
105
0
    static absl::once_flag ref_once;
106
0
    absl::call_once(ref_once, []() {
107
0
      (void) new (ref_storage) RefStorage;
108
0
    });
109
110
    // Store ref count in overflow map.
111
0
    absl::MutexLock l(ref_mutex());
112
0
    if (ref_ == kMaxRef) {
113
      // already overflowed
114
0
      (*ref_map())[this]++;
115
0
    } else {
116
      // overflowing now
117
0
      (*ref_map())[this] = kMaxRef;
118
0
      ref_ = kMaxRef;
119
0
    }
120
0
    return this;
121
0
  }
122
123
0
  ref_++;
124
0
  return this;
125
0
}
126
127
// Decrements reference count and deletes this object if count reaches 0.
128
0
void Regexp::Decref() {
129
0
  if (ref_ == kMaxRef) {
130
    // Ref count is stored in overflow map.
131
0
    absl::MutexLock l(ref_mutex());
132
0
    int r = (*ref_map())[this] - 1;
133
0
    if (r < kMaxRef) {
134
0
      ref_ = static_cast<uint16_t>(r);
135
0
      ref_map()->erase(this);
136
0
    } else {
137
0
      (*ref_map())[this] = r;
138
0
    }
139
0
    return;
140
0
  }
141
0
  ref_--;
142
0
  if (ref_ == 0)
143
0
    Destroy();
144
0
}
145
146
// Deletes this object; ref count has count reached 0.
147
0
void Regexp::Destroy() {
148
0
  if (QuickDestroy())
149
0
    return;
150
151
  // Handle recursive Destroy with explicit stack
152
  // to avoid arbitrarily deep recursion on process stack [sigh].
153
0
  down_ = NULL;
154
0
  Regexp* stack = this;
155
0
  while (stack != NULL) {
156
0
    Regexp* re = stack;
157
0
    stack = re->down_;
158
0
    if (re->ref_ != 0)
159
0
      ABSL_LOG(DFATAL) << "Bad reference count " << re->ref_;
160
0
    if (re->nsub_ > 0) {
161
0
      Regexp** subs = re->sub();
162
0
      for (int i = 0; i < re->nsub_; i++) {
163
0
        Regexp* sub = subs[i];
164
0
        if (sub == NULL)
165
0
          continue;
166
0
        if (sub->ref_ == kMaxRef)
167
0
          sub->Decref();
168
0
        else
169
0
          --sub->ref_;
170
0
        if (sub->ref_ == 0 && !sub->QuickDestroy()) {
171
0
          sub->down_ = stack;
172
0
          stack = sub;
173
0
        }
174
0
      }
175
0
      if (re->nsub_ > 1)
176
0
        delete[] subs;
177
0
      re->nsub_ = 0;
178
0
    }
179
0
    delete re;
180
0
  }
181
0
}
182
183
0
void Regexp::AddRuneToString(Rune r) {
184
0
  ABSL_DCHECK(op_ == kRegexpLiteralString);
185
0
  if (nrunes_ == 0) {
186
    // start with 8
187
0
    runes_ = new Rune[8];
188
0
  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
189
    // double on powers of two
190
0
    Rune *old = runes_;
191
0
    runes_ = new Rune[nrunes_ * 2];
192
0
    for (int i = 0; i < nrunes_; i++)
193
0
      runes_[i] = old[i];
194
0
    delete[] old;
195
0
  }
196
197
0
  runes_[nrunes_++] = r;
198
0
}
199
200
0
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
201
0
  Regexp* re = new Regexp(kRegexpHaveMatch, flags);
202
0
  re->match_id_ = match_id;
203
0
  return re;
204
0
}
205
206
0
Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) {
207
  // Squash **, ++ and ??.
208
0
  if (op == sub->op() && flags == sub->parse_flags())
209
0
    return sub;
210
211
  // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
212
  // op is Star/Plus/Quest, we just have to check that sub->op() is too.
213
0
  if ((sub->op() == kRegexpStar ||
214
0
       sub->op() == kRegexpPlus ||
215
0
       sub->op() == kRegexpQuest) &&
216
0
      flags == sub->parse_flags()) {
217
    // If sub is Star, no need to rewrite it.
218
0
    if (sub->op() == kRegexpStar)
219
0
      return sub;
220
221
    // Rewrite sub to Star.
222
0
    Regexp* re = new Regexp(kRegexpStar, flags);
223
0
    re->AllocSub(1);
224
0
    re->sub()[0] = sub->sub()[0]->Incref();
225
0
    sub->Decref();  // We didn't consume the reference after all.
226
0
    return re;
227
0
  }
228
229
0
  Regexp* re = new Regexp(op, flags);
230
0
  re->AllocSub(1);
231
0
  re->sub()[0] = sub;
232
0
  return re;
233
0
}
234
235
0
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
236
0
  return StarPlusOrQuest(kRegexpPlus, sub, flags);
237
0
}
238
239
0
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
240
0
  return StarPlusOrQuest(kRegexpStar, sub, flags);
241
0
}
242
243
0
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
244
0
  return StarPlusOrQuest(kRegexpQuest, sub, flags);
245
0
}
246
247
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
248
0
                                  ParseFlags flags, bool can_factor) {
249
0
  if (nsub == 1)
250
0
    return sub[0];
251
252
0
  if (nsub == 0) {
253
0
    if (op == kRegexpAlternate)
254
0
      return new Regexp(kRegexpNoMatch, flags);
255
0
    else
256
0
      return new Regexp(kRegexpEmptyMatch, flags);
257
0
  }
258
259
0
  PODArray<Regexp*> subcopy;
260
0
  if (op == kRegexpAlternate && can_factor) {
261
    // Going to edit sub; make a copy so we don't step on caller.
262
0
    subcopy = PODArray<Regexp*>(nsub);
263
0
    memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
264
0
    sub = subcopy.data();
265
0
    nsub = FactorAlternation(sub, nsub, flags);
266
0
    if (nsub == 1) {
267
0
      Regexp* re = sub[0];
268
0
      return re;
269
0
    }
270
0
  }
271
272
0
  if (nsub > kMaxNsub) {
273
    // Too many subexpressions to fit in a single Regexp.
274
    // Make a two-level tree.  Two levels gets us to 65535^2.
275
0
    int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
276
0
    Regexp* re = new Regexp(op, flags);
277
0
    re->AllocSub(nbigsub);
278
0
    Regexp** subs = re->sub();
279
0
    for (int i = 0; i < nbigsub - 1; i++)
280
0
      subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
281
0
    subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
282
0
                                          nsub - (nbigsub-1)*kMaxNsub, flags,
283
0
                                          false);
284
0
    return re;
285
0
  }
286
287
0
  Regexp* re = new Regexp(op, flags);
288
0
  re->AllocSub(nsub);
289
0
  Regexp** subs = re->sub();
290
0
  for (int i = 0; i < nsub; i++)
291
0
    subs[i] = sub[i];
292
0
  return re;
293
0
}
294
295
0
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
296
0
  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
297
0
}
298
299
0
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
300
0
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
301
0
}
302
303
0
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
304
0
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
305
0
}
306
307
0
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
308
0
  Regexp* re = new Regexp(kRegexpCapture, flags);
309
0
  re->AllocSub(1);
310
0
  re->sub()[0] = sub;
311
0
  re->cap_ = cap;
312
0
  return re;
313
0
}
314
315
0
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
316
0
  Regexp* re = new Regexp(kRegexpRepeat, flags);
317
0
  re->AllocSub(1);
318
0
  re->sub()[0] = sub;
319
0
  re->min_ = min;
320
0
  re->max_ = max;
321
0
  return re;
322
0
}
323
324
0
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
325
0
  Regexp* re = new Regexp(kRegexpLiteral, flags);
326
0
  re->rune_ = rune;
327
0
  return re;
328
0
}
329
330
0
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
331
0
  if (nrunes <= 0)
332
0
    return new Regexp(kRegexpEmptyMatch, flags);
333
0
  if (nrunes == 1)
334
0
    return NewLiteral(runes[0], flags);
335
0
  Regexp* re = new Regexp(kRegexpLiteralString, flags);
336
0
  for (int i = 0; i < nrunes; i++)
337
0
    re->AddRuneToString(runes[i]);
338
0
  return re;
339
0
}
340
341
0
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
342
0
  Regexp* re = new Regexp(kRegexpCharClass, flags);
343
0
  re->cc_ = cc;
344
0
  return re;
345
0
}
346
347
0
void Regexp::Swap(Regexp* that) {
348
  // Regexp is not trivially copyable, so we cannot freely copy it with
349
  // memmove(3), but swapping objects like so is safe for our purposes.
350
0
  char tmp[sizeof *this];
351
0
  void* vthis = reinterpret_cast<void*>(this);
352
0
  void* vthat = reinterpret_cast<void*>(that);
353
0
  memmove(tmp, vthis, sizeof *this);
354
0
  memmove(vthis, vthat, sizeof *this);
355
0
  memmove(vthat, tmp, sizeof *this);
356
0
}
357
358
// Tests equality of all top-level structure but not subregexps.
359
0
static bool TopEqual(Regexp* a, Regexp* b) {
360
0
  if (a->op() != b->op())
361
0
    return false;
362
363
0
  switch (a->op()) {
364
0
    case kRegexpNoMatch:
365
0
    case kRegexpEmptyMatch:
366
0
    case kRegexpAnyChar:
367
0
    case kRegexpAnyByte:
368
0
    case kRegexpBeginLine:
369
0
    case kRegexpEndLine:
370
0
    case kRegexpWordBoundary:
371
0
    case kRegexpNoWordBoundary:
372
0
    case kRegexpBeginText:
373
0
      return true;
374
375
0
    case kRegexpEndText:
376
      // The parse flags remember whether it's \z or (?-m:$),
377
      // which matters when testing against PCRE.
378
0
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
379
380
0
    case kRegexpLiteral:
381
0
      return a->rune() == b->rune() &&
382
0
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
383
384
0
    case kRegexpLiteralString:
385
0
      return a->nrunes() == b->nrunes() &&
386
0
             ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
387
0
             memcmp(a->runes(), b->runes(),
388
0
                    a->nrunes() * sizeof a->runes()[0]) == 0;
389
390
0
    case kRegexpAlternate:
391
0
    case kRegexpConcat:
392
0
      return a->nsub() == b->nsub();
393
394
0
    case kRegexpStar:
395
0
    case kRegexpPlus:
396
0
    case kRegexpQuest:
397
0
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
398
399
0
    case kRegexpRepeat:
400
0
      return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
401
0
             a->min() == b->min() &&
402
0
             a->max() == b->max();
403
404
0
    case kRegexpCapture:
405
0
      if (a->name() == NULL || b->name() == NULL) {
406
        // One pointer is null, so the other pointer should also be null.
407
0
        return a->cap() == b->cap() && a->name() == b->name();
408
0
      } else {
409
        // Neither pointer is null, so compare the pointees for equality.
410
0
        return a->cap() == b->cap() && *a->name() == *b->name();
411
0
      }
412
413
0
    case kRegexpHaveMatch:
414
0
      return a->match_id() == b->match_id();
415
416
0
    case kRegexpCharClass: {
417
0
      CharClass* acc = a->cc();
418
0
      CharClass* bcc = b->cc();
419
0
      return acc->size() == bcc->size() &&
420
0
             acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
421
0
             memcmp(acc->begin(), bcc->begin(),
422
0
                    (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
423
0
    }
424
0
  }
425
426
0
  ABSL_LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
427
0
  return 0;
428
0
}
429
430
bool Regexp::Equal(Regexp* a, Regexp* b) {
431
  if (a == NULL || b == NULL)
432
    return a == b;
433
434
  if (!TopEqual(a, b))
435
    return false;
436
437
  // Fast path:
438
  // return without allocating vector if there are no subregexps.
439
  switch (a->op()) {
440
    case kRegexpAlternate:
441
    case kRegexpConcat:
442
    case kRegexpStar:
443
    case kRegexpPlus:
444
    case kRegexpQuest:
445
    case kRegexpRepeat:
446
    case kRegexpCapture:
447
      break;
448
449
    default:
450
      return true;
451
  }
452
453
  // Committed to doing real work.
454
  // The stack (vector) has pairs of regexps waiting to
455
  // be compared.  The regexps are only equal if
456
  // all the pairs end up being equal.
457
  std::vector<Regexp*> stk;
458
459
  for (;;) {
460
    // Invariant: TopEqual(a, b) == true.
461
    Regexp* a2;
462
    Regexp* b2;
463
    switch (a->op()) {
464
      default:
465
        break;
466
      case kRegexpAlternate:
467
      case kRegexpConcat:
468
        for (int i = 0; i < a->nsub(); i++) {
469
          a2 = a->sub()[i];
470
          b2 = b->sub()[i];
471
          if (!TopEqual(a2, b2))
472
            return false;
473
          stk.push_back(a2);
474
          stk.push_back(b2);
475
        }
476
        break;
477
478
      case kRegexpStar:
479
      case kRegexpPlus:
480
      case kRegexpQuest:
481
      case kRegexpRepeat:
482
      case kRegexpCapture:
483
        a2 = a->sub()[0];
484
        b2 = b->sub()[0];
485
        if (!TopEqual(a2, b2))
486
          return false;
487
        // Really:
488
        //   stk.push_back(a2);
489
        //   stk.push_back(b2);
490
        //   break;
491
        // but faster to assign directly and loop.
492
        a = a2;
493
        b = b2;
494
        continue;
495
    }
496
497
    size_t n = stk.size();
498
    if (n == 0)
499
      break;
500
501
    ABSL_DCHECK_GE(n, size_t{2});
502
    a = stk[n-2];
503
    b = stk[n-1];
504
    stk.resize(n-2);
505
  }
506
507
  return true;
508
}
509
510
// Keep in sync with enum RegexpStatusCode in regexp.h
511
static const char *kErrorStrings[] = {
512
  "no error",
513
  "unexpected error",
514
  "invalid escape sequence",
515
  "invalid character class",
516
  "invalid character class range",
517
  "missing ]",
518
  "missing )",
519
  "unexpected )",
520
  "trailing \\",
521
  "no argument for repetition operator",
522
  "invalid repetition size",
523
  "bad repetition operator",
524
  "invalid perl operator",
525
  "invalid UTF-8",
526
  "invalid named capture group",
527
};
528
529
0
std::string RegexpStatus::CodeText(enum RegexpStatusCode code) {
530
0
  if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings))
531
0
    code = kRegexpInternalError;
532
0
  return kErrorStrings[code];
533
0
}
534
535
0
std::string RegexpStatus::Text() const {
536
0
  if (error_arg_.empty())
537
0
    return CodeText(code_);
538
0
  std::string s;
539
0
  s.append(CodeText(code_));
540
0
  s.append(": ");
541
0
  s.append(error_arg_.data(), error_arg_.size());
542
0
  return s;
543
0
}
544
545
0
void RegexpStatus::Copy(const RegexpStatus& status) {
546
0
  code_ = status.code_;
547
0
  error_arg_ = status.error_arg_;
548
0
}
549
550
typedef int Ignored;  // Walker<void> doesn't exist
551
552
// Walker subclass to count capturing parens in regexp.
553
class NumCapturesWalker : public Regexp::Walker<Ignored> {
554
 public:
555
0
  NumCapturesWalker() : ncapture_(0) {}
556
0
  int ncapture() { return ncapture_; }
557
558
0
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
559
0
    if (re->op() == kRegexpCapture)
560
0
      ncapture_++;
561
0
    return ignored;
562
0
  }
563
564
0
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
565
    // Should never be called: we use Walk(), not WalkExponential().
566
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
567
    ABSL_LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
568
#endif
569
0
    return ignored;
570
0
  }
571
572
 private:
573
  int ncapture_;
574
575
  NumCapturesWalker(const NumCapturesWalker&) = delete;
576
  NumCapturesWalker& operator=(const NumCapturesWalker&) = delete;
577
};
578
579
0
int Regexp::NumCaptures() {
580
0
  NumCapturesWalker w;
581
0
  w.Walk(this, 0);
582
0
  return w.ncapture();
583
0
}
584
585
// Walker class to build map of named capture groups and their indices.
586
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
587
 public:
588
0
  NamedCapturesWalker() : map_(NULL) {}
589
0
  ~NamedCapturesWalker() { delete map_; }
590
591
0
  std::map<std::string, int>* TakeMap() {
592
0
    std::map<std::string, int>* m = map_;
593
0
    map_ = NULL;
594
0
    return m;
595
0
  }
596
597
0
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
598
0
    if (re->op() == kRegexpCapture && re->name() != NULL) {
599
      // Allocate map once we find a name.
600
0
      if (map_ == NULL)
601
0
        map_ = new std::map<std::string, int>;
602
603
      // Record first occurrence of each name.
604
      // (The rule is that if you have the same name
605
      // multiple times, only the leftmost one counts.)
606
0
      map_->insert({*re->name(), re->cap()});
607
0
    }
608
0
    return ignored;
609
0
  }
610
611
0
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
612
    // Should never be called: we use Walk(), not WalkExponential().
613
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
614
    ABSL_LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
615
#endif
616
0
    return ignored;
617
0
  }
618
619
 private:
620
  std::map<std::string, int>* map_;
621
622
  NamedCapturesWalker(const NamedCapturesWalker&) = delete;
623
  NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete;
624
};
625
626
0
std::map<std::string, int>* Regexp::NamedCaptures() {
627
0
  NamedCapturesWalker w;
628
0
  w.Walk(this, 0);
629
0
  return w.TakeMap();
630
0
}
631
632
// Walker class to build map from capture group indices to their names.
633
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
634
 public:
635
0
  CaptureNamesWalker() : map_(NULL) {}
636
0
  ~CaptureNamesWalker() { delete map_; }
637
638
0
  std::map<int, std::string>* TakeMap() {
639
0
    std::map<int, std::string>* m = map_;
640
0
    map_ = NULL;
641
0
    return m;
642
0
  }
643
644
0
  virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
645
0
    if (re->op() == kRegexpCapture && re->name() != NULL) {
646
      // Allocate map once we find a name.
647
0
      if (map_ == NULL)
648
0
        map_ = new std::map<int, std::string>;
649
650
0
      (*map_)[re->cap()] = *re->name();
651
0
    }
652
0
    return ignored;
653
0
  }
654
655
0
  virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
656
    // Should never be called: we use Walk(), not WalkExponential().
657
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
658
    ABSL_LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
659
#endif
660
0
    return ignored;
661
0
  }
662
663
 private:
664
  std::map<int, std::string>* map_;
665
666
  CaptureNamesWalker(const CaptureNamesWalker&) = delete;
667
  CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete;
668
};
669
670
0
std::map<int, std::string>* Regexp::CaptureNames() {
671
0
  CaptureNamesWalker w;
672
0
  w.Walk(this, 0);
673
0
  return w.TakeMap();
674
0
}
675
676
void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes,
677
0
                         std::string* bytes) {
678
0
  if (latin1) {
679
0
    bytes->resize(nrunes);
680
0
    for (int i = 0; i < nrunes; i++)
681
0
      (*bytes)[i] = static_cast<char>(runes[i]);
682
0
  } else {
683
0
    bytes->resize(nrunes * UTFmax);  // worst case
684
0
    char* p = &(*bytes)[0];
685
0
    for (int i = 0; i < nrunes; i++)
686
0
      p += runetochar(p, &runes[i]);
687
0
    bytes->resize(p - &(*bytes)[0]);
688
0
    bytes->shrink_to_fit();
689
0
  }
690
0
}
691
692
// Determines whether regexp matches must be anchored
693
// with a fixed string prefix.  If so, returns the prefix and
694
// the regexp that remains after the prefix.  The prefix might
695
// be ASCII case-insensitive.
696
bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase,
697
0
                            Regexp** suffix) {
698
0
  prefix->clear();
699
0
  *foldcase = false;
700
0
  *suffix = NULL;
701
702
  // No need for a walker: the regexp must be of the form
703
  // 1. some number of ^ anchors
704
  // 2. a literal char or string
705
  // 3. the rest
706
0
  if (op_ != kRegexpConcat)
707
0
    return false;
708
0
  int i = 0;
709
0
  while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText)
710
0
    i++;
711
0
  if (i == 0 || i >= nsub_)
712
0
    return false;
713
0
  Regexp* re = sub()[i];
714
0
  if (re->op_ != kRegexpLiteral &&
715
0
      re->op_ != kRegexpLiteralString)
716
0
    return false;
717
0
  i++;
718
0
  if (i < nsub_) {
719
0
    for (int j = i; j < nsub_; j++)
720
0
      sub()[j]->Incref();
721
0
    *suffix = Concat(sub() + i, nsub_ - i, parse_flags());
722
0
  } else {
723
0
    *suffix = new Regexp(kRegexpEmptyMatch, parse_flags());
724
0
  }
725
726
0
  bool latin1 = (re->parse_flags() & Latin1) != 0;
727
0
  Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
728
0
  int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
729
0
  ConvertRunesToBytes(latin1, runes, nrunes, prefix);
730
0
  *foldcase = (re->parse_flags() & FoldCase) != 0;
731
0
  return true;
732
0
}
733
734
// Determines whether regexp matches must be unanchored
735
// with a fixed string prefix.  If so, returns the prefix.
736
// The prefix might be ASCII case-insensitive.
737
0
bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) {
738
0
  prefix->clear();
739
0
  *foldcase = false;
740
741
  // No need for a walker: the regexp must either begin with or be
742
  // a literal char or string. We "see through" capturing groups,
743
  // but make no effort to glue multiple prefix fragments together.
744
0
  Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this;
745
0
  while (re->op_ == kRegexpCapture) {
746
0
    re = re->sub()[0];
747
0
    if (re->op_ == kRegexpConcat && re->nsub_ > 0)
748
0
      re = re->sub()[0];
749
0
  }
750
0
  if (re->op_ != kRegexpLiteral &&
751
0
      re->op_ != kRegexpLiteralString)
752
0
    return false;
753
754
0
  bool latin1 = (re->parse_flags() & Latin1) != 0;
755
0
  Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_;
756
0
  int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_;
757
0
  ConvertRunesToBytes(latin1, runes, nrunes, prefix);
758
0
  *foldcase = (re->parse_flags() & FoldCase) != 0;
759
0
  return true;
760
0
}
761
762
// Character class builder is a balanced binary tree (STL set)
763
// containing non-overlapping, non-abutting RuneRanges.
764
// The less-than operator used in the tree treats two
765
// ranges as equal if they overlap at all, so that
766
// lookups for a particular Rune are possible.
767
768
0
CharClassBuilder::CharClassBuilder() {
769
0
  nrunes_ = 0;
770
0
  upper_ = 0;
771
0
  lower_ = 0;
772
0
}
773
774
// Add lo-hi to the class; return whether class got bigger.
775
0
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
776
0
  if (hi < lo)
777
0
    return false;
778
779
0
  if (lo <= 'z' && hi >= 'A') {
780
    // Overlaps some alpha, maybe not all.
781
    // Update bitmaps telling which ASCII letters are in the set.
782
0
    Rune lo1 = std::max<Rune>(lo, 'A');
783
0
    Rune hi1 = std::min<Rune>(hi, 'Z');
784
0
    if (lo1 <= hi1)
785
0
      upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
786
787
0
    lo1 = std::max<Rune>(lo, 'a');
788
0
    hi1 = std::min<Rune>(hi, 'z');
789
0
    if (lo1 <= hi1)
790
0
      lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
791
0
  }
792
793
0
  {  // Check whether lo, hi is already in the class.
794
0
    iterator it = ranges_.find(RuneRange(lo, lo));
795
0
    if (it != end() && it->lo <= lo && hi <= it->hi)
796
0
      return false;
797
0
  }
798
799
  // Look for a range abutting lo on the left.
800
  // If it exists, take it out and increase our range.
801
0
  if (lo > 0) {
802
0
    iterator it = ranges_.find(RuneRange(lo-1, lo-1));
803
0
    if (it != end()) {
804
0
      lo = it->lo;
805
0
      if (it->hi > hi)
806
0
        hi = it->hi;
807
0
      nrunes_ -= it->hi - it->lo + 1;
808
0
      ranges_.erase(it);
809
0
    }
810
0
  }
811
812
  // Look for a range abutting hi on the right.
813
  // If it exists, take it out and increase our range.
814
0
  if (hi < Runemax) {
815
0
    iterator it = ranges_.find(RuneRange(hi+1, hi+1));
816
0
    if (it != end()) {
817
0
      hi = it->hi;
818
0
      nrunes_ -= it->hi - it->lo + 1;
819
0
      ranges_.erase(it);
820
0
    }
821
0
  }
822
823
  // Look for ranges between lo and hi.  Take them out.
824
  // This is only safe because the set has no overlapping ranges.
825
  // We've already removed any ranges abutting lo and hi, so
826
  // any that overlap [lo, hi] must be contained within it.
827
0
  for (;;) {
828
0
    iterator it = ranges_.find(RuneRange(lo, hi));
829
0
    if (it == end())
830
0
      break;
831
0
    nrunes_ -= it->hi - it->lo + 1;
832
0
    ranges_.erase(it);
833
0
  }
834
835
  // Finally, add [lo, hi].
836
0
  nrunes_ += hi - lo + 1;
837
0
  ranges_.insert(RuneRange(lo, hi));
838
0
  return true;
839
0
}
840
841
0
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
842
0
  for (iterator it = cc->begin(); it != cc->end(); ++it)
843
0
    AddRange(it->lo, it->hi);
844
0
}
845
846
0
bool CharClassBuilder::Contains(Rune r) {
847
0
  return ranges_.find(RuneRange(r, r)) != end();
848
0
}
849
850
// Does the character class behave the same on A-Z as on a-z?
851
0
bool CharClassBuilder::FoldsASCII() {
852
0
  return ((upper_ ^ lower_) & AlphaMask) == 0;
853
0
}
854
855
0
CharClassBuilder* CharClassBuilder::Copy() {
856
0
  CharClassBuilder* cc = new CharClassBuilder;
857
0
  for (iterator it = begin(); it != end(); ++it)
858
0
    cc->ranges_.insert(RuneRange(it->lo, it->hi));
859
0
  cc->upper_ = upper_;
860
0
  cc->lower_ = lower_;
861
0
  cc->nrunes_ = nrunes_;
862
0
  return cc;
863
0
}
864
865
866
867
0
void CharClassBuilder::RemoveAbove(Rune r) {
868
0
  if (r >= Runemax)
869
0
    return;
870
871
0
  if (r < 'z') {
872
0
    if (r < 'a')
873
0
      lower_ = 0;
874
0
    else
875
0
      lower_ &= AlphaMask >> ('z' - r);
876
0
  }
877
878
0
  if (r < 'Z') {
879
0
    if (r < 'A')
880
0
      upper_ = 0;
881
0
    else
882
0
      upper_ &= AlphaMask >> ('Z' - r);
883
0
  }
884
885
0
  for (;;) {
886
887
0
    iterator it = ranges_.find(RuneRange(r + 1, Runemax));
888
0
    if (it == end())
889
0
      break;
890
0
    RuneRange rr = *it;
891
0
    ranges_.erase(it);
892
0
    nrunes_ -= rr.hi - rr.lo + 1;
893
0
    if (rr.lo <= r) {
894
0
      rr.hi = r;
895
0
      ranges_.insert(rr);
896
0
      nrunes_ += rr.hi - rr.lo + 1;
897
0
    }
898
0
  }
899
0
}
900
901
0
void CharClassBuilder::Negate() {
902
  // Build up negation and then copy in.
903
  // Could edit ranges in place, but C++ won't let me.
904
0
  std::vector<RuneRange> v;
905
0
  v.reserve(ranges_.size() + 1);
906
907
  // In negation, first range begins at 0, unless
908
  // the current class begins at 0.
909
0
  iterator it = begin();
910
0
  if (it == end()) {
911
0
    v.push_back(RuneRange(0, Runemax));
912
0
  } else {
913
0
    int nextlo = 0;
914
0
    if (it->lo == 0) {
915
0
      nextlo = it->hi + 1;
916
0
      ++it;
917
0
    }
918
0
    for (; it != end(); ++it) {
919
0
      v.push_back(RuneRange(nextlo, it->lo - 1));
920
0
      nextlo = it->hi + 1;
921
0
    }
922
0
    if (nextlo <= Runemax)
923
0
      v.push_back(RuneRange(nextlo, Runemax));
924
0
  }
925
926
0
  ranges_.clear();
927
0
  for (size_t i = 0; i < v.size(); i++)
928
0
    ranges_.insert(v[i]);
929
930
0
  upper_ = AlphaMask & ~upper_;
931
0
  lower_ = AlphaMask & ~lower_;
932
0
  nrunes_ = Runemax+1 - nrunes_;
933
0
}
934
935
// Character class is a sorted list of ranges.
936
// The ranges are allocated in the same block as the header,
937
// necessitating a special allocator and Delete method.
938
939
0
CharClass* CharClass::New(size_t maxranges) {
940
0
  CharClass* cc;
941
0
  uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
942
0
  cc = reinterpret_cast<CharClass*>(data);
943
0
  cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
944
0
  cc->nranges_ = 0;
945
0
  cc->folds_ascii_ = false;
946
0
  cc->nrunes_ = 0;
947
0
  return cc;
948
0
}
949
950
0
void CharClass::Delete() {
951
0
  uint8_t* data = reinterpret_cast<uint8_t*>(this);
952
0
  delete[] data;
953
0
}
954
955
0
CharClass* CharClass::Negate() {
956
0
  CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1));
957
0
  cc->folds_ascii_ = folds_ascii_;
958
0
  cc->nrunes_ = Runemax + 1 - nrunes_;
959
0
  int n = 0;
960
0
  int nextlo = 0;
961
0
  for (CharClass::iterator it = begin(); it != end(); ++it) {
962
0
    if (it->lo == nextlo) {
963
0
      nextlo = it->hi + 1;
964
0
    } else {
965
0
      cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
966
0
      nextlo = it->hi + 1;
967
0
    }
968
0
  }
969
0
  if (nextlo <= Runemax)
970
0
    cc->ranges_[n++] = RuneRange(nextlo, Runemax);
971
0
  cc->nranges_ = n;
972
0
  return cc;
973
0
}
974
975
0
bool CharClass::Contains(Rune r) const {
976
0
  RuneRange* rr = ranges_;
977
0
  int n = nranges_;
978
0
  while (n > 0) {
979
0
    int m = n/2;
980
0
    if (rr[m].hi < r) {
981
0
      rr += m+1;
982
0
      n -= m+1;
983
0
    } else if (r < rr[m].lo) {
984
0
      n = m;
985
0
    } else {  // rr[m].lo <= r && r <= rr[m].hi
986
0
      return true;
987
0
    }
988
0
  }
989
0
  return false;
990
0
}
991
992
0
CharClass* CharClassBuilder::GetCharClass() {
993
0
  CharClass* cc = CharClass::New(ranges_.size());
994
0
  int n = 0;
995
0
  for (iterator it = begin(); it != end(); ++it)
996
0
    cc->ranges_[n++] = *it;
997
0
  cc->nranges_ = n;
998
0
  ABSL_DCHECK_LE(n, static_cast<int>(ranges_.size()));
999
0
  cc->nrunes_ = nrunes_;
1000
0
  cc->folds_ascii_ = FoldsASCII();
1001
0
  return cc;
1002
0
}
1003
1004
}  // namespace re2