Coverage Report

Created: 2025-10-27 06:07

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/re2/re2/fuzzing/re2_fuzzer.cc
Line
Count
Source
1
// Copyright 2016 The RE2 Authors.  All Rights Reserved.
2
// Use of this source code is governed by a BSD-style
3
// license that can be found in the LICENSE file.
4
5
#include <fuzzer/FuzzedDataProvider.h>
6
#include <stddef.h>
7
#include <stdint.h>
8
9
#include <algorithm>
10
#include <string>
11
#include <vector>
12
13
#include "absl/strings/string_view.h"
14
#include "re2/filtered_re2.h"
15
#include "re2/re2.h"
16
#include "re2/regexp.h"
17
#include "re2/set.h"
18
#include "re2/walker-inl.h"
19
20
// NOT static, NOT signed.
21
uint8_t dummy = 0;
22
23
// Walks kRegexpConcat and kRegexpAlternate subexpressions
24
// to determine their maximum length.
25
class SubexpressionWalker : public re2::Regexp::Walker<int> {
26
 public:
27
10.8k
  SubexpressionWalker() = default;
28
  ~SubexpressionWalker() override = default;
29
30
  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
31
302k
                int* child_args, int nchild_args) override {
32
302k
    switch (re->op()) {
33
23.8k
      case re2::kRegexpConcat:
34
41.5k
      case re2::kRegexpAlternate: {
35
41.5k
        int max = nchild_args;
36
250k
        for (int i = 0; i < nchild_args; i++)
37
208k
          max = std::max(max, child_args[i]);
38
41.5k
        return max;
39
23.8k
      }
40
41
260k
      default:
42
260k
        break;
43
302k
    }
44
260k
    return -1;
45
302k
  }
46
47
  // Should never be called: we use Walk(), not WalkExponential().
48
0
  int ShortVisit(re2::Regexp* re, int parent_arg) override {
49
0
    return parent_arg;
50
0
  }
51
52
 private:
53
  SubexpressionWalker(const SubexpressionWalker&) = delete;
54
  SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
55
};
56
57
// Walks substrings (i.e. kRegexpLiteralString subexpressions)
58
// to determine their maximum length... in runes, but avoiding
59
// overheads due to UTF-8 encoding is worthwhile when fuzzing.
60
class SubstringWalker : public re2::Regexp::Walker<int> {
61
 public:
62
10.6k
  SubstringWalker() = default;
63
  ~SubstringWalker() override = default;
64
65
  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
66
286k
                int* child_args, int nchild_args) override {
67
286k
    switch (re->op()) {
68
23.3k
      case re2::kRegexpConcat:
69
40.7k
      case re2::kRegexpAlternate:
70
64.4k
      case re2::kRegexpStar:
71
71.0k
      case re2::kRegexpPlus:
72
85.1k
      case re2::kRegexpQuest:
73
101k
      case re2::kRegexpRepeat:
74
122k
      case re2::kRegexpCapture: {
75
122k
        int max = -1;
76
398k
        for (int i = 0; i < nchild_args; i++)
77
276k
          max = std::max(max, child_args[i]);
78
122k
        return max;
79
101k
      }
80
81
28.4k
      case re2::kRegexpLiteralString:
82
28.4k
        return re->nrunes();
83
84
135k
      default:
85
135k
        break;
86
286k
    }
87
135k
    return -1;
88
286k
  }
89
90
  // Should never be called: we use Walk(), not WalkExponential().
91
0
  int ShortVisit(re2::Regexp* re, int parent_arg) override {
92
0
    return parent_arg;
93
0
  }
94
95
 private:
96
  SubstringWalker(const SubstringWalker&) = delete;
97
  SubstringWalker& operator=(const SubstringWalker&) = delete;
98
};
99
100
void TestOneInput(absl::string_view pattern, const RE2::Options& options,
101
12.5k
                  RE2::Anchor anchor, absl::string_view text) {
102
  // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
103
  // Otherwise, we will waste time on inputs that have long runs of various
104
  // character classes. The fuzzer has shown itself to be easily capable of
105
  // generating such patterns that fall within the other limits, but result
106
  // in timeouts nonetheless. The marginal cost is high - even more so when
107
  // counted repetition is involved - whereas the marginal benefit is zero.
108
  // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
109
  // three-element character classes when case-insensitive and using UTF-8.
110
  // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
111
12.5k
  int char_class = 0;
112
12.5k
  int backslash_p = 0;  // very expensive, so handle specially
113
652k
  for (size_t i = 0; i < pattern.size(); i++) {
114
639k
    if (pattern[i] == '.' ||
115
634k
        pattern[i] == 'k' || pattern[i] == 'K' ||
116
633k
        pattern[i] == 's' || pattern[i] == 'S')
117
8.16k
      char_class++;
118
639k
    if (pattern[i] != '\\')
119
630k
      continue;
120
9.55k
    i++;
121
9.55k
    if (i >= pattern.size())
122
72
      break;
123
9.47k
    if (pattern[i] == 'p' || pattern[i] == 'P' ||
124
7.33k
        pattern[i] == 'd' || pattern[i] == 'D' ||
125
6.61k
        pattern[i] == 's' || pattern[i] == 'S' ||
126
6.13k
        pattern[i] == 'w' || pattern[i] == 'W')
127
3.85k
      char_class++;
128
9.47k
    if (pattern[i] == 'p' || pattern[i] == 'P')
129
2.14k
      backslash_p++;
130
9.47k
  }
131
12.5k
  if (char_class > 9)
132
40
    return;
133
12.4k
  if (backslash_p > 1)
134
8
    return;
135
136
  // Iterate just once when fuzzing. Otherwise, we easily get bogged down
137
  // and coverage is unlikely to improve despite significant expense.
138
12.4k
  RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
139
  // The default is 1000. Even 100 turned out to be too generous
140
  // for fuzzing, empirically speaking, so let's try 10 instead.
141
12.4k
  re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
142
143
12.4k
  RE2 re(pattern, options);
144
12.4k
  if (!re.ok())
145
1.60k
    return;
146
147
  // Don't waste time fuzzing programs with large subexpressions.
148
  // They can cause bug reports due to fuzzer timeouts. And they
149
  // aren't interesting for fuzzing purposes.
150
10.8k
  if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
151
226
    return;
152
153
  // Don't waste time fuzzing programs with large substrings.
154
  // They can cause bug reports due to fuzzer timeouts when they
155
  // are repetitions (e.g. hundreds of NUL bytes) and matching is
156
  // unanchored. And they aren't interesting for fuzzing purposes.
157
10.6k
  if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
158
611
    return;
159
160
  // Don't waste time fuzzing high-size programs.
161
  // They can cause bug reports due to fuzzer timeouts.
162
10.0k
  int size = re.ProgramSize();
163
10.0k
  if (size > 9999)
164
140
    return;
165
9.86k
  int rsize = re.ReverseProgramSize();
166
9.86k
  if (rsize > 9999)
167
40
    return;
168
169
  // Don't waste time fuzzing high-fanout programs.
170
  // They can cause bug reports due to fuzzer timeouts.
171
9.82k
  std::vector<int> histogram;
172
9.82k
  int fanout = re.ProgramFanout(&histogram);
173
9.82k
  if (fanout > 9)
174
9
    return;
175
9.82k
  int rfanout = re.ReverseProgramFanout(&histogram);
176
9.82k
  if (rfanout > 9)
177
68
    return;
178
179
9.75k
  if (re.NumberOfCapturingGroups() == 0) {
180
    // Avoid early return due to too many arguments.
181
6.42k
    absl::string_view sp = text;
182
6.42k
    RE2::FullMatch(sp, re);
183
6.42k
    RE2::PartialMatch(sp, re);
184
6.42k
    RE2::Consume(&sp, re);
185
6.42k
    sp = text;  // Reset.
186
6.42k
    RE2::FindAndConsume(&sp, re);
187
6.42k
  } else {
188
    // Okay, we have at least one capturing group...
189
    // Try conversion for variously typed arguments.
190
3.32k
    absl::string_view sp = text;
191
3.32k
    short s;
192
3.32k
    RE2::FullMatch(sp, re, &s);
193
3.32k
    long l;
194
3.32k
    RE2::PartialMatch(sp, re, &l);
195
3.32k
    float f;
196
3.32k
    RE2::Consume(&sp, re, &f);
197
3.32k
    sp = text;  // Reset.
198
3.32k
    double d;
199
3.32k
    RE2::FindAndConsume(&sp, re, &d);
200
3.32k
  }
201
202
9.75k
  std::string s = std::string(text);
203
9.75k
  RE2::Replace(&s, re, "");
204
9.75k
  s = std::string(text);  // Reset.
205
9.75k
  RE2::GlobalReplace(&s, re, "");
206
207
9.75k
  std::string min, max;
208
9.75k
  re.PossibleMatchRange(&min, &max, /*maxlen=*/9);
209
210
  // Exercise some other API functionality.
211
9.75k
  dummy += re.NamedCapturingGroups().size();
212
9.75k
  dummy += re.CapturingGroupNames().size();
213
9.75k
  dummy += RE2::QuoteMeta(pattern).size();
214
9.75k
  dummy += re.Regexp()->ToString().size();
215
216
9.75k
  RE2::Set set(options, anchor);
217
9.75k
  int index = set.Add(pattern, /*error=*/NULL);  // -1 on error
218
9.75k
  if (index != -1 && set.Compile()) {
219
9.75k
    std::vector<int> matches;
220
9.75k
    set.Match(text, &matches);
221
9.75k
  }
222
223
9.75k
  re2::FilteredRE2 filter;
224
9.75k
  index = -1;  // not clobbered on error
225
9.75k
  filter.Add(pattern, options, &index);
226
9.75k
  if (index != -1) {
227
9.75k
    std::vector<std::string> atoms;
228
9.75k
    filter.Compile(&atoms);
229
    // Pretend that all atoms match, which
230
    // triggers the AND-OR tree maximally.
231
9.75k
    std::vector<int> matched_atoms;
232
9.75k
    matched_atoms.reserve(atoms.size());
233
131k
    for (size_t i = 0; i < atoms.size(); ++i)
234
122k
      matched_atoms.push_back(static_cast<int>(i));
235
9.75k
    std::vector<int> matches;
236
9.75k
    filter.AllMatches(text, matched_atoms, &matches);
237
9.75k
  }
238
9.75k
}
239
240
// Entry point for libFuzzer.
241
12.5k
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
242
  // An input larger than 4 KiB probably isn't interesting. (This limit
243
  // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
244
12.5k
  if (size == 0 || size > 4096)
245
7
    return 0;
246
247
12.5k
  FuzzedDataProvider fdp(data, size);
248
249
  // The convention here is that fdp.ConsumeBool() returning false sets
250
  // the default value whereas returning true sets the alternate value:
251
  // most options default to false and so can be set directly; encoding
252
  // defaults to UTF-8; case_sensitive defaults to true. We do NOT want
253
  // to log errors. max_mem is 64 MiB because we can afford to use more
254
  // RAM in exchange for (hopefully) faster fuzzing.
255
12.5k
  RE2::Options options;
256
12.5k
  options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
257
12.5k
                                         : RE2::Options::EncodingUTF8);
258
12.5k
  options.set_posix_syntax(fdp.ConsumeBool());
259
12.5k
  options.set_longest_match(fdp.ConsumeBool());
260
12.5k
  options.set_log_errors(false);
261
12.5k
  options.set_max_mem(64 << 20);
262
12.5k
  options.set_literal(fdp.ConsumeBool());
263
12.5k
  options.set_never_nl(fdp.ConsumeBool());
264
12.5k
  options.set_dot_nl(fdp.ConsumeBool());
265
12.5k
  options.set_never_capture(fdp.ConsumeBool());
266
12.5k
  options.set_case_sensitive(!fdp.ConsumeBool());
267
12.5k
  options.set_perl_classes(fdp.ConsumeBool());
268
12.5k
  options.set_word_boundary(fdp.ConsumeBool());
269
12.5k
  options.set_one_line(fdp.ConsumeBool());
270
271
  // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
272
  // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
273
12.5k
  RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
274
12.5k
      RE2::UNANCHORED,
275
12.5k
      RE2::ANCHOR_START,
276
12.5k
      RE2::ANCHOR_BOTH,
277
12.5k
  });
278
279
12.5k
  std::string pattern = fdp.ConsumeRandomLengthString(999);
280
12.5k
  std::string text = fdp.ConsumeRandomLengthString(999);
281
282
12.5k
  TestOneInput(pattern, options, anchor, text);
283
12.5k
  return 0;
284
12.5k
}