/src/re2/re2/fuzzing/re2_fuzzer.cc

Source
// Copyright 2016 The RE2 Authors.  All Rights Reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include <fuzzer/FuzzedDataProvider.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <string>
#include <vector>

#include "absl/strings/string_view.h"
#include "re2/filtered_re2.h"
#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/set.h"
#include "re2/walker-inl.h"

// NOT static, NOT signed.
uint8_t dummy = 0;

// Walks kRegexpConcat and kRegexpAlternate subexpressions
// to determine their maximum length.
class SubexpressionWalker : public re2::Regexp::Walker<int> {
 public:
  SubexpressionWalker() = default;
  ~SubexpressionWalker() override = default;

  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
                int* child_args, int nchild_args) override {
    switch (re->op()) {
      case re2::kRegexpConcat:
      case re2::kRegexpAlternate: {
        int max = nchild_args;
        for (int i = 0; i < nchild_args; i++)
          max = std::max(max, child_args[i]);
        return max;
      }

      default:
        break;
    }
    return -1;
  }

  // Should never be called: we use Walk(), not WalkExponential().
  int ShortVisit(re2::Regexp* re, int parent_arg) override {
    return parent_arg;
  }

 private:
  SubexpressionWalker(const SubexpressionWalker&) = delete;
  SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
};

// Walks substrings (i.e. kRegexpLiteralString subexpressions)
// to determine their maximum length... in runes, but avoiding
// overheads due to UTF-8 encoding is worthwhile when fuzzing.
class SubstringWalker : public re2::Regexp::Walker<int> {
 public:
  SubstringWalker() = default;
  ~SubstringWalker() override = default;

  int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
                int* child_args, int nchild_args) override {
    switch (re->op()) {
      case re2::kRegexpConcat:
      case re2::kRegexpAlternate:
      case re2::kRegexpStar:
      case re2::kRegexpPlus:
      case re2::kRegexpQuest:
      case re2::kRegexpRepeat:
      case re2::kRegexpCapture: {
        int max = -1;
        for (int i = 0; i < nchild_args; i++)
          max = std::max(max, child_args[i]);
        return max;
      }

      case re2::kRegexpLiteralString:
        return re->nrunes();

      default:
        break;
    }
    return -1;
  }

  // Should never be called: we use Walk(), not WalkExponential().
  int ShortVisit(re2::Regexp* re, int parent_arg) override {
    return parent_arg;
  }

 private:
  SubstringWalker(const SubstringWalker&) = delete;
  SubstringWalker& operator=(const SubstringWalker&) = delete;
};

void TestOneInput(absl::string_view pattern, const RE2::Options& options,
                  RE2::Anchor anchor, absl::string_view text) {
  // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
  // Otherwise, we will waste time on inputs that have long runs of various
  // character classes. The fuzzer has shown itself to be easily capable of
  // generating such patterns that fall within the other limits, but result
  // in timeouts nonetheless. The marginal cost is high - even more so when
  // counted repetition is involved - whereas the marginal benefit is zero.
  // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
  // three-element character classes when case-insensitive and using UTF-8.
  // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
  int char_class = 0;
  int backslash_p = 0;  // very expensive, so handle specially
  for (size_t i = 0; i < pattern.size(); i++) {
    if (pattern[i] == '.' ||
        pattern[i] == 'k' || pattern[i] == 'K' ||
        pattern[i] == 's' || pattern[i] == 'S')
      char_class++;
    if (pattern[i] != '\\')
      continue;
    i++;
    if (i >= pattern.size())
      break;
    if (pattern[i] == 'p' || pattern[i] == 'P' ||
        pattern[i] == 'd' || pattern[i] == 'D' ||
        pattern[i] == 's' || pattern[i] == 'S' ||
        pattern[i] == 'w' || pattern[i] == 'W')
      char_class++;
    if (pattern[i] == 'p' || pattern[i] == 'P')
      backslash_p++;
  }
  if (char_class > 9)
    return;
  if (backslash_p > 1)
    return;

  // Iterate just once when fuzzing. Otherwise, we easily get bogged down
  // and coverage is unlikely to improve despite significant expense.
  RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
  // The default is 1000. Even 100 turned out to be too generous
  // for fuzzing, empirically speaking, so let's try 10 instead.
  re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);

  RE2 re(pattern, options);
  if (!re.ok())
    return;

  // Don't waste time fuzzing programs with large subexpressions.
  // They can cause bug reports due to fuzzer timeouts. And they
  // aren't interesting for fuzzing purposes.
  if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
    return;

  // Don't waste time fuzzing programs with large substrings.
  // They can cause bug reports due to fuzzer timeouts when they
  // are repetitions (e.g. hundreds of NUL bytes) and matching is
  // unanchored. And they aren't interesting for fuzzing purposes.
  if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
    return;

  // Don't waste time fuzzing high-size programs.
  // They can cause bug reports due to fuzzer timeouts.
  int size = re.ProgramSize();
  if (size > 9999)
    return;
  int rsize = re.ReverseProgramSize();
  if (rsize > 9999)
    return;

  // Don't waste time fuzzing high-fanout programs.
  // They can cause bug reports due to fuzzer timeouts.
  std::vector<int> histogram;
  int fanout = re.ProgramFanout(&histogram);
  if (fanout > 9)
    return;
  int rfanout = re.ReverseProgramFanout(&histogram);
  if (rfanout > 9)
    return;

  if (re.NumberOfCapturingGroups() == 0) {
    // Avoid early return due to too many arguments.
    absl::string_view sp = text;
    RE2::FullMatch(sp, re);
    RE2::PartialMatch(sp, re);
    RE2::Consume(&sp, re);
    sp = text;  // Reset.
    RE2::FindAndConsume(&sp, re);
  } else {
    // Okay, we have at least one capturing group...
    // Try conversion for variously typed arguments.
    absl::string_view sp = text;
    short s;
    RE2::FullMatch(sp, re, &s);
    long l;
    RE2::PartialMatch(sp, re, &l);
    float f;
    RE2::Consume(&sp, re, &f);
    sp = text;  // Reset.
    double d;
    RE2::FindAndConsume(&sp, re, &d);
  }

  std::string s = std::string(text);
  RE2::Replace(&s, re, "");
  s = std::string(text);  // Reset.
  RE2::GlobalReplace(&s, re, "");

  std::string min, max;
  re.PossibleMatchRange(&min, &max, /*maxlen=*/9);

  // Exercise some other API functionality.
  dummy += re.NamedCapturingGroups().size();
  dummy += re.CapturingGroupNames().size();
  dummy += RE2::QuoteMeta(pattern).size();
  dummy += re.Regexp()->ToString().size();

  RE2::Set set(options, anchor);
  int index = set.Add(pattern, /*error=*/NULL);  // -1 on error
  if (index != -1 && set.Compile()) {
    std::vector<int> matches;
    set.Match(text, &matches);
  }

  re2::FilteredRE2 filter;
  index = -1;  // not clobbered on error
  filter.Add(pattern, options, &index);
  if (index != -1) {
    std::vector<std::string> atoms;
    filter.Compile(&atoms);
    // Pretend that all atoms match, which
    // triggers the AND-OR tree maximally.
    std::vector<int> matched_atoms;
    matched_atoms.reserve(atoms.size());
    for (size_t i = 0; i < atoms.size(); ++i)
      matched_atoms.push_back(static_cast<int>(i));
    std::vector<int> matches;
    filter.AllMatches(text, matched_atoms, &matches);
  }
}

// Entry point for libFuzzer.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
  // An input larger than 4 KiB probably isn't interesting. (This limit
  // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
  if (size == 0 || size > 4096)
    return 0;

  FuzzedDataProvider fdp(data, size);

  // The convention here is that fdp.ConsumeBool() returning false sets
  // the default value whereas returning true sets the alternate value:
  // most options default to false and so can be set directly; encoding
  // defaults to UTF-8; case_sensitive defaults to true. We do NOT want
  // to log errors. max_mem is 64 MiB because we can afford to use more
  // RAM in exchange for (hopefully) faster fuzzing.
  RE2::Options options;
  options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
                                         : RE2::Options::EncodingUTF8);
  options.set_posix_syntax(fdp.ConsumeBool());
  options.set_longest_match(fdp.ConsumeBool());
  options.set_log_errors(false);
  options.set_max_mem(64 << 20);
  options.set_literal(fdp.ConsumeBool());
  options.set_never_nl(fdp.ConsumeBool());
  options.set_dot_nl(fdp.ConsumeBool());
  options.set_never_capture(fdp.ConsumeBool());
  options.set_case_sensitive(!fdp.ConsumeBool());
  options.set_perl_classes(fdp.ConsumeBool());
  options.set_word_boundary(fdp.ConsumeBool());
  options.set_one_line(fdp.ConsumeBool());

  // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
  // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
  RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
      RE2::UNANCHORED,
      RE2::ANCHOR_START,
      RE2::ANCHOR_BOTH,
  });

  std::string pattern = fdp.ConsumeRandomLengthString(999);
  std::string text = fdp.ConsumeRandomLengthString(999);

  TestOneInput(pattern, options, anchor, text);
  return 0;
}

Coverage Report

Created: 2025-10-27 06:07

Line	Count	Source
1		// Copyright 2016 The RE2 Authors. All Rights Reserved.
2		// Use of this source code is governed by a BSD-style
3		// license that can be found in the LICENSE file.
4
5		#include <fuzzer/FuzzedDataProvider.h>
6		#include <stddef.h>
7		#include <stdint.h>
8
9		#include <algorithm>
10		#include <string>
11		#include <vector>
12
13		#include "absl/strings/string_view.h"
14		#include "re2/filtered_re2.h"
15		#include "re2/re2.h"
16		#include "re2/regexp.h"
17		#include "re2/set.h"
18		#include "re2/walker-inl.h"
19
20		// NOT static, NOT signed.
21		uint8_t dummy = 0;
22
23		// Walks kRegexpConcat and kRegexpAlternate subexpressions
24		// to determine their maximum length.
25		class SubexpressionWalker : public re2::Regexp::Walker<int> {
26		public:
27	10.8k	SubexpressionWalker() = default;
28		~SubexpressionWalker() override = default;
29
30		int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
31	302k	int* child_args, int nchild_args) override {
32	302k	switch (re->op()) {
33	23.8k	case re2::kRegexpConcat:
34	41.5k	case re2::kRegexpAlternate: {
35	41.5k	int max = nchild_args;
36	250k	for (int i = 0; i < nchild_args; i++)
37	208k	max = std::max(max, child_args[i]);
38	41.5k	return max;
39	23.8k	}
40
41	260k	default:
42	260k	break;
43	302k	}
44	260k	return -1;
45	302k	}
46
47		// Should never be called: we use Walk(), not WalkExponential().
48	0	int ShortVisit(re2::Regexp* re, int parent_arg) override {
49	0	return parent_arg;
50	0	}
51
52		private:
53		SubexpressionWalker(const SubexpressionWalker&) = delete;
54		SubexpressionWalker& operator=(const SubexpressionWalker&) = delete;
55		};
56
57		// Walks substrings (i.e. kRegexpLiteralString subexpressions)
58		// to determine their maximum length... in runes, but avoiding
59		// overheads due to UTF-8 encoding is worthwhile when fuzzing.
60		class SubstringWalker : public re2::Regexp::Walker<int> {
61		public:
62	10.6k	SubstringWalker() = default;
63		~SubstringWalker() override = default;
64
65		int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg,
66	286k	int* child_args, int nchild_args) override {
67	286k	switch (re->op()) {
68	23.3k	case re2::kRegexpConcat:
69	40.7k	case re2::kRegexpAlternate:
70	64.4k	case re2::kRegexpStar:
71	71.0k	case re2::kRegexpPlus:
72	85.1k	case re2::kRegexpQuest:
73	101k	case re2::kRegexpRepeat:
74	122k	case re2::kRegexpCapture: {
75	122k	int max = -1;
76	398k	for (int i = 0; i < nchild_args; i++)
77	276k	max = std::max(max, child_args[i]);
78	122k	return max;
79	101k	}
80
81	28.4k	case re2::kRegexpLiteralString:
82	28.4k	return re->nrunes();
83
84	135k	default:
85	135k	break;
86	286k	}
87	135k	return -1;
88	286k	}
89
90		// Should never be called: we use Walk(), not WalkExponential().
91	0	int ShortVisit(re2::Regexp* re, int parent_arg) override {
92	0	return parent_arg;
93	0	}
94
95		private:
96		SubstringWalker(const SubstringWalker&) = delete;
97		SubstringWalker& operator=(const SubstringWalker&) = delete;
98		};
99
100		void TestOneInput(absl::string_view pattern, const RE2::Options& options,
101	12.5k	RE2::Anchor anchor, absl::string_view text) {
102		// Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W.
103		// Otherwise, we will waste time on inputs that have long runs of various
104		// character classes. The fuzzer has shown itself to be easily capable of
105		// generating such patterns that fall within the other limits, but result
106		// in timeouts nonetheless. The marginal cost is high - even more so when
107		// counted repetition is involved - whereas the marginal benefit is zero.
108		// Crudely limit the use of 'k', 'K', 's' and 'S' too because they become
109		// three-element character classes when case-insensitive and using UTF-8.
110		// TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain.
111	12.5k	int char_class = 0;
112	12.5k	int backslash_p = 0; // very expensive, so handle specially
113	652k	for (size_t i = 0; i < pattern.size(); i++) {
114	639k	if (pattern[i] == '.' \|\|
115	634k	pattern[i] == 'k' \|\| pattern[i] == 'K' \|\|
116	633k	pattern[i] == 's' \|\| pattern[i] == 'S')
117	8.16k	char_class++;
118	639k	if (pattern[i] != '\\')
119	630k	continue;
120	9.55k	i++;
121	9.55k	if (i >= pattern.size())
122	72	break;
123	9.47k	if (pattern[i] == 'p' \|\| pattern[i] == 'P' \|\|
124	7.33k	pattern[i] == 'd' \|\| pattern[i] == 'D' \|\|
125	6.61k	pattern[i] == 's' \|\| pattern[i] == 'S' \|\|
126	6.13k	pattern[i] == 'w' \|\| pattern[i] == 'W')
127	3.85k	char_class++;
128	9.47k	if (pattern[i] == 'p' \|\| pattern[i] == 'P')
129	2.14k	backslash_p++;
130	9.47k	}
131	12.5k	if (char_class > 9)
132	40	return;
133	12.4k	if (backslash_p > 1)
134	8	return;
135
136		// Iterate just once when fuzzing. Otherwise, we easily get bogged down
137		// and coverage is unlikely to improve despite significant expense.
138	12.4k	RE2::FUZZING_ONLY_set_maximum_global_replace_count(1);
139		// The default is 1000. Even 100 turned out to be too generous
140		// for fuzzing, empirically speaking, so let's try 10 instead.
141	12.4k	re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10);
142
143	12.4k	RE2 re(pattern, options);
144	12.4k	if (!re.ok())
145	1.60k	return;
146
147		// Don't waste time fuzzing programs with large subexpressions.
148		// They can cause bug reports due to fuzzer timeouts. And they
149		// aren't interesting for fuzzing purposes.
150	10.8k	if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9)
151	226	return;
152
153		// Don't waste time fuzzing programs with large substrings.
154		// They can cause bug reports due to fuzzer timeouts when they
155		// are repetitions (e.g. hundreds of NUL bytes) and matching is
156		// unanchored. And they aren't interesting for fuzzing purposes.
157	10.6k	if (SubstringWalker().Walk(re.Regexp(), -1) > 9)
158	611	return;
159
160		// Don't waste time fuzzing high-size programs.
161		// They can cause bug reports due to fuzzer timeouts.
162	10.0k	int size = re.ProgramSize();
163	10.0k	if (size > 9999)
164	140	return;
165	9.86k	int rsize = re.ReverseProgramSize();
166	9.86k	if (rsize > 9999)
167	40	return;
168
169		// Don't waste time fuzzing high-fanout programs.
170		// They can cause bug reports due to fuzzer timeouts.
171	9.82k	std::vector<int> histogram;
172	9.82k	int fanout = re.ProgramFanout(&histogram);
173	9.82k	if (fanout > 9)
174	9	return;
175	9.82k	int rfanout = re.ReverseProgramFanout(&histogram);
176	9.82k	if (rfanout > 9)
177	68	return;
178
179	9.75k	if (re.NumberOfCapturingGroups() == 0) {
180		// Avoid early return due to too many arguments.
181	6.42k	absl::string_view sp = text;
182	6.42k	RE2::FullMatch(sp, re);
183	6.42k	RE2::PartialMatch(sp, re);
184	6.42k	RE2::Consume(&sp, re);
185	6.42k	sp = text; // Reset.
186	6.42k	RE2::FindAndConsume(&sp, re);
187	6.42k	} else {
188		// Okay, we have at least one capturing group...
189		// Try conversion for variously typed arguments.
190	3.32k	absl::string_view sp = text;
191	3.32k	short s;
192	3.32k	RE2::FullMatch(sp, re, &s);
193	3.32k	long l;
194	3.32k	RE2::PartialMatch(sp, re, &l);
195	3.32k	float f;
196	3.32k	RE2::Consume(&sp, re, &f);
197	3.32k	sp = text; // Reset.
198	3.32k	double d;
199	3.32k	RE2::FindAndConsume(&sp, re, &d);
200	3.32k	}
201
202	9.75k	std::string s = std::string(text);
203	9.75k	RE2::Replace(&s, re, "");
204	9.75k	s = std::string(text); // Reset.
205	9.75k	RE2::GlobalReplace(&s, re, "");
206
207	9.75k	std::string min, max;
208	9.75k	re.PossibleMatchRange(&min, &max, /maxlen=/9);
209
210		// Exercise some other API functionality.
211	9.75k	dummy += re.NamedCapturingGroups().size();
212	9.75k	dummy += re.CapturingGroupNames().size();
213	9.75k	dummy += RE2::QuoteMeta(pattern).size();
214	9.75k	dummy += re.Regexp()->ToString().size();
215
216	9.75k	RE2::Set set(options, anchor);
217	9.75k	int index = set.Add(pattern, /error=/NULL); // -1 on error
218	9.75k	if (index != -1 && set.Compile()) {
219	9.75k	std::vector<int> matches;
220	9.75k	set.Match(text, &matches);
221	9.75k	}
222
223	9.75k	re2::FilteredRE2 filter;
224	9.75k	index = -1; // not clobbered on error
225	9.75k	filter.Add(pattern, options, &index);
226	9.75k	if (index != -1) {
227	9.75k	std::vector<std::string> atoms;
228	9.75k	filter.Compile(&atoms);
229		// Pretend that all atoms match, which
230		// triggers the AND-OR tree maximally.
231	9.75k	std::vector<int> matched_atoms;
232	9.75k	matched_atoms.reserve(atoms.size());
233	131k	for (size_t i = 0; i < atoms.size(); ++i)
234	122k	matched_atoms.push_back(static_cast<int>(i));
235	9.75k	std::vector<int> matches;
236	9.75k	filter.AllMatches(text, matched_atoms, &matches);
237	9.75k	}
238	9.75k	}
239
240		// Entry point for libFuzzer.
241	12.5k	extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
242		// An input larger than 4 KiB probably isn't interesting. (This limit
243		// allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.)
244	12.5k	if (size == 0 \|\| size > 4096)
245	7	return 0;
246
247	12.5k	FuzzedDataProvider fdp(data, size);
248
249		// The convention here is that fdp.ConsumeBool() returning false sets
250		// the default value whereas returning true sets the alternate value:
251		// most options default to false and so can be set directly; encoding
252		// defaults to UTF-8; case_sensitive defaults to true. We do NOT want
253		// to log errors. max_mem is 64 MiB because we can afford to use more
254		// RAM in exchange for (hopefully) faster fuzzing.
255	12.5k	RE2::Options options;
256	12.5k	options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1
257	12.5k	: RE2::Options::EncodingUTF8);
258	12.5k	options.set_posix_syntax(fdp.ConsumeBool());
259	12.5k	options.set_longest_match(fdp.ConsumeBool());
260	12.5k	options.set_log_errors(false);
261	12.5k	options.set_max_mem(64 << 20);
262	12.5k	options.set_literal(fdp.ConsumeBool());
263	12.5k	options.set_never_nl(fdp.ConsumeBool());
264	12.5k	options.set_dot_nl(fdp.ConsumeBool());
265	12.5k	options.set_never_capture(fdp.ConsumeBool());
266	12.5k	options.set_case_sensitive(!fdp.ConsumeBool());
267	12.5k	options.set_perl_classes(fdp.ConsumeBool());
268	12.5k	options.set_word_boundary(fdp.ConsumeBool());
269	12.5k	options.set_one_line(fdp.ConsumeBool());
270
271		// ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify
272		// kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead.
273	12.5k	RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({
274	12.5k	RE2::UNANCHORED,
275	12.5k	RE2::ANCHOR_START,
276	12.5k	RE2::ANCHOR_BOTH,
277	12.5k	});
278
279	12.5k	std::string pattern = fdp.ConsumeRandomLengthString(999);
280	12.5k	std::string text = fdp.ConsumeRandomLengthString(999);
281
282	12.5k	TestOneInput(pattern, options, anchor, text);
283	12.5k	return 0;
284	12.5k	}