/src/re2/re2/fuzzing/re2_fuzzer.cc
Line | Count | Source |
1 | | // Copyright 2016 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | #include <fuzzer/FuzzedDataProvider.h> |
6 | | #include <stddef.h> |
7 | | #include <stdint.h> |
8 | | |
9 | | #include <algorithm> |
10 | | #include <string> |
11 | | #include <vector> |
12 | | |
13 | | #include "absl/strings/string_view.h" |
14 | | #include "re2/filtered_re2.h" |
15 | | #include "re2/re2.h" |
16 | | #include "re2/regexp.h" |
17 | | #include "re2/set.h" |
18 | | #include "re2/walker-inl.h" |
19 | | |
20 | | // NOT static, NOT signed. |
21 | | uint8_t dummy = 0; |
22 | | |
23 | | // Walks kRegexpConcat and kRegexpAlternate subexpressions |
24 | | // to determine their maximum length. |
25 | | class SubexpressionWalker : public re2::Regexp::Walker<int> { |
26 | | public: |
27 | 10.8k | SubexpressionWalker() = default; |
28 | | ~SubexpressionWalker() override = default; |
29 | | |
30 | | int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, |
31 | 302k | int* child_args, int nchild_args) override { |
32 | 302k | switch (re->op()) { |
33 | 23.8k | case re2::kRegexpConcat: |
34 | 41.5k | case re2::kRegexpAlternate: { |
35 | 41.5k | int max = nchild_args; |
36 | 250k | for (int i = 0; i < nchild_args; i++) |
37 | 208k | max = std::max(max, child_args[i]); |
38 | 41.5k | return max; |
39 | 23.8k | } |
40 | | |
41 | 260k | default: |
42 | 260k | break; |
43 | 302k | } |
44 | 260k | return -1; |
45 | 302k | } |
46 | | |
47 | | // Should never be called: we use Walk(), not WalkExponential(). |
48 | 0 | int ShortVisit(re2::Regexp* re, int parent_arg) override { |
49 | 0 | return parent_arg; |
50 | 0 | } |
51 | | |
52 | | private: |
53 | | SubexpressionWalker(const SubexpressionWalker&) = delete; |
54 | | SubexpressionWalker& operator=(const SubexpressionWalker&) = delete; |
55 | | }; |
56 | | |
57 | | // Walks substrings (i.e. kRegexpLiteralString subexpressions) |
58 | | // to determine their maximum length... in runes, but avoiding |
59 | | // overheads due to UTF-8 encoding is worthwhile when fuzzing. |
60 | | class SubstringWalker : public re2::Regexp::Walker<int> { |
61 | | public: |
62 | 10.6k | SubstringWalker() = default; |
63 | | ~SubstringWalker() override = default; |
64 | | |
65 | | int PostVisit(re2::Regexp* re, int parent_arg, int pre_arg, |
66 | 286k | int* child_args, int nchild_args) override { |
67 | 286k | switch (re->op()) { |
68 | 23.3k | case re2::kRegexpConcat: |
69 | 40.7k | case re2::kRegexpAlternate: |
70 | 64.4k | case re2::kRegexpStar: |
71 | 71.0k | case re2::kRegexpPlus: |
72 | 85.1k | case re2::kRegexpQuest: |
73 | 101k | case re2::kRegexpRepeat: |
74 | 122k | case re2::kRegexpCapture: { |
75 | 122k | int max = -1; |
76 | 398k | for (int i = 0; i < nchild_args; i++) |
77 | 276k | max = std::max(max, child_args[i]); |
78 | 122k | return max; |
79 | 101k | } |
80 | | |
81 | 28.4k | case re2::kRegexpLiteralString: |
82 | 28.4k | return re->nrunes(); |
83 | | |
84 | 135k | default: |
85 | 135k | break; |
86 | 286k | } |
87 | 135k | return -1; |
88 | 286k | } |
89 | | |
90 | | // Should never be called: we use Walk(), not WalkExponential(). |
91 | 0 | int ShortVisit(re2::Regexp* re, int parent_arg) override { |
92 | 0 | return parent_arg; |
93 | 0 | } |
94 | | |
95 | | private: |
96 | | SubstringWalker(const SubstringWalker&) = delete; |
97 | | SubstringWalker& operator=(const SubstringWalker&) = delete; |
98 | | }; |
99 | | |
100 | | void TestOneInput(absl::string_view pattern, const RE2::Options& options, |
101 | 12.5k | RE2::Anchor anchor, absl::string_view text) { |
102 | | // Crudely limit the use of ., \p, \P, \d, \D, \s, \S, \w and \W. |
103 | | // Otherwise, we will waste time on inputs that have long runs of various |
104 | | // character classes. The fuzzer has shown itself to be easily capable of |
105 | | // generating such patterns that fall within the other limits, but result |
106 | | // in timeouts nonetheless. The marginal cost is high - even more so when |
107 | | // counted repetition is involved - whereas the marginal benefit is zero. |
108 | | // Crudely limit the use of 'k', 'K', 's' and 'S' too because they become |
109 | | // three-element character classes when case-insensitive and using UTF-8. |
110 | | // TODO(junyer): Handle [[:alnum:]] et al. when they start to cause pain. |
111 | 12.5k | int char_class = 0; |
112 | 12.5k | int backslash_p = 0; // very expensive, so handle specially |
113 | 652k | for (size_t i = 0; i < pattern.size(); i++) { |
114 | 639k | if (pattern[i] == '.' || |
115 | 634k | pattern[i] == 'k' || pattern[i] == 'K' || |
116 | 633k | pattern[i] == 's' || pattern[i] == 'S') |
117 | 8.16k | char_class++; |
118 | 639k | if (pattern[i] != '\\') |
119 | 630k | continue; |
120 | 9.55k | i++; |
121 | 9.55k | if (i >= pattern.size()) |
122 | 72 | break; |
123 | 9.47k | if (pattern[i] == 'p' || pattern[i] == 'P' || |
124 | 7.33k | pattern[i] == 'd' || pattern[i] == 'D' || |
125 | 6.61k | pattern[i] == 's' || pattern[i] == 'S' || |
126 | 6.13k | pattern[i] == 'w' || pattern[i] == 'W') |
127 | 3.85k | char_class++; |
128 | 9.47k | if (pattern[i] == 'p' || pattern[i] == 'P') |
129 | 2.14k | backslash_p++; |
130 | 9.47k | } |
131 | 12.5k | if (char_class > 9) |
132 | 40 | return; |
133 | 12.4k | if (backslash_p > 1) |
134 | 8 | return; |
135 | | |
136 | | // Iterate just once when fuzzing. Otherwise, we easily get bogged down |
137 | | // and coverage is unlikely to improve despite significant expense. |
138 | 12.4k | RE2::FUZZING_ONLY_set_maximum_global_replace_count(1); |
139 | | // The default is 1000. Even 100 turned out to be too generous |
140 | | // for fuzzing, empirically speaking, so let's try 10 instead. |
141 | 12.4k | re2::Regexp::FUZZING_ONLY_set_maximum_repeat_count(10); |
142 | | |
143 | 12.4k | RE2 re(pattern, options); |
144 | 12.4k | if (!re.ok()) |
145 | 1.60k | return; |
146 | | |
147 | | // Don't waste time fuzzing programs with large subexpressions. |
148 | | // They can cause bug reports due to fuzzer timeouts. And they |
149 | | // aren't interesting for fuzzing purposes. |
150 | 10.8k | if (SubexpressionWalker().Walk(re.Regexp(), -1) > 9) |
151 | 226 | return; |
152 | | |
153 | | // Don't waste time fuzzing programs with large substrings. |
154 | | // They can cause bug reports due to fuzzer timeouts when they |
155 | | // are repetitions (e.g. hundreds of NUL bytes) and matching is |
156 | | // unanchored. And they aren't interesting for fuzzing purposes. |
157 | 10.6k | if (SubstringWalker().Walk(re.Regexp(), -1) > 9) |
158 | 611 | return; |
159 | | |
160 | | // Don't waste time fuzzing high-size programs. |
161 | | // They can cause bug reports due to fuzzer timeouts. |
162 | 10.0k | int size = re.ProgramSize(); |
163 | 10.0k | if (size > 9999) |
164 | 140 | return; |
165 | 9.86k | int rsize = re.ReverseProgramSize(); |
166 | 9.86k | if (rsize > 9999) |
167 | 40 | return; |
168 | | |
169 | | // Don't waste time fuzzing high-fanout programs. |
170 | | // They can cause bug reports due to fuzzer timeouts. |
171 | 9.82k | std::vector<int> histogram; |
172 | 9.82k | int fanout = re.ProgramFanout(&histogram); |
173 | 9.82k | if (fanout > 9) |
174 | 9 | return; |
175 | 9.82k | int rfanout = re.ReverseProgramFanout(&histogram); |
176 | 9.82k | if (rfanout > 9) |
177 | 68 | return; |
178 | | |
179 | 9.75k | if (re.NumberOfCapturingGroups() == 0) { |
180 | | // Avoid early return due to too many arguments. |
181 | 6.42k | absl::string_view sp = text; |
182 | 6.42k | RE2::FullMatch(sp, re); |
183 | 6.42k | RE2::PartialMatch(sp, re); |
184 | 6.42k | RE2::Consume(&sp, re); |
185 | 6.42k | sp = text; // Reset. |
186 | 6.42k | RE2::FindAndConsume(&sp, re); |
187 | 6.42k | } else { |
188 | | // Okay, we have at least one capturing group... |
189 | | // Try conversion for variously typed arguments. |
190 | 3.32k | absl::string_view sp = text; |
191 | 3.32k | short s; |
192 | 3.32k | RE2::FullMatch(sp, re, &s); |
193 | 3.32k | long l; |
194 | 3.32k | RE2::PartialMatch(sp, re, &l); |
195 | 3.32k | float f; |
196 | 3.32k | RE2::Consume(&sp, re, &f); |
197 | 3.32k | sp = text; // Reset. |
198 | 3.32k | double d; |
199 | 3.32k | RE2::FindAndConsume(&sp, re, &d); |
200 | 3.32k | } |
201 | | |
202 | 9.75k | std::string s = std::string(text); |
203 | 9.75k | RE2::Replace(&s, re, ""); |
204 | 9.75k | s = std::string(text); // Reset. |
205 | 9.75k | RE2::GlobalReplace(&s, re, ""); |
206 | | |
207 | 9.75k | std::string min, max; |
208 | 9.75k | re.PossibleMatchRange(&min, &max, /*maxlen=*/9); |
209 | | |
210 | | // Exercise some other API functionality. |
211 | 9.75k | dummy += re.NamedCapturingGroups().size(); |
212 | 9.75k | dummy += re.CapturingGroupNames().size(); |
213 | 9.75k | dummy += RE2::QuoteMeta(pattern).size(); |
214 | 9.75k | dummy += re.Regexp()->ToString().size(); |
215 | | |
216 | 9.75k | RE2::Set set(options, anchor); |
217 | 9.75k | int index = set.Add(pattern, /*error=*/NULL); // -1 on error |
218 | 9.75k | if (index != -1 && set.Compile()) { |
219 | 9.75k | std::vector<int> matches; |
220 | 9.75k | set.Match(text, &matches); |
221 | 9.75k | } |
222 | | |
223 | 9.75k | re2::FilteredRE2 filter; |
224 | 9.75k | index = -1; // not clobbered on error |
225 | 9.75k | filter.Add(pattern, options, &index); |
226 | 9.75k | if (index != -1) { |
227 | 9.75k | std::vector<std::string> atoms; |
228 | 9.75k | filter.Compile(&atoms); |
229 | | // Pretend that all atoms match, which |
230 | | // triggers the AND-OR tree maximally. |
231 | 9.75k | std::vector<int> matched_atoms; |
232 | 9.75k | matched_atoms.reserve(atoms.size()); |
233 | 131k | for (size_t i = 0; i < atoms.size(); ++i) |
234 | 122k | matched_atoms.push_back(static_cast<int>(i)); |
235 | 9.75k | std::vector<int> matches; |
236 | 9.75k | filter.AllMatches(text, matched_atoms, &matches); |
237 | 9.75k | } |
238 | 9.75k | } |
239 | | |
240 | | // Entry point for libFuzzer. |
241 | 12.5k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { |
242 | | // An input larger than 4 KiB probably isn't interesting. (This limit |
243 | | // allows for fdp.ConsumeRandomLengthString()'s backslash behaviour.) |
244 | 12.5k | if (size == 0 || size > 4096) |
245 | 7 | return 0; |
246 | | |
247 | 12.5k | FuzzedDataProvider fdp(data, size); |
248 | | |
249 | | // The convention here is that fdp.ConsumeBool() returning false sets |
250 | | // the default value whereas returning true sets the alternate value: |
251 | | // most options default to false and so can be set directly; encoding |
252 | | // defaults to UTF-8; case_sensitive defaults to true. We do NOT want |
253 | | // to log errors. max_mem is 64 MiB because we can afford to use more |
254 | | // RAM in exchange for (hopefully) faster fuzzing. |
255 | 12.5k | RE2::Options options; |
256 | 12.5k | options.set_encoding(fdp.ConsumeBool() ? RE2::Options::EncodingLatin1 |
257 | 12.5k | : RE2::Options::EncodingUTF8); |
258 | 12.5k | options.set_posix_syntax(fdp.ConsumeBool()); |
259 | 12.5k | options.set_longest_match(fdp.ConsumeBool()); |
260 | 12.5k | options.set_log_errors(false); |
261 | 12.5k | options.set_max_mem(64 << 20); |
262 | 12.5k | options.set_literal(fdp.ConsumeBool()); |
263 | 12.5k | options.set_never_nl(fdp.ConsumeBool()); |
264 | 12.5k | options.set_dot_nl(fdp.ConsumeBool()); |
265 | 12.5k | options.set_never_capture(fdp.ConsumeBool()); |
266 | 12.5k | options.set_case_sensitive(!fdp.ConsumeBool()); |
267 | 12.5k | options.set_perl_classes(fdp.ConsumeBool()); |
268 | 12.5k | options.set_word_boundary(fdp.ConsumeBool()); |
269 | 12.5k | options.set_one_line(fdp.ConsumeBool()); |
270 | | |
271 | | // ConsumeEnum<RE2::Anchor>() would require RE2::Anchor to specify |
272 | | // kMaxValue, so just use PickValueInArray<RE2::Anchor>() instead. |
273 | 12.5k | RE2::Anchor anchor = fdp.PickValueInArray<RE2::Anchor>({ |
274 | 12.5k | RE2::UNANCHORED, |
275 | 12.5k | RE2::ANCHOR_START, |
276 | 12.5k | RE2::ANCHOR_BOTH, |
277 | 12.5k | }); |
278 | | |
279 | 12.5k | std::string pattern = fdp.ConsumeRandomLengthString(999); |
280 | 12.5k | std::string text = fdp.ConsumeRandomLengthString(999); |
281 | | |
282 | 12.5k | TestOneInput(pattern, options, anchor, text); |
283 | 12.5k | return 0; |
284 | 12.5k | } |