Line data Source code
1 : // Copyright 2017 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include <limits.h>
6 : #include <stddef.h>
7 : #include <stdint.h>
8 :
9 : #include <functional>
10 : #include <string>
11 :
12 : #include "include/v8.h"
13 : #include "src/heap/factory.h"
14 : #include "src/objects-inl.h"
15 : #include "src/regexp/jsregexp.h"
16 : #include "test/fuzzer/fuzzer-support.h"
17 :
18 : // This is a hexdump of test/fuzzer/regexp_builtins/mjsunit.js generated using
19 : // `xxd -i mjsunit.js`. It contains the `assertEquals` JS function used below.
20 : #include "test/fuzzer/regexp_builtins/mjsunit.js.h"
21 :
22 : namespace v8 {
23 : namespace internal {
24 : namespace {
25 :
26 : constexpr bool kVerbose = false; // For debugging, verbose error messages.
27 : constexpr uint32_t kRegExpBuiltinsFuzzerHashSeed = 83;
28 :
29 : #define REGEXP_BUILTINS(V) \
30 : V(Exec, exec) \
31 : V(Match, Symbol.match) \
32 : V(Replace, Symbol.replace) \
33 : V(Search, Symbol.search) \
34 : V(Split, Symbol.split) \
35 : V(Test, test)
36 :
37 : struct FuzzerArgs {
38 : FuzzerArgs(const uint8_t* input_data, size_t input_length,
39 : v8::Local<v8::Context> context, Isolate* isolate)
40 : : input_cursor(0),
41 : input_data(input_data),
42 : input_length(input_length),
43 : context(context),
44 2 : isolate(isolate) {}
45 :
46 : size_t input_cursor;
47 : const uint8_t* const input_data;
48 : const size_t input_length;
49 : v8::Local<v8::Context> context;
50 : Isolate* const isolate;
51 : };
52 :
53 : enum RegExpBuiltin {
54 : #define CASE(name, ...) kRegExpPrototype##name,
55 : REGEXP_BUILTINS(CASE)
56 : #undef CASE
57 : kRegExpBuiltinCount,
58 : };
59 :
60 : #define CASE(name, ...) void TestRegExpPrototype##name(FuzzerArgs* args);
61 : REGEXP_BUILTINS(CASE)
62 : #undef CASE
63 :
64 6 : v8::Local<v8::String> v8_str(const char* s) {
65 6 : return v8::String::NewFromUtf8(v8::Isolate::GetCurrent(), s,
66 6 : v8::NewStringType::kNormal)
67 6 : .ToLocalChecked();
68 : }
69 :
70 6 : v8::MaybeLocal<v8::Value> CompileRun(v8::Local<v8::Context> context,
71 : const char* source) {
72 : v8::Local<v8::Script> script;
73 : v8::MaybeLocal<v8::Script> maybe_script =
74 6 : v8::Script::Compile(context, v8_str(source));
75 :
76 6 : if (!maybe_script.ToLocal(&script)) return v8::MaybeLocal<v8::Value>();
77 6 : return script->Run(context);
78 : }
79 :
80 12 : uint8_t RandomByte(FuzzerArgs* args) {
81 : // Silently wraps to the beginning of input data. Ideally, input data should
82 : // be long enough to avoid that.
83 12 : const size_t index = args->input_cursor;
84 12 : CHECK(index < args->input_length);
85 12 : args->input_cursor = (index + 1) % args->input_length;
86 12 : return args->input_data[index];
87 : }
88 :
89 2 : void CompileMjsunit(const FuzzerArgs* args) {
90 : std::string source(
91 : reinterpret_cast<const char*>(test_fuzzer_regexp_builtins_mjsunit_js),
92 2 : test_fuzzer_regexp_builtins_mjsunit_js_len);
93 2 : CompileRun(args->context, source.c_str()).ToLocalChecked();
94 2 : }
95 :
96 0 : std::string NaiveEscape(const std::string& input, char escaped_char) {
97 : std::string out;
98 0 : for (size_t i = 0; i < input.size(); i++) {
99 : // Just omit newlines and \0 chars and naively replace other escaped chars.
100 0 : const char c = input[i];
101 0 : if (c == '\r' || c == '\n' || c == '\0') continue;
102 0 : out += (input[i] == escaped_char) ? '_' : c;
103 : }
104 : // Disallow trailing backslashes as they mess with our naive source string
105 : // concatenation.
106 0 : if (!out.empty() && out.back() == '\\') out.back() = '_';
107 :
108 0 : return out;
109 : }
110 :
111 0 : std::string GenerateRandomString(FuzzerArgs* args, size_t length) {
112 : // Limited to an ASCII subset for now.
113 : std::string s(length, '\0');
114 0 : for (size_t i = 0; i < length; i++) {
115 0 : s[i] = static_cast<char>((RandomByte(args) % 0x5F) + 0x20);
116 : }
117 :
118 0 : return s;
119 : }
120 :
121 0 : std::string GenerateRandomPattern(FuzzerArgs* args) {
122 : const int kMaxPatternLength = 16;
123 : std::string s =
124 0 : GenerateRandomString(args, (RandomByte(args) % kMaxPatternLength) + 1);
125 : // A leading '*' would be a comment instead of a regexp literal.
126 0 : if (s[0] == '*') s[0] = '.';
127 0 : return s;
128 : }
129 :
130 : std::string PickRandomPresetPattern(FuzzerArgs* args) {
131 : static const char* preset_patterns[] = {
132 : ".", // Always matches.
133 : "\\P{Any}", // Never matches.
134 : "^", // Zero-width assertion, matches once.
135 : "(?=.)", // Zero-width assertion, matches at every position.
136 : "\\b", // Zero-width assertion, matches at each word boundary.
137 : "()", // Zero-width assertion, matches at every position with groups.
138 : "(?<a>)", // Likewise but with named groups.
139 : "((((.).).).)", "(?<a>(?<b>(?<c>(?<d>.).).).)",
140 : // Copied from
141 : // https://cs.chromium.org/chromium/src/testing/libfuzzer/fuzzers/dicts/regexp.dict
142 : "?", "abc", "()", "[]", "abc|def", "abc|def|ghi", "^xxx$",
143 : "ab\\b\\d\\bcd", "\\w|\\d", "a*?", "abc+", "abc+?", "xyz?", "xyz??",
144 : "xyz{0,1}", "xyz{0,1}?", "xyz{93}", "xyz{1,32}", "xyz{1,32}?", "xyz{1,}",
145 : "xyz{1,}?", "a\\fb\\nc\\rd\\te\\vf", "a\\nb\\bc", "(?:foo)", "(?: foo )",
146 : "foo|(bar|baz)|quux", "foo(?=bar)baz", "foo(?!bar)baz", "foo(?<=bar)baz",
147 : "foo(?<!bar)baz", "()", "(?=)", "[]", "[x]", "[xyz]", "[a-zA-Z0-9]",
148 : "[-123]", "[^123]", "]", "}", "[a-b-c]", "[x\\dz]", "[\\d-z]",
149 : "[\\d-\\d]", "[z-\\d]", "\\cj\\cJ\\ci\\cI\\ck\\cK", "\\c!", "\\c_",
150 : "\\c~", "[\\c!]", "[\\c_]", "[\\c~]", "[\\ca]", "[\\cz]", "[\\cA]",
151 : "[\\cZ]", "[\\c1]", "\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ",
152 : "[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "\\8", "\\9", "\\11", "\\11a",
153 : "\\011", "\\118", "\\111", "\\1111", "(x)(x)(x)\\1", "(x)(x)(x)\\2",
154 : "(x)(x)(x)\\3", "(x)(x)(x)\\4", "(x)(x)(x)\\1*", "(x)(x)(x)\\3*",
155 : "(x)(x)(x)\\4*", "(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
156 : "(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11", "(a)\\1", "(a\\1)", "(\\1a)",
157 : "(\\2)(\\1)", "(?=a){0,10}a", "(?=a){1,10}a", "(?=a){9,10}a", "(?!a)?a",
158 : "\\1(a)", "(?!(a))\\1", "(?!\\1(a\\1)\\1)\\1",
159 : "\\1\\2(a(?:\\1(b\\1\\2))\\2)\\1", "[\\0]", "[\\11]", "[\\11a]",
160 : "[\\011]", "[\\00011]", "[\\118]", "[\\111]", "[\\1111]", "\\x60",
161 : "\\x3z", "\\c", "\\u0034", "\\u003z", "foo[z]*", "\\u{12345}",
162 : "\\u{12345}\\u{23456}", "\\u{12345}{3}", "\\u{12345}*", "\\ud808\\udf45*",
163 : "[\\ud808\\udf45-\\ud809\\udccc]", "a", "a|b", "a\\n", "a$", "a\\b!",
164 : "a\\Bb", "a*?", "a?", "a??", "a{0,1}?", "a{1,2}?", "a+?", "(a)", "(a)\\1",
165 : "(\\1a)", "\\1(a)", "a\\s", "a\\S", "a\\D", "a\\w", "a\\W", "a.", "a\\q",
166 : "a[a]", "a[^a]", "a[a-z]", "a(?:b)", "a(?=b)", "a(?!b)", "\\x60",
167 : "\\u0060", "\\cA", "\\q", "\\1112", "(a)\\1", "(?!a)?a\\1",
168 : "(?:(?=a))a\\1", "a{}", "a{,}", "a{", "a{z}", "a{12z}", "a{12,",
169 : "a{12,3b", "{}", "{,}", "{", "{z}", "{1z}", "{12,", "{12,3b", "a", "abc",
170 : "a[bc]d", "a|bc", "ab|c", "a||bc", "(?:ab)", "(?:ab|cde)", "(?:ab)|cde",
171 : "(ab)", "(ab|cde)", "(ab)\\1", "(ab|cde)\\1", "(?:ab)?", "(?:ab)+", "a?",
172 : "a+", "a??", "a*?", "a+?", "(?:a?)?", "(?:a+)?", "(?:a?)+", "(?:a*)+",
173 : "(?:a+)+", "(?:a?)*", "(?:a*)*", "(?:a+)*", "a{0}", "(?:a+){0,0}", "a*b",
174 : "a+b", "a*b|c", "a+b|c", "(?:a{5,1000000}){3,1000000}", "(?:ab){4,7}",
175 : "a\\bc", "a\\sc", "a\\Sc", "a(?=b)c", "a(?=bbb|bb)c", "a(?!bbb|bb)c",
176 : "\xe2\x81\xa3", "[\xe2\x81\xa3]", "\xed\xb0\x80", "\xed\xa0\x80",
177 : "(\xed\xb0\x80)\x01", "((\xed\xa0\x80))\x02", "\xf0\x9f\x92\xa9", "\x01",
178 : "\x0f", "[-\xf0\x9f\x92\xa9]+", "[\xf0\x9f\x92\xa9-\xf4\x8f\xbf\xbf]",
179 : "(?<=)", "(?<=a)", "(?<!)", "(?<!a)", "(?<a>)", "(?<a>.)",
180 : "(?<a>.)\\k<a>", "\\p{Script=Greek}", "\\P{sc=Greek}",
181 : "\\p{Script_Extensions=Greek}", "\\P{scx=Greek}",
182 : "\\p{General_Category=Decimal_Number}", "\\P{gc=Decimal_Number}",
183 : "\\p{gc=Nd}", "\\P{Decimal_Number}", "\\p{Nd}", "\\P{Any}",
184 : "\\p{Changes_When_NFKC_Casefolded}",
185 : };
186 : static constexpr int preset_pattern_count = arraysize(preset_patterns);
187 : STATIC_ASSERT(preset_pattern_count < 0xFF);
188 :
189 2 : return std::string(preset_patterns[RandomByte(args) % preset_pattern_count]);
190 : }
191 :
192 2 : std::string PickPattern(FuzzerArgs* args) {
193 2 : if ((RandomByte(args) & 3) == 0) {
194 0 : return NaiveEscape(GenerateRandomPattern(args), '/');
195 : } else {
196 : return PickRandomPresetPattern(args);
197 : }
198 : }
199 :
200 0 : std::string GenerateRandomString(FuzzerArgs* args) {
201 : const int kMaxStringLength = 64;
202 0 : return GenerateRandomString(args, RandomByte(args) % kMaxStringLength);
203 : }
204 :
205 2 : std::string PickSubjectString(FuzzerArgs* args) {
206 2 : if ((RandomByte(args) & 0xF) == 0) {
207 : // Sometimes we have a two-byte subject string.
208 2 : return "f\\uD83D\\uDCA9ba\\u2603";
209 : } else {
210 0 : return NaiveEscape(GenerateRandomString(args), '\'');
211 : }
212 : }
213 :
214 0 : std::string PickReplacementForReplace(FuzzerArgs* args) {
215 : static const char* candidates[] = {
216 : "'X'",
217 : "'$1$2$3'",
218 : "'$$$&$`$\\'$1'",
219 : "() => 'X'",
220 : "(arg0, arg1, arg2, arg3, arg4) => arg0 + arg1 + arg2 + arg3 + arg4",
221 : "() => 42",
222 : };
223 : static const int candidate_count = arraysize(candidates);
224 :
225 0 : if ((RandomByte(args) & 1) == 0) {
226 0 : return candidates[RandomByte(args) % candidate_count];
227 : } else {
228 0 : return std::string("'") + NaiveEscape(GenerateRandomString(args), '\'') +
229 0 : std::string("'");
230 : }
231 : }
232 :
233 0 : std::string PickLimitForSplit(FuzzerArgs* args) {
234 : // clang-format off
235 0 : switch (RandomByte(args) & 0x3) {
236 0 : case 0: return "undefined";
237 0 : case 1: return "'not a number'";
238 0 : case 2: return std::to_string(Smi::kMaxValue + RandomByte(args));
239 0 : case 3: return std::to_string(RandomByte(args));
240 0 : default: UNREACHABLE();
241 : } // clang-format on
242 : }
243 :
244 2 : std::string GenerateRandomFlags(FuzzerArgs* args) {
245 : constexpr size_t kFlagCount = JSRegExp::FlagCount();
246 : CHECK_EQ(JSRegExp::kDotAll, 1 << (kFlagCount - 1));
247 : STATIC_ASSERT((1 << kFlagCount) - 1 < 0xFF);
248 :
249 2 : const size_t flags = RandomByte(args) & ((1 << kFlagCount) - 1);
250 :
251 : int cursor = 0;
252 2 : char buffer[kFlagCount] = {'\0'};
253 :
254 2 : if (flags & JSRegExp::kGlobal) buffer[cursor++] = 'g';
255 2 : if (flags & JSRegExp::kIgnoreCase) buffer[cursor++] = 'i';
256 2 : if (flags & JSRegExp::kMultiline) buffer[cursor++] = 'm';
257 2 : if (flags & JSRegExp::kSticky) buffer[cursor++] = 'y';
258 2 : if (flags & JSRegExp::kUnicode) buffer[cursor++] = 'u';
259 2 : if (flags & JSRegExp::kDotAll) buffer[cursor++] = 's';
260 :
261 4 : return std::string(buffer, cursor);
262 : }
263 :
264 : std::string GenerateRandomLastIndex(FuzzerArgs* args) {
265 : static const char* candidates[] = {
266 : "undefined", "-1", "0",
267 : "1", "2", "3",
268 : "4", "5", "6",
269 : "7", "8", "9",
270 : "50", "4294967296", "2147483647",
271 : "2147483648", "NaN", "Not a Number",
272 : };
273 : static const int candidate_count = arraysize(candidates);
274 2 : return candidates[RandomByte(args) % candidate_count];
275 : }
276 :
277 2 : void RunTest(FuzzerArgs* args) {
278 2 : switch (RandomByte(args) % kRegExpBuiltinCount) {
279 : #define CASE(name, ...) \
280 : case kRegExpPrototype##name: \
281 : TestRegExpPrototype##name(args); \
282 : break;
283 0 : REGEXP_BUILTINS(CASE)
284 : #undef CASE
285 : default:
286 0 : UNREACHABLE();
287 : }
288 2 : }
289 :
290 2 : std::string GenerateSourceString(FuzzerArgs* args, const std::string& test) {
291 2 : std::string pattern = PickPattern(args);
292 2 : std::string flags = GenerateRandomFlags(args);
293 : std::string last_index = GenerateRandomLastIndex(args);
294 2 : std::string subject = PickSubjectString(args);
295 :
296 : // clang-format off
297 4 : std::stringstream ss;
298 : ss << "function test() {\n"
299 : << " const re = /" << pattern<< "/"
300 : << flags << ";\n"
301 : << " re.lastIndex = " << last_index << ";\n"
302 : << " const str = '" << subject << "';\n"
303 : << " let result = null;\n"
304 : << " let exception = null;\n"
305 : << " try {\n"
306 : << " result = " << test << "\n"
307 : << " } catch (e) {\n"
308 : << " exception = e;\n"
309 : << " }\n"
310 : << " return { result: result, re: re, exception: exception };\n"
311 : << "}\n"
312 : << "%SetForceSlowPath(false);\n"
313 : << "test(); // Run once ahead of time to compile the regexp.\n"
314 : << "const fast = test();\n"
315 : << "%SetForceSlowPath(true);\n"
316 : << "const slow = test();\n"
317 2 : << "%SetForceSlowPath(false);\n";
318 : // clang-format on
319 :
320 : std::string source = ss.str();
321 : if (kVerbose) {
322 : fprintf(stderr, "Generated source:\n```\n%s\n```\n", source.c_str());
323 : }
324 :
325 2 : return source;
326 : }
327 :
328 0 : void PrintExceptionMessage(v8::Isolate* isolate, v8::TryCatch* try_catch) {
329 0 : CHECK(try_catch->HasCaught());
330 : static const int kBufferLength = 256;
331 : char buffer[kBufferLength + 1];
332 0 : try_catch->Message()->Get()->WriteOneByte(
333 0 : isolate, reinterpret_cast<uint8_t*>(&buffer[0]), 0, kBufferLength);
334 0 : fprintf(stderr, "%s\n", buffer);
335 0 : }
336 :
337 2 : bool ResultsAreIdentical(FuzzerArgs* args) {
338 : std::string source =
339 : "assertEquals(fast.exception, slow.exception);\n"
340 : "assertEquals(fast.result, slow.result);\n"
341 : "if (fast.result !== null)\n"
342 : " assertEquals(fast.result.groups, slow.result.groups);\n"
343 2 : "assertEquals(fast.re.lastIndex, slow.re.lastIndex);\n";
344 :
345 : v8::Local<v8::Value> result;
346 2 : v8::Isolate* isolate = reinterpret_cast<v8::Isolate*>(args->isolate);
347 4 : v8::TryCatch try_catch(isolate);
348 2 : if (!CompileRun(args->context, source.c_str()).ToLocal(&result)) {
349 0 : PrintExceptionMessage(isolate, &try_catch);
350 0 : args->isolate->clear_pending_exception();
351 : return false;
352 : }
353 :
354 : return true;
355 : }
356 :
357 2 : void CompileRunAndVerify(FuzzerArgs* args, const std::string& source) {
358 : v8::Local<v8::Value> result;
359 2 : v8::Isolate* isolate = reinterpret_cast<v8::Isolate*>(args->isolate);
360 4 : v8::TryCatch try_catch(isolate);
361 2 : if (!CompileRun(args->context, source.c_str()).ToLocal(&result)) {
362 0 : args->isolate->clear_pending_exception();
363 : // No need to verify result if an exception was thrown here, since that
364 : // implies a syntax error somewhere in the pattern or string. We simply
365 : // ignore those.
366 : if (kVerbose) {
367 : PrintExceptionMessage(isolate, &try_catch);
368 : fprintf(stderr, "Failed to run script:\n```\n%s\n```\n", source.c_str());
369 : }
370 0 : return;
371 : }
372 :
373 2 : if (!ResultsAreIdentical(args)) {
374 0 : uint32_t hash = StringHasher::HashSequentialString(
375 0 : args->input_data, static_cast<int>(args->input_length),
376 0 : kRegExpBuiltinsFuzzerHashSeed);
377 : V8_Fatal(__FILE__, __LINE__,
378 0 : "!ResultAreIdentical(args); RegExpBuiltinsFuzzerHash=%x", hash);
379 : }
380 : }
381 :
382 0 : void TestRegExpPrototypeExec(FuzzerArgs* args) {
383 0 : std::string test = "re.exec(str);";
384 0 : std::string source = GenerateSourceString(args, test);
385 0 : CompileRunAndVerify(args, source);
386 0 : }
387 :
388 0 : void TestRegExpPrototypeMatch(FuzzerArgs* args) {
389 0 : std::string test = "re[Symbol.match](str);";
390 0 : std::string source = GenerateSourceString(args, test);
391 0 : CompileRunAndVerify(args, source);
392 0 : }
393 :
394 0 : void TestRegExpPrototypeReplace(FuzzerArgs* args) {
395 0 : std::string replacement = PickReplacementForReplace(args);
396 0 : std::string test = "re[Symbol.replace](str, " + replacement + ");";
397 0 : std::string source = GenerateSourceString(args, test);
398 0 : CompileRunAndVerify(args, source);
399 0 : }
400 :
401 0 : void TestRegExpPrototypeSearch(FuzzerArgs* args) {
402 0 : std::string test = "re[Symbol.search](str);";
403 0 : std::string source = GenerateSourceString(args, test);
404 0 : CompileRunAndVerify(args, source);
405 0 : }
406 :
407 0 : void TestRegExpPrototypeSplit(FuzzerArgs* args) {
408 0 : std::string limit = PickLimitForSplit(args);
409 0 : std::string test = "re[Symbol.split](str, " + limit + ");";
410 0 : std::string source = GenerateSourceString(args, test);
411 0 : CompileRunAndVerify(args, source);
412 0 : }
413 :
414 2 : void TestRegExpPrototypeTest(FuzzerArgs* args) {
415 2 : std::string test = "re.test(str);";
416 2 : std::string source = GenerateSourceString(args, test);
417 2 : CompileRunAndVerify(args, source);
418 2 : }
419 :
420 : #undef REGEXP_BUILTINS
421 :
422 : } // namespace
423 :
424 2 : extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
425 2 : if (size < 64) return 0; // Need a minimal amount of randomness to do stuff.
426 :
427 : // Flag definitions.
428 :
429 2 : FLAG_allow_natives_syntax = true;
430 :
431 : // V8 setup.
432 :
433 2 : v8_fuzzer::FuzzerSupport* support = v8_fuzzer::FuzzerSupport::Get();
434 : v8::Isolate* isolate = support->GetIsolate();
435 : Isolate* i_isolate = reinterpret_cast<i::Isolate*>(isolate);
436 : v8::Isolate::Scope isolate_scope(isolate);
437 4 : v8::HandleScope handle_scope(isolate);
438 2 : v8::Local<v8::Context> context = support->GetContext();
439 : v8::Context::Scope context_scope(context);
440 4 : v8::TryCatch try_catch(isolate);
441 :
442 2 : CHECK(!i_isolate->has_pending_exception());
443 :
444 : // And run.
445 :
446 : FuzzerArgs args(data, size, context, i_isolate);
447 2 : CompileMjsunit(&args);
448 2 : RunTest(&args);
449 :
450 2 : CHECK(!i_isolate->has_pending_exception());
451 : return 0;
452 : }
453 :
454 : } // namespace internal
455 4 : } // namespace v8
|