Line | Count | Source |
1 | | /* Copyright 2025 Google LLC |
2 | | Licensed under the Apache License, Version 2.0 (the "License"); |
3 | | you may not use this file except in compliance with the License. |
4 | | You may obtain a copy of the License at |
5 | | http://www.apache.org/licenses/LICENSE-2.0 |
6 | | Unless required by applicable law or agreed to in writing, software |
7 | | distributed under the License is distributed on an "AS IS" BASIS, |
8 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
9 | | See the License for the specific language governing permissions and |
10 | | limitations under the License. |
11 | | */ |
12 | | |
13 | | /* |
14 | | * Fuzzer for Ruby's Regex implementation (re.c, regcomp.c, regexec.c, regparse.c) |
15 | | * |
16 | | * Purpose: Compile regexes from arbitrary, possibly malformed patterns and run them |
17 | | * against arbitrary strings. Targets parser edge cases, compilation bugs, and crashes |
18 | | * in the matching engine; the match results themselves are not checked. |
19 | | * |
20 | | * Coverage: |
21 | | * - Regex compilation: Pattern parsing, syntax validation, optimization |
22 | | * - Regex matching: match, =~, scan, gsub operations |
23 | | * - Edge cases: Invalid patterns, backtracking, captures, Unicode, lookahead/lookbehind |
24 | | * - Memory: Backtracking stack overflow, catastrophic backtracking |
25 | | */ |
26 | | |
27 | | #include <stdint.h> |
28 | | #include <stddef.h> |
29 | | #include <stdlib.h> |
30 | | #include <string.h> |
31 | | #include <unistd.h> |
32 | | #include <fcntl.h> |
33 | | #include <fuzzer/FuzzedDataProvider.h> |
34 | | #include "ruby.h" |
35 | | #include "ruby/encoding.h" |
36 | | #include "ruby/re.h" |
37 | | |
38 | | static int ruby_initialized = 0; |
39 | | |
40 | | extern "C" VALUE ruby_verbose; |
41 | | |
42 | | // rb_protect callbacks: each one receives a pointer to the caller's VALUE array, passed as a VALUE |
43 | | // They run under rb_protect so a Ruby exception raised by the call is reported through its state flag |
44 | 5.36k | static VALUE call_regex_match(VALUE args) { |
45 | 5.36k | VALUE *ptr = (VALUE *)args; |
46 | 5.36k | return rb_funcall(ptr[0], rb_intern("match"), 1, ptr[1]); // calls "match" on ptr[0] (the regexp) with ptr[1] (the test string) |
47 | 5.36k | } |
48 | | |
49 | | // rb_protect callback for the =~ operator; args points at a VALUE array {regexp, test string} |
50 | 5.36k | static VALUE call_regex_match_op(VALUE args) { |
51 | 5.36k | VALUE *ptr = (VALUE *)args; |
52 | 5.36k | return rb_funcall(ptr[0], rb_intern("=~"), 1, ptr[1]); // calls "=~" on ptr[0] (the regexp) with ptr[1] (the test string) |
53 | 5.36k | } |
54 | | |
55 | | // rb_protect callback for scan; same array layout, but here the string is the receiver |
56 | 4.32k | static VALUE call_regex_scan(VALUE args) { |
57 | 4.32k | VALUE *ptr = (VALUE *)args; |
58 | 4.32k | return rb_funcall(ptr[1], rb_intern("scan"), 1, ptr[0]); // calls "scan" on ptr[1] (the test string) with ptr[0] (the regexp) |
59 | 4.32k | } |
60 | | |
61 | | // rb_protect callback for gsub; args points at a VALUE array {regexp, test string, replacement} |
62 | 5.36k | static VALUE call_regex_gsub(VALUE args) { |
63 | 5.36k | VALUE *ptr = (VALUE *)args; |
64 | 5.36k | return rb_funcall(ptr[1], rb_intern("gsub"), 2, ptr[0], ptr[2]); // calls "gsub" on ptr[1] (the test string) with ptr[0] (the regexp) and ptr[2] (the replacement) |
65 | 5.36k | } |
66 | | |
67 | 6.54k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { |
68 | | // ruby_init() runs only for the first input; the ruby_initialized flag guards every later call |
69 | | // No teardown is done here, so later inputs reuse the same interpreter state |
70 | 6.54k | if (!ruby_initialized) { |
71 | 1 | ruby_init(); |
72 | 1 | ruby_initialized = 1; |
73 | | |
74 | | // Lower Ruby's verbosity to reduce log noise. NOTE(review): Qfalse presumably still lets non-verbose warnings through (Qnil would silence all) - confirm |
75 | 1 | ruby_verbose = Qfalse; |
76 | 1 | } |
77 | | |
78 | 6.54k | if (size < 2) return 0; |
79 | | |
80 | | // Split the input with FuzzedDataProvider into a pattern and a test string |
81 | 6.54k | FuzzedDataProvider fdp(data, size); |
82 | | |
83 | | // The requested pattern length is drawn from [1, 1000] to keep patterns small |
84 | 6.54k | size_t pattern_len = fdp.ConsumeIntegralInRange<size_t>(1, 1000); // upper bound caps the pattern size |
85 | 6.54k | std::string pattern = fdp.ConsumeBytesAsString(pattern_len); |
86 | | |
87 | | // The test string is whatever input remains, truncated to 10000 bytes |
88 | 6.54k | std::string test = fdp.ConsumeRemainingBytesAsString(); |
89 | 6.54k | if (test.size() > 10000) { |
90 | 82 | test.resize(10000); // cap the test string size to bound memory use |
91 | 82 | } |
92 | | |
93 | | // Wrap the raw pattern and test bytes in Ruby String objects |
94 | 6.54k | VALUE pattern_str = rb_str_new(pattern.data(), pattern.size()); |
95 | 6.54k | VALUE test_str = rb_str_new(test.data(), test.size()); |
96 | | |
97 | 6.54k | int state = 0; |
98 | 6.54k | VALUE args[3]; |
99 | | |
100 | | // Silence stderr during compilation: keep a duplicate of the descriptor, then point it at /dev/null |
101 | | // NOTE(review): if dup() fails but open() succeeds, stderr is never restored (the restore below is gated on saved_stderr) |
102 | 6.54k | int saved_stderr = dup(STDERR_FILENO); |
103 | 6.54k | int dev_null = open("/dev/null", O_WRONLY); |
104 | 6.54k | if (dev_null >= 0) { |
105 | 6.54k | dup2(dev_null, STDERR_FILENO); |
106 | 6.54k | close(dev_null); |
107 | 6.54k | } |
108 | | |
109 | | // Compile the pattern by calling rb_reg_regcomp under rb_protect |
110 | | // state is checked below; non-zero is treated as a failed compilation |
111 | | // Per the file header, this targets the regex parser and compiler (regparse.c, regcomp.c) |
112 | 6.54k | VALUE regexp = rb_protect((VALUE (*)(VALUE))rb_reg_regcomp, pattern_str, &state); |
113 | | |
114 | | // Put the saved descriptor back on stderr and release the duplicate |
115 | 6.54k | if (saved_stderr >= 0) { |
116 | 6.54k | dup2(saved_stderr, STDERR_FILENO); |
117 | 6.54k | close(saved_stderr); |
118 | 6.54k | } |
119 | | |
120 | 6.54k | if (state) { |
121 | | // Compilation raised: clear the pending error, run GC, and stop processing this input |
122 | 1.17k | rb_set_errinfo(Qnil); |
123 | 1.17k | rb_gc_start(); |
124 | 1.17k | return 0; |
125 | 1.17k | } |
126 | | |
127 | 5.36k | if (NIL_P(regexp)) { |
128 | 0 | rb_gc_start(); |
129 | 0 | return 0; |
130 | 0 | } |
131 | | |
132 | | // Test 1: Regexp#match against the test string (per the file header, exercises regexec.c) |
133 | | // The return value is discarded; only a raised exception is handled |
134 | 5.36k | args[0] = regexp; |
135 | 5.36k | args[1] = test_str; |
136 | 5.36k | rb_protect(call_regex_match, (VALUE)args, &state); |
137 | 5.36k | if (state) { |
138 | 166 | rb_set_errinfo(Qnil); |
139 | 166 | state = 0; |
140 | 166 | } |
141 | | |
142 | | // Test 2: Regexp#=~ with the same regexp/string pair already stored in args |
143 | | // The return value is discarded; only a raised exception is handled |
144 | 5.36k | rb_protect(call_regex_match_op, (VALUE)args, &state); |
145 | 5.36k | if (state) { |
146 | 166 | rb_set_errinfo(Qnil); |
147 | 166 | state = 0; |
148 | 166 | } |
149 | | |
150 | | // Test 3: String#scan with the regexp |
151 | | // Reuses args[0] and args[1] as set for Test 1 |
152 | | // Only run when the test string is at most 5000 bytes |
153 | 5.36k | if (test.size() <= 5000) { |
154 | 4.32k | rb_protect(call_regex_scan, (VALUE)args, &state); |
155 | 4.32k | if (state) { |
156 | 202 | rb_set_errinfo(Qnil); |
157 | 202 | state = 0; |
158 | 202 | } |
159 | 4.32k | } |
160 | | |
161 | | // Test 4: String#gsub, replacing matches with the literal "X" |
162 | | // args[2] carries the replacement string; args[0] and args[1] are re-assigned to the same values |
163 | 5.36k | VALUE replacement = rb_str_new("X", 1); |
164 | 5.36k | args[0] = regexp; |
165 | 5.36k | args[1] = test_str; |
166 | 5.36k | args[2] = replacement; |
167 | 5.36k | rb_protect(call_regex_gsub, (VALUE)args, &state); |
168 | 5.36k | if (state) { |
169 | 256 | rb_set_errinfo(Qnil); |
170 | 256 | state = 0; |
171 | 256 | } |
172 | | |
173 | | // Force a GC pass before returning |
174 | | // Intended to limit memory growth across fuzz iterations |
175 | 5.36k | rb_gc_start(); |
176 | | |
177 | 5.36k | return 0; |
178 | 5.36k | } |