/src/ada-url/fuzz/url_pattern.cc
Line | Count | Source |
#include <fuzzer/FuzzedDataProvider.h>

#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>

#include "ada.cpp"
#include "ada.h"
8 | | |
// Regex provider plugged into every ada::url_pattern / parse_url_pattern
// instantiation in this harness.
using regex_provider = ada::url_pattern_regex::std_regex_provider;
10 | | |
// Call every read-only accessor of a parsed URL pattern and discard the
// results; the only goal is to execute the accessors under the fuzzer.
//
// The pattern is taken by const reference so that it is not copied on every
// call. Any type exposing the ten accessors below is accepted.
template <typename Pattern>
void exercise_result(const Pattern& result) {
  // One accessor per URL component...
  (void)result.get_protocol();
  (void)result.get_username();
  (void)result.get_password();
  (void)result.get_hostname();
  (void)result.get_port();
  (void)result.get_pathname();
  (void)result.get_search();
  (void)result.get_hash();
  // ...followed by the two boolean flags.
  (void)result.ignore_case();
  (void)result.has_regexp_groups();
}
23 | | |
24 | | // Shared helper: walk every field of a url_pattern_result. |
25 | 36.6k | static void exercise_match_result(const ada::url_pattern_result& match) { |
26 | 36.6k | volatile size_t len = 0; |
27 | 293k | auto exercise_component = [&len](const ada::url_pattern_component_result& c) { |
28 | 293k | len += c.input.size(); |
29 | 293k | for (const auto& [k, v] : c.groups) { |
30 | 290k | len += k.size(); |
31 | 290k | if (v.has_value()) len += v->size(); |
32 | 290k | } |
33 | 293k | }; |
34 | 36.6k | exercise_component(match.protocol); |
35 | 36.6k | exercise_component(match.username); |
36 | 36.6k | exercise_component(match.password); |
37 | 36.6k | exercise_component(match.hostname); |
38 | 36.6k | exercise_component(match.port); |
39 | 36.6k | exercise_component(match.pathname); |
40 | 36.6k | exercise_component(match.search); |
41 | 36.6k | exercise_component(match.hash); |
42 | | // Exercise the 'inputs' vector (each element is a url_pattern_input |
43 | | // variant holding either a string_view or url_pattern_init). |
44 | 45.0k | for (const auto& inp : match.inputs) { |
45 | 45.0k | if (std::holds_alternative<std::string_view>(inp)) { |
46 | 24.9k | len += std::get<std::string_view>(inp).size(); |
47 | 24.9k | } |
48 | 45.0k | } |
49 | 36.6k | (void)len; |
50 | 36.6k | } |
51 | | |
52 | | // Exercise exec() and test() on a parsed url_pattern with an ASCII input. |
53 | | // We restrict inputs to ASCII to avoid catastrophic regex backtracking. |
54 | | static void exercise_exec_and_test(ada::url_pattern<regex_provider>& pattern, |
55 | | const std::string& test_input, |
56 | 92.1k | const std::string& test_base) { |
57 | 92.1k | std::string_view test_view(test_input.data(), test_input.size()); |
58 | | |
59 | | // exec() and test() must agree: exec finds a match iff test returns true. |
60 | | // Both operate on the same input so their answers must be consistent. |
61 | 92.1k | auto exec_result = pattern.exec(test_view, nullptr); |
62 | 92.1k | auto test_result = pattern.test(test_view, nullptr); |
63 | | |
64 | 92.1k | bool exec_matched = exec_result && exec_result->has_value(); |
65 | 92.1k | bool test_matched = test_result && *test_result; |
66 | | |
67 | 92.1k | if (exec_matched != test_matched) { |
68 | 0 | printf( |
69 | 0 | "exec/test inconsistency on input '%s': exec_matched=%d " |
70 | 0 | "test_matched=%d\n", |
71 | 0 | test_input.c_str(), exec_matched, test_matched); |
72 | 0 | abort(); |
73 | 0 | } |
74 | | |
75 | 92.1k | if (exec_result && exec_result->has_value()) { |
76 | 8.33k | exercise_match_result(**exec_result); |
77 | 8.33k | } |
78 | | |
79 | | // test() with base URL |
80 | 92.1k | if (!test_base.empty()) { |
81 | 92.1k | std::string_view base_view(test_base.data(), test_base.size()); |
82 | 92.1k | auto test_result_with_base = pattern.test(test_view, &base_view); |
83 | 92.1k | auto exec_with_base = pattern.exec(test_view, &base_view); |
84 | | |
85 | 92.1k | bool exec_base_matched = exec_with_base && exec_with_base->has_value(); |
86 | 92.1k | bool test_base_matched = test_result_with_base && *test_result_with_base; |
87 | | |
88 | 92.1k | if (exec_base_matched != test_base_matched) { |
89 | 0 | printf( |
90 | 0 | "exec/test inconsistency with base on input '%s': " |
91 | 0 | "exec_matched=%d test_matched=%d\n", |
92 | 0 | test_input.c_str(), exec_base_matched, test_base_matched); |
93 | 0 | abort(); |
94 | 0 | } |
95 | | |
96 | 92.1k | if (exec_with_base && exec_with_base->has_value()) { |
97 | 8.33k | exercise_match_result(**exec_with_base); |
98 | 8.33k | } |
99 | 92.1k | } |
100 | | |
101 | | // test() with url_pattern_init input (sets only the pathname component) |
102 | 92.1k | ada::url_pattern_init init_input{}; |
103 | 92.1k | init_input.pathname = test_input; |
104 | 92.1k | auto test_with_init = pattern.test(init_input, nullptr); |
105 | 92.1k | auto exec_with_init = pattern.exec(init_input, nullptr); |
106 | | // exec and test must agree on the init-based input too. |
107 | 92.1k | if ((test_with_init && *test_with_init) != |
108 | 92.1k | (exec_with_init && exec_with_init->has_value())) { |
109 | 0 | printf("exec/test inconsistency on url_pattern_init input\n"); |
110 | 0 | abort(); |
111 | 0 | } |
112 | 92.1k | if (exec_with_init && exec_with_init->has_value()) { |
113 | 20.0k | exercise_match_result(**exec_with_init); |
114 | 20.0k | } |
115 | | |
116 | | // test_components() — tests each URL component individually |
117 | 92.1k | { |
118 | 92.1k | std::string_view sv(test_input.data(), test_input.size()); |
119 | 92.1k | auto parsed = ada::parse<ada::url_aggregator>(sv); |
120 | 92.1k | if (parsed) { |
121 | 36.0k | volatile bool tc = pattern.test_components( |
122 | 36.0k | std::string(parsed->get_protocol()), |
123 | 36.0k | std::string(parsed->get_username()), |
124 | 36.0k | std::string(parsed->get_password()), |
125 | 36.0k | std::string(parsed->get_hostname()), std::string(parsed->get_port()), |
126 | 36.0k | std::string(parsed->get_pathname()), |
127 | 36.0k | std::string(parsed->get_search()), std::string(parsed->get_hash())); |
128 | 36.0k | (void)tc; |
129 | 36.0k | } |
130 | 92.1k | } |
131 | | |
132 | | // match() — the internal method underlying exec(); must not crash. |
133 | 92.1k | auto match_result = pattern.match(test_view, nullptr); |
134 | 92.1k | (void)match_result; |
135 | 92.1k | } |
136 | | |
137 | 10.6k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { |
138 | 325k | auto to_ascii = [](const std::string& source) -> std::string { |
139 | 325k | std::string result; |
140 | 325k | result.reserve(source.size()); |
141 | 325k | for (char c : source) { |
142 | 217k | result.push_back(static_cast<unsigned char>(c) % 128); |
143 | 217k | } |
144 | 325k | return result; |
145 | 325k | }; |
146 | 10.6k | FuzzedDataProvider fdp(data, size); |
147 | | // We do not want to trigger arbitrary regex matching. |
148 | 10.6k | std::string source_1 = "/" + to_ascii(fdp.ConsumeRandomLengthString(50)) + |
149 | 10.6k | "/" + to_ascii(fdp.ConsumeRandomLengthString(50)); |
150 | 10.6k | std::string base_source_1 = "/" + |
151 | 10.6k | to_ascii(fdp.ConsumeRandomLengthString(50)) + |
152 | 10.6k | "/" + to_ascii(fdp.ConsumeRandomLengthString(50)); |
153 | | |
154 | 10.6k | std::string source_2 = "https://ada-url.com/*"; |
155 | 10.6k | std::string base_source_2 = "https://ada-url.com"; |
156 | | |
157 | | // Additional test input for exec/test calls (also ASCII-only) |
158 | 10.6k | std::string test_input = "https://" + |
159 | 10.6k | to_ascii(fdp.ConsumeRandomLengthString(30)) + "/" + |
160 | 10.6k | to_ascii(fdp.ConsumeRandomLengthString(20)); |
161 | 10.6k | std::string test_base = "https://ada-url.com"; |
162 | | |
163 | 10.6k | std::array<std::pair<std::string, std::string>, 2> sources = {{ |
164 | 10.6k | {source_1, base_source_1}, |
165 | 10.6k | {source_2, base_source_2}, |
166 | 10.6k | }}; |
167 | | |
168 | 21.3k | for (const auto& [source, base_source] : sources) { |
169 | | // Without base or options |
170 | 21.3k | auto result = |
171 | 21.3k | ada::parse_url_pattern<regex_provider>(source, nullptr, nullptr); |
172 | 21.3k | if (result) { |
173 | 10.6k | exercise_result(*result); |
174 | 10.6k | exercise_exec_and_test(*result, test_input, test_base); |
175 | 10.6k | } |
176 | | |
177 | | // Testing with base_url |
178 | 21.3k | std::string_view base_source_view(base_source.data(), base_source.length()); |
179 | 21.3k | auto result_with_base = ada::parse_url_pattern<regex_provider>( |
180 | 21.3k | source, &base_source_view, nullptr); |
181 | 21.3k | if (result_with_base) { |
182 | 10.6k | exercise_result(*result_with_base); |
183 | 10.6k | exercise_exec_and_test(*result_with_base, test_input, test_base); |
184 | 10.6k | } |
185 | | |
186 | | // Testing with base_url and options |
187 | 21.3k | ada::url_pattern_options options{.ignore_case = fdp.ConsumeBool()}; |
188 | 21.3k | auto result_with_base_and_options = ada::parse_url_pattern<regex_provider>( |
189 | 21.3k | source, &base_source_view, &options); |
190 | 21.3k | if (result_with_base_and_options) { |
191 | 10.6k | exercise_result(*result_with_base_and_options); |
192 | 10.6k | exercise_exec_and_test(*result_with_base_and_options, test_input, |
193 | 10.6k | test_base); |
194 | 10.6k | } |
195 | | |
196 | | // Testing with url_pattern_init and base url. |
197 | 21.3k | int field_index = fdp.ConsumeIntegralInRange(0, 7); |
198 | 21.3k | std::string random_value = to_ascii(fdp.ConsumeRandomLengthString(50)); |
199 | 21.3k | ada::url_pattern_init init{}; |
200 | 21.3k | switch (field_index) { |
201 | 15.1k | case 0: |
202 | 15.1k | init.protocol = random_value; |
203 | 15.1k | break; |
204 | 548 | case 1: |
205 | 548 | init.username = random_value; |
206 | 548 | break; |
207 | 1.37k | case 2: |
208 | 1.37k | init.password = random_value; |
209 | 1.37k | break; |
210 | 566 | case 3: |
211 | 566 | init.hostname = random_value; |
212 | 566 | break; |
213 | 1.63k | case 4: |
214 | 1.63k | init.port = random_value; |
215 | 1.63k | break; |
216 | 568 | case 5: |
217 | 568 | init.pathname = random_value; |
218 | 568 | break; |
219 | 542 | case 6: |
220 | 542 | init.search = random_value; |
221 | 542 | break; |
222 | 982 | case 7: |
223 | 982 | init.hash = random_value; |
224 | 982 | break; |
225 | 21.3k | } |
226 | 21.3k | auto result_with_init = ada::parse_url_pattern<regex_provider>( |
227 | 21.3k | init, &base_source_view, nullptr); |
228 | 21.3k | if (result_with_init) { |
229 | 0 | exercise_result(*result_with_init); |
230 | 0 | exercise_exec_and_test(*result_with_init, test_input, test_base); |
231 | 0 | } |
232 | | |
233 | | // Testing url_pattern_init with ALL fields populated simultaneously |
234 | 21.3k | ada::url_pattern_init init_all{}; |
235 | 21.3k | init_all.protocol = to_ascii(fdp.ConsumeRandomLengthString(10)); |
236 | 21.3k | init_all.username = to_ascii(fdp.ConsumeRandomLengthString(10)); |
237 | 21.3k | init_all.password = to_ascii(fdp.ConsumeRandomLengthString(10)); |
238 | 21.3k | init_all.hostname = to_ascii(fdp.ConsumeRandomLengthString(20)); |
239 | 21.3k | init_all.port = to_ascii(fdp.ConsumeRandomLengthString(5)); |
240 | 21.3k | init_all.pathname = "/" + to_ascii(fdp.ConsumeRandomLengthString(20)); |
241 | 21.3k | init_all.search = to_ascii(fdp.ConsumeRandomLengthString(10)); |
242 | 21.3k | init_all.hash = to_ascii(fdp.ConsumeRandomLengthString(10)); |
243 | 21.3k | auto result_with_init_all = |
244 | 21.3k | ada::parse_url_pattern<regex_provider>(init_all, nullptr, nullptr); |
245 | 21.3k | if (result_with_init_all) { |
246 | 18.1k | exercise_result(*result_with_init_all); |
247 | 18.1k | exercise_exec_and_test(*result_with_init_all, test_input, test_base); |
248 | 18.1k | } |
249 | | |
250 | | // Testing url_pattern_init with the base_url field set. |
251 | | // |
252 | | // url_pattern_init::base_url is a completely separate code path from the |
253 | | // base_url *parameter* of parse_url_pattern. When base_url is embedded |
254 | | // inside the init struct the spec processes it differently. This field |
255 | | // was previously never exercised by any fuzzer. |
256 | 21.3k | { |
257 | 21.3k | ada::url_pattern_init init_base_url{}; |
258 | 21.3k | init_base_url.pathname = |
259 | 21.3k | "/" + to_ascii(fdp.ConsumeRandomLengthString(20)); |
260 | 21.3k | init_base_url.base_url = "https://example.com"; |
261 | 21.3k | auto result_base_in_init = ada::parse_url_pattern<regex_provider>( |
262 | 21.3k | init_base_url, nullptr, nullptr); |
263 | 21.3k | if (result_base_in_init) { |
264 | 20.6k | exercise_result(*result_base_in_init); |
265 | 20.6k | exercise_exec_and_test(*result_base_in_init, test_input, test_base); |
266 | 20.6k | } |
267 | | |
268 | | // Also fuzz the base_url field itself. |
269 | 21.3k | ada::url_pattern_init init_fuzz_base{}; |
270 | 21.3k | init_fuzz_base.pathname = |
271 | 21.3k | "/" + to_ascii(fdp.ConsumeRandomLengthString(15)); |
272 | 21.3k | init_fuzz_base.base_url = |
273 | 21.3k | "https://" + to_ascii(fdp.ConsumeRandomLengthString(20)); |
274 | 21.3k | auto result_fuzz_base = ada::parse_url_pattern<regex_provider>( |
275 | 21.3k | init_fuzz_base, nullptr, nullptr); |
276 | 21.3k | if (result_fuzz_base) { |
277 | 717 | exercise_result(*result_fuzz_base); |
278 | 717 | exercise_exec_and_test(*result_fuzz_base, test_input, test_base); |
279 | 717 | } |
280 | 21.3k | } |
281 | | |
282 | | // Testing url_pattern_init with a random subset (2–4) of fields set. |
283 | | // |
284 | | // The single-field case (switch above) and the all-fields case are covered |
285 | | // above. Here we pick a random bitmask of fields so the parser sees every |
286 | | // combination of present/absent components. |
287 | 21.3k | { |
288 | 21.3k | uint8_t field_mask = fdp.ConsumeIntegral<uint8_t>(); |
289 | 21.3k | ada::url_pattern_init init_subset{}; |
290 | 21.3k | if (field_mask & 0x01) |
291 | 521 | init_subset.protocol = to_ascii(fdp.ConsumeRandomLengthString(8)); |
292 | 21.3k | if (field_mask & 0x02) |
293 | 689 | init_subset.hostname = to_ascii(fdp.ConsumeRandomLengthString(20)); |
294 | 21.3k | if (field_mask & 0x04) |
295 | 719 | init_subset.port = to_ascii(fdp.ConsumeRandomLengthString(5)); |
296 | 21.3k | if (field_mask & 0x08) |
297 | 903 | init_subset.pathname = |
298 | 903 | "/" + to_ascii(fdp.ConsumeRandomLengthString(20)); |
299 | 21.3k | if (field_mask & 0x10) |
300 | 715 | init_subset.search = to_ascii(fdp.ConsumeRandomLengthString(10)); |
301 | 21.3k | if (field_mask & 0x20) |
302 | 858 | init_subset.hash = to_ascii(fdp.ConsumeRandomLengthString(10)); |
303 | 21.3k | if (field_mask & 0x40) |
304 | 647 | init_subset.username = to_ascii(fdp.ConsumeRandomLengthString(10)); |
305 | 21.3k | if (field_mask & 0x80) |
306 | 374 | init_subset.password = to_ascii(fdp.ConsumeRandomLengthString(10)); |
307 | 21.3k | auto result_subset = |
308 | 21.3k | ada::parse_url_pattern<regex_provider>(init_subset, nullptr, nullptr); |
309 | 21.3k | if (result_subset) { |
310 | 20.5k | exercise_result(*result_subset); |
311 | 20.5k | exercise_exec_and_test(*result_subset, test_input, test_base); |
312 | 20.5k | } |
313 | 21.3k | } |
314 | 21.3k | } |
315 | | |
316 | 10.6k | return 0; |
317 | 10.6k | } |