/src/simdutf/fuzz/misc.cpp
Line | Count | Source |
1 | | #include <cstddef> |
2 | | #include <cstdint> |
3 | | #include <ranges> |
4 | | |
5 | | #include "helpers/common.h" |
6 | | #include "simdutf.h" |
7 | | |
8 | 841 | void autodetect(std::span<const char> chardata) { |
9 | 841 | std::vector<simdutf::encoding_type> results; |
10 | 841 | const auto implementations = get_supported_implementations(); |
11 | 2.52k | for (const simdutf::implementation* impl : implementations) { |
12 | 2.52k | results.push_back( |
13 | 2.52k | impl->autodetect_encoding(chardata.data(), chardata.size())); |
14 | 2.52k | } |
15 | 1.68k | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
16 | 841 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
17 | 0 | std::cerr << "output differs between implementations\n"; |
18 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
19 | 0 | std::cerr << "implementation " << implementations[i] << " gave " |
20 | 0 | << results.at(i) << '\n'; |
21 | 0 | } |
22 | 0 | std::abort(); |
23 | 0 | } |
24 | 841 | } |
25 | | |
26 | 679 | void detect(std::span<const char> chardata) { |
27 | 679 | std::vector<int> results; |
28 | 679 | const auto implementations = get_supported_implementations(); |
29 | 2.03k | for (const simdutf::implementation* impl : implementations) { |
30 | 2.03k | results.push_back(impl->detect_encodings(chardata.data(), chardata.size())); |
31 | 2.03k | } |
32 | 1.35k | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
33 | 679 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
34 | 0 | std::cerr << "in detect_encodings(const char*, std::size_t):\n"; |
35 | 0 | std::cerr << "output differs between implementations\n"; |
36 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
37 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
38 | 0 | << results.at(i) << '\n'; |
39 | 0 | } |
40 | 0 | std::cerr << " std::vector<unsigned char> data{"; |
41 | 0 | for (unsigned char x : chardata) { |
42 | 0 | std::cerr << +x << ", "; |
43 | 0 | }; |
44 | 0 | std::cerr << "};\n"; |
45 | 0 | std::abort(); |
46 | 0 | } |
47 | 679 | } |
48 | | |
49 | 214 | void validate_ascii(std::span<const char> chardata) { |
50 | | // use int, not bool to avoid vector<bool> |
51 | 214 | std::vector<int> results; |
52 | 214 | const auto implementations = get_supported_implementations(); |
53 | 642 | for (const simdutf::implementation* impl : implementations) { |
54 | 642 | results.push_back(+impl->validate_ascii(chardata.data(), chardata.size())); |
55 | 642 | } |
56 | 428 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
57 | 214 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
58 | 0 | std::cerr << "in validate_ascii(const char*, std::size_t):\n"; |
59 | 0 | std::cerr << "output differs between implementations\n"; |
60 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
61 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
62 | 0 | << results.at(i) << '\n'; |
63 | 0 | } |
64 | 0 | std::cerr << " std::vector<unsigned char> data{"; |
65 | 0 | for (unsigned char x : chardata) { |
66 | 0 | std::cerr << +x << ", "; |
67 | 0 | }; |
68 | 0 | std::cerr << "};\n"; |
69 | 0 | std::abort(); |
70 | 0 | } |
71 | 214 | } |
72 | | |
73 | 266 | void validate_ascii_with_err(std::span<const char> chardata) { |
74 | | // use int, not bool to avoid vector<bool> |
75 | 266 | std::vector<simdutf::result> results; |
76 | 266 | const auto implementations = get_supported_implementations(); |
77 | 798 | for (const simdutf::implementation* impl : implementations) { |
78 | 798 | results.push_back( |
79 | 798 | impl->validate_ascii_with_errors(chardata.data(), chardata.size())); |
80 | 798 | } |
81 | 532 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
82 | 266 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
83 | 0 | std::cerr << "in validate_ascii(const char*, std::size_t):\n"; |
84 | 0 | std::cerr << "output differs between implementations\n"; |
85 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
86 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
87 | 0 | << results.at(i) << '\n'; |
88 | 0 | } |
89 | 0 | std::cerr << " std::vector<unsigned char> data{"; |
90 | 0 | for (unsigned char x : chardata) { |
91 | 0 | std::cerr << +x << ", "; |
92 | 0 | }; |
93 | 0 | std::cerr << "};\n"; |
94 | 0 | std::abort(); |
95 | 0 | } |
96 | 266 | } |
97 | | |
98 | 137 | void utf16_endianness(std::span<const char16_t> data) { |
99 | 137 | std::vector<std::string> results; |
100 | 137 | const auto implementations = get_supported_implementations(); |
101 | 411 | for (const simdutf::implementation* impl : implementations) { |
102 | 411 | std::vector<char16_t> out(data.size()); |
103 | 411 | impl->change_endianness_utf16(data.data(), data.size(), out.data()); |
104 | 411 | results.push_back(FNV1A_hash::as_str(out)); |
105 | 411 | } |
106 | 274 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
107 | 137 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
108 | 0 | std::cerr << "in utf16_endianness(const char*, std::size_t):\n"; |
109 | 0 | std::cerr << "output differs between implementations\n"; |
110 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
111 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
112 | 0 | << results.at(i) << '\n'; |
113 | 0 | } |
114 | 0 | std::cerr << " std::vector<char16_t> data{"; |
115 | 0 | for (int x : data) { |
116 | 0 | std::cerr << +x << ", "; |
117 | 0 | }; |
118 | 0 | std::cerr << "};\n"; |
119 | 0 | std::abort(); |
120 | 0 | } |
121 | 137 | } |
122 | | |
123 | | // Checks that validate_utf16le_as_ascii and validate_utf16be_as_ascii agree |
124 | | // across all implementations, and that a true result implies valid UTF-16. |
125 | 0 | void validate_utf16_as_ascii(std::span<const char16_t> data) { |
126 | 0 | const auto implementations = get_supported_implementations(); |
127 | | // use int, not bool to avoid vector<bool> |
128 | 0 | std::vector<int> le_results, be_results; |
129 | 0 | le_results.reserve(implementations.size()); |
130 | 0 | be_results.reserve(implementations.size()); |
131 | 0 | for (const simdutf::implementation* impl : implementations) { |
132 | 0 | le_results.push_back( |
133 | 0 | +impl->validate_utf16le_as_ascii(data.data(), data.size())); |
134 | 0 | be_results.push_back( |
135 | 0 | +impl->validate_utf16be_as_ascii(data.data(), data.size())); |
136 | 0 | } |
137 | 0 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
138 | 0 | if (std::ranges::adjacent_find(le_results, neq) != le_results.end()) { |
139 | 0 | std::cerr << "validate_utf16le_as_ascii: output differs between " |
140 | 0 | "implementations\n"; |
141 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
142 | 0 | std::cerr << " " << implementations[i]->name() << " gave " |
143 | 0 | << le_results[i] << '\n'; |
144 | 0 | } |
145 | 0 | std::abort(); |
146 | 0 | } |
147 | 0 | if (std::ranges::adjacent_find(be_results, neq) != be_results.end()) { |
148 | 0 | std::cerr << "validate_utf16be_as_ascii: output differs between " |
149 | 0 | "implementations\n"; |
150 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
151 | 0 | std::cerr << " " << implementations[i]->name() << " gave " |
152 | 0 | << be_results[i] << '\n'; |
153 | 0 | } |
154 | 0 | std::abort(); |
155 | 0 | } |
156 | | // If LE validates as ASCII, it must also validate as UTF-16LE (ASCII is a |
157 | | // subset). |
158 | 0 | if (le_results[0]) { |
159 | 0 | for (const simdutf::implementation* impl : implementations) { |
160 | 0 | if (!impl->validate_utf16le(data.data(), data.size())) { |
161 | 0 | std::cerr << "validate_utf16le_as_ascii returned true but " |
162 | 0 | "validate_utf16le returned false" |
163 | 0 | << " impl=" << impl->name() << "\n"; |
164 | 0 | std::abort(); |
165 | 0 | } |
166 | 0 | } |
167 | 0 | } |
168 | | // Same for BE. |
169 | 0 | if (be_results[0]) { |
170 | 0 | for (const simdutf::implementation* impl : implementations) { |
171 | 0 | if (!impl->validate_utf16be(data.data(), data.size())) { |
172 | 0 | std::cerr << "validate_utf16be_as_ascii returned true but " |
173 | 0 | "validate_utf16be returned false" |
174 | 0 | << " impl=" << impl->name() << "\n"; |
175 | 0 | std::abort(); |
176 | 0 | } |
177 | 0 | } |
178 | 0 | } |
179 | 0 | } |
180 | | |
181 | | // Checks that to_well_formed_utf16le / to_well_formed_utf16be: |
182 | | // 1. All implementations agree on the output. |
183 | | // 2. The output is always valid UTF-16LE / UTF-16BE. |
184 | | // 3. When the input is already valid UTF-16, the output equals the input. |
185 | 0 | void to_well_formed_utf16(std::span<const char16_t> data) { |
186 | 0 | const auto implementations = get_supported_implementations(); |
187 | 0 | if (implementations.empty()) { |
188 | 0 | return; |
189 | 0 | } |
190 | | |
191 | | // Check LE variant |
192 | 0 | { |
193 | 0 | std::vector<std::vector<char16_t>> le_outputs; |
194 | 0 | le_outputs.reserve(implementations.size()); |
195 | 0 | for (const simdutf::implementation* impl : implementations) { |
196 | 0 | std::vector<char16_t> out(data.size()); |
197 | 0 | impl->to_well_formed_utf16le(data.data(), data.size(), out.data()); |
198 | 0 | le_outputs.push_back(std::move(out)); |
199 | 0 | } |
200 | 0 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
201 | 0 | if (std::ranges::adjacent_find(le_outputs, neq) != le_outputs.end()) { |
202 | 0 | std::cerr |
203 | 0 | << "to_well_formed_utf16le: outputs differ between implementations\n"; |
204 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
205 | 0 | std::cerr << " " << implementations[i]->name() |
206 | 0 | << ": hash=" << FNV1A_hash::as_str(le_outputs[i]) << "\n"; |
207 | 0 | } |
208 | 0 | std::abort(); |
209 | 0 | } |
210 | | // Output must be valid UTF-16LE. |
211 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
212 | 0 | if (!implementations[i]->validate_utf16le(le_outputs[i].data(), |
213 | 0 | le_outputs[i].size())) { |
214 | 0 | std::cerr << "to_well_formed_utf16le: output is not valid UTF-16LE" |
215 | 0 | << " impl=" << implementations[i]->name() << "\n"; |
216 | 0 | std::abort(); |
217 | 0 | } |
218 | 0 | } |
219 | | // If input was already valid UTF-16LE, output must equal input. |
220 | 0 | if (implementations[0]->validate_utf16le(data.data(), data.size())) { |
221 | 0 | if (!std::ranges::equal(le_outputs[0], data)) { |
222 | 0 | std::cerr << "to_well_formed_utf16le: valid input was modified\n"; |
223 | 0 | std::abort(); |
224 | 0 | } |
225 | 0 | } |
226 | 0 | } |
227 | | |
228 | | // Check BE variant |
229 | 0 | { |
230 | 0 | std::vector<std::vector<char16_t>> be_outputs; |
231 | 0 | be_outputs.reserve(implementations.size()); |
232 | 0 | for (const simdutf::implementation* impl : implementations) { |
233 | 0 | std::vector<char16_t> out(data.size()); |
234 | 0 | impl->to_well_formed_utf16be(data.data(), data.size(), out.data()); |
235 | 0 | be_outputs.push_back(std::move(out)); |
236 | 0 | } |
237 | 0 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
238 | 0 | if (std::ranges::adjacent_find(be_outputs, neq) != be_outputs.end()) { |
239 | 0 | std::cerr |
240 | 0 | << "to_well_formed_utf16be: outputs differ between implementations\n"; |
241 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
242 | 0 | std::cerr << " " << implementations[i]->name() |
243 | 0 | << ": hash=" << FNV1A_hash::as_str(be_outputs[i]) << "\n"; |
244 | 0 | } |
245 | 0 | std::abort(); |
246 | 0 | } |
247 | | // Output must be valid UTF-16BE. |
248 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
249 | 0 | if (!implementations[i]->validate_utf16be(be_outputs[i].data(), |
250 | 0 | be_outputs[i].size())) { |
251 | 0 | std::cerr << "to_well_formed_utf16be: output is not valid UTF-16BE" |
252 | 0 | << " impl=" << implementations[i]->name() << "\n"; |
253 | 0 | std::abort(); |
254 | 0 | } |
255 | 0 | } |
256 | | // If input was already valid UTF-16BE, output must equal input. |
257 | 0 | if (implementations[0]->validate_utf16be(data.data(), data.size())) { |
258 | 0 | if (!std::ranges::equal(be_outputs[0], data)) { |
259 | 0 | std::cerr << "to_well_formed_utf16be: valid input was modified\n"; |
260 | 0 | std::abort(); |
261 | 0 | } |
262 | 0 | } |
263 | 0 | } |
264 | 0 | } |
265 | | |
266 | | void convert_latin1_to_utf8_safe(std::span<const char> chardata, |
267 | 279 | const std::size_t outputsize) { |
268 | | // convert with a limited output buffer |
269 | 279 | std::vector<char> limited_output(outputsize); |
270 | 279 | const auto limited_ret = simdutf::convert_latin1_to_utf8_safe( |
271 | 279 | chardata.data(), chardata.size(), limited_output.data(), outputsize); |
272 | | |
273 | | // convert with a sufficiently large output buffer |
274 | 279 | std::vector<char> large_output(2 * chardata.size()); |
275 | 279 | const auto large_ret = simdutf::convert_latin1_to_utf8( |
276 | 279 | chardata.data(), chardata.size(), large_output.data()); |
277 | | |
278 | 279 | if (large_ret != 0) { |
279 | | // conversion was possible with a large buffer. |
280 | 245 | if (large_ret <= outputsize) { |
281 | | // the limited buffer was large enough, ensure we got the same result |
282 | 132 | assert(limited_ret == large_ret); |
283 | 132 | assert(std::ranges::equal(limited_output | std::views::take(large_ret), |
284 | 132 | large_output | std::views::take(large_ret))); |
285 | 132 | } else { |
286 | | // the number of written bytes for a limited buffer must not exceed what |
287 | | // the large buffer got. |
288 | 113 | assert(limited_ret <= large_ret); |
289 | | // the written data should be equal |
290 | 113 | assert(std::ranges::equal(limited_output | std::views::take(limited_ret), |
291 | 113 | large_output | std::views::take(limited_ret))); |
292 | 113 | } |
293 | 245 | } else { |
294 | | // conversion with a big buffer failed - is there anything we can check or |
295 | | // assert for the limited buffer? I don't think so. |
296 | 34 | } |
297 | 279 | } |
298 | | |
299 | 2.61k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { |
300 | | // pick one of the functions, based on the fuzz data. |
301 | | // the first byte is which action to take. step forward |
302 | | // several bytes so the input is aligned. |
303 | 2.61k | if (size < 4) { |
304 | 2 | return 0; |
305 | 2 | } |
306 | 2.61k | constexpr auto Ncases = 11u; |
307 | 2.61k | constexpr auto actionmask = std::bit_ceil(Ncases) - 1; |
308 | 2.61k | const auto action = data[0] & actionmask; |
309 | | |
310 | 2.61k | const std::uint16_t u16 = data[1] + (data[2] << 8); |
311 | | |
312 | 2.61k | data += 4; |
313 | 2.61k | size -= 4; |
314 | | |
315 | 2.61k | const std::span<const char> chardata{(const char*)data, size}; |
316 | 2.61k | const std::span<const char16_t> u16data{(const char16_t*)data, |
317 | 2.61k | size / sizeof(char16_t)}; |
318 | | |
319 | 2.61k | switch (action) { |
320 | 841 | case 0: |
321 | 841 | autodetect(chardata); |
322 | 841 | break; |
323 | 679 | case 1: |
324 | 679 | detect(chardata); |
325 | 679 | break; |
326 | 214 | case 2: |
327 | 214 | validate_ascii(chardata); |
328 | 214 | break; |
329 | 266 | case 3: |
330 | 266 | validate_ascii_with_err(chardata); |
331 | 266 | break; |
332 | 137 | case 4: |
333 | 137 | utf16_endianness(u16data); |
334 | 137 | break; |
335 | 48 | case 5: { |
336 | 48 | [[maybe_unused]] auto ret = |
337 | 48 | simdutf::trim_partial_utf16le(u16data.data(), u16data.size()); |
338 | 48 | assert(ret == u16data.size() || ret + 1 == u16data.size()); |
339 | 48 | } break; |
340 | 48 | case 6: { |
341 | 39 | [[maybe_unused]] auto ret = |
342 | 39 | simdutf::trim_partial_utf16be(u16data.data(), u16data.size()); |
343 | 39 | assert(ret == u16data.size() || ret + 1 == u16data.size()); |
344 | 39 | } break; |
345 | 111 | case 7: { |
346 | 111 | [[maybe_unused]] const std::size_t N = chardata.size(); |
347 | 111 | [[maybe_unused]] const auto ret = |
348 | 111 | simdutf::trim_partial_utf8(chardata.data(), chardata.size()); |
349 | 111 | if ((ret + 3 < N) || (ret > N)) { |
350 | 0 | std::cerr << "ret=" << ret << " N=" << N << '\n'; |
351 | 0 | std::abort(); |
352 | 0 | } |
353 | 111 | } break; |
354 | 279 | case 8: |
355 | 279 | convert_latin1_to_utf8_safe(chardata, u16); |
356 | 279 | break; |
357 | 0 | case 9: |
358 | 0 | validate_utf16_as_ascii(u16data); |
359 | 0 | break; |
360 | 0 | case 10: |
361 | 0 | to_well_formed_utf16(u16data); |
362 | 0 | break; |
363 | 2.61k | } |
364 | 2.61k | return 0; |
365 | 2.61k | } |