/src/simdutf/fuzz/misc.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | #include <cstddef> |
2 | | #include <cstdint> |
3 | | #include <ranges> |
4 | | |
5 | | #include "helpers/common.h" |
6 | | #include "simdutf.h" |
7 | | |
8 | 875 | void autodetect(std::span<const char> chardata) { |
9 | 875 | std::vector<simdutf::encoding_type> results; |
10 | 875 | const auto implementations = get_supported_implementations(); |
11 | 2.62k | for (const simdutf::implementation* impl : implementations) { |
12 | 2.62k | results.push_back( |
13 | 2.62k | impl->autodetect_encoding(chardata.data(), chardata.size())); |
14 | 2.62k | } |
15 | 1.75k | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
16 | 875 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
17 | 0 | std::cerr << "output differs between implementations\n"; |
18 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
19 | 0 | std::cerr << "implementation " << implementations[i] << " gave " |
20 | 0 | << results.at(i) << '\n'; |
21 | 0 | } |
22 | 0 | std::abort(); |
23 | 0 | } |
24 | 875 | } |
25 | | |
26 | 700 | void detect(std::span<const char> chardata) { |
27 | 700 | std::vector<int> results; |
28 | 700 | const auto implementations = get_supported_implementations(); |
29 | 2.10k | for (const simdutf::implementation* impl : implementations) { |
30 | 2.10k | results.push_back(impl->detect_encodings(chardata.data(), chardata.size())); |
31 | 2.10k | } |
32 | 1.40k | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
33 | 700 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
34 | 0 | std::cerr << "in detect_encodings(const char*, std::size_t):\n"; |
35 | 0 | std::cerr << "output differs between implementations\n"; |
36 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
37 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
38 | 0 | << results.at(i) << '\n'; |
39 | 0 | } |
40 | 0 | std::cerr << " std::vector<unsigned char> data{"; |
41 | 0 | for (unsigned char x : chardata) { |
42 | 0 | std::cerr << +x << ", "; |
43 | 0 | }; |
44 | 0 | std::cerr << "};\n"; |
45 | 0 | std::abort(); |
46 | 0 | } |
47 | 700 | } |
48 | | |
49 | 163 | void validate_ascii(std::span<const char> chardata) { |
50 | | // use int, not bool to avoid vector<bool> |
51 | 163 | std::vector<int> results; |
52 | 163 | const auto implementations = get_supported_implementations(); |
53 | 489 | for (const simdutf::implementation* impl : implementations) { |
54 | 489 | results.push_back(+impl->validate_ascii(chardata.data(), chardata.size())); |
55 | 489 | } |
56 | 326 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
57 | 163 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
58 | 0 | std::cerr << "in validate_ascii(const char*, std::size_t):\n"; |
59 | 0 | std::cerr << "output differs between implementations\n"; |
60 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
61 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
62 | 0 | << results.at(i) << '\n'; |
63 | 0 | } |
64 | 0 | std::cerr << " std::vector<unsigned char> data{"; |
65 | 0 | for (unsigned char x : chardata) { |
66 | 0 | std::cerr << +x << ", "; |
67 | 0 | }; |
68 | 0 | std::cerr << "};\n"; |
69 | 0 | std::abort(); |
70 | 0 | } |
71 | 163 | } |
72 | | |
73 | 259 | void validate_ascii_with_err(std::span<const char> chardata) { |
74 | | // use int, not bool to avoid vector<bool> |
75 | 259 | std::vector<simdutf::result> results; |
76 | 259 | const auto implementations = get_supported_implementations(); |
77 | 777 | for (const simdutf::implementation* impl : implementations) { |
78 | 777 | results.push_back( |
79 | 777 | impl->validate_ascii_with_errors(chardata.data(), chardata.size())); |
80 | 777 | } |
81 | 518 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
82 | 259 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
83 | 0 | std::cerr << "in validate_ascii(const char*, std::size_t):\n"; |
84 | 0 | std::cerr << "output differs between implementations\n"; |
85 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
86 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
87 | 0 | << results.at(i) << '\n'; |
88 | 0 | } |
89 | 0 | std::cerr << " std::vector<unsigned char> data{"; |
90 | 0 | for (unsigned char x : chardata) { |
91 | 0 | std::cerr << +x << ", "; |
92 | 0 | }; |
93 | 0 | std::cerr << "};\n"; |
94 | 0 | std::abort(); |
95 | 0 | } |
96 | 259 | } |
97 | | |
98 | 137 | void utf16_endianess(std::span<const char16_t> data) { |
99 | 137 | std::vector<std::string> results; |
100 | 137 | const auto implementations = get_supported_implementations(); |
101 | 411 | for (const simdutf::implementation* impl : implementations) { |
102 | 411 | std::vector<char16_t> out(data.size()); |
103 | 411 | impl->change_endianness_utf16(data.data(), data.size(), out.data()); |
104 | 411 | results.push_back(FNV1A_hash::as_str(out)); |
105 | 411 | } |
106 | 274 | auto neq = [](const auto& a, const auto& b) { return a != b; }; |
107 | 137 | if (std::ranges::adjacent_find(results, neq) != results.end()) { |
108 | 0 | std::cerr << "in utf16_endianess(const char*, std::size_t):\n"; |
109 | 0 | std::cerr << "output differs between implementations\n"; |
110 | 0 | for (std::size_t i = 0; i < implementations.size(); ++i) { |
111 | 0 | std::cerr << "implementation " << implementations[i]->name() << " gave " |
112 | 0 | << results.at(i) << '\n'; |
113 | 0 | } |
114 | 0 | std::cerr << " std::vector<char16_t> data{"; |
115 | 0 | for (int x : data) { |
116 | 0 | std::cerr << +x << ", "; |
117 | 0 | }; |
118 | 0 | std::cerr << "};\n"; |
119 | 0 | std::abort(); |
120 | 0 | } |
121 | 137 | } |
122 | | |
123 | | void convert_latin1_to_utf8_safe(std::span<const char> chardata, |
124 | 311 | const std::size_t outputsize) { |
125 | | // convert with a limited output buffer |
126 | 311 | std::vector<char> limited_output(outputsize); |
127 | 311 | const auto limited_ret = simdutf::convert_latin1_to_utf8_safe( |
128 | 311 | chardata.data(), chardata.size(), limited_output.data(), outputsize); |
129 | | |
130 | | // convert with a sufficiently large output buffer |
131 | 311 | std::vector<char> large_output(2 * chardata.size()); |
132 | 311 | const auto large_ret = simdutf::convert_latin1_to_utf8( |
133 | 311 | chardata.data(), chardata.size(), large_output.data()); |
134 | | |
135 | 311 | if (large_ret != 0) { |
136 | | // conversion was possible with a large buffer. |
137 | 280 | if (large_ret <= outputsize) { |
138 | | // the limited buffer was large enough, ensure we got the same result |
139 | 131 | assert(limited_ret == large_ret); |
140 | 131 | assert(std::ranges::equal(limited_output | std::views::take(large_ret), |
141 | 131 | large_output | std::views::take(large_ret))); |
142 | 149 | } else { |
143 | | // the number of written bytes for a limited buffer must not exceed what |
144 | | // the large buffer got. |
145 | 149 | assert(limited_ret <= large_ret); |
146 | | // the written data should be equal |
147 | 149 | assert(std::ranges::equal(limited_output | std::views::take(limited_ret), |
148 | 149 | large_output | std::views::take(limited_ret))); |
149 | 149 | } |
150 | 280 | } else { |
151 | | // conversion with a big buffer failed - is there anything we can check or |
152 | | // assert for the limited buffer? I don't think so. |
153 | 31 | } |
154 | 311 | } |
155 | | |
156 | 2.61k | extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { |
157 | | // pick one of the functions, based on the fuzz data. |
158 | | // the first byte is which action to take. step forward |
159 | | // several bytes so the input is aligned. |
160 | 2.61k | if (size < 4) { |
161 | 2 | return 0; |
162 | 2 | } |
163 | 2.61k | constexpr auto Ncases = 9u; |
164 | 2.61k | constexpr auto actionmask = std::bit_ceil(Ncases) - 1; |
165 | 2.61k | const auto action = data[0] & actionmask; |
166 | | |
167 | 2.61k | const std::uint16_t u16 = data[1] + (data[2] << 8); |
168 | | |
169 | 2.61k | data += 4; |
170 | 2.61k | size -= 4; |
171 | | |
172 | 2.61k | const std::span<const char> chardata{(const char*)data, size}; |
173 | 2.61k | const std::span<const char16_t> u16data{(const char16_t*)data, |
174 | 2.61k | size / sizeof(char16_t)}; |
175 | | |
176 | 2.61k | switch (action) { |
177 | 875 | case 0: |
178 | 875 | autodetect(chardata); |
179 | 875 | break; |
180 | 700 | case 1: |
181 | 700 | detect(chardata); |
182 | 700 | break; |
183 | 163 | case 2: |
184 | 163 | validate_ascii(chardata); |
185 | 163 | break; |
186 | 259 | case 3: |
187 | 259 | validate_ascii_with_err(chardata); |
188 | 259 | break; |
189 | 137 | case 4: |
190 | 137 | utf16_endianess(u16data); |
191 | 137 | break; |
192 | 48 | case 5: { |
193 | 48 | [[maybe_unused]] auto ret = |
194 | 48 | simdutf::trim_partial_utf16le(u16data.data(), u16data.size()); |
195 | 48 | assert(ret == u16data.size() || ret + 1 == u16data.size()); |
196 | 48 | } break; |
197 | 48 | case 6: { |
198 | 40 | [[maybe_unused]] auto ret = |
199 | 40 | simdutf::trim_partial_utf16be(u16data.data(), u16data.size()); |
200 | 40 | assert(ret == u16data.size() || ret + 1 == u16data.size()); |
201 | 40 | } break; |
202 | 81 | case 7: { |
203 | 81 | [[maybe_unused]] const std::size_t N = chardata.size(); |
204 | 81 | [[maybe_unused]] const auto ret = |
205 | 81 | simdutf::trim_partial_utf8(chardata.data(), chardata.size()); |
206 | 81 | if ((ret + 3 < N) || (ret > N)) { |
207 | 0 | std::cerr << "ret=" << ret << " N=" << N << '\n'; |
208 | 0 | std::abort(); |
209 | 0 | } |
210 | 81 | } break; |
211 | 311 | case 8: |
212 | 311 | convert_latin1_to_utf8_safe(chardata, u16); |
213 | 311 | break; |
214 | 2.61k | } |
215 | 2.61k | return 0; |
216 | 2.61k | } |