/src/mercurial/contrib/fuzz/FuzzedDataProvider.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===- FuzzedDataProvider.h - Utility header for fuzz targets ---*- C++ -* ===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // A single header library providing an utility class to break up an array of |
9 | | // bytes. Whenever run on the same input, provides the same output, as long as |
10 | | // its methods are called in the same order, with the same arguments. |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #ifndef LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ |
14 | | #define LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ |
15 | | |
16 | | #include <algorithm> |
17 | | #include <climits> |
18 | | #include <cstddef> |
19 | | #include <cstdint> |
20 | | #include <cstring> |
21 | | #include <initializer_list> |
22 | | #include <string> |
23 | | #include <type_traits> |
24 | | #include <utility> |
25 | | #include <vector> |
26 | | |
27 | | // In addition to the comments below, the API is also briefly documented at |
28 | | // https://github.com/google/fuzzing/blob/master/docs/split-inputs.md#fuzzed-data-provider |
29 | | class FuzzedDataProvider |
30 | | { |
31 | | public: |
32 | | // |data| is an array of length |size| that the FuzzedDataProvider wraps |
33 | | // to provide more granular access. |data| must outlive the |
34 | | // FuzzedDataProvider. |
35 | | FuzzedDataProvider(const uint8_t *data, size_t size) |
36 | 4.74k | : data_ptr_(data), remaining_bytes_(size) |
37 | 4.74k | { |
38 | 4.74k | } |
39 | | ~FuzzedDataProvider() = default; |
40 | | |
41 | | // Returns a std::vector containing |num_bytes| of input data. If fewer |
42 | | // than |num_bytes| of data remain, returns a shorter std::vector |
43 | | // containing all of the data that's left. Can be used with any byte |
44 | | // sized type, such as char, unsigned char, uint8_t, etc. |
45 | | template <typename T> std::vector<T> ConsumeBytes(size_t num_bytes) |
46 | | { |
47 | | num_bytes = std::min(num_bytes, remaining_bytes_); |
48 | | return ConsumeBytes<T>(num_bytes, num_bytes); |
49 | | } |
50 | | |
51 | | // Similar to |ConsumeBytes|, but also appends the terminator value at |
52 | | // the end of the resulting vector. Useful, when a mutable |
53 | | // null-terminated C-string is needed, for example. But that is a rare |
54 | | // case. Better avoid it, if possible, and prefer using |ConsumeBytes| |
55 | | // or |ConsumeBytesAsString| methods. |
56 | | template <typename T> |
57 | | std::vector<T> ConsumeBytesWithTerminator(size_t num_bytes, |
58 | | T terminator = 0) |
59 | | { |
60 | | num_bytes = std::min(num_bytes, remaining_bytes_); |
61 | | std::vector<T> result = |
62 | | ConsumeBytes<T>(num_bytes + 1, num_bytes); |
63 | | result.back() = terminator; |
64 | | return result; |
65 | | } |
66 | | |
67 | | // Returns a std::string containing |num_bytes| of input data. Using |
68 | | // this and |
69 | | // |.c_str()| on the resulting string is the best way to get an |
70 | | // immutable null-terminated C string. If fewer than |num_bytes| of data |
71 | | // remain, returns a shorter std::string containing all of the data |
72 | | // that's left. |
73 | | std::string ConsumeBytesAsString(size_t num_bytes) |
74 | 4.67k | { |
75 | 4.67k | static_assert(sizeof(std::string::value_type) == |
76 | 4.67k | sizeof(uint8_t), |
77 | 4.67k | "ConsumeBytesAsString cannot convert the data to " |
78 | 4.67k | "a string."); |
79 | | |
80 | 4.67k | num_bytes = std::min(num_bytes, remaining_bytes_); |
81 | 4.67k | std::string result( |
82 | 4.67k | reinterpret_cast<const std::string::value_type *>( |
83 | 4.67k | data_ptr_), |
84 | 4.67k | num_bytes); |
85 | 4.67k | Advance(num_bytes); |
86 | 4.67k | return result; |
87 | 4.67k | } |
88 | | |
89 | | // Returns a number in the range [min, max] by consuming bytes from the |
90 | | // input data. The value might not be uniformly distributed in the given |
91 | | // range. If there's no input data left, always returns |min|. |min| |
92 | | // must be less than or equal to |max|. |
93 | | template <typename T> T ConsumeIntegralInRange(T min, T max) |
94 | 364 | { |
95 | 364 | static_assert(std::is_integral<T>::value, |
96 | 364 | "An integral type is required."); |
97 | 364 | static_assert(sizeof(T) <= sizeof(uint64_t), |
98 | 364 | "Unsupported integral type."); |
99 | | |
100 | 364 | if (min > max) |
101 | 0 | abort(); |
102 | | |
103 | | // Use the biggest type possible to hold the range and the |
104 | | // result. |
105 | 364 | uint64_t range = static_cast<uint64_t>(max) - min; |
106 | 364 | uint64_t result = 0; |
107 | 364 | size_t offset = 0; |
108 | | |
109 | 728 | while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && |
110 | 728 | remaining_bytes_ != 0) { |
111 | | // Pull bytes off the end of the seed data. |
112 | | // Experimentally, this seems to allow the fuzzer to |
113 | | // more easily explore the input space. This makes |
114 | | // sense, since it works by modifying inputs that caused |
115 | | // new code to run, and this data is often used to |
116 | | // encode length of data read by |ConsumeBytes|. |
117 | | // Separating out read lengths makes it easier modify |
118 | | // the contents of the data that is actually read. |
119 | 364 | --remaining_bytes_; |
120 | 364 | result = |
121 | 364 | (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; |
122 | 364 | offset += CHAR_BIT; |
123 | 364 | } |
124 | | |
125 | | // Avoid division by 0, in case |range + 1| results in overflow. |
126 | 364 | if (range != std::numeric_limits<decltype(range)>::max()) |
127 | 364 | result = result % (range + 1); |
128 | | |
129 | 364 | return static_cast<T>(min + result); |
130 | 364 | } |
131 | | |
132 | | // Returns a std::string of length from 0 to |max_length|. When it runs |
133 | | // out of input data, returns what remains of the input. Designed to be |
134 | | // more stable with respect to a fuzzer inserting characters than just |
135 | | // picking a random length and then consuming that many bytes with |
136 | | // |ConsumeBytes|. |
137 | | std::string ConsumeRandomLengthString(size_t max_length) |
138 | 4.38k | { |
139 | | // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", |
140 | | // and maps "\" followed by anything else to the end of the |
141 | | // string. As a result of this logic, a fuzzer can insert |
142 | | // characters into the string, and the string will be lengthened |
143 | | // to include those new characters, resulting in a more stable |
144 | | // fuzzer than picking the length of a string independently from |
145 | | // picking its contents. |
146 | 4.38k | std::string result; |
147 | | |
148 | | // Reserve the anticipated capaticity to prevent several |
149 | | // reallocations. |
150 | 4.38k | result.reserve(std::min(max_length, remaining_bytes_)); |
151 | 42.7M | for (size_t i = 0; i < max_length && remaining_bytes_ != 0; |
152 | 42.7M | ++i) { |
153 | 42.7M | char next = ConvertUnsignedToSigned<char>(data_ptr_[0]); |
154 | 42.7M | Advance(1); |
155 | 42.7M | if (next == '\\' && remaining_bytes_ != 0) { |
156 | 4.75k | next = |
157 | 4.75k | ConvertUnsignedToSigned<char>(data_ptr_[0]); |
158 | 4.75k | Advance(1); |
159 | 4.75k | if (next != '\\') |
160 | 3.78k | break; |
161 | 4.75k | } |
162 | 42.7M | result += next; |
163 | 42.7M | } |
164 | | |
165 | 4.38k | result.shrink_to_fit(); |
166 | 4.38k | return result; |
167 | 4.38k | } |
168 | | |
169 | | // Returns a std::vector containing all remaining bytes of the input |
170 | | // data. |
171 | | template <typename T> std::vector<T> ConsumeRemainingBytes() |
172 | | { |
173 | | return ConsumeBytes<T>(remaining_bytes_); |
174 | | } |
175 | | |
176 | | // Returns a std::string containing all remaining bytes of the input |
177 | | // data. Prefer using |ConsumeRemainingBytes| unless you actually need a |
178 | | // std::string object. |
179 | | std::string ConsumeRemainingBytesAsString() |
180 | 4.67k | { |
181 | 4.67k | return ConsumeBytesAsString(remaining_bytes_); |
182 | 4.67k | } |
183 | | |
184 | | // Returns a number in the range [Type's min, Type's max]. The value |
185 | | // might not be uniformly distributed in the given range. If there's no |
186 | | // input data left, always returns |min|. |
187 | | template <typename T> T ConsumeIntegral() |
188 | 364 | { |
189 | 364 | return ConsumeIntegralInRange(std::numeric_limits<T>::min(), |
190 | 364 | std::numeric_limits<T>::max()); |
191 | 364 | } |
192 | | |
193 | | // Reads one byte and returns a bool, or false when no data remains. |
194 | | bool ConsumeBool() |
195 | 364 | { |
196 | 364 | return 1 & ConsumeIntegral<uint8_t>(); |
197 | 364 | } |
198 | | |
199 | | // Returns a copy of the value selected from the given fixed-size |
200 | | // |array|. |
201 | | template <typename T, size_t size> |
202 | | T PickValueInArray(const T (&array)[size]) |
203 | | { |
204 | | static_assert(size > 0, "The array must be non empty."); |
205 | | return array[ConsumeIntegralInRange<size_t>(0, size - 1)]; |
206 | | } |
207 | | |
208 | | template <typename T> |
209 | | T PickValueInArray(std::initializer_list<const T> list) |
210 | | { |
211 | | // TODO(Dor1s): switch to static_assert once C++14 is allowed. |
212 | | if (!list.size()) |
213 | | abort(); |
214 | | |
215 | | return *(list.begin() + |
216 | | ConsumeIntegralInRange<size_t>(0, list.size() - 1)); |
217 | | } |
218 | | |
219 | | // Returns an enum value. The enum must start at 0 and be contiguous. It |
220 | | // must also contain |kMaxValue| aliased to its largest (inclusive) |
221 | | // value. Such as: enum class Foo { SomeValue, OtherValue, kMaxValue = |
222 | | // OtherValue }; |
223 | | template <typename T> T ConsumeEnum() |
224 | | { |
225 | | static_assert(std::is_enum<T>::value, |
226 | | "|T| must be an enum type."); |
227 | | return static_cast<T>(ConsumeIntegralInRange<uint32_t>( |
228 | | 0, static_cast<uint32_t>(T::kMaxValue))); |
229 | | } |
230 | | |
231 | | // Returns a floating point number in the range [0.0, 1.0]. If there's |
232 | | // no input data left, always returns 0. |
233 | | template <typename T> T ConsumeProbability() |
234 | | { |
235 | | static_assert(std::is_floating_point<T>::value, |
236 | | "A floating point type is required."); |
237 | | |
238 | | // Use different integral types for different floating point |
239 | | // types in order to provide better density of the resulting |
240 | | // values. |
241 | | using IntegralType = |
242 | | typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), |
243 | | uint32_t, uint64_t>::type; |
244 | | |
245 | | T result = static_cast<T>(ConsumeIntegral<IntegralType>()); |
246 | | result /= |
247 | | static_cast<T>(std::numeric_limits<IntegralType>::max()); |
248 | | return result; |
249 | | } |
250 | | |
251 | | // Returns a floating point value in the range [Type's lowest, Type's |
252 | | // max] by consuming bytes from the input data. If there's no input data |
253 | | // left, always returns approximately 0. |
254 | | template <typename T> T ConsumeFloatingPoint() |
255 | | { |
256 | | return ConsumeFloatingPointInRange<T>( |
257 | | std::numeric_limits<T>::lowest(), |
258 | | std::numeric_limits<T>::max()); |
259 | | } |
260 | | |
261 | | // Returns a floating point value in the given range by consuming bytes |
262 | | // from the input data. If there's no input data left, returns |min|. |
263 | | // Note that |min| must be less than or equal to |max|. |
264 | | template <typename T> T ConsumeFloatingPointInRange(T min, T max) |
265 | | { |
266 | | if (min > max) |
267 | | abort(); |
268 | | |
269 | | T range = .0; |
270 | | T result = min; |
271 | | constexpr T zero(.0); |
272 | | if (max > zero && min < zero && |
273 | | max > min + std::numeric_limits<T>::max()) { |
274 | | // The diff |max - min| would overflow the given |
275 | | // floating point type. Use the half of the diff as the |
276 | | // range and consume a bool to decide whether the result |
277 | | // is in the first of the second part of the diff. |
278 | | range = (max / 2.0) - (min / 2.0); |
279 | | if (ConsumeBool()) { |
280 | | result += range; |
281 | | } |
282 | | } else { |
283 | | range = max - min; |
284 | | } |
285 | | |
286 | | return result + range * ConsumeProbability<T>(); |
287 | | } |
288 | | |
289 | | // Reports the remaining bytes available for fuzzed input. |
290 | | size_t remaining_bytes() |
291 | 0 | { |
292 | 0 | return remaining_bytes_; |
293 | 0 | } |
294 | | |
295 | | private: |
296 | | FuzzedDataProvider(const FuzzedDataProvider &) = delete; |
297 | | FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; |
298 | | |
299 | | void Advance(size_t num_bytes) |
300 | 42.7M | { |
301 | 42.7M | if (num_bytes > remaining_bytes_) |
302 | 0 | abort(); |
303 | | |
304 | 42.7M | data_ptr_ += num_bytes; |
305 | 42.7M | remaining_bytes_ -= num_bytes; |
306 | 42.7M | } |
307 | | |
308 | | template <typename T> |
309 | | std::vector<T> ConsumeBytes(size_t size, size_t num_bytes_to_consume) |
310 | | { |
311 | | static_assert(sizeof(T) == sizeof(uint8_t), |
312 | | "Incompatible data type."); |
313 | | |
314 | | // The point of using the size-based constructor below is to |
315 | | // increase the odds of having a vector object with capacity |
316 | | // being equal to the length. That part is always implementation |
317 | | // specific, but at least both libc++ and libstdc++ allocate the |
318 | | // requested number of bytes in that constructor, which seems to |
319 | | // be a natural choice for other implementations as well. To |
320 | | // increase the odds even more, we also call |shrink_to_fit| |
321 | | // below. |
322 | | std::vector<T> result(size); |
323 | | if (size == 0) { |
324 | | if (num_bytes_to_consume != 0) |
325 | | abort(); |
326 | | return result; |
327 | | } |
328 | | |
329 | | std::memcpy(result.data(), data_ptr_, num_bytes_to_consume); |
330 | | Advance(num_bytes_to_consume); |
331 | | |
332 | | // Even though |shrink_to_fit| is also implementation specific, |
333 | | // we expect it to provide an additional assurance in case |
334 | | // vector's constructor allocated a buffer which is larger than |
335 | | // the actual amount of data we put inside it. |
336 | | result.shrink_to_fit(); |
337 | | return result; |
338 | | } |
339 | | |
340 | | template <typename TS, typename TU> TS ConvertUnsignedToSigned(TU value) |
341 | 42.7M | { |
342 | 42.7M | static_assert(sizeof(TS) == sizeof(TU), |
343 | 42.7M | "Incompatible data types."); |
344 | 42.7M | static_assert(!std::numeric_limits<TU>::is_signed, |
345 | 42.7M | "Source type must be unsigned."); |
346 | | |
347 | | // TODO(Dor1s): change to `if constexpr` once C++17 becomes |
348 | | // mainstream. |
349 | 42.7M | if (std::numeric_limits<TS>::is_modulo) |
350 | 0 | return static_cast<TS>(value); |
351 | | |
352 | | // Avoid using implementation-defined unsigned to signer |
353 | | // conversions. To learn more, see |
354 | | // https://stackoverflow.com/questions/13150449. |
355 | 42.7M | if (value <= std::numeric_limits<TS>::max()) { |
356 | 41.0M | return static_cast<TS>(value); |
357 | 41.0M | } else { |
358 | 1.67M | constexpr auto TS_min = std::numeric_limits<TS>::min(); |
359 | 1.67M | return TS_min + static_cast<char>(value - TS_min); |
360 | 1.67M | } |
361 | 42.7M | } |
362 | | |
363 | | const uint8_t *data_ptr_; |
364 | | size_t remaining_bytes_; |
365 | | }; |
366 | | |
367 | | #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ |
368 | | // no-check-code since this is from a third party |