# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import enum
import hashlib
import heapq
import math
import sys
from collections import OrderedDict, abc
from collections.abc import Sequence
from functools import lru_cache
from typing import TYPE_CHECKING, Optional, TypeVar, Union

from hypothesis.errors import InvalidArgument
from hypothesis.internal.compat import int_from_bytes
from hypothesis.internal.floats import next_up

if TYPE_CHECKING:
    from hypothesis.internal.conjecture.data import ConjectureData


LABEL_MASK = 2**64 - 1


def calc_label_from_name(name: str) -> int:
    hashed = hashlib.sha384(name.encode()).digest()
    return int_from_bytes(hashed[:8])


def calc_label_from_cls(cls: type) -> int:
    return calc_label_from_name(cls.__qualname__)


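# Note: str and bytes hashes are randomized per-process unless PYTHONHASHSEED
# is fixed, so the label below is only stable across runs for objects whose
# hash() is deterministic (ints, and tuples of them, for example).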
def calc_label_from_hash(obj: object) -> int:
    return calc_label_from_name(str(hash(obj)))


def combine_labels(*labels: int) -> int:
    label = 0
    for l in labels:
        label = (label << 1) & LABEL_MASK
        label ^= l
    return label
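

# A quick illustration of the fold above (a sketch, not part of the API):
#
#     >>> a = calc_label_from_name("integers()")
#     >>> b = calc_label_from_name("lists()")
#     >>> combine_labels(a, b) == (((a << 1) & LABEL_MASK) ^ b)
#     True
#
# Because each partial result is shifted before the next label is XOR-ed in,
# the combination is order-sensitive: combine_labels(a, b) and
# combine_labels(b, a) differ in general.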


SAMPLE_IN_SAMPLER_LABEL = calc_label_from_name("a sample() in Sampler")
ONE_FROM_MANY_LABEL = calc_label_from_name("one more from many()")


T = TypeVar("T")


def identity(v: T) -> T:
    return v


def check_sample(
    values: Union[type[enum.Enum], Sequence[T]], strategy_name: str
) -> Sequence[T]:
    if "numpy" in sys.modules and isinstance(values, sys.modules["numpy"].ndarray):
        if values.ndim != 1:
            raise InvalidArgument(
                "Only one-dimensional arrays are supported for sampling, "
                f"and the given value has {values.ndim} dimensions (shape "
                f"{values.shape}). This array would give samples of array slices "
                "instead of elements! Use np.ravel(values) to convert "
                "to a one-dimensional array, or tuple(values) if you "
                "want to sample slices."
            )
    elif not isinstance(values, (OrderedDict, abc.Sequence, enum.EnumMeta)):
        raise InvalidArgument(
            f"Cannot sample from {values!r} because it is not an ordered collection. "
            f"Hypothesis goes to some length to ensure that the {strategy_name} "
            "strategy has stable results between runs. To replay a saved "
            "example, the sampled values must have the same iteration order "
            "on every run - ruling out sets, dicts, etc. due to hash "
            "randomization. Most cases can simply use `sorted(values)`, but "
            "mixed types or special values such as math.nan require careful "
            "handling - and note that when simplifying an example, "
            "Hypothesis treats earlier values as simpler."
        )
    if isinstance(values, range):
        return values
    return tuple(values)
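

# A few illustrative cases (a sketch of the behavior above, not a test suite):
#
#     >>> check_sample(range(10), "sampled_from")  # ranges pass through as-is
#     range(0, 10)
#     >>> check_sample([3, 1, 2], "sampled_from")  # other sequences become tuples
#     (3, 1, 2)
#     >>> check_sample({3, 1, 2}, "sampled_from")  # sets are rejected
#     Traceback (most recent call last):
#         ...
#     hypothesis.errors.InvalidArgument: Cannot sample from {1, 2, 3} ...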


@lru_cache(64)
def compute_sampler_table(weights: tuple[float, ...]) -> list[tuple[int, int, float]]:
    n = len(weights)
    table: list[list[int | float | None]] = [[i, None, None] for i in range(n)]
    total = sum(weights)
    num_type = type(total)

    zero = num_type(0)  # type: ignore
    one = num_type(1)  # type: ignore

    small: list[int] = []
    large: list[int] = []

    probabilities = [w / total for w in weights]
    scaled_probabilities: list[float] = []

    for i, alternate_chance in enumerate(probabilities):
        scaled = alternate_chance * n
        scaled_probabilities.append(scaled)
        if scaled == 1:
            table[i][2] = zero
        elif scaled < 1:
            small.append(i)
        else:
            large.append(i)
    heapq.heapify(small)
    heapq.heapify(large)

    while small and large:
        lo = heapq.heappop(small)
        hi = heapq.heappop(large)

        assert lo != hi
        assert scaled_probabilities[hi] > one
        assert table[lo][1] is None
        table[lo][1] = hi
        table[lo][2] = one - scaled_probabilities[lo]
        scaled_probabilities[hi] = (
            scaled_probabilities[hi] + scaled_probabilities[lo]
        ) - one

        if scaled_probabilities[hi] < 1:
            heapq.heappush(small, hi)
        elif scaled_probabilities[hi] == 1:
            table[hi][2] = zero
        else:
            heapq.heappush(large, hi)
    while large:
        table[large.pop()][2] = zero
    while small:
        table[small.pop()][2] = zero

    new_table: list[tuple[int, int, float]] = []
    for base, alternate, alternate_chance in table:
        assert isinstance(base, int)
        assert isinstance(alternate, int) or alternate is None
        assert alternate_chance is not None
        if alternate is None:
            new_table.append((base, base, alternate_chance))
        elif alternate < base:
            new_table.append((alternate, base, one - alternate_chance))
        else:
            new_table.append((base, alternate, alternate_chance))
    new_table.sort()
    return new_table
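

# For intuition, a small worked example (values computed by hand; these
# particular floats happen to be exact in binary, but don't rely on that
# in general):
#
#     >>> compute_sampler_table((1.0, 3.0))
#     [(0, 1, 0.5), (1, 1, 0.0)]
#
# Picking a row uniformly and then taking `alternate` with probability
# `alternate_chance` (else `base`) recovers the 1:3 weighting:
# P(0) = 0.5 * 0.5 = 0.25 and P(1) = 0.5 * 0.5 + 0.5 * 1.0 = 0.75.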


class Sampler:
    """Sampler based on Vose's algorithm for the alias method. See
    http://www.keithschwarz.com/darts-dice-coins/ for a good explanation.

    The general idea is that we store a table of triples (base, alternate, p).
    We then pick a triple uniformly at random, and choose its alternate value
    with probability p, and its base value otherwise. The triples are chosen
    so that the resulting mixture has the right distribution.

    We maintain the following invariants to try to produce good shrinks:

    1. The table is in lexicographic (base, alternate) order, so that choosing
       an earlier triple in the list always lowers (or at least leaves
       unchanged) the chosen value.
    2. base[i] < alternate[i], so that shrinking the draw always results in
       shrinking the chosen element.
    """

    table: list[tuple[int, int, float]]  # (base_idx, alt_idx, alt_chance)

    def __init__(self, weights: Sequence[float], *, observe: bool = True):
        self.observe = observe
        self.table = compute_sampler_table(tuple(weights))

    def sample(
        self,
        data: "ConjectureData",
        *,
        forced: Optional[int] = None,
    ) -> int:
        if self.observe:
            data.start_span(SAMPLE_IN_SAMPLER_LABEL)
        forced_choice = (  # pragma: no branch # https://github.com/nedbat/coveragepy/issues/1617
            None
            if forced is None
            else next(
                (base, alternate, alternate_chance)
                for (base, alternate, alternate_chance) in self.table
                if forced == base or (forced == alternate and alternate_chance > 0)
            )
        )
        base, alternate, alternate_chance = data.choice(
            self.table,
            forced=forced_choice,
            observe=self.observe,
        )
        forced_use_alternate = None
        if forced is not None:
            # We maintain this invariant when picking forced_choice above.
            # The song and dance about alternate_chance > 0 is to avoid forcing
            # e.g. draw_boolean(p=0, forced=True), which is an error.
            forced_use_alternate = forced == alternate and alternate_chance > 0
            assert forced == base or forced_use_alternate

        use_alternate = data.draw_boolean(
            alternate_chance,
            forced=forced_use_alternate,
            observe=self.observe,
        )
        if self.observe:
            data.stop_span()
        if use_alternate:
            assert forced is None or alternate == forced, (forced, alternate)
            return alternate
        else:
            assert forced is None or base == forced, (forced, base)
            return base


INT_SIZES = (8, 16, 32, 64, 128)
INT_SIZES_SAMPLER = Sampler((4.0, 8.0, 1.0, 1.0, 0.5), observe=False)
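# INT_SIZES_SAMPLER draws an index into INT_SIZES, so with the 4:8:1:1:0.5
# weighting above, 16-bit sizes are the most common choice and 128-bit the
# rarest. A hedged sketch of the usage pattern (`data` would be a live
# ConjectureData during generation, and the follow-up draw is hypothetical,
# so this isn't runnable standalone):
#
#     bits = INT_SIZES[INT_SIZES_SAMPLER.sample(data)]
#     value = data.draw_integer(0, 2**bits - 1)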


class many:
    """Utility class for collections. Bundles up the logic we use for "should I
    keep drawing more values?" and handles starting and stopping examples in
    the right place.

    Intended usage is something like:

        elements = many(data, ...)
        while elements.more():
            add_stuff_to_result()
    """

    def __init__(
        self,
        data: "ConjectureData",
        min_size: int,
        max_size: Union[int, float],
        average_size: Union[int, float],
        *,
        forced: Optional[int] = None,
        observe: bool = True,
    ) -> None:
        assert 0 <= min_size <= average_size <= max_size
        assert forced is None or min_size <= forced <= max_size
        self.min_size = min_size
        self.max_size = max_size
        self.data = data
        self.forced_size = forced
        self.p_continue = _calc_p_continue(average_size - min_size, max_size - min_size)
        self.count = 0
        self.rejections = 0
        self.drawn = False
        self.force_stop = False
        self.rejected = False
        self.observe = observe

    def stop_span(self):
        if self.observe:
            self.data.stop_span()

    def start_span(self, label):
        if self.observe:
            self.data.start_span(label)

    def more(self) -> bool:
        """Should I draw another element to add to the collection?"""
        if self.drawn:
            self.stop_span()

        self.drawn = True
        self.rejected = False

        self.start_span(ONE_FROM_MANY_LABEL)
        if self.min_size == self.max_size:
            # if we have to hit an exact size, draw unconditionally until that
            # point, and no further.
            should_continue = self.count < self.min_size
        else:
            forced_result = None
            if self.force_stop:
                # if our size is forced, we can't reject in a way that would
                # cause us to differ from the forced size.
                assert self.forced_size is None or self.count == self.forced_size
                forced_result = False
            elif self.count < self.min_size:
                forced_result = True
            elif self.count >= self.max_size:
                forced_result = False
            elif self.forced_size is not None:
                forced_result = self.count < self.forced_size
            should_continue = self.data.draw_boolean(
                self.p_continue,
                forced=forced_result,
                observe=self.observe,
            )

        if should_continue:
            self.count += 1
            return True
        else:
            self.stop_span()
            return False

    def reject(self, why: Optional[str] = None) -> None:
        """Reject the last example (i.e. don't count it towards our budget of
        elements because it's not going to go in the final collection)."""
        assert self.count > 0
        self.count -= 1
        self.rejections += 1
        self.rejected = True
        # We set a minimum number of rejections before we give up to avoid
        # failing too fast when we reject the first draw.
        if self.rejections > max(3, 2 * self.count):
            if self.count < self.min_size:
                self.data.mark_invalid(why)
            else:
                self.force_stop = True


SMALLEST_POSITIVE_FLOAT: float = next_up(0.0) or sys.float_info.min


@lru_cache
def _calc_p_continue(desired_avg: float, max_size: Union[int, float]) -> float:
    """Return the p_continue which will generate the desired average size."""
    assert desired_avg <= max_size, (desired_avg, max_size)
    if desired_avg == max_size:
        return 1.0
    p_continue = 1 - 1.0 / (1 + desired_avg)
    if p_continue == 0 or max_size == math.inf:
        assert 0 <= p_continue < 1, p_continue
        return p_continue
    assert 0 < p_continue < 1, p_continue
    # For small max_size, the infinite-series p_continue is a poor approximation,
    # and while we can't solve the polynomial, a few rounds of iteration quickly
    # get us a good approximate solution in almost all cases (sometimes exact!).
    while _p_continue_to_avg(p_continue, max_size) > desired_avg:
        # This is impossible over the reals, but *can* happen with floats.
        p_continue -= 0.0001
        # If we've reached zero or gone negative, we want to break out of this
        # loop, and do so even if we're on a system with the unsafe
        # denormals-are-zero flag. We make that an explicit error in st.floats(),
        # but here we'd prefer to just get somewhat worse precision on
        # collection lengths.
        if p_continue < SMALLEST_POSITIVE_FLOAT:
            p_continue = SMALLEST_POSITIVE_FLOAT
            break
    # Let's binary-search our way to a better estimate! We tried fancier options
    # like gradient descent, but this is numerically stable and works better.
    hi = 1.0
    while desired_avg - _p_continue_to_avg(p_continue, max_size) > 0.01:
        assert 0 < p_continue < hi, (p_continue, hi)
        mid = (p_continue + hi) / 2
        if _p_continue_to_avg(mid, max_size) <= desired_avg:
            p_continue = mid
        else:
            hi = mid
    assert 0 < p_continue < 1, p_continue
    assert _p_continue_to_avg(p_continue, max_size) <= desired_avg
    return p_continue


def _p_continue_to_avg(p_continue: float, max_size: Union[int, float]) -> float:
    """Return the average_size generated by this p_continue and max_size."""
    if p_continue >= 1:
        return max_size
    return (1.0 / (1 - p_continue) - 1) * (1 - p_continue**max_size)
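

# Sanity check of the round trip (illustrative; values chosen so the math is
# exact): with no size cap, a geometric process that continues with p = 0.5
# produces on average 1 / (1 - 0.5) - 1 = 1 extra element, and the solver
# inverts that directly:
#
#     >>> _p_continue_to_avg(0.5, math.inf)
#     1.0
#     >>> _calc_p_continue(1, math.inf)
#     0.5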