# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

from collections.abc import Iterator
from enum import Enum
from random import Random
from typing import TYPE_CHECKING, Callable, Optional, Union

from sortedcontainers import SortedList

from hypothesis.internal.conjecture.choice import choices_key
from hypothesis.internal.conjecture.data import (
    ConjectureData,
    ConjectureResult,
    Status,
    _Overrun,
)
from hypothesis.internal.conjecture.junkdrawer import LazySequenceCopy
from hypothesis.internal.conjecture.shrinker import sort_key

NO_SCORE = float("-inf")

if TYPE_CHECKING:
    from hypothesis.internal.conjecture.engine import ConjectureRunner


class DominanceRelation(Enum):
    NO_DOMINANCE = 0
    EQUAL = 1
    LEFT_DOMINATES = 2
    RIGHT_DOMINATES = 3


def dominance(left: ConjectureResult, right: ConjectureResult) -> DominanceRelation:
42 """Returns the dominance relation between ``left`` and ``right``, according
43 to the rules that one ConjectureResult dominates another if and only if it
44 is better in every way.
45
46 The things we currently consider to be "better" are:
47
48 * Something that is smaller in shrinking order is better.
49 * Something that has higher status is better.
50 * Each ``interesting_origin`` is treated as its own score, so if two
51 interesting examples have different origins then neither dominates
52 the other.
53 * For each target observation, a higher score is better.
54
55 In "normal" operation where there are no bugs or target observations, the
56 pareto front only has one element (the smallest valid test case), but for
57 more structured or failing tests it can be useful to track, and future work
58 will depend on it more."""
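    # A hedged, purely illustrative sketch of how these rules combine.
    # ``small`` and ``large`` are hypothetical ConjectureResults (not defined
    # in this module) with sort_key(small.nodes) < sort_key(large.nodes) and
    # identical status, tags, and interesting_origin:
    #
    #   small.target_observations == {"hits": 3.0}
    #   large.target_observations == {"hits": 2.0}
    #   dominance(small, large)  # LEFT_DOMINATES: smaller, and no worse on any score
    #
    #   large.target_observations == {"hits": 5.0}
    #   dominance(small, large)  # NO_DOMINANCE: each is better on one axis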
    left_key = sort_key(left.nodes)
    right_key = sort_key(right.nodes)
    if left_key == right_key:
        return DominanceRelation.EQUAL

    if right_key < left_key:
        result = dominance(left=right, right=left)
        if result == DominanceRelation.LEFT_DOMINATES:
            return DominanceRelation.RIGHT_DOMINATES
        else:
            # Because we have sort_key(left) < sort_key(right) the only options
            # are that right is better than left or that the two are
            # incomparable.
            assert result == DominanceRelation.NO_DOMINANCE
            return result

    # Either left is better or there is no dominance relationship.
    assert left_key < right_key

    # The right is more interesting
    if left.status < right.status:
        return DominanceRelation.NO_DOMINANCE

    if not right.tags.issubset(left.tags):
        return DominanceRelation.NO_DOMINANCE

    # Things that are interesting for different reasons are incomparable in
    # the dominance relationship.
    if (
        left.status == Status.INTERESTING
        and right.interesting_origin is not None
        and left.interesting_origin != right.interesting_origin
    ):
        return DominanceRelation.NO_DOMINANCE

    for target in set(left.target_observations) | set(right.target_observations):
        left_score = left.target_observations.get(target, NO_SCORE)
        right_score = right.target_observations.get(target, NO_SCORE)
        if right_score > left_score:
            return DominanceRelation.NO_DOMINANCE

    return DominanceRelation.LEFT_DOMINATES


class ParetoFront:
105 """Maintains an approximate pareto front of ConjectureData objects. That
106 is, we try to maintain a collection of objects such that no element of the
107 collection is pareto dominated by any other. In practice we don't quite
108 manage that, because doing so is computationally very expensive. Instead
109 we maintain a random sample of data objects that are "rarely" dominated by
110 any other element of the collection (roughly, no more than about 10%).
111
112 Only valid test cases are considered to belong to the pareto front - any
113 test case with a status less than valid is discarded.
114
115 Note that the pareto front is potentially quite large, and currently this
116 will store the entire front in memory. This is bounded by the number of
117 valid examples we run, which is max_examples in normal execution, and
118 currently we do not support workflows with large max_examples which have
119 large values of max_examples very well anyway, so this isn't a major issue.
120 In future we may weish to implement some sort of paging out to disk so that
121 we can work with larger fronts.
122
123 Additionally, because this is only an approximate pareto front, there are
124 scenarios where it can be much larger than the actual pareto front. There
125 isn't a huge amount we can do about this - checking an exact pareto front
126 is intrinsically quadratic.
127
128 "Most" of the time we should be relatively close to the true pareto front,
129 say within an order of magnitude, but it's not hard to construct scenarios
130 where this is not the case. e.g. suppose we enumerate all valid test cases
131 in increasing shortlex order as s_1, ..., s_n, ... and have scores f and
132 g such that f(s_i) = min(i, N) and g(s_i) = 1 if i >= N, then the pareto
133 front is the set {s_1, ..., S_N}, but the only element of the front that
134 will dominate s_i when i > N is S_N, which we select with probability
135 1 / N. A better data structure could solve this, but at the cost of more
136 expensive operations and higher per element memory use, so we'll wait to
137 see how much of a problem this is in practice before we try that.
138 """
    def __init__(self, random: Random) -> None:
        self.__random = random
        self.__eviction_listeners: list[Callable[[ConjectureResult], None]] = []

        self.front: SortedList[ConjectureResult] = SortedList(
            key=lambda d: sort_key(d.nodes)
        )
        self.__pending: Optional[ConjectureResult] = None

    def add(self, data: Union[ConjectureData, ConjectureResult, _Overrun]) -> bool:
150 """Attempts to add ``data`` to the pareto front. Returns True if
151 ``data`` is now in the front, including if data is already in the
152 collection, and False otherwise"""
        if data.status < Status.VALID:
            return False

        assert not isinstance(data, _Overrun)
        data = data.as_result()
        assert not isinstance(data, _Overrun)

        if not self.front:
            self.front.add(data)
            return True

        if data in self.front:
            return True

        # We add data to the pareto front by adding it unconditionally and then
        # doing a certain amount of randomized "clear down" - testing a random
        # set of elements (currently 10) to see if they are dominated by
        # something else in the collection. If they are, we remove them.
        self.front.add(data)
        assert self.__pending is None
        try:
            self.__pending = data

            # We maintain a set of the current exact pareto front of the
            # values we've sampled so far. When we sample a new element we
            # either add it to this exact pareto front or remove it from the
            # collection entirely.
            front = LazySequenceCopy(self.front)

            # We track which values we are going to remove and remove them all
            # at the end so the shape of the front doesn't change while we're
            # using it.
            to_remove: list[ConjectureResult] = []

            # We now iteratively sample elements from the approximate pareto
            # front to check whether they should be retained. When the set of
            # dominators gets too large we have sampled at least 10 elements
            # and it gets too expensive to continue, so we consider that enough
            # due diligence.
            i = self.front.index(data)

            # First we attempt to look for values that must be removed by the
            # addition of the data. These are necessarily to the right of it
            # in the list.

            failures = 0
            while i + 1 < len(front) and failures < 10:
                j = self.__random.randrange(i + 1, len(front))
                candidate = front.pop(j)
                dom = dominance(data, candidate)
                assert dom != DominanceRelation.RIGHT_DOMINATES
                if dom == DominanceRelation.LEFT_DOMINATES:
                    to_remove.append(candidate)
                    failures = 0
                else:
                    failures += 1

            # Now we look at the points up to where we put data in to see if
            # it is dominated. While we're here we spend some time looking for
            # anything else that might be dominated too, compacting down parts
            # of the list.

            dominators = [data]

            while i >= 0 and len(dominators) < 10:
                front.swap(i, self.__random.randint(0, i))

                candidate = front[i]

                already_replaced = False
                j = 0
                while j < len(dominators):
                    v = dominators[j]

                    dom = dominance(candidate, v)
                    if dom == DominanceRelation.LEFT_DOMINATES:
                        if not already_replaced:
                            already_replaced = True
                            dominators[j] = candidate
                            j += 1
                        else:  # pragma: no cover  # flaky, by test_database_contains_only_pareto_front
                            dominators[j], dominators[-1] = (
                                dominators[-1],
                                dominators[j],
                            )
                            dominators.pop()
                        to_remove.append(v)
                    elif dom == DominanceRelation.RIGHT_DOMINATES:
                        to_remove.append(candidate)
                        break
                    elif dom == DominanceRelation.EQUAL:
                        break
                    else:
                        j += 1
                else:
                    dominators.append(candidate)
                i -= 1

            for v in to_remove:
                self._remove(v)
            return data in self.front
        finally:
            self.__pending = None

    def on_evict(self, f: Callable[[ConjectureResult], None]) -> None:
258 """Register a listener function that will be called with data when it
259 gets removed from the front because something else dominates it."""
260 self.__eviction_listeners.append(f)
261
262 def __contains__(self, data: object) -> bool:
263 if not isinstance(data, (ConjectureData, ConjectureResult)):
264 return False
265
266 result = data.as_result()
267 if isinstance(result, _Overrun):
268 return False
269
270 return result in self.front
271
272 def __iter__(self) -> Iterator[ConjectureResult]:
273 return iter(self.front)
274
275 def __getitem__(self, i: int) -> ConjectureResult:
276 return self.front[i]
277
278 def __len__(self) -> int:
279 return len(self.front)
280
281 def _remove(self, data: ConjectureResult) -> None:
282 try:
283 self.front.remove(data)
284 except ValueError:
285 return
286 if data is not self.__pending:
287 for f in self.__eviction_listeners:
288 f(data)
289
290
291class ParetoOptimiser:
292 """Class for managing optimisation of the pareto front. That is, given the
293 current best known pareto front, this class runs an optimisation process
294 that attempts to bring it closer to the actual pareto front.
295
296 Currently this is fairly basic and only handles pareto optimisation that
297 works by reducing the test case in the shortlex order. We expect it will
298 grow more powerful over time.
299 """
    def __init__(self, engine: "ConjectureRunner") -> None:
        self.__engine = engine
        assert self.__engine.pareto_front is not None
        self.front: ParetoFront = self.__engine.pareto_front

    def run(self) -> None:
        seen = set()

        # We iterate backwards through the pareto front, using the shrinker to
        # (hopefully) replace each example with a smaller one. Note that it's
        # important that we start from the end for two reasons: Firstly, by
        # doing it this way we ensure that any new front members we discover
        # during optimisation will also get optimised (because they will be
        # inserted into the part of the front that we haven't visited yet),
        # and secondly we generally expect that we will not finish this process
        # in a single run, because it's relatively expensive in terms of our
        # example budget, and by starting from the end we ensure that each time
        # we run the tests we improve the pareto front because we work on the
        # bits that we haven't covered yet.
        i = len(self.front) - 1
        prev = None
        while i >= 0 and not self.__engine.interesting_examples:
            assert self.front
            i = min(i, len(self.front) - 1)
            target = self.front[i]
            if choices_key(target.choices) in seen:
                i -= 1
                continue
            assert target is not prev
            prev = target

            def allow_transition(source, destination):
                """Shrink to data that strictly pareto dominates the current
                best value we've seen, which is the current target of the
                shrinker.

                Note that during shrinking we may discover other smaller
                examples that this function will reject and will get added to
                the front. This is fine, because they will be processed on
                later iterations of this loop."""
                if dominance(destination, source) == DominanceRelation.LEFT_DOMINATES:
                    # If ``destination`` dominates ``source`` then ``source``
                    # must be dominated in the front - either ``destination`` is in
                    # the front, or it was not added to it because it was
                    # dominated by something in it.
                    self.front._remove(source)
                    return True
                return False

            shrunk = self.__engine.shrink(target, allow_transition=allow_transition)
            seen.add(choices_key(shrunk.choices))

            # Note that the front may have changed shape arbitrarily when
            # we ran the shrinker. If it didn't change shape then this is
            # i - 1. If it did change shape then this is the largest value
            # in the front which is smaller than the previous target, so
            # is the correct place to resume from. In particular note that the
            # size of the front might have grown because of slippage during the
            # shrink, but all of the newly introduced elements will be smaller
            # than `target`, so will be covered by this iteration.
            i = self.front.front.bisect_left(target)