Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/hypothesis/internal/conjecture/datatree.py: 19%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

416 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import math 

12from collections.abc import Generator 

13from random import Random 

14from typing import TYPE_CHECKING, AbstractSet, Final, Optional, Union, cast 

15 

16import attr 

17 

18from hypothesis.errors import ( 

19 FlakyReplay, 

20 FlakyStrategyDefinition, 

21 HypothesisException, 

22 StopTest, 

23) 

24from hypothesis.internal import floats as flt 

25from hypothesis.internal.conjecture.choice import ( 

26 BooleanConstraints, 

27 BytesConstraints, 

28 ChoiceConstraintsT, 

29 ChoiceT, 

30 ChoiceTypeT, 

31 FloatConstraints, 

32 IntegerConstraints, 

33 StringConstraints, 

34 choice_from_index, 

35) 

36from hypothesis.internal.conjecture.data import ConjectureData, DataObserver, Status 

37from hypothesis.internal.escalation import InterestingOrigin 

38from hypothesis.internal.floats import ( 

39 count_between_floats, 

40 float_to_int, 

41 int_to_float, 

42 sign_aware_lte, 

43) 

44 

45if TYPE_CHECKING: 

46 from typing import TypeAlias 

47 

48 from hypothesis.vendor.pretty import RepresentationPrinter 

49 

# Value type for DataTree._children_cache: the generator producing a node's
# remaining children, plus the bookkeeping collections used while iterating
# it (see DataTree._get_children_cache for how each element is used).
ChildrenCacheValueT: "TypeAlias" = tuple[
    Generator[ChoiceT, None, None], list[ChoiceT], set[ChoiceT]
]

53 

54 

class PreviouslyUnseenBehaviour(HypothesisException):
    """Raised when replaying a choice sequence reaches behaviour that this
    DataTree has never recorded (see DataTree.simulate_test_function)."""

    pass

57 

58 

# Error message used whenever a replayed draw disagrees with what the tree
# previously recorded, which indicates nondeterministic data generation.
_FLAKY_STRAT_MSG = (
    "Inconsistent data generation! Data generation behaved differently "
    "between different runs. Is your data generation depending on external "
    "state?"
)


# Shared immutable empty set returned by TreeNode.forced when nothing was
# forced; avoids allocating a fresh set on every access.
EMPTY: frozenset[int] = frozenset()

67 

68 

@attr.s(slots=True)
class Killed:
    """Represents a transition to part of the tree which has been marked as
    "killed", meaning we want to treat it as not worth exploring, so it will
    be treated as if it were completely explored for the purposes of
    exhaustion."""

    # The node the killed transition leads to. It is still followed when
    # replaying (simulate_test_function), just never chosen when generating
    # novel prefixes.
    next_node: "TreeNode" = attr.ib()

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        """Pretty-printer hook: render this transition as a simple leaf."""
        assert cycle is False
        p.text("Killed")

81 

82 

83def _node_pretty( 

84 choice_type: ChoiceTypeT, 

85 value: ChoiceT, 

86 constraints: ChoiceConstraintsT, 

87 *, 

88 forced: bool, 

89) -> str: 

90 forced_marker = " [forced]" if forced else "" 

91 return f"{choice_type} {value!r}{forced_marker} {constraints}" 

92 

93 

@attr.s(slots=True)
class Branch:
    """Represents a transition where multiple choices can be made as to what
    to draw."""

    # The constraints and choice type shared by every child of this branch
    # (all children arose from the same draw call).
    constraints: ChoiceConstraintsT = attr.ib()
    choice_type: ChoiceTypeT = attr.ib()
    # Maps each previously-drawn value to the subtree that followed it.
    # Excluded from repr because it can be very large. Floats are keyed by
    # their bit pattern (see DataTree._draw).
    children: dict[ChoiceT, "TreeNode"] = attr.ib(repr=False)

    @property
    def max_children(self) -> int:
        # Total number of distinct values this branch could ever contain;
        # used by TreeNode.check_exhausted to detect saturation.
        max_children = compute_max_children(self.choice_type, self.constraints)
        assert max_children > 0
        return max_children

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        """Pretty-print each child value followed by its indented subtree."""
        assert cycle is False
        for i, (value, child) in enumerate(self.children.items()):
            if i > 0:
                p.break_()
            p.text(
                _node_pretty(self.choice_type, value, self.constraints, forced=False)
            )
            with p.indent(2):
                p.break_()
                p.pretty(child)

120 

121 

@attr.s(slots=True, frozen=True)
class Conclusion:
    """Represents a transition to a finished state."""

    # The status the test run concluded with (e.g. Status.VALID).
    status: Status = attr.ib()
    # Where the test became interesting, if it did (otherwise None).
    interesting_origin: Optional[InterestingOrigin] = attr.ib()

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        """Render as e.g. ``Conclusion (<status>, <ExcType> at file:line)``."""
        assert cycle is False
        o = self.interesting_origin
        # avoid str(o), which can include multiple lines of context
        origin = (
            "" if o is None else f", {o.exc_type.__name__} at {o.filename}:{o.lineno}"
        )
        p.text(f"Conclusion ({self.status!r}{origin})")

138 

139# The number of max children where, beyond this, it is practically impossible 

140# for hypothesis to saturate / explore all children nodes in a reasonable time 

141# frame. We use this to bail out of expensive max children computations early, 

142# where the numbers involved are so large that we know they will be larger than 

143# this number. 

144# 

145# Note that it's ok for us to underestimate the number of max children of a node 

146# by using this. We just may think the node is exhausted when in fact it has more 

147# possible children to be explored. This has the potential to finish generation 

148# early due to exhausting the entire tree, but that is quite unlikely: (1) the 

149# number of examples would have to be quite high, and (2) the tree would have to 

150# contain only one or two nodes, or generate_novel_prefix would simply switch to 

151# exploring another non-exhausted node. 

152# 

153# Also note that we may sometimes compute max children above this value. In other 

154# words, this is *not* a hard maximum on the computed max children. It's the point 

155# where further computation is not beneficial - but sometimes doing that computation 

156# unconditionally is cheaper than estimating against this value. 

157# 

158# The one case where this may be detrimental is fuzzing, where the throughput of 

159# examples is so high that it really may saturate important nodes. We'll cross 

160# that bridge when we come to it. 

# (See the discussion above: this is a soft threshold for bailing out of
# expensive computations, not a hard cap on computed max children.)
MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 10_000_000

162 

163 

164def _count_distinct_strings(*, alphabet_size: int, min_size: int, max_size: int) -> int: 

165 # We want to estimate if we're going to have more children than 

166 # MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially 

167 # extremely expensive pow. We'll check the two extreme cases - if the 

168 # number of strings in the largest string size alone is enough to put us 

169 # over this limit (at alphabet_size >= 2), and if the variation in sizes 

170 # (at alphabet_size == 1) is enough. If neither result in an early return, 

171 # the exact result should be reasonably cheap to compute. 

172 if alphabet_size == 0: 

173 # Special-case the empty string, avoid error in math.log(0). 

174 return 1 

175 elif alphabet_size == 1: 

176 # Special-case the constant alphabet, invalid in the geom-series sum. 

177 return max_size - min_size + 1 

178 else: 

179 # Estimate against log, which is cheaper than computing a pow. 

180 # 

181 # m = max_size 

182 # a = alphabet_size 

183 # N = MAX_CHILDREN_EFFECTIVELY_INFINITE 

184 # 

185 # a**m > N 

186 # <=> m * log(a) > log(N) 

187 log_max_sized_children = max_size * math.log(alphabet_size) 

188 if log_max_sized_children > math.log(MAX_CHILDREN_EFFECTIVELY_INFINITE): 

189 return MAX_CHILDREN_EFFECTIVELY_INFINITE 

190 

191 # The sum of a geometric series is given by (ref: wikipedia): 

192 # ᵐ∑ₖ₌₀ aᵏ = (aᵐ⁺¹ - 1) / (a - 1) 

193 # = S(m) / S(0) 

194 # assuming a != 1 and using the definition 

195 # S(m) := aᵐ⁺¹ - 1. 

196 # The sum we want, starting from a number n [0 <= n <= m] rather than zero, is 

197 # ᵐ∑ₖ₌ₙ aᵏ = ᵐ∑ₖ₌₀ aᵏ - ⁿ⁻¹∑ₖ₌₀ aᵏ = S(m) / S(0) - S(n - 1) / S(0) 

198 def S(n): 

199 return alphabet_size ** (n + 1) - 1 

200 

201 return (S(max_size) - S(min_size - 1)) // S(0) 

202 

203 

def compute_max_children(
    choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT
) -> int:
    """Return the number of distinct values a draw of ``choice_type`` under
    ``constraints`` can produce.

    May be an underestimate for very large counts (see the note on
    MAX_CHILDREN_EFFECTIVELY_INFINITE above); callers use this to decide
    when a Branch is exhausted, for which underestimates are acceptable.

    Raises NotImplementedError for an unknown choice type.
    """
    if choice_type == "integer":
        constraints = cast(IntegerConstraints, constraints)
        min_value = constraints["min_value"]
        max_value = constraints["max_value"]

        if min_value is None and max_value is None:
            # full 128 bit range.
            return 2**128 - 1
        if min_value is not None and max_value is not None:
            # count between min/max value.
            return max_value - min_value + 1

        # hard case: only one bound was specified. Here we probe either upwards
        # or downwards with our full 128 bit generation, but only half of these
        # (plus one for the case of generating zero) result in a probe in the
        # direction we want. ((2**128 - 1) // 2) + 1 == 2 ** 127
        assert (min_value is None) ^ (max_value is None)
        return 2**127
    elif choice_type == "boolean":
        constraints = cast(BooleanConstraints, constraints)
        p = constraints["p"]
        # probabilities of 0 or 1 (or effectively 0 or 1) only have one choice.
        if p <= 2 ** (-64) or p >= (1 - 2 ** (-64)):
            return 1
        return 2
    elif choice_type == "bytes":
        constraints = cast(BytesConstraints, constraints)
        # bytes are just strings over the 256-character byte alphabet.
        return _count_distinct_strings(
            alphabet_size=2**8,
            min_size=constraints["min_size"],
            max_size=constraints["max_size"],
        )
    elif choice_type == "string":
        constraints = cast(StringConstraints, constraints)
        min_size = constraints["min_size"]
        max_size = constraints["max_size"]
        intervals = constraints["intervals"]

        return _count_distinct_strings(
            alphabet_size=len(intervals), min_size=min_size, max_size=max_size
        )
    elif choice_type == "float":
        constraints = cast(FloatConstraints, constraints)
        min_value_f = constraints["min_value"]
        max_value_f = constraints["max_value"]
        smallest_nonzero_magnitude = constraints["smallest_nonzero_magnitude"]

        count = count_between_floats(min_value_f, max_value_f)

        # we have two intervals:
        # a. [min_value, max_value]
        # b. [-smallest_nonzero_magnitude, smallest_nonzero_magnitude]
        #
        # which could be subsets (in either order), overlapping, or disjoint. We
        # want the interval difference a - b.

        # next_down because endpoints are ok with smallest_nonzero_magnitude
        min_point = max(min_value_f, -flt.next_down(smallest_nonzero_magnitude))
        max_point = min(max_value_f, flt.next_down(smallest_nonzero_magnitude))

        if min_point > max_point:
            # case: disjoint intervals.
            return count

        # Subtract the excluded sub-normal hole, then add back the signed
        # zeroes, which are always permitted when they lie inside [min, max].
        count -= count_between_floats(min_point, max_point)
        if sign_aware_lte(min_value_f, -0.0) and sign_aware_lte(-0.0, max_value_f):
            # account for -0.0
            count += 1
        if sign_aware_lte(min_value_f, 0.0) and sign_aware_lte(0.0, max_value_f):
            # account for 0.0
            count += 1
        return count

    raise NotImplementedError(f"unhandled choice_type {choice_type}")

281 

282 

283# In theory, this is a strict superset of the functionality of compute_max_children; 

284# 

285# assert len(all_children(choice_type, constraints)) == compute_max_children(choice_type, constraints) 

286# 

287# In practice, we maintain two distinct implementations for efficiency and space 

288# reasons. If you just need the number of children, it is cheaper to use 

289# compute_max_children than to reify the list of children (only to immediately 

290# throw it away). 

def _floats_between(a: float, b: float) -> Generator[float, None, None]:
    """Yield every representable float from ``a`` through ``b`` inclusive,
    in order of increasing bit pattern."""
    lo = float_to_int(a)
    hi = float_to_int(b)
    yield from map(int_to_float, range(lo, hi + 1))

294 

295 

def all_children(
    choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT
) -> Generator[ChoiceT, None, None]:
    """Yield every distinct possible value for a draw of ``choice_type``
    under ``constraints``, with no duplicates."""
    if choice_type != "float":
        # Non-float types have an injective index ordering, so we can simply
        # enumerate indices [0, max_children) and map each to its value.
        for index in range(compute_max_children(choice_type, constraints)):
            yield choice_from_index(index, choice_type, constraints)
    else:
        constraints = cast(FloatConstraints, constraints)
        # the float ordering is not injective (because of resampling
        # out-of-bounds values), so using choice_from_index would result in
        # duplicates. This violates invariants in datatree about being able
        # to draw unique new children using all_children.
        #
        # We instead maintain a separate implementation for floats.
        # TODO_IR write a better (bijective) ordering for floats and remove this!
        min_value = constraints["min_value"]
        max_value = constraints["max_value"]
        smallest_nonzero_magnitude = constraints["smallest_nonzero_magnitude"]

        # handle zeroes separately so smallest_nonzero_magnitude can think of
        # itself as a complete interval (instead of a hole at ±0).
        if sign_aware_lte(min_value, -0.0) and sign_aware_lte(-0.0, max_value):
            yield -0.0
        if sign_aware_lte(min_value, 0.0) and sign_aware_lte(0.0, max_value):
            yield 0.0

        if flt.is_negative(min_value):
            if flt.is_negative(max_value):
                # case: both negative.
                max_point = min(max_value, -smallest_nonzero_magnitude)
                # float_to_int increases as negative magnitude increases, so
                # invert order.
                yield from _floats_between(max_point, min_value)
            else:
                # case: straddles midpoint (which is between -0.0 and 0.0).
                yield from _floats_between(-smallest_nonzero_magnitude, min_value)
                yield from _floats_between(smallest_nonzero_magnitude, max_value)
        else:
            # case: both positive.
            min_point = max(min_value, smallest_nonzero_magnitude)
            yield from _floats_between(min_point, max_value)

338 

@attr.s(slots=True)
class TreeNode:
    """
    A node, or collection of directly descended nodes, in a DataTree.

    We store the DataTree as a radix tree (https://en.wikipedia.org/wiki/Radix_tree),
    which means that nodes that are the only child of their parent are collapsed
    into their parent to save space.

    Conceptually, you can unfold a single TreeNode storing n values in its lists
    into a sequence of n nodes, each a child of the last. In other words,
    (constraints[i], values[i], choice_types[i]) corresponds to the single node at index
    i.

    Note that if a TreeNode represents a choice (i.e. the nodes cannot be compacted
    via the radix tree definition), then its lists will be empty and it will
    store a `Branch` representing that choice in its `transition`.

    Examples
    --------

    Consider sequentially drawing a boolean, then an integer.

        data.draw_boolean()
        data.draw_integer(1, 3)

    If we draw True and then 2, the tree may conceptually look like this.

        ┌──────┐
        │ root │
        └──┬───┘
        ┌──┴───┐
        │ True │
        └──┬───┘
        ┌──┴───┐
        │  2   │
        └──────┘

    But since 2 is the only child of True, we will compact these nodes and store
    them as a single TreeNode.

          ┌──────┐
          │ root │
          └──┬───┘
       ┌────┴──────┐
       │ [True, 2] │
       └───────────┘

    If we then draw True and then 3, True will have multiple children and we
    can no longer store this compacted representation. We would call split_at(0)
    on the [True, 2] node to indicate that we need to add a choice at 0-index
    node (True).

         ┌──────┐
         │ root │
         └──┬───┘
         ┌──┴───┐
       ┌─┤ True ├─┐
       │ └──────┘ │
     ┌─┴─┐      ┌─┴─┐
     │ 2 │      │ 3 │
     └───┘      └───┘
    """

    # The constraints, value, and choice_types of the nodes stored here. These always
    # have the same length. The values at index i belong to node i.
    constraints: list[ChoiceConstraintsT] = attr.ib(factory=list)
    values: list[ChoiceT] = attr.ib(factory=list)
    choice_types: list[ChoiceTypeT] = attr.ib(factory=list)

    # The indices of nodes which had forced values.
    #
    # Stored as None if no indices have been forced, purely for space saving
    # reasons (we force quite rarely).
    __forced: Optional[set[int]] = attr.ib(default=None, init=False)

    # What happens next after drawing these nodes. (conceptually, "what is the
    # child/children of the last node stored here").
    #
    # One of:
    # - None (we don't know yet)
    # - Branch (we have seen multiple possible outcomes here)
    # - Conclusion (ConjectureData.conclude_test was called here)
    # - Killed (this branch is valid and may even have children, but should not
    #   be explored when generating novel prefixes)
    transition: Union[None, Branch, Conclusion, Killed] = attr.ib(default=None)

    # A tree node is exhausted if every possible sequence of draws below it has
    # been explored. We only update this when performing operations that could
    # change the answer.
    #
    # See also TreeNode.check_exhausted.
    is_exhausted: bool = attr.ib(default=False, init=False)

    @property
    def forced(self) -> AbstractSet[int]:
        # Read-only view of the forced indices; shares the EMPTY singleton
        # when nothing has been forced, so callers must not mutate it.
        if not self.__forced:
            return EMPTY
        return self.__forced

    def mark_forced(self, i: int) -> None:
        """
        Note that the draw at node i was forced.
        """
        assert 0 <= i < len(self.values)
        if self.__forced is None:
            # Allocate lazily — most nodes never have forced draws.
            self.__forced = set()
        self.__forced.add(i)

    def split_at(self, i: int) -> None:
        """
        Splits the tree so that it can incorporate a decision at the draw call
        corresponding to the node at position i.

        Raises FlakyStrategyDefinition if node i was forced.
        """

        if i in self.forced:
            # A forced draw cannot become a choice point; seeing a different
            # value here means the strategy behaved nondeterministically.
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        assert not self.is_exhausted

        key = self.values[i]

        # Everything after index i becomes the (single, so far) child under
        # the new Branch; this node keeps only the prefix [0, i).
        child = TreeNode(
            choice_types=self.choice_types[i + 1 :],
            constraints=self.constraints[i + 1 :],
            values=self.values[i + 1 :],
            transition=self.transition,
        )
        self.transition = Branch(
            constraints=self.constraints[i],
            choice_type=self.choice_types[i],
            children={key: child},
        )
        if self.__forced is not None:
            # Re-base forced indices: those past i shift into the child's
            # coordinate system; those before i stay with this node.
            child.__forced = {j - i - 1 for j in self.__forced if j > i}
            self.__forced = {j for j in self.__forced if j < i}
        child.check_exhausted()
        del self.choice_types[i:]
        del self.values[i:]
        del self.constraints[i:]
        assert len(self.values) == len(self.constraints) == len(self.choice_types) == i

    def check_exhausted(self) -> bool:
        """
        Recalculates is_exhausted if necessary, and then returns it.

        A node is exhausted if:
        - Its transition is Conclusion or Killed
        - It has the maximum number of children (i.e. we have found all of its
          possible children), and all its children are exhausted

        Therefore, we only need to compute this for a node when:
        - We first create it in split_at
        - We set its transition to either Conclusion or Killed
          (TreeRecordingObserver.conclude_test or TreeRecordingObserver.kill_branch)
        - We exhaust any of its children
        """

        if (
            # a node cannot go from is_exhausted -> not is_exhausted.
            not self.is_exhausted
            # if we don't know what happens after this node, we don't have
            # enough information to tell if it's exhausted.
            and self.transition is not None
            # if there are still any nodes left which are the only child of their
            # parent (len(self.values) > 0), then this TreeNode must be not
            # exhausted, unless all of those nodes were forced.
            #
            # This is because we maintain an invariant of only adding nodes to
            # DataTree which have at least 2 possible values, so we know that if
            # they do not have any siblings that we still have more choices to
            # discover.
            #
            # (We actually *do* currently add single-valued nodes to the tree,
            # but immediately split them into a transition to avoid falsifying
            # this check. this is a bit of a hack.)
            and len(self.forced) == len(self.values)
        ):
            if isinstance(self.transition, (Conclusion, Killed)):
                self.is_exhausted = True
            elif len(self.transition.children) == self.transition.max_children:
                self.is_exhausted = all(
                    v.is_exhausted for v in self.transition.children.values()
                )
        return self.is_exhausted

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        """Pretty-print each compacted draw on its own (increasingly
        indented) line, followed by the transition or "unknown"."""
        assert cycle is False
        indent = 0
        for i, (choice_type, constraints, value) in enumerate(
            zip(self.choice_types, self.constraints, self.values)
        ):
            with p.indent(indent):
                if i > 0:
                    p.break_()
                p.text(
                    _node_pretty(
                        choice_type, value, constraints, forced=i in self.forced
                    )
                )
            indent += 2

        with p.indent(indent):
            if len(self.values) > 0:
                p.break_()
            if self.transition is not None:
                p.pretty(self.transition)
            else:
                p.text("unknown")

550 

551 

552class DataTree: 

553 """ 

554 A DataTree tracks the structured history of draws in some test function, 

555 across multiple ConjectureData objects. 

556 

557 This information is used by ConjectureRunner to generate novel prefixes of 

558 this tree (see generate_novel_prefix). A novel prefix is a sequence of draws 

559 which the tree has not seen before, and therefore the ConjectureRunner has 

560 not generated as an input to the test function before. 

561 

562 DataTree tracks the following: 

563 

564 - Drawn choices in the choice sequence 

565 - ConjectureData.draw_integer() 

566 - ConjectureData.draw_float() 

567 - ConjectureData.draw_string() 

568 - ConjectureData.draw_boolean() 

569 - ConjectureData.draw_bytes() 

570 - Test conclusions (with some Status, e.g. Status.VALID) 

571 - ConjectureData.conclude_test() 

572 

573 A DataTree is — surprise — a *tree*. A node in this tree is either a choice draw 

574 with some value, a test conclusion with some Status, or a special `Killed` value, 

575 which denotes that further draws may exist beyond this node but should not be 

576 considered worth exploring when generating novel prefixes. A node is a leaf 

577 iff it is a conclusion or Killed. 

578 

579 A branch from node A to node B indicates that we have previously seen some 

580 sequence (a, b) of draws, where a and b are the values in nodes A and B. 

581 Similar intuition holds for conclusion and Killed nodes. 

582 

583 Examples 

584 -------- 

585 

586 To see how a DataTree gets built through successive sets of draws, consider 

587 the following code that calls through to some ConjectureData object `data`. 

588 The first call can be either True or False, and the second call can be any 

589 integer in the range [1, 3]. 

590 

591 data.draw_boolean() 

592 data.draw_integer(1, 3) 

593 

594 To start, the corresponding DataTree object is completely empty. 

595 

596 ┌──────┐ 

597 │ root │ 

598 └──────┘ 

599 

600 We happen to draw True and then 2 in the above code. The tree tracks this. 

601 (2 also connects to a child Conclusion node with Status.VALID since it's the 

602 final draw in the code. I'll omit Conclusion nodes in diagrams for brevity.) 

603 

604 ┌──────┐ 

605 │ root │ 

606 └──┬───┘ 

607 ┌──┴───┐ 

608 │ True │ 

609 └──┬───┘ 

610 ┌──┴───┐ 

611 │ 2 │ 

612 └──────┘ 

613 

614 This is a very boring tree so far! But now we happen to draw False and 

615 then 1. This causes a split in the tree. Remember, DataTree tracks history 

616 over all invocations of a function, not just one. The end goal is to know 

617 what invocations haven't been tried yet, after all. 

618 

619 ┌──────┐ 

620 ┌───┤ root ├───┐ 

621 │ └──────┘ │ 

622 ┌──┴───┐ ┌─┴─────┐ 

623 │ True │ │ False │ 

624 └──┬───┘ └──┬────┘ 

625 ┌─┴─┐ ┌─┴─┐ 

626 │ 2 │ │ 1 │ 

627 └───┘ └───┘ 

628 

629 If we were to ask DataTree for a novel prefix at this point, it might 

630 generate any of (True, 1), (True, 3), (False, 2), or (False, 3). 

631 

632 Note that the novel prefix stops as soon as it generates a novel node. For 

633 instance, if we had generated a novel prefix back when the tree was only 

634 root -> True -> 2, we could have gotten any of (True, 1), (True, 3), or 

635 (False). But we could *not* have gotten (False, n), because both False and 

636 n were novel at that point, and we stop at the first novel node — False. 

637 

638 I won't belabor this example. Here's what the tree looks like when fully 

639 explored: 

640 

641 ┌──────┐ 

642 ┌──────┤ root ├──────┐ 

643 │ └──────┘ │ 

644 ┌──┴───┐ ┌─┴─────┐ 

645 ┌──┤ True ├──┐ ┌───┤ False ├──┐ 

646 │ └──┬───┘ │ │ └──┬────┘ │ 

647 ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ 

648 │ 1 │ │ 2 │ │ 3 │ │ 1 │ │ 2 │ │ 3 │ 

649 └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ 

650 

651 You could imagine much more complicated trees than this arising in practice, 

652 and indeed they do. In particular, the tree need not be balanced or 'nice' 

653 like the tree above. For instance, 

654 

655 b = data.draw_boolean() 

656 if b: 

657 data.draw_integer(1, 3) 

658 

659 results in a tree with the entire right part lopped off, and False leading 

660 straight to a conclusion node with Status.VALID. As another example, 

661 

662 n = data.draw_integers() 

663 assume(n >= 3) 

664 data.draw_string() 

665 

666 results in a tree with the 0, 1, and 2 nodes leading straight to a 

667 conclusion node with Status.INVALID, and the rest branching off into all 

668 the possibilities of draw_string. 

669 

670 Notes 

671 ----- 

672 

673 The above examples are slightly simplified and are intended to convey 

674 intuition. In practice, there are some implementation details to be aware 

675 of. 

676 

677 - In draw nodes, we store the constraints used in addition to the value drawn. 

678 E.g. the node corresponding to data.draw_float(min_value=1.0, max_value=1.5) 

679 would store {"min_value": 1.0, "max_value": 1.5, ...} (default values for 

680 other constraints omitted). 

681 

682 The constraints parameters have the potential to change both the range of 

683 possible outputs of a node, and the probability distribution within that 

684 range, so we need to use these when drawing in DataTree as well. We draw 

685 values using these constraints when (1) generating a novel value for a node 

686 and (2) choosing a random child when traversing the tree. 

687 

688 - For space efficiency, rather than tracking the full tree structure, we 

689 store DataTree as a radix tree. This is conceptually equivalent (radix 

690 trees can always be "unfolded" to the full tree) but it means the internal 

691 representation may differ in practice. 

692 

693 See TreeNode for more information. 

694 """ 

695 

    def __init__(self) -> None:
        # The root of the radix tree; starts out as a single empty node.
        self.root: TreeNode = TreeNode()
        # Per-node/branch child-generation state, keyed by object id (see
        # generate_novel_prefix and _get_children_cache), so repeated
        # novel-prefix attempts don't restart enumeration from scratch.
        self._children_cache: dict[ChoiceT, ChildrenCacheValueT] = {}

699 

    @property
    def is_exhausted(self) -> bool:
        """
        Returns True if every node is exhausted, and therefore the tree has
        been fully explored.
        """
        # Exhaustion propagates upward via TreeNode.check_exhausted, so the
        # root's flag summarises the whole tree.
        return self.root.is_exhausted

707 

    def generate_novel_prefix(self, random: Random) -> tuple[ChoiceT, ...]:
        """Generate a short random string that (after rewriting) is not
        a prefix of any buffer previously added to the tree.

        The resulting prefix is essentially arbitrary - it would be nice
        for it to be uniform at random, but previous attempts to do that
        have proven too expensive.
        """
        assert not self.is_exhausted
        prefix = []

        def append_choice(choice_type: ChoiceTypeT, choice: ChoiceT) -> None:
            # Floats are tracked internally as bit patterns (see _draw);
            # convert back to a real float before emitting them.
            if choice_type == "float":
                assert isinstance(choice, int)
                choice = int_to_float(choice)
            prefix.append(choice)

        current_node = self.root
        while True:
            assert not current_node.is_exhausted
            # Walk the compacted run of draws stored in this node, copying
            # forced values verbatim and trying to deviate at the first
            # unforced draw.
            for i, (choice_type, constraints, value) in enumerate(
                zip(
                    current_node.choice_types,
                    current_node.constraints,
                    current_node.values,
                )
            ):
                if i in current_node.forced:
                    append_choice(choice_type, value)
                else:
                    attempts = 0
                    while True:
                        # Try random draws a few times first; after that,
                        # fall back to systematically enumerating children
                        # via the cache so we always terminate.
                        if attempts <= 10:
                            try:
                                node_value = self._draw(
                                    choice_type, constraints, random=random
                                )
                            except StopTest:  # pragma: no cover
                                # it is possible that drawing from a fresh data can
                                # overrun BUFFER_SIZE, due to eg unlucky rejection sampling
                                # of integer probes. Retry these cases.
                                attempts += 1
                                continue
                        else:
                            node_value = self._draw_from_cache(
                                choice_type,
                                constraints,
                                key=id(current_node),
                                random=random,
                            )

                        if node_value != value:
                            append_choice(choice_type, node_value)
                            break
                        attempts += 1
                        self._reject_child(
                            choice_type,
                            constraints,
                            child=node_value,
                            key=id(current_node),
                        )
                    # We've now found a value that is allowed to
                    # vary, so what follows is not fixed.
                    return tuple(prefix)
            else:
                # (for/else: only reached when every stored draw was forced
                # and replayed without deviation.)
                assert not isinstance(current_node.transition, (Conclusion, Killed))
                if current_node.transition is None:
                    return tuple(prefix)
                branch = current_node.transition
                assert isinstance(branch, Branch)

                # Draw a value at this branch point: a value with no recorded
                # child is novel (return), a non-exhausted child is descended
                # into, and an exhausted child is rejected and retried.
                attempts = 0
                while True:
                    if attempts <= 10:
                        try:
                            node_value = self._draw(
                                branch.choice_type, branch.constraints, random=random
                            )
                        except StopTest:  # pragma: no cover
                            attempts += 1
                            continue
                    else:
                        node_value = self._draw_from_cache(
                            branch.choice_type,
                            branch.constraints,
                            key=id(branch),
                            random=random,
                        )
                    try:
                        child = branch.children[node_value]
                    except KeyError:
                        append_choice(branch.choice_type, node_value)
                        return tuple(prefix)
                    if not child.is_exhausted:
                        append_choice(branch.choice_type, node_value)
                        current_node = child
                        break
                    attempts += 1
                    self._reject_child(
                        branch.choice_type,
                        branch.constraints,
                        child=node_value,
                        key=id(branch),
                    )

                    # We don't expect this assertion to ever fire, but coverage
                    # wants the loop inside to run if you have branch checking
                    # on, hence the pragma.
                    assert (  # pragma: no cover
                        attempts != 1000
                        or len(branch.children) < branch.max_children
                        or any(not v.is_exhausted for v in branch.children.values())
                    )

821 

822 def rewrite(self, choices): 

823 """Use previously seen ConjectureData objects to return a tuple of 

824 the rewritten choice sequence and the status we would get from running 

825 that with the test function. If the status cannot be predicted 

826 from the existing values it will be None.""" 

827 data = ConjectureData.for_choices(choices) 

828 try: 

829 self.simulate_test_function(data) 

830 return (data.choices, data.status) 

831 except PreviouslyUnseenBehaviour: 

832 return (choices, None) 

833 

    def simulate_test_function(self, data: ConjectureData) -> None:
        """Run a simulated version of the test function recorded by
        this tree. Note that this does not currently call ``stop_span``
        or ``start_span`` as these are not currently recorded in the
        tree. This will likely change in future.

        Raises PreviouslyUnseenBehaviour if the simulation reaches a part
        of the tree that has not been recorded.
        """
        node = self.root

        def draw(choice_type, constraints, *, forced=None, convert_forced=True):
            # Forced float values are stored in the tree as bit patterns;
            # convert back to an actual float before performing the draw.
            if choice_type == "float" and forced is not None and convert_forced:
                forced = int_to_float(forced)

            draw_func = getattr(data, f"draw_{choice_type}")
            value = draw_func(**constraints, forced=forced)

            # Normalise drawn floats to their bit pattern so they compare
            # equal to the values stored in the tree (see DataTree._draw).
            if choice_type == "float":
                value = float_to_int(value)
            return value

        try:
            while True:
                # Replay the compacted run of draws stored in this node,
                # bailing out as soon as a draw disagrees with the record.
                for i, (choice_type, constraints, previous) in enumerate(
                    zip(node.choice_types, node.constraints, node.values)
                ):
                    v = draw(
                        choice_type,
                        constraints,
                        forced=previous if i in node.forced else None,
                    )
                    if v != previous:
                        raise PreviouslyUnseenBehaviour
                if isinstance(node.transition, Conclusion):
                    t = node.transition
                    data.conclude_test(t.status, t.interesting_origin)
                elif node.transition is None:
                    raise PreviouslyUnseenBehaviour
                elif isinstance(node.transition, Branch):
                    v = draw(node.transition.choice_type, node.transition.constraints)
                    try:
                        node = node.transition.children[v]
                    except KeyError as err:
                        raise PreviouslyUnseenBehaviour from err
                else:
                    assert isinstance(node.transition, Killed)
                    data.observer.kill_branch()
                    node = node.transition.next_node
        except StopTest:
            # StopTest signals that the (simulated) test run concluded;
            # swallow it so callers just see the updated `data`.
            pass

881 

882 def new_observer(self): 

883 return TreeRecordingObserver(self) 

884 

885 def _draw( 

886 self, 

887 choice_type: ChoiceTypeT, 

888 constraints: ChoiceConstraintsT, 

889 *, 

890 random: Random, 

891 ) -> ChoiceT: 

892 from hypothesis.internal.conjecture.data import draw_choice 

893 

894 value = draw_choice(choice_type, constraints, random=random) 

895 # using floats as keys into branch.children breaks things, because 

896 # e.g. hash(0.0) == hash(-0.0) would collide as keys when they are 

897 # in fact distinct child branches. 

898 # To distinguish floats here we'll use their bits representation. This 

899 # entails some bookkeeping such that we're careful about when the 

900 # float key is in its bits form (as a key into branch.children) and 

901 # when it is in its float form (as a value we want to write to the 

902 # buffer), and converting between the two forms as appropriate. 

903 if choice_type == "float": 

904 value = float_to_int(value) 

905 return value 

906 

907 def _get_children_cache( 

908 self, choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT, *, key: ChoiceT 

909 ) -> ChildrenCacheValueT: 

910 # cache the state of the children generator per node/branch (passed as 

911 # `key` here), such that we track which children we've already tried 

912 # for this branch across draws. 

913 # We take advantage of python generators here as one-way iterables, 

914 # so each time we iterate we implicitly store our position in the 

915 # children generator and don't re-draw children. `children` is the 

916 # concrete list of children draw from the generator that we will work 

917 # with. Whenever we need to top up this list, we will draw a new value 

918 # from the generator. 

919 if key not in self._children_cache: 

920 generator = all_children(choice_type, constraints) 

921 children: list[ChoiceT] = [] 

922 rejected: set[ChoiceT] = set() 

923 self._children_cache[key] = (generator, children, rejected) 

924 

925 return self._children_cache[key] 

926 

927 def _draw_from_cache( 

928 self, 

929 choice_type: ChoiceTypeT, 

930 constraints: ChoiceConstraintsT, 

931 *, 

932 key: ChoiceT, 

933 random: Random, 

934 ) -> ChoiceT: 

935 (generator, children, rejected) = self._get_children_cache( 

936 choice_type, constraints, key=key 

937 ) 

938 # Keep a stock of 100 potentially-valid children at all times. 

939 # This number is chosen to balance memory/speed vs randomness. Ideally 

940 # we would sample uniformly from all not-yet-rejected children, but 

941 # computing and storing said children is not free. 

942 # no-branch because coverage of the fall-through case here is a bit 

943 # annoying. 

944 if len(children) < 100: # pragma: no branch 

945 for v in generator: 

946 if choice_type == "float": 

947 assert isinstance(v, float) 

948 v = float_to_int(v) 

949 if v in rejected: 

950 continue 

951 children.append(v) 

952 if len(children) >= 100: 

953 break 

954 

955 return random.choice(children) 

956 

957 def _reject_child( 

958 self, 

959 choice_type: ChoiceTypeT, 

960 constraints: ChoiceConstraintsT, 

961 *, 

962 child: ChoiceT, 

963 key: ChoiceT, 

964 ) -> None: 

965 (_generator, children, rejected) = self._get_children_cache( 

966 choice_type, constraints, key=key 

967 ) 

968 rejected.add(child) 

969 # we remove a child from the list of possible children *only* when it is 

970 # rejected, and not when it is initially drawn in _draw_from_cache. The 

971 # reason is that a child being drawn does not guarantee that child will 

972 # be used in a way such that it is written back to the tree, so it needs 

973 # to be available for future draws until we are certain it has been 

974 # used. 

975 # 

976 # For instance, if we generated novel prefixes in a loop (but never used 

977 # those prefixes to generate new values!) then we don't want to remove 

978 # the drawn children from the available pool until they are actually 

979 # used. 

980 # 

981 # This does result in a small inefficiency: we may draw a child, 

982 # immediately use it (so we know it cannot be drawn again), but still 

983 # wait to draw and reject it here, because DataTree cannot guarantee 

984 # the drawn child has been used. 

985 if child in children: 

986 children.remove(child) 

987 

988 def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None: 

989 assert cycle is False 

990 p.pretty(self.root) 

991 

992 

class TreeRecordingObserver(DataObserver):
    """DataObserver that records the draws of a test run into a DataTree.

    As values are drawn it walks the tree from the root, extending nodes
    with new draws, splitting a node into a Branch when a replayed value
    diverges from what was recorded, and raising FlakyStrategyDefinition /
    FlakyReplay when the observed behaviour contradicts the recording.
    """

    def __init__(self, tree: DataTree):
        # this attr isn't read, but is very useful for local debugging flaky
        # errors, with
        # `from hypothesis.vendor import pretty; print(pretty.pretty(self._root))`
        self._root = tree.root
        # The node we are currently writing/replaying draws into, and the
        # index of the next draw within that node.
        self._current_node: TreeNode = tree.root
        self._index_in_current_node: int = 0
        # Every distinct node visited so far, in order; used by
        # __update_exhausted to propagate exhaustion from leaf to root.
        self._trail: list[TreeNode] = [self._current_node]
        self.killed: bool = False

    def draw_integer(
        self, value: int, *, was_forced: bool, constraints: IntegerConstraints
    ) -> None:
        self.draw_value(
            "integer", value, was_forced=was_forced, constraints=constraints
        )

    def draw_float(
        self, value: float, *, was_forced: bool, constraints: FloatConstraints
    ) -> None:
        self.draw_value("float", value, was_forced=was_forced, constraints=constraints)

    def draw_string(
        self, value: str, *, was_forced: bool, constraints: StringConstraints
    ) -> None:
        self.draw_value("string", value, was_forced=was_forced, constraints=constraints)

    def draw_bytes(
        self, value: bytes, *, was_forced: bool, constraints: BytesConstraints
    ) -> None:
        self.draw_value("bytes", value, was_forced=was_forced, constraints=constraints)

    def draw_boolean(
        self, value: bool, *, was_forced: bool, constraints: BooleanConstraints
    ) -> None:
        self.draw_value(
            "boolean", value, was_forced=was_forced, constraints=constraints
        )

    def draw_value(
        self,
        choice_type: ChoiceTypeT,
        value: ChoiceT,
        *,
        was_forced: bool,
        constraints: ChoiceConstraintsT,
    ) -> None:
        """Record a single drawn value, advancing or extending the tree.

        Raises FlakyStrategyDefinition when the draw disagrees with what the
        tree previously recorded at this position (different choice type,
        different constraints, forcedness mismatch, or a draw past a
        recorded conclusion).
        """
        i = self._index_in_current_node
        self._index_in_current_node += 1
        node = self._current_node

        # Floats are stored by their bit pattern so that values such as 0.0
        # and -0.0 remain distinct keys into branch.children.
        if isinstance(value, float):
            value = float_to_int(value)

        assert len(node.constraints) == len(node.values) == len(node.choice_types)
        if i < len(node.values):
            # Replaying a draw that this node has already recorded.
            if (
                choice_type != node.choice_types[i]
                or constraints != node.constraints[i]
            ):
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            # Note that we don't check whether a previously
            # forced value is now free. That will be caught
            # if we ever split the node there, but otherwise
            # may pass silently. This is acceptable because it
            # means we skip a hash set lookup on every
            # draw and that's a pretty niche failure mode.
            if was_forced and i not in node.forced:
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            if value != node.values[i]:
                # The value diverges from the recording: split this node at
                # position i and continue into a fresh child for the new
                # value.
                node.split_at(i)
                assert i == len(node.values)
                new_node = TreeNode()
                assert isinstance(node.transition, Branch)
                node.transition.children[value] = new_node
                self._current_node = new_node
                self._index_in_current_node = 0
        else:
            trans = node.transition
            if trans is None:
                # Past the end of the recorded draws: extend the node.
                node.choice_types.append(choice_type)
                node.constraints.append(constraints)
                node.values.append(value)
                if was_forced:
                    node.mark_forced(i)
                # generate_novel_prefix assumes the following invariant: any one
                # of the series of draws in a particular node can vary, i.e. the
                # max number of children is at least 2. However, some draws are
                # pseudo-choices and only have a single value, such as
                # integers(0, 0).
                #
                # Currently, we address this by forcefully splitting such
                # single-valued nodes into a transition when we see them. An
                # exception to this is if it was forced: forced pseudo-choices
                # do not cause the above issue because they inherently cannot
                # vary, and moreover they trip other invariants about never
                # splitting forced nodes.
                #
                # An alternative is not writing such choices to the tree at
                # all, and thus guaranteeing that each node has at least 2 max
                # children.
                if (
                    compute_max_children(choice_type, constraints) == 1
                    and not was_forced
                ):
                    node.split_at(i)
                    assert isinstance(node.transition, Branch)
                    self._current_node = node.transition.children[value]
                    self._index_in_current_node = 0
            elif isinstance(trans, Conclusion):
                assert trans.status != Status.OVERRUN
                # We tried to draw where history says we should have
                # stopped
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            else:
                # Follow (or create) the child branch for this value.
                assert isinstance(trans, Branch), trans
                if choice_type != trans.choice_type or constraints != trans.constraints:
                    raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
                try:
                    self._current_node = trans.children[value]
                except KeyError:
                    self._current_node = trans.children.setdefault(value, TreeNode())
                self._index_in_current_node = 0
        # Record newly entered nodes so exhaustion can later be propagated
        # back up the trail.
        if self._trail[-1] is not self._current_node:
            self._trail.append(self._current_node)

    def kill_branch(self) -> None:
        """Mark this part of the tree as not worth re-exploring."""
        # Killing is idempotent within a single observed run.
        if self.killed:
            return

        self.killed = True

        # A kill must happen exactly at the end of this node's recorded
        # draws, and only where no conflicting transition already exists.
        if self._index_in_current_node < len(self._current_node.values) or (
            self._current_node.transition is not None
            and not isinstance(self._current_node.transition, Killed)
        ):
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        if self._current_node.transition is None:
            self._current_node.transition = Killed(TreeNode())
            # A freshly killed node may render ancestors exhausted too.
            self.__update_exhausted()

        self._current_node = self._current_node.transition.next_node
        self._index_in_current_node = 0
        self._trail.append(self._current_node)

    def conclude_test(
        self, status: Status, interesting_origin: Optional[InterestingOrigin]
    ) -> None:
        """Says that ``status`` occurred at node ``node``. This updates the
        node if necessary and checks for consistency.

        Raises FlakyStrategyDefinition if the test concluded mid-node, and
        FlakyReplay if it concluded with a different result than a previous
        run of the same choices.
        """
        # Overruns are not recorded in the tree at all.
        if status == Status.OVERRUN:
            return
        i = self._index_in_current_node
        node = self._current_node

        # Concluding before exhausting this node's recorded draws, or where
        # a branch says more draws should follow, is flaky behaviour.
        if i < len(node.values) or isinstance(node.transition, Branch):
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        new_transition = Conclusion(status, interesting_origin)

        if node.transition is not None and node.transition != new_transition:
            # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
            # where tests go from interesting to valid, because it's much easier
            # to produce good error messages for these further up the stack.
            if isinstance(node.transition, Conclusion) and (
                node.transition.status != Status.INTERESTING
                or new_transition.status != Status.VALID
            ):
                old_origin = node.transition.interesting_origin
                new_origin = new_transition.interesting_origin
                raise FlakyReplay(
                    f"Inconsistent results from replaying a test case!\n"
                    f"  last: {node.transition.status.name} from {old_origin}\n"
                    f"  this: {new_transition.status.name} from {new_origin}",
                    (old_origin, new_origin),
                )
        else:
            node.transition = new_transition

        assert node is self._trail[-1]
        node.check_exhausted()
        assert len(node.values) > 0 or node.check_exhausted()

        if not self.killed:
            self.__update_exhausted()

    def __update_exhausted(self) -> None:
        # Walk the visited nodes leaf-to-root, refreshing exhaustion flags.
        for t in reversed(self._trail):
            # Any node we've traversed might have now become exhausted.
            # We check from the right. As soon as we hit a node that
            # isn't exhausted, this automatically implies that all of
            # its parents are not exhausted, so we stop.
            if not t.check_exhausted():
                break