Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/internal/conjecture/datatree.py: 19%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

418 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import math 

12from collections.abc import Generator 

13from random import Random 

14from typing import TYPE_CHECKING, AbstractSet, Final, Optional, Union, cast 

15 

16import attr 

17 

18from hypothesis.errors import ( 

19 FlakyReplay, 

20 FlakyStrategyDefinition, 

21 HypothesisException, 

22 StopTest, 

23) 

24from hypothesis.internal import floats as flt 

25from hypothesis.internal.conjecture.choice import ( 

26 BooleanConstraints, 

27 BytesConstraints, 

28 ChoiceConstraintsT, 

29 ChoiceT, 

30 ChoiceTypeT, 

31 FloatConstraints, 

32 IntegerConstraints, 

33 StringConstraints, 

34 choice_from_index, 

35) 

36from hypothesis.internal.conjecture.data import ConjectureData, DataObserver, Status 

37from hypothesis.internal.escalation import InterestingOrigin 

38from hypothesis.internal.floats import ( 

39 count_between_floats, 

40 float_to_int, 

41 int_to_float, 

42 sign_aware_lte, 

43) 

44 

45if TYPE_CHECKING: 

46 from typing import TypeAlias 

47 

48 from hypothesis.vendor.pretty import RepresentationPrinter 

49 

50ChildrenCacheValueT: "TypeAlias" = tuple[ 

51 Generator[ChoiceT, None, None], list[ChoiceT], set[ChoiceT] 

52] 

53 

54 

class PreviouslyUnseenBehaviour(HypothesisException):
    """Raised when replaying a choice sequence against the tree reaches a
    draw or transition that the tree has not recorded before (see
    DataTree.simulate_test_function)."""

    pass

57 

58 

# Message for FlakyStrategyDefinition errors, raised when a replay contradicts
# previously-recorded draws (see TreeNode.split_at).
_FLAKY_STRAT_MSG = (
    "Inconsistent data generation! Data generation behaved differently "
    "between different runs. Is your data generation depending on external "
    "state?"
)


# Shared immutable empty set, returned by TreeNode.forced when no indices were
# forced, so we don't allocate a fresh set per node.
EMPTY: frozenset[int] = frozenset()

67 

68 

@attr.s(slots=True)
class Killed:
    """Represents a transition to part of the tree which has been marked as
    "killed", meaning we want to treat it as not worth exploring, so it will
    be treated as if it were completely explored for the purposes of
    exhaustion."""

    # The subtree beyond the killed transition. Kept so that replays
    # (DataTree.simulate_test_function) can still walk through it even though
    # novel-prefix generation will not explore it.
    next_node: "TreeNode" = attr.ib()

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        assert cycle is False
        p.text("Killed")

81 

82 

def _node_pretty(
    choice_type: ChoiceTypeT,
    value: ChoiceT,
    constraints: ChoiceConstraintsT,
    *,
    forced: bool,
) -> str:
    """Render one draw as ``<type> <value!r>[ [forced]] <constraints>``.

    Used by the ``_repr_pretty_`` hooks of Branch and TreeNode.
    """
    pieces = [f"{choice_type} {value!r}"]
    if forced:
        pieces.append(" [forced]")
    pieces.append(f" {constraints}")
    return "".join(pieces)

92 

93 

@attr.s(slots=True)
class Branch:
    """Represents a transition where multiple choices can be made as to what
    to draw."""

    constraints: ChoiceConstraintsT = attr.ib()
    choice_type: ChoiceTypeT = attr.ib()
    # Maps drawn value -> subtree. Note that float children are keyed by
    # their integer bit representation (see the comment in DataTree._draw).
    children: dict[ChoiceT, "TreeNode"] = attr.ib(repr=False)

    @property
    def max_children(self) -> int:
        # The number of distinct values this draw could possibly take,
        # recomputed on demand from the constraints.
        max_children = compute_max_children(self.choice_type, self.constraints)
        assert max_children > 0
        return max_children

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        assert cycle is False
        for i, (value, child) in enumerate(self.children.items()):
            if i > 0:
                p.break_()
            p.text(
                _node_pretty(self.choice_type, value, self.constraints, forced=False)
            )
            with p.indent(2):
                p.break_()
                p.pretty(child)

120 

121 

@attr.s(slots=True, frozen=True)
class Conclusion:
    """Represents a transition to a finished state."""

    # The Status the test run concluded with (e.g. Status.VALID).
    status: Status = attr.ib()
    # None unless the conclusion has an associated failure origin.
    interesting_origin: Optional[InterestingOrigin] = attr.ib()

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        assert cycle is False
        o = self.interesting_origin
        # avoid str(o), which can include multiple lines of context
        origin = (
            "" if o is None else f", {o.exc_type.__name__} at {o.filename}:{o.lineno}"
        )
        p.text(f"Conclusion ({self.status!r}{origin})")

138 

139# The number of max children where, beyond this, it is practically impossible 

140# for hypothesis to saturate / explore all children nodes in a reasonable time 

141# frame. We use this to bail out of expensive max children computations early, 

142# where the numbers involved are so large that we know they will be larger than 

143# this number. 

144# 

145# Note that it's ok for us to underestimate the number of max children of a node 

146# by using this. We just may think the node is exhausted when in fact it has more 

147# possible children to be explored. This has the potential to finish generation 

148# early due to exhausting the entire tree, but that is quite unlikely: (1) the 

149# number of examples would have to be quite high, and (2) the tree would have to 

150# contain only one or two nodes, or generate_novel_prefix would simply switch to 

151# exploring another non-exhausted node. 

152# 

153# Also note that we may sometimes compute max children above this value. In other 

154# words, this is *not* a hard maximum on the computed max children. It's the point 

155# where further computation is not beneficial - but sometimes doing that computation 

156# unconditionally is cheaper than estimating against this value. 

157# 

158# The one case where this may be detrimental is fuzzing, where the throughput of 

159# examples is so high that it really may saturate important nodes. We'll cross 

160# that bridge when we come to it. 

161MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 10_000_000 

162 

163 

164def _count_distinct_strings(*, alphabet_size: int, min_size: int, max_size: int) -> int: 

165 # We want to estimate if we're going to have more children than 

166 # MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially 

167 # extremely expensive pow. We'll check the two extreme cases - if the 

168 # number of strings in the largest string size alone is enough to put us 

169 # over this limit (at alphabet_size >= 2), and if the variation in sizes 

170 # (at alphabet_size == 1) is enough. If neither result in an early return, 

171 # the exact result should be reasonably cheap to compute. 

172 if alphabet_size == 0: 

173 # Special-case the empty string, avoid error in math.log(0). 

174 return 1 

175 elif alphabet_size == 1: 

176 # Special-case the constant alphabet, invalid in the geom-series sum. 

177 return max_size - min_size + 1 

178 else: 

179 # Estimate against log, which is cheaper than computing a pow. 

180 # 

181 # m = max_size 

182 # a = alphabet_size 

183 # N = MAX_CHILDREN_EFFECTIVELY_INFINITE 

184 # 

185 # a**m > N 

186 # <=> m * log(a) > log(N) 

187 log_max_sized_children = max_size * math.log(alphabet_size) 

188 if log_max_sized_children > math.log(MAX_CHILDREN_EFFECTIVELY_INFINITE): 

189 return MAX_CHILDREN_EFFECTIVELY_INFINITE 

190 

191 # The sum of a geometric series is given by (ref: wikipedia): 

192 # ᵐ∑ₖ₌₀ aᵏ = (aᵐ⁺¹ - 1) / (a - 1) 

193 # = S(m) / S(0) 

194 # assuming a != 1 and using the definition 

195 # S(m) := aᵐ⁺¹ - 1. 

196 # The sum we want, starting from a number n [0 <= n <= m] rather than zero, is 

197 # ᵐ∑ₖ₌ₙ aᵏ = ᵐ∑ₖ₌₀ aᵏ - ⁿ⁻¹∑ₖ₌₀ aᵏ = S(m) / S(0) - S(n - 1) / S(0) 

198 def S(n): 

199 return alphabet_size ** (n + 1) - 1 

200 

201 return (S(max_size) - S(min_size - 1)) // S(0) 

202 

203 

def compute_max_children(
    choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT
) -> int:
    """Return the number of distinct values a draw of ``choice_type`` under
    ``constraints`` can produce.

    For string/bytes draws the result may be capped at
    MAX_CHILDREN_EFFECTIVELY_INFINITE (via _count_distinct_strings); see the
    comment above that constant for why underestimating is sound.

    Raises NotImplementedError for an unknown choice type.
    """
    if choice_type == "integer":
        constraints = cast(IntegerConstraints, constraints)
        min_value = constraints["min_value"]
        max_value = constraints["max_value"]

        if min_value is None and max_value is None:
            # full 128 bit range.
            return 2**128 - 1
        if min_value is not None and max_value is not None:
            # count between min/max value.
            return max_value - min_value + 1

        # hard case: only one bound was specified. Here we probe either upwards
        # or downwards with our full 128 bit generation, but only half of these
        # (plus one for the case of generating zero) result in a probe in the
        # direction we want. ((2**128 - 1) // 2) + 1 == 2 ** 127
        assert (min_value is None) ^ (max_value is None)
        return 2**127
    elif choice_type == "boolean":
        constraints = cast(BooleanConstraints, constraints)
        p = constraints["p"]
        # probabilities of 0 or 1 (or effectively 0 or 1) only have one choice.
        if p <= 2 ** (-64) or p >= (1 - 2 ** (-64)):
            return 1
        return 2
    elif choice_type == "bytes":
        constraints = cast(BytesConstraints, constraints)
        return _count_distinct_strings(
            alphabet_size=2**8,
            min_size=constraints["min_size"],
            max_size=constraints["max_size"],
        )
    elif choice_type == "string":
        constraints = cast(StringConstraints, constraints)
        min_size = constraints["min_size"]
        max_size = constraints["max_size"]
        intervals = constraints["intervals"]

        return _count_distinct_strings(
            alphabet_size=len(intervals), min_size=min_size, max_size=max_size
        )
    elif choice_type == "float":
        constraints = cast(FloatConstraints, constraints)
        min_value_f = constraints["min_value"]
        max_value_f = constraints["max_value"]
        smallest_nonzero_magnitude = constraints["smallest_nonzero_magnitude"]

        count = count_between_floats(min_value_f, max_value_f)

        # we have two intervals:
        # a. [min_value, max_value]
        # b. [-smallest_nonzero_magnitude, smallest_nonzero_magnitude]
        #
        # which could be subsets (in either order), overlapping, or disjoint. We
        # want the interval difference a - b.

        # next_down because endpoints are ok with smallest_nonzero_magnitude
        min_point = max(min_value_f, -flt.next_down(smallest_nonzero_magnitude))
        max_point = min(max_value_f, flt.next_down(smallest_nonzero_magnitude))

        if min_point > max_point:
            # case: disjoint intervals.
            return count

        count -= count_between_floats(min_point, max_point)
        if sign_aware_lte(min_value_f, -0.0) and sign_aware_lte(-0.0, max_value_f):
            # account for -0.0
            count += 1
        if sign_aware_lte(min_value_f, 0.0) and sign_aware_lte(0.0, max_value_f):
            # account for 0.0
            count += 1
        return count

    raise NotImplementedError(f"unhandled choice_type {choice_type}")

281 

282 

283# In theory, this is a strict superset of the functionality of compute_max_children; 

284# 

285# assert len(all_children(choice_type, constraints)) == compute_max_children(choice_type, constraints) 

286# 

287# In practice, we maintain two distinct implementations for efficiency and space 

288# reasons. If you just need the number of children, it is cheaper to use 

289# compute_max_children than to reify the list of children (only to immediately 

290# throw it away). 

def _floats_between(a: float, b: float) -> Generator[float, None, None]:
    """Yield every representable float from ``a`` through ``b`` inclusive,
    walking their integer bit representations in order."""
    lo = float_to_int(a)
    hi = float_to_int(b)
    for bits in range(lo, hi + 1):
        yield int_to_float(bits)

294 

295 

def all_children(
    choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT
) -> Generator[ChoiceT, None, None]:
    """Yield every distinct value that a draw of ``choice_type`` under
    ``constraints`` could produce.

    Note that floats are yielded in their float form here (not as their
    integer bit representation).
    """
    if choice_type != "float":
        for index in range(compute_max_children(choice_type, constraints)):
            yield choice_from_index(index, choice_type, constraints)
    else:
        constraints = cast(FloatConstraints, constraints)
        # the float ordering is not injective (because of resampling
        # out-of-bounds values), so using choice_from_index would result in
        # duplicates. This violates invariants in datatree about being able
        # to draw unique new children using all_children.
        #
        # We instead maintain a separate implementation for floats.
        # TODO_IR write a better (bijective) ordering for floats and remove this!
        min_value = constraints["min_value"]
        max_value = constraints["max_value"]
        smallest_nonzero_magnitude = constraints["smallest_nonzero_magnitude"]

        # handle zeroes separately so smallest_nonzero_magnitude can think of
        # itself as a complete interval (instead of a hole at ±0).
        if sign_aware_lte(min_value, -0.0) and sign_aware_lte(-0.0, max_value):
            yield -0.0
        if sign_aware_lte(min_value, 0.0) and sign_aware_lte(0.0, max_value):
            yield 0.0

        if flt.is_negative(min_value):
            if flt.is_negative(max_value):
                # case: both negative.
                max_point = min(max_value, -smallest_nonzero_magnitude)
                # float_to_int increases as negative magnitude increases, so
                # invert order.
                yield from _floats_between(max_point, min_value)
            else:
                # case: straddles midpoint (which is between -0.0 and 0.0).
                yield from _floats_between(-smallest_nonzero_magnitude, min_value)
                yield from _floats_between(smallest_nonzero_magnitude, max_value)
        else:
            # case: both positive.
            min_point = max(min_value, smallest_nonzero_magnitude)
            yield from _floats_between(min_point, max_value)

338 

@attr.s(slots=True)
class TreeNode:
    """
    A node, or collection of directly descended nodes, in a DataTree.

    We store the DataTree as a radix tree (https://en.wikipedia.org/wiki/Radix_tree),
    which means that nodes that are the only child of their parent are collapsed
    into their parent to save space.

    Conceptually, you can unfold a single TreeNode storing n values in its lists
    into a sequence of n nodes, each a child of the last. In other words,
    (constraints[i], values[i], choice_types[i]) corresponds to the single node at index
    i.

    Note that if a TreeNode represents a choice (i.e. the nodes cannot be compacted
    via the radix tree definition), then its lists will be empty and it will
    store a `Branch` representing that choice in its `transition`.

    Examples
    --------

    Consider sequentially drawing a boolean, then an integer.

        data.draw_boolean()
        data.draw_integer(1, 3)

    If we draw True and then 2, the tree may conceptually look like this.

        ┌──────┐
        │ root │
        └──┬───┘
        ┌──┴───┐
        │ True │
        └──┬───┘
        ┌──┴───┐
        │  2   │
        └──────┘

    But since 2 is the only child of True, we will compact these nodes and store
    them as a single TreeNode.

        ┌──────┐
        │ root │
        └──┬───┘
     ┌─────┴─────┐
     │ [True, 2] │
     └───────────┘

    If we then draw True and then 3, True will have multiple children and we
    can no longer store this compacted representation. We would call split_at(0)
    on the [True, 2] node to indicate that we need to add a choice at 0-index
    node (True).

         ┌──────┐
         │ root │
         └──┬───┘
         ┌──┴───┐
       ┌─┤ True ├─┐
       │ └──────┘ │
     ┌─┴─┐      ┌─┴─┐
     │ 2 │      │ 3 │
     └───┘      └───┘
    """

    # The constraints, value, and choice_types of the nodes stored here. These always
    # have the same length. The values at index i belong to node i.
    constraints: list[ChoiceConstraintsT] = attr.ib(factory=list)
    values: list[ChoiceT] = attr.ib(factory=list)
    choice_types: list[ChoiceTypeT] = attr.ib(factory=list)

    # The indices of nodes which had forced values.
    #
    # Stored as None if no indices have been forced, purely for space saving
    # reasons (we force quite rarely).
    __forced: Optional[set[int]] = attr.ib(default=None, init=False)

    # What happens next after drawing these nodes. (conceptually, "what is the
    # child/children of the last node stored here").
    #
    # One of:
    # - None (we don't know yet)
    # - Branch (we have seen multiple possible outcomes here)
    # - Conclusion (ConjectureData.conclude_test was called here)
    # - Killed (this branch is valid and may even have children, but should not
    #   be explored when generating novel prefixes)
    transition: Union[None, Branch, Conclusion, Killed] = attr.ib(default=None)

    # A tree node is exhausted if every possible sequence of draws below it has
    # been explored. We only update this when performing operations that could
    # change the answer.
    #
    # See also TreeNode.check_exhausted.
    is_exhausted: bool = attr.ib(default=False, init=False)

    @property
    def forced(self) -> AbstractSet[int]:
        # Return the shared immutable EMPTY set rather than allocating one
        # when nothing was forced.
        if not self.__forced:
            return EMPTY
        return self.__forced

    def mark_forced(self, i: int) -> None:
        """
        Note that the draw at node i was forced.
        """
        assert 0 <= i < len(self.values)
        if self.__forced is None:
            self.__forced = set()
        self.__forced.add(i)

    def split_at(self, i: int) -> None:
        """
        Splits the tree so that it can incorporate a decision at the draw call
        corresponding to the node at position i.

        Raises FlakyStrategyDefinition if node i was forced.
        """

        if i in self.forced:
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        assert not self.is_exhausted

        key = self.values[i]

        # Everything after index i moves into a new child node; node i itself
        # becomes a Branch transition with that child as its only entry so far.
        child = TreeNode(
            choice_types=self.choice_types[i + 1 :],
            constraints=self.constraints[i + 1 :],
            values=self.values[i + 1 :],
            transition=self.transition,
        )
        self.transition = Branch(
            constraints=self.constraints[i],
            choice_type=self.choice_types[i],
            children={key: child},
        )
        if self.__forced is not None:
            # Forced indices after the split point belong to the child
            # (re-based by i + 1); those before it stay with this node.
            child.__forced = {j - i - 1 for j in self.__forced if j > i}
            self.__forced = {j for j in self.__forced if j < i}
        child.check_exhausted()
        del self.choice_types[i:]
        del self.values[i:]
        del self.constraints[i:]
        assert len(self.values) == len(self.constraints) == len(self.choice_types) == i

    def check_exhausted(self) -> bool:
        """
        Recalculates is_exhausted if necessary, and then returns it.

        A node is exhausted if:
        - Its transition is Conclusion or Killed
        - It has the maximum number of children (i.e. we have found all of its
          possible children), and all its children are exhausted

        Therefore, we only need to compute this for a node when:
        - We first create it in split_at
        - We set its transition to either Conclusion or Killed
          (TreeRecordingObserver.conclude_test or TreeRecordingObserver.kill_branch)
        - We exhaust any of its children
        """

        if (
            # a node cannot go from is_exhausted -> not is_exhausted.
            not self.is_exhausted
            # if we don't know what happens after this node, we don't have
            # enough information to tell if it's exhausted.
            and self.transition is not None
            # if there are still any nodes left which are the only child of their
            # parent (len(self.values) > 0), then this TreeNode must be not
            # exhausted, unless all of those nodes were forced.
            #
            # This is because we maintain an invariant of only adding nodes to
            # DataTree which have at least 2 possible values, so we know that if
            # they do not have any siblings that we still have more choices to
            # discover.
            #
            # (We actually *do* currently add single-valued nodes to the tree,
            # but immediately split them into a transition to avoid falsifying
            # this check. this is a bit of a hack.)
            and len(self.forced) == len(self.values)
        ):
            if isinstance(self.transition, (Conclusion, Killed)):
                self.is_exhausted = True
            elif len(self.transition.children) == self.transition.max_children:
                self.is_exhausted = all(
                    v.is_exhausted for v in self.transition.children.values()
                )
        return self.is_exhausted

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        assert cycle is False
        # Each compacted draw is printed two columns deeper than the last,
        # visually unfolding the radix-tree compression.
        indent = 0
        for i, (choice_type, constraints, value) in enumerate(
            zip(self.choice_types, self.constraints, self.values)
        ):
            with p.indent(indent):
                if i > 0:
                    p.break_()
                p.text(
                    _node_pretty(
                        choice_type, value, constraints, forced=i in self.forced
                    )
                )
            indent += 2

        with p.indent(indent):
            if len(self.values) > 0:
                p.break_()
            if self.transition is not None:
                p.pretty(self.transition)
            else:
                p.text("unknown")

551 

552class DataTree: 

553 """ 

554 A DataTree tracks the structured history of draws in some test function, 

555 across multiple ConjectureData objects. 

556 

557 This information is used by ConjectureRunner to generate novel prefixes of 

558 this tree (see generate_novel_prefix). A novel prefix is a sequence of draws 

559 which the tree has not seen before, and therefore the ConjectureRunner has 

560 not generated as an input to the test function before. 

561 

562 DataTree tracks the following: 

563 

564 - Drawn choices in the choice sequence 

565 - ConjectureData.draw_integer() 

566 - ConjectureData.draw_float() 

567 - ConjectureData.draw_string() 

568 - ConjectureData.draw_boolean() 

569 - ConjectureData.draw_bytes() 

570 - Test conclusions (with some Status, e.g. Status.VALID) 

571 - ConjectureData.conclude_test() 

572 

573 A DataTree is — surprise — a *tree*. A node in this tree is either a choice draw 

574 with some value, a test conclusion with some Status, or a special `Killed` value, 

575 which denotes that further draws may exist beyond this node but should not be 

576 considered worth exploring when generating novel prefixes. A node is a leaf 

577 iff it is a conclusion or Killed. 

578 

579 A branch from node A to node B indicates that we have previously seen some 

580 sequence (a, b) of draws, where a and b are the values in nodes A and B. 

581 Similar intuition holds for conclusion and Killed nodes. 

582 

583 Examples 

584 -------- 

585 

586 To see how a DataTree gets built through successive sets of draws, consider 

    the following code that calls through to some ConjectureData object `data`.

588 The first call can be either True or False, and the second call can be any 

589 integer in the range [1, 3]. 

590 

591 data.draw_boolean() 

592 data.draw_integer(1, 3) 

593 

594 To start, the corresponding DataTree object is completely empty. 

595 

596 ┌──────┐ 

597 │ root │ 

598 └──────┘ 

599 

600 We happen to draw True and then 2 in the above code. The tree tracks this. 

601 (2 also connects to a child Conclusion node with Status.VALID since it's the 

602 final draw in the code. I'll omit Conclusion nodes in diagrams for brevity.) 

603 

604 ┌──────┐ 

605 │ root │ 

606 └──┬───┘ 

607 ┌──┴───┐ 

608 │ True │ 

609 └──┬───┘ 

610 ┌──┴───┐ 

611 │ 2 │ 

612 └──────┘ 

613 

614 This is a very boring tree so far! But now we happen to draw False and 

615 then 1. This causes a split in the tree. Remember, DataTree tracks history 

616 over all invocations of a function, not just one. The end goal is to know 

617 what invocations haven't been tried yet, after all. 

618 

619 ┌──────┐ 

620 ┌───┤ root ├───┐ 

621 │ └──────┘ │ 

622 ┌──┴───┐ ┌─┴─────┐ 

623 │ True │ │ False │ 

624 └──┬───┘ └──┬────┘ 

625 ┌─┴─┐ ┌─┴─┐ 

626 │ 2 │ │ 1 │ 

627 └───┘ └───┘ 

628 

629 If we were to ask DataTree for a novel prefix at this point, it might 

630 generate any of (True, 1), (True, 3), (False, 2), or (False, 3). 

631 

632 Note that the novel prefix stops as soon as it generates a novel node. For 

633 instance, if we had generated a novel prefix back when the tree was only 

634 root -> True -> 2, we could have gotten any of (True, 1), (True, 3), or 

635 (False). But we could *not* have gotten (False, n), because both False and 

636 n were novel at that point, and we stop at the first novel node — False. 

637 

638 I won't belabor this example. Here's what the tree looks like when fully 

639 explored: 

640 

641 ┌──────┐ 

642 ┌──────┤ root ├──────┐ 

643 │ └──────┘ │ 

644 ┌──┴───┐ ┌─┴─────┐ 

645 ┌──┤ True ├──┐ ┌───┤ False ├──┐ 

646 │ └──┬───┘ │ │ └──┬────┘ │ 

647 ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ 

648 │ 1 │ │ 2 │ │ 3 │ │ 1 │ │ 2 │ │ 3 │ 

649 └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ 

650 

651 You could imagine much more complicated trees than this arising in practice, 

652 and indeed they do. In particular, the tree need not be balanced or 'nice' 

653 like the tree above. For instance, 

654 

655 b = data.draw_boolean() 

656 if b: 

657 data.draw_integer(1, 3) 

658 

659 results in a tree with the entire right part lopped off, and False leading 

660 straight to a conclusion node with Status.VALID. As another example, 

661 

662 n = data.draw_integers() 

663 assume(n >= 3) 

664 data.draw_string() 

665 

666 results in a tree with the 0, 1, and 2 nodes leading straight to a 

667 conclusion node with Status.INVALID, and the rest branching off into all 

668 the possibilities of draw_string. 

669 

670 Notes 

671 ----- 

672 

673 The above examples are slightly simplified and are intended to convey 

674 intuition. In practice, there are some implementation details to be aware 

675 of. 

676 

677 - In draw nodes, we store the constraints used in addition to the value drawn. 

678 E.g. the node corresponding to data.draw_float(min_value=1.0, max_value=1.5) 

679 would store {"min_value": 1.0, "max_value": 1.5, ...} (default values for 

680 other constraints omitted). 

681 

682 The constraints parameters have the potential to change both the range of 

683 possible outputs of a node, and the probability distribution within that 

684 range, so we need to use these when drawing in DataTree as well. We draw 

685 values using these constraints when (1) generating a novel value for a node 

686 and (2) choosing a random child when traversing the tree. 

687 

688 - For space efficiency, rather than tracking the full tree structure, we 

689 store DataTree as a radix tree. This is conceptually equivalent (radix 

690 trees can always be "unfolded" to the full tree) but it means the internal 

691 representation may differ in practice. 

692 

693 See TreeNode for more information. 

694 """ 

695 

    def __init__(self) -> None:
        # Root of the radix tree; grown lazily as test runs are recorded.
        self.root: TreeNode = TreeNode()
        # Per-node/branch (keyed by id) cache of partially-enumerated children,
        # consulted when random probing keeps hitting already-seen values
        # (see _draw_from_cache / _reject_child usage in generate_novel_prefix).
        self._children_cache: dict[ChoiceT, ChildrenCacheValueT] = {}

699 

    @property
    def is_exhausted(self) -> bool:
        """
        Returns True if every node is exhausted, and therefore the tree has
        been fully explored.
        """
        # The root is only marked exhausted once all of its descendants are
        # (see TreeNode.check_exhausted), so checking the root suffices.
        return self.root.is_exhausted

707 

    def generate_novel_prefix(self, random: Random) -> tuple[ChoiceT, ...]:
        """Generate a short random choice sequence that (after rewriting) is
        not a prefix of any choice sequence previously added to the tree.

        The resulting prefix is essentially arbitrary - it would be nice
        for it to be uniform at random, but previous attempts to do that
        have proven too expensive.
        """
        assert not self.is_exhausted
        prefix = []

        def append_choice(choice_type: ChoiceTypeT, choice: ChoiceT) -> None:
            # Floats are handled internally as their integer bit
            # representation; convert back before writing to the prefix.
            if choice_type == "float":
                assert isinstance(choice, int)
                choice = int_to_float(choice)
            prefix.append(choice)

        current_node = self.root
        while True:
            assert not current_node.is_exhausted
            # Walk the draws compacted into this node (see TreeNode).
            for i, (choice_type, constraints, value) in enumerate(
                zip(
                    current_node.choice_types,
                    current_node.constraints,
                    current_node.values,
                )
            ):
                if i in current_node.forced:
                    append_choice(choice_type, value)
                else:
                    # Any value different from the recorded one is novel here,
                    # because a compacted node has exactly one recorded child.
                    attempts = 0
                    while True:
                        if attempts <= 10:
                            # First try random draws; cheap when the value
                            # space is large.
                            try:
                                node_value = self._draw(
                                    choice_type, constraints, random=random
                                )
                            except StopTest:  # pragma: no cover
                                # it is possible that drawing from a fresh data can
                                # overrun BUFFER_SIZE, due to eg unlucky rejection sampling
                                # of integer probes. Retry these cases.
                                attempts += 1
                                continue
                        else:
                            # Too many collisions; fall back to the cached
                            # enumeration of children for this node.
                            node_value = self._draw_from_cache(
                                choice_type,
                                constraints,
                                key=id(current_node),
                                random=random,
                            )

                        if node_value != value:
                            append_choice(choice_type, node_value)
                            break
                        attempts += 1
                        self._reject_child(
                            choice_type,
                            constraints,
                            child=node_value,
                            key=id(current_node),
                        )
                    # We've now found a value that is allowed to
                    # vary, so what follows is not fixed.
                    return tuple(prefix)
            else:
                # The for loop never breaks (the unforced case returns), so
                # this else runs iff every compacted draw above was forced.
                assert not isinstance(current_node.transition, (Conclusion, Killed))
                if current_node.transition is None:
                    return tuple(prefix)
                branch = current_node.transition
                assert isinstance(branch, Branch)

                attempts = 0
                while True:
                    # Same probing strategy as above: random draws first,
                    # cached enumeration after repeated collisions.
                    if attempts <= 10:
                        try:
                            node_value = self._draw(
                                branch.choice_type, branch.constraints, random=random
                            )
                        except StopTest:  # pragma: no cover
                            attempts += 1
                            continue
                    else:
                        node_value = self._draw_from_cache(
                            branch.choice_type,
                            branch.constraints,
                            key=id(branch),
                            random=random,
                        )
                    try:
                        child = branch.children[node_value]
                    except KeyError:
                        # Unseen child: the prefix is novel, and we stop here.
                        append_choice(branch.choice_type, node_value)
                        return tuple(prefix)
                    if not child.is_exhausted:
                        # Seen but not fully explored: descend and keep going.
                        append_choice(branch.choice_type, node_value)
                        current_node = child
                        break
                    attempts += 1
                    self._reject_child(
                        branch.choice_type,
                        branch.constraints,
                        child=node_value,
                        key=id(branch),
                    )

                    # We don't expect this assertion to ever fire, but coverage
                    # wants the loop inside to run if you have branch checking
                    # on, hence the pragma.
                    assert (  # pragma: no cover
                        attempts != 1000
                        or len(branch.children) < branch.max_children
                        or any(not v.is_exhausted for v in branch.children.values())
                    )

821 

822 def rewrite(self, choices): 

823 """Use previously seen ConjectureData objects to return a tuple of 

824 the rewritten choice sequence and the status we would get from running 

825 that with the test function. If the status cannot be predicted 

826 from the existing values it will be None.""" 

827 data = ConjectureData.for_choices(choices) 

828 try: 

829 self.simulate_test_function(data) 

830 return (data.choices, data.status) 

831 except PreviouslyUnseenBehaviour: 

832 return (choices, None) 

833 

    def simulate_test_function(self, data: ConjectureData) -> None:
        """Run a simulated version of the test function recorded by
        this tree. Note that this does not currently call ``stop_span``
        or ``start_span`` as these are not currently recorded in the
        tree. This will likely change in future.

        Raises PreviouslyUnseenBehaviour if the simulation reaches a draw
        or transition the tree has not recorded.
        """
        node = self.root

        def draw(choice_type, constraints, *, forced=None, convert_forced=True):
            # Floats travel through the tree as integer bit patterns (see
            # DataTree._draw): convert forced values in, and drawn values
            # back out, at this boundary.
            if choice_type == "float" and forced is not None and convert_forced:
                forced = int_to_float(forced)

            draw_func = getattr(data, f"draw_{choice_type}")
            value = draw_func(**constraints, forced=forced)

            if choice_type == "float":
                value = float_to_int(value)
            return value

        try:
            # Walk the tree, replaying each recorded draw against `data`,
            # until a transition ends the run (StopTest, caught below) or we
            # step off the recorded tree (PreviouslyUnseenBehaviour).
            while True:
                for i, (choice_type, constraints, previous) in enumerate(
                    zip(node.choice_types, node.constraints, node.values)
                ):
                    v = draw(
                        choice_type,
                        constraints,
                        forced=previous if i in node.forced else None,
                    )
                    if v != previous:
                        raise PreviouslyUnseenBehaviour
                if isinstance(node.transition, Conclusion):
                    t = node.transition
                    data.conclude_test(t.status, t.interesting_origin)
                elif node.transition is None:
                    raise PreviouslyUnseenBehaviour
                elif isinstance(node.transition, Branch):
                    v = draw(node.transition.choice_type, node.transition.constraints)
                    try:
                        node = node.transition.children[v]
                    except KeyError as err:
                        raise PreviouslyUnseenBehaviour from err
                else:
                    assert isinstance(node.transition, Killed)
                    data.observer.kill_branch()
                    node = node.transition.next_node
        except StopTest:
            # The simulated run finished (e.g. via conclude_test above).
            pass

881 

882 def new_observer(self): 

883 return TreeRecordingObserver(self) 

884 

885 def _draw( 

886 self, 

887 choice_type: ChoiceTypeT, 

888 constraints: ChoiceConstraintsT, 

889 *, 

890 random: Random, 

891 ) -> ChoiceT: 

892 from hypothesis.internal.conjecture.data import draw_choice 

893 

894 value = draw_choice(choice_type, constraints, random=random) 

895 # using floats as keys into branch.children breaks things, because 

896 # e.g. hash(0.0) == hash(-0.0) would collide as keys when they are 

897 # in fact distinct child branches. 

898 # To distinguish floats here we'll use their bits representation. This 

899 # entails some bookkeeping such that we're careful about when the 

900 # float key is in its bits form (as a key into branch.children) and 

901 # when it is in its float form (as a value we want to write to the 

902 # choice sequence), and converting between the two forms as appropriate. 

903 if choice_type == "float": 

904 assert isinstance(value, float) 

905 value = float_to_int(value) 

906 return value 

907 

908 def _get_children_cache( 

909 self, choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT, *, key: ChoiceT 

910 ) -> ChildrenCacheValueT: 

911 # cache the state of the children generator per node/branch (passed as 

912 # `key` here), such that we track which children we've already tried 

913 # for this branch across draws. 

914 # We take advantage of python generators here as one-way iterables, 

915 # so each time we iterate we implicitly store our position in the 

916 # children generator and don't re-draw children. `children` is the 

917 # concrete list of children draw from the generator that we will work 

918 # with. Whenever we need to top up this list, we will draw a new value 

919 # from the generator. 

920 if key not in self._children_cache: 

921 generator = all_children(choice_type, constraints) 

922 children: list[ChoiceT] = [] 

923 rejected: set[ChoiceT] = set() 

924 self._children_cache[key] = (generator, children, rejected) 

925 

926 return self._children_cache[key] 

927 

928 def _draw_from_cache( 

929 self, 

930 choice_type: ChoiceTypeT, 

931 constraints: ChoiceConstraintsT, 

932 *, 

933 key: ChoiceT, 

934 random: Random, 

935 ) -> ChoiceT: 

936 (generator, children, rejected) = self._get_children_cache( 

937 choice_type, constraints, key=key 

938 ) 

939 # Keep a stock of 100 potentially-valid children at all times. 

940 # This number is chosen to balance memory/speed vs randomness. Ideally 

941 # we would sample uniformly from all not-yet-rejected children, but 

942 # computing and storing said children is not free. 

943 # no-branch because coverage of the fall-through case here is a bit 

944 # annoying. 

945 if len(children) < 100: # pragma: no branch 

946 for v in generator: 

947 if choice_type == "float": 

948 assert isinstance(v, float) 

949 v = float_to_int(v) 

950 if v in rejected: 

951 continue 

952 children.append(v) 

953 if len(children) >= 100: 

954 break 

955 

956 return random.choice(children) 

957 

958 def _reject_child( 

959 self, 

960 choice_type: ChoiceTypeT, 

961 constraints: ChoiceConstraintsT, 

962 *, 

963 child: ChoiceT, 

964 key: ChoiceT, 

965 ) -> None: 

966 (_generator, children, rejected) = self._get_children_cache( 

967 choice_type, constraints, key=key 

968 ) 

969 rejected.add(child) 

970 # we remove a child from the list of possible children *only* when it is 

971 # rejected, and not when it is initially drawn in _draw_from_cache. The 

972 # reason is that a child being drawn does not guarantee that child will 

973 # be used in a way such that it is written back to the tree, so it needs 

974 # to be available for future draws until we are certain it has been 

975 # used. 

976 # 

977 # For instance, if we generated novel prefixes in a loop (but never used 

978 # those prefixes to generate new values!) then we don't want to remove 

979 # the drawn children from the available pool until they are actually 

980 # used. 

981 # 

982 # This does result in a small inefficiency: we may draw a child, 

983 # immediately use it (so we know it cannot be drawn again), but still 

984 # wait to draw and reject it here, because DataTree cannot guarantee 

985 # the drawn child has been used. 

986 if child in children: 

987 children.remove(child) 

988 

989 def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None: 

990 assert cycle is False 

991 p.pretty(self.root) 

992 

993 

class TreeRecordingObserver(DataObserver):
    """DataObserver that records every draw of a test run into a DataTree.

    Maintains a cursor into the tree (``_current_node`` plus
    ``_index_in_current_node``) which advances with each recorded draw,
    splitting nodes into branches where runs diverge, and a ``_trail`` of
    visited nodes which is walked backwards at the end of the run to
    propagate exhaustion information up the tree.
    """

    def __init__(self, tree: DataTree):
        # this attr isn't read, but is very useful for local debugging flaky
        # errors, with
        # `from hypothesis.vendor import pretty; print(pretty.pretty(self._root))`
        self._root = tree.root
        # The node the next draw will be recorded into.
        self._current_node: TreeNode = tree.root
        # Position of the next draw within _current_node's recorded values.
        self._index_in_current_node: int = 0
        # Every node visited during this run, in order.
        self._trail: list[TreeNode] = [self._current_node]
        # True once kill_branch has marked this path as not worth exploring.
        self.killed: bool = False

    # The typed draw hooks below all funnel into draw_value with the
    # appropriate choice_type tag.

    def draw_integer(
        self, value: int, *, was_forced: bool, constraints: IntegerConstraints
    ) -> None:
        self.draw_value(
            "integer", value, was_forced=was_forced, constraints=constraints
        )

    def draw_float(
        self, value: float, *, was_forced: bool, constraints: FloatConstraints
    ) -> None:
        self.draw_value("float", value, was_forced=was_forced, constraints=constraints)

    def draw_string(
        self, value: str, *, was_forced: bool, constraints: StringConstraints
    ) -> None:
        self.draw_value("string", value, was_forced=was_forced, constraints=constraints)

    def draw_bytes(
        self, value: bytes, *, was_forced: bool, constraints: BytesConstraints
    ) -> None:
        self.draw_value("bytes", value, was_forced=was_forced, constraints=constraints)

    def draw_boolean(
        self, value: bool, *, was_forced: bool, constraints: BooleanConstraints
    ) -> None:
        self.draw_value(
            "boolean", value, was_forced=was_forced, constraints=constraints
        )

    def draw_value(
        self,
        choice_type: ChoiceTypeT,
        value: ChoiceT,
        *,
        was_forced: bool,
        constraints: ChoiceConstraintsT,
    ) -> None:
        """Record a single draw of ``value`` at the current tree position.

        Either checks the draw against recorded history (raising
        FlakyStrategyDefinition on a type/constraints/forcedness mismatch,
        and splitting the node into a Branch on a value mismatch), appends
        a brand-new value to the current node, or follows/creates a child
        of an existing transition.
        """
        i = self._index_in_current_node
        self._index_in_current_node += 1
        node = self._current_node

        # Floats are stored and keyed by their bit pattern, so that e.g.
        # 0.0 and -0.0 (equal, with equal hashes) stay distinct children.
        if isinstance(value, float):
            value = float_to_int(value)

        assert len(node.constraints) == len(node.values) == len(node.choice_types)
        if i < len(node.values):
            # We are replaying a position this node has already recorded:
            # the draw must agree with history on type and constraints.
            if (
                choice_type != node.choice_types[i]
                or constraints != node.constraints[i]
            ):
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            # Note that we don't check whether a previously
            # forced value is now free. That will be caught
            # if we ever split the node there, but otherwise
            # may pass silently. This is acceptable because it
            # means we skip a hash set lookup on every
            # draw and that's a pretty niche failure mode.
            if was_forced and i not in node.forced:
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            if value != node.values[i]:
                # The run diverges from history here: split the node into a
                # Branch at i and start a fresh child node for the new value.
                node.split_at(i)
                assert i == len(node.values)
                new_node = TreeNode()
                assert isinstance(node.transition, Branch)
                node.transition.children[value] = new_node
                self._current_node = new_node
                self._index_in_current_node = 0
        else:
            trans = node.transition
            if trans is None:
                # First time anything has been drawn past the end of this
                # node: extend it with the new draw.
                node.choice_types.append(choice_type)
                node.constraints.append(constraints)
                node.values.append(value)
                if was_forced:
                    node.mark_forced(i)
                # generate_novel_prefix assumes the following invariant: any one
                # of the series of draws in a particular node can vary, i.e. the
                # max number of children is at least 2. However, some draws are
                # pseudo-choices and only have a single value, such as
                # integers(0, 0).
                #
                # Currently, we address this by forcefully splitting such
                # single-valued nodes into a transition when we see them. An
                # exception to this is if it was forced: forced pseudo-choices
                # do not cause the above issue because they inherently cannot
                # vary, and moreover they trip other invariants about never
                # splitting forced nodes.
                #
                # An alternative is not writing such choices to the tree at
                # all, and thus guaranteeing that each node has at least 2 max
                # children.
                if (
                    compute_max_children(choice_type, constraints) == 1
                    and not was_forced
                ):
                    node.split_at(i)
                    assert isinstance(node.transition, Branch)
                    self._current_node = node.transition.children[value]
                    self._index_in_current_node = 0
            elif isinstance(trans, Conclusion):
                assert trans.status != Status.OVERRUN
                # We tried to draw where history says we should have
                # stopped
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            else:
                assert isinstance(trans, Branch), trans
                # Following an existing branch: type and constraints must match.
                if choice_type != trans.choice_type or constraints != trans.constraints:
                    raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
                try:
                    self._current_node = trans.children[value]
                except KeyError:
                    # First time this value has been seen at this branch.
                    self._current_node = trans.children.setdefault(value, TreeNode())
                self._index_in_current_node = 0
        if self._trail[-1] is not self._current_node:
            self._trail.append(self._current_node)

    def kill_branch(self) -> None:
        """Mark this part of the tree as not worth re-exploring."""
        # Idempotent: only the first kill in a run has any effect.
        if self.killed:
            return

        self.killed = True

        # A kill must occur exactly at the end of a node's recorded draws,
        # and any existing transition there must itself be a Killed one;
        # anything else means the test behaved differently from history.
        if self._index_in_current_node < len(self._current_node.values) or (
            self._current_node.transition is not None
            and not isinstance(self._current_node.transition, Killed)
        ):
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        if self._current_node.transition is None:
            self._current_node.transition = Killed(TreeNode())
            self.__update_exhausted()

        # Continue recording into the node that follows the kill.
        self._current_node = self._current_node.transition.next_node
        self._index_in_current_node = 0
        self._trail.append(self._current_node)

    def conclude_test(
        self, status: Status, interesting_origin: Optional[InterestingOrigin]
    ) -> None:
        """Says that ``status`` occurred at node ``node``. This updates the
        node if necessary and checks for consistency."""
        # OVERRUN runs carry no information worth recording in the tree.
        if status == Status.OVERRUN:
            return
        i = self._index_in_current_node
        node = self._current_node

        # Concluding mid-node, or where history says a branch continues,
        # means the test stopped earlier than it did before.
        if i < len(node.values) or isinstance(node.transition, Branch):
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        new_transition = Conclusion(status, interesting_origin)

        if node.transition is not None and node.transition != new_transition:
            # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
            # where tests go from interesting to valid, because it's much easier
            # to produce good error messages for these further up the stack.
            if isinstance(node.transition, Conclusion) and (
                node.transition.status != Status.INTERESTING
                or new_transition.status != Status.VALID
            ):
                old_origin = node.transition.interesting_origin
                new_origin = new_transition.interesting_origin
                raise FlakyReplay(
                    f"Inconsistent results from replaying a test case!\n"
                    f" last: {node.transition.status.name} from {old_origin}\n"
                    f" this: {new_transition.status.name} from {new_origin}",
                    (old_origin, new_origin),
                )
        else:
            node.transition = new_transition

        assert node is self._trail[-1]
        node.check_exhausted()
        assert len(node.values) > 0 or node.check_exhausted()

        # A killed run already propagated exhaustion in kill_branch.
        if not self.killed:
            self.__update_exhausted()

    def __update_exhausted(self) -> None:
        """Propagate exhaustion from the end of the trail back towards the root."""
        for t in reversed(self._trail):
            # Any node we've traversed might have now become exhausted.
            # We check from the right. As soon as we hit a node that
            # isn't exhausted, this automatically implies that all of
            # its parents are not exhausted, so we stop.
            if not t.check_exhausted():
                break