Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/internal/conjecture/datatree.py: 19%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

417 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import math 

12from collections.abc import Generator, Set 

13from dataclasses import dataclass, field 

14from random import Random 

15from typing import TYPE_CHECKING, Final, TypeAlias, cast 

16 

17from hypothesis.errors import ( 

18 FlakyReplay, 

19 FlakyStrategyDefinition, 

20 HypothesisException, 

21 StopTest, 

22) 

23from hypothesis.internal import floats as flt 

24from hypothesis.internal.conjecture.choice import ( 

25 BooleanConstraints, 

26 BytesConstraints, 

27 ChoiceConstraintsT, 

28 ChoiceT, 

29 ChoiceTypeT, 

30 FloatConstraints, 

31 IntegerConstraints, 

32 StringConstraints, 

33 choice_from_index, 

34) 

35from hypothesis.internal.conjecture.data import ConjectureData, DataObserver, Status 

36from hypothesis.internal.escalation import InterestingOrigin 

37from hypothesis.internal.floats import ( 

38 count_between_floats, 

39 float_to_int, 

40 int_to_float, 

41 sign_aware_lte, 

42) 

43 

44if TYPE_CHECKING: 

45 from hypothesis.vendor.pretty import RepresentationPrinter 

46 

# Value type of DataTree._children_cache. The tuple holds a generator of
# children for a node, a list of values, and a set of values; presumably the
# list is the children enumerated so far and the set tracks rejected children
# (populated via _reject_child) — confirm against _get_children_cache.
ChildrenCacheValueT: TypeAlias = tuple[
    Generator[ChoiceT, None, None], list[ChoiceT], set[ChoiceT]
]

50 

51 

class PreviouslyUnseenBehaviour(HypothesisException):
    """Raised when simulating a test function requires a draw or transition
    that this tree has not recorded before (see
    DataTree.simulate_test_function and DataTree.rewrite)."""

    pass

54 

55 

# Error message for FlakyStrategyDefinition, raised when the same test run
# produces a different draw structure than previously recorded (see
# TreeNode.split_at).
_FLAKY_STRAT_MSG = (
    "Inconsistent data generation! Data generation behaved differently "
    "between different runs. Is your data generation depending on external "
    "state?"
)


# Shared immutable empty set returned by TreeNode.forced when no draws were
# forced, so we avoid allocating a fresh set per node.
EMPTY: frozenset[int] = frozenset()

64 

65 

@dataclass(slots=True, frozen=True)
class Killed:
    """Represents a transition to part of the tree which has been marked as
    "killed", meaning we want to treat it as not worth exploring, so it will
    be treated as if it were completely explored for the purposes of
    exhaustion."""

    # The node on the far side of this transition. It may have children of
    # its own, but generate_novel_prefix never descends through a Killed
    # transition.
    next_node: "TreeNode"

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        # The tree is acyclic, so the pretty-printer should never report a
        # cycle here.
        assert cycle is False
        p.text("Killed")

78 

79 

def _node_pretty(
    choice_type: ChoiceTypeT,
    value: ChoiceT,
    constraints: ChoiceConstraintsT,
    *,
    forced: bool,
) -> str:
    """Render one drawn node as a single display line: the choice type, the
    repr of the value, an optional ``[forced]`` marker, and the constraints."""
    if forced:
        return f"{choice_type} {value!r} [forced] {constraints}"
    return f"{choice_type} {value!r} {constraints}"

89 

90 

@dataclass(slots=True, frozen=False)
class Branch:
    """Represents a transition where multiple choices can be made as to what
    to draw."""

    constraints: ChoiceConstraintsT
    choice_type: ChoiceTypeT
    # Maps each previously seen value at this choice point to the subtree
    # that followed it. Floats are keyed by their bit pattern (see
    # DataTree._draw) so that 0.0 and -0.0 stay distinct keys.
    children: dict[ChoiceT, "TreeNode"] = field(repr=False)

    @property
    def max_children(self) -> int:
        # Total number of distinct values drawable at this branch; used by
        # TreeNode.check_exhausted to decide whether every child has been seen.
        max_children = compute_max_children(self.choice_type, self.constraints)
        assert max_children > 0
        return max_children

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        # The tree is acyclic, so the pretty-printer should never report a
        # cycle here.
        assert cycle is False
        for i, (value, child) in enumerate(self.children.items()):
            if i > 0:
                p.break_()
            p.text(
                _node_pretty(self.choice_type, value, self.constraints, forced=False)
            )
            with p.indent(2):
                p.break_()
                p.pretty(child)

117 

118 

@dataclass(slots=True, frozen=True)
class Conclusion:
    """Represents a transition to a finished state."""

    status: Status
    interesting_origin: InterestingOrigin | None

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        assert cycle is False
        if self.interesting_origin is None:
            origin = ""
        else:
            # avoid str(origin_info), which can include multiple lines of context
            origin_info = self.interesting_origin
            origin = (
                f", {origin_info.exc_type.__name__} at "
                f"{origin_info.filename}:{origin_info.lineno}"
            )
        p.text(f"Conclusion ({self.status!r}{origin})")

134 

135 

# The number of max children where, beyond this, it is practically impossible
# for hypothesis to saturate / explore all children nodes in a reasonable time
# frame. We use this to bail out of expensive max children computations early,
# where the numbers involved are so large that we know they will be larger than
# this number.
#
# Note that it's ok for us to underestimate the number of max children of a node
# by using this. We just may think the node is exhausted when in fact it has more
# possible children to be explored. This has the potential to finish generation
# early due to exhausting the entire tree, but that is quite unlikely: (1) the
# number of examples would have to be quite high, and (2) the tree would have to
# contain only one or two nodes, or generate_novel_prefix would simply switch to
# exploring another non-exhausted node.
#
# Also note that we may sometimes compute max children above this value. In other
# words, this is *not* a hard maximum on the computed max children. It's the point
# where further computation is not beneficial - but sometimes doing that computation
# unconditionally is cheaper than estimating against this value.
#
# The one case where this may be detrimental is fuzzing, where the throughput of
# examples is so high that it really may saturate important nodes. We'll cross
# that bridge when we come to it.
#
# (Used by _count_distinct_strings as an early-exit cap; see also
# Branch.max_children / TreeNode.check_exhausted for where the count matters.)
MAX_CHILDREN_EFFECTIVELY_INFINITE: Final[int] = 10_000_000

159 

160 

161def _count_distinct_strings(*, alphabet_size: int, min_size: int, max_size: int) -> int: 

162 # We want to estimate if we're going to have more children than 

163 # MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially 

164 # extremely expensive pow. We'll check the two extreme cases - if the 

165 # number of strings in the largest string size alone is enough to put us 

166 # over this limit (at alphabet_size >= 2), and if the variation in sizes 

167 # (at alphabet_size == 1) is enough. If neither result in an early return, 

168 # the exact result should be reasonably cheap to compute. 

169 if alphabet_size == 0: 

170 # Special-case the empty string, avoid error in math.log(0). 

171 return 1 

172 elif alphabet_size == 1: 

173 # Special-case the constant alphabet, invalid in the geom-series sum. 

174 return max_size - min_size + 1 

175 else: 

176 # Estimate against log, which is cheaper than computing a pow. 

177 # 

178 # m = max_size 

179 # a = alphabet_size 

180 # N = MAX_CHILDREN_EFFECTIVELY_INFINITE 

181 # 

182 # a**m > N 

183 # <=> m * log(a) > log(N) 

184 log_max_sized_children = max_size * math.log(alphabet_size) 

185 if log_max_sized_children > math.log(MAX_CHILDREN_EFFECTIVELY_INFINITE): 

186 return MAX_CHILDREN_EFFECTIVELY_INFINITE 

187 

188 # The sum of a geometric series is given by (ref: wikipedia): 

189 # ᵐ∑ₖ₌₀ aᵏ = (aᵐ⁺¹ - 1) / (a - 1) 

190 # = S(m) / S(0) 

191 # assuming a != 1 and using the definition 

192 # S(m) := aᵐ⁺¹ - 1. 

193 # The sum we want, starting from a number n [0 <= n <= m] rather than zero, is 

194 # ᵐ∑ₖ₌ₙ aᵏ = ᵐ∑ₖ₌₀ aᵏ - ⁿ⁻¹∑ₖ₌₀ aᵏ = S(m) / S(0) - S(n - 1) / S(0) 

195 def S(n): 

196 return alphabet_size ** (n + 1) - 1 

197 

198 return (S(max_size) - S(min_size - 1)) // S(0) 

199 

200 

def compute_max_children(
    choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT
) -> int:
    """Return the number of distinct values drawable for ``choice_type``
    under ``constraints``.

    Large string/bytes spaces may be capped at
    MAX_CHILDREN_EFFECTIVELY_INFINITE (via _count_distinct_strings); see the
    comment on that constant for why an underestimate is acceptable.

    Raises NotImplementedError for an unknown choice type.
    """
    if choice_type == "integer":
        constraints = cast(IntegerConstraints, constraints)
        min_value = constraints["min_value"]
        max_value = constraints["max_value"]

        if min_value is None and max_value is None:
            # full 128 bit range.
            return 2**128 - 1
        if min_value is not None and max_value is not None:
            # count between min/max value.
            return max_value - min_value + 1

        # hard case: only one bound was specified. Here we probe either upwards
        # or downwards with our full 128 bit generation, but only half of these
        # (plus one for the case of generating zero) result in a probe in the
        # direction we want. ((2**128 - 1) // 2) + 1 == 2 ** 127
        assert (min_value is None) != (max_value is None)
        return 2**127
    elif choice_type == "boolean":
        constraints = cast(BooleanConstraints, constraints)
        p = constraints["p"]
        # probabilities of 0 or 1 (or effectively 0 or 1) only have one choice.
        if p <= 2 ** (-64) or p >= (1 - 2 ** (-64)):
            return 1
        return 2
    elif choice_type == "bytes":
        constraints = cast(BytesConstraints, constraints)
        # bytes are strings over the 256-symbol alphabet of byte values.
        return _count_distinct_strings(
            alphabet_size=2**8,
            min_size=constraints["min_size"],
            max_size=constraints["max_size"],
        )
    elif choice_type == "string":
        constraints = cast(StringConstraints, constraints)
        min_size = constraints["min_size"]
        max_size = constraints["max_size"]
        intervals = constraints["intervals"]

        return _count_distinct_strings(
            alphabet_size=len(intervals), min_size=min_size, max_size=max_size
        )
    elif choice_type == "float":
        constraints = cast(FloatConstraints, constraints)
        min_value_f = constraints["min_value"]
        max_value_f = constraints["max_value"]
        smallest_nonzero_magnitude = constraints["smallest_nonzero_magnitude"]

        count = count_between_floats(min_value_f, max_value_f)

        # we have two intervals:
        # a. [min_value, max_value]
        # b. [-smallest_nonzero_magnitude, smallest_nonzero_magnitude]
        #
        # which could be subsets (in either order), overlapping, or disjoint. We
        # want the interval difference a - b.

        # next_down because endpoints are ok with smallest_nonzero_magnitude
        min_point = max(min_value_f, -flt.next_down(smallest_nonzero_magnitude))
        max_point = min(max_value_f, flt.next_down(smallest_nonzero_magnitude))

        if min_point > max_point:
            # case: disjoint intervals.
            return count

        # Subtracting the overlap also removed -0.0/0.0 (which are always
        # drawable regardless of smallest_nonzero_magnitude), so add them
        # back when they lie inside [min_value, max_value].
        count -= count_between_floats(min_point, max_point)
        if sign_aware_lte(min_value_f, -0.0) and sign_aware_lte(-0.0, max_value_f):
            # account for -0.0
            count += 1
        if sign_aware_lte(min_value_f, 0.0) and sign_aware_lte(0.0, max_value_f):
            # account for 0.0
            count += 1
        return count

    raise NotImplementedError(f"unhandled choice_type {choice_type}")

278 

279 

280# In theory, this is a strict superset of the functionality of compute_max_children; 

281# 

282# assert len(all_children(choice_type, constraints)) == compute_max_children(choice_type, constraints) 

283# 

284# In practice, we maintain two distinct implementations for efficiency and space 

285# reasons. If you just need the number of children, it is cheaper to use 

286# compute_max_children than to reify the list of children (only to immediately 

287# throw it away). 

def _floats_between(a: float, b: float) -> Generator[float, None, None]:
    """Yield every representable float from ``a`` through ``b`` inclusive,
    walking their underlying bit representations in order."""
    lo = float_to_int(a)
    hi = float_to_int(b)
    for bits in range(lo, hi + 1):
        yield int_to_float(bits)

291 

292 

def all_children(
    choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT
) -> Generator[ChoiceT, None, None]:
    """Yield every distinct value drawable for ``choice_type`` under
    ``constraints``, with no duplicates."""
    if choice_type != "float":
        # Non-float types have a bijective index ordering, so just walk it.
        for index in range(compute_max_children(choice_type, constraints)):
            yield choice_from_index(index, choice_type, constraints)
        return

    constraints = cast(FloatConstraints, constraints)
    # the float ordering is not injective (because of resampling
    # out-of-bounds values), so using choice_from_index would result in
    # duplicates. This violates invariants in datatree about being able
    # to draw unique new children using all_children.
    #
    # We instead maintain a separate implementation for floats.
    # TODO_IR write a better (bijective) ordering for floats and remove this!
    min_value = constraints["min_value"]
    max_value = constraints["max_value"]
    smallest = constraints["smallest_nonzero_magnitude"]

    # Emit the zeroes first so that the nonzero ranges below can treat
    # smallest_nonzero_magnitude as bounding complete intervals, with no
    # hole at ±0.
    if sign_aware_lte(min_value, -0.0) and sign_aware_lte(-0.0, max_value):
        yield -0.0
    if sign_aware_lte(min_value, 0.0) and sign_aware_lte(0.0, max_value):
        yield 0.0

    if not flt.is_negative(min_value):
        # case: both positive.
        yield from _floats_between(max(min_value, smallest), max_value)
    elif flt.is_negative(max_value):
        # case: both negative. float_to_int increases as negative magnitude
        # increases, so iterate from the endpoint nearest zero down to
        # min_value.
        yield from _floats_between(min(max_value, -smallest), min_value)
    else:
        # case: straddles midpoint (which is between -0.0 and 0.0) — negative
        # side first, then positive.
        yield from _floats_between(-smallest, min_value)
        yield from _floats_between(smallest, max_value)

334 

335 

@dataclass(slots=True, frozen=False)
class TreeNode:
    """
    A node, or collection of directly descended nodes, in a DataTree.

    We store the DataTree as a radix tree (https://en.wikipedia.org/wiki/Radix_tree),
    which means that nodes that are the only child of their parent are collapsed
    into their parent to save space.

    Conceptually, you can unfold a single TreeNode storing n values in its lists
    into a sequence of n nodes, each a child of the last. In other words,
    (constraints[i], values[i], choice_types[i]) corresponds to the single node
    at index i.

    Note that if a TreeNode represents a choice (i.e. the nodes cannot be compacted
    via the radix tree definition), then its lists will be empty and it will
    store a `Branch` representing that choice in its `transition`.

    Examples
    --------

    Consider sequentially drawing a boolean, then an integer.

        data.draw_boolean()
        data.draw_integer(1, 3)

    If we draw True and then 2, the tree may conceptually look like this.

        ┌──────┐
        │ root │
        └──┬───┘
        ┌──┴───┐
        │ True │
        └──┬───┘
        ┌──┴───┐
        │  2   │
        └──────┘

    But since 2 is the only child of True, we will compact these nodes and store
    them as a single TreeNode.

          ┌──────┐
          │ root │
          └──┬───┘
        ┌────┴──────┐
        │ [True, 2] │
        └───────────┘

    If we then draw True and then 3, True will have multiple children and we
    can no longer store this compacted representation. We would call split_at(0)
    on the [True, 2] node to indicate that we need to add a choice at 0-index
    node (True).

           ┌──────┐
           │ root │
           └──┬───┘
           ┌──┴───┐
         ┌─┤ True ├─┐
         │ └──────┘ │
       ┌─┴─┐      ┌─┴─┐
       │ 2 │      │ 3 │
       └───┘      └───┘
    """

    # The constraints, value, and choice_types of the nodes stored here. These always
    # have the same length. The values at index i belong to node i.
    constraints: list[ChoiceConstraintsT] = field(default_factory=list)
    values: list[ChoiceT] = field(default_factory=list)
    choice_types: list[ChoiceTypeT] = field(default_factory=list)

    # The indices of nodes which had forced values.
    #
    # Stored as None if no indices have been forced, purely for space saving
    # reasons (we force quite rarely).
    __forced: set[int] | None = field(default=None, init=False)

    # What happens next after drawing these nodes. (conceptually, "what is the
    # child/children of the last node stored here").
    #
    # One of:
    #   - None (we don't know yet)
    #   - Branch (we have seen multiple possible outcomes here)
    #   - Conclusion (ConjectureData.conclude_test was called here)
    #   - Killed (this branch is valid and may even have children, but should not
    #     be explored when generating novel prefixes)
    transition: None | Branch | Conclusion | Killed = None

    # A tree node is exhausted if every possible sequence of draws below it has
    # been explored. We only update this when performing operations that could
    # change the answer.
    #
    # See also TreeNode.check_exhausted.
    is_exhausted: bool = field(default=False, init=False)

    @property
    def forced(self) -> Set[int]:
        """The set of indices whose draws were forced (empty if none were)."""
        # Return the shared EMPTY frozenset rather than allocating, since
        # most nodes never force anything.
        if not self.__forced:
            return EMPTY
        return self.__forced

    def mark_forced(self, i: int) -> None:
        """
        Note that the draw at node i was forced.
        """
        assert 0 <= i < len(self.values)
        # Lazily allocate the backing set; see the __forced field comment.
        if self.__forced is None:
            self.__forced = set()
        self.__forced.add(i)

    def split_at(self, i: int) -> None:
        """
        Splits the tree so that it can incorporate a decision at the draw call
        corresponding to the node at position i.

        Everything after index i is moved into a new child node, and this
        node's transition becomes a Branch keyed by the value previously
        stored at index i.

        Raises FlakyStrategyDefinition if node i was forced.
        """

        # A forced draw cannot be a decision point — seeing a different value
        # here means the test behaved differently between runs.
        if i in self.forced:
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        assert not self.is_exhausted

        key = self.values[i]

        # Everything after index i becomes the (sole, so far) child reached
        # via `key`; it inherits our old transition.
        child = TreeNode(
            choice_types=self.choice_types[i + 1 :],
            constraints=self.constraints[i + 1 :],
            values=self.values[i + 1 :],
            transition=self.transition,
        )
        self.transition = Branch(
            constraints=self.constraints[i],
            choice_type=self.choice_types[i],
            children={key: child},
        )
        if self.__forced is not None:
            # Re-base forced indices: those after i shift down into the
            # child's coordinate system; those before i stay with us.
            child.__forced = {j - i - 1 for j in self.__forced if j > i}
            self.__forced = {j for j in self.__forced if j < i}
        child.check_exhausted()
        del self.choice_types[i:]
        del self.values[i:]
        del self.constraints[i:]
        assert len(self.values) == len(self.constraints) == len(self.choice_types) == i

    def check_exhausted(self) -> bool:
        """
        Recalculates is_exhausted if necessary, and then returns it.

        A node is exhausted if:
        - Its transition is Conclusion or Killed
        - It has the maximum number of children (i.e. we have found all of its
          possible children), and all its children are exhausted

        Therefore, we only need to compute this for a node when:
        - We first create it in split_at
        - We set its transition to either Conclusion or Killed
          (TreeRecordingObserver.conclude_test or TreeRecordingObserver.kill_branch)
        - We exhaust any of its children
        """

        if (
            # a node cannot go from is_exhausted -> not is_exhausted.
            not self.is_exhausted
            # if we don't know what happens after this node, we don't have
            # enough information to tell if it's exhausted.
            and self.transition is not None
            # if there are still any nodes left which are the only child of their
            # parent (len(self.values) > 0), then this TreeNode must be not
            # exhausted, unless all of those nodes were forced.
            #
            # This is because we maintain an invariant of only adding nodes to
            # DataTree which have at least 2 possible values, so we know that if
            # they do not have any siblings that we still have more choices to
            # discover.
            #
            # (We actually *do* currently add single-valued nodes to the tree,
            # but immediately split them into a transition to avoid falsifying
            # this check. this is a bit of a hack.)
            and len(self.forced) == len(self.values)
        ):
            if isinstance(self.transition, (Conclusion, Killed)):
                self.is_exhausted = True
            elif len(self.transition.children) == self.transition.max_children:
                self.is_exhausted = all(
                    v.is_exhausted for v in self.transition.children.values()
                )
        return self.is_exhausted

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        # The tree is acyclic, so the pretty-printer should never report a
        # cycle here.
        assert cycle is False
        indent = 0
        # Print each collapsed node on its own line, indenting two spaces
        # per conceptual tree level.
        for i, (choice_type, constraints, value) in enumerate(
            zip(self.choice_types, self.constraints, self.values, strict=True)
        ):
            with p.indent(indent):
                if i > 0:
                    p.break_()
                p.text(
                    _node_pretty(
                        choice_type, value, constraints, forced=i in self.forced
                    )
                )
            indent += 2

        with p.indent(indent):
            if len(self.values) > 0:
                p.break_()
            if self.transition is not None:
                p.pretty(self.transition)
            else:
                # transition is None: we don't yet know what follows.
                p.text("unknown")

547 

548 

549class DataTree: 

550 """ 

551 A DataTree tracks the structured history of draws in some test function, 

552 across multiple ConjectureData objects. 

553 

554 This information is used by ConjectureRunner to generate novel prefixes of 

555 this tree (see generate_novel_prefix). A novel prefix is a sequence of draws 

556 which the tree has not seen before, and therefore the ConjectureRunner has 

557 not generated as an input to the test function before. 

558 

559 DataTree tracks the following: 

560 

561 - Drawn choices in the choice sequence 

562 - ConjectureData.draw_integer() 

563 - ConjectureData.draw_float() 

564 - ConjectureData.draw_string() 

565 - ConjectureData.draw_boolean() 

566 - ConjectureData.draw_bytes() 

567 - Test conclusions (with some Status, e.g. Status.VALID) 

568 - ConjectureData.conclude_test() 

569 

570 A DataTree is — surprise — a *tree*. A node in this tree is either a choice draw 

571 with some value, a test conclusion with some Status, or a special `Killed` value, 

572 which denotes that further draws may exist beyond this node but should not be 

573 considered worth exploring when generating novel prefixes. A node is a leaf 

574 iff it is a conclusion or Killed. 

575 

576 A branch from node A to node B indicates that we have previously seen some 

577 sequence (a, b) of draws, where a and b are the values in nodes A and B. 

578 Similar intuition holds for conclusion and Killed nodes. 

579 

580 Examples 

581 -------- 

582 

583 To see how a DataTree gets built through successive sets of draws, consider 

584 the following code that calls through to some ConjectureData object `data`. 

585 The first call can be either True or False, and the second call can be any 

586 integer in the range [1, 3]. 

587 

588 data.draw_boolean() 

589 data.draw_integer(1, 3) 

590 

591 To start, the corresponding DataTree object is completely empty. 

592 

593 ┌──────┐ 

594 │ root │ 

595 └──────┘ 

596 

597 We happen to draw True and then 2 in the above code. The tree tracks this. 

598 (2 also connects to a child Conclusion node with Status.VALID since it's the 

599 final draw in the code. I'll omit Conclusion nodes in diagrams for brevity.) 

600 

601 ┌──────┐ 

602 │ root │ 

603 └──┬───┘ 

604 ┌──┴───┐ 

605 │ True │ 

606 └──┬───┘ 

607 ┌──┴───┐ 

608 │ 2 │ 

609 └──────┘ 

610 

611 This is a very boring tree so far! But now we happen to draw False and 

612 then 1. This causes a split in the tree. Remember, DataTree tracks history 

613 over all invocations of a function, not just one. The end goal is to know 

614 what invocations haven't been tried yet, after all. 

615 

616 ┌──────┐ 

617 ┌───┤ root ├───┐ 

618 │ └──────┘ │ 

619 ┌──┴───┐ ┌─┴─────┐ 

620 │ True │ │ False │ 

621 └──┬───┘ └──┬────┘ 

622 ┌─┴─┐ ┌─┴─┐ 

623 │ 2 │ │ 1 │ 

624 └───┘ └───┘ 

625 

626 If we were to ask DataTree for a novel prefix at this point, it might 

627 generate any of (True, 1), (True, 3), (False, 2), or (False, 3). 

628 

629 Note that the novel prefix stops as soon as it generates a novel node. For 

630 instance, if we had generated a novel prefix back when the tree was only 

631 root -> True -> 2, we could have gotten any of (True, 1), (True, 3), or 

632 (False). But we could *not* have gotten (False, n), because both False and 

633 n were novel at that point, and we stop at the first novel node — False. 

634 

635 I won't belabor this example. Here's what the tree looks like when fully 

636 explored: 

637 

638 ┌──────┐ 

639 ┌──────┤ root ├──────┐ 

640 │ └──────┘ │ 

641 ┌──┴───┐ ┌─┴─────┐ 

642 ┌──┤ True ├──┐ ┌───┤ False ├──┐ 

643 │ └──┬───┘ │ │ └──┬────┘ │ 

644 ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ ┌─┴─┐ 

645 │ 1 │ │ 2 │ │ 3 │ │ 1 │ │ 2 │ │ 3 │ 

646 └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ 

647 

648 You could imagine much more complicated trees than this arising in practice, 

649 and indeed they do. In particular, the tree need not be balanced or 'nice' 

650 like the tree above. For instance, 

651 

652 b = data.draw_boolean() 

653 if b: 

654 data.draw_integer(1, 3) 

655 

656 results in a tree with the entire right part lopped off, and False leading 

657 straight to a conclusion node with Status.VALID. As another example, 

658 

659 n = data.draw_integers() 

660 assume(n >= 3) 

661 data.draw_string() 

662 

663 results in a tree with the 0, 1, and 2 nodes leading straight to a 

664 conclusion node with Status.INVALID, and the rest branching off into all 

665 the possibilities of draw_string. 

666 

667 Notes 

668 ----- 

669 

670 The above examples are slightly simplified and are intended to convey 

671 intuition. In practice, there are some implementation details to be aware 

672 of. 

673 

674 - In draw nodes, we store the constraints used in addition to the value drawn. 

675 E.g. the node corresponding to data.draw_float(min_value=1.0, max_value=1.5) 

676 would store {"min_value": 1.0, "max_value": 1.5, ...} (default values for 

677 other constraints omitted). 

678 

679 The constraints parameters have the potential to change both the range of 

680 possible outputs of a node, and the probability distribution within that 

681 range, so we need to use these when drawing in DataTree as well. We draw 

682 values using these constraints when (1) generating a novel value for a node 

683 and (2) choosing a random child when traversing the tree. 

684 

685 - For space efficiency, rather than tracking the full tree structure, we 

686 store DataTree as a radix tree. This is conceptually equivalent (radix 

687 trees can always be "unfolded" to the full tree) but it means the internal 

688 representation may differ in practice. 

689 

690 See TreeNode for more information. 

691 """ 

692 

    def __init__(self) -> None:
        # Root of the radix tree of recorded draw sequences.
        self.root: TreeNode = TreeNode()
        # Cache of lazily enumerated children, keyed by id() of the node or
        # branch being explored (see generate_novel_prefix's use of
        # _draw_from_cache / _reject_child).
        self._children_cache: dict[ChoiceT, ChildrenCacheValueT] = {}

696 

697 @property 

698 def is_exhausted(self) -> bool: 

699 """ 

700 Returns True if every node is exhausted, and therefore the tree has 

701 been fully explored. 

702 """ 

703 return self.root.is_exhausted 

704 

    def generate_novel_prefix(self, random: Random) -> tuple[ChoiceT, ...]:
        """Generate a short random string that (after rewriting) is not
        a prefix of any choice sequence previously added to the tree.

        The resulting prefix is essentially arbitrary - it would be nice
        for it to be uniform at random, but previous attempts to do that
        have proven too expensive.
        """
        assert not self.is_exhausted
        prefix = []

        def append_choice(choice_type: ChoiceTypeT, choice: ChoiceT) -> None:
            # Floats travel through the tree as their bit patterns (see
            # DataTree._draw); convert back before recording in the prefix.
            if choice_type == "float":
                assert isinstance(choice, int)
                choice = int_to_float(choice)
            prefix.append(choice)

        current_node = self.root
        while True:
            assert not current_node.is_exhausted
            # First walk the collapsed (single-child) draws stored directly
            # on this node, looking for one we are allowed to vary.
            for i, (choice_type, constraints, value) in enumerate(
                zip(
                    current_node.choice_types,
                    current_node.constraints,
                    current_node.values,
                    strict=True,
                )
            ):
                if i in current_node.forced:
                    # Forced draws cannot vary; replay them verbatim.
                    append_choice(choice_type, value)
                else:
                    # Try random draws a few times; after that, fall back to
                    # systematically enumerating children from the cache.
                    attempts = 0
                    while True:
                        if attempts <= 10:
                            try:
                                node_value = self._draw(
                                    choice_type, constraints, random=random
                                )
                            except StopTest:  # pragma: no cover
                                # it is possible that drawing from a fresh data can
                                # overrun BUFFER_SIZE, due to eg unlucky rejection sampling
                                # of integer probes. Retry these cases.
                                attempts += 1
                                continue
                        else:
                            node_value = self._draw_from_cache(
                                choice_type,
                                constraints,
                                key=id(current_node),
                                random=random,
                            )

                        # Any value other than the recorded one is novel here,
                        # because this draw had only a single recorded child.
                        if node_value != value:
                            append_choice(choice_type, node_value)
                            break
                        attempts += 1
                        self._reject_child(
                            choice_type,
                            constraints,
                            child=node_value,
                            key=id(current_node),
                        )
                    # We've now found a value that is allowed to
                    # vary, so what follows is not fixed.
                    return tuple(prefix)

            # All stored draws were replayed; decide how to continue via the
            # node's transition.
            assert not isinstance(current_node.transition, (Conclusion, Killed))
            if current_node.transition is None:
                # Unknown territory beyond this point: the prefix is novel.
                return tuple(prefix)
            branch = current_node.transition
            assert isinstance(branch, Branch)

            # Draw at the branch until we either find an unseen child value
            # (novel prefix ends here) or descend into a non-exhausted child.
            attempts = 0
            while True:
                if attempts <= 10:
                    try:
                        node_value = self._draw(
                            branch.choice_type, branch.constraints, random=random
                        )
                    except StopTest:  # pragma: no cover
                        attempts += 1
                        continue
                else:
                    node_value = self._draw_from_cache(
                        branch.choice_type,
                        branch.constraints,
                        key=id(branch),
                        random=random,
                    )
                try:
                    child = branch.children[node_value]
                except KeyError:
                    # Never-seen child value: the prefix is novel.
                    append_choice(branch.choice_type, node_value)
                    return tuple(prefix)
                if not child.is_exhausted:
                    append_choice(branch.choice_type, node_value)
                    current_node = child
                    break
                attempts += 1
                self._reject_child(
                    branch.choice_type,
                    branch.constraints,
                    child=node_value,
                    key=id(branch),
                )

                # We don't expect this assertion to ever fire, but coverage
                # wants the loop inside to run if you have branch checking
                # on, hence the pragma.
                assert (  # pragma: no cover
                    attempts != 1000
                    or len(branch.children) < branch.max_children
                    or any(not v.is_exhausted for v in branch.children.values())
                )

819 

820 def rewrite(self, choices): 

821 """Use previously seen ConjectureData objects to return a tuple of 

822 the rewritten choice sequence and the status we would get from running 

823 that with the test function. If the status cannot be predicted 

824 from the existing values it will be None.""" 

825 data = ConjectureData.for_choices(choices) 

826 try: 

827 self.simulate_test_function(data) 

828 return (data.choices, data.status) 

829 except PreviouslyUnseenBehaviour: 

830 return (choices, None) 

831 

    def simulate_test_function(self, data: ConjectureData) -> None:
        """Run a simulated version of the test function recorded by
        this tree. Note that this does not currently call ``stop_span``
        or ``start_span`` as these are not currently recorded in the
        tree. This will likely change in future.

        Raises PreviouslyUnseenBehaviour if the tree has not recorded
        enough information to simulate ``data`` to completion.
        """
        node = self.root

        def draw(choice_type, constraints, *, forced=None, convert_forced=True):
            # Forced floats are stored in the tree as bit patterns; convert
            # back to an actual float before passing to the draw function.
            if choice_type == "float" and forced is not None and convert_forced:
                forced = int_to_float(forced)

            draw_func = getattr(data, f"draw_{choice_type}")
            value = draw_func(**constraints, forced=forced)

            # Mirror DataTree._draw: compare/store floats by bit pattern.
            if choice_type == "float":
                value = float_to_int(value)
            return value

        try:
            while True:
                # Replay the collapsed draws stored on this node; any
                # divergence from the recorded values is unseen behaviour.
                for i, (choice_type, constraints, previous) in enumerate(
                    zip(node.choice_types, node.constraints, node.values, strict=True)
                ):
                    v = draw(
                        choice_type,
                        constraints,
                        forced=previous if i in node.forced else None,
                    )
                    if v != previous:
                        raise PreviouslyUnseenBehaviour
                if isinstance(node.transition, Conclusion):
                    t = node.transition
                    data.conclude_test(t.status, t.interesting_origin)
                elif node.transition is None:
                    raise PreviouslyUnseenBehaviour
                elif isinstance(node.transition, Branch):
                    v = draw(node.transition.choice_type, node.transition.constraints)
                    try:
                        node = node.transition.children[v]
                    except KeyError as err:
                        raise PreviouslyUnseenBehaviour from err
                else:
                    assert isinstance(node.transition, Killed)
                    data.observer.kill_branch()
                    node = node.transition.next_node
        except StopTest:
            # The loop above only terminates via StopTest (presumably raised
            # while concluding the test); this is the normal exit path.
            pass

879 

880 def new_observer(self): 

881 return TreeRecordingObserver(self) 

882 

883 def _draw( 

884 self, 

885 choice_type: ChoiceTypeT, 

886 constraints: ChoiceConstraintsT, 

887 *, 

888 random: Random, 

889 ) -> ChoiceT: 

890 from hypothesis.internal.conjecture.data import draw_choice 

891 

892 value = draw_choice(choice_type, constraints, random=random) 

893 # using floats as keys into branch.children breaks things, because 

894 # e.g. hash(0.0) == hash(-0.0) would collide as keys when they are 

895 # in fact distinct child branches. 

896 # To distinguish floats here we'll use their bits representation. This 

897 # entails some bookkeeping such that we're careful about when the 

898 # float key is in its bits form (as a key into branch.children) and 

899 # when it is in its float form (as a value we want to write to the 

900 # choice sequence), and converting between the two forms as appropriate. 

901 if choice_type == "float": 

902 assert isinstance(value, float) 

903 value = float_to_int(value) 

904 return value 

905 

906 def _get_children_cache( 

907 self, choice_type: ChoiceTypeT, constraints: ChoiceConstraintsT, *, key: ChoiceT 

908 ) -> ChildrenCacheValueT: 

909 # cache the state of the children generator per node/branch (passed as 

910 # `key` here), such that we track which children we've already tried 

911 # for this branch across draws. 

912 # We take advantage of python generators here as one-way iterables, 

913 # so each time we iterate we implicitly store our position in the 

914 # children generator and don't re-draw children. `children` is the 

915 # concrete list of children draw from the generator that we will work 

916 # with. Whenever we need to top up this list, we will draw a new value 

917 # from the generator. 

918 if key not in self._children_cache: 

919 generator = all_children(choice_type, constraints) 

920 children: list[ChoiceT] = [] 

921 rejected: set[ChoiceT] = set() 

922 self._children_cache[key] = (generator, children, rejected) 

923 

924 return self._children_cache[key] 

925 

926 def _draw_from_cache( 

927 self, 

928 choice_type: ChoiceTypeT, 

929 constraints: ChoiceConstraintsT, 

930 *, 

931 key: ChoiceT, 

932 random: Random, 

933 ) -> ChoiceT: 

934 (generator, children, rejected) = self._get_children_cache( 

935 choice_type, constraints, key=key 

936 ) 

937 # Keep a stock of 100 potentially-valid children at all times. 

938 # This number is chosen to balance memory/speed vs randomness. Ideally 

939 # we would sample uniformly from all not-yet-rejected children, but 

940 # computing and storing said children is not free. 

941 # no-branch because coverage of the fall-through case here is a bit 

942 # annoying. 

943 if len(children) < 100: # pragma: no branch 

944 for v in generator: 

945 if choice_type == "float": 

946 assert isinstance(v, float) 

947 v = float_to_int(v) 

948 if v in rejected: 

949 continue 

950 children.append(v) 

951 if len(children) >= 100: 

952 break 

953 

954 return random.choice(children) 

955 

956 def _reject_child( 

957 self, 

958 choice_type: ChoiceTypeT, 

959 constraints: ChoiceConstraintsT, 

960 *, 

961 child: ChoiceT, 

962 key: ChoiceT, 

963 ) -> None: 

964 (_generator, children, rejected) = self._get_children_cache( 

965 choice_type, constraints, key=key 

966 ) 

967 rejected.add(child) 

968 # we remove a child from the list of possible children *only* when it is 

969 # rejected, and not when it is initially drawn in _draw_from_cache. The 

970 # reason is that a child being drawn does not guarantee that child will 

971 # be used in a way such that it is written back to the tree, so it needs 

972 # to be available for future draws until we are certain it has been 

973 # used. 

974 # 

975 # For instance, if we generated novel prefixes in a loop (but never used 

976 # those prefixes to generate new values!) then we don't want to remove 

977 # the drawn children from the available pool until they are actually 

978 # used. 

979 # 

980 # This does result in a small inefficiency: we may draw a child, 

981 # immediately use it (so we know it cannot be drawn again), but still 

982 # wait to draw and reject it here, because DataTree cannot guarantee 

983 # the drawn child has been used. 

984 if child in children: 

985 children.remove(child) 

986 

    def _repr_pretty_(self, p: "RepresentationPrinter", cycle: bool) -> None:
        """Pretty-printing hook (hypothesis's vendored pretty module):
        render this tree as its root node."""
        # A DataTree is reached top-down via its root, so the printer
        # should never detect a reference cycle here.
        assert cycle is False
        p.pretty(self.root)

990 

991 

class TreeRecordingObserver(DataObserver):
    """DataObserver that records every draw and conclusion into a DataTree.

    As the test function runs, each drawn value is appended to (or checked
    against) the current TreeNode, splitting nodes into branches where the
    observed value diverges from recorded history. Inconsistencies between
    the current run and the recorded tree raise FlakyStrategyDefinition or
    FlakyReplay.
    """

    def __init__(self, tree: DataTree):
        # this attr isn't read, but is very useful for local debugging flaky
        # errors, with
        # `from hypothesis.vendor import pretty; print(pretty.pretty(self._root))`
        self._root = tree.root
        # Node we are currently writing draws into, and our position in it.
        self._current_node: TreeNode = tree.root
        self._index_in_current_node: int = 0
        # Every node visited this run, in order; used to propagate
        # exhaustion back up the path in __update_exhausted.
        self._trail: list[TreeNode] = [self._current_node]
        self.killed: bool = False

    def draw_integer(
        self, value: int, *, was_forced: bool, constraints: IntegerConstraints
    ) -> None:
        self.draw_value(
            "integer", value, was_forced=was_forced, constraints=constraints
        )

    def draw_float(
        self, value: float, *, was_forced: bool, constraints: FloatConstraints
    ) -> None:
        self.draw_value("float", value, was_forced=was_forced, constraints=constraints)

    def draw_string(
        self, value: str, *, was_forced: bool, constraints: StringConstraints
    ) -> None:
        self.draw_value("string", value, was_forced=was_forced, constraints=constraints)

    def draw_bytes(
        self, value: bytes, *, was_forced: bool, constraints: BytesConstraints
    ) -> None:
        self.draw_value("bytes", value, was_forced=was_forced, constraints=constraints)

    def draw_boolean(
        self, value: bool, *, was_forced: bool, constraints: BooleanConstraints
    ) -> None:
        self.draw_value(
            "boolean", value, was_forced=was_forced, constraints=constraints
        )

    def draw_value(
        self,
        choice_type: ChoiceTypeT,
        value: ChoiceT,
        *,
        was_forced: bool,
        constraints: ChoiceConstraintsT,
    ) -> None:
        """Record a single draw of ``value`` into the tree.

        Checks the draw for consistency against recorded history, splitting
        the current node into a Branch where the value diverges, or
        appending to the node when we are extending past known history.
        Raises FlakyStrategyDefinition when the choice type, constraints,
        or forced-ness disagree with what was previously recorded here.
        """
        i = self._index_in_current_node
        self._index_in_current_node += 1
        node = self._current_node

        # Floats are stored in the tree by their bit pattern so that e.g.
        # 0.0 and -0.0 remain distinct as dict keys.
        if isinstance(value, float):
            value = float_to_int(value)

        assert len(node.constraints) == len(node.values) == len(node.choice_types)
        if i < len(node.values):
            # We are replaying draws already recorded on this node.
            if (
                choice_type != node.choice_types[i]
                or constraints != node.constraints[i]
            ):
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            # Note that we don't check whether a previously
            # forced value is now free. That will be caught
            # if we ever split the node there, but otherwise
            # may pass silently. This is acceptable because it
            # means we skip a hash set lookup on every
            # draw and that's a pretty niche failure mode.
            if was_forced and i not in node.forced:
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            if value != node.values[i]:
                # The value diverged from history: split this node at i,
                # turning draw i into a Branch, and start a fresh child
                # node for the new value.
                node.split_at(i)
                assert i == len(node.values)
                new_node = TreeNode()
                assert isinstance(node.transition, Branch)
                node.transition.children[value] = new_node
                self._current_node = new_node
                self._index_in_current_node = 0
        else:
            # We have replayed everything recorded on this node and are now
            # at (or past) its transition.
            trans = node.transition
            if trans is None:
                # Unexplored territory: extend this node with the new draw.
                node.choice_types.append(choice_type)
                node.constraints.append(constraints)
                node.values.append(value)
                if was_forced:
                    node.mark_forced(i)
                # generate_novel_prefix assumes the following invariant: any one
                # of the series of draws in a particular node can vary, i.e. the
                # max number of children is at least 2. However, some draws are
                # pseudo-choices and only have a single value, such as
                # integers(0, 0).
                #
                # Currently, we address this by forcefully splitting such
                # single-valued nodes into a transition when we see them. An
                # exception to this is if it was forced: forced pseudo-choices
                # do not cause the above issue because they inherently cannot
                # vary, and moreover they trip other invariants about never
                # splitting forced nodes.
                #
                # An alternative is not writing such choices to the tree at
                # all, and thus guaranteeing that each node has at least 2 max
                # children.
                if (
                    compute_max_children(choice_type, constraints) == 1
                    and not was_forced
                ):
                    node.split_at(i)
                    assert isinstance(node.transition, Branch)
                    self._current_node = node.transition.children[value]
                    self._index_in_current_node = 0
            elif isinstance(trans, Conclusion):
                assert trans.status != Status.OVERRUN
                # We tried to draw where history says we should have
                # stopped
                raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
            else:
                assert isinstance(trans, Branch), trans
                if choice_type != trans.choice_type or constraints != trans.constraints:
                    raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)
                try:
                    self._current_node = trans.children[value]
                except KeyError:
                    # First time down this branch value: create its child.
                    self._current_node = trans.children.setdefault(value, TreeNode())
                self._index_in_current_node = 0
        # Record node changes on the trail for exhaustion bookkeeping.
        if self._trail[-1] is not self._current_node:
            self._trail.append(self._current_node)

    def kill_branch(self) -> None:
        """Mark this part of the tree as not worth re-exploring."""
        # Idempotent: only the first kill in a run has any effect.
        if self.killed:
            return

        self.killed = True

        # Killing is only valid exactly at this node's transition point;
        # anywhere else means history and the current run disagree.
        if self._index_in_current_node < len(self._current_node.values) or (
            self._current_node.transition is not None
            and not isinstance(self._current_node.transition, Killed)
        ):
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        if self._current_node.transition is None:
            self._current_node.transition = Killed(TreeNode())
            self.__update_exhausted()

        # Continue recording into the node on the far side of the kill.
        self._current_node = self._current_node.transition.next_node
        self._index_in_current_node = 0
        self._trail.append(self._current_node)

    def conclude_test(
        self, status: Status, interesting_origin: InterestingOrigin | None
    ) -> None:
        """Says that ``status`` occurred at node ``node``. This updates the
        node if necessary and checks for consistency."""
        # OVERRUN outcomes are not recorded in the tree at all.
        if status == Status.OVERRUN:
            return
        i = self._index_in_current_node
        node = self._current_node

        # Concluding mid-node, or where history says a branch follows,
        # contradicts the recorded tree.
        if i < len(node.values) or isinstance(node.transition, Branch):
            raise FlakyStrategyDefinition(_FLAKY_STRAT_MSG)

        new_transition = Conclusion(status, interesting_origin)

        if node.transition is not None and node.transition != new_transition:
            # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
            # where tests go from interesting to valid, because it's much easier
            # to produce good error messages for these further up the stack.
            if isinstance(node.transition, Conclusion) and (
                node.transition.status != Status.INTERESTING
                or new_transition.status != Status.VALID
            ):
                old_origin = node.transition.interesting_origin
                new_origin = new_transition.interesting_origin
                raise FlakyReplay(
                    f"Inconsistent results from replaying a test case!\n"
                    f"  last: {node.transition.status.name} from {old_origin}\n"
                    f"  this: {new_transition.status.name} from {new_origin}",
                    (old_origin, new_origin),
                )
        else:
            node.transition = new_transition

        assert node is self._trail[-1]
        node.check_exhausted()
        assert len(node.values) > 0 or node.check_exhausted()

        if not self.killed:
            self.__update_exhausted()

    def __update_exhausted(self) -> None:
        for t in reversed(self._trail):
            # Any node we've traversed might have now become exhausted.
            # We check from the right. As soon as we hit a node that
            # isn't exhausted, this automatically implies that all of
            # its parents are not exhausted, so we stop.
            if not t.check_exhausted():
                break