# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import itertools
import math
from typing import List, Optional, Union

import attr

from hypothesis.errors import Flaky, HypothesisException, StopTest
from hypothesis.internal import floats as flt
from hypothesis.internal.compat import int_to_bytes
from hypothesis.internal.conjecture.data import (
    BooleanKWargs,
    BytesKWargs,
    ConjectureData,
    DataObserver,
    FloatKWargs,
    IntegerKWargs,
    InvalidAt,
    IRKWargsType,
    IRType,
    IRTypeName,
    Status,
    StringKWargs,
)
from hypothesis.internal.escalation import InterestingOrigin
from hypothesis.internal.floats import (
    count_between_floats,
    float_to_int,
    int_to_float,
    sign_aware_lte,
)


class PreviouslyUnseenBehaviour(HypothesisException):
    pass


def inconsistent_generation():
    raise Flaky(
        "Inconsistent data generation! Data generation behaved differently "
        "between different runs. Is your data generation depending on external "
        "state?"
    )


EMPTY: frozenset = frozenset()


@attr.s(slots=True)
class Killed:
    """Represents a transition to part of the tree which has been marked as
    "killed", meaning we want to treat it as not worth exploring, so it will
    be treated as if it were completely explored for the purposes of
    exhaustion."""

    next_node = attr.ib()

    def _repr_pretty_(self, p, cycle):
        assert cycle is False
        p.text("Killed")


def _node_pretty(ir_type, value, kwargs, *, forced):
    forced_marker = " [forced]" if forced else ""
    return f"{ir_type} {value}{forced_marker} {kwargs}"


@attr.s(slots=True)
class Branch:
    """Represents a transition where multiple choices can be made as to what
    to draw."""

    kwargs = attr.ib()
    ir_type = attr.ib()
    children = attr.ib(repr=False)

    @property
    def max_children(self):
        max_children = compute_max_children(self.ir_type, self.kwargs)
        assert max_children > 0
        return max_children

    def _repr_pretty_(self, p, cycle):
        assert cycle is False
        for i, (value, child) in enumerate(self.children.items()):
            if i > 0:
                p.break_()
            p.text(_node_pretty(self.ir_type, value, self.kwargs, forced=False))
            with p.indent(2):
                p.break_()
                p.pretty(child)


@attr.s(slots=True, frozen=True)
class Conclusion:
    """Represents a transition to a finished state."""

    status: Status = attr.ib()
    interesting_origin: Optional[InterestingOrigin] = attr.ib()

    def _repr_pretty_(self, p, cycle):
        assert cycle is False
        o = self.interesting_origin
        # avoid str(o), which can include multiple lines of context
        origin = (
            "" if o is None else f", {o.exc_type.__name__} at {o.filename}:{o.lineno}"
        )
        p.text(f"Conclusion ({self.status!r}{origin})")


# The number of max children where, beyond this, it is practically impossible
# for hypothesis to saturate / explore all child nodes in a reasonable time
# frame. We use this to bail out of expensive max children computations early,
# where the numbers involved are so large that we know they will be larger than
# this number.
#
# Note that it's ok for us to underestimate the number of max children of a node
# by using this. We just may think the node is exhausted when in fact it has more
# possible children to be explored. This has the potential to finish generation
# early due to exhausting the entire tree, but that is quite unlikely: (1) the
# number of examples would have to be quite high, and (2) the tree would have to
# contain only one or two nodes, or generate_novel_prefix would simply switch to
# exploring another non-exhausted node.
#
# Also note that we may sometimes compute max children above this value. In other
# words, this is *not* a hard maximum on the computed max children. It's the point
# where further computation is not beneficial - but sometimes doing that computation
# unconditionally is cheaper than estimating against this value.
#
# The one case where this may be detrimental is fuzzing, where the throughput of
# examples is so high that it really may saturate important nodes. We'll cross
# that bridge when we come to it.
MAX_CHILDREN_EFFECTIVELY_INFINITE = 100_000
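# For example (a sketch): the "bytes" branch of compute_max_children below
# returns 2 ** (8 * size) unconditionally, which for size=16 is 2**128 and far
# above this cap, while the "string" branch estimates against the cap because
# its sum of powers can be genuinely expensive to compute.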


def compute_max_children(ir_type, kwargs):
    from hypothesis.internal.conjecture.data import DRAW_STRING_DEFAULT_MAX_SIZE

    if ir_type == "integer":
        min_value = kwargs["min_value"]
        max_value = kwargs["max_value"]
        weights = kwargs["weights"]

        if min_value is None and max_value is None:
            # full 128 bit range.
            return 2**128 - 1
        if min_value is not None and max_value is not None:
            # count between min/max value.
            n = max_value - min_value + 1
            # remove any values with a zero probability of being drawn (weight=0).
            if weights is not None:
                n -= sum(weight == 0 for weight in weights)
            return n

        # hard case: only one bound was specified. Here we probe either upwards
        # or downwards with our full 128 bit generation, but only half of these
        # (plus one for the case of generating zero) result in a probe in the
        # direction we want. ((2**128 - 1) // 2) + 1 == 2 ** 127
        assert (min_value is None) ^ (max_value is None)
        return 2**127
    elif ir_type == "boolean":
        p = kwargs["p"]
        # probabilities of 0 or 1 (or effectively 0 or 1) only have one choice.
        if p <= 2 ** (-64) or p >= (1 - 2 ** (-64)):
            return 1
        return 2
    elif ir_type == "bytes":
        return 2 ** (8 * kwargs["size"])
    elif ir_type == "string":
        min_size = kwargs["min_size"]
        max_size = kwargs["max_size"]
        intervals = kwargs["intervals"]

        if max_size is None:
            max_size = DRAW_STRING_DEFAULT_MAX_SIZE

        if len(intervals) == 0:
            # Special-case the empty alphabet to avoid an error in math.log(0).
            # Only possibility is the empty string.
            return 1

        # We want to estimate if we're going to have more children than
        # MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially
        # extremely expensive pow. We'll check if the number of strings in
        # the largest string size alone is enough to put us over this limit.
        # We'll also employ a trick of estimating against log, which is cheaper
        # than computing a pow.
        #
        # x = len(intervals)
        # y = max_size
        # n = MAX_CHILDREN_EFFECTIVELY_INFINITE
        #
        # x**y > n
        # <=> log(x**y) > log(n)
        # <=> y * log(x) > log(n)
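        #
        # For example (a sketch with illustrative numbers): for max_size = 10
        # and len(intervals) = 128, we have 10 * log(128) ~= 48.5, which
        # exceeds log(100_000) ~= 11.5, so we can conclude 128**10 > 100_000
        # without ever computing the power itself.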

        # avoid math.log(1) == 0 and incorrectly failing the below estimate,
        # even when we definitely are too large.
        if len(intervals) == 1:
            definitely_too_large = max_size > MAX_CHILDREN_EFFECTIVELY_INFINITE
        else:
            definitely_too_large = max_size * math.log(len(intervals)) > math.log(
                MAX_CHILDREN_EFFECTIVELY_INFINITE
            )

        if definitely_too_large:
            return MAX_CHILDREN_EFFECTIVELY_INFINITE

        # number of strings of length k, for each k in [min_size, max_size].
        return sum(len(intervals) ** k for k in range(min_size, max_size + 1))

    elif ir_type == "float":
        min_value = kwargs["min_value"]
        max_value = kwargs["max_value"]
        smallest_nonzero_magnitude = kwargs["smallest_nonzero_magnitude"]

        count = count_between_floats(min_value, max_value)

        # we have two intervals:
        # a. [min_value, max_value]
        # b. [-smallest_nonzero_magnitude, smallest_nonzero_magnitude]
        #
        # which could be subsets (in either order), overlapping, or disjoint. We
        # want the interval difference a - b.
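        #
        # For example (a sketch): with min_value=-1.0, max_value=1.0, and
        # smallest_nonzero_magnitude=0.5, we count every float in [-1.0, 1.0],
        # subtract the floats lying strictly inside (-0.5, 0.5), and then add
        # back the two zeros -0.0 and 0.0 below, since zeros are always allowed
        # whenever they lie within [min_value, max_value].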

        # next_down because endpoints are ok with smallest_nonzero_magnitude
        min_point = max(min_value, -flt.next_down(smallest_nonzero_magnitude))
        max_point = min(max_value, flt.next_down(smallest_nonzero_magnitude))

        if min_point > max_point:
            # case: disjoint intervals.
            return count

        count -= count_between_floats(min_point, max_point)
        if sign_aware_lte(min_value, -0.0) and sign_aware_lte(-0.0, max_value):
            # account for -0.0
            count += 1
        if sign_aware_lte(min_value, 0.0) and sign_aware_lte(0.0, max_value):
            # account for 0.0
            count += 1
        return count

    raise NotImplementedError(f"unhandled ir_type {ir_type}")


# In theory, this is a strict superset of the functionality of compute_max_children;
#
# assert len(list(all_children(ir_type, kwargs))) == compute_max_children(ir_type, kwargs)
#
# In practice, we maintain two distinct implementations for efficiency and space
# reasons. If you just need the number of children, it is cheaper to use
# compute_max_children than to reify the list of children (only to immediately
# throw it away).
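#
# A minimal consistency sketch (only the kwargs keys read below are needed):
#
#   kwargs = {"min_value": 1, "max_value": 3, "weights": None}
#   assert compute_max_children("integer", kwargs) == 3
#   assert list(all_children("integer", kwargs)) == [1, 2, 3]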
def all_children(ir_type, kwargs):
    if ir_type == "integer":
        min_value = kwargs["min_value"]
        max_value = kwargs["max_value"]
        weights = kwargs["weights"]

        if min_value is None and max_value is None:
            # full 128 bit range.
            yield from range(-(2**127) + 1, 2**127 - 1)

        elif min_value is not None and max_value is not None:
            if weights is None:
                yield from range(min_value, max_value + 1)
            else:
                # skip any values with a corresponding weight of 0 (can never be drawn).
                for weight, n in zip(weights, range(min_value, max_value + 1)):
                    if weight == 0:
                        continue
                    yield n
        else:
            # hard case: only one bound was specified. Here we probe either upwards
            # or downwards with our full 128 bit generation, but only half of these
            # (plus one for the case of generating zero) result in a probe in the
            # direction we want. ((2**128 - 1) // 2) + 1 == a range of 2 ** 127.
            #
            # strictly speaking, I think this is not actually true: if
            # max_value > shrink_towards then our range is ((-2**127) + 1, max_value),
            # and it only narrows when max_value < shrink_towards. But it
            # really doesn't matter for this case because (even half) unbounded
            # integer generation is hit extremely rarely.
            assert (min_value is None) ^ (max_value is None)
            if min_value is None:
                yield from range(max_value - (2**127) + 1, max_value)
            else:
                assert max_value is None
                yield from range(min_value, min_value + (2**127) - 1)

    if ir_type == "boolean":
        p = kwargs["p"]
        if p <= 2 ** (-64):
            yield False
        elif p >= (1 - 2 ** (-64)):
            yield True
        else:
            yield from [False, True]
    if ir_type == "bytes":
        size = kwargs["size"]
        yield from (int_to_bytes(i, size) for i in range(2 ** (8 * size)))
    if ir_type == "string":
        min_size = kwargs["min_size"]
        max_size = kwargs["max_size"]
        intervals = kwargs["intervals"]

        # written unidiomatically in order to handle the case of max_size=inf.
        size = min_size
        while size <= max_size:
            for ords in itertools.product(intervals, repeat=size):
                yield "".join(chr(n) for n in ords)
            size += 1
    if ir_type == "float":

        def floats_between(a, b):
            for n in range(float_to_int(a), float_to_int(b) + 1):
                yield int_to_float(n)

        min_value = kwargs["min_value"]
        max_value = kwargs["max_value"]
        smallest_nonzero_magnitude = kwargs["smallest_nonzero_magnitude"]

        # handle zeroes separately so smallest_nonzero_magnitude can think of
        # itself as a complete interval (instead of a hole at ±0).
        if sign_aware_lte(min_value, -0.0) and sign_aware_lte(-0.0, max_value):
            yield -0.0
        if sign_aware_lte(min_value, 0.0) and sign_aware_lte(0.0, max_value):
            yield 0.0

        if flt.is_negative(min_value):
            if flt.is_negative(max_value):
                # case: both negative.
                max_point = min(max_value, -smallest_nonzero_magnitude)
                # float_to_int increases as negative magnitude increases, so
                # invert order.
                yield from floats_between(max_point, min_value)
            else:
                # case: straddles midpoint (which is between -0.0 and 0.0).
                yield from floats_between(-smallest_nonzero_magnitude, min_value)
                yield from floats_between(smallest_nonzero_magnitude, max_value)
        else:
            # case: both positive.
            min_point = max(min_value, smallest_nonzero_magnitude)
            yield from floats_between(min_point, max_value)


@attr.s(slots=True)
class TreeNode:
    """
    A node, or collection of directly descended nodes, in a DataTree.

    We store the DataTree as a radix tree (https://en.wikipedia.org/wiki/Radix_tree),
    which means that nodes that are the only child of their parent are collapsed
    into their parent to save space.

    Conceptually, you can unfold a single TreeNode storing n values in its lists
    into a sequence of n nodes, each a child of the last. In other words,
    (kwargs[i], values[i], ir_types[i]) corresponds to the single node at index
    i.

    Note that if a TreeNode represents a choice (i.e. the nodes cannot be compacted
    via the radix tree definition), then its lists will be empty and it will
    store a `Branch` representing that choice in its `transition`.

    Examples
    --------

    Consider sequentially drawing a boolean, then an integer.

        data.draw_boolean()
        data.draw_integer(1, 3)

    If we draw True and then 2, the tree may conceptually look like this.

        ┌──────┐
        │ root │
        └──┬───┘
        ┌──┴───┐
        │ True │
        └──┬───┘
        ┌──┴───┐
        │  2   │
        └──────┘

    But since 2 is the only child of True, we will compact these nodes and store
    them as a single TreeNode.

          ┌──────┐
          │ root │
          └──┬───┘
        ┌────┴──────┐
        │ [True, 2] │
        └───────────┘

    If we then draw True and then 3, True will have multiple children and we
    can no longer store this compacted representation. We would call split_at(0)
    on the [True, 2] node to indicate that we need to add a choice at the
    0-index node (True).

          ┌──────┐
          │ root │
          └──┬───┘
          ┌──┴───┐
        ┌─┤ True ├─┐
        │ └──────┘ │
      ┌─┴─┐      ┌─┴─┐
      │ 2 │      │ 3 │
      └───┘      └───┘
    """
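    # For example (a sketch): the compacted [True, 2] node above would store
    # ir_types == ["boolean", "integer"], values == [True, 2], and kwargs
    # holding the corresponding draw parameters, all in draw order.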

    # The kwargs, value, and ir_types of the nodes stored here. These always
    # have the same length. The values at index i belong to node i.
    kwargs: List[IRKWargsType] = attr.ib(factory=list)
    values: List[IRType] = attr.ib(factory=list)
    ir_types: List[IRTypeName] = attr.ib(factory=list)

    # The indices of nodes which had forced values.
    #
    # Stored as None if no indices have been forced, purely for space saving
    # reasons (we force quite rarely).
    __forced: Optional[set] = attr.ib(default=None, init=False)

    # What happens next after drawing these nodes. (conceptually, "what is the
    # child/children of the last node stored here").
    #
    # One of:
    # - None (we don't know yet)
    # - Branch (we have seen multiple possible outcomes here)
    # - Conclusion (ConjectureData.conclude_test was called here)
    # - Killed (this branch is valid and may even have children, but should not
    #   be explored when generating novel prefixes)
    transition: Union[None, Branch, Conclusion, Killed] = attr.ib(default=None)

    invalid_at: Optional[InvalidAt] = attr.ib(default=None)

    # A tree node is exhausted if every possible sequence of draws below it has
    # been explored. We only update this when performing operations that could
    # change the answer.
    #
    # See also TreeNode.check_exhausted.
    is_exhausted: bool = attr.ib(default=False, init=False)

    @property
    def forced(self):
        if not self.__forced:
            return EMPTY
        return self.__forced

    def mark_forced(self, i):
        """
        Note that the draw at node i was forced.
        """
        assert 0 <= i < len(self.values)
        if self.__forced is None:
            self.__forced = set()
        self.__forced.add(i)

    def split_at(self, i):
        """
        Splits the tree so that it can incorporate a decision at the draw call
        corresponding to the node at position i.

        Raises Flaky if node i was forced.
        """
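        # For example (a sketch, continuing the class docstring): calling
        # split_at(0) on a node holding [True, 2] empties this node's lists
        # and gives it a Branch transition whose children map True to a new
        # TreeNode holding [2].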

        if i in self.forced:
            inconsistent_generation()

        assert not self.is_exhausted

        key = self.values[i]

        child = TreeNode(
            ir_types=self.ir_types[i + 1 :],
            kwargs=self.kwargs[i + 1 :],
            values=self.values[i + 1 :],
            transition=self.transition,
        )
        self.transition = Branch(
            kwargs=self.kwargs[i], ir_type=self.ir_types[i], children={key: child}
        )
        if self.__forced is not None:
            child.__forced = {j - i - 1 for j in self.__forced if j > i}
            self.__forced = {j for j in self.__forced if j < i}
        child.check_exhausted()
        del self.ir_types[i:]
        del self.values[i:]
        del self.kwargs[i:]
        # we have a transition now, so we don't need to carry around invalid_at.
        self.invalid_at = None
        assert len(self.values) == len(self.kwargs) == len(self.ir_types) == i

    def check_exhausted(self):
        """
        Recalculates is_exhausted if necessary, and then returns it.

        A node is exhausted if:
        - Its transition is Conclusion or Killed
        - It has the maximum number of children (i.e. we have found all of its
          possible children), and all its children are exhausted

        Therefore, we only need to compute this for a node when:
        - We first create it in split_at
        - We set its transition to either Conclusion or Killed
          (TreeRecordingObserver.conclude_test or TreeRecordingObserver.kill_branch)
        - We exhaust any of its children
        """
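        # A minimal sketch of the contract (hypothetical standalone usage):
        #
        #   node = TreeNode()
        #   node.transition = Conclusion(Status.VALID, None)
        #   assert node.check_exhausted()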

        if (
            # a node cannot go from is_exhausted -> not is_exhausted.
            not self.is_exhausted
            # if we don't know what happens after this node, we don't have
            # enough information to tell if it's exhausted.
            and self.transition is not None
            # if there are still any nodes left which are the only child of their
            # parent (len(self.values) > 0), then this TreeNode must be not
            # exhausted, unless all of those nodes were forced.
            #
            # This is because we maintain an invariant of only adding nodes to
            # DataTree which have at least 2 possible values, so we know that if
            # they do not have any siblings that we still have more choices to
            # discover.
            #
            # (We actually *do* currently add single-valued nodes to the tree,
            # but immediately split them into a transition to avoid falsifying
            # this check. This is a bit of a hack.)
            and len(self.forced) == len(self.values)
        ):
            if isinstance(self.transition, (Conclusion, Killed)):
                self.is_exhausted = True
            elif len(self.transition.children) == self.transition.max_children:
                self.is_exhausted = all(
                    v.is_exhausted for v in self.transition.children.values()
                )
        return self.is_exhausted

    def _repr_pretty_(self, p, cycle):
        assert cycle is False
        indent = 0
        for i, (ir_type, kwargs, value) in enumerate(
            zip(self.ir_types, self.kwargs, self.values)
        ):
            with p.indent(indent):
                if i > 0:
                    p.break_()
                p.text(_node_pretty(ir_type, value, kwargs, forced=i in self.forced))
            indent += 2

        with p.indent(indent):
            if len(self.values) > 0:
                p.break_()
            if self.transition is not None:
                p.pretty(self.transition)
            else:
                p.text("unknown")


class DataTree:
    """
    A DataTree tracks the structured history of draws in some test function,
    across multiple ConjectureData objects.

    This information is used by ConjectureRunner to generate novel prefixes of
    this tree (see generate_novel_prefix). A novel prefix is a sequence of draws
    which the tree has not seen before, and therefore the ConjectureRunner has
    not generated as an input to the test function before.

    DataTree tracks the following:

    - Draws, at the ir level (with some ir_type, e.g. "integer")
      - ConjectureData.draw_integer()
      - ConjectureData.draw_float()
      - ConjectureData.draw_string()
      - ConjectureData.draw_boolean()
      - ConjectureData.draw_bytes()
    - Test conclusions (with some Status, e.g. Status.VALID)
      - ConjectureData.conclude_test()

    A DataTree is — surprise — a *tree*. A node in this tree is either a draw with
    some value, a test conclusion with some Status, or a special `Killed` value,
    which denotes that further draws may exist beyond this node but should not be
    considered worth exploring when generating novel prefixes. A node is a leaf
    iff it is a conclusion or Killed.

    A branch from node A to node B indicates that we have previously seen some
    sequence (a, b) of draws, where a and b are the values in nodes A and B.
    Similar intuition holds for conclusion and Killed nodes.

    Examples
    --------

    To see how a DataTree gets built through successive sets of draws, consider
    the following code that calls through to some ConjectureData object `data`.
    The first call can be either True or False, and the second call can be any
    integer in the range [1, 3].

        data.draw_boolean()
        data.draw_integer(1, 3)

    To start, the corresponding DataTree object is completely empty.

      ┌──────┐
      │ root │
      └──────┘

    We happen to draw True and then 2 in the above code. The tree tracks this.
    (2 also connects to a child Conclusion node with Status.VALID since it's the
    final draw in the code. I'll omit Conclusion nodes in diagrams for brevity.)

      ┌──────┐
      │ root │
      └──┬───┘
      ┌──┴───┐
      │ True │
      └──┬───┘
      ┌──┴───┐
      │  2   │
      └──────┘

    This is a very boring tree so far! But now we happen to draw False and
    then 1. This causes a split in the tree. Remember, DataTree tracks history
    over all invocations of a function, not just one. The end goal is to know
    what invocations haven't been tried yet, after all.

            ┌──────┐
        ┌───┤ root ├───┐
        │   └──────┘   │
     ┌──┴───┐        ┌─┴─────┐
     │ True │        │ False │
     └──┬───┘        └──┬────┘
      ┌─┴─┐           ┌─┴─┐
      │ 2 │           │ 1 │
      └───┘           └───┘

    If we were to ask DataTree for a novel prefix at this point, it might
    generate any of (True, 1), (True, 3), (False, 2), or (False, 3).

    Note that the novel prefix stops as soon as it generates a novel node. For
    instance, if we had generated a novel prefix back when the tree was only
    root -> True -> 2, we could have gotten any of (True, 1), (True, 3), or
    (False). But we could *not* have gotten (False, n), because both False and
    n were novel at that point, and we stop at the first novel node — False.

    I won't belabor this example. Here's what the tree looks like when fully
    explored:

                    ┌──────┐
             ┌──────┤ root ├──────┐
             │      └──────┘      │
          ┌──┴───┐              ┌─┴─────┐
       ┌──┤ True ├──┐       ┌───┤ False ├──┐
       │  └──┬───┘  │       │   └──┬────┘  │
     ┌─┴─┐ ┌─┴─┐  ┌─┴─┐   ┌─┴─┐  ┌─┴─┐   ┌─┴─┐
     │ 1 │ │ 2 │  │ 3 │   │ 1 │  │ 2 │   │ 3 │
     └───┘ └───┘  └───┘   └───┘  └───┘   └───┘

    You could imagine much more complicated trees than this arising in practice,
    and indeed they do. In particular, the tree need not be balanced or 'nice'
    like the tree above. For instance,

        b = data.draw_boolean()
        if b:
            data.draw_integer(1, 3)

    results in a tree with the entire right part lopped off, and False leading
    straight to a conclusion node with Status.VALID. As another example,

        n = data.draw_integer()
        assume(n >= 3)
        data.draw_string()

    results in a tree with the 0, 1, and 2 nodes leading straight to a
    conclusion node with Status.INVALID, and the rest branching off into all
    the possibilities of draw_string.

    Notes
    -----

    The above examples are slightly simplified and are intended to convey
    intuition. In practice, there are some implementation details to be aware
    of.

    - In draw nodes, we store the kwargs used in addition to the value drawn.
      E.g. the node corresponding to data.draw_float(min_value=1.0, max_value=1.5)
      would store {"min_value": 1.0, "max_value": 1.5, ...} (default values for
      other kwargs omitted).

      The kwargs parameters have the potential to change both the range of
      possible outputs of a node, and the probability distribution within that
      range, so we need to use these when drawing in DataTree as well. We draw
      values using these kwargs when (1) generating a novel value for a node
      and (2) choosing a random child when traversing the tree.

    - For space efficiency, rather than tracking the full tree structure, we
      store DataTree as a radix tree. This is conceptually equivalent (radix
      trees can always be "unfolded" to the full tree) but it means the internal
      representation may differ in practice.

      See TreeNode for more information.
    """

    def __init__(self):
        self.root = TreeNode()
        self._children_cache = {}

    @property
    def is_exhausted(self):
        """
        Returns True if every node is exhausted, and therefore the tree has
        been fully explored.
        """
        return self.root.is_exhausted

    def generate_novel_prefix(self, random):
        """Generate a short random string that (after rewriting) is not
        a prefix of any buffer previously added to the tree.

        The resulting prefix is essentially arbitrary - it would be nice
        for it to be uniform at random, but previous attempts to do that
        have proven too expensive.
        """
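        # Usage sketch (hypothetical driver code, not part of this module):
        #
        #   tree = DataTree()
        #   # ... record some runs via observers from tree.new_observer() ...
        #   prefix = tree.generate_novel_prefix(Random())
        #
        # after which replaying `prefix` explores behaviour the tree has not
        # recorded before.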

        assert not self.is_exhausted
        novel_prefix = bytearray()

        def append_buf(buf):
            novel_prefix.extend(buf)

        current_node = self.root
        while True:
            assert not current_node.is_exhausted
            for i, (ir_type, kwargs, value) in enumerate(
                zip(current_node.ir_types, current_node.kwargs, current_node.values)
            ):
                if i in current_node.forced:
                    if ir_type == "float":
                        value = int_to_float(value)
                    (_value, buf) = self._draw(
                        ir_type, kwargs, forced=value, random=random
                    )
                    append_buf(buf)
                else:
                    attempts = 0
                    while True:
                        if attempts <= 10:
                            try:
                                (v, buf) = self._draw(ir_type, kwargs, random=random)
                            except StopTest:  # pragma: no cover
                                # it is possible that drawing from fresh data can
                                # overrun BUFFER_SIZE, due to e.g. unlucky rejection
                                # sampling of integer probes. Retry these cases.
                                attempts += 1
                                continue
                        else:
                            (v, buf) = self._draw_from_cache(
                                ir_type, kwargs, key=id(current_node), random=random
                            )

                        if v != value:
                            append_buf(buf)
                            break
                        attempts += 1
                        self._reject_child(
                            ir_type, kwargs, child=v, key=id(current_node)
                        )
                    # We've now found a value that is allowed to
                    # vary, so what follows is not fixed.
                    return bytes(novel_prefix)
            else:
                assert not isinstance(current_node.transition, (Conclusion, Killed))
                if current_node.transition is None:
                    return bytes(novel_prefix)
                branch = current_node.transition
                assert isinstance(branch, Branch)

                attempts = 0
                while True:
                    if attempts <= 10:
                        try:
                            (v, buf) = self._draw(
                                branch.ir_type, branch.kwargs, random=random
                            )
                        except StopTest:  # pragma: no cover
                            attempts += 1
                            continue
                    else:
                        (v, buf) = self._draw_from_cache(
                            branch.ir_type, branch.kwargs, key=id(branch), random=random
                        )
                    try:
                        child = branch.children[v]
                    except KeyError:
                        append_buf(buf)
                        return bytes(novel_prefix)
                    if not child.is_exhausted:
                        append_buf(buf)
                        current_node = child
                        break
                    attempts += 1
                    self._reject_child(
                        branch.ir_type, branch.kwargs, child=v, key=id(branch)
                    )

                    # We don't expect this assertion to ever fire, but coverage
                    # wants the loop inside to run if you have branch checking
                    # on, hence the pragma.
                    assert (  # pragma: no cover
                        attempts != 1000
                        or len(branch.children) < branch.max_children
                        or any(not v.is_exhausted for v in branch.children.values())
                    )

    def rewrite(self, buffer):
        """Use previously seen ConjectureData objects to return a tuple of
        the rewritten buffer and the status we would get from running that
        buffer with the test function. If the status cannot be predicted
        from the existing values it will be None."""
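        # For example (a sketch of the contract): a buffer whose draws were all
        # recorded before comes back as (rewritten_buffer, recorded_status),
        # while a buffer that wanders into unexplored territory comes back as
        # (buffer, None).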
        buffer = bytes(buffer)

        data = ConjectureData.for_buffer(buffer)
        try:
            self.simulate_test_function(data)
            return (data.buffer, data.status)
        except PreviouslyUnseenBehaviour:
            return (buffer, None)

    def simulate_test_function(self, data):
        """Run a simulated version of the test function recorded by
        this tree. Note that this does not currently call ``stop_example``
        or ``start_example`` as these are not currently recorded in the
        tree. This will likely change in future."""
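        # A sketch of the control flow: we walk the recorded tree, replaying
        # each stored draw against `data`; any divergence from history raises
        # PreviouslyUnseenBehaviour, and a recorded Conclusion ends the walk
        # via data.conclude_test, which raises StopTest (caught below).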
        node = self.root

        def draw(ir_type, kwargs, *, forced=None, convert_forced=True):
            if ir_type == "float" and forced is not None and convert_forced:
                forced = int_to_float(forced)

            draw_func = getattr(data, f"draw_{ir_type}")
            value = draw_func(**kwargs, forced=forced)

            if ir_type == "float":
                value = float_to_int(value)
            return value

        try:
            while True:
                for i, (ir_type, kwargs, previous) in enumerate(
                    zip(node.ir_types, node.kwargs, node.values)
                ):
                    v = draw(
                        ir_type, kwargs, forced=previous if i in node.forced else None
                    )
                    if v != previous:
                        raise PreviouslyUnseenBehaviour
                if isinstance(node.transition, Conclusion):
                    t = node.transition
                    data.conclude_test(t.status, t.interesting_origin)
                elif node.transition is None:
                    if node.invalid_at is not None:
                        (ir_type, kwargs, forced) = node.invalid_at
                        try:
                            draw(ir_type, kwargs, forced=forced, convert_forced=False)
                        except StopTest:
                            if data.invalid_at is not None:
                                raise
                    raise PreviouslyUnseenBehaviour
                elif isinstance(node.transition, Branch):
                    v = draw(node.transition.ir_type, node.transition.kwargs)
                    try:
                        node = node.transition.children[v]
                    except KeyError as err:
                        raise PreviouslyUnseenBehaviour from err
                else:
                    assert isinstance(node.transition, Killed)
                    data.observer.kill_branch()
                    node = node.transition.next_node
        except StopTest:
            pass

    def new_observer(self):
        return TreeRecordingObserver(self)

    def _draw(self, ir_type, kwargs, *, random, forced=None):
        # we should possibly pull out BUFFER_SIZE to a common file to avoid this
        # circular import.
        from hypothesis.internal.conjecture.engine import BUFFER_SIZE

        cd = ConjectureData(max_length=BUFFER_SIZE, prefix=b"", random=random)
        draw_func = getattr(cd, f"draw_{ir_type}")

        value = draw_func(**kwargs, forced=forced)
        buf = cd.buffer

        # using floats as keys into branch.children breaks things, because
        # e.g. hash(0.0) == hash(-0.0) would collide as keys when they are
        # in fact distinct child branches.
        # To distinguish floats here we'll use their bits representation. This
        # entails some bookkeeping such that we're careful about when the
        # float key is in its bits form (as a key into branch.children) and
        # when it is in its float form (as a value we want to write to the
        # buffer), and converting between the two forms as appropriate.
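        #
        # (Concretely: 0.0 == -0.0 and hash(0.0) == hash(-0.0), but
        # float_to_int(0.0) == 0 while float_to_int(-0.0) == 2**63, so the
        # bits form keeps them distinct as dictionary keys.)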
        if ir_type == "float":
            value = float_to_int(value)
        return (value, buf)

    def _get_children_cache(self, ir_type, kwargs, *, key):
        # cache the state of the children generator per node/branch (passed as
        # `key` here), such that we track which children we've already tried
        # for this branch across draws.
        # We take advantage of python generators here as one-way iterables,
        # so each time we iterate we implicitly store our position in the
        # children generator and don't re-draw children. `children` is the
        # concrete list of children drawn from the generator that we will work
        # with. Whenever we need to top up this list, we will draw a new value
        # from the generator.
        if key not in self._children_cache:
            generator = all_children(ir_type, kwargs)
            children = []
            rejected = set()
            self._children_cache[key] = (generator, children, rejected)

        return self._children_cache[key]

    def _draw_from_cache(self, ir_type, kwargs, *, key, random):
        (generator, children, rejected) = self._get_children_cache(
            ir_type, kwargs, key=key
        )
        # Keep a stock of 100 potentially-valid children at all times.
        # This number is chosen to balance memory/speed vs randomness. Ideally
        # we would sample uniformly from all not-yet-rejected children, but
        # computing and storing said children is not free.
        # no-branch because coverage of the fall-through case here is a bit
        # annoying.
        if len(children) < 100:  # pragma: no branch
            for v in generator:
                if ir_type == "float":
                    v = float_to_int(v)
                if v in rejected:
                    continue
                children.append(v)
                if len(children) >= 100:
                    break

        forced = random.choice(children)
        if ir_type == "float":
            forced = int_to_float(forced)
        (value, buf) = self._draw(ir_type, kwargs, forced=forced, random=random)
        return (value, buf)

    def _reject_child(self, ir_type, kwargs, *, child, key):
        (_generator, children, rejected) = self._get_children_cache(
            ir_type, kwargs, key=key
        )
        rejected.add(child)
        # we remove a child from the list of possible children *only* when it is
        # rejected, and not when it is initially drawn in _draw_from_cache. The
        # reason is that a child being drawn does not guarantee that child will
        # be used in a way such that it is written back to the tree, so it needs
        # to be available for future draws until we are certain it has been
        # used.
        #
        # For instance, if we generated novel prefixes in a loop (but never used
        # those prefixes to generate new values!) then we don't want to remove
        # the drawn children from the available pool until they are actually
        # used.
        #
        # This does result in a small inefficiency: we may draw a child,
        # immediately use it (so we know it cannot be drawn again), but still
        # wait to draw and reject it here, because DataTree cannot guarantee
        # the drawn child has been used.
        if child in children:
            children.remove(child)

    def _repr_pretty_(self, p, cycle):
        assert cycle is False
        return p.pretty(self.root)


class TreeRecordingObserver(DataObserver):
    def __init__(self, tree):
        self.__current_node = tree.root
        self.__index_in_current_node = 0
        self.__trail = [self.__current_node]
        self.killed = False

    def draw_integer(
        self, value: int, *, was_forced: bool, kwargs: IntegerKWargs
    ) -> None:
        self.draw_value("integer", value, was_forced=was_forced, kwargs=kwargs)

    def draw_float(
        self, value: float, *, was_forced: bool, kwargs: FloatKWargs
    ) -> None:
        self.draw_value("float", value, was_forced=was_forced, kwargs=kwargs)

    def draw_string(
        self, value: str, *, was_forced: bool, kwargs: StringKWargs
    ) -> None:
        self.draw_value("string", value, was_forced=was_forced, kwargs=kwargs)

    def draw_bytes(
        self, value: bytes, *, was_forced: bool, kwargs: BytesKWargs
    ) -> None:
        self.draw_value("bytes", value, was_forced=was_forced, kwargs=kwargs)

    def draw_boolean(
        self, value: bool, *, was_forced: bool, kwargs: BooleanKWargs
    ) -> None:
        self.draw_value("boolean", value, was_forced=was_forced, kwargs=kwargs)

    def mark_invalid(self, invalid_at: InvalidAt) -> None:
        if self.__current_node.transition is None:
            self.__current_node.invalid_at = invalid_at

    def draw_value(
        self,
        ir_type: IRTypeName,
        value: IRType,
        *,
        was_forced: bool,
        kwargs: IRKWargsType,
    ) -> None:
        i = self.__index_in_current_node
        self.__index_in_current_node += 1
        node = self.__current_node

        if isinstance(value, float):
            value = float_to_int(value)

        assert len(node.kwargs) == len(node.values) == len(node.ir_types)
        if i < len(node.values):
            if ir_type != node.ir_types[i] or kwargs != node.kwargs[i]:
                inconsistent_generation()
            # Note that we don't check whether a previously
            # forced value is now free. That will be caught
            # if we ever split the node there, but otherwise
            # may pass silently. This is acceptable because it
            # means we skip a hash set lookup on every
            # draw and that's a pretty niche failure mode.
            if was_forced and i not in node.forced:
                inconsistent_generation()
            if value != node.values[i]:
                node.split_at(i)
                assert i == len(node.values)
                new_node = TreeNode()
                node.transition.children[value] = new_node
                self.__current_node = new_node
                self.__index_in_current_node = 0
        else:
            trans = node.transition
            if trans is None:
                node.ir_types.append(ir_type)
                node.kwargs.append(kwargs)
                node.values.append(value)
                if was_forced:
                    node.mark_forced(i)
                # generate_novel_prefix assumes the following invariant: any one
                # of the series of draws in a particular node can vary, i.e. the
                # max number of children is at least 2. However, some draws are
                # pseudo-choices and only have a single value, such as
                # integers(0, 0).
                #
                # Currently, we address this by forcefully splitting such
                # single-valued nodes into a transition when we see them. An
                # exception to this is if it was forced: forced pseudo-choices
                # do not cause the above issue because they inherently cannot
                # vary, and moreover they trip other invariants about never
                # splitting forced nodes.
                #
                # An alternative is not writing such choices to the tree at
                # all, and thus guaranteeing that each node has at least 2 max
                # children.
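                #
                # For example (a sketch): data.draw_integer(0, 0) can only ever
                # return 0, so compute_max_children == 1 and we immediately
                # split the node into a Branch with the single child 0 below.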
                if compute_max_children(ir_type, kwargs) == 1 and not was_forced:
                    node.split_at(i)
                    self.__current_node = node.transition.children[value]
                    self.__index_in_current_node = 0
            elif isinstance(trans, Conclusion):
                assert trans.status != Status.OVERRUN
                # We tried to draw where history says we should have
                # stopped
                inconsistent_generation()
            else:
                assert isinstance(trans, Branch), trans
                if ir_type != trans.ir_type or kwargs != trans.kwargs:
                    inconsistent_generation()
                try:
                    self.__current_node = trans.children[value]
                except KeyError:
                    self.__current_node = trans.children.setdefault(value, TreeNode())
                self.__index_in_current_node = 0
        if self.__trail[-1] is not self.__current_node:
            self.__trail.append(self.__current_node)

    def kill_branch(self):
        """Mark this part of the tree as not worth re-exploring."""
        if self.killed:
            return

        self.killed = True

        if self.__index_in_current_node < len(self.__current_node.values) or (
            self.__current_node.transition is not None
            and not isinstance(self.__current_node.transition, Killed)
        ):
            inconsistent_generation()

        if self.__current_node.transition is None:
            self.__current_node.transition = Killed(TreeNode())
            self.__update_exhausted()

        self.__current_node = self.__current_node.transition.next_node
        self.__index_in_current_node = 0
        self.__trail.append(self.__current_node)

    def conclude_test(self, status, interesting_origin):
        """Says that ``status`` occurred at the current node. This updates the
        node if necessary and checks for consistency."""
        if status == Status.OVERRUN:
            return
        i = self.__index_in_current_node
        node = self.__current_node

        if i < len(node.values) or isinstance(node.transition, Branch):
            inconsistent_generation()

        new_transition = Conclusion(status, interesting_origin)

        if node.transition is not None and node.transition != new_transition:
            # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
            # where tests go from interesting to valid, because it's much easier
            # to produce good error messages for these further up the stack.
            if isinstance(node.transition, Conclusion) and (
                node.transition.status != Status.INTERESTING
                or new_transition.status != Status.VALID
            ):
                raise Flaky(
                    f"Inconsistent results from replaying a test case!\n"
                    f" last: {node.transition.status.name} from {node.transition.interesting_origin}\n"
                    f" this: {new_transition.status.name} from {new_transition.interesting_origin}"
                )
        else:
            node.transition = new_transition

        assert node is self.__trail[-1]
        node.check_exhausted()
        assert len(node.values) > 0 or node.check_exhausted()

        if not self.killed:
            self.__update_exhausted()

    def __update_exhausted(self):
        for t in reversed(self.__trail):
            # Any node we've traversed might have now become exhausted.
            # We check from the right. As soon as we hit a node that
            # isn't exhausted, this automatically implies that all of
            # its parents are not exhausted, so we stop.
            if not t.check_exhausted():
                break