1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import ast
12import hashlib
13import inspect
14import math
15import sys
16from ast import Constant, Expr, NodeVisitor, UnaryOp, USub
17from collections.abc import Iterator, MutableSet
18from functools import lru_cache
19from itertools import chain
20from pathlib import Path
21from types import ModuleType
22from typing import TypeAlias
23
24import hypothesis
25from hypothesis.configuration import storage_directory
26from hypothesis.internal.conjecture.choice import ChoiceTypeT
27from hypothesis.internal.escalation import is_hypothesis_file
28
# The Python types of the values which a Constants instance stores.
ConstantT: TypeAlias = int | float | bytes | str

# Alias for the builtin bytes type. Constants.__init__ takes a keyword argument
# named `bytes`, which shadows the builtin inside that method. I don't want to
# name the init arg bytes_, so annotations there use this alias instead.
bytesT = bytes
33
34
class Constants:
    """A collection of constants, kept in one set per supported type.

    Integers, floats, bytes, and strings each live in their own set, and a
    value is routed to a set based on its exact type. Values which compare
    equal across types (such as ``1`` and ``1.0``) are therefore tracked
    independently of each other.
    """

    def __init__(
        self,
        *,
        integers: MutableSet[int] | None = None,
        floats: MutableSet[float] | None = None,
        bytes: MutableSet[bytes] | None = None,
        strings: MutableSet[str] | None = None,
    ):
        # A set which is passed in is stored as-is (not copied). Any set which
        # is omitted starts out as a fresh empty set.
        self.integers: MutableSet[int] = integers if integers is not None else set()
        self.floats: MutableSet[float] = floats if floats is not None else set()
        # `bytes` refers to the keyword argument here, so the annotation has
        # to go through the module-level bytesT alias.
        self.bytes: MutableSet[bytesT] = bytes if bytes is not None else set()
        self.strings: MutableSet[str] = strings if strings is not None else set()

    def set_for_type(
        self, constant_type: type[ConstantT] | ChoiceTypeT
    ) -> MutableSet[int] | MutableSet[float] | MutableSet[bytes] | MutableSet[str]:
        """Return the set which holds constants of ``constant_type``.

        ``constant_type`` is either one of the types ``int``, ``float``,
        ``bytes``, ``str``, or one of the names ``"integer"``, ``"float"``,
        ``"bytes"``, ``"string"``. Anything else raises ``ValueError``.
        """
        # (python type, choice type name, attribute holding the matching set)
        table = (
            (int, "integer", "integers"),
            (float, "float", "floats"),
            (bytes, "bytes", "bytes"),
            (str, "string", "strings"),
        )
        for python_type, choice_type, attribute in table:
            if constant_type is python_type or constant_type == choice_type:
                return getattr(self, attribute)
        raise ValueError(f"unknown constant_type {constant_type}")

    def add(self, constant: ConstantT) -> None:
        """Add ``constant`` to the set matching its exact type."""
        target = self.set_for_type(type(constant))
        target.add(constant)  # type: ignore

    def __contains__(self, constant: ConstantT) -> bool:
        # Only the set for type(constant) is consulted, so e.g. `1.0` is not
        # found just because `1` is present.
        candidates = self.set_for_type(type(constant))
        return constant in candidates

    def __or__(self, other: "Constants") -> "Constants":
        """Return a new ``Constants`` holding the per-type unions of both sides."""
        all_integers = self.integers | other.integers  # type: ignore
        all_floats = self.floats | other.floats  # type: ignore
        all_bytes = self.bytes | other.bytes  # type: ignore
        all_strings = self.strings | other.strings  # type: ignore
        return Constants(
            integers=all_integers,
            floats=all_floats,
            bytes=all_bytes,
            strings=all_strings,
        )

    def __iter__(self) -> Iterator[ConstantT]:
        # Yields every integer, then every float, then bytes, then strings.
        return chain.from_iterable(
            (self.integers, self.floats, self.bytes, self.strings)
        )

    def __len__(self) -> int:
        return sum(
            len(values)
            for values in (self.integers, self.floats, self.bytes, self.strings)
        )

    def __repr__(self) -> str:
        return f"Constants({self.integers=}, {self.floats=}, {self.bytes=}, {self.strings=})"

    def __eq__(self, other: object) -> bool:
        # Two Constants are equal when all four of their sets are equal.
        # Anything which is not a Constants compares unequal.
        if isinstance(other, Constants):
            return (
                self.integers == other.integers
                and self.floats == other.floats
                and self.bytes == other.bytes
                and self.strings == other.strings
            )
        return False
96
97
class TooManyConstants(Exception):
    """Control flow exception which we raise in ConstantVisitor when the
    number of constants in a module gets too large."""
102
103
class ConstantVisitor(NodeVisitor):
    """AST visitor which gathers literal constants into ``self.constants``.

    When ``limit`` is true, ``TooManyConstants`` is raised once a further
    constant is encountered after ``CONSTANTS_LIMIT`` constants have already
    been collected.
    """

    CONSTANTS_LIMIT: int = 1024

    def __init__(self, *, limit: bool):
        super().__init__()
        self.constants = Constants()
        self.limit = limit

    def _add_constant(self, value: object) -> None:
        """Record ``value``, unless it is one of the kinds we filter out."""
        # Note that the limit is checked before any filtering takes place.
        if self.limit and len(self.constants) >= self.CONSTANTS_LIMIT:
            raise TooManyConstants

        # bool is a subclass of int, so it must be rejected before the int
        # branch below gets a chance to see it.
        if isinstance(value, bool):
            return

        if isinstance(value, str):
            # empty and whitespace-only strings are skipped, and long strings
            # are unlikely to be useful.
            if value == "" or value.isspace() or len(value) > 20:
                return
        elif isinstance(value, bytes):
            # long bytes seem plausibly more likely to be useful than long
            # strings (e.g. AES-256 has a 32 byte key), but we still want to
            # cap at some point to avoid performance issues.
            if value == b"" or len(value) > 50:
                return
        elif isinstance(value, float):
            # we already upweight inf.
            if math.isinf(value):
                return
        elif isinstance(value, int):
            # we already upweight small integers.
            if -100 < value < 100:
                return
        else:
            # Not one of the types we collect, so there is nothing to record.
            return  # pragma: no cover

        self.constants.add(value)

    def visit_UnaryOp(self, node: UnaryOp) -> None:
        # `a = -1` is actually a combination of a USub and the constant 1.
        operand = node.operand
        is_negated_number = (
            isinstance(node.op, USub)
            and isinstance(operand, Constant)
            and isinstance(operand.value, (int, float))
            and not isinstance(operand.value, bool)
        )
        if not is_negated_number:
            self.generic_visit(node)
            return

        # Record the negative value, and don't recurse on this node, to avoid
        # also adding the positive variant.
        self._add_constant(-operand.value)

    def visit_Expr(self, node: Expr) -> None:
        # An expression statement which is just a string constant is skipped
        # entirely. Every other expression statement is visited as usual.
        expression = node.value
        if not (isinstance(expression, Constant) and isinstance(expression.value, str)):
            self.generic_visit(node)

    def visit_JoinedStr(self, node):
        # Don't recurse on JoinedStr, i.e. f-strings. Constants that appear
        # *only* in f-strings are unlikely to be helpful.
        return

    def visit_Constant(self, node):
        # Offer the literal's value to the filter, then continue the generic
        # traversal of the node.
        self._add_constant(node.value)
        self.generic_visit(node)
176
177
def _constants_from_source(source: str | bytes, *, limit: bool) -> Constants:
    """Parse ``source`` and return the constants a ``ConstantVisitor`` finds.

    ``limit`` is forwarded to the visitor. If the visitor stops early with
    ``TooManyConstants``, an empty ``Constants`` is returned rather than the
    partial collection. Any other error, including one from ``ast.parse``,
    propagates to the caller.
    """
    module_ast = ast.parse(source)
    collector = ConstantVisitor(limit=limit)

    try:
        collector.visit(module_ast)
    except TooManyConstants:
        # in the case of an incomplete collection, return nothing, to avoid
        # muddying caches etc.
        return Constants()
    else:
        return collector.constants
190
191
192def _constants_file_str(constants: Constants) -> str:
193 return str(sorted(constants, key=lambda v: (str(type(v)), v)))
194
195
@lru_cache(4096)
def constants_from_module(module: ModuleType, *, limit: bool = True) -> Constants:
    """Return the constants found in the source file of ``module``.

    Results are memoized in memory by ``lru_cache``, and on disk in a file
    under ``storage_directory("constants")`` which is named after a hash of
    the source bytes (with a ``_nolimit`` suffix when ``limit`` is false).

    Collection is best-effort. If the source file cannot be located, read, or
    processed, an empty ``Constants`` is returned instead of raising. When
    ``limit`` is true, source files larger than 512kb are skipped, and
    ``limit`` is also forwarded to ``_constants_from_source``.
    """
    try:
        module_file = inspect.getsourcefile(module)
        # use type: ignore because we know this might error
        source_bytes = Path(module_file).read_bytes()  # type: ignore
    except Exception:
        return Constants()

    if limit and len(source_bytes) > 512 * 1024:
        # Skip files over 512kb. For reference, the largest source file
        # in Hypothesis is strategies/_internal/core.py at 107kb at time
        # of writing.
        return Constants()

    # The hash is only used to name the cache file, not for security. Saying
    # so lets hashlib compute it even in restricted environments where SHA-1
    # is blocked for security purposes. Otherwise this call, which is outside
    # of any try block, would raise out of a function that is meant to be
    # best-effort. The resulting digest is unchanged.
    source_hash = hashlib.sha1(source_bytes, usedforsecurity=False).hexdigest()[:16]
    # separate cache files for each limit param. see discussion in pull/4398
    cache_p = storage_directory("constants") / (
        source_hash + ("" if limit else "_nolimit")
    )
    try:
        # The cache file is itself parsed as Python source, and its constants
        # are collected in the same way as for a module.
        return _constants_from_source(cache_p.read_bytes(), limit=limit)
    except Exception:
        # if the cached location doesn't exist, or it does exist but there was
        # a problem reading it, fall back to standard computation of the constants
        pass

    try:
        constants = _constants_from_source(source_bytes, limit=limit)
    except Exception:
        # A bunch of things can go wrong here.
        # * ast.parse may fail on the source code
        # * NodeVisitor may hit a RecursionError (see many related issues on
        #   e.g. libcst https://github.com/Instagram/LibCST/issues?q=recursion),
        #   or a MemoryError (`"[1, " * 200 + "]" * 200`)
        return Constants()

    try:
        cache_p.parent.mkdir(parents=True, exist_ok=True)
        cache_p.write_text(
            f"# file: {module_file}\n# hypothesis_version: {hypothesis.__version__}\n\n"
            # somewhat arbitrary sort order. The cache file doesn't *have* to be
            # stable... but it is aesthetically pleasing, and means we could rely
            # on it in the future!
            + _constants_file_str(constants),
            encoding="utf-8",
        )
    except Exception:  # pragma: no cover
        # Failing to write the cache is not an error; we still have the result.
        pass

    return constants
247
248
@lru_cache(4096)
def is_local_module_file(path: str) -> bool:
    """Return whether ``path`` should be treated as a local, non-test module file.

    This is true only if all of the following hold:

    * ``path`` is not in ``sys.stdlib_module_names``,
    * no component of ``path`` is ``site-packages``,
    * ``ModuleLocation.from_path(path)`` is ``ModuleLocation.LOCAL``,
    * ``is_hypothesis_file(path)`` is false,
    * ``path`` does not look like a test file: it has no ``test`` or ``tests``
      component, and its stem neither starts with ``test_`` nor ends with
      ``_test``.

    Results are memoized by ``lru_cache``.
    """
    from hypothesis.internal.scrutineer import ModuleLocation

    p = Path(path)
    return (
        # Skip expensive path lookup for stdlib modules.
        # This will cause false negatives if a user names their module the
        # same as a stdlib module.
        path not in sys.stdlib_module_names
        # A path containing site-packages is extremely likely to be
        # ModuleLocation.SITE_PACKAGES. Skip the expensive path lookup here.
        # We compare path components rather than searching for the substring
        # "/site-packages/", so that this shortcut also applies to paths which
        # use backslash separators on Windows.
        and "site-packages" not in p.parts
        and ModuleLocation.from_path(path) is ModuleLocation.LOCAL
        # normally, hypothesis is a third-party library and is not returned
        # by local_modules. However, if it is installed as an editable package
        # with pip install -e, then we will pick up on it. Just hardcode an
        # ignore here.
        and not is_hypothesis_file(path)
        # avoid collecting constants from test files
        and not (
            "test" in p.parts
            or "tests" in p.parts
            or p.stem.startswith("test_")
            or p.stem.endswith("_test")
        )
    )