1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import ast
12import hashlib
13import inspect
14import math
15import sys
16from ast import Constant, Expr, NodeVisitor, UnaryOp, USub
17from collections.abc import Iterator, MutableSet
18from functools import lru_cache
19from itertools import chain
20from pathlib import Path
21from types import ModuleType
22from typing import TYPE_CHECKING, Optional, Union
23
24import hypothesis
25from hypothesis.configuration import storage_directory
26from hypothesis.internal.conjecture.choice import ChoiceTypeT
27from hypothesis.internal.escalation import is_hypothesis_file
28
29if TYPE_CHECKING:
30 from typing import TypeAlias
31
# The Python types of the constants which this module collects and stores.
ConstantT: "TypeAlias" = Union[int, float, bytes, str]

# Alias for the builtin ``bytes``. Constants.__init__ takes a keyword argument
# named ``bytes``, which shadows the builtin inside that method; this alias
# lets the annotation there still refer to the type. (The alternative, naming
# the init arg ``bytes_``, was deliberately avoided.)
bytesT = bytes
36
37
class Constants:
    """A collection of constants, bucketed by type into four sets.

    Integers, floats, bytes and strings are each stored in their own set, so
    values which compare equal across types (such as ``1`` and ``1.0``) are
    tracked independently of one another.
    """

    def __init__(
        self,
        *,
        integers: Optional[MutableSet[int]] = None,
        floats: Optional[MutableSet[float]] = None,
        bytes: Optional[MutableSet[bytes]] = None,
        strings: Optional[MutableSet[str]] = None,
    ):
        # Any set passed in is used as-is (not copied); a missing argument
        # gets a fresh empty set.
        self.integers: MutableSet[int] = set() if integers is None else integers
        self.floats: MutableSet[float] = set() if floats is None else floats
        # ``bytes`` is shadowed by the argument here, hence the bytesT alias.
        self.bytes: MutableSet[bytesT] = set() if bytes is None else bytes
        self.strings: MutableSet[str] = set() if strings is None else strings

    def set_for_type(
        self, constant_type: Union[type[ConstantT], ChoiceTypeT]
    ) -> Union[MutableSet[int], MutableSet[float], MutableSet[bytes], MutableSet[str]]:
        """Return the set which holds constants of ``constant_type``.

        ``constant_type`` is either one of the Python types ``int``, ``float``,
        ``bytes``, ``str``, or one of the strings ``"integer"``, ``"float"``,
        ``"bytes"``, ``"string"``.

        Raises ``ValueError`` for anything else.
        """
        if constant_type is int or constant_type == "integer":
            return self.integers
        elif constant_type is float or constant_type == "float":
            return self.floats
        elif constant_type is bytes or constant_type == "bytes":
            return self.bytes
        elif constant_type is str or constant_type == "string":
            return self.strings
        raise ValueError(f"unknown constant_type {constant_type}")

    def add(self, constant: ConstantT) -> None:
        """Add ``constant`` to the set matching its exact type.

        Dispatch uses ``type(constant)``, so an instance of a subclass
        (including ``bool``) raises ``ValueError``.
        """
        self.set_for_type(type(constant)).add(constant)  # type: ignore

    def __contains__(self, constant: ConstantT) -> bool:
        # Looks only in the set for the exact type of ``constant``; raises
        # ValueError (via set_for_type) for unsupported types.
        return constant in self.set_for_type(type(constant))

    def __or__(self, other: "Constants") -> "Constants":
        """Return a new ``Constants`` holding the per-type union of both operands."""
        return Constants(
            integers=self.integers | other.integers,  # type: ignore
            floats=self.floats | other.floats,  # type: ignore
            bytes=self.bytes | other.bytes,  # type: ignore
            strings=self.strings | other.strings,  # type: ignore
        )

    def __iter__(self) -> Iterator[ConstantT]:
        # chain() already returns an iterator, so no extra iter() is needed.
        return chain(self.integers, self.floats, self.bytes, self.strings)

    def __len__(self) -> int:
        return (
            len(self.integers) + len(self.floats) + len(self.bytes) + len(self.strings)
        )

    def __repr__(self) -> str:
        return f"Constants({self.integers=}, {self.floats=}, {self.bytes=}, {self.strings=})"

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Constants):
            # Defer to the other operand's reflected __eq__ rather than
            # declaring inequality outright. ``Constants() == 1`` still
            # evaluates to False, but comparison against objects which know
            # how to compare themselves to anything is now symmetric.
            return NotImplemented
        return (
            self.integers == other.integers
            and self.floats == other.floats
            and self.bytes == other.bytes
            and self.strings == other.strings
        )
99
100
class TooManyConstants(Exception):
    """Control flow exception, raised in ConstantVisitor when the number of
    constants in a module gets too large."""
105
106
107class ConstantVisitor(NodeVisitor):
108 CONSTANTS_LIMIT: int = 1024
109
110 def __init__(self, *, limit: bool):
111 super().__init__()
112 self.constants = Constants()
113 self.limit = limit
114
115 def _add_constant(self, value: object) -> None:
116 if self.limit and len(self.constants) >= self.CONSTANTS_LIMIT:
117 raise TooManyConstants
118
119 if isinstance(value, str) and (
120 value.isspace()
121 or value == ""
122 # long strings are unlikely to be useful.
123 or len(value) > 20
124 ):
125 return
126 if isinstance(value, bytes) and (
127 value == b""
128 # long bytes seem plausibly more likely to be useful than long strings
129 # (e.g. AES-256 has a 32 byte key), but we still want to cap at some
130 # point to avoid performance issues.
131 or len(value) > 50
132 ):
133 return
134 if isinstance(value, bool):
135 return
136 if isinstance(value, float) and math.isinf(value):
137 # we already upweight inf.
138 return
139 if isinstance(value, int) and -100 < value < 100:
140 # we already upweight small integers.
141 return
142
143 if isinstance(value, (int, float, bytes, str)):
144 self.constants.add(value)
145 return
146
147 # I don't kow what case could go here, but am also not confident there
148 # isn't one.
149 return # pragma: no cover
150
151 def visit_UnaryOp(self, node: UnaryOp) -> None:
152 # `a = -1` is actually a combination of a USub and the constant 1.
153 if (
154 isinstance(node.op, USub)
155 and isinstance(node.operand, Constant)
156 and isinstance(node.operand.value, (int, float))
157 and not isinstance(node.operand.value, bool)
158 ):
159 self._add_constant(-node.operand.value)
160 # don't recurse on this node to avoid adding the positive variant
161 return
162
163 self.generic_visit(node)
164
165 def visit_Expr(self, node: Expr) -> None:
166 if isinstance(node.value, Constant) and isinstance(node.value.value, str):
167 return
168
169 self.generic_visit(node)
170
171 def visit_JoinedStr(self, node):
172 # dont recurse on JoinedStr, i.e. f strings. Constants that appear *only*
173 # in f strings are unlikely to be helpful.
174 return
175
176 def visit_Constant(self, node):
177 self._add_constant(node.value)
178 self.generic_visit(node)
179
180
def _constants_from_source(source: Union[str, bytes], *, limit: bool) -> Constants:
    """Parse ``source`` as Python and return the constants found in it.

    ``limit`` is forwarded to ``ConstantVisitor``. If the visitor gives up
    with ``TooManyConstants``, an empty ``Constants`` is returned. Any other
    exception (from parsing or from walking the tree) propagates.
    """
    parsed = ast.parse(source)
    collector = ConstantVisitor(limit=limit)

    try:
        collector.visit(parsed)
    except TooManyConstants:
        # A partial collection is discarded rather than returned, to avoid
        # muddying caches etc.
        return Constants()
    else:
        return collector.constants
193
194
def _constants_file_str(constants: Constants) -> str:
    """Return the string form of ``constants`` as a sorted list.

    Values are ordered first by the string form of their type, then by value.
    """

    def sort_key(value):
        return (str(type(value)), value)

    ordered = sorted(constants, key=sort_key)
    return str(ordered)
197
198
@lru_cache(4096)
def constants_from_module(module: ModuleType, *, limit: bool = True) -> Constants:
    """Return the constants found in the source file of ``module``.

    Collection is best-effort: if the source file cannot be located, read or
    parsed, an empty ``Constants`` is returned instead of raising. Results are
    memoized in-process by ``lru_cache``, and on disk in the ``constants``
    storage directory under a name derived from a hash of the source bytes.

    With ``limit=True`` (the default), source files over 512kb are skipped,
    and ``limit`` is forwarded to ``_constants_from_source``.
    """
    try:
        module_file = inspect.getsourcefile(module)
        # use type: ignore because we know this might error
        source_bytes = Path(module_file).read_bytes()  # type: ignore
    except Exception:
        return Constants()

    if limit and len(source_bytes) > 512 * 1024:
        # Skip files over 512kb. For reference, the largest source file
        # in Hypothesis is strategies/_internal/core.py at 107kb at time
        # of writing.
        return Constants()

    # The digest is only used to name the cache file, not for anything
    # security-sensitive. Saying so explicitly keeps this call usable on
    # Python builds which restrict sha1 for security purposes, in line with
    # the best-effort nature of the rest of this function. The resulting
    # digest is unchanged.
    source_hash = hashlib.sha1(source_bytes, usedforsecurity=False).hexdigest()[
        :16
    ]
    # separate cache files for each limit param. see discussion in pull/4398
    cache_p = storage_directory("constants") / (
        source_hash + ("" if limit else "_nolimit")
    )
    try:
        # The cache file is itself Python source (comments plus a list
        # literal), so it is read back with the same collector.
        return _constants_from_source(cache_p.read_bytes(), limit=limit)
    except Exception:
        # if the cached location doesn't exist, or it does exist but there was
        # a problem reading it, fall back to standard computation of the constants
        pass

    try:
        constants = _constants_from_source(source_bytes, limit=limit)
    except Exception:
        # A bunch of things can go wrong here.
        # * ast.parse may fail on the source code
        # * NodeVisitor may hit a RecursionError (see many related issues on
        #   e.g. libcst https://github.com/Instagram/LibCST/issues?q=recursion),
        #   or a MemoryError (`"[1, " * 200 + "]" * 200`)
        return Constants()

    try:
        cache_p.parent.mkdir(parents=True, exist_ok=True)
        cache_p.write_text(
            f"# file: {module_file}\n# hypothesis_version: {hypothesis.__version__}\n\n"
            # somewhat arbitrary sort order. The cache file doesn't *have* to be
            # stable... but it is aesthetically pleasing, and means we could rely
            # on it in the future!
            + _constants_file_str(constants),
            encoding="utf-8",
        )
    except Exception:  # pragma: no cover
        # Failing to write the cache must not prevent returning the result.
        pass

    return constants
250
251
@lru_cache(4096)
def is_local_module_file(path: str) -> bool:
    """Return whether ``path`` is a local, non-Hypothesis, non-test file.

    The checks run cheapest-first, and the first failing check short-circuits
    the rest. Results are memoized by ``lru_cache``.
    """
    from hypothesis.internal.scrutineer import ModuleLocation

    # Skip expensive path lookup for stdlib modules.
    # This will cause false negatives if a user names their module the
    # same as a stdlib module.
    #
    # sys.stdlib_module_names is new in 3.10
    #
    # NOTE(review): sys.stdlib_module_names holds module *names*, whereas the
    # argument here is called ``path``; a filesystem path would never match.
    # Confirm what callers pass before relying on this shortcut.
    if sys.version_info >= (3, 10) and path in sys.stdlib_module_names:
        return False

    # A path containing site-packages is extremely likely to be
    # ModuleLocation.SITE_PACKAGES. Skip the expensive path lookup here.
    if "/site-packages/" in path:
        return False

    if ModuleLocation.from_path(path) is not ModuleLocation.LOCAL:
        return False

    # normally, hypothesis is a third-party library and is not returned
    # by local_modules. However, if it is installed as an editable package
    # with pip install -e, then we will pick up on it. Just hardcode an
    # ignore here.
    if is_hypothesis_file(path):
        return False

    # avoid collecting constants from test files
    p = Path(path)
    looks_like_test = (
        "test" in p.parts
        or "tests" in p.parts
        or p.stem.startswith("test_")
        or p.stem.endswith("_test")
    )
    return not looks_like_test