1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import copy
12import re
13import warnings
14from collections.abc import Collection
15from functools import cache, lru_cache, partial
16from typing import Optional, Union, cast
17
18from hypothesis.errors import HypothesisWarning, InvalidArgument
19from hypothesis.internal import charmap
20from hypothesis.internal.charmap import Categories
21from hypothesis.internal.conjecture.data import ConjectureData
22from hypothesis.internal.conjecture.providers import COLLECTION_DEFAULT_MAX_SIZE
23from hypothesis.internal.filtering import max_len, min_len
24from hypothesis.internal.intervalsets import IntervalSet
25from hypothesis.internal.reflection import get_pretty_function_description
26from hypothesis.strategies._internal.collections import ListStrategy
27from hypothesis.strategies._internal.lazy import unwrap_strategies
28from hypothesis.strategies._internal.strategies import (
29 OneOfStrategy,
30 SampledFromStrategy,
31 SearchStrategy,
32)
33from hypothesis.vendor.pretty import pretty
34
35
# Only valid single characters are ever cached, so the cache size is limited by
# sys.maxunicode; passing None (i.e. using @cache) makes it slightly faster.
#
# This is part of our forward-facing validation, so we do *not* tell mypyc that c
# should be a str, because we don't want it to validate it before we can.
@cache
def _check_is_single_character(c: object) -> str:
    """Return ``c`` unchanged if it is a string of length exactly one.

    Raises InvalidArgument if ``c`` is not a string, or is a string of any
    other length.
    """
    # To mitigate the performance cost of this check we share a single cache
    # between all callers, even though that means the error message cannot
    # show the culprit strategy.
    if isinstance(c, str):
        if len(c) == 1:
            return c
        raise InvalidArgument(f"Got {c!r} (length {len(c)} != 1)")
    type_ = get_pretty_function_description(type(c))
    raise InvalidArgument(f"Got non-string {c!r} (type {type_})")
49
50
class OneCharStringStrategy(SearchStrategy[str]):
    """A strategy which generates single character strings of text type."""

    def __init__(
        self, intervals: IntervalSet, force_repr: Optional[str] = None
    ) -> None:
        """Store the allowed ``intervals`` and an optional fixed repr string."""
        super().__init__()
        assert isinstance(intervals, IntervalSet)
        self.intervals = intervals
        self._force_repr = force_repr

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec: Optional[str] = None,
        min_codepoint: Optional[int] = None,
        max_codepoint: Optional[int] = None,
        categories: Optional[Categories] = None,
        exclude_characters: Collection[str] = "",
        include_characters: Collection[str] = "",
    ) -> "OneCharStringStrategy":
        """Build a strategy from ``characters()``-style keyword arguments.

        Raises InvalidArgument if the combination of arguments leaves no
        character which could be generated.
        """
        assert set(categories or ()).issubset(charmap.categories())
        intervals = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            intervals &= charmap.intervals_from_codec(codec)

        # Only show arguments which were actually given, and omit `categories`
        # when it is exactly "every category except Cs".
        shown_args = []
        for name, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in (None, ""):
                continue
            if (
                name == "categories"
                # value has to be `categories` here. Help mypy along to infer that.
                and set(cast(Categories, value)) == set(charmap.categories()) - {"Cs"}
            ):
                continue
            shown_args.append(f"{name}={value!r}")
        arg_repr = ", ".join(shown_args)

        if not intervals:
            raise InvalidArgument(
                "No characters are allowed to be generated by this "
                f"combination of arguments: {arg_repr}"
            )
        return cls(intervals, force_repr=f"characters({arg_repr})")

    @classmethod
    def from_alphabet(
        cls, alphabet: Union[str, SearchStrategy]
    ) -> "OneCharStringStrategy":
        """Convert a string, or a supported strategy, into a one-character strategy.

        Accepts a plain string of allowed characters, an instance of this
        class, a SampledFromStrategy of single characters, or a OneOfStrategy
        whose branches are themselves acceptable.  Any other strategy raises
        InvalidArgument.
        """
        if isinstance(alphabet, str):
            return cls.from_characters_args(categories=(), include_characters=alphabet)

        assert isinstance(alphabet, SearchStrategy)
        unwrapped = unwrap_strategies(alphabet)

        if isinstance(unwrapped, cls):
            return unwrapped

        if isinstance(unwrapped, SampledFromStrategy):
            # Validate every element before handing them over as characters.
            for element in unwrapped.elements:
                _check_is_single_character(element)
            return cls.from_characters_args(
                categories=(),
                include_characters=unwrapped.elements,
            )

        if isinstance(unwrapped, OneOfStrategy):
            # Recursively convert each branch and merge the resulting intervals.
            merged = IntervalSet()
            for branch in unwrapped.element_strategies:
                merged = merged.union(cls.from_alphabet(branch).intervals)
            return cls(merged, force_repr=repr(alphabet))

        raise InvalidArgument(
            f"{alphabet=} must be a sampled_from() or characters() strategy"
        )

    def __repr__(self) -> str:
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data: ConjectureData) -> str:
        """Draw a string of exactly one character from the allowed intervals."""
        return data.draw_string(self.intervals, min_size=1, max_size=1)
140
141
# Names of methods, common to str and bytes, grouped by what a truthy result of
# calling the unbound method on a value tells us about that value.
# See https://docs.python.org/3/library/stdtypes.html#string-methods

# Truthy for *every* nonempty value and falsy for the empty one, so using one
# of these as a filter is equivalent to simply requiring a nonempty value.
_nonempty_names = (
    "capitalize",
    "expandtabs",
    "join",
    "lower",
    "splitlines",
    "swapcase",
    "title",
    "upper",
)
# Falsy for the empty value, but *also* dependent on the contents of nonempty
# values, so the filter itself must still be applied.
#
# `split` and `rsplit` belong here rather than above: with the default
# separator they return an empty list for whitespace-only input such as " ".
#
# `isascii` is deliberately in neither tuple, because it returns True for the
# empty string and so does not imply that the value is nonempty.
_nonempty_and_content_names = (
    "islower",
    "isupper",
    "isalnum",
    "isalpha",
    "isdigit",
    "isspace",
    "istitle",
    "lstrip",
    "rsplit",
    "rstrip",
    "split",
    "strip",
)
167
168
class TextStrategy(ListStrategy[str]):
    """List-based strategy whose drawn elements are joined into one string."""

    def do_draw(self, data):
        # If our element strategy is OneCharStringStrategy, we can skip the
        # ListStrategy draw and jump right to data.draw_string.
        # Doing so for user-provided element strategies is not correct in
        # general, as they may define a different distribution than
        # data.draw_string.
        chars = unwrap_strategies(self.element_strategy)
        if not isinstance(chars, OneCharStringStrategy):
            return "".join(super().do_draw(data))

        if self.max_size == float("inf"):
            size_cap = COLLECTION_DEFAULT_MAX_SIZE
        else:
            size_cap = self.max_size
        return data.draw_string(
            chars.intervals,
            min_size=self.min_size,
            max_size=size_cap,
        )

    def __repr__(self) -> str:
        # Arguments which are equal to their defaults are left out.
        element_repr = repr(self.element_strategy)
        parts = [] if element_repr == "characters()" else [element_repr]
        if self.min_size:
            parts.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            parts.append(f"max_size={self.max_size}")
        return f"text({', '.join(parts)})"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *[getattr(str, name) for name in _nonempty_names],
    )
    # These are only truthy for nonempty strings with suitable contents.
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *[getattr(str, name) for name in _nonempty_and_content_names],
    )

    def filter(self, condition):
        """Filter by ``condition``, rewriting to a better strategy where possible."""
        chars = unwrap_strategies(self.element_strategy)
        can_build_identifiers = (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(chars, OneCharStringStrategy)
        )
        if can_build_identifiers:
            from hypothesis.strategies import builds, nothing

            id_start, id_continue = _identifier_characters()
            first_chars = chars.intervals & id_start
            if not first_chars:
                # No allowed character can start an identifier.
                return nothing()

            head = OneCharStringStrategy(first_chars)
            tail = TextStrategy(
                OneCharStringStrategy(chars.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            # Filter to ensure that NFKC normalization keeps working in future
            return builds("{}{}".format, head, tail).filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is not None:
            return rewritten
        return super().filter(condition)
238
239
def _string_filter_rewrite(self, kind, condition):
    """Try to rewrite ``self.filter(condition)`` into a more efficient strategy.

    ``self`` is the strategy being filtered, ``kind`` is ``str`` or ``bytes``
    (the type of value it generates), and ``condition`` is the filter
    predicate.  Returns the replacement strategy, or None if no rewrite
    applies and the caller should fall back to its normal filter logic.

    May emit a HypothesisWarning when ``condition`` looks like a mistake.
    """
    if condition in (kind.lower, kind.title, kind.upper):
        k = kind.__name__
        warnings.warn(
            f"You applied {k}.{condition.__name__} as a filter, but this allows "
            f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
            HypothesisWarning,
            stacklevel=2,
        )

    # Bound methods of a compiled regex of the matching kind can be replaced
    # by a regex-based strategy.  For text, we only do this when the alphabet
    # is a OneCharStringStrategy.
    if (
        (
            kind is bytes
            or isinstance(
                unwrap_strategies(self.element_strategy), OneCharStringStrategy
            )
        )
        and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
        and isinstance(pattern.pattern, kind)
    ):
        from hypothesis.strategies._internal.regex import regex_strategy

        if condition.__name__ == "match":
            # Replace with an easier-to-handle equivalent condition.  We anchor
            # with `\A` rather than `^`, because `Pattern.match` only ever
            # matches at the very start of the string, whereas `^` would also
            # match after each newline if the pattern uses re.MULTILINE.
            anchor, close = (r"\A(?:", ")") if kind is str else (rb"\A(?:", b")")
            pattern = re.compile(anchor + pattern.pattern + close, flags=pattern.flags)
            condition = pattern.search

        if condition.__name__ in ("search", "findall", "fullmatch"):
            s = regex_strategy(
                pattern,
                fullmatch=condition.__name__ == "fullmatch",
                alphabet=self.element_strategy if kind is str else None,
            )
            # The regex strategy knows nothing about our size bounds, so
            # re-apply them as filters.
            if self.min_size > 0:
                s = s.filter(partial(min_len, self.min_size))
            if self.max_size < float("inf"):
                s = s.filter(partial(max_len, self.max_size))
            return s
        elif condition.__name__ in ("finditer", "scanner"):
            # PyPy implements `finditer` as an alias to their `scanner` method
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any string at all! Did you mean .findall ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self
        elif condition.__name__ == "split":
            # `Pattern.split` always returns a list with at least one element
            # (e.g. `['']` for empty input), so it is truthy for every string
            # including the empty one and the strategy must not be narrowed.
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any string at all! Did you mean .search ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self

    # We use ListStrategy filter logic for the conditions that *only* imply
    # the string is nonempty.  Here, we increment the min_size but still apply
    # the filter for conditions that imply nonempty *and specific contents*.
    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
        # Work on a copy so that the strategy being filtered is not mutated.
        narrowed = copy.copy(self)
        narrowed.min_size = max(1, narrowed.min_size)
        return ListStrategy.filter(narrowed, condition)

    return None
306
307
# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# Each data line has the form `<code point or first..last range> ; <property> # ...`
# and is parsed by `_identifier_characters()`; all other lines are ignored.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""
330
331
@lru_cache
def _identifier_characters() -> tuple[IntervalSet, IntervalSet]:
    """Return ``(id_start, id_continue)``, the characters usable in identifiers.

    ``id_start`` holds the characters which may begin an identifier, and
    ``id_continue`` (a superset of it) those which may appear afterwards.
    The result is cached, so the sets are only computed once.

    See https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # An entry is either a single code point or an inclusive
            # `first..last` range.  Splitting on the separator, rather than
            # slicing a fixed four characters from each end, also parses code
            # points written with more than four hex digits correctly.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(chr(x) for x in span)

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # Anything which can start an identifier can also continue one.
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue
359
360
class BytesStrategy(SearchStrategy):
    """Strategy which draws ``bytes`` values via ``data.draw_bytes``."""

    def __init__(self, min_size: int, max_size: Optional[int]):
        super().__init__()
        if max_size is None:
            # No explicit upper bound was given, so use the default cap.
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        self.min_size = min_size
        self.max_size = max_size

    def do_draw(self, data: ConjectureData) -> bytes:
        return data.draw_bytes(self.min_size, self.max_size)

    # Filters which are truthy for any nonempty bytes value.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *[getattr(bytes, name) for name in _nonempty_names],
    )
    # Filters which are only truthy for nonempty values with suitable contents.
    _nonempty_and_content_filters = tuple(
        getattr(bytes, name) for name in _nonempty_and_content_names
    )

    def filter(self, condition):
        """Filter by ``condition``, preferring a rewritten strategy if one exists."""
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is None:
            return ListStrategy.filter(self, condition)
        return rewritten