1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import copy
12import re
13import warnings
14from collections.abc import Collection
15from functools import cache, lru_cache, partial
16from typing import cast
17
18from hypothesis.errors import HypothesisWarning, InvalidArgument
19from hypothesis.internal import charmap
20from hypothesis.internal.charmap import Categories
21from hypothesis.internal.conjecture.data import ConjectureData
22from hypothesis.internal.conjecture.providers import COLLECTION_DEFAULT_MAX_SIZE
23from hypothesis.internal.filtering import max_len, min_len
24from hypothesis.internal.intervalsets import IntervalSet
25from hypothesis.internal.reflection import get_pretty_function_description
26from hypothesis.strategies._internal.collections import ListStrategy
27from hypothesis.strategies._internal.lazy import unwrap_strategies
28from hypothesis.strategies._internal.strategies import (
29 OneOfStrategy,
30 SampledFromStrategy,
31 SearchStrategy,
32)
33from hypothesis.vendor.pretty import pretty
34
35
36# Cache size is limited by sys.maxunicode, but passing None makes it slightly faster.
37@cache
38# this is part of our forward-facing validation, so we do *not* tell mypyc that c
39# should be a str, because we don't want it to validate it before we can.
40def _check_is_single_character(c: object) -> str:
41 # In order to mitigate the performance cost of this check, we use a shared cache,
42 # even at the cost of showing the culprit strategy in the error message.
43 if not isinstance(c, str):
44 type_ = get_pretty_function_description(type(c))
45 raise InvalidArgument(f"Got non-string {c!r} (type {type_})")
46 if len(c) != 1:
47 raise InvalidArgument(f"Got {c!r} (length {len(c)} != 1)")
48 return c
49
50
class OneCharStringStrategy(SearchStrategy[str]):
    """A strategy which generates single character strings of text type."""

    def __init__(self, intervals: IntervalSet, force_repr: str | None = None) -> None:
        super().__init__()
        assert isinstance(intervals, IntervalSet)
        # The IntervalSet handed to data.draw_string when drawing.
        self.intervals = intervals
        # When truthy, used verbatim as this strategy's repr.
        self._force_repr = force_repr

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec: str | None = None,
        min_codepoint: int | None = None,
        max_codepoint: int | None = None,
        categories: Categories | None = None,
        exclude_characters: Collection[str] = "",
        include_characters: Collection[str] = "",
    ) -> "OneCharStringStrategy":
        """Build a strategy from ``characters()``-style keyword arguments.

        The intervals come from ``charmap.query``, further intersected with
        the characters of ``codec`` when one is given.  The resulting repr is
        ``characters(...)``, listing each argument except those that are None
        or ``""``, and ``categories`` when it equals every category but "Cs".

        Raises InvalidArgument if the arguments allow no characters at all.
        """
        assert set(categories or ()).issubset(charmap.categories())
        intervals = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            intervals &= charmap.intervals_from_codec(codec)

        shown_args = []
        for name, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in (None, ""):
                continue
            if (
                name == "categories"
                # value has to be `categories` here. Help mypy along to infer that.
                and set(cast(Categories, value)) == set(charmap.categories()) - {"Cs"}
            ):
                continue
            shown_args.append(f"{name}={value!r}")
        _arg_repr = ", ".join(shown_args)

        if not intervals:
            raise InvalidArgument(
                "No characters are allowed to be generated by this "
                f"combination of arguments: {_arg_repr}"
            )
        return cls(intervals, force_repr=f"characters({_arg_repr})")

    @classmethod
    def from_alphabet(cls, alphabet: str | SearchStrategy) -> "OneCharStringStrategy":
        """Convert an alphabet (a string or a strategy) into this strategy type.

        Accepts a plain string of allowed characters, an instance of this
        class, a SampledFromStrategy whose elements are all single characters,
        or a OneOfStrategy whose branches are themselves acceptable alphabets.

        Raises InvalidArgument for any other kind of strategy.
        """
        if isinstance(alphabet, str):
            return cls.from_characters_args(categories=(), include_characters=alphabet)

        assert isinstance(alphabet, SearchStrategy)
        unwrapped = unwrap_strategies(alphabet)

        if isinstance(unwrapped, cls):
            return unwrapped

        if isinstance(unwrapped, SampledFromStrategy):
            # Validate every element up front before building the intervals.
            for element in unwrapped.elements:
                _check_is_single_character(element)
            return cls.from_characters_args(
                categories=(),
                include_characters=unwrapped.elements,
            )

        if isinstance(unwrapped, OneOfStrategy):
            # Recursively convert each branch and take the union of the results.
            merged = IntervalSet()
            for branch in unwrapped.element_strategies:
                merged = merged.union(cls.from_alphabet(branch).intervals)
            return cls(merged, force_repr=repr(alphabet))

        raise InvalidArgument(
            f"{alphabet=} must be a sampled_from() or characters() strategy"
        )

    def __repr__(self) -> str:
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data: ConjectureData) -> str:
        """Draw a string of exactly one character from ``self.intervals``."""
        return data.draw_string(self.intervals, min_size=1, max_size=1)
136
137
138_nonempty_names = (
139 "capitalize",
140 "expandtabs",
141 "join",
142 "lower",
143 "rsplit",
144 "split",
145 "splitlines",
146 "swapcase",
147 "title",
148 "upper",
149)
150_nonempty_and_content_names = (
151 "islower",
152 "isupper",
153 "isalnum",
154 "isalpha",
155 "isascii",
156 "isdigit",
157 "isspace",
158 "istitle",
159 "lstrip",
160 "rstrip",
161 "strip",
162)
163
164
class TextStrategy(ListStrategy[str]):
    """Strategy for ``str`` values whose characters come from ``element_strategy``."""

    def do_draw(self, data):
        # When the elements come from a OneCharStringStrategy we can bypass the
        # ListStrategy draw and ask data.draw_string for the whole string.
        # That shortcut is not correct in general for user-provided element
        # strategies, which may define a different distribution than
        # data.draw_string.
        char_strategy = unwrap_strategies(self.element_strategy)
        if not isinstance(char_strategy, OneCharStringStrategy):
            return "".join(super().do_draw(data))

        if self.max_size == float("inf"):
            size_cap = COLLECTION_DEFAULT_MAX_SIZE
        else:
            size_cap = self.max_size
        return data.draw_string(
            char_strategy.intervals,
            min_size=self.min_size,
            max_size=size_cap,
        )

    def __repr__(self) -> str:
        # Only mention arguments that differ from their defaults.
        element_repr = repr(self.element_strategy)
        parts = [] if element_repr == "characters()" else [element_repr]
        if self.min_size:
            parts.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            parts.append(f"max_size={self.max_size}")
        return f"text({', '.join(parts)})"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *(getattr(str, n) for n in _nonempty_names),
    )
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *(getattr(str, n) for n in _nonempty_and_content_names),
    )

    def filter(self, condition):
        """Return a strategy for the strings of ``self`` that satisfy ``condition``.

        ``str.isidentifier`` over a OneCharStringStrategy alphabet is rewritten
        to build identifiers directly; other conditions are offered to
        ``_string_filter_rewrite`` before falling back to the parent's filter.
        """
        elems = unwrap_strategies(self.element_strategy)
        build_identifiers = (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(elems, OneCharStringStrategy)
        )
        if build_identifiers:
            from hypothesis.strategies import builds, nothing

            id_start, id_continue = _identifier_characters()
            first_chars = elems.intervals & id_start
            if not first_chars:
                # No character of the alphabet can start an identifier.
                return nothing()
            head = OneCharStringStrategy(first_chars)
            tail = TextStrategy(
                OneCharStringStrategy(elems.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            # Filter to ensure that NFKC normalization keeps working in future
            return builds("{}{}".format, head, tail).filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is not None:
            return rewritten
        return super().filter(condition)
234
235
236def _string_filter_rewrite(self, kind, condition):
237 if condition in (kind.lower, kind.title, kind.upper):
238 k = kind.__name__
239 warnings.warn(
240 f"You applied {k}.{condition.__name__} as a filter, but this allows "
241 f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
242 HypothesisWarning,
243 stacklevel=2,
244 )
245
246 if (
247 (
248 kind is bytes
249 or isinstance(
250 unwrap_strategies(self.element_strategy), OneCharStringStrategy
251 )
252 )
253 and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
254 and isinstance(pattern.pattern, kind)
255 ):
256 from hypothesis.strategies._internal.regex import regex_strategy
257
258 if condition.__name__ == "match":
259 # Replace with an easier-to-handle equivalent condition
260 caret, close = ("^(?:", ")") if kind is str else (b"^(?:", b")")
261 pattern = re.compile(caret + pattern.pattern + close, flags=pattern.flags)
262 condition = pattern.search
263
264 if condition.__name__ in ("search", "findall", "fullmatch"):
265 s = regex_strategy(
266 pattern,
267 fullmatch=condition.__name__ == "fullmatch",
268 alphabet=self.element_strategy if kind is str else None,
269 )
270 if self.min_size > 0:
271 s = s.filter(partial(min_len, self.min_size))
272 if self.max_size < 1e999:
273 s = s.filter(partial(max_len, self.max_size))
274 return s
275 elif condition.__name__ in ("finditer", "scanner"):
276 # PyPy implements `finditer` as an alias to their `scanner` method
277 warnings.warn(
278 f"You applied {pretty(condition)} as a filter, but this allows "
279 f"any string at all! Did you mean .findall ?",
280 HypothesisWarning,
281 stacklevel=3,
282 )
283 return self
284 elif condition.__name__ == "split":
285 warnings.warn(
286 f"You applied {pretty(condition)} as a filter, but this allows "
287 f"any nonempty string! Did you mean .search ?",
288 HypothesisWarning,
289 stacklevel=3,
290 )
291 return self.filter(bool)
292
293 # We use ListStrategy filter logic for the conditions that *only* imply
294 # the string is nonempty. Here, we increment the min_size but still apply
295 # the filter for conditions that imply nonempty *and specific contents*.
296 if condition in self._nonempty_and_content_filters and self.max_size >= 1:
297 self = copy.copy(self)
298 self.min_size = max(1, self.min_size)
299 return ListStrategy.filter(self, condition)
300
301 return None
302
303
# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# Consumed by `_identifier_characters()`, which only reads the lines of the form
# `<hex code point or range> ; <property name> # ...` and ignores the rest.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""
326
327
@lru_cache
def _identifier_characters() -> tuple[IntervalSet, IntervalSet]:
    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers

    Returns ``(id_start, id_continue)``: the characters allowed as the first
    character of an identifier, and those allowed in any later position.
    ``id_continue`` is a superset of ``id_start``.
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # `codes` is a single code point ("2118") or an inclusive range
            # ("1885..1886").  Split on the separator instead of slicing a
            # fixed width, so code points wider than four hex digits also
            # parse correctly.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(chr(x) for x in span)

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue
355
356
class BytesStrategy(SearchStrategy):
    """Strategy for ``bytes`` values with a length between two bounds."""

    def __init__(self, min_size: int, max_size: int | None):
        super().__init__()
        self.min_size = min_size
        # An unbounded request is capped at the shared collection default.
        if max_size is None:
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        self.max_size = max_size

    def do_draw(self, data: ConjectureData) -> bytes:
        """Draw a bytestring whose length is within the configured bounds."""
        return data.draw_bytes(self.min_size, self.max_size)

    # Predicates which only require that the value is nonempty.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *(getattr(bytes, n) for n in _nonempty_names),
    )
    # Predicates which require a nonempty value and constrain its contents.
    _nonempty_and_content_filters = tuple(
        getattr(bytes, n) for n in _nonempty_and_content_names
    )

    def filter(self, condition):
        """Filter by ``condition``, using a specialised rewrite when one exists."""
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is None:
            # No rewrite applies, so reuse the ListStrategy filter logic.
            return ListStrategy.filter(self, condition)
        return rewritten