1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import copy
12import re
13import warnings
14from functools import cache, lru_cache, partial
15from typing import Optional
16
17from hypothesis.errors import HypothesisWarning, InvalidArgument
18from hypothesis.internal import charmap
19from hypothesis.internal.conjecture.data import ConjectureData
20from hypothesis.internal.conjecture.providers import COLLECTION_DEFAULT_MAX_SIZE
21from hypothesis.internal.filtering import max_len, min_len
22from hypothesis.internal.intervalsets import IntervalSet
23from hypothesis.internal.reflection import get_pretty_function_description
24from hypothesis.strategies._internal.collections import ListStrategy
25from hypothesis.strategies._internal.lazy import unwrap_strategies
26from hypothesis.strategies._internal.strategies import (
27 OneOfStrategy,
28 SampledFromStrategy,
29 SearchStrategy,
30)
31from hypothesis.vendor.pretty import pretty
32
33
34# Cache size is limited by sys.maxunicode, but passing None makes it slightly faster.
35@cache
36def _check_is_single_character(c):
37 # In order to mitigate the performance cost of this check, we use a shared cache,
38 # even at the cost of showing the culprit strategy in the error message.
39 if not isinstance(c, str):
40 type_ = get_pretty_function_description(type(c))
41 raise InvalidArgument(f"Got non-string {c!r} (type {type_})")
42 if len(c) != 1:
43 raise InvalidArgument(f"Got {c!r} (length {len(c)} != 1)")
44 return c
45
46
class OneCharStringStrategy(SearchStrategy[str]):
    """A strategy which generates single character strings of text type."""

    def __init__(
        self, intervals: IntervalSet, force_repr: Optional[str] = None
    ) -> None:
        # ``intervals`` is the set of characters that may be drawn;
        # ``force_repr``, when given, is returned verbatim by ``__repr__``.
        assert isinstance(intervals, IntervalSet)
        self.intervals = intervals
        self._force_repr = force_repr

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec=None,
        min_codepoint=None,
        max_codepoint=None,
        categories=None,
        exclude_characters=None,
        include_characters=None,
    ):
        """Build a strategy from ``characters()``-style keyword arguments.

        The arguments are passed to ``charmap.query`` and, if ``codec`` is
        given, the result is intersected with the characters of that codec.
        Raises InvalidArgument when no character at all remains.
        """
        assert set(categories or ()).issubset(charmap.categories())
        intervals = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            intervals &= charmap.intervals_from_codec(codec)

        # Collect ``name=value`` for each argument worth showing in the repr:
        # unset (None / "") arguments are skipped, and so is ``categories``
        # when it is exactly "every category except Cs".
        shown = []
        for key, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in (None, ""):
                continue
            if key == "categories" and set(value) == set(charmap.categories()) - {
                "Cs"
            }:
                continue
            shown.append(f"{key}={value!r}")
        _arg_repr = ", ".join(shown)

        if intervals:
            return cls(intervals, force_repr=f"characters({_arg_repr})")
        raise InvalidArgument(
            "No characters are allowed to be generated by this "
            f"combination of arguments: {_arg_repr}"
        )

    @classmethod
    def from_alphabet(cls, alphabet):
        """Convert ``alphabet`` (a string or a strategy) into an instance.

        Accepted strategies, after unwrapping, are instances of this class,
        ``SampledFromStrategy`` (every element must be a single character)
        and ``OneOfStrategy`` (each branch is converted recursively and the
        character sets are unioned).  Anything else raises InvalidArgument.
        """
        if isinstance(alphabet, str):
            return cls.from_characters_args(categories=(), include_characters=alphabet)

        assert isinstance(alphabet, SearchStrategy)
        unwrapped = unwrap_strategies(alphabet)

        if isinstance(unwrapped, cls):
            return unwrapped

        if isinstance(unwrapped, SampledFromStrategy):
            for element in unwrapped.elements:
                _check_is_single_character(element)
            return cls.from_characters_args(
                categories=(),
                include_characters=unwrapped.elements,
            )

        if isinstance(unwrapped, OneOfStrategy):
            combined = IntervalSet()
            for branch in unwrapped.element_strategies:
                combined = combined.union(cls.from_alphabet(branch).intervals)
            # The repr is taken from the strategy as originally passed in.
            return cls(combined, force_repr=repr(alphabet))

        raise InvalidArgument(
            f"{alphabet=} must be a sampled_from() or characters() strategy"
        )

    def __repr__(self) -> str:
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data: ConjectureData) -> str:
        # A string of exactly one character from the allowed intervals.
        return data.draw_string(self.intervals, min_size=1, max_size=1)
129
130
131_nonempty_names = (
132 "capitalize",
133 "expandtabs",
134 "join",
135 "lower",
136 "rsplit",
137 "split",
138 "splitlines",
139 "swapcase",
140 "title",
141 "upper",
142)
143_nonempty_and_content_names = (
144 "islower",
145 "isupper",
146 "isalnum",
147 "isalpha",
148 "isascii",
149 "isdigit",
150 "isspace",
151 "istitle",
152 "lstrip",
153 "rstrip",
154 "strip",
155)
156
157
class TextStrategy(ListStrategy[str]):
    """Generate ``str`` values whose characters come from the element strategy."""

    def do_draw(self, data):
        # Only our own OneCharStringStrategy may skip the ListStrategy draw
        # and go straight to data.draw_string.  User-provided element
        # strategies may define a different distribution than
        # data.draw_string, so for them the generic draw is joined instead.
        char_strategy = unwrap_strategies(self.element_strategy)
        if not isinstance(char_strategy, OneCharStringStrategy):
            return "".join(super().do_draw(data))

        upper_bound = self.max_size
        if upper_bound == float("inf"):
            upper_bound = COLLECTION_DEFAULT_MAX_SIZE
        return data.draw_string(
            char_strategy.intervals,
            min_size=self.min_size,
            max_size=upper_bound,
        )

    def __repr__(self) -> str:
        # Arguments equal to their defaults are left out of the repr.
        element_repr = repr(self.element_strategy)
        shown = [] if element_repr == "characters()" else [element_repr]
        if self.min_size:
            shown.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            shown.append(f"max_size={self.max_size}")
        return "text(" + ", ".join(shown) + ")"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *[getattr(str, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *[getattr(str, name) for name in _nonempty_and_content_names],
    )

    def filter(self, condition):
        elems = unwrap_strategies(self.element_strategy)
        wants_identifier = (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(elems, OneCharStringStrategy)
        )
        if wants_identifier:
            from hypothesis.strategies import builds, nothing

            id_start, id_continue = _identifier_characters()
            first_chars = elems.intervals & id_start
            if not first_chars:
                # No allowed character can start an identifier.
                return nothing()

            # One leading "start" character, followed by zero or more
            # "continue" characters within the remaining size budget.
            head = OneCharStringStrategy(first_chars)
            tail = TextStrategy(
                OneCharStringStrategy(elems.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            # Filter to ensure that NFKC normalization keeps working in future
            return builds("{}{}".format, head, tail).filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is not None:
            return rewritten
        return super().filter(condition)
227
228
229def _string_filter_rewrite(self, kind, condition):
230 if condition in (kind.lower, kind.title, kind.upper):
231 k = kind.__name__
232 warnings.warn(
233 f"You applied {k}.{condition.__name__} as a filter, but this allows "
234 f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
235 HypothesisWarning,
236 stacklevel=2,
237 )
238
239 if (
240 (
241 kind is bytes
242 or isinstance(
243 unwrap_strategies(self.element_strategy), OneCharStringStrategy
244 )
245 )
246 and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
247 and isinstance(pattern.pattern, kind)
248 ):
249 from hypothesis.strategies._internal.regex import regex_strategy
250
251 if condition.__name__ == "match":
252 # Replace with an easier-to-handle equivalent condition
253 caret, close = ("^(?:", ")") if kind is str else (b"^(?:", b")")
254 pattern = re.compile(caret + pattern.pattern + close, flags=pattern.flags)
255 condition = pattern.search
256
257 if condition.__name__ in ("search", "findall", "fullmatch"):
258 s = regex_strategy(
259 pattern,
260 fullmatch=condition.__name__ == "fullmatch",
261 alphabet=self.element_strategy if kind is str else None,
262 )
263 if self.min_size > 0:
264 s = s.filter(partial(min_len, self.min_size))
265 if self.max_size < 1e999:
266 s = s.filter(partial(max_len, self.max_size))
267 return s
268 elif condition.__name__ in ("finditer", "scanner"):
269 # PyPy implements `finditer` as an alias to their `scanner` method
270 warnings.warn(
271 f"You applied {pretty(condition)} as a filter, but this allows "
272 f"any string at all! Did you mean .findall ?",
273 HypothesisWarning,
274 stacklevel=3,
275 )
276 return self
277 elif condition.__name__ == "split":
278 warnings.warn(
279 f"You applied {pretty(condition)} as a filter, but this allows "
280 f"any nonempty string! Did you mean .search ?",
281 HypothesisWarning,
282 stacklevel=3,
283 )
284 return self.filter(bool)
285
286 # We use ListStrategy filter logic for the conditions that *only* imply
287 # the string is nonempty. Here, we increment the min_size but still apply
288 # the filter for conditions that imply nonempty *and specific contents*.
289 if condition in self._nonempty_and_content_filters and self.max_size >= 1:
290 self = copy.copy(self)
291 self.min_size = max(1, self.min_size)
292 return ListStrategy.filter(self, condition)
293
294 return None
295
296
# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# Each data line has the form ``CODE[..CODE] ; Property # ...``.  Lines that
# do not have that shape (blank lines and ``#`` comment lines) are skipped by
# the regex match in _identifier_characters().
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""
319
320
@lru_cache
def _identifier_characters():
    """Return the ``(id_start, id_continue)`` character sets for identifiers.

    ``id_start`` holds the characters allowed as the first character and
    ``id_continue`` those allowed in the remaining positions.

    See https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # ``codes`` is either a single code point ("2118") or an
            # inclusive range ("1885..1886").  Split on the separator instead
            # of slicing a fixed four hex digits, so that code points of any
            # length are parsed correctly.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(map(chr, span))

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # Anything that may start an identifier may also continue one.
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue
348
349
class BytesStrategy(SearchStrategy):
    """Generate ``bytes`` values with a length between two bounds."""

    def __init__(self, min_size: int, max_size: Optional[int]):
        self.min_size = min_size
        # An unbounded request is capped at the default collection size.
        if max_size is None:
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        self.max_size = max_size

    def do_draw(self, data):
        return data.draw_bytes(self.min_size, self.max_size)

    # Predicates that only imply the value is nonempty.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *[getattr(bytes, name) for name in _nonempty_names],
    )
    # Predicates that imply the value is nonempty and constrain its contents.
    _nonempty_and_content_filters = tuple(
        getattr(bytes, name) for name in _nonempty_and_content_names
    )

    def filter(self, condition):
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is None:
            # No rewrite applies, so reuse the ListStrategy filter logic.
            return ListStrategy.filter(self, condition)
        return rewritten