1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import copy
12import re
13import warnings
14from functools import lru_cache, partial
15
16from hypothesis.errors import HypothesisWarning, InvalidArgument
17from hypothesis.internal import charmap
18from hypothesis.internal.filtering import max_len, min_len
19from hypothesis.internal.intervalsets import IntervalSet
20from hypothesis.strategies._internal.collections import ListStrategy
21from hypothesis.strategies._internal.lazy import unwrap_strategies
22from hypothesis.strategies._internal.numbers import IntegersStrategy
23from hypothesis.strategies._internal.strategies import SearchStrategy
24from hypothesis.vendor.pretty import pretty
25
26
class OneCharStringStrategy(SearchStrategy):
    """A strategy which generates single character strings of text type."""

    def __init__(self, intervals, force_repr=None):
        # The permitted codepoints must already be packaged as an IntervalSet.
        assert isinstance(intervals, IntervalSet)
        self._force_repr = force_repr
        self.intervals = intervals

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec=None,
        min_codepoint=None,
        max_codepoint=None,
        categories=None,
        exclude_characters=None,
        include_characters=None,
    ):
        """Build a strategy whose repr is rendered as a ``characters(...)`` call.

        The permitted characters come from ``charmap.query`` and, if a codec
        is given, are further intersected with the characters of that codec.

        Raises InvalidArgument when the arguments leave no character at all.
        """
        assert set(categories or ()).issubset(charmap.categories())
        allowed = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            allowed &= charmap.intervals_from_codec(codec)

        # Arguments holding one of these values are left out of the repr.
        omitted = (None, "", set(charmap.categories()) - {"Cs"})
        shown = []
        for name, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in omitted:
                continue
            shown.append(f"{name}={value!r}")
        _arg_repr = ", ".join(shown)

        if allowed:
            return cls(allowed, force_repr=f"characters({_arg_repr})")
        raise InvalidArgument(
            "No characters are allowed to be generated by this "
            f"combination of arguments: {_arg_repr}"
        )

    def __repr__(self):
        # Prefer the repr recorded at construction time, when there is one.
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data):
        # Exactly one character, taken from the permitted intervals.
        return data.draw_string(self.intervals, min_size=1, max_size=1)
80
81
82_nonempty_names = (
83 "capitalize",
84 "expandtabs",
85 "join",
86 "lower",
87 "rsplit",
88 "split",
89 "splitlines",
90 "swapcase",
91 "title",
92 "upper",
93)
94_nonempty_and_content_names = (
95 "islower",
96 "isupper",
97 "isalnum",
98 "isalpha",
99 "isascii",
100 "isdigit",
101 "isspace",
102 "istitle",
103 "lstrip",
104 "rstrip",
105 "strip",
106)
107
108
class TextStrategy(ListStrategy):
    """A list-based strategy whose drawn elements are joined into one ``str``."""

    def do_draw(self, data):
        # Only our own OneCharStringStrategy elements may take the shortcut of
        # the native IR string draw.  A user-provided element strategy might
        # define a different distribution than our IR, so in that case we go
        # through the ordinary ListStrategy draw and join the pieces.
        chars = unwrap_strategies(self.element_strategy)
        if not isinstance(chars, OneCharStringStrategy):
            return "".join(super().do_draw(data))
        return data.draw_string(
            chars.intervals, min_size=self.min_size, max_size=self.max_size
        )

    def __repr__(self):
        # Only mention arguments which differ from the defaults of text().
        shown = []
        element_repr = repr(self.element_strategy)
        if element_repr != "characters()":
            shown.append(element_repr)
        if self.min_size:
            shown.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            shown.append(f"max_size={self.max_size}")
        return "text(" + ", ".join(shown) + ")"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *[getattr(str, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *[getattr(str, name) for name in _nonempty_and_content_names],
    )

    def filter(self, condition):
        """Apply ``condition``, rewriting recognised conditions where possible.

        ``str.isidentifier`` is special-cased here; everything else is first
        offered to ``_string_filter_rewrite`` and otherwise handled by the
        parent class.
        """
        chars = unwrap_strategies(self.element_strategy)
        if (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(chars, OneCharStringStrategy)
        ):
            from hypothesis.strategies import builds, nothing

            id_start, id_continue = _identifier_characters()
            leading = chars.intervals & id_start
            if not leading:
                # No permitted character can start an identifier.
                return nothing()
            first_char = OneCharStringStrategy(leading)
            remainder = TextStrategy(
                OneCharStringStrategy(chars.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            candidates = builds("{}{}".format, first_char, remainder)
            # Filter to ensure that NFKC normalization keeps working in future
            return candidates.filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is None:
            return super().filter(condition)
        return rewritten
172
173
def _string_filter_rewrite(self, kind, condition):
    """Rewrite well-known filter conditions into a dedicated strategy.

    ``self`` is the text or bytes strategy being filtered, ``kind`` is the
    matching type (``str`` or ``bytes``) and ``condition`` is the predicate
    passed to ``.filter()``.

    Returns the rewritten strategy, or None when no rewrite applies, in which
    case the caller is expected to fall back to its ordinary filter handling.
    May emit a HypothesisWarning for conditions which are almost certainly a
    mistake (e.g. ``str.lower`` instead of ``str.islower``).
    """
    if condition in (kind.lower, kind.title, kind.upper):
        k = kind.__name__
        warnings.warn(
            f"You applied {k}.{condition.__name__} as a filter, but this allows "
            f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
            HypothesisWarning,
            stacklevel=2,
        )

    elems = unwrap_strategies(self.element_strategy)
    if (
        (kind is bytes or isinstance(elems, OneCharStringStrategy))
        and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
        and isinstance(pattern.pattern, kind)
    ):
        # The condition is a bound method of a compiled pattern of our kind.
        from hypothesis.strategies._internal.regex import regex_strategy

        if condition.__name__ == "match":
            # Replace with an easier-to-handle equivalent condition.  `match`
            # only ever matches at the very start of the string, so we anchor
            # with \A (a bare ^ would also match after newlines in MULTILINE
            # mode) and wrap the pattern in a non-capturing group so that the
            # anchor applies to every branch of a top-level alternation.
            if kind is str:
                prefix, suffix = r"\A(?:", ")"
            else:
                prefix, suffix = rb"\A(?:", b")"
            try:
                pattern = re.compile(
                    prefix + pattern.pattern + suffix, flags=pattern.flags
                )
            except re.error:
                # Some patterns can't be wrapped (e.g. inline global flags or
                # a trailing verbose-mode comment); leave the filter as-is.
                return None
            condition = pattern.search

        if condition.__name__ in ("search", "findall", "fullmatch"):
            s = regex_strategy(
                pattern,
                fullmatch=condition.__name__ == "fullmatch",
                alphabet=self.element_strategy if kind is str else None,
            )
            # The regex strategy knows nothing about our size bounds, so
            # re-impose whichever of them are actually restrictive.
            if self.min_size > 0:
                s = s.filter(partial(min_len, self.min_size))
            if self.max_size < float("inf"):
                s = s.filter(partial(max_len, self.max_size))
            return s
        elif condition.__name__ in ("finditer", "scanner"):
            # PyPy implements `finditer` as an alias to their `scanner` method
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any string at all! Did you mean .findall ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self
        elif condition.__name__ == "split":
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any nonempty string! Did you mean .search ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self.filter(bool)

    # We use ListStrategy filter logic for the conditions that *only* imply
    # the string is nonempty.  Here, we increment the min_size but still apply
    # the filter for conditions that imply nonempty *and specific contents*.
    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
        # Work on a shallow copy so the caller's strategy keeps its min_size.
        self = copy.copy(self)
        self.min_size = max(1, self.min_size)
        return ListStrategy.filter(self, condition)

    return None
239
240
# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# This text is parsed line by line in _identifier_characters(); only lines of
# the form "<code or code range> ; <property> # ..." contribute anything.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""
263
264
@lru_cache
def _identifier_characters():
    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers

    Returns a pair ``(id_start, id_continue)``: the characters which may
    begin an identifier, and those which may appear after the first character.
    The result is cached, as the function takes no arguments.
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # Entries are either a single code point ("2118") or an inclusive
            # range ("1885..1886").  Splitting on the separator, rather than
            # slicing a fixed number of hex digits, also copes with code
            # points which are not written with exactly four digits.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(map(chr, span))

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # Anything which may start an identifier may also continue one.
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue
292
293
class BytesStrategy(ListStrategy):
    """A list-based strategy over integers 0..255, returned as ``bytes``."""

    def __init__(self, min_size, max_size):
        byte_values = IntegersStrategy(0, 255)
        super().__init__(byte_values, min_size=min_size, max_size=max_size)

    def do_draw(self, data):
        # TODO: refactor the underlying provider to support variable-length bytes
        if self.min_size != self.max_size:
            return bytes(super().do_draw(data))
        # Fixed length: draw the whole buffer in a single call.
        return bytes(data.draw_bytes(self.min_size))

    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *[getattr(bytes, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = tuple(
        getattr(bytes, name) for name in _nonempty_and_content_names
    )

    def filter(self, condition):
        """Apply ``condition``, preferring a rewritten strategy when available."""
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is None:
            return super().filter(condition)
        return rewritten