Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/strategies/_internal/strings.py: 38%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

149 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import copy 

12import re 

13import warnings 

14from collections.abc import Collection 

15from functools import cache, lru_cache, partial 

16from typing import cast 

17 

18from hypothesis.errors import HypothesisWarning, InvalidArgument 

19from hypothesis.internal import charmap 

20from hypothesis.internal.charmap import Categories 

21from hypothesis.internal.conjecture.data import ConjectureData 

22from hypothesis.internal.conjecture.providers import COLLECTION_DEFAULT_MAX_SIZE 

23from hypothesis.internal.filtering import max_len, min_len 

24from hypothesis.internal.intervalsets import IntervalSet 

25from hypothesis.internal.reflection import get_pretty_function_description 

26from hypothesis.strategies._internal.collections import ListStrategy 

27from hypothesis.strategies._internal.lazy import unwrap_strategies 

28from hypothesis.strategies._internal.strategies import ( 

29 OneOfStrategy, 

30 SampledFromStrategy, 

31 SearchStrategy, 

32) 

33from hypothesis.vendor.pretty import pretty 

34 

35 

# The number of cached entries is limited by sys.maxunicode, so an unbounded
# cache is safe here - and slightly faster than a size-limited one.
@cache
# This is part of our forward-facing validation, so `c` is deliberately annotated
# as `object` rather than `str`: we do *not* want mypyc to validate the argument
# before we get the chance to.
def _check_is_single_character(c: object) -> str:
    """Return ``c`` unchanged if it is a string of exactly one character.

    Raises InvalidArgument if ``c`` is not a string, or if it is a string
    whose length is not one.
    """
    # To mitigate the performance cost of this check the cache is shared between
    # all callers, which is why the error messages below do not show the
    # culprit strategy.
    if isinstance(c, str):
        if len(c) == 1:
            return c
        raise InvalidArgument(f"Got {c!r} (length {len(c)} != 1)")
    type_ = get_pretty_function_description(type(c))
    raise InvalidArgument(f"Got non-string {c!r} (type {type_})")

49 

50 

class OneCharStringStrategy(SearchStrategy[str]):
    """A strategy which generates single character strings of text type."""

    def __init__(self, intervals: IntervalSet, force_repr: str | None = None) -> None:
        """Store the allowed ``intervals`` and an optional fixed repr string."""
        super().__init__()
        assert isinstance(intervals, IntervalSet)
        self.intervals = intervals
        self._force_repr = force_repr

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec: str | None = None,
        min_codepoint: int | None = None,
        max_codepoint: int | None = None,
        categories: Categories | None = None,
        exclude_characters: Collection[str] = "",
        include_characters: Collection[str] = "",
    ) -> "OneCharStringStrategy":
        """Build a strategy from ``characters()``-style keyword arguments.

        The arguments are passed to ``charmap.query`` and, when ``codec`` is
        given, the result is intersected with the characters of that codec.
        Raises InvalidArgument if no characters remain.
        """
        assert set(categories or ()).issubset(charmap.categories())
        intervals = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            intervals &= charmap.intervals_from_codec(codec)

        # Describe only the arguments which carry information: unset values
        # are skipped, and so is `categories` when it names every category
        # except "Cs".
        shown = []
        for name, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in (None, ""):
                continue
            # value has to be `categories` here. Help mypy along to infer that.
            if name == "categories" and (
                set(cast(Categories, value)) == set(charmap.categories()) - {"Cs"}
            ):
                continue
            shown.append(f"{name}={value!r}")
        _arg_repr = ", ".join(shown)

        if not intervals:
            raise InvalidArgument(
                "No characters are allowed to be generated by this "
                f"combination of arguments: {_arg_repr}"
            )
        return cls(intervals, force_repr=f"characters({_arg_repr})")

    @classmethod
    def from_alphabet(cls, alphabet: str | SearchStrategy) -> "OneCharStringStrategy":
        """Build a strategy from an alphabet given as a string or a strategy.

        Accepted strategies (after unwrapping) are instances of this class,
        SampledFromStrategy whose elements are all single characters, and
        OneOfStrategy whose branches are themselves accepted alphabets.
        Raises InvalidArgument for anything else.
        """
        if isinstance(alphabet, str):
            return cls.from_characters_args(categories=(), include_characters=alphabet)

        assert isinstance(alphabet, SearchStrategy)
        char_strategy = unwrap_strategies(alphabet)

        if isinstance(char_strategy, cls):
            return char_strategy

        if isinstance(char_strategy, SampledFromStrategy):
            for element in char_strategy.elements:
                _check_is_single_character(element)
            return cls.from_characters_args(
                categories=(),
                include_characters=char_strategy.elements,
            )

        if isinstance(char_strategy, OneOfStrategy):
            # Merge the intervals of every branch, converting each recursively.
            merged = IntervalSet()
            for branch in char_strategy.element_strategies:
                merged = merged.union(cls.from_alphabet(branch).intervals)
            return cls(merged, force_repr=repr(alphabet))

        raise InvalidArgument(
            f"{alphabet=} must be a sampled_from() or characters() strategy"
        )

    def __repr__(self) -> str:
        """Return the forced repr if one was supplied, else a generic one."""
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data: ConjectureData) -> str:
        """Draw a string of exactly one character from ``self.intervals``."""
        return data.draw_string(self.intervals, min_size=1, max_size=1)

136 

137 

# Method names looked up on both `str` and `bytes` further down this module
# to build the `_nonempty_filters` tuples of the text and bytes strategies.
_nonempty_names = (
    "capitalize", "expandtabs", "join", "lower", "rsplit",
    "split", "splitlines", "swapcase", "title", "upper",
)
# Method names looked up on both `str` and `bytes` further down this module
# to build the `_nonempty_and_content_filters` tuples: used as a filter, these
# imply that the value is nonempty *and* has specific contents.
_nonempty_and_content_names = (
    "islower", "isupper", "isalnum", "isalpha", "isascii", "isdigit",
    "isspace", "istitle", "lstrip", "rstrip", "strip",
)

163 

164 

class TextStrategy(ListStrategy[str]):
    """A list-of-characters strategy whose drawn elements are joined into a string."""

    def do_draw(self, data):
        # When the elements come from a OneCharStringStrategy we can skip the
        # ListStrategy draw and jump right to data.draw_string.
        # Doing so for user-provided element strategies is not correct in
        # general, as they may define a different distribution than
        # data.draw_string - those take the generic path and are joined up.
        elems = unwrap_strategies(self.element_strategy)
        if not isinstance(elems, OneCharStringStrategy):
            return "".join(super().do_draw(data))

        # An unbounded max_size is replaced by the default collection limit.
        max_size = self.max_size
        if max_size == float("inf"):
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        return data.draw_string(
            elems.intervals, min_size=self.min_size, max_size=max_size
        )

    def __repr__(self) -> str:
        # Arguments which would read as `text()`'s defaults are left out.
        element_repr = repr(self.element_strategy)
        parts = [] if element_repr == "characters()" else [element_repr]
        if self.min_size:
            parts.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            parts.append(f"max_size={self.max_size}")
        return "text(" + ", ".join(parts) + ")"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *[getattr(str, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *[getattr(str, name) for name in _nonempty_and_content_names],
    )

    def filter(self, condition):
        elems = unwrap_strategies(self.element_strategy)
        wants_identifiers = (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(elems, OneCharStringStrategy)
        )
        if wants_identifiers:
            from hypothesis.strategies import builds, nothing

            # Build identifiers directly: one start character followed by
            # a (possibly empty) run of continuation characters.
            id_start, id_continue = _identifier_characters()
            start_chars = elems.intervals & id_start
            if not start_chars:
                return nothing()
            first = OneCharStringStrategy(start_chars)
            rest = TextStrategy(
                OneCharStringStrategy(elems.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            # Filter to ensure that NFKC normalization keeps working in future
            return builds("{}{}".format, first, rest).filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is not None:
            return rewritten
        return super().filter(condition)

234 

235 

def _string_filter_rewrite(self, kind, condition):
    """Try to replace ``self.filter(condition)`` with an equivalent strategy.

    ``self`` is the strategy being filtered, ``kind`` is the type of value it
    generates (``str`` or ``bytes``), and ``condition`` is the filter predicate.

    Returns the rewritten strategy, or ``None`` when no rewrite applies and
    the caller should fall back to its usual filter handling.  Emits a
    HypothesisWarning when ``condition`` is a method which accepts every
    (nonempty) value and is therefore probably a mistake.
    """
    # Every warning below is emitted from this helper, so they all use
    # stacklevel=3: level 1 is this function, level 2 is the strategy's
    # `.filter()` method which called us, and level 3 is that method's caller.
    if condition in (kind.lower, kind.title, kind.upper):
        k = kind.__name__
        warnings.warn(
            f"You applied {k}.{condition.__name__} as a filter, but this allows "
            f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
            HypothesisWarning,
            stacklevel=3,
        )
        # No return here, so the checks below still run for this condition.

    # Bound methods of a compiled regex (e.g. `re.compile(...).search`) can be
    # rewritten, provided the pattern is of the same kind as our values and -
    # for text - the alphabet is a OneCharStringStrategy.
    if (
        (
            kind is bytes
            or isinstance(
                unwrap_strategies(self.element_strategy), OneCharStringStrategy
            )
        )
        and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern)
        and isinstance(pattern.pattern, kind)
    ):
        from hypothesis.strategies._internal.regex import regex_strategy

        if condition.__name__ == "match":
            # Replace with an easier-to-handle equivalent condition
            caret, close = ("^(?:", ")") if kind is str else (b"^(?:", b")")
            pattern = re.compile(caret + pattern.pattern + close, flags=pattern.flags)
            condition = pattern.search

        if condition.__name__ in ("search", "findall", "fullmatch"):
            s = regex_strategy(
                pattern,
                fullmatch=condition.__name__ == "fullmatch",
                alphabet=self.element_strategy if kind is str else None,
            )
            # Re-apply our own size bounds on top of the regex strategy.
            if self.min_size > 0:
                s = s.filter(partial(min_len, self.min_size))
            if self.max_size < float("inf"):
                s = s.filter(partial(max_len, self.max_size))
            return s
        elif condition.__name__ in ("finditer", "scanner"):
            # PyPy implements `finditer` as an alias to their `scanner` method
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any string at all! Did you mean .findall ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self
        elif condition.__name__ == "split":
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any nonempty string! Did you mean .search ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self.filter(bool)

    # We use ListStrategy filter logic for the conditions that *only* imply
    # the string is nonempty. Here, we increment the min_size but still apply
    # the filter for conditions that imply nonempty *and specific contents*.
    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
        # Work on a shallow copy so that the original strategy is unchanged.
        nonempty = copy.copy(self)
        nonempty.min_size = max(1, nonempty.min_size)
        return ListStrategy.filter(nonempty, condition)

    return None

302 

303 

# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# This text is parsed line-by-line by `_identifier_characters()`, which reads
# only the lines of the form `<code>[..<code>] ; <property> # ...`.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""

326 

327 

@lru_cache
def _identifier_characters() -> tuple[IntervalSet, IntervalSet]:
    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers

    Returns ``(id_start, id_continue)``: the characters allowed as the first
    character of an identifier, and those allowed in any later position.
    The result is cached, so the sets are only computed once.
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # `codes` is either a single code point ("2118") or an inclusive
            # range ("1885..1886").  Split on the separator rather than slicing
            # a fixed number of digits, so that code points of any width parse.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(map(chr, span))

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    # Anything which may start an identifier may also continue one.
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue

355 

356 

class BytesStrategy(SearchStrategy):
    """A strategy which draws ``bytes`` values within the given size bounds."""

    def __init__(self, min_size: int, max_size: int | None):
        """Store the size bounds; ``max_size=None`` selects the default limit."""
        super().__init__()
        self.min_size = min_size
        if max_size is None:
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        self.max_size = max_size

    def do_draw(self, data: ConjectureData) -> bytes:
        """Draw a bytestring of between ``min_size`` and ``max_size`` bytes."""
        return data.draw_bytes(self.min_size, self.max_size)

    # The bytes counterparts of the filter tuples defined for text: the same
    # method names, looked up on `bytes` instead of `str`.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *[getattr(bytes, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = tuple(
        getattr(bytes, name) for name in _nonempty_and_content_names
    )

    def filter(self, condition):
        # Prefer a rewritten strategy when one is available, and otherwise
        # reuse ListStrategy's filter logic with this strategy as `self`.
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is None:
            return ListStrategy.filter(self, condition)
        return rewritten