Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/strategies/_internal/strings.py: 38%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

149 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import copy 

12import re 

13import warnings 

14from collections.abc import Collection 

15from functools import cache, lru_cache, partial 

16from typing import Optional, Union, cast 

17 

18from hypothesis.errors import HypothesisWarning, InvalidArgument 

19from hypothesis.internal import charmap 

20from hypothesis.internal.charmap import Categories 

21from hypothesis.internal.conjecture.data import ConjectureData 

22from hypothesis.internal.conjecture.providers import COLLECTION_DEFAULT_MAX_SIZE 

23from hypothesis.internal.filtering import max_len, min_len 

24from hypothesis.internal.intervalsets import IntervalSet 

25from hypothesis.internal.reflection import get_pretty_function_description 

26from hypothesis.strategies._internal.collections import ListStrategy 

27from hypothesis.strategies._internal.lazy import unwrap_strategies 

28from hypothesis.strategies._internal.strategies import ( 

29 OneOfStrategy, 

30 SampledFromStrategy, 

31 SearchStrategy, 

32) 

33from hypothesis.vendor.pretty import pretty 

34 

35 

36# Cache size is limited by sys.maxunicode, but passing None makes it slightly faster. 

37@cache 

38# this is part of our forward-facing validation, so we do *not* tell mypyc that c 

39# should be a str, because we don't want it to validate it before we can. 

40def _check_is_single_character(c: object) -> str: 

41 # In order to mitigate the performance cost of this check, we use a shared cache, 

42 # even at the cost of showing the culprit strategy in the error message. 

43 if not isinstance(c, str): 

44 type_ = get_pretty_function_description(type(c)) 

45 raise InvalidArgument(f"Got non-string {c!r} (type {type_})") 

46 if len(c) != 1: 

47 raise InvalidArgument(f"Got {c!r} (length {len(c)} != 1)") 

48 return c 

49 

50 

class OneCharStringStrategy(SearchStrategy[str]):
    """A strategy which generates single character strings of text type.

    The permitted characters are stored as an ``IntervalSet`` on
    ``self.intervals``. A truthy ``force_repr`` replaces the default repr.
    """

    def __init__(
        self, intervals: IntervalSet, force_repr: Optional[str] = None
    ) -> None:
        super().__init__()
        assert isinstance(intervals, IntervalSet)
        self._force_repr = force_repr
        self.intervals = intervals

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec: Optional[str] = None,
        min_codepoint: Optional[int] = None,
        max_codepoint: Optional[int] = None,
        categories: Optional[Categories] = None,
        exclude_characters: Collection[str] = "",
        include_characters: Collection[str] = "",
    ) -> "OneCharStringStrategy":
        """Build a strategy from codepoint, category, and codec constraints.

        The allowed characters come from ``charmap.query`` and, when ``codec``
        is given, are intersected with the characters of that codec. The
        resulting strategy's repr is spelled as a ``characters(...)`` call.

        Raises ``InvalidArgument`` if the constraints leave no characters.
        """
        assert set(categories or ()).issubset(charmap.categories())
        allowed = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            allowed &= charmap.intervals_from_codec(codec)

        # Collect "name=value" for each argument worth showing in the repr.
        shown = []
        for name, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in (None, ""):
                continue
            # `value` has to be `categories` here. Help mypy along to infer that.
            # Every category except "Cs" is left out of the repr.
            if name == "categories" and set(cast(Categories, value)) == (
                set(charmap.categories()) - {"Cs"}
            ):
                continue
            shown.append(f"{name}={value!r}")
        _arg_repr = ", ".join(shown)

        if not allowed:
            raise InvalidArgument(
                "No characters are allowed to be generated by this "
                f"combination of arguments: {_arg_repr}"
            )
        return cls(allowed, force_repr=f"characters({_arg_repr})")

    @classmethod
    def from_alphabet(
        cls, alphabet: Union[str, SearchStrategy]
    ) -> "OneCharStringStrategy":
        """Convert an alphabet (a string or a strategy) into this strategy type.

        Accepts a plain string, an instance of this class, a
        ``SampledFromStrategy`` whose elements are all single characters, or a
        ``OneOfStrategy`` whose branches are themselves acceptable alphabets.
        Anything else raises ``InvalidArgument``.
        """
        if isinstance(alphabet, str):
            return cls.from_characters_args(categories=(), include_characters=alphabet)

        assert isinstance(alphabet, SearchStrategy)
        unwrapped = unwrap_strategies(alphabet)

        if isinstance(unwrapped, cls):
            return unwrapped

        if isinstance(unwrapped, SampledFromStrategy):
            # Validate every element before handing them to charmap.
            for element in unwrapped.elements:
                _check_is_single_character(element)
            return cls.from_characters_args(
                categories=(),
                include_characters=unwrapped.elements,
            )

        if isinstance(unwrapped, OneOfStrategy):
            # Recursively convert each branch and merge their character sets.
            merged = IntervalSet()
            for branch in unwrapped.element_strategies:
                merged = merged.union(cls.from_alphabet(branch).intervals)
            return cls(merged, force_repr=repr(alphabet))

        raise InvalidArgument(
            f"{alphabet=} must be a sampled_from() or characters() strategy"
        )

    def __repr__(self) -> str:
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data: ConjectureData) -> str:
        # Exactly one character drawn from the allowed intervals.
        return data.draw_string(self.intervals, min_size=1, max_size=1)

140 

141 

# Method names shared by `str` and `bytes`. The text and bytes strategies look
# each one up with getattr() on their own type to build `_nonempty_filters`,
# i.e. the methods that return a truthy value for any nonempty input.
_nonempty_names = (
    "capitalize",
    "expandtabs",
    "join",
    "lower",
    "rsplit",
    "split",
    "splitlines",
    "swapcase",
    "title",
    "upper",
)
# Method names shared by `str` and `bytes`, looked up the same way to build
# `_nonempty_and_content_filters`: conditions that imply the value is nonempty
# *and* has specific contents, so the filter itself must still be applied.
_nonempty_and_content_names = (
    "islower",
    "isupper",
    "isalnum",
    "isalpha",
    "isascii",
    "isdigit",
    "isspace",
    "istitle",
    "lstrip",
    "rstrip",
    "strip",
)

167 

168 

class TextStrategy(ListStrategy[str]):
    """List-of-characters strategy whose drawn value is a single ``str``."""

    def do_draw(self, data):
        # Only when the element strategy is a OneCharStringStrategy can we skip
        # the ListStrategy draw and go straight to data.draw_string. For
        # user-provided element strategies that is not correct in general, as
        # they may define a different distribution than data.draw_string.
        char_strategy = unwrap_strategies(self.element_strategy)
        if not isinstance(char_strategy, OneCharStringStrategy):
            return "".join(super().do_draw(data))

        if self.max_size == float("inf"):
            upper = COLLECTION_DEFAULT_MAX_SIZE
        else:
            upper = self.max_size
        return data.draw_string(
            char_strategy.intervals,
            min_size=self.min_size,
            max_size=upper,
        )

    def __repr__(self) -> str:
        # Arguments are omitted when the alphabet reprs as `characters()`,
        # min_size is falsy, or max_size is infinite.
        parts = []
        alphabet_repr = repr(self.element_strategy)
        if alphabet_repr != "characters()":
            parts.append(alphabet_repr)
        if self.min_size:
            parts.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            parts.append(f"max_size={self.max_size}")
        return "text(" + ", ".join(parts) + ")"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *[getattr(str, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *[getattr(str, name) for name in _nonempty_and_content_names],
    )

    def filter(self, condition):
        """Apply ``condition``, rewriting to a more direct strategy when possible.

        ``str.isidentifier`` over a character alphabet is turned into a
        constructive strategy (one start character plus continuation
        characters). Other conditions are offered to
        ``_string_filter_rewrite`` before falling back to the inherited filter.
        """
        char_strategy = unwrap_strategies(self.element_strategy)
        wants_identifier = (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(char_strategy, OneCharStringStrategy)
        )
        if wants_identifier:
            from hypothesis.strategies import builds, nothing

            id_start, id_continue = _identifier_characters()
            first_chars = char_strategy.intervals & id_start
            if not first_chars:
                # No permitted character can start an identifier.
                return nothing()
            head = OneCharStringStrategy(first_chars)
            tail = TextStrategy(
                OneCharStringStrategy(char_strategy.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            built = builds("{}{}".format, head, tail)
            # Filter to ensure that NFKC normalization keeps working in future
            return built.filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is None:
            return super().filter(condition)
        return rewritten

238 

239 

240def _string_filter_rewrite(self, kind, condition): 

241 if condition in (kind.lower, kind.title, kind.upper): 

242 k = kind.__name__ 

243 warnings.warn( 

244 f"You applied {k}.{condition.__name__} as a filter, but this allows " 

245 f"all nonempty strings! Did you mean {k}.is{condition.__name__}?", 

246 HypothesisWarning, 

247 stacklevel=2, 

248 ) 

249 

250 if ( 

251 ( 

252 kind is bytes 

253 or isinstance( 

254 unwrap_strategies(self.element_strategy), OneCharStringStrategy 

255 ) 

256 ) 

257 and isinstance(pattern := getattr(condition, "__self__", None), re.Pattern) 

258 and isinstance(pattern.pattern, kind) 

259 ): 

260 from hypothesis.strategies._internal.regex import regex_strategy 

261 

262 if condition.__name__ == "match": 

263 # Replace with an easier-to-handle equivalent condition 

264 caret, close = ("^(?:", ")") if kind is str else (b"^(?:", b")") 

265 pattern = re.compile(caret + pattern.pattern + close, flags=pattern.flags) 

266 condition = pattern.search 

267 

268 if condition.__name__ in ("search", "findall", "fullmatch"): 

269 s = regex_strategy( 

270 pattern, 

271 fullmatch=condition.__name__ == "fullmatch", 

272 alphabet=self.element_strategy if kind is str else None, 

273 ) 

274 if self.min_size > 0: 

275 s = s.filter(partial(min_len, self.min_size)) 

276 if self.max_size < 1e999: 

277 s = s.filter(partial(max_len, self.max_size)) 

278 return s 

279 elif condition.__name__ in ("finditer", "scanner"): 

280 # PyPy implements `finditer` as an alias to their `scanner` method 

281 warnings.warn( 

282 f"You applied {pretty(condition)} as a filter, but this allows " 

283 f"any string at all! Did you mean .findall ?", 

284 HypothesisWarning, 

285 stacklevel=3, 

286 ) 

287 return self 

288 elif condition.__name__ == "split": 

289 warnings.warn( 

290 f"You applied {pretty(condition)} as a filter, but this allows " 

291 f"any nonempty string! Did you mean .search ?", 

292 HypothesisWarning, 

293 stacklevel=3, 

294 ) 

295 return self.filter(bool) 

296 

297 # We use ListStrategy filter logic for the conditions that *only* imply 

298 # the string is nonempty. Here, we increment the min_size but still apply 

299 # the filter for conditions that imply nonempty *and specific contents*. 

300 if condition in self._nonempty_and_content_filters and self.max_size >= 1: 

301 self = copy.copy(self) 

302 self.min_size = max(1, self.min_size) 

303 return ListStrategy.filter(self, condition) 

304 

305 return None 

306 

307 

# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
#
# Parsed line by line in `_identifier_characters()`: each data row is a hex
# code point (or an inclusive `first..last` range), then ` ; `, the property
# name, and ` # `. Rows that do not have that shape are ignored.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""

330 

331 

@lru_cache
def _identifier_characters() -> tuple[IntervalSet, IntervalSet]:
    """See https://docs.python.org/3/reference/lexical_analysis.html#identifiers

    Returns:
        ``(id_start, id_continue)``: the characters allowed as the first
        character of an identifier, and the characters allowed in any later
        position. ``id_continue`` is a superset of ``id_start``.

    The result is memoised, so the tables are only built once.
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # `codes` is either one code point or an inclusive "first..last"
            # range. Split on the separator instead of slicing a fixed four
            # characters, so code points with more than four hex digits are
            # parsed correctly too.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(map(chr, span))

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue

359 

360 

class BytesStrategy(SearchStrategy):
    """Strategy that draws ``bytes`` with a length between two bounds."""

    def __init__(self, min_size: int, max_size: Optional[int]):
        super().__init__()
        if max_size is None:
            # No explicit upper bound: use the shared collection default.
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        self.min_size = min_size
        self.max_size = max_size

    def do_draw(self, data: ConjectureData) -> bytes:
        return data.draw_bytes(self.min_size, self.max_size)

    # Built from the same method-name tables as the text strategy, but looked
    # up on `bytes`.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *[getattr(bytes, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = tuple(
        getattr(bytes, name) for name in _nonempty_and_content_names
    )

    def filter(self, condition):
        # Prefer a rewritten strategy when one is available; otherwise reuse
        # ListStrategy's filter logic directly.
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is None:
            return ListStrategy.filter(self, condition)
        return rewritten