Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/strategies/_internal/strings.py: 44%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

145 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import copy 

12import re 

13import warnings 

14from functools import cache, lru_cache, partial 

15from typing import Optional 

16 

17from hypothesis.errors import HypothesisWarning, InvalidArgument 

18from hypothesis.internal import charmap 

19from hypothesis.internal.conjecture.data import ConjectureData 

20from hypothesis.internal.conjecture.providers import COLLECTION_DEFAULT_MAX_SIZE 

21from hypothesis.internal.filtering import max_len, min_len 

22from hypothesis.internal.intervalsets import IntervalSet 

23from hypothesis.internal.reflection import get_pretty_function_description 

24from hypothesis.strategies._internal.collections import ListStrategy 

25from hypothesis.strategies._internal.lazy import unwrap_strategies 

26from hypothesis.strategies._internal.strategies import ( 

27 OneOfStrategy, 

28 SampledFromStrategy, 

29 SearchStrategy, 

30) 

31from hypothesis.vendor.pretty import pretty 

32 

33 

# Cache size is limited by sys.maxunicode, but passing None makes it slightly faster.
@cache
def _check_is_single_character(c):
    """Return ``c`` unchanged if it is a ``str`` of length exactly one.

    Raises InvalidArgument when ``c`` is not a string, or when it is a string
    whose length is not 1.
    """
    # Results are memoised in one cache shared by every caller to keep this
    # check cheap; the trade-off is that the error messages below describe
    # only the offending value, not the strategy it came from.
    if isinstance(c, str):
        if len(c) == 1:
            return c
        raise InvalidArgument(f"Got {c!r} (length {len(c)} != 1)")
    type_ = get_pretty_function_description(type(c))
    raise InvalidArgument(f"Got non-string {c!r} (type {type_})")

45 

46 

class OneCharStringStrategy(SearchStrategy[str]):
    """A strategy which generates single character strings of text type."""

    def __init__(
        self, intervals: IntervalSet, force_repr: Optional[str] = None
    ) -> None:
        """Store the allowed characters and an optional repr override.

        ``intervals`` must be an IntervalSet. ``force_repr``, when truthy, is
        returned verbatim by ``repr()`` of this strategy.
        """
        assert isinstance(intervals, IntervalSet)
        self._force_repr = force_repr
        self.intervals = intervals

    @classmethod
    def from_characters_args(
        cls,
        *,
        codec=None,
        min_codepoint=None,
        max_codepoint=None,
        categories=None,
        exclude_characters=None,
        include_characters=None,
    ):
        """Build a strategy from keyword arguments describing a character set.

        The arguments are forwarded to ``charmap.query``; when ``codec`` is
        given the result is further intersected with the characters of that
        codec. Raises InvalidArgument if no characters remain.
        """
        assert set(categories or ()).issubset(charmap.categories())
        intervals = charmap.query(
            min_codepoint=min_codepoint,
            max_codepoint=max_codepoint,
            categories=categories,
            exclude_characters=exclude_characters,
            include_characters=include_characters,
        )
        if codec is not None:
            intervals &= charmap.intervals_from_codec(codec)

        # Collect the arguments worth showing in the repr / error message:
        # anything left as None or "" is omitted, and so is ``categories``
        # when it is exactly every category except "Cs".
        shown = []
        for name, value in (
            ("codec", codec),
            ("min_codepoint", min_codepoint),
            ("max_codepoint", max_codepoint),
            ("categories", categories),
            ("exclude_characters", exclude_characters),
            ("include_characters", include_characters),
        ):
            if value in (None, ""):
                continue
            if name == "categories" and set(value) == set(charmap.categories()) - {
                "Cs"
            }:
                continue
            shown.append(f"{name}={value!r}")
        _arg_repr = ", ".join(shown)

        if not intervals:
            raise InvalidArgument(
                "No characters are allowed to be generated by this "
                f"combination of arguments: {_arg_repr}"
            )
        return cls(intervals, force_repr=f"characters({_arg_repr})")

    @classmethod
    def from_alphabet(cls, alphabet):
        """Build a strategy from an alphabet given as a string or a strategy.

        A ``str`` is taken as the exact set of allowed characters. A strategy
        is unwrapped first: an instance of this class is returned as-is, a
        SampledFromStrategy contributes its elements (each checked to be a
        single character), and a OneOfStrategy contributes the union of its
        branches. Anything else raises InvalidArgument.
        """
        if isinstance(alphabet, str):
            return cls.from_characters_args(categories=(), include_characters=alphabet)

        assert isinstance(alphabet, SearchStrategy)
        char_strategy = unwrap_strategies(alphabet)

        if isinstance(char_strategy, cls):
            return char_strategy

        if isinstance(char_strategy, SampledFromStrategy):
            for element in char_strategy.elements:
                _check_is_single_character(element)
            return cls.from_characters_args(
                categories=(),
                include_characters=char_strategy.elements,
            )

        if isinstance(char_strategy, OneOfStrategy):
            merged = IntervalSet()
            for branch in char_strategy.element_strategies:
                merged = merged.union(cls.from_alphabet(branch).intervals)
            return cls(merged, force_repr=repr(alphabet))

        raise InvalidArgument(
            f"{alphabet=} must be a sampled_from() or characters() strategy"
        )

    def __repr__(self) -> str:
        if self._force_repr:
            return self._force_repr
        return f"OneCharStringStrategy({self.intervals!r})"

    def do_draw(self, data: ConjectureData) -> str:
        """Draw a string of exactly one character from ``self.intervals``."""
        return data.draw_string(self.intervals, min_size=1, max_size=1)

129 

130 

# Method names that are looked up on both ``str`` and ``bytes`` later in this
# module and added to each strategy's ``_nonempty_filters`` tuple.
_nonempty_names = (
    "capitalize", "expandtabs", "join", "lower", "rsplit",
    "split", "splitlines", "swapcase", "title", "upper",
)  # fmt: skip

# Method names that are likewise looked up on ``str`` and ``bytes`` and added
# to each strategy's ``_nonempty_and_content_filters`` tuple.
_nonempty_and_content_names = (
    "islower", "isupper", "isalnum", "isalpha", "isascii", "isdigit",
    "isspace", "istitle", "lstrip", "rstrip", "strip",
)  # fmt: skip

156 

157 

class TextStrategy(ListStrategy[str]):
    """List-based strategy whose drawn elements are joined into one ``str``."""

    def do_draw(self, data):
        # Only a OneCharStringStrategy alphabet can skip the ListStrategy draw
        # and go straight to data.draw_string. Doing so for user-provided
        # element strategies is not correct in general, as they may define a
        # different distribution than data.draw_string.
        alphabet = unwrap_strategies(self.element_strategy)
        if not isinstance(alphabet, OneCharStringStrategy):
            return "".join(super().do_draw(data))

        if self.max_size == float("inf"):
            size_cap = COLLECTION_DEFAULT_MAX_SIZE
        else:
            size_cap = self.max_size
        return data.draw_string(
            alphabet.intervals,
            min_size=self.min_size,
            max_size=size_cap,
        )

    def __repr__(self) -> str:
        # Arguments at their defaults are left out of the repr.
        parts = []
        alphabet_repr = repr(self.element_strategy)
        if alphabet_repr != "characters()":
            parts.append(alphabet_repr)
        if self.min_size:
            parts.append(f"min_size={self.min_size}")
        if self.max_size < float("inf"):
            parts.append(f"max_size={self.max_size}")
        return "text(" + ", ".join(parts) + ")"

    # See https://docs.python.org/3/library/stdtypes.html#string-methods
    # These methods always return Truthy values for any nonempty string.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        str,
        str.casefold,
        str.encode,
        *[getattr(str, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = (
        str.isdecimal,
        str.isnumeric,
        *[getattr(str, name) for name in _nonempty_and_content_names],
    )

    def filter(self, condition):
        """Return a filtered strategy, rewriting recognised predicates.

        ``str.isidentifier`` over a OneCharStringStrategy alphabet is built
        constructively from identifier-start and identifier-continue
        characters; other predicates are offered to ``_string_filter_rewrite``
        before falling back to the inherited ``filter``.
        """
        alphabet = unwrap_strategies(self.element_strategy)
        wants_identifier = (
            condition is str.isidentifier
            and self.max_size >= 1
            and isinstance(alphabet, OneCharStringStrategy)
        )
        if wants_identifier:
            from hypothesis.strategies import builds, nothing

            id_start, id_continue = _identifier_characters()
            leading = alphabet.intervals & id_start
            if not leading:
                # No permitted character can start an identifier.
                return nothing()
            head = OneCharStringStrategy(leading)
            tail = TextStrategy(
                OneCharStringStrategy(alphabet.intervals & id_continue),
                min_size=max(0, self.min_size - 1),
                max_size=self.max_size - 1,
            )
            # Filter to ensure that NFKC normalization keeps working in future
            return builds("{}{}".format, head, tail).filter(str.isidentifier)

        rewritten = _string_filter_rewrite(self, str, condition)
        if rewritten is not None:
            return rewritten
        return super().filter(condition)

227 

228 

def _string_filter_rewrite(self, kind, condition):
    """Rewrite ``self.filter(condition)`` for recognised string predicates.

    ``self`` is the strategy being filtered, ``kind`` is the type it produces
    (the callers in this module pass ``str`` or ``bytes``) and ``condition``
    is the predicate. Returns a replacement strategy, or None when no rewrite
    applies and the caller should fall back to its ordinary filtering.
    """
    if condition in (kind.lower, kind.title, kind.upper):
        k = kind.__name__
        warnings.warn(
            f"You applied {k}.{condition.__name__} as a filter, but this allows "
            f"all nonempty strings! Did you mean {k}.is{condition.__name__}?",
            HypothesisWarning,
            stacklevel=2,
        )

    # Regex rewrites are always considered for bytes, and for text only when
    # its elements come from a OneCharStringStrategy. The bytes test goes
    # first so ``element_strategy`` is never read when ``kind`` is bytes.
    regex_ok = kind is bytes or isinstance(
        unwrap_strategies(self.element_strategy), OneCharStringStrategy
    )
    # A bound method of a compiled pattern exposes that pattern as __self__.
    pattern = getattr(condition, "__self__", None) if regex_ok else None
    if isinstance(pattern, re.Pattern) and isinstance(pattern.pattern, kind):
        from hypothesis.strategies._internal.regex import regex_strategy

        if condition.__name__ == "match":
            # Replace with an easier-to-handle equivalent condition
            if kind is str:
                caret, close = "^(?:", ")"
            else:
                caret, close = b"^(?:", b")"
            pattern = re.compile(caret + pattern.pattern + close, flags=pattern.flags)
            condition = pattern.search

        method = condition.__name__
        if method in ("search", "findall", "fullmatch"):
            rewritten = regex_strategy(
                pattern,
                fullmatch=method == "fullmatch",
                alphabet=self.element_strategy if kind is str else None,
            )
            # Re-apply the size bounds of the strategy being replaced.
            if self.min_size > 0:
                rewritten = rewritten.filter(partial(min_len, self.min_size))
            if self.max_size < float("inf"):
                rewritten = rewritten.filter(partial(max_len, self.max_size))
            return rewritten
        if method in ("finditer", "scanner"):
            # PyPy implements `finditer` as an alias to their `scanner` method
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any string at all! Did you mean .findall ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self
        if method == "split":
            warnings.warn(
                f"You applied {pretty(condition)} as a filter, but this allows "
                f"any nonempty string! Did you mean .search ?",
                HypothesisWarning,
                stacklevel=3,
            )
            return self.filter(bool)

    # We use ListStrategy filter logic for the conditions that *only* imply
    # the string is nonempty. Here, we increment the min_size but still apply
    # the filter for conditions that imply nonempty *and specific contents*.
    if condition in self._nonempty_and_content_filters and self.max_size >= 1:
        bumped = copy.copy(self)
        bumped.min_size = max(1, self.min_size)
        return ListStrategy.filter(bumped, condition)

    return None

295 

296 

# Excerpted from https://www.unicode.org/Public/15.0.0/ucd/PropList.txt
# Python updates its Unicode version between minor releases, but fortunately
# these properties do not change between the Unicode versions in question.
_PROPLIST = """
# ================================================

1885..1886 ; Other_ID_Start # Mn [2] MONGOLIAN LETTER ALI GALI BALUDA..MONGOLIAN LETTER ALI GALI THREE BALUDA
2118 ; Other_ID_Start # Sm SCRIPT CAPITAL P
212E ; Other_ID_Start # So ESTIMATED SYMBOL
309B..309C ; Other_ID_Start # Sk [2] KATAKANA-HIRAGANA VOICED SOUND MARK..KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK

# Total code points: 6

# ================================================

00B7 ; Other_ID_Continue # Po MIDDLE DOT
0387 ; Other_ID_Continue # Po GREEK ANO TELEIA
1369..1371 ; Other_ID_Continue # No [9] ETHIOPIC DIGIT ONE..ETHIOPIC DIGIT NINE
19DA ; Other_ID_Continue # No NEW TAI LUE THAM DIGIT ONE

# Total code points: 12
"""

319 

320 

@lru_cache
def _identifier_characters():
    """Return ``(id_start, id_continue)`` interval sets for Python identifiers.

    See https://docs.python.org/3/reference/lexical_analysis.html#identifiers

    ``id_start`` is built from the letter and ``Nl`` categories plus ``_`` and
    the ``Other_ID_Start`` entries of ``_PROPLIST``, minus characters that
    NFKC-normalize to invalid identifiers. ``id_continue`` is ``id_start``
    plus the ``Mn``/``Mc``/``Nd``/``Pc`` categories and the
    ``Other_ID_Continue`` entries. The result is cached.
    """
    # Start by computing the set of special characters
    chars = {"Other_ID_Start": "", "Other_ID_Continue": ""}
    for line in _PROPLIST.splitlines():
        if m := re.match(r"([0-9A-F.]+) +; (\w+) # ", line):
            codes, prop = m.groups()
            # An entry is either a single code point ("2118") or an inclusive
            # range ("1885..1886"). Split on the ".." separator instead of
            # slicing four characters from each end, so that code points
            # written with more than four hex digits are parsed correctly.
            first, _, last = codes.partition("..")
            span = range(int(first, base=16), int(last or first, base=16) + 1)
            chars[prop] += "".join(map(chr, span))

    # Then get the basic set by Unicode category and known extras
    id_start = charmap.query(
        categories=("Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        include_characters="_" + chars["Other_ID_Start"],
    )
    id_start -= IntervalSet.from_string(
        # Magic value: the characters which NFKC-normalize to be invalid identifiers.
        # Conveniently they're all in `id_start`, so we only need to do this once.
        "\u037a\u0e33\u0eb3\u2e2f\u309b\u309c\ufc5e\ufc5f\ufc60\ufc61\ufc62\ufc63"
        "\ufdfa\ufdfb\ufe70\ufe72\ufe74\ufe76\ufe78\ufe7a\ufe7c\ufe7e\uff9e\uff9f"
    )
    id_continue = id_start | charmap.query(
        categories=("Mn", "Mc", "Nd", "Pc"),
        include_characters=chars["Other_ID_Continue"],
    )
    return id_start, id_continue

348 

349 

class BytesStrategy(SearchStrategy):
    """Strategy that draws values with ``data.draw_bytes`` within size bounds."""

    def __init__(self, min_size: int, max_size: Optional[int]):
        """Record the size bounds; a ``max_size`` of None means the default cap."""
        if max_size is None:
            max_size = COLLECTION_DEFAULT_MAX_SIZE
        self.min_size = min_size
        self.max_size = max_size

    def do_draw(self, data):
        return data.draw_bytes(self.min_size, self.max_size)

    # Predicate tuples read by the filter-rewriting logic; they mirror the
    # ones on TextStrategy but hold the ``bytes`` methods.
    _nonempty_filters = (
        *ListStrategy._nonempty_filters,
        bytes,
        *[getattr(bytes, name) for name in _nonempty_names],
    )
    _nonempty_and_content_filters = tuple(
        getattr(bytes, name) for name in _nonempty_and_content_names
    )

    def filter(self, condition):
        """Return a filtered strategy, rewriting recognised predicates first."""
        rewritten = _string_filter_rewrite(self, bytes, condition)
        if rewritten is not None:
            return rewritten
        # ListStrategy.filter is called unbound with this instance as ``self``,
        # although this class does not inherit from ListStrategy.
        return ListStrategy.filter(self, condition)