Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/internal/charmap.py: 87%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

131 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import codecs 

12import gzip 

13import json 

14import os 

15import sys 

16import tempfile 

17import unicodedata 

18from collections.abc import Iterable 

19from functools import cache 

20from pathlib import Path 

21from typing import TYPE_CHECKING, Literal, Optional 

22 

23from hypothesis.configuration import storage_directory 

24from hypothesis.control import _current_build_context 

25from hypothesis.errors import InvalidArgument 

26from hypothesis.internal.intervalsets import IntervalSet, IntervalsT 

27 

28if TYPE_CHECKING: 

29 from typing import TypeAlias 

30 

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
# One of the Unicode "General Category" codes: either a one-letter major
# class ("L", "M", ...) or a two-letter subclass ("Lu", "Nd", ...).
CategoryName: "TypeAlias" = Literal[
    "L",  # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
# Any iterable of category names (function inputs are normalised from this).
Categories: "TypeAlias" = Iterable[CategoryName]
# The normalised form: an ordered tuple of category names.
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]

73 

74 

def charmap_file(fname: str = "charmap") -> Path:
    """Return the path of the gzipped-JSON cache file named *fname*.

    The path includes the running interpreter's Unicode database version,
    so caches written under one Unicode version are never reused by another.
    """
    parts = ("unicode_data", unicodedata.unidata_version, f"{fname}.json.gz")
    return storage_directory(*parts)

79 

80 

81_charmap = None 

82 

83 

def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems: check if loaded, else try loading from disk, else
    # recompute from unicodedata and try writing the cache back.
    if _charmap is None:
        cache_path = charmap_file()
        try:
            with gzip.GzipFile(cache_path, "rb") as gzf:
                tmp_charmap = dict(json.load(gzf))

        except Exception:
            # Recompute by scanning every codepoint.  This loop uses only
            # local variables for performance; indexing and updating
            # containers is a ~3x slowdown.  This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108
            # but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat, last_start = category(chr(0)), 0
            for codepoint in range(1, sys.maxunicode + 1):
                cat = category(chr(codepoint))
                if cat != last_cat:
                    # Close the interval that just ended and open a new one.
                    tmp_charmap.setdefault(last_cat, []).append(
                        (last_start, codepoint - 1)
                    )
                    last_cat, last_start = cat, codepoint
            tmp_charmap.setdefault(last_cat, []).append(
                (last_start, sys.maxunicode)
            )

            try:
                # Write the Unicode table atomically: tempfile + rename.
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as out:
                    out.write(json.dumps(sorted(tmp_charmap.items())).encode())

                os.renames(tmpfile, cache_path)
            except Exception:
                # Caching is best-effort: a read-only filesystem is fine.
                pass

        # JSON round-trips tuples as lists; normalise back to tuples.
        _charmap = {
            cat: tuple(tuple(bounds) for bounds in pairs)
            for cat, pairs in tmp_charmap.items()
        }
        # Sanity-check: each value is a sorted tuple of integer 2-tuples.
        for intervals in _charmap.values():
            bounds = list(sum(intervals, ()))
            assert all(isinstance(b, int) for b in bounds)
            assert bounds == sorted(bounds)
            assert all(len(pair) == 2 for pair in intervals)

    assert _charmap is not None
    return _charmap

145 

146 

@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    # Require the canonical codec name, so the @cache key and the on-disk
    # cache filename are unique per codec.
    assert codec_name == codecs.lookup(codec_name).name
    cache_path = charmap_file(f"codec-{codec_name}")
    try:
        with gzip.GzipFile(cache_path) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # No usable cache file, so probe every codepoint.  This loop is
        # kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for codepoint in range(sys.maxunicode + 1):
            try:
                chr(codepoint).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                continue
            encodable_intervals.append((codepoint, codepoint))

    res = IntervalSet(encodable_intervals)
    # Self-union canonicalises the representation (presumably merging
    # adjacent single-codepoint intervals) before we persist it.
    res = res.union(res)
    try:
        # Write the Unicode table atomically: tempfile + rename.
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as out:
            out.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, cache_path)
    except Exception:
        # Caching is best-effort: a read-only filesystem is fine.
        pass
    return res

182 

183 

184_categories: Optional[Categories] = None 

185 

186 

def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        # Order categories by how many codepoint intervals each one spans,
        # smallest first.
        ordered = sorted(cm.keys(), key=lambda c: len(cm[c]))
        # Control and surrogate characters are always moved to the end.
        ordered.remove("Cc")  # Other, Control
        ordered.remove("Cs")  # Other, Surrogate
        ordered += ["Cc", "Cs"]
        _categories = tuple(ordered)
    return _categories

203 

204 

def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any element that is neither a
    major class nor a class with subclass, raises InvalidArgument.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            # Replace the one-letter major class with all of its
            # two-letter subclasses.
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    # Emit in the same normalised order that categories() uses.
    return tuple(c for c in cs if c in out)

232 

233 

234category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()} 

235 

236 

def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories in `cats`.

    If `cats` is None, default to including all categories.  The result is
    ordered the same way as categories(), regardless of input order:

    >>> _category_key(['Lu', 'Me', 'Cs'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    # Filtering the normalised ordering gives a canonical tuple.
    return tuple(c for c in cs if c in cats)

251 

252 

def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    Results are memoised in category_index_cache, except when the active
    backend asks us to avoid realization.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    # Only consult the cache when there is no build context, or the provider
    # has not asked us to avoid realization.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover # only on alternative backends
        # Cache bypassed, so the empty query must be answered inline.
        return ()
    assert key
    if set(key) == set(categories()):
        # All categories together cover every codepoint, so skip the union.
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        # Recurse: intervals for key[:-1], unioned with the last category's.
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    # Same gating as the read above: don't populate the cache when the
    # provider asked us to avoid realization.
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals

284 

285 

# Cache for query(): keyed on the full normalised argument tuple
# (category key, min/max codepoint, include/exclude character intervals).
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}

289 

290 

def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: str = "",
    exclude_characters: str = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ...       include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    # Normalise the codepoint bounds to the full Unicode range.
    min_codepoint = 0 if min_codepoint is None else min_codepoint
    max_codepoint = sys.maxunicode if max_codepoint is None else max_codepoint
    catkey = _category_key(categories)
    included = IntervalSet.from_string(include_characters or "")
    excluded = IntervalSet.from_string(exclude_characters or "")
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        included.intervals,
        excluded.intervals,
    )
    context = _current_build_context.value
    # Only consult the cache when there is no build context, or the provider
    # has not asked us to avoid realization.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    # Clamp the category intervals to [min_codepoint, max_codepoint],
    # dropping any interval that falls entirely outside the range.
    clamped = [
        (max(lo, min_codepoint), min(hi, max_codepoint))
        for lo, hi in _query_for_key(catkey)
        if hi >= min_codepoint and lo <= max_codepoint
    ]
    result = (IntervalSet(clamped) | included) - excluded
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result