Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/internal/charmap.py: 88%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

129 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import codecs 

12import gzip 

13import json 

14import os 

15import sys 

16import tempfile 

17import unicodedata 

18from collections.abc import Collection, Iterable 

19from functools import cache 

20from pathlib import Path 

21from typing import Literal, TypeAlias 

22 

23from hypothesis.configuration import storage_directory 

24from hypothesis.control import _current_build_context 

25from hypothesis.errors import InvalidArgument 

26from hypothesis.internal.intervalsets import IntervalSet, IntervalsT 

27 

28# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category 

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
# Literal type of every accepted Unicode "General Category" designation:
# both the one-letter major classes (e.g. "L") and the two-letter
# subclasses (e.g. "Lu").
CategoryName: TypeAlias = Literal[
    "L",  #  Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  #  Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  #  Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  #  Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  #  Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  #  Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  #  Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
# Any iterable of category names, as accepted by the query helpers below.
Categories: TypeAlias = Iterable[CategoryName]
# The normalised tuple form returned by e.g. as_general_categories().
CategoriesTuple: TypeAlias = tuple[CategoryName, ...]

70 

71 

def charmap_file(fname: str = "charmap") -> Path:
    """Return the on-disk cache path for *fname*.

    The path is namespaced by the running Unicode database version, so
    caches built against different Unicode versions never collide.
    """
    compressed_name = f"{fname}.json.gz"
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, compressed_name
    )

76 

77 

# Process-wide memoised result of charmap(); populated lazily on first call.
_charmap: dict[CategoryName, IntervalsT] | None = None

79 

80 

def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            # Fast path: load the table written to disk by a previous run.
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # Slow path: scan every codepoint and record runs of equal category.
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    # Close the run [last_start, i - 1] of the previous category.
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            # Close the final run, which extends through the last codepoint.
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as fp:
                    # Sort items so the serialised cache is deterministic.
                    result = json.dumps(sorted(tmp_charmap.items()))
                    fp.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                # Caching is best-effort: an unwritable filesystem just means
                # the table is recomputed in the next fresh process.
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap

142 

143 

@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    # Require the canonical codec name, so the on-disk cache has exactly one
    # file per codec rather than one per alias.
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        # Fast path: load the interval list cached on disk by a previous run.
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                # Record each encodable codepoint as a one-codepoint interval;
                # the union below merges adjacent ones.
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
    # Self-union normalises the set (merges adjacent/overlapping intervals).
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as f:
            f.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        # Caching is best-effort only; failure just means recomputation later.
        pass
    return res

179 

180 

# Process-wide memoised result of categories(); populated lazily on first call.
_categories: Categories | None = None

182 

183 

def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        # Order categories by how many codepoint intervals each one covers,
        # then force Control and Surrogate to the very end of the tuple.
        by_interval_count = sorted(cm, key=lambda cat: len(cm[cat]))
        ordered = [cat for cat in by_interval_count if cat not in ("Cc", "Cs")]
        ordered.append("Cc")  # Other, Control
        ordered.append("Cs")  # Other, Surrogate
        _categories = tuple(ordered)
    return _categories

200 

201 

def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any elements that do not represent a
    major class or a class with subclass, an InvalidArgument error is raised.
    """
    # The seven one-letter major classes; each expands to its subclasses.
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            # Replace the major class with all of its two-letter subclasses.
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    # Emit the selection in the same normalised order categories() uses.
    return tuple(c for c in cs if c in out)

229 

230 

# Maps a frozenset of category names to the interval tuple covering them.
# Seeded with the empty set -> empty intervals, which also terminates the
# recursion in _query_for_key when it strips the key down to nothing.
category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}

232 

233 

def _category_key(cats: Iterable[str] | None) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories that appear in
    ``cats``, in the canonical order produced by ``categories()``.

    If ``cats`` is None then default to including all categories.
    Any item in ``cats`` which is not a valid category name is silently
    dropped by the membership filter below.
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    # Filtering the canonical ordering both deduplicates and normalises order.
    return tuple(c for c in cs if c in cats)

248 

249 

def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover  # only on alternative backends
        # Cache bypassed on avoid_realization backends, so handle the empty
        # key (normally served by the cache's seed entry) explicitly.
        return ()
    assert key
    if set(key) == set(categories()):
        # All categories together cover the full codepoint space, so answer
        # directly instead of unioning each category's intervals.
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        # Union of the last category's intervals with the recursively computed
        # intervals for the remaining categories (prefixes hit the cache).
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals

281 

282 

# Cache for query(): maps the full normalised query key — (categories,
# min_codepoint, max_codepoint, include-intervals, exclude-intervals) —
# to the resulting IntervalSet.
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}

286 

287 

def query(
    *,
    categories: Categories | None = None,
    min_codepoint: int | None = None,
    max_codepoint: int | None = None,
    include_characters: Collection[str] = "",
    exclude_characters: Collection[str] = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ... include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    # Default the codepoint bounds to the full Unicode range.
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string("".join(include_characters))
    exclude_intervals = IntervalSet.from_string("".join(exclude_characters))
    # Fully-normalised cache key for this query.
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    # Skip the cache on backends that want to avoid realizing values.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
    result = []
    # Clip each category interval to [min_codepoint, max_codepoint], dropping
    # intervals entirely outside the bounds.
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    # Force-include then force-exclude the explicitly listed characters.
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result