Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/hypothesis/internal/charmap.py: 87%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

131 statements  

1# This file is part of Hypothesis, which may be found at 

2# https://github.com/HypothesisWorks/hypothesis/ 

3# 

4# Copyright the Hypothesis Authors. 

5# Individual contributors are listed in AUTHORS.rst and the git log. 

6# 

7# This Source Code Form is subject to the terms of the Mozilla Public License, 

8# v. 2.0. If a copy of the MPL was not distributed with this file, You can 

9# obtain one at https://mozilla.org/MPL/2.0/. 

10 

11import codecs 

12import gzip 

13import json 

14import os 

15import sys 

16import tempfile 

17import unicodedata 

18from collections.abc import Collection, Iterable 

19from functools import cache 

20from pathlib import Path 

21from typing import TYPE_CHECKING, Literal, Optional 

22 

23from hypothesis.configuration import storage_directory 

24from hypothesis.control import _current_build_context 

25from hypothesis.errors import InvalidArgument 

26from hypothesis.internal.intervalsets import IntervalSet, IntervalsT 

27 

28if TYPE_CHECKING: 

29 from typing import TypeAlias 

30 

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
# The set of valid Unicode general-category names, one Literal entry each.
CategoryName: "TypeAlias" = Literal[
    "L",  # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
# Any iterable of category names, as accepted by the functions below.
Categories: "TypeAlias" = Iterable[CategoryName]
# The normalised form: an ordered tuple of category names.
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]

73 

74 

def charmap_file(fname: str = "charmap") -> Path:
    """Return the path of the gzipped-JSON cache file named ``fname``.

    The path is keyed on ``unicodedata.unidata_version``, so caches written
    under one Unicode version are never read under another.
    """
    basename = f"{fname}.json.gz"
    return storage_directory("unicode_data", unicodedata.unidata_version, basename)

79 

80 

# Lazily-built module-level cache for charmap(); populated on first call.
_charmap: Optional[dict[CategoryName, IntervalsT]] = None

82 

83 

def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            # Fast path: load the table cached on disk by a previous run.
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # Cache miss (or unreadable/corrupt file): rebuild by scanning
            # every codepoint and recording runs of identical category.
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    # Category changed: close the run [last_start, i - 1]
                    # under the old category and start a new run at i.
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            # Close the final run, which extends to the last codepoint.
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as fp:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    fp.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                # Deliberate best-effort: an unwritable cache directory must
                # not break charmap() — we just recompute on the next run.
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap

145 

146 

@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    # Callers must pass the codec's canonical name, as normalised by codecs.lookup.
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        # Fast path: load the table cached on disk by a previous run.
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                # Record each encodable codepoint as a width-1 interval.
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
    # NOTE(review): union with itself looks like a normalisation step to merge
    # the adjacent width-1 runs built above — presumably IntervalSet.union
    # canonicalises its result; confirm against IntervalSet's implementation.
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as f:
            f.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        # Best-effort caching: a write failure just means we rebuild next time.
        pass
    return res

182 

183 

# Lazily-built module-level cache for categories(); populated on first call.
_categories: Optional[Categories] = None

185 

186 

def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        # Order categories by ascending number of codepoint intervals, but
        # force control (Cc) and surrogate (Cs) characters to the very end.
        by_size = sorted(cm, key=lambda cat: len(cm[cat]))
        ordered = [cat for cat in by_size if cat not in ("Cc", "Cs")]
        ordered += ["Cc", "Cs"]
        _categories = tuple(ordered)
    return _categories

203 

204 

def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    Raises InvalidArgument if ``cats`` includes any element that is neither
    a major class nor a valid subclass name.  (``name`` is only used to
    identify the offending argument in the error message.)
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            # Expand e.g. "N" into every "N*" subclass present in cs.
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    # Emit the result in the canonical categories() ordering.
    return tuple(c for c in cs if c in out)

232 

233 

# Maps a frozenset of category names to the tuple of codepoint intervals
# covering them; pre-seeded with the empty query mapping to no intervals.
category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}

235 

236 

def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories in ``cats``.

    If ``cats`` is None, default to including all categories.  The result is
    always ordered as per ``categories()``, regardless of input order, and
    any item of ``cats`` that is not a valid category name is silently
    dropped (callers are expected to have validated via
    ``as_general_categories`` first).
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    # Filter the canonical ordering down to the requested categories.
    return tuple(c for c in cs if c in cats)

251 

252 

def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover # only on alternative backends
        return ()
    # An empty key is either pre-seeded in category_index_cache (cache path
    # above) or handled by the early return, so by here key is non-empty.
    assert key
    if set(key) == set(categories()):
        # All categories together cover every codepoint, so skip the unions.
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        # Recursively union the intervals for the last category with those
        # for the remaining prefix; each prefix result is itself cached,
        # which bounds the depth of repeated recomputation.
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals

284 

285 

# Cache for query(): maps the full query signature (category tuple, codepoint
# bounds, include/exclude character intervals) to the resulting IntervalSet.
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}

289 

290 

def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: Collection[str] = "",
    exclude_characters: Collection[str] = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ... include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    lo = 0 if min_codepoint is None else min_codepoint
    hi = sys.maxunicode if max_codepoint is None else max_codepoint
    catkey = _category_key(categories)
    included = IntervalSet.from_string("".join(include_characters))
    excluded = IntervalSet.from_string("".join(exclude_characters))
    qkey = (catkey, lo, hi, included.intervals, excluded.intervals)
    context = _current_build_context.value
    # Skip the shared cache on backends that must avoid realizing values.
    cacheable = context is None or not context.data.provider.avoid_realization
    if cacheable and qkey in limited_category_index_cache:
        return limited_category_index_cache[qkey]
    # Clamp each category interval to [lo, hi], dropping any outside it.
    clamped = [
        (max(u, lo), min(v, hi))
        for u, v in _query_for_key(catkey)
        if v >= lo and u <= hi
    ]
    result = (IntervalSet(clamped) | included) - excluded
    if cacheable:
        limited_category_index_cache[qkey] = result
    return result