# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import codecs
import gzip
import json
import os
import sys
import tempfile
import unicodedata
from collections.abc import Collection, Iterable
from functools import cache
from pathlib import Path
from typing import Literal, TypeAlias

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet, IntervalsT

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
CategoryName: TypeAlias = Literal[
    "L",  # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
Categories: TypeAlias = Iterable[CategoryName]
CategoriesTuple: TypeAlias = tuple[CategoryName, ...]
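
# Illustrative only (a sketch, not used elsewhere in this module): callers may
# supply any iterable of category names, e.g.
#
#     subset: CategoriesTuple = ("Lu", "Ll", "Nd")
#
# and single-letter major classes such as "L" are also valid CategoryName
# values, expanded into their subclasses by as_general_categories() below.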


def charmap_file(fname: str = "charmap") -> Path:
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, f"{fname}.json.gz"
    )

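
# A concrete sketch of where the cache lives (assuming the default storage
# directory, i.e. a ".hypothesis" folder under the current working directory,
# and whatever Unicode version this Python ships): charmap_file() resolves to
# something like .hypothesis/unicode_data/<unidata_version>/charmap.json.gz.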

_charmap: dict[CategoryName, IntervalsT] | None = None


def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps each Unicode category to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as fp:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    fp.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap

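
# Illustrative usage of charmap() (a sketch, assuming the Unicode tables that
# ship with CPython): the first uppercase-letter interval is ASCII 'A'..'Z',
#
#     charmap()["Lu"][0] == (65, 90)
#
# and every value has the same shape: a sorted tuple of inclusive
# (start, end) codepoint pairs.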

@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as f:
            f.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        pass
    return res

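
# Illustrative usage (a sketch; exact results depend on the codecs available
# in the running Python): the "ascii" codec can encode exactly codepoints
# 0..127, so intervals_from_codec("ascii").intervals would be ((0, 127),).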

_categories: Categories | None = None


def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories()  # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        categories = sorted(cm.keys(), key=lambda c: len(cm[c]))
        categories.remove("Cc")  # Other, Control
        categories.remove("Cs")  # Other, Surrogate
        categories.append("Cc")
        categories.append("Cs")
        _categories = tuple(categories)
    return _categories

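
# A note on the normalised order above (illustrative): categories covered by
# fewer intervals sort first, and "Cc"/"Cs" are always moved to the end, so
# e.g. 'Zl' and 'Zp' (a single codepoint each) come before 'Lu', which is
# spread across far more intervals.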

def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any element that is neither a major
    class nor one of its two-letter subclasses, an InvalidArgument error is
    raised.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    return tuple(c for c in cs if c in out)

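
# Illustrative usage (a sketch): as_general_categories(["L"]) expands to the
# five letter subcategories Lu/Ll/Lt/Lm/Lo (returned in the module's
# normalised order), while an unknown name such as "Xx" raises InvalidArgument.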

category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}


def _category_key(cats: Iterable[str] | None) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories in `cats`.

    If `cats` is None, default to including all categories.  Any element of
    `cats` that is not a known category name is simply ignored.

    >>> _category_key(['Lu', 'Me', 'Cs'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    return tuple(c for c in cs if c in cats)


def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover  # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals

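
# How the recursion above composes results (a sketch): a query for
# ('Zl', 'Zp', 'Co') unions the intervals for ('Zl', 'Zp') with the 'Co'
# intervals from charmap(), and every prefix is cached under its frozenset
# key, so queries over overlapping category sets share most of the work.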

limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}


def query(
    *,
    categories: Categories | None = None,
    min_codepoint: int | None = None,
    max_codepoint: int | None = None,
    include_characters: Collection[str] = "",
    exclude_characters: Collection[str] = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ...       include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string("".join(include_characters))
    exclude_intervals = IntervalSet.from_string("".join(exclude_characters))
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result
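

# A minimal, self-contained usage sketch (illustrative only; the expected
# intervals assume the standard Unicode tables, and this block is not part of
# Hypothesis' public API):
if __name__ == "__main__":  # pragma: no cover
    # Uppercase ASCII letters within 0..128, plus the snowman character.
    demo = query(
        categories=["Lu"],
        min_codepoint=0,
        max_codepoint=128,
        include_characters="☃",
    )
    print(demo.intervals)  # expected: ((65, 90), (9731, 9731))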