# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import codecs
import gzip
import json
import os
import sys
import tempfile
import unicodedata
from collections.abc import Iterable
from functools import cache
from pathlib import Path
from typing import TYPE_CHECKING, Literal, Optional

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet, IntervalsT

if TYPE_CHECKING:
    from typing import TypeAlias

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
CategoryName: "TypeAlias" = Literal[
    "L",   # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",   # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",   # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",   # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",   # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",   # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",   # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
Categories: "TypeAlias" = Iterable[CategoryName]
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]


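# The cached tables below live under the Hypothesis storage directory and are
# keyed by the active Unicode version, so upgrading Python (and with it the
# unicodedata tables) naturally produces a fresh cache file.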
def charmap_file(fname: str = "charmap") -> Path:
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, f"{fname}.json.gz"
    )


_charmap = None


def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))
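            # At this point tmp_charmap maps each category to a list of
            # (start, end) codepoint runs, built in one linear scan over
            # every codepoint.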

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass
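                # Failing to write the cache (e.g. a read-only or full
                # filesystem) is fine: the table is simply recomputed on
                # the next run.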

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # Sanity-check that each value is a sorted tuple of 2-tuples (that is,
        # tuples of length 2), and that both elements of each pair are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap


@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    assert codec_name == codecs.lookup(codec_name).name
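    # Callers must pass the canonical codec name (as returned by codecs.lookup),
    # so aliases such as "utf8" and "utf-8" cannot produce distinct cache files.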
    fname = charmap_file(f"codec-{codec_name}")
    try:
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
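    # Unioning the set with itself normalises it, merging the adjacent
    # single-codepoint intervals built above into contiguous ranges.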
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
            o.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        pass
    return res


_categories: Optional[Categories] = None


def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories()  # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        categories = sorted(cm.keys(), key=lambda c: len(cm[c]))
        categories.remove("Cc")  # Other, Control
        categories.remove("Cs")  # Other, Surrogate
        categories.append("Cc")
        categories.append("Cs")
        _categories = tuple(categories)
    return _categories


def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any element that is neither a major
    class nor a class with subclass, an InvalidArgument error is raised.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    return tuple(c for c in cs if c in out)


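# Maps a frozenset of categories to the codepoint intervals they cover.  The
# empty-set entry lets the recursion in _query_for_key bottom out.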
category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}


def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories that are in `cats`.

    If cats is None then default to including all categories.
    Any item in cats that is not a Unicode category is silently dropped.

    >>> _category_key(['Lu', 'Me', 'Cs'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    return tuple(c for c in cs if c in cats)


def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
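    # When the active provider sets avoid_realization, skip both reading from
    # and writing to the shared cache; the result is computed fresh each time.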
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover  # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        result = IntervalSet([(0, sys.maxunicode)])
    else:
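        # Recurse on all-but-the-last category so that each prefix of the
        # normalised key gets cached as well while we build up the union.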
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals


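# Caches full query() results, keyed by the normalised categories, codepoint
# bounds, and the include/exclude character intervals.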
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}


def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: str = "",
    exclude_characters: str = "",
) -> IntervalSet:
    """Return an IntervalSet covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ...       include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string(include_characters or "")
    exclude_intervals = IntervalSet.from_string(exclude_characters or "")
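    # Cache on the full, normalised query: categories, codepoint bounds, and
    # the include/exclude character sets (as interval tuples, so hashable).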
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
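    # Clip each interval from the per-category table to the requested
    # [min_codepoint, max_codepoint] range, dropping those entirely outside it.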
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result