1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import codecs
12import gzip
13import json
14import os
15import sys
16import tempfile
17import unicodedata
18from collections.abc import Collection, Iterable
19from functools import cache
20from pathlib import Path
21from typing import TYPE_CHECKING, Literal, Optional
22
23from hypothesis.configuration import storage_directory
24from hypothesis.control import _current_build_context
25from hypothesis.errors import InvalidArgument
26from hypothesis.internal.intervalsets import IntervalSet, IntervalsT
27
28if TYPE_CHECKING:
29 from typing import TypeAlias
30
# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
CategoryName: "TypeAlias" = Literal[
    "L",  # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
# Any iterable of category names, as accepted from callers.
Categories: "TypeAlias" = Iterable[CategoryName]
# The normalised form returned by this module: an ordered tuple of names.
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]
73
74
def charmap_file(fname: str = "charmap") -> Path:
    """Return the on-disk cache path for the named Unicode table.

    The path is versioned by ``unicodedata.unidata_version`` so that caches
    computed under one Unicode version are never reused under another.
    """
    compressed_name = f"{fname}.json.gz"
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, compressed_name
    )
79
80
81_charmap: Optional[dict[CategoryName, IntervalsT]] = None
82
83
def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            # Fast path: load a previously-computed table from disk.
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # Slow path: scan every codepoint once, recording runs of equal
            # category as closed intervals [start, end].
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    # Category changed: close the run [last_start, i - 1]
                    # and start a new one at i.
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            # Close the final run, which extends to the last codepoint.
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as fp:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    fp.write(result.encode())

                # os.renames (unlike os.replace) also creates any missing
                # parent directories of the destination path.
                os.renames(tmpfile, f)
            except Exception:
                # Caching is strictly best-effort: an unwritable filesystem
                # just means the table is recomputed on the next fresh import.
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap
145
146
@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    # Require the canonical codec name, so that aliases of the same codec
    # share a single on-disk cache file.
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        # Fast path: load a previously-computed table from disk.
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                # Record each encodable codepoint as a degenerate (i, i) interval.
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
    # NOTE(review): self-union appears to normalise the set (merging the
    # per-codepoint intervals into runs) — confirm against IntervalSet.union.
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as f:
            f.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        # Best-effort caching only; failure to write is never an error.
        pass
    return res
182
183
184_categories: Optional[Categories] = None
185
186
def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is not None:
        return _categories
    cm = charmap()
    # Order ascending by how many codepoint intervals each category covers,
    # then move Control and Surrogate to the very end of the tuple.
    ordered = sorted(cm.keys(), key=lambda cat: len(cm[cat]))
    for special in ("Cc", "Cs"):  # Other, Control / Other, Surrogate
        ordered.remove(special)
        ordered.append(special)
    _categories = tuple(ordered)
    return _categories
203
204
def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    :raises InvalidArgument: if ``cats`` contains any element which is neither
        a major class nor a class with subclass.  ``name`` is used only to
        identify the offending argument in the error message.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            # Expand a one-letter major class into all of its subclasses.
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    # Emit the selection in the normalised order defined by categories().
    return tuple(c for c in cs if c in out)
232
233
234category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}
235
236
def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of the Unicode categories in `cats`.

    The result is ordered as in categories(); passing None selects every
    category.  Items of `cats` which are not category names are dropped.

    >>> _category_key(None) == categories()
    True
    >>> _category_key(['Co', 'Zp', 'Zl'])
    ('Zl', 'Zp', 'Co')
    """
    cs = categories()
    if cats is None:
        return tuple(cs)
    # A set makes the membership tests below O(1) even for large inputs.
    cats = set(cats)
    return tuple(c for c in cs if c in cats)
251
252
def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    # NOTE(review): when the active provider sets avoid_realization we skip
    # the cache entirely — presumably so symbolic keys are never realized
    # by a cache lookup; confirm against the provider contract.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        # All categories together cover the entire codepoint range.
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        # Recurse: union the last category's intervals with the result for
        # the remaining categories (each prefix is cached independently).
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals
284
285
# Cache for query(), keyed on the fully-normalised form of its arguments:
# (category key, min/max codepoint, include intervals, exclude intervals).
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}
289
290
def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: Collection[str] = "",
    exclude_characters: Collection[str] = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    NOTE(review): per the annotation this returns an IntervalSet; the doctest
    output below shows the underlying interval tuples — confirm IntervalSet's
    repr before relying on these examples as doctests.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ... include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    # Default the codepoint bounds to the full Unicode range.
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    # Normalise every argument so equivalent queries share a cache entry.
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string("".join(include_characters))
    exclude_intervals = IntervalSet.from_string("".join(exclude_characters))
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    # NOTE(review): cache is bypassed when the provider sets avoid_realization,
    # matching the same pattern in _query_for_key — confirm intent there.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
    # Clip each category interval to [min_codepoint, max_codepoint],
    # dropping intervals that fall entirely outside the range.
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    # Included characters are added back even if outside the bounds above;
    # excluded characters always win over inclusion.
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result