1# This file is part of Hypothesis, which may be found at
2# https://github.com/HypothesisWorks/hypothesis/
3#
4# Copyright the Hypothesis Authors.
5# Individual contributors are listed in AUTHORS.rst and the git log.
6#
7# This Source Code Form is subject to the terms of the Mozilla Public License,
8# v. 2.0. If a copy of the MPL was not distributed with this file, You can
9# obtain one at https://mozilla.org/MPL/2.0/.
10
11import codecs
12import gzip
13import json
14import os
15import sys
16import tempfile
17import unicodedata
18from collections.abc import Collection, Iterable
19from functools import cache
20from pathlib import Path
21from typing import TYPE_CHECKING, Literal, Optional
22
23from hypothesis.configuration import storage_directory
24from hypothesis.control import _current_build_context
25from hypothesis.errors import InvalidArgument
26from hypothesis.internal.intervalsets import IntervalSet, IntervalsT
27
28if TYPE_CHECKING:
29 from typing import TypeAlias
30
# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
CategoryName: "TypeAlias" = Literal[
    "L",  # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
# Any iterable of category names, as accepted from callers.
Categories: "TypeAlias" = Iterable[CategoryName]
# The normalised form returned by this module: an ordered tuple of names.
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]
73
74
def charmap_file(fname: str = "charmap") -> Path:
    """Return the on-disk cache path for the named Unicode table.

    The path is versioned by ``unicodedata.unidata_version`` so that caches
    computed under one Unicode version are never reused under another.
    """
    compressed_name = f"{fname}.json.gz"
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, compressed_name
    )
79
80
81_charmap: Optional[dict[CategoryName, IntervalsT]] = None
82
83
def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category, to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            # Fast path: load a previously-computed table from disk.
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # Slow path: scan every codepoint once, recording runs of equal
            # category as closed intervals [start, end].
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    # Category changed: close the run [last_start, i - 1]
                    # and start a new one at i.
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            # Close the final run, which extends to the last codepoint.
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as fp:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    fp.write(result.encode())

                # os.renames (unlike os.replace) also creates any missing
                # parent directories of the destination path.
                os.renames(tmpfile, f)
            except Exception:
                # Caching is strictly best-effort: an unwritable filesystem
                # just means the table is recomputed on the next fresh import.
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap
145
146
@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    # Require the canonical codec name, so that aliases of the same codec
    # share a single on-disk cache file.
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        # Fast path: load a previously-computed table from disk.
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                # Record each encodable codepoint as a degenerate (i, i) interval.
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
    # NOTE(review): self-union appears to normalise the set (merging the
    # per-codepoint intervals into runs) — confirm against IntervalSet.union.
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as f:
            f.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        # Best-effort caching only; failure to write is never an error.
        pass
    return res
182
183
184_categories: Optional[Categories] = None
185
186
def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories() # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is not None:
        return _categories
    cm = charmap()
    # Order ascending by how many codepoint intervals each category covers,
    # then move Control and Surrogate to the very end of the tuple.
    ordered = sorted(cm.keys(), key=lambda cat: len(cm[cat]))
    for special in ("Cc", "Cs"):  # Other, Control / Other, Surrogate
        ordered.remove(special)
        ordered.append(special)
    _categories = tuple(ordered)
    return _categories
203
204
def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    :raises InvalidArgument: if ``cats`` contains any element which is neither
        a major class nor a class with subclass.  ``name`` is used only to
        identify the offending argument in the error message.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            # Expand a one-letter major class into all of its subclasses.
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    # Emit the selection in the normalised order defined by categories().
    return tuple(c for c in cs if c in out)
232
233
234category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}
235
236
def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of the Unicode categories in `cats`.

    The result is ordered as in categories(); passing None selects every
    category.  Items of `cats` which are not category names are dropped.

    >>> _category_key(None) == categories()
    True
    >>> _category_key(['Co', 'Zp', 'Zl'])
    ('Zl', 'Zp', 'Co')
    """
    cs = categories()
    if cats is None:
        return tuple(cs)
    # A set makes the membership tests below O(1) even for large inputs.
    cats = set(cats)
    return tuple(c for c in cs if c in cats)
251
252
def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    # NOTE(review): when the active provider sets avoid_realization we skip
    # the cache entirely — presumably so symbolic keys are never realized
    # by a cache lookup; confirm against the provider contract.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        # All categories together cover the entire codepoint range.
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        # Recurse: union the last category's intervals with the result for
        # the remaining categories (each prefix is cached independently).
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals
284
285
# Cache for query(), keyed on the fully-normalised form of its arguments:
# (category key, min/max codepoint, include intervals, exclude intervals).
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}
289
290
def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: Collection[str] = "",
    exclude_characters: Collection[str] = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    NOTE(review): per the annotation this returns an IntervalSet; the doctest
    output below shows the underlying interval tuples — confirm IntervalSet's
    repr before relying on these examples as doctests.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ... include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    # Default the codepoint bounds to the full Unicode range.
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    # Normalise every argument so equivalent queries share a cache entry.
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string("".join(include_characters))
    exclude_intervals = IntervalSet.from_string("".join(exclude_characters))
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    # NOTE(review): cache is bypassed when the provider sets avoid_realization,
    # matching the same pattern in _query_for_key — confirm intent there.
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
    # Clip each category interval to [min_codepoint, max_codepoint],
    # dropping intervals that fall entirely outside the range.
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    # Included characters are added back even if outside the bounds above;
    # excluded characters always win over inclusion.
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result