# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import codecs
import gzip
import json
import os
import sys
import tempfile
import unicodedata
from collections.abc import Iterable
from functools import cache
from pathlib import Path
from typing import TYPE_CHECKING, Literal, Optional

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet, IntervalsT

if TYPE_CHECKING:
    from typing import TypeAlias

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
CategoryName: "TypeAlias" = Literal[
    "L",   # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",   # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",   # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",   # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",   # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",   # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",   # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
Categories: "TypeAlias" = Iterable[CategoryName]
CategoriesTuple: "TypeAlias" = tuple[CategoryName, ...]


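# The cached tables below live under the Hypothesis storage directory and are
# keyed by the active Unicode version, so upgrading Python (and with it the
# unicodedata tables) naturally produces a fresh cache file.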
def charmap_file(fname: str = "charmap") -> Path:
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, f"{fname}.json.gz"
    )


_charmap = None


def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps a Unicode category to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))
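            # At this point tmp_charmap maps each category to a list of
            # (start, end) codepoint runs, built in one linear scan over
            # every codepoint.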

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    o.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass
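                # Failing to write the cache (e.g. a read-only or full
                # filesystem) is fine: the table is simply recomputed on
                # the next run.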

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # Sanity-check that each value is a sorted tuple of 2-tuples (that is,
        # tuples of length 2), and that both elements of each pair are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap


@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    assert codec_name == codecs.lookup(codec_name).name
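    # Callers must pass the canonical codec name (as returned by codecs.lookup),
    # so aliases such as "utf8" and "utf-8" cannot produce distinct cache files.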
    fname = charmap_file(f"codec-{codec_name}")
    try:
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
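    # Unioning the set with itself normalises it, merging the adjacent
    # single-codepoint intervals built above into contiguous ranges.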
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as o:
            o.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        pass
    return res


_categories: Optional[Categories] = None


def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories()  # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        categories = sorted(cm.keys(), key=lambda c: len(cm[c]))
        categories.remove("Cc")  # Other, Control
        categories.remove("Cs")  # Other, Surrogate
        categories.append("Cc")
        categories.append("Cs")
        _categories = tuple(categories)
    return _categories


def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any element that is neither a major
    class nor a class with subclass, an InvalidArgument error is raised.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    return tuple(c for c in cs if c in out)


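# Maps a frozenset of categories to the codepoint intervals they cover.  The
# empty-set entry lets the recursion in _query_for_key bottom out.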
category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}


def _category_key(cats: Optional[Iterable[str]]) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories that are in `cats`.

    If cats is None then default to including all categories.
    Any item in cats that is not a Unicode category is silently dropped.

    >>> _category_key(['Lu', 'Me', 'Cs'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    return tuple(c for c in cs if c in cats)


def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
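    # When the active provider sets avoid_realization, skip both reading from
    # and writing to the shared cache; the result is computed fresh each time.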
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover  # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        result = IntervalSet([(0, sys.maxunicode)])
    else:
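        # Recurse on all-but-the-last category so that each prefix of the
        # normalised key gets cached as well while we build up the union.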
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals


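# Caches full query() results, keyed by the normalised categories, codepoint
# bounds, and the include/exclude character intervals.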
limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}


def query(
    *,
    categories: Optional[Categories] = None,
    min_codepoint: Optional[int] = None,
    max_codepoint: Optional[int] = None,
    include_characters: str = "",
    exclude_characters: str = "",
) -> IntervalSet:
    """Return an IntervalSet covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ...       include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string(include_characters or "")
    exclude_intervals = IntervalSet.from_string(exclude_characters or "")
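    # Cache on the full, normalised query: categories, codepoint bounds, and
    # the include/exclude character sets (as interval tuples, so hashable).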
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
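    # Clip each interval from the per-category table to the requested
    # [min_codepoint, max_codepoint] range, dropping those entirely outside it.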
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result