# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import codecs
import gzip
import json
import os
import sys
import tempfile
import unicodedata
from collections.abc import Collection, Iterable
from functools import cache
from pathlib import Path
from typing import Literal, TypeAlias

from hypothesis.configuration import storage_directory
from hypothesis.control import _current_build_context
from hypothesis.errors import InvalidArgument
from hypothesis.internal.intervalsets import IntervalSet, IntervalsT

# See https://en.wikipedia.org/wiki/Unicode_character_property#General_Category
CategoryName: TypeAlias = Literal[
    "L",  # Letter
    "Lu",  # Letter, uppercase
    "Ll",  # Letter, lowercase
    "Lt",  # Letter, titlecase
    "Lm",  # Letter, modifier
    "Lo",  # Letter, other
    "M",  # Mark
    "Mn",  # Mark, nonspacing
    "Mc",  # Mark, spacing combining
    "Me",  # Mark, enclosing
    "N",  # Number
    "Nd",  # Number, decimal digit
    "Nl",  # Number, letter
    "No",  # Number, other
    "P",  # Punctuation
    "Pc",  # Punctuation, connector
    "Pd",  # Punctuation, dash
    "Ps",  # Punctuation, open
    "Pe",  # Punctuation, close
    "Pi",  # Punctuation, initial quote
    "Pf",  # Punctuation, final quote
    "Po",  # Punctuation, other
    "S",  # Symbol
    "Sm",  # Symbol, math
    "Sc",  # Symbol, currency
    "Sk",  # Symbol, modifier
    "So",  # Symbol, other
    "Z",  # Separator
    "Zs",  # Separator, space
    "Zl",  # Separator, line
    "Zp",  # Separator, paragraph
    "C",  # Other
    "Cc",  # Other, control
    "Cf",  # Other, format
    "Cs",  # Other, surrogate
    "Co",  # Other, private use
    "Cn",  # Other, not assigned
]
Categories: TypeAlias = Iterable[CategoryName]
CategoriesTuple: TypeAlias = tuple[CategoryName, ...]
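
# Illustrative only (a sketch, not used elsewhere in this module): callers may
# supply any iterable of category names, e.g.
#
#     subset: CategoriesTuple = ("Lu", "Ll", "Nd")
#
# and single-letter major classes such as "L" are also valid CategoryName
# values, expanded into their subclasses by as_general_categories() below.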


def charmap_file(fname: str = "charmap") -> Path:
    return storage_directory(
        "unicode_data", unicodedata.unidata_version, f"{fname}.json.gz"
    )

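
# A concrete sketch of where the cache lives (assuming the default storage
# directory, i.e. a ".hypothesis" folder under the current working directory,
# and whatever Unicode version this Python ships): charmap_file() resolves to
# something like .hypothesis/unicode_data/<unidata_version>/charmap.json.gz.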

_charmap: dict[CategoryName, IntervalsT] | None = None


def charmap() -> dict[CategoryName, IntervalsT]:
    """Return a dict that maps each Unicode category to a tuple of 2-tuples
    covering the codepoint intervals for characters in that category.

    >>> charmap()['Co']
    ((57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    global _charmap
    # Best-effort caching in the face of missing files and/or unwritable
    # filesystems is fairly simple: check if loaded, else try loading,
    # else calculate and try writing the cache.
    if _charmap is None:
        f = charmap_file()
        try:
            with gzip.GzipFile(f, "rb") as d:
                tmp_charmap = dict(json.load(d))

        except Exception:
            # This loop is reduced to using only local variables for performance;
            # indexing and updating containers is a ~3x slowdown. This doesn't fix
            # https://github.com/HypothesisWorks/hypothesis/issues/2108 but it helps.
            category = unicodedata.category  # Local variable -> ~20% speedup!
            tmp_charmap = {}
            last_cat = category(chr(0))
            last_start = 0
            for i in range(1, sys.maxunicode + 1):
                cat = category(chr(i))
                if cat != last_cat:
                    tmp_charmap.setdefault(last_cat, []).append((last_start, i - 1))
                    last_cat, last_start = cat, i
            tmp_charmap.setdefault(last_cat, []).append((last_start, sys.maxunicode))

            try:
                # Write the Unicode table atomically
                tmpdir = storage_directory("tmp")
                tmpdir.mkdir(exist_ok=True, parents=True)
                fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
                os.close(fd)
                # Explicitly set the mtime to get reproducible output
                with gzip.GzipFile(tmpfile, "wb", mtime=1) as fp:
                    result = json.dumps(sorted(tmp_charmap.items()))
                    fp.write(result.encode())

                os.renames(tmpfile, f)
            except Exception:
                pass

        # convert between lists and tuples
        _charmap = {
            k: tuple(tuple(pair) for pair in pairs) for k, pairs in tmp_charmap.items()
        }
        # each value is a tuple of 2-tuples (that is, tuples of length 2)
        # and both elements of that tuple are integers.
        for vs in _charmap.values():
            ints = list(sum(vs, ()))
            assert all(isinstance(x, int) for x in ints)
            assert ints == sorted(ints)
            assert all(len(tup) == 2 for tup in vs)

    assert _charmap is not None
    return _charmap

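
# Illustrative usage of charmap() (a sketch, assuming the Unicode tables that
# ship with CPython): the first uppercase-letter interval is ASCII 'A'..'Z',
#
#     charmap()["Lu"][0] == (65, 90)
#
# and every value has the same shape: a sorted tuple of inclusive
# (start, end) codepoint pairs.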

@cache
def intervals_from_codec(codec_name: str) -> IntervalSet:  # pragma: no cover
    """Return an IntervalSet of characters which are part of this codec."""
    assert codec_name == codecs.lookup(codec_name).name
    fname = charmap_file(f"codec-{codec_name}")
    try:
        with gzip.GzipFile(fname) as gzf:
            encodable_intervals = json.load(gzf)

    except Exception:
        # This loop is kinda slow, but hopefully we don't need to do it very often!
        encodable_intervals = []
        for i in range(sys.maxunicode + 1):
            try:
                chr(i).encode(codec_name)
            except Exception:  # usually _but not always_ UnicodeEncodeError
                pass
            else:
                encodable_intervals.append((i, i))

    res = IntervalSet(encodable_intervals)
    res = res.union(res)
    try:
        # Write the Unicode table atomically
        tmpdir = storage_directory("tmp")
        tmpdir.mkdir(exist_ok=True, parents=True)
        fd, tmpfile = tempfile.mkstemp(dir=tmpdir)
        os.close(fd)
        # Explicitly set the mtime to get reproducible output
        with gzip.GzipFile(tmpfile, "wb", mtime=1) as f:
            f.write(json.dumps(res.intervals).encode())
        os.renames(tmpfile, fname)
    except Exception:
        pass
    return res

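
# Illustrative usage (a sketch; exact results depend on the codecs available
# in the running Python): the "ascii" codec can encode exactly codepoints
# 0..127, so intervals_from_codec("ascii").intervals would be ((0, 127),).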

_categories: Categories | None = None


def categories() -> Categories:
    """Return a tuple of Unicode categories in a normalised order.

    >>> categories()  # doctest: +ELLIPSIS
    ('Zl', 'Zp', 'Co', 'Me', 'Pc', ..., 'Cc', 'Cs')
    """
    global _categories
    if _categories is None:
        cm = charmap()
        categories = sorted(cm.keys(), key=lambda c: len(cm[c]))
        categories.remove("Cc")  # Other, Control
        categories.remove("Cs")  # Other, Surrogate
        categories.append("Cc")
        categories.append("Cs")
        _categories = tuple(categories)
    return _categories

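
# A note on the normalised order above (illustrative): categories covered by
# fewer intervals sort first, and "Cc"/"Cs" are always moved to the end, so
# e.g. 'Zl' and 'Zp' (a single codepoint each) come before 'Lu', which is
# spread across far more intervals.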

def as_general_categories(cats: Categories, name: str = "cats") -> CategoriesTuple:
    """Return a tuple of Unicode categories in a normalised order.

    This function expands one-letter designations of a major class to include
    all subclasses:

    >>> as_general_categories(['N'])
    ('Nd', 'Nl', 'No')

    See section 4.5 of the Unicode standard for more on classes:
    https://www.unicode.org/versions/Unicode10.0.0/ch04.pdf

    If the collection ``cats`` includes any element that is neither a major
    class nor one of its two-letter subclasses, an InvalidArgument error is
    raised.
    """
    major_classes = ("L", "M", "N", "P", "S", "Z", "C")
    cs = categories()
    out = set(cats)
    for c in cats:
        if c in major_classes:
            out.discard(c)
            out.update(x for x in cs if x.startswith(c))
        elif c not in cs:
            raise InvalidArgument(
                f"In {name}={cats!r}, {c!r} is not a valid Unicode category."
            )
    return tuple(c for c in cs if c in out)

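
# Illustrative usage (a sketch): as_general_categories(["L"]) expands to the
# five letter subcategories Lu/Ll/Lt/Lm/Lo (returned in the module's
# normalised order), while an unknown name such as "Xx" raises InvalidArgument.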

category_index_cache: dict[frozenset[CategoryName], IntervalsT] = {frozenset(): ()}


def _category_key(cats: Iterable[str] | None) -> CategoriesTuple:
    """Return a normalised tuple of all Unicode categories in `cats`.

    If `cats` is None, default to including all categories.  Any element of
    `cats` that is not a known category name is simply ignored.

    >>> _category_key(['Lu', 'Me', 'Cs'])
    ('Me', 'Lu', 'Cs')
    """
    cs = categories()
    if cats is None:
        cats = set(cs)
    return tuple(c for c in cs if c in cats)


def _query_for_key(key: Categories) -> IntervalsT:
    """Return a tuple of codepoint intervals covering characters that match one
    or more categories in the tuple of categories `key`.

    >>> _query_for_key(categories())
    ((0, 1114111),)
    >>> _query_for_key(('Zl', 'Zp', 'Co'))
    ((8232, 8233), (57344, 63743), (983040, 1048573), (1048576, 1114109))
    """
    key = tuple(key)
    # ignore ordering on the cache key to increase potential cache hits.
    cache_key = frozenset(key)
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return category_index_cache[cache_key]
        except KeyError:
            pass
    elif not key:  # pragma: no cover  # only on alternative backends
        return ()
    assert key
    if set(key) == set(categories()):
        result = IntervalSet([(0, sys.maxunicode)])
    else:
        result = IntervalSet(_query_for_key(key[:-1])).union(
            IntervalSet(charmap()[key[-1]])
        )
    assert isinstance(result, IntervalSet)
    if context is None or not context.data.provider.avoid_realization:
        category_index_cache[cache_key] = result.intervals
    return result.intervals

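
# How the recursion above composes results (a sketch): a query for
# ('Zl', 'Zp', 'Co') unions the intervals for ('Zl', 'Zp') with the 'Co'
# intervals from charmap(), and every prefix is cached under its frozenset
# key, so queries over overlapping category sets share most of the work.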

limited_category_index_cache: dict[
    tuple[CategoriesTuple, int, int, IntervalsT, IntervalsT], IntervalSet
] = {}


def query(
    *,
    categories: Categories | None = None,
    min_codepoint: int | None = None,
    max_codepoint: int | None = None,
    include_characters: Collection[str] = "",
    exclude_characters: Collection[str] = "",
) -> IntervalSet:
    """Return a tuple of intervals covering the codepoints for all characters
    that meet the criteria.

    >>> query()
    ((0, 1114111),)
    >>> query(min_codepoint=0, max_codepoint=128)
    ((0, 128),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'])
    ((65, 90),)
    >>> query(min_codepoint=0, max_codepoint=128, categories=['Lu'],
    ...       include_characters='☃')
    ((65, 90), (9731, 9731))
    """
    if min_codepoint is None:
        min_codepoint = 0
    if max_codepoint is None:
        max_codepoint = sys.maxunicode
    catkey = _category_key(categories)
    character_intervals = IntervalSet.from_string("".join(include_characters))
    exclude_intervals = IntervalSet.from_string("".join(exclude_characters))
    qkey = (
        catkey,
        min_codepoint,
        max_codepoint,
        character_intervals.intervals,
        exclude_intervals.intervals,
    )
    context = _current_build_context.value
    if context is None or not context.data.provider.avoid_realization:
        try:
            return limited_category_index_cache[qkey]
        except KeyError:
            pass
    base = _query_for_key(catkey)
    result = []
    for u, v in base:
        if v >= min_codepoint and u <= max_codepoint:
            result.append((max(u, min_codepoint), min(v, max_codepoint)))
    result = (IntervalSet(result) | character_intervals) - exclude_intervals
    if context is None or not context.data.provider.avoid_realization:
        limited_category_index_cache[qkey] = result
    return result
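

# A minimal, self-contained usage sketch (illustrative only; the expected
# intervals assume the standard Unicode tables, and this block is not part of
# Hypothesis' public API):
if __name__ == "__main__":  # pragma: no cover
    # Uppercase ASCII letters within 0..128, plus the snowman character.
    demo = query(
        categories=["Lu"],
        min_codepoint=0,
        max_codepoint=128,
        include_characters="☃",
    )
    print(demo.intervals)  # expected: ((65, 90), (9731, 9731))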