1# unicode.py
2
3import sys
4from itertools import filterfalse
5from typing import List, Tuple, Union
6
7
8class _lazyclassproperty:
9 def __init__(self, fn):
10 self.fn = fn
11 self.__doc__ = fn.__doc__
12 self.__name__ = fn.__name__
13
14 def __get__(self, obj, cls):
15 if cls is None:
16 cls = type(obj)
17 if not hasattr(cls, "_intern") or any(
18 cls._intern is getattr(superclass, "_intern", [])
19 for superclass in cls.__mro__[1:]
20 ):
21 cls._intern = {}
22 attrname = self.fn.__name__
23 if attrname not in cls._intern:
24 cls._intern[attrname] = self.fn(cls)
25 return cls._intern[attrname]
26
27
# Type alias for the ``_ranges`` class attribute of unicode_set classes: a list
# whose items are either a 2-tuple of ints (first and last code point of a
# range) or a 1-tuple holding a single code point.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
29
30
class unicode_set:
    """
    A collection of Unicode characters from which language-specific strings
    such as ``alphas``, ``nums``, ``alphanums``, and ``printables`` are derived.

    The characters are described by the class attribute ``_ranges``, a list of
    code point ranges given as 2-tuples or 1-tuples, for example::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are included. A 1-tuple ``(x,)`` means the same as
    ``(x, x)``.

    Several unicode sets can be merged by inheriting from all of them::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        """sorted list of the distinct characters covered by the ``_ranges`` of
        this class and of its base classes up to (not including) unicode_set"""
        code_points = set()
        for klass in cls.__mro__:
            # everything from unicode_set onward in the MRO contributes nothing
            if klass is unicode_set:
                break
            for span in getattr(klass, "_ranges", ()):
                # first and last item coincide for a 1-tuple
                first, last = span[0], span[-1]
                code_points.update(range(first, last + 1))
        return [chr(code_point) for code_point in sorted(code_points)]

    @_lazyclassproperty
    def printables(cls) -> str:
        """every character in this set that is not whitespace"""
        return "".join(ch for ch in cls._chars_for_ranges if not ch.isspace())

    @_lazyclassproperty
    def alphas(cls) -> str:
        """every alphabetic character in this set"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls) -> str:
        """every digit character in this set"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """the alphabetic characters of this set followed by its digit characters"""
        return "".join((cls.alphas, cls.nums))

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        the characters in this set that are valid identifiers on their own,
        merged with a fixed collection of ASCII and accented Latin letters
        and the underscore '_', returned as a sorted string
        """
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        start_chars = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        start_chars.update(always_included)
        return "".join(sorted(start_chars))

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        the characters in this set that may follow the first character of an
        identifier, merged with ``identchars``, the digits 0-9, and
        · (Unicode MIDDLE DOT), returned as a sorted string
        """
        # a character qualifies for the body if it is accepted after a
        # leading underscore
        body_chars = {
            ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()
        }
        body_chars.update(cls.identchars)
        body_chars.update("0123456789·")
        return "".join(sorted(body_chars))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression matching an identifier, built from this
        set's identchars (leading character) and identbodychars (the rest)
        """
        # imported here rather than at module level
        from pyparsing import Word

        leading = cls.identchars
        trailing = cls.identbodychars
        return Word(leading, trailing)
121
122
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a ``unicode_set`` subclass whose ``_ranges`` list
    gives the code points of one script or language; ranges are inclusive at
    both ends and a 1-tuple stands for a single code point. The namespace
    class is itself a ``unicode_set`` with its own ``_ranges``.
    """

    # fmt: off

    # define ranges in language character sets
    # ranges for the namespace class itself: from 0x0020 up to sys.maxunicode
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            "Unicode set for Kanji Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # aliases for the nested sets, named in Japanese script
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Kanji, Hiragana and Katakana are nested classes, not base classes of
        # Japanese, so their ranges are concatenated here explicitly to form
        # Japanese's own range list.
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is a second name for the same class object as Hangul
    Korean = Hangul

    # CJK declares no ranges in its own body; unicode_set._chars_for_ranges
    # walks the MRO and gathers the _ranges of Chinese, Japanese and Hangul.
    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B)
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF)
        ]

    # short alias for BasicMultilingualPlane
    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    # (each name below is an alias bound to the same class object as the
    # English-named set on its right-hand side)
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on