1# unicode.py
2
3import sys
4from itertools import filterfalse
5from typing import List, Tuple, Union
6
7
class _lazyclassproperty:
    """
    Descriptor that computes a class-level value on first access and caches it.

    The wrapped function is called with the owning class as its only argument.
    Its result is stored in that class's ``_intern`` dict under the function's
    name, and later reads on the same class are served from that dict.
    """

    def __init__(self, fn):
        self.fn = fn
        # mirror the wrapped function's metadata on the descriptor instance
        self.__name__ = fn.__name__
        self.__doc__ = fn.__doc__

    def __get__(self, obj, cls):
        owner = type(obj) if cls is None else cls

        # The owner needs a fresh cache when it has none at all, or when the
        # ``_intern`` it sees is actually a superclass's dict reached through
        # inheritance (sharing it would leak cached values between classes).
        needs_own_cache = not hasattr(owner, "_intern")
        if not needs_own_cache:
            for parent in owner.__mro__[1:]:
                if owner._intern is getattr(parent, "_intern", []):
                    needs_own_cache = True
                    break
        if needs_own_cache:
            owner._intern = {}

        key = self.fn.__name__
        if key in owner._intern:
            return owner._intern[key]
        owner._intern[key] = self.fn(owner)
        return owner._intern[key]
26
27
# Type of a ``_ranges`` attribute: a list of code point ranges, each entry
# being either a (first, last) 2-tuple or a single-element (code_point,) tuple.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
29
30
class unicode_set:
    """
    A collection of Unicode characters from which the language-specific
    strings ``alphas``, ``nums``, ``alphanums``, and ``printables`` are derived.

    A subclass describes its characters with a ``_ranges`` class attribute: a
    list of code point ranges, each written as a 2-tuple or a 1-tuple, e.g.::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are included, and a 1-tuple ``(x,)`` means ``(x, x)``.

    Sets can also be combined by inheriting from several unicode sets at once::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        """sorted list of the distinct characters covered by the ranges of this class and its bases"""
        code_points = set()
        for klass in cls.__mro__:
            # only classes that come before unicode_set in the MRO contribute
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] is bounds[0] for a 1-tuple, giving a single code point
                first, last = bounds[0], bounds[-1]
                code_points.update(range(first, last + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls) -> str:
        """every character in this range that is not whitespace"""
        non_space = filterfalse(str.isspace, cls._chars_for_ranges)
        return "".join(non_space)

    @_lazyclassproperty
    def alphas(cls) -> str:
        """every alphabetic character in this range"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls) -> str:
        """every numeric digit character in this range"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """the alphabetic characters of this range followed by its digit characters"""
        return "".join((cls.alphas, cls.nums))

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        every character in this range that is a valid identifier on its own,
        together with the ASCII letters, the Latin-1 letters, and underscore '_'
        """
        from_ranges = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        # these are included regardless of the ranges the class defines
        always_included = set(
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        return "".join(sorted(from_ranges | always_included))

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        every character in this range that may follow the first character of an
        identifier, together with ``identchars``, the digits 0-9, and
        · (Unicode MIDDLE DOT)
        """
        # a character is a valid body character if it is accepted after "_"
        body_chars = {ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()}
        body_chars.update(cls.identchars, "0123456789·")
        return "".join(sorted(body_chars))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression matching an identifier, built from this
        range's identchars (leading character) and identbodychars (the rest)
        """
        from pyparsing import Word

        leading, trailing = cls.identchars, cls.identbodychars
        return Word(leading, trailing)
119
120
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    The class's own ``_ranges`` spans 0x0020 through ``sys.maxunicode``. Each
    nested class is a ``unicode_set`` with the ranges for one script or block,
    and several of them are also bound to shorter or native-script alias names.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            "Unicode set for Kanji Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script alias names for the three nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese's own ranges are the three nested lists concatenated
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is a second name for the Hangul set
    Korean = Hangul

    # CJK declares no _ranges of its own; it is defined purely by inheriting
    # from the three sets listed as its bases.
    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B)
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF)
        ]

    # BMP is a short name for BasicMultilingualPlane
    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari

    # fmt: on