Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pip/_vendor/pyparsing/unicode.py: 98%
97 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:48 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:48 +0000
1# unicode.py
3import sys
4from itertools import filterfalse
5from typing import List, Tuple, Union
8class _lazyclassproperty:
9 def __init__(self, fn):
10 self.fn = fn
11 self.__doc__ = fn.__doc__
12 self.__name__ = fn.__name__
14 def __get__(self, obj, cls):
15 if cls is None:
16 cls = type(obj)
17 if not hasattr(cls, "_intern") or any(
18 cls._intern is getattr(superclass, "_intern", [])
19 for superclass in cls.__mro__[1:]
20 ):
21 cls._intern = {}
22 attrname = self.fn.__name__
23 if attrname not in cls._intern:
24 cls._intern[attrname] = self.fn(cls)
25 return cls._intern[attrname]
# Type of a ``_ranges`` list: each entry is either a 2-tuple of code points
# (first, last) or a 1-tuple holding a single code point.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
class unicode_set:
    """
    A set of Unicode characters, for language-specific strings for
    ``alphas``, ``nums``, ``alphanums``, and ``printables``.
    A unicode_set is defined by a list of ranges in the Unicode character
    set, in a class attribute ``_ranges``. Ranges can be specified using
    2-tuples or a 1-tuple, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).

    A unicode set can also be defined using multiple inheritance of other unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass

    Every character-string attribute below is a ``_lazyclassproperty``: it is
    computed on first access and then cached separately for each class.
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        "sorted list of the distinct characters covered by this class and its unicode_set bases"
        codepoints = set()
        for cc in cls.__mro__:
            # unicode_set itself (and object after it) contributes no ranges.
            if cc is unicode_set:
                break
            for rr in getattr(cc, "_ranges", ()):
                # rr[-1] is rr[0] for a 1-tuple, so (x,) covers exactly x.
                codepoints.update(range(rr[0], rr[-1] + 1))
        return [chr(c) for c in sorted(codepoints)]

    @_lazyclassproperty
    def printables(cls) -> str:
        "all non-whitespace characters in this range"
        return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphas(cls) -> str:
        "all alphabetic characters in this range"
        return "".join(filter(str.isalpha, cls._chars_for_ranges))

    @_lazyclassproperty
    def nums(cls) -> str:
        "all numeric digit characters in this range"
        return "".join(filter(str.isdigit, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphanums(cls) -> str:
        "all alphanumeric characters in this range"
        return cls.alphas + cls.nums

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        all characters in this range that are valid identifier characters,
        plus underscore '_'; the ASCII and Latin-1 letters listed below are
        always included, whatever the range
        """
        return "".join(
            sorted(
                set(
                    "".join(filter(str.isidentifier, cls._chars_for_ranges))
                    + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
                    + "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
                    + "_"
                )
            )
        )

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        all characters in this range that are valid identifier body characters,
        plus the digits 0-9
        """
        return "".join(
            sorted(
                set(
                    cls.identchars
                    + "0123456789"
                    + "".join(
                        # Prefixing "_" tests whether c may appear after the
                        # first character of an identifier.
                        [c for c in cls._chars_for_ranges if ("_" + c).isidentifier()]
                    )
                )
            )
        )
class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a ``unicode_set`` whose ``_ranges`` lists inclusive
    code point ranges; the class itself spans 0x0020 through ``sys.maxunicode``.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        "Unicode set for the Basic Multilingual Plane"
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        "Unicode set for Latin-1 Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        "Unicode set for Latin-A Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        "Unicode set for Latin-B Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        "Unicode set for Greek Unicode Character Ranges"
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        "Unicode set for Cyrillic Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        "Unicode set for Chinese Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        "Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"
        # Left empty here; the module assigns the combined Kanji + Hiragana +
        # Katakana ranges to Japanese._ranges after this class statement.
        _ranges: UnicodeRangeList = []

        class Kanji(unicode_set):
            "Unicode set for Kanji Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            "Unicode set for Hiragana Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            "Unicode set for Katakana Unicode Character Range"
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

    class Hangul(unicode_set):
        "Unicode set for Hangul (Korean) Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is a second name for the same class object as Hangul.
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        "Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"

    class Thai(unicode_set):
        "Unicode set for Thai Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        "Unicode set for Arabic Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        "Unicode set for Hebrew Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        "Unicode set for Devanagari Unicode Character Range"
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    # fmt: on
# Japanese._ranges is declared empty inside the class body; fill it here, now
# that the nested Kanji, Hiragana and Katakana classes exist, by concatenating
# their range lists in that order.
pyparsing_unicode.Japanese._ranges = (
    pyparsing_unicode.Japanese.Kanji._ranges
    + pyparsing_unicode.Japanese.Hiragana._ranges
    + pyparsing_unicode.Japanese.Katakana._ranges
)

# short alias for the Basic Multilingual Plane set
pyparsing_unicode.BMP = pyparsing_unicode.BasicMultilingualPlane

# add aliases whose names are the language names written in their own scripts;
# these are plain attribute assignments so the interpreter applies its usual
# identifier handling to the non-ASCII names
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai
pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari