Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pip/_vendor/pyparsing/unicode.py: 98%

97 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:48 +0000

1# unicode.py 

2 

3import sys 

4from itertools import filterfalse 

5from typing import List, Tuple, Union 

6 

7 

8class _lazyclassproperty: 

9 def __init__(self, fn): 

10 self.fn = fn 

11 self.__doc__ = fn.__doc__ 

12 self.__name__ = fn.__name__ 

13 

14 def __get__(self, obj, cls): 

15 if cls is None: 

16 cls = type(obj) 

17 if not hasattr(cls, "_intern") or any( 

18 cls._intern is getattr(superclass, "_intern", []) 

19 for superclass in cls.__mro__[1:] 

20 ): 

21 cls._intern = {} 

22 attrname = self.fn.__name__ 

23 if attrname not in cls._intern: 

24 cls._intern[attrname] = self.fn(cls) 

25 return cls._intern[attrname] 

26 

27 

# A list of code point ranges: each entry is either a (first, last) pair
# or a 1-tuple holding a single code point.
UnicodeRangeList = List[
    Union[
        Tuple[int, int],
        Tuple[int],
    ]
]

29 

30 

class unicode_set:
    """
    Base class describing a set of Unicode characters, used to build
    language-specific versions of ``alphas``, ``nums``, ``alphanums``,
    and ``printables``.

    Subclasses list their characters in a ``_ranges`` class attribute,
    using 2-tuples for a span and 1-tuples for a single code point::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are included; ``(x,)`` means the same as ``(x, x)``.

    Sets can be merged by inheriting from several unicode sets at once::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        # Walk the MRO so ranges contributed by every base class are merged,
        # stopping once the shared unicode_set base is reached.
        code_points = set()
        for klass in cls.__mro__:
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] is bounds[0] for a 1-tuple, giving a single point
                first, last = bounds[0], bounds[-1]
                code_points.update(range(first, last + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls):
        "all non-whitespace characters in this range"
        candidates = cls._chars_for_ranges
        return "".join(filterfalse(str.isspace, candidates))

    @_lazyclassproperty
    def alphas(cls):
        "all alphabetic characters in this range"
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls):
        "all numeric digit characters in this range"
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls):
        "all alphanumeric characters in this range"
        return "".join((cls.alphas, cls.nums))

    @_lazyclassproperty
    def identchars(cls):
        "all characters in this range that are valid identifier characters, plus underscore '_'"
        # These characters are always part of the result, whether or not the
        # set's own ranges contain them.
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        members = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        members.update(always_included)
        return "".join(sorted(members))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        all characters in this range that are valid identifier body characters,
        plus the digits 0-9
        """
        members = set(cls.identchars)
        members.update("0123456789")
        # Prefixing "_" tests whether a character is legal after the first
        # position of an identifier.
        members.update(
            ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()
        )
        return "".join(sorted(members))

116 

117 

class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each script or language is a nested ``unicode_set`` subclass. The
    namespace is itself a ``unicode_set`` whose single range runs from
    U+0020 up to ``sys.maxunicode``.
    """

    # fmt: off

    # define ranges in language character sets
    # (each range is inclusive; a 1-tuple is a single code point)
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""
        # Starts out empty; the module assigns the combined Kanji, Hiragana
        # and Katakana ranges once this class body has been executed.
        _ranges: UnicodeRangeList = []

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is a second name for the very same class object
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    # fmt: on

330 

331 

# Japanese declares no ranges of its own in its class body; fill them in
# here from its three nested sets, in Kanji, Hiragana, Katakana order.
pyparsing_unicode.Japanese._ranges = [
    *pyparsing_unicode.Japanese.Kanji._ranges,
    *pyparsing_unicode.Japanese.Hiragana._ranges,
    *pyparsing_unicode.Japanese.Katakana._ranges,
]

# short alias for the Basic Multilingual Plane set
pyparsing_unicode.BMP = pyparsing_unicode.BasicMultilingualPlane

339 

# add language identifiers using language Unicode
# Each native-script name is bound to the same class object as its English
# counterpart, so either spelling can be used interchangeably.
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai
# The Devanagari name needs its vowel signs (combining marks) to be spelled
# correctly; they are legal in the non-initial positions of an identifier.
pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari