Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pyparsing/unicode.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

101 statements  

1# unicode.py 

2 

3import sys 

4from itertools import filterfalse 

5from typing import List, Tuple, Union 

6 

7 

8class _lazyclassproperty: 

9 def __init__(self, fn): 

10 self.fn = fn 

11 self.__doc__ = fn.__doc__ 

12 self.__name__ = fn.__name__ 

13 

14 def __get__(self, obj, cls): 

15 if cls is None: 

16 cls = type(obj) 

17 if not hasattr(cls, "_intern") or any( 

18 cls._intern is getattr(superclass, "_intern", []) 

19 for superclass in cls.__mro__[1:] 

20 ): 

21 cls._intern = {} 

22 attrname = self.fn.__name__ 

23 if attrname not in cls._intern: 

24 cls._intern[attrname] = self.fn(cls) 

25 return cls._intern[attrname] 

26 

27 

# A list of code point ranges: each entry is either a 2-tuple of ints or a
# 1-tuple holding a single int.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]

29 

30 

class unicode_set:
    """
    Base class for a collection of Unicode characters, exposing
    language-specific strings such as ``alphas``, ``nums``, ``alphanums``,
    and ``printables``.

    Subclasses list their characters in a ``_ranges`` class attribute, given
    as 2-tuples or 1-tuples of code points, for example::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are included, and a 1-tuple ``(x,)`` means the same
    as ``(x, x)``.

    Sets can also be combined through multiple inheritance::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        # Walk the MRO up to (but not including) unicode_set itself, gathering
        # the code points of every class that contributes ``_ranges``.
        code_points = set()
        for klass in cls.__mro__:
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] is the same element as bounds[0] for a 1-tuple
                low, high = bounds[0], bounds[-1]
                code_points.update(range(low, high + 1))
        return sorted(map(chr, code_points))

    @_lazyclassproperty
    def printables(cls) -> str:
        """every character in this set that is not whitespace"""
        return "".join(ch for ch in cls._chars_for_ranges if not ch.isspace())

    @_lazyclassproperty
    def alphas(cls) -> str:
        """every alphabetic character in this set"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isalpha())

    @_lazyclassproperty
    def nums(cls) -> str:
        """every numeric digit character in this set"""
        return "".join(ch for ch in cls._chars_for_ranges if ch.isdigit())

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """the alphabetic characters of this set followed by its digit characters"""
        letters = cls.alphas
        digits = cls.nums
        return letters + digits

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        every character in this set that is a valid identifier on its own,
        together with a fixed group of Latin letters and underscore '_'
        """
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        leading = {ch for ch in cls._chars_for_ranges if ch.isidentifier()}
        leading.update(always_included)
        return "".join(sorted(leading))

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        every character in this set that may appear after the first character
        of an identifier, together with ``identchars``, the digits 0-9, and
        · (Unicode MIDDLE DOT)
        """
        # Prefixing "_" tests whether the character is acceptable in a
        # non-leading position of an identifier.
        body = {ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()}
        body.update(cls.identchars)
        body.update("0123456789·")
        return "".join(sorted(body))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression matching an identifier, built from this
        set's identchars (leading characters) and identbodychars (the rest)
        """
        from pyparsing import Word

        leading = cls.identchars
        trailing = cls.identbodychars
        return Word(leading, trailing)

121 

122 

class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a unicode_set for one script or block, defined by its
    ``_ranges`` list. The namespace class itself spans 0x0020 through
    ``sys.maxunicode``. Several sets are also reachable through aliases,
    including names written in the language's own script.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script aliases for the three nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese itself is the concatenation of its three component sets
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari
    # NOTE(review): `वनगर` looks like a truncated spelling of देवनागरी (leading
    # consonant and vowel signs missing); kept so existing references to that
    # name keep resolving - confirm whether it can be dropped.
    वनगर = Devanagari

    # fmt: on