Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pyparsing/unicode.py: 96%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

101 statements  

1# unicode.py 

2 

3import sys 

4from itertools import filterfalse 

5from typing import List, Tuple, Union 

6 

7 

8class _lazyclassproperty: 

9 def __init__(self, fn): 

10 self.fn = fn 

11 self.__doc__ = fn.__doc__ 

12 self.__name__ = fn.__name__ 

13 

14 def __get__(self, obj, cls): 

15 if cls is None: 

16 cls = type(obj) 

17 if not hasattr(cls, "_intern") or any( 

18 cls._intern is getattr(superclass, "_intern", []) 

19 for superclass in cls.__mro__[1:] 

20 ): 

21 cls._intern = {} 

22 attrname = self.fn.__name__ 

23 if attrname not in cls._intern: 

24 cls._intern[attrname] = self.fn(cls) 

25 return cls._intern[attrname] 

26 

27 

# A list of code point ranges: each entry is a ``(first, last)`` 2-tuple or a
# ``(single,)`` 1-tuple of ints.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]

29 

30 

class unicode_set:
    """
    A set of Unicode characters, for language-specific strings for
    ``alphas``, ``nums``, ``alphanums``, and ``printables``.
    A unicode_set is defined by a list of ranges in the Unicode character
    set, in a class attribute ``_ranges``. Ranges can be specified using
    2-tuples or a 1-tuple, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Ranges are left- and right-inclusive. A 1-tuple of (x,) is treated as (x, x).

    A unicode set can also be defined using multiple inheritance of other unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        """sorted list of the distinct characters covered by the ``_ranges`` of this class and its bases"""
        codepoints = set()
        for cc in cls.__mro__:
            # Stop at the common base: only the subclasses contribute ranges.
            if cc is unicode_set:
                break
            for rr in getattr(cc, "_ranges", ()):
                # rr[-1] is rr[0] for a 1-tuple, so (x,) behaves like (x, x).
                codepoints.update(range(rr[0], rr[-1] + 1))
        return [chr(c) for c in sorted(codepoints)]

    @_lazyclassproperty
    def printables(cls):
        """all non-whitespace characters in this range"""
        return "".join(filterfalse(str.isspace, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphas(cls):
        """all alphabetic characters in this range"""
        return "".join(filter(str.isalpha, cls._chars_for_ranges))

    @_lazyclassproperty
    def nums(cls):
        """all numeric digit characters in this range"""
        return "".join(filter(str.isdigit, cls._chars_for_ranges))

    @_lazyclassproperty
    def alphanums(cls):
        """all alphanumeric characters in this range"""
        return cls.alphas + cls.nums

    @_lazyclassproperty
    def identchars(cls):
        """
        all characters in this range that are valid identifier characters, plus
        underscore '_' and the ASCII and Latin-1 letters listed below, which are
        always included regardless of the range
        """
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        in_range = set(filter(str.isidentifier, cls._chars_for_ranges))
        return "".join(sorted(in_range | set(always_included)))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        all characters in this range that are valid identifier body characters,
        plus the digits 0-9, and · (Unicode MIDDLE DOT)
        """
        # Prefixing "_" tests whether c may appear after the first character
        # of an identifier, rather than whether c can start one.
        identifier_chars = {
            c for c in cls._chars_for_ranges if ("_" + c).isidentifier()
        }
        return "".join(
            sorted(identifier_chars | set(cls.identchars) | set("0123456789·"))
        )

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression for an identifier using this range's definitions for
        identchars and identbodychars
        """
        # Imported at call time rather than at module import.
        # NOTE(review): presumably to avoid a circular import - confirm.
        from pyparsing import Word

        return Word(cls.identchars, cls.identbodychars)

119 

120 

class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a ``unicode_set`` whose ``_ranges`` lists the code
    point ranges for one script; several are also exposed under aliases,
    including names written in the script itself.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese itself covers the concatenation of its three nested sets.
        _ranges = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    # This spelling lacks the vowel signs and leading consonant of the Hindi
    # word; it is kept unchanged so existing references keep resolving.
    वनगर = Devanagari
    # Full Hindi spelling of "Devanagari".
    देवनागरी = Devanagari

    # fmt: on