Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pyparsing/unicode.py: 96%

100 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 06:51 +0000

1# unicode.py 

2 

3import sys 

4from itertools import filterfalse 

5from typing import List, Tuple, Union 

6 

7 

8class _lazyclassproperty: 

9 def __init__(self, fn): 

10 self.fn = fn 

11 self.__doc__ = fn.__doc__ 

12 self.__name__ = fn.__name__ 

13 

14 def __get__(self, obj, cls): 

15 if cls is None: 

16 cls = type(obj) 

17 if not hasattr(cls, "_intern") or any( 

18 cls._intern is getattr(superclass, "_intern", []) 

19 for superclass in cls.__mro__[1:] 

20 ): 

21 cls._intern = {} 

22 attrname = self.fn.__name__ 

23 if attrname not in cls._intern: 

24 cls._intern[attrname] = self.fn(cls) 

25 return cls._intern[attrname] 

26 

27 

# Type of a ``_ranges`` attribute: a list whose entries are either a
# (first, last) pair of code points or a 1-tuple holding a single code point.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]

29 

30 

class unicode_set:
    """
    A set of Unicode characters, providing language-specific strings for
    ``alphas``, ``nums``, ``alphanums``, and ``printables``.

    The characters in a unicode_set are described by the class attribute
    ``_ranges``, a list of ranges in the Unicode character set. Each range
    is written as a 2-tuple or a 1-tuple, for example::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are inclusive, and a 1-tuple ``(x,)`` means the
    same as ``(x, x)``.

    Unicode sets can also be combined through multiple inheritance::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls):
        """sorted list of the distinct characters covered by ``_ranges`` of this class and its bases"""
        code_points = set()
        for klass in cls.__mro__:
            # Stop at the common base; nothing at or beyond it in the MRO
            # contributes ranges.
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] is bounds[0] for a 1-tuple, giving a single code point.
                code_points.update(range(bounds[0], bounds[-1] + 1))
        return list(map(chr, sorted(code_points)))

    @_lazyclassproperty
    def printables(cls):
        """all non-whitespace characters in this range"""
        non_space = filterfalse(str.isspace, cls._chars_for_ranges)
        return "".join(non_space)

    @_lazyclassproperty
    def alphas(cls):
        """all alphabetic characters in this range"""
        letters = filter(str.isalpha, cls._chars_for_ranges)
        return "".join(letters)

    @_lazyclassproperty
    def nums(cls):
        """all numeric digit characters in this range"""
        digits = filter(str.isdigit, cls._chars_for_ranges)
        return "".join(digits)

    @_lazyclassproperty
    def alphanums(cls):
        """all alphanumeric characters in this range"""
        letters = cls.alphas
        digits = cls.nums
        return letters + digits

    @_lazyclassproperty
    def identchars(cls):
        """all characters in this range that are valid identifier characters, plus underscore '_'"""
        # These characters are always included, whatever the class's ranges are.
        always_included = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº"
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
            "_"
        )
        chars = set(filter(str.isidentifier, cls._chars_for_ranges))
        chars.update(always_included)
        return "".join(sorted(chars))

    @_lazyclassproperty
    def identbodychars(cls):
        """
        all characters in this range that are valid identifier body characters,
        plus the digits 0-9, and · (Unicode MIDDLE DOT)
        """
        chars = set(cls.identchars)
        chars.update("0123456789·")
        # Prefixing "_" tests whether a character may appear after the first
        # position of an identifier.
        chars.update(c for c in cls._chars_for_ranges if ("_" + c).isidentifier())
        return "".join(sorted(chars))

    @_lazyclassproperty
    def identifier(cls):
        """
        a pyparsing Word expression for an identifier using this range's definitions for
        identchars and identbodychars
        """
        from pyparsing import Word

        initial_chars, body_chars = cls.identchars, cls.identbodychars
        return Word(initial_chars, body_chars)

126 

127 

class pyparsing_unicode(unicode_set):
    """
    A namespace class for defining common language unicode_sets.

    Each nested class is a unicode_set whose ``_ranges`` lists the code-point
    ranges for one script or language. The class itself covers everything
    from 0x0020 up to ``sys.maxunicode``. Several sets are also reachable
    through alias names, including names written in the language's own script.
    """

    # fmt: off

    # define ranges in language character sets
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for Latin-1 Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for Latin-A Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for Latin-B Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for Greek Unicode Character Ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for Cyrillic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for Chinese Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for Japanese Unicode Character Range, combining Kanji, Hiragana, and Katakana ranges"""

        class Kanji(unicode_set):
            """Unicode set for Kanji Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for Hiragana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for Katakana Unicode Character Range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

        # native-script aliases for the three nested sets
        漢字 = Kanji
        カタカナ = Katakana
        ひらがな = Hiragana

        # Japanese itself is the concatenation of the three nested range lists.
        _ranges: UnicodeRangeList = (
            Kanji._ranges
            + Hiragana._ranges
            + Katakana._ranges
        )

    class Hangul(unicode_set):
        """Unicode set for Hangul (Korean) Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for combined Chinese, Japanese, and Korean (CJK) Unicode Character Range"""

    class Thai(unicode_set):
        """Unicode set for Thai Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set for Arabic Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for Hebrew Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for Devanagari Unicode Character Range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    BMP = BasicMultilingualPlane

    # add language identifiers using language Unicode
    العربية = Arabic
    中文 = Chinese
    кириллица = Cyrillic
    Ελληνικά = Greek
    עִברִית = Hebrew
    日本語 = Japanese
    한국어 = Korean
    ไทย = Thai
    देवनागरी = Devanagari
    # NOTE(review): the spelling below is the one this file previously defined;
    # it looks like a truncated form of the name above (vowel signs and the
    # leading consonant missing). Kept so existing references still resolve.
    वनगर = Devanagari

    # fmt: on