Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pyparsing/common.py: 79%

63 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 06:51 +0000

1# common.py 

2from .core import * 

3from .helpers import DelimitedList, any_open_tag, any_close_tag 

4from datetime import datetime 

5 

6 

7# some other useful expressions - using lower-case class name since we are really using this as a namespace 

class pyparsing_common:
    """Here are some common low-level expressions that may be useful in
    jump-starting parser development:

    - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
      :class:`scientific notation<sci_real>`)
    - common :class:`programming identifiers<identifier>`
    - network addresses (:class:`MAC<mac_address>`,
      :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
    - ISO8601 :class:`dates<iso8601_date>` and
      :class:`datetime<iso8601_datetime>`
    - :class:`UUID<uuid>`
    - :class:`comma-separated list<comma_separated_list>`
    - :class:`url`

    Parse actions:

    - :class:`convert_to_integer`
    - :class:`convert_to_float`
    - :class:`convert_to_date`
    - :class:`convert_to_datetime`
    - :class:`strip_html_tags`
    - :class:`upcase_tokens`
    - :class:`downcase_tokens`

    Example::

        pyparsing_common.number.run_tests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.fnumber.run_tests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.hex_integer.run_tests('''
            # hex numbers
            100
            FF
            ''')

        pyparsing_common.fraction.run_tests('''
            # fractions
            1/2
            -3/4
            ''')

        pyparsing_common.mixed_integer.run_tests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

        import uuid
        pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
        pyparsing_common.uuid.run_tests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')

    prints::

        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # hex numbers
        100
        [256]

        FF
        [255]

        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    convert_to_integer = token_map(int)
    """
    Parse action for converting parsed integers to Python int
    """

    convert_to_float = token_map(float)
    """
    Parse action for converting parsed numbers to Python float
    """

    integer = Word(nums).set_name("integer").set_parse_action(convert_to_integer)
    """expression that parses an unsigned integer, returns an int"""

    hex_integer = (
        Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
    )
    """expression that parses a hexadecimal integer, returns an int"""

    signed_integer = (
        Regex(r"[+-]?\d+")
        .set_name("signed integer")
        .set_parse_action(convert_to_integer)
    )
    """expression that parses an integer with optional leading sign, returns an int"""

    # Numerator and denominator are copies of signed_integer whose parse
    # action is replaced so that each side is converted to float.
    fraction = (
        signed_integer().set_parse_action(convert_to_float)
        + "/"
        + signed_integer().set_parse_action(convert_to_float)
    ).set_name("fraction")
    """fractional expression of an integer divided by an integer, returns a float"""
    # Divide the first token (numerator) by the last token (denominator).
    fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

    mixed_integer = (
        fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
    ).set_name("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
    # The separating "-" is suppressed, so the remaining tokens are simply summed.
    mixed_integer.add_parse_action(sum)

    real = (
        Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
        .set_name("real number")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number and returns a float"""

    sci_real = (
        Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
        .set_name("real number with scientific notation")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number with optional
    scientific notation and returns a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).set_name("number").streamline()
    """any numeric expression, returns the corresponding Python type"""

    fnumber = (
        Regex(r"[+-]?\d+\.?\d*([eE][+-]?\d+)?")
        .set_name("fnumber")
        .set_parse_action(convert_to_float)
    )
    """any int or real number, returned as float"""

    identifier = Word(identchars, identbodychars).set_name("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(
        r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
    ).set_name("IPv4 address")
    "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

    _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
    _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
        "full IPv6 address"
    )
    _short_ipv6_address = (
        Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
        + "::"
        + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
    ).set_name("short IPv6 address")
    # Accept the short form only when fewer than 8 of the parsed tokens
    # are hex groups (tokens matching _ipv6_part).
    _short_ipv6_address.add_condition(
        lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
    )
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
    ipv6_address = Combine(
        (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
            "IPv6 address"
        )
    ).set_name("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    # The first delimiter is captured and reused through the ``\1``
    # backreference, so all delimiters in one address must be the same.
    mac_address = Regex(
        r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
    ).set_name("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convert_to_date(fmt: str = "%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)

        Returns a parse action taking ``(s, loc, toks)``; it raises
        :class:`ParseException` if ``toks[0]`` does not match ``fmt``.

        Example::

            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.set_parse_action(pyparsing_common.convert_to_date())
            print(date_expr.parse_string("1999-12-31"))

        prints::

            [datetime.date(1999, 12, 31)]
        """

        def cvt_fn(s, loc, toks):
            try:
                return datetime.strptime(toks[0], fmt).date()
            except ValueError as ve:
                # Report the failure as a parse error, keeping the original
                # ValueError attached as the explicit cause.
                raise ParseException(s, loc, str(ve)) from ve

        return cvt_fn

    @staticmethod
    def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
        """Helper to create a parse action for converting parsed
        datetime string to Python datetime.datetime

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)

        Returns a parse action taking ``(s, loc, toks)``; it raises
        :class:`ParseException` if ``toks[0]`` does not match ``fmt``.

        Example::

            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
            print(dt_expr.parse_string("1999-12-31T23:59:59.999"))

        prints::

            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """

        def cvt_fn(s, loc, toks):
            try:
                return datetime.strptime(toks[0], fmt)
            except ValueError as ve:
                # Report the failure as a parse error, keeping the original
                # ValueError attached as the explicit cause.
                raise ParseException(s, loc, str(ve)) from ve

        return cvt_fn

    iso8601_date = Regex(
        r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
    ).set_name("ISO8601 date")
    "ISO8601 date (``yyyy-mm-dd``)"

    iso8601_datetime = Regex(
        r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
    ).set_name("ISO8601 datetime")
    "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"

    uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
    "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

    # Matches any opening or closing tag and suppresses it from the output.
    _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()

    @staticmethod
    def strip_html_tags(s: str, l: int, tokens: ParseResults):
        """Parse action to remove HTML tags from web page HTML source

        Only ``tokens[0]`` is transformed; ``s`` and ``l`` are accepted to
        fit the three-argument parse action signature and are not used.

        Example::

            # strip HTML links from normal text
            text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
            td, td_end = make_html_tags("TD")
            table_text = td + SkipTo(td_end).set_parse_action(pyparsing_common.strip_html_tags)("body") + td_end
            print(table_text.parse_string(text).body)

        Prints::

            More info at the pyparsing wiki page
        """
        return pyparsing_common._html_stripper.transform_string(tokens[0])

    _commasepitem = (
        Combine(
            OneOrMore(
                ~Literal(",")
                + ~LineEnd()
                + Word(printables, exclude_chars=",")
                + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
            )
        )
        .streamline()
        .set_name("commaItem")
    )
    comma_separated_list = DelimitedList(
        Opt(quoted_string.copy() | _commasepitem, default="")
    ).set_name("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    upcase_tokens = staticmethod(token_map(lambda t: t.upper()))
    """Parse action to convert tokens to upper case."""

    downcase_tokens = staticmethod(token_map(lambda t: t.lower()))
    """Parse action to convert tokens to lower case."""

    # fmt: off
    url = Regex(
        # https://mathiasbynens.be/demo/url-regex
        # https://gist.github.com/dperini/729294
        r"(?P<url>" +
        # protocol identifier (optional)
        # short syntax // still required
        r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
        # user:pass BasicAuth (optional)
        r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
        r"(?P<host>" +
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
        r"|" +
        # host & domain names, may end with dot
        # can be replaced by a shortest alternative
        # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
        r"(?:" +
        r"(?:" +
        r"[a-z0-9\u00a1-\uffff]" +
        r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
        r")?" +
        r"[a-z0-9\u00a1-\uffff]\." +
        r")+" +
        # TLD identifier name, may end with dot
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
        r")" +
        # port number (optional)
        r"(:(?P<port>\d{2,5}))?" +
        # resource path (optional)
        r"(?P<path>\/[^?# ]*)?" +
        # query string (optional)
        r"(\?(?P<query>[^#]*))?" +
        # fragment (optional)
        r"(#(?P<fragment>\S*))?" +
        r")"
    ).set_name("url")
    """URL (http/https/ftp scheme)"""
    # fmt: on

    # pre-PEP8 compatibility names
    convertToInteger = convert_to_integer
    """Deprecated - use :class:`convert_to_integer`"""
    convertToFloat = convert_to_float
    """Deprecated - use :class:`convert_to_float`"""
    convertToDate = convert_to_date
    """Deprecated - use :class:`convert_to_date`"""
    convertToDatetime = convert_to_datetime
    """Deprecated - use :class:`convert_to_datetime`"""
    stripHTMLTags = strip_html_tags
    """Deprecated - use :class:`strip_html_tags`"""
    upcaseTokens = upcase_tokens
    """Deprecated - use :class:`upcase_tokens`"""
    downcaseTokens = downcase_tokens
    """Deprecated - use :class:`downcase_tokens`"""

428 

429 

# Collect every attribute of the pyparsing_common namespace class whose value
# is a ParserElement instance (the predefined expressions), in definition order.
_builtin_exprs = list(
    filter(
        lambda member: isinstance(member, ParserElement),
        vars(pyparsing_common).values(),
    )
)