1# common.py
2from .core import *
3from .helpers import DelimitedList, any_open_tag, any_close_tag
4from datetime import datetime
5import sys
6
# True on Python 3.10 or newer. The class below consults this flag to decide
# whether its staticmethod-wrapped converters can be handed directly to
# set_parse_action or whether an equivalent lambda must be used instead.
PY_310 = sys.version_info[:2] >= (3, 10)
8
9
10# some other useful expressions - using lower-case class name since we are really using this as a namespace
class pyparsing_common:
    """Here are some common low-level expressions that may be useful in
    jump-starting parser development:

    - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
      :class:`scientific notation<sci_real>`)
    - common :class:`programming identifiers<identifier>`
    - network addresses (:class:`MAC<mac_address>`,
      :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
    - ISO8601 :class:`dates<iso8601_date>` and
      :class:`datetime<iso8601_datetime>`
    - :class:`UUID<uuid>`
    - :class:`comma-separated list<comma_separated_list>`
    - :class:`url`

    Parse actions:

    - :class:`convert_to_integer`
    - :class:`convert_to_float`
    - :class:`convert_to_date`
    - :class:`convert_to_datetime`
    - :class:`strip_html_tags`
    - :class:`upcase_tokens`
    - :class:`downcase_tokens`

    Examples:

    .. testcode::

        pyparsing_common.number.run_tests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

    .. testcode::

        pyparsing_common.fnumber.run_tests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

    .. testcode::

        pyparsing_common.hex_integer.run_tests('''
            # hex numbers
            100
            FF
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # hex numbers
        100
        [256]

        FF
        [255]

    .. testcode::

        pyparsing_common.fraction.run_tests('''
            # fractions
            1/2
            -3/4
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

    .. testcode::

        pyparsing_common.mixed_integer.run_tests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

    .. testcode::

        import uuid
        uuid_expr = pyparsing_common.uuid.copy()
        uuid_expr.set_parse_action(token_map(uuid.UUID))
        uuid_expr.run_tests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    @staticmethod
    def convert_to_integer(_, __, t):
        """
        Parse action for converting parsed integers to Python int

        The first two arguments (input string and match location) are
        ignored; every token in ``t`` is passed through ``int()`` and the
        converted values are returned as a new list.
        """
        return [int(tt) for tt in t]

    @staticmethod
    def convert_to_float(_, __, t):
        """
        Parse action for converting parsed numbers to Python float

        The first two arguments (input string and match location) are
        ignored; every token in ``t`` is passed through ``float()`` and the
        converted values are returned as a new list.
        """
        return [float(tt) for tt in t]

    # While the class body is executing, convert_to_integer/convert_to_float
    # are still staticmethod objects, and those are directly callable only on
    # Python 3.10+. On older interpreters an equivalent plain lambda is used
    # as the parse action. Defined once here and shared by the numeric
    # expressions below.
    _int_parse_action = (
        convert_to_integer
        if PY_310
        else lambda t: [int(tt) for tt in t]  # type: ignore[misc]
    )
    _float_parse_action = (
        convert_to_float
        if PY_310
        else lambda t: [float(tt) for tt in t]  # type: ignore[misc]
    )

    integer = Word(nums).set_name("integer").set_parse_action(_int_parse_action)
    """expression that parses an unsigned integer, converts to an int"""

    hex_integer = (
        Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
    )
    """expression that parses a hexadecimal integer, converts to an int"""

    signed_integer = (
        Regex(r"[+-]?\d+")
        .set_name("signed integer")
        .set_parse_action(_int_parse_action)
    )
    """expression that parses an integer with optional leading sign, converts to an int"""

    # signed_integer() makes a copy, so replacing the parse action here does
    # not affect the shared signed_integer expression above.
    fraction = (
        signed_integer().set_parse_action(_float_parse_action)
        + "/"
        + signed_integer().set_parse_action(_float_parse_action)
    ).set_name("fraction")
    """fractional expression of an integer divided by an integer, converts to a float"""
    # NOTE: a zero denominator is not rejected by the grammar, so the division
    # below raises ZeroDivisionError for input such as "1/0".
    fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

    mixed_integer = (
        fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
    ).set_name("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, converts to a float"""
    # the integer part and the (optional) fractional part are added together
    mixed_integer.add_parse_action(sum)

    real = (
        Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
        .set_name("real number")
        .set_parse_action(_float_parse_action)
    )
    """expression that parses a floating point number, converts to a float"""

    sci_real = (
        Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
        .set_name("real number with scientific notation")
        .set_parse_action(_float_parse_action)
    )
    """expression that parses a floating point number with optional
    scientific notation, converts to a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).set_name("number").streamline()
    """any numeric expression, converts to the corresponding Python type"""

    fnumber = (
        Regex(r"[+-]?\d+\.?\d*(?:[eE][+-]?\d+)?")
        .set_name("fnumber")
        .set_parse_action(_float_parse_action)
    )
    """any int or real number, always converts to a float"""

    ieee_float = (
        Regex(r"(?i:[+-]?(?:(?:\d+\.?\d*(?:e[+-]?\d+)?)|nan|inf(?:inity)?))")
        .set_name("ieee_float")
        .set_parse_action(_float_parse_action)
    )
    """any floating-point literal (int, real number, infinity, or NaN), converts to a float"""

    identifier = Word(identchars, identbodychars).set_name("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(
        r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
    ).set_name("IPv4 address")
    "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

    _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
    _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
        "full IPv6 address"
    )
    _short_ipv6_address = (
        Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
        + "::"
        + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
    ).set_name("short IPv6 address")
    # the abbreviated form must contain fewer than 8 hex groups in total
    _short_ipv6_address.add_condition(
        lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
    )
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
    ipv6_address = Combine(
        (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
            "IPv6 address"
        )
    ).set_name("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    # the back-reference \1 requires the same delimiter between all six pairs
    mac_address = Regex(
        r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
    ).set_name("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convert_to_date(fmt: str = "%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)

        The returned parse action converts only the first token, and raises
        :class:`ParseException` if that token does not match ``fmt``.

        Example:

        .. testcode::

            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.set_parse_action(pyparsing_common.convert_to_date())
            print(date_expr.parse_string("1999-12-31"))

        prints:

        .. testoutput::

            [datetime.date(1999, 12, 31)]
        """

        def cvt_fn(ss, ll, tt):
            try:
                return datetime.strptime(tt[0], fmt).date()
            except ValueError as ve:
                # report a bad date as a parse failure, keeping the original
                # ValueError attached as the explicit cause
                raise ParseException(ss, ll, str(ve)) from ve

        return cvt_fn

    @staticmethod
    def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
        """Helper to create a parse action for converting parsed
        datetime string to Python datetime.datetime

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)

        The returned parse action converts only the first token, and raises
        :class:`ParseException` if that token does not match ``fmt``.

        Example:

        .. testcode::

            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
            print(dt_expr.parse_string("1999-12-31T23:59:59.999"))

        prints:

        .. testoutput::

            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """

        def cvt_fn(s, l, t):
            try:
                return datetime.strptime(t[0], fmt)
            except ValueError as ve:
                # report a bad datetime as a parse failure, keeping the
                # original ValueError attached as the explicit cause
                raise ParseException(s, l, str(ve)) from ve

        return cvt_fn

    iso8601_date = Regex(
        r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
    ).set_name("ISO8601 date")
    "ISO8601 date (``yyyy-mm-dd``) - month and day are optional"

    iso8601_datetime = Regex(
        r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
    ).set_name("ISO8601 datetime")
    "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"

    uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
    "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

    # matches any opening or closing HTML tag and suppresses it from the output
    _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()

    @staticmethod
    def strip_html_tags(s: str, l: int, tokens: ParseResults):
        """Parse action to remove HTML tags from web page HTML source

        Only the first token is processed; the result is that token with
        every opening and closing tag removed.

        Example:

        .. testcode::

            # strip HTML links from normal text
            text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
            td, td_end = make_html_tags("TD")
            table_text = td + SkipTo(td_end).set_parse_action(
                pyparsing_common.strip_html_tags)("body") + td_end
            print(table_text.parse_string(text).body)

        Prints:

        .. testoutput::

            More info at the pyparsing wiki page
        """
        return pyparsing_common._html_stripper.transform_string(tokens[0])

    # one unquoted list item: runs of printable non-comma characters, keeping
    # interior spaces/tabs but not whitespace that precedes a comma or line end
    _commasepitem = (
        Combine(
            OneOrMore(
                ~Literal(",")
                + ~LineEnd()
                + Word(printables, exclude_chars=",")
                + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
            )
        )
        .streamline()
        .set_name("commaItem")
    )
    comma_separated_list = DelimitedList(
        Opt(quoted_string.copy() | _commasepitem, default="")
    ).set_name("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    @staticmethod
    def upcase_tokens(s, l, t):
        """Parse action to convert tokens to upper case."""
        return [tt.upper() for tt in t]

    @staticmethod
    def downcase_tokens(s, l, t):
        """Parse action to convert tokens to lower case."""
        return [tt.lower() for tt in t]

    # fmt: off
    url = Regex(
        # https://mathiasbynens.be/demo/url-regex
        # https://gist.github.com/dperini/729294
        r"(?P<url>" +
        # protocol identifier (optional)
        # short syntax // still required
        r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
        # user:pass BasicAuth (optional)
        r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
        r"(?P<host>" +
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
        r"|" +
        # host & domain names, may end with dot
        # can be replaced by a shortest alternative
        # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
        r"(?:" +
        r"(?:" +
        r"[a-z0-9\u00a1-\uffff]" +
        r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
        r")?" +
        r"[a-z0-9\u00a1-\uffff]\." +
        r")+" +
        # TLD identifier name, may end with dot
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
        r")" +
        # port number (optional)
        r"(:(?P<port>\d{2,5}))?" +
        # resource path (optional)
        r"(?P<path>\/[^?# ]*)?" +
        # query string (optional)
        r"(\?(?P<query>[^#]*))?" +
        # fragment (optional)
        r"(#(?P<fragment>\S*))?" +
        r")"
    ).set_name("url")
    """
    URL (http/https/ftp scheme)

    .. versionchanged:: 3.1.0
       ``url`` named group added
    """
    # fmt: on

    # pre-PEP8 compatibility names
    # fmt: off
    convertToInteger = staticmethod(replaced_by_pep8("convertToInteger", convert_to_integer))
    convertToFloat = staticmethod(replaced_by_pep8("convertToFloat", convert_to_float))
    convertToDate = staticmethod(replaced_by_pep8("convertToDate", convert_to_date))
    convertToDatetime = staticmethod(replaced_by_pep8("convertToDatetime", convert_to_datetime))
    stripHTMLTags = staticmethod(replaced_by_pep8("stripHTMLTags", strip_html_tags))
    upcaseTokens = staticmethod(replaced_by_pep8("upcaseTokens", upcase_tokens))
    downcaseTokens = staticmethod(replaced_by_pep8("downcaseTokens", downcase_tokens))
    # fmt: on
528
529
# Collect every parser expression defined as an attribute of the
# pyparsing_common namespace class (non-expression attributes such as the
# parse-action helpers are skipped).
_builtin_exprs = list(
    filter(
        lambda member: isinstance(member, ParserElement),
        vars(pyparsing_common).values(),
    )
)