1# common.py
2from .core import *
3from .helpers import DelimitedList, any_open_tag, any_close_tag
4from datetime import datetime
5
6
# Some other useful expressions - using a lower-case class name since we are really using this as a namespace.
class pyparsing_common:
    """Here are some common low-level expressions that may be useful in
    jump-starting parser development:

    - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
      :class:`scientific notation<sci_real>`)
    - common :class:`programming identifiers<identifier>`
    - network addresses (:class:`MAC<mac_address>`,
      :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
    - ISO8601 :class:`dates<iso8601_date>` and
      :class:`datetime<iso8601_datetime>`
    - :class:`UUID<uuid>`
    - :class:`comma-separated list<comma_separated_list>`
    - :class:`url`

    Parse actions:

    - :class:`convert_to_integer`
    - :class:`convert_to_float`
    - :class:`convert_to_date`
    - :class:`convert_to_datetime`
    - :class:`strip_html_tags`
    - :class:`upcase_tokens`
    - :class:`downcase_tokens`

    Examples:

    .. testcode::

        pyparsing_common.number.run_tests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

    .. testcode::

        pyparsing_common.fnumber.run_tests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

    .. testcode::

        pyparsing_common.hex_integer.run_tests('''
            # hex numbers
            100
            FF
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # hex numbers
        100
        [256]

        FF
        [255]

    .. testcode::

        pyparsing_common.fraction.run_tests('''
            # fractions
            1/2
            -3/4
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

    .. testcode::

        pyparsing_common.mixed_integer.run_tests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

    .. testcode::

        import uuid
        pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
        pyparsing_common.uuid.run_tests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    convert_to_integer = token_map(int)
    """
    Parse action for converting parsed integers to Python int
    """

    convert_to_float = token_map(float)
    """
    Parse action for converting parsed numbers to Python float
    """

    integer = Word(nums).set_name("integer").set_parse_action(convert_to_integer)
    """expression that parses an unsigned integer, returns an int"""

    hex_integer = (
        Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
    )
    """expression that parses a hexadecimal integer, returns an int"""

    signed_integer = (
        Regex(r"[+-]?\d+")
        .set_name("signed integer")
        .set_parse_action(convert_to_integer)
    )
    """expression that parses an integer with optional leading sign, returns an int"""

    # numerator and denominator are copies of signed_integer converted to float,
    # so that the division parse action below always yields a float
    fraction = (
        signed_integer().set_parse_action(convert_to_float)
        + "/"
        + signed_integer().set_parse_action(convert_to_float)
    ).set_name("fraction")
    """fractional expression of an integer divided by an integer, returns a float"""
    # NOTE: a zero denominator (e.g. "1/0") raises ZeroDivisionError from this
    # parse action rather than a ParseException
    fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

    mixed_integer = (
        fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
    ).set_name("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
    # the integer and fraction parts are added together to give a single value
    mixed_integer.add_parse_action(sum)

    real = (
        Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
        .set_name("real number")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number and returns a float"""

    sci_real = (
        Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
        .set_name("real number with scientific notation")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number with optional
    scientific notation and returns a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).set_name("number").streamline()
    """any numeric expression, returns the corresponding Python type"""

    fnumber = (
        Regex(r"[+-]?\d+\.?\d*(?:[eE][+-]?\d+)?")
        .set_name("fnumber")
        .set_parse_action(convert_to_float)
    )
    """any int or real number, returned as float"""

    ieee_float = (
        Regex(r"(?i:[+-]?(?:(?:\d+\.?\d*(?:e[+-]?\d+)?)|nan|inf(?:inity)?))")
        .set_name("ieee_float")
        .set_parse_action(convert_to_float)
    )
    """any floating-point literal (int, real number, infinity, or NaN), returned as float"""

    identifier = Word(identchars, identbodychars).set_name("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(
        r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
    ).set_name("IPv4 address")
    "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

    # a single group of 1 to 4 hex digits within an IPv6 address
    _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
    _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
        "full IPv6 address"
    )
    _short_ipv6_address = (
        Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
        + "::"
        + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
    ).set_name("short IPv6 address")
    # the short form is only accepted when fewer than 8 hex groups were matched
    # in total across both sides of the "::"
    _short_ipv6_address.add_condition(
        lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
    )
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
    ipv6_address = Combine(
        (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
            "IPv6 address"
        )
    ).set_name("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    # the back-reference \1 requires the same delimiter between all six groups
    mac_address = Regex(
        r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
    ).set_name("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convert_to_date(fmt: str = "%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)

        Returns a parse action taking ``(string, location, tokens)``; the
        action raises :class:`ParseException` if the first token does not
        match ``fmt``.

        Example:

        .. testcode::

            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.set_parse_action(pyparsing_common.convert_to_date())
            print(date_expr.parse_string("1999-12-31"))

        prints:

        .. testoutput::

            [datetime.date(1999, 12, 31)]
        """

        def cvt_fn(ss, ll, tt):
            try:
                return datetime.strptime(tt[0], fmt).date()
            except ValueError as ve:
                # report a failed conversion as a parse failure at this
                # location, keeping the original error as the explicit cause
                raise ParseException(ss, ll, str(ve)) from ve

        return cvt_fn

    @staticmethod
    def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
        """Helper to create a parse action for converting parsed
        datetime string to Python datetime.datetime

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)

        Returns a parse action taking ``(string, location, tokens)``; the
        action raises :class:`ParseException` if the first token does not
        match ``fmt``.

        Example:

        .. testcode::

            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
            print(dt_expr.parse_string("1999-12-31T23:59:59.999"))

        prints:

        .. testoutput::

            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """

        def cvt_fn(ss, ll, tt):
            try:
                return datetime.strptime(tt[0], fmt)
            except ValueError as ve:
                # report a failed conversion as a parse failure at this
                # location, keeping the original error as the explicit cause
                raise ParseException(ss, ll, str(ve)) from ve

        return cvt_fn

    iso8601_date = Regex(
        r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
    ).set_name("ISO8601 date")
    "ISO8601 date (``yyyy-mm-dd``) - trailing ``-mm`` and ``-dd`` are optional"

    iso8601_datetime = Regex(
        r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
    ).set_name("ISO8601 datetime")
    "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"

    uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
    "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

    # matches any opening or closing tag and suppresses it; used by strip_html_tags
    _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()

    @staticmethod
    def strip_html_tags(s: str, l: int, tokens: ParseResults):
        """Parse action to remove HTML tags from web page HTML source

        Only the first token is processed; the result is that token with
        every opening and closing tag removed.

        Example:

        .. testcode::

            # strip HTML links from normal text
            text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
            td, td_end = make_html_tags("TD")
            table_text = td + SkipTo(td_end).set_parse_action(
                pyparsing_common.strip_html_tags)("body") + td_end
            print(table_text.parse_string(text).body)

        Prints:

        .. testoutput::

            More info at the pyparsing wiki page
        """
        return pyparsing_common._html_stripper.transform_string(tokens[0])

    # one list item: runs of printable non-comma characters, keeping interior
    # spaces/tabs but not whitespace that precedes a comma or the end of line
    _commasepitem = (
        Combine(
            OneOrMore(
                ~Literal(",")
                + ~LineEnd()
                + Word(printables, exclude_chars=",")
                + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
            )
        )
        .streamline()
        .set_name("commaItem")
    )
    comma_separated_list = DelimitedList(
        Opt(quoted_string.copy() | _commasepitem, default="")
    ).set_name("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    upcase_tokens = staticmethod(token_map(lambda t: t.upper()))
    """Parse action to convert tokens to upper case."""

    downcase_tokens = staticmethod(token_map(lambda t: t.lower()))
    """Parse action to convert tokens to lower case."""

    # fmt: off
    url = Regex(
        # https://mathiasbynens.be/demo/url-regex
        # https://gist.github.com/dperini/729294
        r"(?P<url>" +
        # protocol identifier (optional)
        # short syntax // still required
        r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
        # user:pass BasicAuth (optional)
        r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
        r"(?P<host>" +
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
        r"|" +
        # host & domain names, may end with dot
        # can be replaced by a shortest alternative
        # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
        r"(?:" +
        r"(?:" +
        r"[a-z0-9\u00a1-\uffff]" +
        r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
        r")?" +
        r"[a-z0-9\u00a1-\uffff]\." +
        r")+" +
        # TLD identifier name, may end with dot
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
        r")" +
        # port number (optional)
        r"(:(?P<port>\d{2,5}))?" +
        # resource path (optional)
        r"(?P<path>\/[^?# ]*)?" +
        # query string (optional)
        r"(\?(?P<query>[^#]*))?" +
        # fragment (optional)
        r"(#(?P<fragment>\S*))?" +
        r")"
    ).set_name("url")
    """
    URL (http/https/ftp scheme)

    .. versionchanged:: 3.1.0
       ``url`` named group added
    """
    # fmt: on

    # pre-PEP8 compatibility names
    # fmt: off
    convertToInteger = staticmethod(replaced_by_pep8("convertToInteger", convert_to_integer))
    convertToFloat = staticmethod(replaced_by_pep8("convertToFloat", convert_to_float))
    convertToDate = staticmethod(replaced_by_pep8("convertToDate", convert_to_date))
    convertToDatetime = staticmethod(replaced_by_pep8("convertToDatetime", convert_to_datetime))
    stripHTMLTags = staticmethod(replaced_by_pep8("stripHTMLTags", strip_html_tags))
    upcaseTokens = staticmethod(replaced_by_pep8("upcaseTokens", upcase_tokens))
    downcaseTokens = staticmethod(replaced_by_pep8("downcaseTokens", downcase_tokens))
    # fmt: on
481
482
# Every attribute of pyparsing_common that is itself a parser expression,
# in class-definition order.
_builtin_exprs = list(
    filter(
        lambda attr: isinstance(attr, ParserElement),
        vars(pyparsing_common).values(),
    )
)