1# common.py
2from .core import *
3from .helpers import DelimitedList, any_open_tag, any_close_tag
4from datetime import datetime
5
6
7# some other useful expressions - using lower-case class name since we are really using this as a namespace
class pyparsing_common:
    """Here are some common low-level expressions that may be useful in
    jump-starting parser development:

    - numeric forms (:class:`integers<integer>`, :class:`reals<real>`,
      :class:`scientific notation<sci_real>`)
    - common :class:`programming identifiers<identifier>`
    - network addresses (:class:`MAC<mac_address>`,
      :class:`IPv4<ipv4_address>`, :class:`IPv6<ipv6_address>`)
    - ISO8601 :class:`dates<iso8601_date>` and
      :class:`datetime<iso8601_datetime>`
    - :class:`UUID<uuid>`
    - :class:`comma-separated list<comma_separated_list>`
    - :class:`url`

    Parse actions:

    - :class:`convert_to_integer`
    - :class:`convert_to_float`
    - :class:`convert_to_date`
    - :class:`convert_to_datetime`
    - :class:`strip_html_tags`
    - :class:`upcase_tokens`
    - :class:`downcase_tokens`

    Example::

        pyparsing_common.number.run_tests('''
            # any int or real number, returned as the appropriate type
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.fnumber.run_tests('''
            # any int or real number, returned as float
            100
            -100
            +100
            3.14159
            6.02e23
            1e-12
            ''')

        pyparsing_common.hex_integer.run_tests('''
            # hex numbers
            100
            FF
            ''')

        pyparsing_common.fraction.run_tests('''
            # fractions
            1/2
            -3/4
            ''')

        pyparsing_common.mixed_integer.run_tests('''
            # mixed fractions
            1
            1/2
            -3/4
            1-3/4
            ''')

        import uuid
        pyparsing_common.uuid.set_parse_action(token_map(uuid.UUID))
        pyparsing_common.uuid.run_tests('''
            # uuid
            12345678-1234-5678-1234-567812345678
            ''')

    prints::

        # any int or real number, returned as the appropriate type
        100
        [100]

        -100
        [-100]

        +100
        [100]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # any int or real number, returned as float
        100
        [100.0]

        -100
        [-100.0]

        +100
        [100.0]

        3.14159
        [3.14159]

        6.02e23
        [6.02e+23]

        1e-12
        [1e-12]

        # hex numbers
        100
        [256]

        FF
        [255]

        # fractions
        1/2
        [0.5]

        -3/4
        [-0.75]

        # mixed fractions
        1
        [1]

        1/2
        [0.5]

        -3/4
        [-0.75]

        1-3/4
        [1.75]

        # uuid
        12345678-1234-5678-1234-567812345678
        [UUID('12345678-1234-5678-1234-567812345678')]
    """

    convert_to_integer = token_map(int)
    """
    Parse action for converting parsed integers to Python int
    """

    convert_to_float = token_map(float)
    """
    Parse action for converting parsed numbers to Python float
    """

    integer = Word(nums).set_name("integer").set_parse_action(convert_to_integer)
    """expression that parses an unsigned integer, returns an int"""

    hex_integer = (
        Word(hexnums).set_name("hex integer").set_parse_action(token_map(int, 16))
    )
    """expression that parses a hexadecimal integer, returns an int"""

    signed_integer = (
        Regex(r"[+-]?\d+")
        .set_name("signed integer")
        .set_parse_action(convert_to_integer)
    )
    """expression that parses an integer with optional leading sign, returns an int"""

    # numerator and denominator are parsed as floats so that the division
    # in the parse action added below always yields a float
    fraction = (
        signed_integer().set_parse_action(convert_to_float)
        + "/"
        + signed_integer().set_parse_action(convert_to_float)
    ).set_name("fraction")
    """fractional expression of an integer divided by an integer, returns a float"""
    # divide the first token (numerator) by the last token (denominator)
    fraction.add_parse_action(lambda tt: tt[0] / tt[-1])

    mixed_integer = (
        fraction | signed_integer + Opt(Opt("-").suppress() + fraction)
    ).set_name("fraction or mixed integer-fraction")
    """mixed integer of the form 'integer - fraction', with optional leading integer, returns float"""
    # the separating "-" is suppressed, so the result is the sum of the parts
    mixed_integer.add_parse_action(sum)

    real = (
        Regex(r"[+-]?(?:\d+\.\d*|\.\d+)")
        .set_name("real number")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number and returns a float"""

    sci_real = (
        Regex(r"[+-]?(?:\d+(?:[eE][+-]?\d+)|(?:\d+\.\d*|\.\d+)(?:[eE][+-]?\d+)?)")
        .set_name("real number with scientific notation")
        .set_parse_action(convert_to_float)
    )
    """expression that parses a floating point number with optional
    scientific notation and returns a float"""

    # streamlining this expression makes the docs nicer-looking
    number = (sci_real | real | signed_integer).set_name("number").streamline()
    """any numeric expression, returns the corresponding Python type"""

    fnumber = (
        Regex(r"[+-]?\d+\.?\d*(?:[eE][+-]?\d+)?")
        .set_name("fnumber")
        .set_parse_action(convert_to_float)
    )
    """any int or real number, returned as float"""

    ieee_float = (
        Regex(r"(?i:[+-]?(?:(?:\d+\.?\d*(?:e[+-]?\d+)?)|nan|inf(?:inity)?))")
        .set_name("ieee_float")
        .set_parse_action(convert_to_float)
    )
    """any floating-point literal (int, real number, infinity, or NaN), returned as float"""

    identifier = Word(identchars, identbodychars).set_name("identifier")
    """typical code identifier (leading alpha or '_', followed by 0 or more alphas, nums, or '_')"""

    ipv4_address = Regex(
        r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})(\.(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})){3}"
    ).set_name("IPv4 address")
    "IPv4 address (``0.0.0.0 - 255.255.255.255``)"

    # one group of 1 to 4 hex digits
    _ipv6_part = Regex(r"[0-9a-fA-F]{1,4}").set_name("hex_integer")
    # full form: exactly 8 colon-separated groups
    _full_ipv6_address = (_ipv6_part + (":" + _ipv6_part) * 7).set_name(
        "full IPv6 address"
    )
    # short form: "::" with up to 7 groups allowed on either side
    _short_ipv6_address = (
        Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
        + "::"
        + Opt(_ipv6_part + (":" + _ipv6_part) * (0, 6))
    ).set_name("short IPv6 address")
    # reject a short form whose two sides together hold 8 or more groups
    _short_ipv6_address.add_condition(
        lambda t: sum(1 for tt in t if pyparsing_common._ipv6_part.matches(tt)) < 8
    )
    _mixed_ipv6_address = ("::ffff:" + ipv4_address).set_name("mixed IPv6 address")
    ipv6_address = Combine(
        (_full_ipv6_address | _mixed_ipv6_address | _short_ipv6_address).set_name(
            "IPv6 address"
        )
    ).set_name("IPv6 address")
    "IPv6 address (long, short, or mixed form)"

    # the first delimiter is captured and back-referenced, so all six octets
    # must be separated by the same character
    mac_address = Regex(
        r"[0-9a-fA-F]{2}([:.-])[0-9a-fA-F]{2}(?:\1[0-9a-fA-F]{2}){4}"
    ).set_name("MAC address")
    "MAC address xx:xx:xx:xx:xx:xx (may also have '-' or '.' delimiters)"

    @staticmethod
    def convert_to_date(fmt: str = "%Y-%m-%d"):
        """
        Helper to create a parse action for converting parsed date string to Python datetime.date

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%d"``)

        Returns a parse action that converts the first parsed token to a
        ``datetime.date``; the parse action raises :class:`ParseException`
        (chained to the original ``ValueError``) if ``datetime.strptime``
        rejects the token.

        Example::

            date_expr = pyparsing_common.iso8601_date.copy()
            date_expr.set_parse_action(pyparsing_common.convert_to_date())
            print(date_expr.parse_string("1999-12-31"))

        prints::

            [datetime.date(1999, 12, 31)]
        """

        def cvt_fn(ss, ll, tt):
            try:
                return datetime.strptime(tt[0], fmt).date()
            except ValueError as ve:
                # report the strptime failure as a parse failure at this
                # location, keeping the original error as the explicit cause
                raise ParseException(ss, ll, str(ve)) from ve

        return cvt_fn

    @staticmethod
    def convert_to_datetime(fmt: str = "%Y-%m-%dT%H:%M:%S.%f"):
        """Helper to create a parse action for converting parsed
        datetime string to Python datetime.datetime

        Params -
        - fmt - format to be passed to datetime.strptime (default= ``"%Y-%m-%dT%H:%M:%S.%f"``)

        Returns a parse action that converts the first parsed token to a
        ``datetime.datetime``; the parse action raises :class:`ParseException`
        (chained to the original ``ValueError``) if ``datetime.strptime``
        rejects the token.

        Example::

            dt_expr = pyparsing_common.iso8601_datetime.copy()
            dt_expr.set_parse_action(pyparsing_common.convert_to_datetime())
            print(dt_expr.parse_string("1999-12-31T23:59:59.999"))

        prints::

            [datetime.datetime(1999, 12, 31, 23, 59, 59, 999000)]
        """

        def cvt_fn(ss, ll, tt):
            try:
                return datetime.strptime(tt[0], fmt)
            except ValueError as ve:
                # report the strptime failure as a parse failure at this
                # location, keeping the original error as the explicit cause
                raise ParseException(ss, ll, str(ve)) from ve

        return cvt_fn

    iso8601_date = Regex(
        r"(?P<year>\d{4})(?:-(?P<month>\d\d)(?:-(?P<day>\d\d))?)?"
    ).set_name("ISO8601 date")
    "ISO8601 date (``yyyy-mm-dd``) - trailing month and day are optional"

    iso8601_datetime = Regex(
        r"(?P<year>\d{4})-(?P<month>\d\d)-(?P<day>\d\d)[T ](?P<hour>\d\d):(?P<minute>\d\d)(:(?P<second>\d\d(\.\d*)?)?)?(?P<tz>Z|[+-]\d\d:?\d\d)?"
    ).set_name("ISO8601 datetime")
    "ISO8601 datetime (``yyyy-mm-ddThh:mm:ss.s(Z|+-00:00)``) - trailing seconds, milliseconds, and timezone optional; accepts separating ``'T'`` or ``' '``"

    uuid = Regex(r"[0-9a-fA-F]{8}(-[0-9a-fA-F]{4}){3}-[0-9a-fA-F]{12}").set_name("UUID")
    "UUID (``xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx``)"

    # matches any opening or closing tag and suppresses it from the output;
    # used by strip_html_tags below
    _html_stripper = any_open_tag.suppress() | any_close_tag.suppress()

    @staticmethod
    def strip_html_tags(s: str, l: int, tokens: ParseResults):
        """Parse action to remove HTML tags from web page HTML source

        Only the first token is processed; the result of running the
        tag-suppressing expression over it with ``transform_string`` is
        returned.

        Example::

            # strip HTML links from normal text
            text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
            td, td_end = make_html_tags("TD")
            table_text = td + SkipTo(td_end).set_parse_action(pyparsing_common.strip_html_tags)("body") + td_end
            print(table_text.parse_string(text).body)

        Prints::

            More info at the pyparsing wiki page
        """
        return pyparsing_common._html_stripper.transform_string(tokens[0])

    # an unquoted list item: one or more words containing no commas, joined
    # by interior spaces/tabs; whitespace directly before a comma or the end
    # of the line is not included in the item
    _commasepitem = (
        Combine(
            OneOrMore(
                ~Literal(",")
                + ~LineEnd()
                + Word(printables, exclude_chars=",")
                + Opt(White(" \t") + ~FollowedBy(LineEnd() | ","))
            )
        )
        .streamline()
        .set_name("commaItem")
    )
    # each item is optional with a default of "", so an empty field between
    # two commas is returned as an empty string
    comma_separated_list = DelimitedList(
        Opt(quoted_string.copy() | _commasepitem, default="")
    ).set_name("comma separated list")
    """Predefined expression of 1 or more printable words or quoted strings, separated by commas."""

    upcase_tokens = staticmethod(token_map(lambda t: t.upper()))
    """Parse action to convert tokens to upper case."""

    downcase_tokens = staticmethod(token_map(lambda t: t.lower()))
    """Parse action to convert tokens to lower case."""

    # fmt: off
    url = Regex(
        # https://mathiasbynens.be/demo/url-regex
        # https://gist.github.com/dperini/729294
        r"(?P<url>" +
        # protocol identifier (optional)
        # short syntax // still required
        r"(?:(?:(?P<scheme>https?|ftp):)?\/\/)" +
        # user:pass BasicAuth (optional)
        r"(?:(?P<auth>\S+(?::\S*)?)@)?" +
        r"(?P<host>" +
        # IP address exclusion
        # private & local networks
        r"(?!(?:10|127)(?:\.\d{1,3}){3})" +
        r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})" +
        r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})" +
        # IP address dotted notation octets
        # excludes loopback network 0.0.0.0
        # excludes reserved space >= 224.0.0.0
        # excludes network & broadcast addresses
        # (first & last IP address of each class)
        r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])" +
        r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}" +
        r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))" +
        r"|" +
        # host & domain names, may end with dot
        # can be replaced by a shortest alternative
        # (?![-_])(?:[-\w\u00a1-\uffff]{0,63}[^-_]\.)+
        r"(?:" +
        r"(?:" +
        r"[a-z0-9\u00a1-\uffff]" +
        r"[a-z0-9\u00a1-\uffff_-]{0,62}" +
        r")?" +
        r"[a-z0-9\u00a1-\uffff]\." +
        r")+" +
        # TLD identifier name, may end with dot
        r"(?:[a-z\u00a1-\uffff]{2,}\.?)" +
        r")" +
        # port number (optional)
        r"(:(?P<port>\d{2,5}))?" +
        # resource path (optional)
        r"(?P<path>\/[^?# ]*)?" +
        # query string (optional)
        r"(\?(?P<query>[^#]*))?" +
        # fragment (optional)
        r"(#(?P<fragment>\S*))?" +
        r")"
    ).set_name("url")
    """URL (http/https/ftp scheme)"""
    # fmt: on

    # pre-PEP8 compatibility names
    # fmt: off
    convertToInteger = staticmethod(replaced_by_pep8("convertToInteger", convert_to_integer))
    convertToFloat = staticmethod(replaced_by_pep8("convertToFloat", convert_to_float))
    convertToDate = staticmethod(replaced_by_pep8("convertToDate", convert_to_date))
    convertToDatetime = staticmethod(replaced_by_pep8("convertToDatetime", convert_to_datetime))
    stripHTMLTags = staticmethod(replaced_by_pep8("stripHTMLTags", strip_html_tags))
    upcaseTokens = staticmethod(replaced_by_pep8("upcaseTokens", upcase_tokens))
    downcaseTokens = staticmethod(replaced_by_pep8("downcaseTokens", downcase_tokens))
    # fmt: on
430
431
# Every ParserElement stored as a class attribute of pyparsing_common,
# in class-definition order.
_builtin_exprs = list(
    filter(
        lambda member: isinstance(member, ParserElement),
        vars(pyparsing_common).values(),
    )
)