1# helpers.py
2import html.entities
3import operator
4import re
5import sys
6import typing
7
8from . import __diag__
9from .core import *
10from .util import (
11 _bslash,
12 _flatten,
13 _escape_regex_range_chars,
14 make_compressed_re,
15 replaced_by_pep8,
16)
17
18
19#
20# global helpers
21#
def counted_array(
    expr: ParserElement, int_expr: typing.Optional[ParserElement] = None, **kwargs
) -> ParserElement:
    """Helper to define a counted list of expressions.

    This helper defines a pattern of the form::

        integer expr expr expr...

    where the leading integer tells how many expr expressions follow.
    The matched tokens returns the array of expr tokens as a list - the
    leading count token is suppressed.

    If ``int_expr`` is specified, it should be a pyparsing expression
    that produces an integer value.

    Examples:

    .. doctest::

        >>> counted_array(Word(alphas)).parse_string('2 ab cd ef')
        ParseResults(['ab', 'cd'], {})

    - In this parser, the leading integer value is given in binary,
      '10' indicating that 2 values are in the array:

      .. doctest::

          >>> binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
          >>> counted_array(Word(alphas), int_expr=binary_constant
          ...     ).parse_string('10 ab cd ef')
          ParseResults(['ab', 'cd'], {})

    - If other fields must be parsed after the count but before the
      list items, give the fields results names and they will
      be preserved in the returned ParseResults:

      .. doctest::

          >>> ppc = pyparsing.common
          >>> count_with_metadata = ppc.integer + Word(alphas)("type")
          >>> typed_array = counted_array(Word(alphanums),
          ...     int_expr=count_with_metadata)("items")
          >>> result = typed_array.parse_string("3 bool True True False")
          >>> print(result.dump())
          ['True', 'True', 'False']
          - items: ['True', 'True', 'False']
          - type: 'bool'
    """
    # honor the deprecated pre-PEP8 keyword if supplied
    intExpr: typing.Optional[ParserElement] = deprecate_argument(
        kwargs, "intExpr", None
    )
    intExpr = intExpr or int_expr

    contents = Forward()

    def define_array_contents(s, l, t):
        # re-target the Forward to exactly `count` repetitions of expr
        # (an Empty match when the count is zero)
        nonlocal contents
        count = t[0]
        contents <<= (expr * count) if count else Empty()
        # drop the count token itself, but keep any named results
        del t[:]

    if intExpr is None:
        count_expr = Word(nums).set_parse_action(lambda t: int(t[0]))
    else:
        # copy so the caller's expression is not modified below
        count_expr = intExpr.copy()
    count_expr.set_name("arrayLen")
    count_expr.add_parse_action(define_array_contents, call_during_try=True)
    return (count_expr + contents).set_name(f"(len) {expr}...")
92
93
def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

    .. testcode::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches a previous literal, will also match the leading
    ``"1:1"`` in ``"1:10"``. If this is not desired, use
    :class:`match_previous_expr`. Do *not* use with packrat parsing
    enabled.
    """
    repeater = Forward()

    def record_matched_tokens(s, l, t):
        # re-target the Forward to literally match the tokens just parsed
        if not t:
            repeater << Empty()
        elif len(t) == 1:
            repeater << t[0]
        else:
            # multiple tokens: flatten and require each literal in sequence
            flattened = _flatten(t.as_list())
            repeater << And(Literal(tok) for tok in flattened)

    expr.add_parse_action(record_matched_tokens, call_during_try=True)
    repeater.set_name("(prev) " + str(expr))
    return repeater
129
130
def match_previous_expr(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example:

    .. testcode::

        first = Word(nums)
        second = match_previous_expr(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.
    """
    rep = Forward()
    e2 = expr.copy()
    rep <<= e2

    def copy_token_to_repeater(s, l, t):
        # capture the tokens matched by expr ...
        matchTokens = _flatten(t.as_list())

        def must_match_these_tokens(s, l, t):
            # ... and require the repeat expression to produce the same tokens
            theseTokens = _flatten(t.as_list())
            if theseTokens != matchTokens:
                # fixed: error message was missing the space after "found"
                raise ParseException(
                    s, l, f"Expected {matchTokens}, found {theseTokens}"
                )

        rep.set_parse_action(must_match_these_tokens, call_during_try=True)

    expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
    rep.set_name("(prev) " + str(expr))
    return rep
167
168
def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    **kwargs,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    :param strs: a string of space-delimited literals, or a collection of
      string literals
    :param caseless: treat all literals as caseless
    :param use_regex: bool - as an optimization, will
      generate a :class:`Regex` object; otherwise, will generate
      a :class:`MatchFirst` object (if ``caseless=True`` or
      ``as_keyword=True``, or if creating a :class:`Regex` raises an exception)
    :param as_keyword: bool - enforce :class:`Keyword`-style matching on the
      generated expressions

    Parameters ``asKeyword`` and ``useRegex`` are retained for pre-PEP8
    compatibility, but will be removed in a future release.

    Example:

    .. testcode::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12 AA=23 B<=AA AA>12"))

    prints:

    .. testoutput::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # merge deprecated pre-PEP8 keyword arguments with the current ones
    useRegex: bool = deprecate_argument(kwargs, "useRegex", True)
    asKeyword: bool = deprecate_argument(kwargs, "asKeyword", False)

    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a str passed for caseless usually means the caller passed multiple
    # string args instead of one space-delimited string - warn if enabled
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "warn_on_multiple_string_args_to_oneof:"
            " More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    # is_equal detects duplicate symbols; masks(a, b) is True when symbol a
    # would match a leading prefix of the longer symbol b
    if caseless:
        is_equal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
    else:
        is_equal = operator.eq
        masks = lambda a, b: b.startswith(a)

    symbols: list[str]
    if isinstance(strs, str_type):
        strs = typing.cast(str, strs)
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    i = 0
    while i < len(symbols) - 1:
        cur = symbols[i]
        for j, other in enumerate(symbols[i + 1 :]):
            if is_equal(other, cur):
                # duplicate of cur - drop it
                del symbols[i + j + 1]
                break
            if len(other) > len(cur) and masks(cur, other):
                # cur would shadow the longer symbol - move the longer one
                # ahead of cur and re-examine position i
                del symbols[i + j + 1]
                symbols.insert(i, other)
                break
        else:
            # no conflicts found for cur; advance
            i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = rf"\b(?:{patt})\b"

            ret = Regex(patt, flags=re_flags)
            ret.set_name(" | ".join(repr(s) for s in symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst of Token class corresponding to caseless
    # and asKeyword settings
    CASELESS = KEYWORD = True
    parse_element_class = {
        (CASELESS, KEYWORD): CaselessKeyword,
        (CASELESS, not KEYWORD): CaselessLiteral,
        (not CASELESS, KEYWORD): Keyword,
        (not CASELESS, not KEYWORD): Literal,
    }[(caseless, asKeyword)]
    return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )
304
305
def dict_of(key: ParserElement, value: ParserElement) -> Dict:
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value. Takes care of
    defining the :class:`Dict`, :class:`ZeroOrMore`, and
    :class:`Group` tokens in the proper order. The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text. The value
    pattern can include named results, so that the :class:`Dict` results
    can include named token fields.

    Example:

    .. doctest::

        >>> text = "shape: SQUARE posn: upper left color: light blue texture: burlap"

        >>> data_word = Word(alphas)
        >>> label = data_word + FollowedBy(':')
        >>> attr_expr = (
        ...     label
        ...     + Suppress(':')
        ...     + OneOrMore(data_word, stop_on=label)
        ...     .set_parse_action(' '.join))
        >>> print(attr_expr[1, ...].parse_string(text).dump())
        ['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']

        >>> attr_label = label
        >>> attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label
        ...     ).set_parse_action(' '.join)

        # similar to Dict, but simpler call format
        >>> result = dict_of(attr_label, attr_value).parse_string(text)
        >>> print(result.dump())
        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: 'light blue'
        - posn: 'upper left'
        - shape: 'SQUARE'
        - texture: 'burlap'
        [0]:
          ['shape', 'SQUARE']
        [1]:
          ['posn', 'upper left']
        [2]:
          ['color', 'light blue']
        [3]:
          ['texture', 'burlap']

        >>> print(result['shape'])
        SQUARE
        >>> print(result.shape)  # object attribute access works too
        SQUARE
        >>> print(result.as_dict())
        {'shape': 'SQUARE', 'posn': 'upper left', 'color': 'light blue', 'texture': 'burlap'}
    """
    # one grouped key/value pair, repeated one or more times, as a Dict
    key_value_entry = Group(key + value)
    return Dict(OneOrMore(key_value_entry))
361
362
def original_text_for(
    expr: ParserElement, as_string: bool = True, **kwargs
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as
    ``False``, then the return value is
    a :class:`ParseResults` containing any results names that
    were originally matched, and a single token containing the original
    matched text from the input string. So if the expression passed to
    :class:`original_text_for` contains expressions with defined
    results names, you must set ``as_string`` to ``False`` if you
    want to preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example:

    .. testcode::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints:

    .. testoutput::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    asString: bool = deprecate_argument(kwargs, "asString", True)
    asString = asString and as_string

    # bracket the expression with zero-width markers that record the
    # start and end locations of the match
    start_marker = Empty().set_parse_action(lambda s, loc, t: loc)
    end_marker = start_marker.copy()
    end_marker.callPreparse = False
    matchExpr = start_marker("_original_start") + expr + end_marker("_original_end")

    if asString:

        def extractText(s, l, t):
            # return the raw slice of the input between the two markers
            return s[t._original_start : t._original_end]

    else:

        def extractText(s, l, t):
            # replace list contents with the raw slice, keeping named results
            t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]

    matchExpr.set_parse_action(extractText)
    matchExpr.ignoreExprs = expr.ignoreExprs
    matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return matchExpr
420
421
def ungroup(expr: ParserElement) -> ParserElement:
    """Helper to undo pyparsing's default grouping of And expressions,
    even if all but one are non-empty.
    """

    def extract_sole_item(t):
        # unwrap the single grouped result
        return t[0]

    return TokenConverter(expr).add_parse_action(extract_sole_item)
427
428
def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    .. deprecated:: 3.0.0
       Use the :class:`Located` class instead. Note that `Located`
       returns results with one less grouping level.

    Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :meth:`ParserElement.parse_with_tabs`
    """
    warnings.warn(
        "'locatedExpr' deprecated - use 'Located'",
        DeprecationWarning,
        stacklevel=2,
    )

    # zero-width marker that yields the current parse location
    loc_marker = Empty().set_parse_action(lambda ss, ll, tt: ll)
    end_marker = loc_marker.copy().leave_whitespace()
    return Group(
        loc_marker("locn_start") + expr("value") + end_marker("locn_end")
    )
459
460
# define special default value to permit None as a significant value for
# ignore_expr: this sentinel marks "argument not given", distinct from an
# explicit None ("ignore nothing") passed by the caller
_NO_IGNORE_EXPR_GIVEN = NoMatch()
464
465
def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
    **kwargs,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    :param opener: str - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression

    :param closer: str - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression

    :param content: expression for items within the nested lists

    :param ignore_expr: expression for ignoring opening and closing delimiters
      (default = :class:`quoted_string`)

    Parameter ``ignoreExpr`` is retained for compatibility
    but will be removed in a future release.

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example:

    .. testcode::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print(f"{func.name} ({func.type}) args: {func.args}")


    prints:

    .. testoutput::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    ignoreExpr: ParserElement = deprecate_argument(
        kwargs, "ignoreExpr", _NO_IGNORE_EXPR_GIVEN
    )

    # reconcile deprecated ignoreExpr kwarg with the current ignore_expr
    # argument; the sentinel distinguishes "not given" from an explicit None
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr  # type: ignore [assignment]

    # neither argument supplied - default to ignoring quoted strings
    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                # single-character delimiters: content is any run of characters
                # that are not delimiters, whitespace, or part of ignoreExpr
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    )
                else:
                    content = Combine(
                        Empty()
                        + CharsNotIn(
                            opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                        )
                    )
            else:
                # multi-character delimiters: scan one character at a time,
                # rejecting positions where a delimiter (or ignoreExpr) begins
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

    # for these internally-created context expressions, simulate whitespace-skipping
    if ParserElement.DEFAULT_WHITE_CHARS:
        content.set_parse_action(
            lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
        )

    # the grammar is recursive: a nested expression may contain further
    # nested expressions between its delimiters
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret
622
623
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions,
    given a tag name (a str, or a ParserElement matching tag names).
    Returns an (openTag, closeTag) pair; HTML mode (xml=False) matches
    tags caselessly and allows unquoted attribute values."""
    if isinstance(tagStr, str_type):
        resname = tagStr
        # HTML tag names are matched caselessly; XML names exactly
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        # XML: attribute values must be double-quoted
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            # trailing "/" marks a self-closing tag; parse action converts
            # the "empty" result to a bool
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        # HTML: values may be quoted or bare (any printables except ">")
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        # attribute names are normalized to lower case,
                        # and the value is optional in HTML
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name(f"<{resname}>")
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name(f"</{resname}>")
    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
681
682
def make_html_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.

    Example:

    .. testcode::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints:

    .. testoutput::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    # xml=False selects HTML semantics (caseless tags, unquoted attributes)
    open_tag, close_tag = _makeTags(tag_str, False)
    return open_tag, close_tag
712
713
def make_xml_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for XML,
    given a tag name. Matches tags only in the given upper/lower case.

    Example: similar to :class:`make_html_tags`
    """
    # xml=True selects strict XML semantics (exact case, quoted attributes)
    open_tag, close_tag = _makeTags(tag_str, True)
    return open_tag, close_tag
723
724
# expressions matching any HTML open/close tag, regardless of tag name
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# map HTML5 entity names (with the trailing ';' stripped) to their characters
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
_most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
    " ", "|"
)
# lazily-built regex: try the most common entities first, then a compressed
# alternation covering every known HTML5 entity name
common_html_entity = Regex(
    lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
).set_name("common HTML entity")
738
739
def replace_html_entity(s, l, t):
    """Helper parser action to replace common HTML entities with their special characters"""
    # look up the named group captured by common_html_entity; None if unknown
    entity_name = t.entity
    return _htmlEntityMap.get(entity_name)
743
744
class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""

    # numeric values are part of the public interface - do not renumber
    LEFT = 1
    RIGHT = 2
751
752
# an operator in infix_notation may be an expression, a string (converted to
# a Literal), or - for ternary operators - a pair of either
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# one precedence level: (op_expr, num_operands, associativity[, parse_action])
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
769
770
def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> Forward:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    :param base_expr: expression representing the most basic operand to
      be used in the expression
    :param op_list: list of tuples, one for each operator precedence level
      in the expression grammar; each tuple is of the form ``(op_expr,
      num_operands, right_left_assoc, (optional)parse_action)``, where:

      - ``op_expr`` is the pyparsing expression for the operator; may also
        be a string, which will be converted to a Literal; if ``num_operands``
        is 3, ``op_expr`` is a tuple of two expressions, for the two
        operators separating the 3 terms
      - ``num_operands`` is the number of terms for this operator (must be 1,
        2, or 3)
      - ``right_left_assoc`` is the indicator whether the operator is right
        or left associative, using the pyparsing-defined constants
        ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
      - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the parse action
        tuple member may be omitted); if the parse action is passed
        a tuple or list of functions, this is equivalent to calling
        ``set_parse_action(*fn)``
        (:class:`ParserElement.set_parse_action`)

    :param lpar: expression for matching left-parentheses; if passed as a
      str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
      an expression (such as ``Literal('(')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses; if passed as a
      str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
      an expression (such as ``Literal(')')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress(')')``)

    Example:

    .. testcode::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            (5+x)*y
            -2--11
            ''', full_dump=False)

    prints:

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE


        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        (5+x)*y
        [[[5, '+', 'x'], '*', 'y']]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}_expression")

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(nested_expr)
    else:
        lastExpr = base_expr | nested_expr

    arity: int
    rightLeftAssoc: OpAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement
    # build up the precedence levels from tightest-binding to loosest;
    # each level's expression falls back to the previous level (lastExpr)
    for operDef in op_list:
        # pad the spec tuple so a missing parse-action slot becomes None
        opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)
        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        # if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret
974
975
# NOTE(review): the mutable default `backup_stacks=[]` is shared across calls —
# this appears deliberate (it accumulates a stack of indentStack snapshots used
# by reset_stack/the cleanup parse action below); do not "fix" without checking
# callers that rely on the shared list.
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    .. deprecated:: 3.0.0
        Use the :class:`IndentedBlock` class instead. Note that `IndentedBlock`
        has a different method signature.

    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    :param blockStatementExpr: expression defining syntax of statement that
        is repeated within the indented block

    :param indentStack: list created by caller to manage indentation stack
        (multiple ``statementWithIndentedBlock`` expressions within a single
        grammar should share a common ``indentStack``)

    :param indent: boolean indicating whether block must be indented beyond
        the current level; set to ``False`` for block of left-most statements

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example:

    .. testcode::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints:

    .. testoutput::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    warnings.warn(
        f"{'indentedBlock'!r} deprecated - use {'IndentedBlock'!r}",
        DeprecationWarning,
        stacklevel=2,
    )

    # snapshot the caller's indent stack so a failed parse can restore it
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # restore the most recent snapshot in place (callers hold a reference
        # to indentStack, so it must be mutated, not rebound)
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        # at end of input there is nothing left to indent-check
        if l >= len(s):
            return
        curCol = col(l, s)
        # a peer statement must start at exactly the current indent level
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        curCol = col(l, s)
        # a sub-block must be indented strictly deeper than the enclosing level;
        # on success the new level is pushed onto the shared stack
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        if l >= len(s):
            return
        curCol = col(l, s)
        # an unindent must land on some previously seen indent level
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        # pop one level only if we actually moved left of the current level
        if curCol < indentStack[-1]:
            indentStack.pop()

    # consume blank/EOL runs without letting them count as indentation
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        # block must be indented deeper than the enclosing statement
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        # left-most block: peers at the current level, optional trailing unindent
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    # on failure, roll the shared indent stack back to the saved snapshot
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # treat backslash-continued lines as a single logical line
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")
1128
1129
# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available
# The regex matches "/*", then any run of non-"*" chars or "*" not followed
# by "/", then the closing "*/" - this avoids greedily spanning two comments.
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

# non-greedy [\s\S]*? spans newlines (no DOTALL flag needed) and stops at
# the first "-->"
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# ".*" stops at the newline (no DOTALL); leave_whitespace() makes it match
# from the current position rather than skipping leading whitespace first
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
# allows backslash-newline line continuations inside the comment
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# alternation of the two patterns above, combined into a single Regex
cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"


# build list of built-in expressions, for future reference if a global default value
# gets updated
# (snapshot of every ParserElement defined at module level up to this point;
# order of the definitions above therefore matters)
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
1159
1160
# compatibility function, superseded by DelimitedList class
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """
    .. deprecated:: 3.1.0
        Use the :class:`DelimitedList` class instead.
    """
    # Thin shim: pass every argument straight through to the replacement class.
    replacement = DelimitedList(
        expr,
        delim,
        combine,
        min,
        max,
        allow_trailing_delim=allow_trailing_delim,
    )
    return replacement
1178
1179
# Compatibility synonyms
# Plain aliases for pre-PEP8 names that carry no deprecation machinery...
# fmt: off
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
# ...while replaced_by_pep8 wraps the target so the old camelCase name still
# works but warns on use.
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# NOTE(review): this rebinds the `delimited_list` function defined above with a
# deprecation wrapper around DelimitedList - presumably intentional (the def is
# kept for its docstring/signature); verify before removing either one.
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on