# helpers.py
import html.entities
import operator
import re
import sys
import typing

from . import __diag__
from .core import *
from .util import (
    _bslash,
    _flatten,
    _escape_regex_range_chars,
    make_compressed_re,
    replaced_by_pep8,
)


#
# global helpers
#
def counted_array(
    expr: ParserElement,
    int_expr: typing.Optional[ParserElement] = None,
    *,
    intExpr: typing.Optional[ParserElement] = None,
) -> ParserElement:
28 """Helper to define a counted list of expressions.
29
30 This helper defines a pattern of the form::
31
32 integer expr expr expr...
33
34 where the leading integer tells how many expr expressions follow.
    The matched tokens are returned as a list of ``expr`` tokens; the
    leading count token is suppressed.

    If ``int_expr`` is specified, it should be a pyparsing expression
    that produces an integer value.

    Examples:

    .. doctest::

        >>> counted_array(Word(alphas)).parse_string('2 ab cd ef')
        ParseResults(['ab', 'cd'], {})

    - In this parser, the leading integer value is given in binary,
      '10' indicating that 2 values are in the array:

      .. doctest::

          >>> binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
          >>> counted_array(Word(alphas), int_expr=binary_constant
          ...     ).parse_string('10 ab cd ef')
          ParseResults(['ab', 'cd'], {})

    - If other fields must be parsed after the count but before the
      list items, give the fields results names and they will
      be preserved in the returned ParseResults:

      .. doctest::

          >>> ppc = pyparsing.common
          >>> count_with_metadata = ppc.integer + Word(alphas)("type")
          >>> typed_array = counted_array(Word(alphanums),
          ...     int_expr=count_with_metadata)("items")
          >>> result = typed_array.parse_string("3 bool True True False")
          >>> print(result.dump())
          ['True', 'True', 'False']
          - items: ['True', 'True', 'False']
          - type: 'bool'
    """
    intExpr = intExpr or int_expr
    array_expr = Forward()

    def count_field_parse_action(s, l, t):
        nonlocal array_expr
        n = t[0]
        array_expr <<= (expr * n) if n else Empty()
        # clear list contents, but keep any named results
        del t[:]

    if intExpr is None:
        intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
    else:
        intExpr = intExpr.copy()
    intExpr.set_name("arrayLen")
    intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
    return (intExpr + array_expr).set_name(f"(len) {expr}...")


def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example:

    .. testcode::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches a previous literal, it will also match the leading
    ``"1:1"`` in ``"1:10"``. If this is not desired, use
    :class:`match_previous_expr`. Do *not* use with packrat parsing
    enabled.
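
    For example (illustrative):

    .. doctest::

        >>> first = Word(nums)
        >>> match_expr = first + ":" + match_previous_literal(first)
        >>> print(match_expr.parse_string("1:10"))
        ['1', ':', '1']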
109 """
110 rep = Forward()
111
112 def copy_token_to_repeater(s, l, t):
113 if not t:
114 rep << Empty()
115 return
116
117 if len(t) == 1:
118 rep << t[0]
119 return
120
121 # flatten t tokens
122 tflat = _flatten(t.as_list())
123 rep << And(Literal(tt) for tt in tflat)
124
125 expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
126 rep.set_name("(prev) " + str(expr))
127 return rep
128
129
130def match_previous_expr(expr: ParserElement) -> ParserElement:
131 """Helper to define an expression that is indirectly defined from
132 the tokens matched in a previous expression, that is, it looks for
133 a 'repeat' of a previous expression. For example:
134
135 .. testcode::
136
137 first = Word(nums)
138 second = match_previous_expr(first)
139 match_expr = first + ":" + second
140
    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, it will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.
146 """
147 rep = Forward()
148 e2 = expr.copy()
149 rep <<= e2
150
151 def copy_token_to_repeater(s, l, t):
152 matchTokens = _flatten(t.as_list())
153
154 def must_match_these_tokens(s, l, t):
155 theseTokens = _flatten(t.as_list())
156 if theseTokens != matchTokens:
157 raise ParseException(
                    s, l, f"Expected {matchTokens}, found {theseTokens}"
                )

        rep.set_parse_action(must_match_these_tokens, callDuringTry=True)

    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    rep.set_name("(prev) " + str(expr))
    return rep


def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    *,
    useRegex: bool = True,
    asKeyword: bool = False,
) -> ParserElement:
177 """Helper to quickly define a set of alternative :class:`Literal` s,
178 and makes sure to do longest-first testing when there is a conflict,
179 regardless of the input order, but returns
180 a :class:`MatchFirst` for best performance.

    :param strs: a string of space-delimited literals, or a collection of
       string literals
    :param caseless: treat all literals as caseless
    :param use_regex: bool - as an optimization, will
       generate a :class:`Regex` object; otherwise, will generate
       a :class:`MatchFirst` object (if ``caseless=True`` or
       ``as_keyword=True``, or if creating a :class:`Regex` raises an exception)
    :param as_keyword: bool - enforce :class:`Keyword`-style matching on the
       generated expressions

    Parameters ``asKeyword`` and ``useRegex`` are retained for pre-PEP8
    compatibility, but will be removed in a future release.

    Example:

    .. testcode::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12 AA=23 B<=AA AA>12"))

    prints:

    .. testoutput::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
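
    With ``caseless=True``, matches are returned in the casing given in
    ``strs`` (illustrative):

    .. doctest::

        >>> bool_literal = one_of("true false", caseless=True, as_keyword=True)
        >>> print(bool_literal.parse_string("TRUE"))
        ['true']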
211 """
212 asKeyword = asKeyword or as_keyword
213 useRegex = useRegex and use_regex
214
215 if (
216 isinstance(caseless, str_type)
217 and __diag__.warn_on_multiple_string_args_to_oneof
218 ):
219 warnings.warn(
220 "warn_on_multiple_string_args_to_oneof:"
221 " More than one string argument passed to one_of, pass"
222 " choices as a list or space-delimited string",
223 stacklevel=2,
224 )
225
226 if caseless:
227 is_equal = lambda a, b: a.upper() == b.upper()
228 masks = lambda a, b: b.upper().startswith(a.upper())
229 else:
230 is_equal = operator.eq
231 masks = lambda a, b: b.startswith(a)
232
233 symbols: list[str]
234 if isinstance(strs, str_type):
235 strs = typing.cast(str, strs)
236 symbols = strs.split()
237 elif isinstance(strs, Iterable):
238 symbols = list(strs)
239 else:
240 raise TypeError("Invalid argument to one_of, expected string or iterable")
241 if not symbols:
242 return NoMatch()
243
244 # reorder given symbols to take care to avoid masking longer choices with shorter ones
245 # (but only if the given symbols are not just single characters)
246 i = 0
247 while i < len(symbols) - 1:
248 cur = symbols[i]
249 for j, other in enumerate(symbols[i + 1 :]):
250 if is_equal(other, cur):
251 del symbols[i + j + 1]
252 break
253 if len(other) > len(cur) and masks(cur, other):
254 del symbols[i + j + 1]
255 symbols.insert(i, other)
256 break
257 else:
258 i += 1
259
260 if useRegex:
261 re_flags: int = re.IGNORECASE if caseless else 0
262
263 try:
264 if all(len(sym) == 1 for sym in symbols):
265 # symbols are just single characters, create range regex pattern
266 patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
267 else:
268 patt = "|".join(re.escape(sym) for sym in symbols)
269
270 # wrap with \b word break markers if defining as keywords
271 if asKeyword:
272 patt = rf"\b(?:{patt})\b"
273
274 ret = Regex(patt, flags=re_flags)
275 ret.set_name(" | ".join(repr(s) for s in symbols))
276
277 if caseless:
278 # add parse action to return symbols as specified, not in random
279 # casing as found in input string
280 symbol_map = {sym.lower(): sym for sym in symbols}
281 ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])
282
283 return ret
284
285 except re.error:
286 warnings.warn(
287 "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
288 )
289
290 # last resort, just use MatchFirst of Token class corresponding to caseless
291 # and asKeyword settings
292 CASELESS = KEYWORD = True
293 parse_element_class = {
294 (CASELESS, KEYWORD): CaselessKeyword,
295 (CASELESS, not KEYWORD): CaselessLiteral,
296 (not CASELESS, KEYWORD): Keyword,
297 (not CASELESS, not KEYWORD): Literal,
298 }[(caseless, asKeyword)]
299 return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
300 " | ".join(symbols)
301 )


def dict_of(key: ParserElement, value: ParserElement) -> Dict:
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value. Takes care of
    defining the :class:`Dict`, :class:`ZeroOrMore`, and
    :class:`Group` tokens in the proper order. The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text. The value
    pattern can include named results, so that the :class:`Dict` results
    can include named token fields.

    Example:

    .. doctest::

        >>> text = "shape: SQUARE posn: upper left color: light blue texture: burlap"

        >>> data_word = Word(alphas)
        >>> label = data_word + FollowedBy(':')
        >>> attr_expr = (
        ...     label
        ...     + Suppress(':')
        ...     + OneOrMore(data_word, stop_on=label)
        ...     .set_parse_action(' '.join))
        >>> print(attr_expr[1, ...].parse_string(text).dump())
        ['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']

        >>> attr_label = label
        >>> attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label
        ...     ).set_parse_action(' '.join)

        >>> # similar to Dict, but simpler call format
        >>> result = dict_of(attr_label, attr_value).parse_string(text)
        >>> print(result.dump())
        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: 'light blue'
        - posn: 'upper left'
        - shape: 'SQUARE'
        - texture: 'burlap'
        [0]:
          ['shape', 'SQUARE']
        [1]:
          ['posn', 'upper left']
        [2]:
          ['color', 'light blue']
        [3]:
          ['texture', 'burlap']

        >>> print(result['shape'])
        SQUARE
        >>> print(result.shape)  # object attribute access works too
        SQUARE
        >>> print(result.as_dict())
        {'shape': 'SQUARE', 'posn': 'upper left', 'color': 'light blue', 'texture': 'burlap'}
    """
    return Dict(OneOrMore(Group(key + value)))


def original_text_for(
    expr: ParserElement, as_string: bool = True, *, asString: bool = True
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as
    ``False``, then the return value is
    a :class:`ParseResults` containing any results names that
    were originally matched, and a single token containing the original
    matched text from the input string. So if the expression passed to
    :class:`original_text_for` contains expressions with defined
    results names, you must set ``as_string`` to ``False`` if you
    want to preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example:

    .. testcode::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints:

    .. testoutput::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
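
    To preserve results names defined within ``expr``, pass
    ``as_string=False`` (illustrative):

    .. doctest::

        >>> tag = Literal("<") + Word(alphas)("tag_name") + ">"
        >>> result = original_text_for(tag, as_string=False).parse_string("<html>")
        >>> print(result[0], result.tag_name)
        <html> html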
398 """
399 asString = asString and as_string
400
401 locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
402 endlocMarker = locMarker.copy()
403 endlocMarker.callPreparse = False
404 matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
405 if asString:
406 extractText = lambda s, l, t: s[t._original_start : t._original_end]
407 else:
408
409 def extractText(s, l, t):
410 t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]
411
412 matchExpr.set_parse_action(extractText)
413 matchExpr.ignoreExprs = expr.ignoreExprs
414 matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
415 return matchExpr
416
417
418def ungroup(expr: ParserElement) -> ParserElement:
419 """Helper to undo pyparsing's default grouping of And expressions,
420 even if all but one are non-empty.
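
    Example (illustrative):

    .. doctest::

        >>> grouped = Group(Word(nums) + Word(alphas))
        >>> print(grouped.parse_string("42 widgets"))
        [['42', 'widgets']]
        >>> print(ungroup(grouped).parse_string("42 widgets"))
        ['42', 'widgets']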
421 """
422 return TokenConverter(expr).add_parse_action(lambda t: t[0])
423
424
425def locatedExpr(expr: ParserElement) -> ParserElement:
426 """
427 .. deprecated:: 3.0.0
428 Use the :class:`Located` class instead.
429
430 Helper to decorate a returned token with its starting and ending
431 locations in the input string.
432
433 This helper adds the following results names:
434
435 - ``locn_start`` - location where matched expression begins
436 - ``locn_end`` - location where matched expression ends
437 - ``value`` - the actual parsed results
438
    Be careful if the input text contains ``<TAB>`` characters; you
    may want to call :meth:`ParserElement.parse_with_tabs`.

    Example:

    .. testcode::

        wd = Word(alphas)
        res = locatedExpr(wd).search_string("ljsdf123lksdjjf123lkkjj1222")
        for match in res:
            print(match)

    prints:

    .. testoutput::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
    return Group(
        locator("locn_start")
        + expr("value")
        + locator.copy().leaveWhitespace()("locn_end")
    )


# define special default value to permit None as a significant value for
# ignore_expr
_NO_IGNORE_EXPR_GIVEN = NoMatch()


def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
    *,
    ignoreExpr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    :param opener: str - opening character for a nested list
       (default= ``"("``); can also be a pyparsing expression

    :param closer: str - closing character for a nested list
       (default= ``")"``); can also be a pyparsing expression

    :param content: expression for items within the nested lists

    :param ignore_expr: expression for ignoring opening and closing delimiters
       (default = :class:`quoted_string`)

    Parameter ``ignoreExpr`` is retained for compatibility
    but will be removed in a future release.

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example:

    .. testcode::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print(f"{func.name} ({func.type}) args: {func.args}")

    prints:

    .. testoutput::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr

    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    )
                else:
                    content = Combine(
                        Empty()
                        + CharsNotIn(
                            opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                        )
                    )
            else:
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

    # for these internally-created context expressions, simulate whitespace-skipping
    if ParserElement.DEFAULT_WHITE_CHARS:
        content.set_parse_action(
            lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
        )

    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret


def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions,
    given a tag name"""
    if isinstance(tagStr, str_type):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name(f"<{resname}>")
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name(f"</{resname}>")
    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag


def make_html_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.

    Example:

    .. testcode::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints:

    .. testoutput::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    return _makeTags(tag_str, False)


def make_xml_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for XML,
    given a tag name. Matches tags only in the given upper/lower case.

    Example: similar to :class:`make_html_tags`
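
    A short illustrative doctest:

    .. doctest::

        >>> title, title_end = make_xml_tags("title")
        >>> expr = title + SkipTo(title_end)("text") + title_end
        >>> print(expr.parse_string("<title>PyParsing</title>").text)
        PyParsing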
724 """
725 return _makeTags(tag_str, True)
726
727
728any_open_tag: ParserElement
729any_close_tag: ParserElement
730any_open_tag, any_close_tag = make_html_tags(
731 Word(alphas, alphanums + "_:").set_name("any tag")
732)
733
734_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
735_most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
736 " ", "|"
737)
738common_html_entity = Regex(
739 lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
740).set_name("common HTML entity")
741
742
743def replace_html_entity(s, l, t):
744 """Helper parser action to replace common HTML entities with their special characters"""
    return _htmlEntityMap.get(t.entity)


class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""

    LEFT = 1
    RIGHT = 2


InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]


def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> Forward:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    :param base_expr: expression representing the most basic operand to
       be used in the expression
    :param op_list: list of tuples, one for each operator precedence level
       in the expression grammar; each tuple is of the form ``(op_expr,
       num_operands, right_left_assoc, (optional)parse_action)``, where:

       - ``op_expr`` is the pyparsing expression for the operator; may also
         be a string, which will be converted to a Literal; if ``num_operands``
         is 3, ``op_expr`` is a tuple of two expressions, for the two
         operators separating the 3 terms
       - ``num_operands`` is the number of terms for this operator (must be 1,
         2, or 3)
       - ``right_left_assoc`` is the indicator whether the operator is right
         or left associative, using the pyparsing-defined constants
         ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
       - ``parse_action`` is the parse action to be associated with
         expressions matching this operator expression (the parse action
         tuple member may be omitted); if the parse action is passed
         a tuple or list of functions, this is equivalent to calling
         ``set_parse_action(*fn)``
         (:class:`ParserElement.set_parse_action`)

    :param lpar: expression for matching left-parentheses; if passed as a
       str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
       an expression (such as ``Literal('(')``), then it will be kept in
       the parsed results, and grouped with them. (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses; if passed as a
       str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
       an expression (such as ``Literal(')')``), then it will be kept in
       the parsed results, and grouped with them. (default= ``Suppress(')')``)

    Example:

    .. testcode::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
                ('-', 1, OpAssoc.RIGHT),
                (one_of('* /'), 2, OpAssoc.LEFT),
                (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            (5+x)*y
            -2--11
            ''', full_dump=False)

    prints:

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        (5+x)*y
        [[[5, '+', 'x'], '*', 'y']]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}_expression")

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(nested_expr)
    else:
        lastExpr = base_expr | nested_expr

    arity: int
    rightLeftAssoc: opAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement
    for operDef in op_list:
        opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)
        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        # if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        thisExpr <<= (matchExpr | lastExpr).setName(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret


def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    .. deprecated:: 3.0.0
        Use the :class:`IndentedBlock` class instead.

    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    :param blockStatementExpr: expression defining syntax of statement that
       is repeated within the indented block

    :param indentStack: list created by caller to manage indentation stack
       (multiple ``statementWithIndentedBlock`` expressions within a single
       grammar should share a common ``indentStack``)

    :param indent: boolean indicating whether block must be indented beyond
       the current level; set to ``False`` for block of left-most statements

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example:

    .. testcode::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints:

    .. testoutput::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    backup_stacks.append(indentStack[:])

    def reset_stack():
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")


# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"


# build list of built-in expressions, for future reference if a global default value
# gets updated
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]

# compatibility function, superseded by DelimitedList class
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """
    .. deprecated:: 3.1.0
        Use the :class:`DelimitedList` class instead.
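
    Example (illustrative; delimiters are suppressed by default):

    .. doctest::

        >>> delimited_list(Word(alphas)).parse_string("aa, bb, cc")
        ParseResults(['aa', 'bb', 'cc'], {})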
1170 """
1171 return DelimitedList(
1172 expr, delim, combine, min, max, allow_trailing_delim=allow_trailing_delim
1173 )
1174
1175
1176# Compatibility synonyms
1177# fmt: off
1178opAssoc = OpAssoc
1179anyOpenTag = any_open_tag
1180anyCloseTag = any_close_tag
1181commonHTMLEntity = common_html_entity
1182cStyleComment = c_style_comment
1183htmlComment = html_comment
1184restOfLine = rest_of_line
1185dblSlashComment = dbl_slash_comment
1186cppStyleComment = cpp_style_comment
1187javaStyleComment = java_style_comment
1188pythonStyleComment = python_style_comment
1189delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
1190delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
1191countedArray = replaced_by_pep8("countedArray", counted_array)
1192matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
1193matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
1194oneOf = replaced_by_pep8("oneOf", one_of)
1195dictOf = replaced_by_pep8("dictOf", dict_of)
1196originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
1197nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
1198makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
1199makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
1200replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
1201infixNotation = replaced_by_pep8("infixNotation", infix_notation)
1202# fmt: on