Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pyparsing/helpers.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# helpers.py
2import html.entities
3import operator
4import re
5import sys
6import typing
8from . import __diag__
9from .core import *
10from .util import (
11 _bslash,
12 _flatten,
13 _escape_regex_range_chars,
14 make_compressed_re,
15 replaced_by_pep8,
16)
def _suppression(expr: Union[ParserElement, str]) -> ParserElement:
    """Return *expr* wrapped in a :class:`Suppress`, unless it already is one.

    Internal helper used when building delimiter expressions, so that a
    caller-supplied ``Suppress`` is not nested inside a second ``Suppress``.
    """
    return expr if isinstance(expr, Suppress) else Suppress(expr)
26#
27# global helpers
28#
def counted_array(
    expr: ParserElement, int_expr: typing.Optional[ParserElement] = None, **kwargs
) -> ParserElement:
    """Helper to define a counted list of expressions.

    This helper defines a pattern of the form::

        integer expr expr expr...

    where the leading integer tells how many expr expressions follow.
    The matched tokens returns the array of expr tokens as a list - the
    leading count token is suppressed.

    If ``int_expr`` is specified, it should be a pyparsing expression
    that produces an integer value.

    Examples:

    .. doctest::

        >>> counted_array(Word(alphas)).parse_string('2 ab cd ef')
        ParseResults(['ab', 'cd'], {})

    - In this parser, the leading integer value is given in binary,
      '10' indicating that 2 values are in the array:

    .. doctest::

        >>> binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
        >>> counted_array(Word(alphas), int_expr=binary_constant
        ... ).parse_string('10 ab cd ef')
        ParseResults(['ab', 'cd'], {})

    - If other fields must be parsed after the count but before the
      list items, give the fields results names and they will
      be preserved in the returned ParseResults:

    .. doctest::

        >>> ppc = pyparsing.common
        >>> count_with_metadata = ppc.integer + Word(alphas)("type")
        >>> typed_array = counted_array(Word(alphanums),
        ...     int_expr=count_with_metadata)("items")
        >>> result = typed_array.parse_string("3 bool True True False")
        >>> print(result.dump())
        ['True', 'True', 'False']
        - items: ['True', 'True', 'False']
        - type: 'bool'
    """
    # pre-PEP8 "intExpr" keyword is still accepted (deprecated); when supplied
    # it takes precedence over the positional/keyword int_expr
    intExpr: typing.Optional[ParserElement] = deprecate_argument(
        kwargs, "intExpr", None
    )
    intExpr = intExpr or int_expr

    # placeholder for the repeated-expr portion; the count's parse action
    # fills it in once the count value is known
    array_expr = Forward()

    def count_field_parse_action(s, l, t):
        # rebind array_expr to exactly n copies of expr (or Empty for n == 0)
        nonlocal array_expr
        n = t[0]
        array_expr <<= (expr * n) if n else Empty()
        # clear list contents, but keep any named results
        del t[:]

    if intExpr is None:
        # default count expression: a run of digits converted to int
        intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
    else:
        # copy so the caller's expression is not mutated below
        intExpr = intExpr.copy()
    intExpr.set_name("arrayLen")
    # call_during_try=True so the Forward is populated even during lookaheads
    intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
    return (intExpr + array_expr).set_name(f"(len) {expr}...")
def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

    .. testcode::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches a previous literal, will also match the leading
    ``"1:1"`` in ``"1:10"``. If this is not desired, use
    :class:`match_previous_expr`. Do *not* use with packrat parsing
    enabled.
    """
    # placeholder re-targeted each time expr matches
    rep = Forward()

    def copy_token_to_repeater(s, l, t):
        # no tokens matched -> repeat expression matches nothing
        if not t:
            rep << Empty()
            return

        # single token -> repeat it as a literal string
        if len(t) == 1:
            rep << t[0]
            return

        # flatten t tokens
        tflat = _flatten(t.as_list())
        rep << And(Literal(tt) for tt in tflat)

    # call_during_try=True so rep is updated even inside lookaheads/alternatives
    expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
    rep.set_name("(prev) " + str(expr))
    return rep
def match_previous_expr(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example:

    .. testcode::

        first = Word(nums)
        second = match_previous_expr(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.
    """
    # rep re-parses with a copy of expr, then verifies the parsed tokens
    # equal those from the earlier match
    rep = Forward()
    e2 = expr.copy()
    rep <<= e2

    def copy_token_to_repeater(s, l, t):
        # capture the flattened tokens from the first match
        matchTokens = _flatten(t.as_list())

        def must_match_these_tokens(s, l, t):
            theseTokens = _flatten(t.as_list())
            if theseTokens != matchTokens:
                # fixed: original message lacked the space after "found",
                # producing e.g. "Expected ['1'], found['10']"
                raise ParseException(
                    s, l, f"Expected {matchTokens}, found {theseTokens}"
                )

        rep.set_parse_action(must_match_these_tokens, call_during_try=True)

    # call_during_try=True so the comparison action is installed even
    # when expr matches inside a lookahead
    expr.add_parse_action(copy_token_to_repeater, call_during_try=True)
    rep.set_name("(prev) " + str(expr))
    return rep
def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    **kwargs,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    :param strs: a string of space-delimited literals, or a collection of
      string literals
    :param caseless: treat all literals as caseless
    :param use_regex: bool - as an optimization, will
      generate a :class:`Regex` object; otherwise, will generate
      a :class:`MatchFirst` object (if ``caseless=True`` or
      ``as_keyword=True``, or if creating a :class:`Regex` raises an exception)
    :param as_keyword: bool - enforce :class:`Keyword`-style matching on the
      generated expressions

    Parameters ``asKeyword`` and ``useRegex`` are retained for pre-PEP8
    compatibility, but will be removed in a future release.

    Example:

    .. testcode::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12  AA=23 B<=AA AA>12"))

    prints:

    .. testoutput::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # pre-PEP8 keyword arguments are accepted but deprecated; merge them
    # with the current parameter names
    useRegex: bool = deprecate_argument(kwargs, "useRegex", True)
    asKeyword: bool = deprecate_argument(kwargs, "asKeyword", False)
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a string passed as the second positional arg usually means the caller
    # meant one_of("a b c"), not one_of("a", "b") - warn if diagnostics enabled
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "warn_on_multiple_string_args_to_oneof:"
            " More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    # comparison helpers: is_equal detects duplicates, masks detects when a
    # shorter symbol is a prefix of (and would shadow) a longer one
    if caseless:
        is_equal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
    else:
        is_equal = operator.eq
        masks = lambda a, b: b.startswith(a)

    # normalize strs to a list of symbol strings
    symbols: list[str]
    if isinstance(strs, str_type):
        strs = typing.cast(str, strs)
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    i = 0
    while i < len(symbols) - 1:
        cur = symbols[i]
        for j, other in enumerate(symbols[i + 1 :]):
            if is_equal(other, cur):
                # drop exact duplicate, re-examine current position
                del symbols[i + j + 1]
                break
            if len(other) > len(cur) and masks(cur, other):
                # move the longer, masked symbol ahead of the shorter prefix
                del symbols[i + j + 1]
                symbols.insert(i, other)
                break
        else:
            i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = rf"\b(?:{patt})\b"

            ret = Regex(patt, flags=re_flags)
            ret.set_name(" | ".join(repr(s) for s in symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            # fall through to the MatchFirst construction below
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst of Token class corresponding to caseless
    # and asKeyword settings
    CASELESS = KEYWORD = True
    parse_element_class = {
        (CASELESS, KEYWORD): CaselessKeyword,
        (CASELESS, not KEYWORD): CaselessLiteral,
        (not CASELESS, KEYWORD): Keyword,
        (not CASELESS, not KEYWORD): Literal,
    }[(caseless, asKeyword)]
    return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )
def dict_of(key: ParserElement, value: ParserElement) -> Dict:
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value. Takes care of
    defining the :class:`Dict`, :class:`OneOrMore`, and
    :class:`Group` tokens in the proper order. The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text. The value
    pattern can include named results, so that the :class:`Dict` results
    can include named token fields.

    Example:

    .. doctest::

        >>> text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        >>> data_word = Word(alphas)
        >>> label = data_word + FollowedBy(':')
        >>> attr_expr = (
        ...     label
        ...     + Suppress(':')
        ...     + OneOrMore(data_word, stop_on=label)
        ...     .set_parse_action(' '.join))
        >>> print(attr_expr[1, ...].parse_string(text).dump())
        ['shape', 'SQUARE', 'posn', 'upper left', 'color', 'light blue', 'texture', 'burlap']

        >>> attr_label = label
        >>> attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label
        ...     ).set_parse_action(' '.join)

        # similar to Dict, but simpler call format
        >>> result = dict_of(attr_label, attr_value).parse_string(text)
        >>> print(result.dump())
        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: 'light blue'
        - posn: 'upper left'
        - shape: 'SQUARE'
        - texture: 'burlap'
        [0]:
          ['shape', 'SQUARE']
        [1]:
          ['posn', 'upper left']
        [2]:
          ['color', 'light blue']
        [3]:
          ['texture', 'burlap']

        >>> print(result['shape'])
        SQUARE
        >>> print(result.shape)  # object attribute access works too
        SQUARE
        >>> print(result.as_dict())
        {'shape': 'SQUARE', 'posn': 'upper left', 'color': 'light blue', 'texture': 'burlap'}
    """
    # each Group becomes one [key, value...] entry; Dict adds named access
    return Dict(OneOrMore(Group(key + value)))
def original_text_for(
    expr: ParserElement, as_string: bool = True, **kwargs
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as
    ``False``, then the return value is
    a :class:`ParseResults` containing any results names that
    were originally matched, and a single token containing the original
    matched text from the input string. So if the expression passed to
    :class:`original_text_for` contains expressions with defined
    results names, you must set ``as_string`` to ``False`` if you
    want to preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example:

    .. testcode::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints:

    .. testoutput::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    # deprecated pre-PEP8 kwarg; either spelling may disable string mode
    asString: bool = deprecate_argument(kwargs, "asString", True)
    asString = asString and as_string

    # zero-width markers that record the current parse location
    locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
    endlocMarker = locMarker.copy()
    # don't skip leading whitespace before recording the end location
    endlocMarker.callPreparse = False
    matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
    if asString:
        # return just the matched slice of the input string
        extractText = lambda s, l, t: s[t._original_start : t._original_end]
    else:

        def extractText(s, l, t):
            # replace tokens with the raw slice, dropping the marker names
            # but keeping any other named results
            t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]

    matchExpr.set_parse_action(extractText)
    # inherit ignore expressions so skipped content is handled the same way
    matchExpr.ignoreExprs = expr.ignoreExprs
    matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return matchExpr
def ungroup(expr: ParserElement) -> ParserElement:
    """Helper to undo pyparsing's default grouping of And expressions,
    even if all but one are non-empty.
    """

    def _unwrap_first(t):
        # promote the single grouped result to the top level
        return t[0]

    return TokenConverter(expr).add_parse_action(_unwrap_first)
def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    .. deprecated:: 3.0.0
        Use the :class:`Located` class instead. Note that `Located`
        returns results with one less grouping level.

    Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :meth:`ParserElement.parse_with_tabs`
    """
    warnings.warn(
        "'locatedExpr' deprecated - use 'Located'",
        DeprecationWarning,
        stacklevel=2,
    )
    # zero-width expression that yields the current parse location
    locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
    end_locator = locator.copy().leave_whitespace()
    return Group(locator("locn_start") + expr("value") + end_locator("locn_end"))
# define special default value to permit None as a significant value for
# ignore_expr; nested_expr uses identity checks against this sentinel to
# tell "argument omitted" apart from an explicit ignore_expr=None
_NO_IGNORE_EXPR_GIVEN = NoMatch()
def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
    **kwargs,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    :param opener: str - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression

    :param closer: str - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression

    :param content: expression for items within the nested lists

    :param ignore_expr: expression for ignoring opening and closing delimiters
      (default = :class:`quoted_string`)

    Parameter ``ignoreExpr`` is retained for compatibility
    but will be removed in a future release.

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example:

    .. testcode::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print(f"{func.name} ({func.type}) args: {func.args}")

    prints:

    .. testoutput::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # deprecated pre-PEP8 kwarg; reconcile with ignore_expr - an explicitly
    # supplied ignoreExpr wins, otherwise use ignore_expr
    ignoreExpr: ParserElement = deprecate_argument(
        kwargs, "ignoreExpr", _NO_IGNORE_EXPR_GIVEN
    )
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr  # type: ignore [assignment]
    # neither argument supplied -> default to ignoring quoted strings
    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()
    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")
    if content is None:
        # build a default content expression from the delimiter characters;
        # only possible when both delimiters are plain strings
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                # single-char delimiters: content is any run of characters
                # that are not delimiters or whitespace
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    )
                else:
                    content = Combine(
                        Empty()
                        + CharsNotIn(
                            opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                        )
                    )
            else:
                # multi-char delimiters: must test for the delimiter literals
                # with negative lookaheads, one character at a time
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

    # for these internally-created content expressions, simulate whitespace-skipping
    if ParserElement.DEFAULT_WHITE_CHARS:
        content.set_parse_action(
            lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
        )

    # recursive structure: opener, then any mix of ignored text, nested
    # groups, or content, then closer
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            _suppression(opener)
            + ZeroOrMore(ignoreExpr | ret | content)
            + _suppression(closer)
        )
    else:
        ret <<= Group(
            _suppression(opener) + ZeroOrMore(ret | content) + _suppression(closer)
        )

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions,
    given a tag name.

    ``suppress_LT``/``suppress_GT`` are module-level default expressions
    shared across calls (deliberate - Suppress instances are reusable).
    Returns an ``(openTag, closeTag)`` pair. XML tags match case-sensitively
    and only allow double-quoted attribute values; HTML tags match
    caselessly, lower-case their attribute names, and allow quoted or
    unquoted attribute values.
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        # HTML tag names match caselessly; XML tag names are case-sensitive
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            # trailing "/" marks a self-closing ("empty") element
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        # HTML attribute names are normalized to lower case,
                        # and the value may be omitted entirely
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name(f"<{resname}>")
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name(f"</{resname}>")
    openTag.tag = resname
    closeTag.tag = resname
    # convenience expression for everything between this tag pair
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
def make_html_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.

    Example:

    .. testcode::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints:

    .. testoutput::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    # delegate to the shared builder in HTML (caseless) mode
    return _makeTags(tag_str, xml=False)
def make_xml_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for XML,
    given a tag name. Matches tags only in the given upper/lower case.

    Example: similar to :class:`make_html_tags`
    """
    # delegate to the shared builder in XML (case-sensitive) mode
    return _makeTags(tag_str, xml=True)
# expressions matching any HTML start/end tag, regardless of tag name
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# HTML5 entity names (minus their trailing ';') mapped to replacement text
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
_most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
    " ", "|"
)
# expression for "&name;" entity references; the pattern is supplied as a
# callable, presumably so the make_compressed_re() work is deferred until
# the Regex is first used - TODO confirm Regex's lazy-pattern behavior
common_html_entity = Regex(
    lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
).set_name("common HTML entity")
def replace_html_entity(s, l, t):
    """Helper parser action to replace common HTML entities with their special characters"""
    # look up the named "entity" result; .get returns None for unknown
    # entities (NOTE(review): a None return from a parse action leaves the
    # tokens unchanged - verify that is the intent here)
    return _htmlEntityMap.get(t.entity)
class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""

    # left-associative: a op b op c parses as (a op b) op c
    LEFT = 1
    # right-associative: a op b op c parses as a op (b op c)
    RIGHT = 2
# operator expression for one precedence level: a ParserElement or literal
# string, or (for ternary operators) a pair of expressions/strings
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# one entry of infix_notation's op_list:
# (op_expr, num_operands, associativity[, parse_action])
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> Forward:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    :param base_expr: expression representing the most basic operand to
      be used in the expression
    :param op_list: list of tuples, one for each operator precedence level
      in the expression grammar; each tuple is of the form ``(op_expr,
      num_operands, right_left_assoc, (optional)parse_action)``, where:

      - ``op_expr`` is the pyparsing expression for the operator; may also
        be a string, which will be converted to a Literal; if ``num_operands``
        is 3, ``op_expr`` is a tuple of two expressions, for the two
        operators separating the 3 terms
      - ``num_operands`` is the number of terms for this operator (must be 1,
        2, or 3)
      - ``right_left_assoc`` is the indicator whether the operator is right
        or left associative, using the pyparsing-defined constants
        ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
      - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the parse action
        tuple member may be omitted); if the parse action is passed
        a tuple or list of functions, this is equivalent to calling
        ``set_parse_action(*fn)``
        (:class:`ParserElement.set_parse_action`)

    :param lpar: expression for matching left-parentheses; if passed as a
      str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
      an expression (such as ``Literal('(')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress('(')``)
    :param rpar: expression for matching right-parentheses; if passed as a
      str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
      an expression (such as ``Literal(')')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress(')')``)

    Example:

    .. testcode::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            (5+x)*y
            -2--11
            ''', full_dump=False)

    prints:

    .. testoutput::
        :options: +NORMALIZE_WHITESPACE

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        (5+x)*y
        [[[5, '+', 'x'], '*', 'y']]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    # string delimiters are suppressed; expression delimiters are kept
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}_expression")

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(nested_expr)
    else:
        lastExpr = base_expr | nested_expr

    arity: int
    rightLeftAssoc: OpAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement
    # build precedence levels bottom-up: each level's expression falls back
    # to the previous (tighter-binding) level's expression
    for operDef in op_list:
        # pad with None so the optional parse-action slot is always present
        opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)
        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                # postfix unary: operand followed by one or more operators
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    # no operator expression: adjacent operands (juxtaposition)
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        # if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    .. deprecated:: 3.0.0
        Use the :class:`IndentedBlock` class instead. Note that `IndentedBlock`
        has a different method signature.

    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    :param blockStatementExpr: expression defining syntax of statement that
        is repeated within the indented block

    :param indentStack: list created by caller to manage indentation stack
        (multiple ``statementWithIndentedBlock`` expressions within a single
        grammar should share a common ``indentStack``)

    :param indent: boolean indicating whether block must be indented beyond
        the current level; set to ``False`` for block of left-most statements

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example:

    .. testcode::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints:

    .. testoutput::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    warnings.warn(
        f"{'indentedBlock'!r} deprecated - use {'IndentedBlock'!r}",
        DeprecationWarning,
        stacklevel=2,
    )

    # NOTE: the mutable default argument for backup_stacks is deliberate shared
    # state - every call records a snapshot of the caller's indentStack so a
    # failed parse can roll it back (see reset_stack and set_fail_action below).
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # restore indentStack to the snapshot taken when this block was built
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        # parse action: a following statement must start at the current level;
        # deeper is "illegal nesting", shallower is "not a peer entry"
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        # parse action: the block body must be indented deeper than the current
        # level; push the new indent column onto the stack
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        # parse action: a dedent must land on some previously seen indent
        # column; pop one level off the stack when returning outward
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    # line ends (with any trailing tabs/spaces) are consumed silently
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    # INDENT/PEER/UNDENT are zero-width markers whose parse actions enforce
    # the indentation discipline defined above
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        # left-most statements: no INDENT required, and UNDENT is optional
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # a backslash immediately before a line end continues the statement
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")
# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# matches everything to (but not including) the next newline; leave_whitespace
# keeps any leading spaces on the line instead of skipping them
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")

# the (?:\\\n|[^\n]) alternation lets a ``//`` comment be continued onto the
# next line with a trailing backslash
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"


# build list of built-in expressions, for future reference if a global default value
# gets updated
# NOTE: this snapshot captures every ParserElement defined at module level up
# to this point, so it must remain below the expression definitions above
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
# compatibility function, superseded by DelimitedList class
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """
    Build a delimited-list expression by forwarding all arguments to
    :class:`DelimitedList`.

    .. deprecated:: 3.1.0
        Use the :class:`DelimitedList` class instead.
    """
    delimited = DelimitedList(
        expr,
        delim,
        combine,
        min,
        max,
        allow_trailing_delim=allow_trailing_delim,
    )
    return delimited
# Compatibility synonyms
# fmt: off
# Pre-PEP8 camelCase names, kept so that legacy client code keeps working.
# Expression instances get plain aliases; callables are wrapped with
# replaced_by_pep8 (presumably to flag use of the old name at call time -
# see util.replaced_by_pep8 for the exact behavior).
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# NOTE: this rebinds the delimited_list function defined earlier in this module
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on