Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pyparsing/helpers.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

333 statements  

1# helpers.py 

2import html.entities 

3import operator 

4import re 

5import sys 

6import typing 

7 

8from . import __diag__ 

9from .core import * 

10from .util import ( 

11 _bslash, 

12 _flatten, 

13 _escape_regex_range_chars, 

14 make_compressed_re, 

15 replaced_by_pep8, 

16) 

17 

18 

19# 

20# global helpers 

21# 

def counted_array(
    expr: ParserElement,
    int_expr: typing.Optional[ParserElement] = None,
    *,
    intExpr: typing.Optional[ParserElement] = None,
) -> ParserElement:
    """Helper to define a counted list of expressions.

    Defines a pattern of the form::

        integer expr expr expr...

    where the leading integer tells how many expr expressions follow.
    The matched tokens return the array of expr tokens as a list - the
    leading count token is suppressed.

    If ``int_expr`` is specified, it should be a pyparsing expression
    that produces an integer value. (``intExpr`` is the retained
    pre-PEP8 spelling of the same argument.)

    Example::

        counted_array(Word(alphas)).parse_string('2 ab cd ef')  # -> ['ab', 'cd']

        # in this parser, the leading integer value is given in binary,
        # '10' indicating that 2 values are in the array
        binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
        counted_array(Word(alphas), int_expr=binary_constant).parse_string('10 ab cd ef')  # -> ['ab', 'cd']

        # if other fields must be parsed after the count but before the
        # list items, give the fields results names and they will
        # be preserved in the returned ParseResults
    """
    # honor either spelling of the count-expression argument
    count_expr = intExpr or int_expr
    contents = Forward()

    def set_contents_from_count(s, l, t):
        nonlocal contents
        count = t[0]
        # rebind the Forward each time a count is parsed, so the following
        # tokens are matched exactly `count` times (or an empty match for 0)
        contents <<= (expr * count) if count else Empty()
        # clear list contents, but keep any named results
        del t[:]

    # default count expression is a decimal integer; a user-supplied one is
    # copied so the caller's expression is not modified
    count_expr = (
        Word(nums).set_parse_action(lambda t: int(t[0]))
        if count_expr is None
        else count_expr.copy()
    )
    count_expr.set_name("arrayLen")
    count_expr.add_parse_action(set_contents_from_count, call_during_try=True)
    return (count_expr + contents).set_name(f"(len) {expr}...")

80 

81 

def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that matches a literal repeat of
    whatever text a previous expression matched. For example::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this matches a
    previous literal, it will also match the leading ``"1:1"`` in
    ``"1:10"``. If this is not desired, use :class:`match_previous_expr`.
    Do *not* use with packrat parsing enabled.
    """
    repeater = Forward()

    def record_matched_tokens(s, l, t):
        # mirror the just-matched tokens into the repeater expression
        if not t:
            repeater << Empty()
        elif len(t) == 1:
            repeater << t[0]
        else:
            # flatten nested results and require each piece literally, in order
            flattened = _flatten(t.as_list())
            repeater << And(Literal(piece) for piece in flattened)

    expr.add_parse_action(record_matched_tokens, callDuringTry=True)
    repeater.set_name("(prev) " + str(expr))
    return repeater

115 

116 

def match_previous_expr(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

        first = Word(nums)
        second = match_previous_expr(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.
    """
    rep = Forward()
    # parse with a copy of the original expression, then verify the tokens
    e2 = expr.copy()
    rep <<= e2

    def copy_token_to_repeater(s, l, t):
        matchTokens = _flatten(t.as_list())

        def must_match_these_tokens(s, l, t):
            theseTokens = _flatten(t.as_list())
            if theseTokens != matchTokens:
                # BUGFIX: message previously read "found['x']" with no space
                raise ParseException(
                    s, l, f"Expected {matchTokens}, found {theseTokens}"
                )

        # install a fresh verification action capturing the latest match
        rep.set_parse_action(must_match_these_tokens, callDuringTry=True)

    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    rep.set_name("(prev) " + str(expr))
    return rep

151 

152 

def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    *,
    useRegex: bool = True,
    asKeyword: bool = False,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    Parameters:

    - ``strs`` - a string of space-delimited literals, or a collection of
      string literals
    - ``caseless`` - treat all literals as caseless - (default= ``False``)
    - ``use_regex`` - as an optimization, will
      generate a :class:`Regex` object; otherwise, will generate
      a :class:`MatchFirst` object (if ``caseless=True`` or ``as_keyword=True``, or if
      creating a :class:`Regex` raises an exception) - (default= ``True``)
    - ``as_keyword`` - enforce :class:`Keyword`-style matching on the
      generated expressions - (default= ``False``)
    - ``asKeyword`` and ``useRegex`` are retained for pre-PEP8 compatibility,
      but will be removed in a future release

    Example::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12 AA=23 B<=AA AA>12"))

    prints::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # reconcile PEP8 and legacy argument names: either spelling may enable
    # keyword matching; either spelling may disable regex generation
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a str in the `caseless` slot suggests the caller passed several strings
    # positionally instead of one space-delimited string
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "warn_on_multiple_string_args_to_oneof:"
            " More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    # comparison helpers used by the reordering pass below:
    # is_equal detects duplicate symbols; masks(a, b) is True when symbol a
    # would shadow the longer symbol b in a first-match-wins alternation
    if caseless:
        is_equal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
    else:
        is_equal = operator.eq
        masks = lambda a, b: b.startswith(a)

    symbols: list[str]
    if isinstance(strs, str_type):
        strs = typing.cast(str, strs)
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    # no symbols -> an expression that never matches
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    i = 0
    while i < len(symbols) - 1:
        cur = symbols[i]
        for j, other in enumerate(symbols[i + 1 :]):
            if is_equal(other, cur):
                # drop the duplicate and re-examine position i
                del symbols[i + j + 1]
                break
            if len(other) > len(cur) and masks(cur, other):
                # move the longer, masked symbol ahead of the shorter one
                del symbols[i + j + 1]
                symbols.insert(i, other)
                break
        else:
            # no change made at position i - advance
            i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = rf"\b(?:{patt})\b"

            ret = Regex(patt, flags=re_flags)
            ret.set_name(" | ".join(re.escape(s) for s in symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            # fall through to the MatchFirst construction below
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst of Token class corresponding to caseless
    # and asKeyword settings
    CASELESS = KEYWORD = True
    parse_element_class = {
        (CASELESS, KEYWORD): CaselessKeyword,
        (CASELESS, not KEYWORD): CaselessLiteral,
        (not CASELESS, KEYWORD): Keyword,
        (not CASELESS, not KEYWORD): Literal,
    }[(caseless, asKeyword)]
    return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )

284 

285 

def dict_of(key: ParserElement, value: ParserElement) -> Dict:
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value. Takes care of
    defining the :class:`Dict`, :class:`ZeroOrMore`, and
    :class:`Group` tokens in the proper order. The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text. The value
    pattern can include named results, so that the :class:`Dict` results
    can include named token fields.

    Example::

        text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        attr_label = label
        attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join)

        # similar to Dict, but simpler call format
        result = dict_of(attr_label, attr_value).parse_string(text)
        print(result.dump())
        print(result['shape'])
        print(result.shape)  # object attribute access works too
        print(result.as_dict())

    prints::

        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: 'light blue'
        - posn: 'upper left'
        - shape: 'SQUARE'
        - texture: 'burlap'
        SQUARE
        SQUARE
        {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
    """
    # each grouped key/value pair becomes one Dict entry
    key_value_pair = Group(key + value)
    return Dict(OneOrMore(key_value_pair))

324 

325 

def original_text_for(
    expr: ParserElement, as_string: bool = True, *, asString: bool = True
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as ``False``, then
    the return value is a :class:`ParseResults` containing any results
    names that were originally matched, and a single token containing
    the original matched text from the input string. So if the
    expression passed to :class:`original_text_for` contains expressions
    with defined results names, you must set ``as_string`` to ``False``
    if you want to preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    # either spelling of the flag being False selects the ParseResults form
    asString = asString and as_string

    # bracket the expression with zero-width markers that record locations
    start_marker = Empty().set_parse_action(lambda s, loc, t: loc)
    end_marker = start_marker.copy()
    end_marker.callPreparse = False
    wrapped = start_marker("_original_start") + expr + end_marker("_original_end")

    if asString:

        def extract_text(s, l, t):
            # return just the original slice of the input string
            return s[t._original_start : t._original_end]

    else:

        def extract_text(s, l, t):
            # replace list contents with the original slice, popping the
            # marker names so only caller-defined names remain
            t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]

    wrapped.set_parse_action(extract_text)
    wrapped.ignoreExprs = expr.ignoreExprs
    wrapped.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return wrapped

377 

378 

def ungroup(expr: ParserElement) -> ParserElement:
    """Helper to undo pyparsing's default grouping of And expressions,
    even if all but one are non-empty.
    """
    # wrap in a TokenConverter whose parse action lifts out the single
    # grouped item; add_parse_action returns the converter itself
    converter = TokenConverter(expr)
    return converter.add_parse_action(lambda toks: toks[0])

384 

385 

def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    (DEPRECATED - future code should use the :class:`Located` class)
    Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :class:`ParserElement.parse_with_tabs`

    Example::

        wd = Word(alphas)
        for match in locatedExpr(wd).search_string("ljsdf123lksdjjf123lkkjj1222"):
            print(match)

    prints::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    # zero-width expression whose parse action reports the current location
    loc_recorder = Empty().set_parse_action(lambda ss, ll, tt: ll)
    start = loc_recorder("locn_start")
    body = expr("value")
    # end marker must not skip whitespace, or it would report past the match
    end = loc_recorder.copy().leaveWhitespace()("locn_end")
    return Group(start + body + end)

419 

420 

# define special default value to permit None as a significant value for
# ignore_expr; this NoMatch sentinel distinguishes "argument not supplied"
# from an explicit ignore_expr=None in nested_expr below
_NO_IGNORE_EXPR_GIVEN = NoMatch()

424 

425 

def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: ParserElement = _NO_IGNORE_EXPR_GIVEN,
    *,
    ignoreExpr: ParserElement = _NO_IGNORE_EXPR_GIVEN,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    Parameters:

    - ``opener`` - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression
    - ``closer`` - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression
    - ``content`` - expression for items within the nested lists
      (default= ``None``)
    - ``ignore_expr`` - expression for ignoring opening and closing delimiters
      (default= :class:`quoted_string`)
    - ``ignoreExpr`` - this pre-PEP8 argument is retained for compatibility
      but will be removed in a future release

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print("%(name)s (%(type)s) args: %(args)s" % func)


    prints::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # reconcile PEP8 and legacy argument names: if only ignore_expr was
    # supplied, adopt it; an explicitly-passed legacy ignoreExpr wins
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr

    # neither spelling supplied - default to skipping quoted strings
    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        # build a default content expression: runs of non-whitespace text
        # that are not delimiters (and not part of an ignored expression)
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                # single-char delimiters can simply be excluded from the
                # content character set
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    )
                else:
                    content = Combine(
                        Empty()
                        + CharsNotIn(
                            opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                        )
                    )
            else:
                # multi-char delimiters require negative lookahead per character
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

    # for these internally-created context expressions, simulate whitespace-skipping
    if ParserElement.DEFAULT_WHITE_CHARS:
        content.set_parse_action(
            lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
        )

    # recursive definition: a group of opener + (ignored | nested | content)* + closer
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret

574 

575 

def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions, given a tag name.

    Returns a 2-tuple ``(openTag, closeTag)``. NOTE(review): the default
    ``Suppress`` arguments are ParserElement instances created once at
    definition time and shared across calls; they are only combined into
    new expressions here, never mutated.
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        # HTML tags match caselessly; XML tags match in the given case only
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        # XML: attribute values must be double-quoted and '=' is required
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            # 'empty' is True for self-closing tags such as <br/>
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        # HTML: values may be quoted or unquoted; attributes may appear bare
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        # HTML attribute names are normalized to lower case
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name(f"<{resname}>")
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name(f"</{resname}>")
    # expose the tag name and a ready-made "everything up to the close tag"
    # expression as attributes on the returned elements
    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag

632 

633 

def make_html_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.

    Example::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    # xml=False selects the lenient, caseless HTML tag rules
    open_tag, close_tag = _makeTags(tag_str, False)
    return open_tag, close_tag

659 

660 

def make_xml_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for XML,
    given a tag name. Matches tags only in the given upper/lower case.

    Example: similar to :class:`make_html_tags`
    """
    # xml=True selects strict, case-sensitive tag rules with quoted attributes
    open_tag, close_tag = _makeTags(tag_str, True)
    return open_tag, close_tag

670 

671 

# pre-built expressions that match any HTML open/close tag, regardless of name
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# map of HTML5 entity names (without the trailing ';') to their replacement text
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
_most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
    " ", "|"
)
# matches a common HTML entity reference such as "&amp;", capturing the bare
# name as "entity"; the pattern is supplied as a callable — presumably so the
# large compressed entity regex is only built when first needed (TODO confirm
# Regex's deferred-pattern behavior)
common_html_entity = Regex(
    lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
).set_name("common HTML entity")

685 

686 

def replace_html_entity(s, l, t):
    """Parser action that replaces a matched common HTML entity with its
    corresponding character (returns None - leaving tokens unchanged - for
    names absent from the entity map)."""
    entity_name = t.entity
    return _htmlEntityMap.get(entity_name)

690 

691 

class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""

    # left-associative: a op b op c groups as (a op b) op c
    LEFT = 1
    # right-associative: a op b op c groups as a op (b op c)
    RIGHT = 2

698 

699 

# operator portion of an infix_notation spec: a single expression or string,
# or (for ternary operators) a pair of expressions/strings
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# one precedence level for infix_notation:
# (op_expr, num_operands, associativity[, parse_action])
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]

716 

717 

def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> Forward:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    - ``base_expr`` - expression representing the most basic operand to
      be used in the expression
    - ``op_list`` - list of tuples, one for each operator precedence level
      in the expression grammar; each tuple is of the form ``(op_expr,
      num_operands, right_left_assoc, (optional)parse_action)``, where:

      - ``op_expr`` is the pyparsing expression for the operator; may also
        be a string, which will be converted to a Literal; if ``num_operands``
        is 3, ``op_expr`` is a tuple of two expressions, for the two
        operators separating the 3 terms
      - ``num_operands`` is the number of terms for this operator (must be 1,
        2, or 3)
      - ``right_left_assoc`` is the indicator whether the operator is right
        or left associative, using the pyparsing-defined constants
        ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
      - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the parse action
        tuple member may be omitted); if the parse action is passed
        a tuple or list of functions, this is equivalent to calling
        ``set_parse_action(*fn)`` (:class:`ParserElement.set_parse_action`)
    - ``lpar`` - expression for matching left-parentheses; if passed as a
      str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
      an expression (such as ``Literal('(')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress('(')``)
    - ``rpar`` - expression for matching right-parentheses; if passed as a
      str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
      an expression (such as ``Literal(')')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress(')')``)

    Example::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            -2--11
            ''', full_dump=False)

    prints::

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}")

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(nested_expr)
    else:
        lastExpr = base_expr | nested_expr

    arity: int
    # CONSISTENCY FIX: annotate with the OpAssoc enum defined above
    # (was lowercase pre-PEP8 alias "opAssoc")
    rightLeftAssoc: OpAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement
    # build one precedence tier per op_list entry; each tier's operand is the
    # expression built for the previous (tighter-binding) tier
    for operDef in op_list:
        opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)
        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        # if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        # CONSISTENCY FIX: use PEP8 set_name (was legacy setName) to match
        # the rest of this module
        thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret

913 

914 

def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    (DEPRECATED - use :class:`IndentedBlock` class instead)
    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - ``blockStatementExpr`` - expression defining syntax of statement that
      is repeated within the indented block
    - ``indentStack`` - list created by caller to manage indentation stack
      (multiple ``statementWithIndentedBlock`` expressions within a single
      grammar should share a common ``indentStack``)
    - ``indent`` - boolean indicating whether block must be indented beyond
      the current level; set to ``False`` for block of left-most statements
      (default= ``True``)

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''


        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    # NOTE(review): the mutable default for ``backup_stacks`` appears
    # deliberate - the snapshot pushed here is popped by a success parse
    # action below and restored by the fail action, so the list must persist
    # across calls that share an ``indentStack``. Confirm before changing.
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # Restore the caller's indentStack to the snapshot taken on entry
        # (invoked from the fail action when the block fails to parse).
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        # Parse action: succeed only when the current column equals the
        # current indentation level; a deeper column means illegal nesting.
        # At end-of-string there is nothing left to check, so accept.
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        # Parse action: require a column strictly deeper than the current
        # level, and push that column as the new indentation level.
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        # Parse action: require the current column to be one of the levels
        # already on the stack, and pop back out one level if it is shallower
        # than the current one. At end-of-string, accept unconditionally.
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    # NL consumes line ends (tabs/spaces only as leading whitespace) so the
    # indentation checks below see the first non-blank column of each line.
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        # indented block: must INDENT, then one or more peer statements,
        # then UNDENT back out
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        # left-most block: no INDENT required, UNDENT is optional
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    # on failure, roll the shared indentStack back to the entry snapshot
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # allow backslash-continued lines within statements
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")

1056 

1057 

# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# matches the remainder of the current line without consuming the newline
# (leave_whitespace so it starts matching at the current position)
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

# alternation of the C-style and //-style patterns above
cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"


# build list of built-in expressions, for future reference if a global default value
# gets updated
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]

1087 

1088 

1089# compatibility function, superseded by DelimitedList class 

def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """(DEPRECATED - use :class:`DelimitedList` class)"""
    # Thin compatibility shim: forward every argument by keyword to the
    # DelimitedList class, which now implements this behavior.
    return DelimitedList(
        expr,
        delim=delim,
        combine=combine,
        min=min,
        max=max,
        allow_trailing_delim=allow_trailing_delim,
    )

1103 

1104 

# Compatibility synonyms
# Plain aliases keep the historical camelCase names working; the
# replaced_by_pep8(...) entries wrap the new snake_case names so the old
# spellings remain callable (see .util.replaced_by_pep8).
# fmt: off
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on