Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/pyparsing/helpers.py: 27%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

333 statements  

1# helpers.py 

2import html.entities 

3import operator 

4import re 

5import sys 

6import typing 

7 

8from . import __diag__ 

9from .core import * 

10from .util import ( 

11 _bslash, 

12 _flatten, 

13 _escape_regex_range_chars, 

14 make_compressed_re, 

15 replaced_by_pep8, 

16) 

17 

18 

19# 

20# global helpers 

21# 

def counted_array(
    expr: ParserElement,
    int_expr: typing.Optional[ParserElement] = None,
    *,
    intExpr: typing.Optional[ParserElement] = None,
) -> ParserElement:
    """Helper to define a counted list of expressions.

    This helper defines a pattern of the form::

        integer expr expr expr...

    where the leading integer tells how many expr expressions follow.
    The matched tokens returns the array of expr tokens as a list - the
    leading count token is suppressed.

    If ``int_expr`` is specified, it should be a pyparsing expression
    that produces an integer value.

    Example::

        counted_array(Word(alphas)).parse_string('2 ab cd ef')  # -> ['ab', 'cd']

        # in this parser, the leading integer value is given in binary,
        # '10' indicating that 2 values are in the array
        binary_constant = Word('01').set_parse_action(lambda t: int(t[0], 2))
        counted_array(Word(alphas), int_expr=binary_constant).parse_string('10 ab cd ef')  # -> ['ab', 'cd']

        # if other fields must be parsed after the count but before the
        # list items, give the fields results names and they will
        # be preserved in the returned ParseResults:
        count_with_metadata = integer + Word(alphas)("type")
        typed_array = counted_array(Word(alphanums), int_expr=count_with_metadata)("items")
        result = typed_array.parse_string("3 bool True True False")
        print(result.dump())

        # prints
        # ['True', 'True', 'False']
        # - items: ['True', 'True', 'False']
        # - type: 'bool'
    """
    # accept either the modern int_expr or the legacy intExpr keyword
    intExpr = intExpr or int_expr
    # placeholder for the repeated-expr expression; its contents are rebound
    # at parse time, once the actual count is known
    array_expr = Forward()

    def count_field_parse_action(s, l, t):
        # parse action on the count expression: rebind the Forward to exactly
        # n repetitions of expr (or Empty for a zero count)
        nonlocal array_expr
        n = t[0]
        array_expr <<= (expr * n) if n else Empty()
        # clear list contents, but keep any named results
        del t[:]

    if intExpr is None:
        # default count: a decimal integer
        intExpr = Word(nums).set_parse_action(lambda t: int(t[0]))
    else:
        # copy so the caller's expression is not mutated by the actions below
        intExpr = intExpr.copy()
    intExpr.set_name("arrayLen")
    # call_during_try=True so the Forward is rebound even during lookahead/try
    intExpr.add_parse_action(count_field_parse_action, call_during_try=True)
    return (intExpr + array_expr).set_name(f"(len) {expr}...")

80 

81 

def match_previous_literal(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

        first = Word(nums)
        second = match_previous_literal(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches a previous literal, will also match the leading
    ``"1:1"`` in ``"1:10"``. If this is not desired, use
    :class:`match_previous_expr`. Do *not* use with packrat parsing
    enabled.
    """
    repeater = Forward()

    def copy_token_to_repeater(s, l, t):
        # rebind the repeater to literally match whatever was just parsed
        if not t:
            repeater << Empty()
        elif len(t) == 1:
            repeater << t[0]
        else:
            # multiple/nested tokens: flatten and require the same literals
            # in the same order
            flattened = _flatten(t.as_list())
            repeater << And(Literal(tok) for tok in flattened)

    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    repeater.set_name("(prev) " + str(expr))
    return repeater

115 

116 

def match_previous_expr(expr: ParserElement) -> ParserElement:
    """Helper to define an expression that is indirectly defined from
    the tokens matched in a previous expression, that is, it looks for
    a 'repeat' of a previous expression. For example::

        first = Word(nums)
        second = match_previous_expr(first)
        match_expr = first + ":" + second

    will match ``"1:1"``, but not ``"1:2"``. Because this
    matches by expressions, will *not* match the leading ``"1:1"``
    in ``"1:10"``; the expressions are evaluated first, and then
    compared, so ``"1"`` is compared with ``"10"``. Do *not* use
    with packrat parsing enabled.
    """
    rep = Forward()
    # re-parse with a copy of the same expression, then compare token values
    e2 = expr.copy()
    rep <<= e2

    def copy_token_to_repeater(s, l, t):
        # capture the tokens matched by the first occurrence
        matchTokens = _flatten(t.as_list())

        def must_match_these_tokens(s, l, t):
            # reject the repeat unless it produced the identical token list
            theseTokens = _flatten(t.as_list())
            if theseTokens != matchTokens:
                # fix: insert missing space between "found" and the token list
                raise ParseException(
                    s, l, f"Expected {matchTokens}, found {theseTokens}"
                )

        rep.set_parse_action(must_match_these_tokens, callDuringTry=True)

    expr.add_parse_action(copy_token_to_repeater, callDuringTry=True)
    rep.set_name("(prev) " + str(expr))
    return rep

151 

152 

def one_of(
    strs: Union[typing.Iterable[str], str],
    caseless: bool = False,
    use_regex: bool = True,
    as_keyword: bool = False,
    *,
    useRegex: bool = True,
    asKeyword: bool = False,
) -> ParserElement:
    """Helper to quickly define a set of alternative :class:`Literal` s,
    and makes sure to do longest-first testing when there is a conflict,
    regardless of the input order, but returns
    a :class:`MatchFirst` for best performance.

    Parameters:

    - ``strs`` - a string of space-delimited literals, or a collection of
      string literals
    - ``caseless`` - treat all literals as caseless - (default= ``False``)
    - ``use_regex`` - as an optimization, will
      generate a :class:`Regex` object; otherwise, will generate
      a :class:`MatchFirst` object (if ``caseless=True`` or ``as_keyword=True``, or if
      creating a :class:`Regex` raises an exception) - (default= ``True``)
    - ``as_keyword`` - enforce :class:`Keyword`-style matching on the
      generated expressions - (default= ``False``)
    - ``asKeyword`` and ``useRegex`` are retained for pre-PEP8 compatibility,
      but will be removed in a future release

    Example::

        comp_oper = one_of("< = > <= >= !=")
        var = Word(alphas)
        number = Word(nums)
        term = var | number
        comparison_expr = term + comp_oper + term
        print(comparison_expr.search_string("B = 12  AA=23 B<=AA AA>12"))

    prints::

        [['B', '=', '12'], ['AA', '=', '23'], ['B', '<=', 'AA'], ['AA', '>', '12']]
    """
    # harmonize legacy pre-PEP8 synonyms with the modern argument names
    asKeyword = asKeyword or as_keyword
    useRegex = useRegex and use_regex

    # a string passed for `caseless` usually means the caller passed multiple
    # choice strings positionally instead of one space-delimited string
    if (
        isinstance(caseless, str_type)
        and __diag__.warn_on_multiple_string_args_to_oneof
    ):
        warnings.warn(
            "warn_on_multiple_string_args_to_oneof:"
            " More than one string argument passed to one_of, pass"
            " choices as a list or space-delimited string",
            stacklevel=2,
        )

    # comparison helpers used by the masking-reorder pass below
    if caseless:
        is_equal = lambda a, b: a.upper() == b.upper()
        masks = lambda a, b: b.upper().startswith(a.upper())
    else:
        is_equal = operator.eq
        masks = lambda a, b: b.startswith(a)

    symbols: list[str]
    if isinstance(strs, str_type):
        strs = typing.cast(str, strs)
        symbols = strs.split()
    elif isinstance(strs, Iterable):
        symbols = list(strs)
    else:
        raise TypeError("Invalid argument to one_of, expected string or iterable")
    if not symbols:
        return NoMatch()

    # reorder given symbols to take care to avoid masking longer choices with shorter ones
    # (but only if the given symbols are not just single characters)
    i = 0
    while i < len(symbols) - 1:
        cur = symbols[i]
        for j, other in enumerate(symbols[i + 1 :]):
            if is_equal(other, cur):
                # duplicate symbol - drop the later occurrence
                del symbols[i + j + 1]
                break
            if len(other) > len(cur) and masks(cur, other):
                # `cur` is a prefix of `other`: move the longer symbol ahead
                # so it is tried first
                del symbols[i + j + 1]
                symbols.insert(i, other)
                break
        else:
            i += 1

    if useRegex:
        re_flags: int = re.IGNORECASE if caseless else 0

        try:
            if all(len(sym) == 1 for sym in symbols):
                # symbols are just single characters, create range regex pattern
                patt = f"[{''.join(_escape_regex_range_chars(sym) for sym in symbols)}]"
            else:
                patt = "|".join(re.escape(sym) for sym in symbols)

            # wrap with \b word break markers if defining as keywords
            if asKeyword:
                patt = rf"\b(?:{patt})\b"

            ret = Regex(patt, flags=re_flags)
            ret.set_name(" | ".join(re.escape(s) for s in symbols))

            if caseless:
                # add parse action to return symbols as specified, not in random
                # casing as found in input string
                symbol_map = {sym.lower(): sym for sym in symbols}
                ret.add_parse_action(lambda s, l, t: symbol_map[t[0].lower()])

            return ret

        except re.error:
            # fall through to the MatchFirst fallback below
            warnings.warn(
                "Exception creating Regex for one_of, building MatchFirst", stacklevel=2
            )

    # last resort, just use MatchFirst of Token class corresponding to caseless
    # and asKeyword settings
    CASELESS = KEYWORD = True
    parse_element_class = {
        (CASELESS, KEYWORD): CaselessKeyword,
        (CASELESS, not KEYWORD): CaselessLiteral,
        (not CASELESS, KEYWORD): Keyword,
        (not CASELESS, not KEYWORD): Literal,
    }[(caseless, asKeyword)]
    return MatchFirst(parse_element_class(sym) for sym in symbols).set_name(
        " | ".join(symbols)
    )

284 

285 

def dict_of(key: ParserElement, value: ParserElement) -> Dict:
    """Helper to easily and clearly define a dictionary by specifying
    the respective patterns for the key and value. Takes care of
    defining the :class:`Dict`, :class:`ZeroOrMore`, and
    :class:`Group` tokens in the proper order. The key pattern
    can include delimiting markers or punctuation, as long as they are
    suppressed, thereby leaving the significant key text. The value
    pattern can include named results, so that the :class:`Dict` results
    can include named token fields.

    Example::

        text = "shape: SQUARE posn: upper left color: light blue texture: burlap"
        attr_label = label
        attr_value = Suppress(':') + OneOrMore(data_word, stop_on=label).set_parse_action(' '.join)

        # similar to Dict, but simpler call format
        result = dict_of(attr_label, attr_value).parse_string(text)
        print(result.dump())
        print(result['shape'])
        print(result.shape)  # object attribute access works too
        print(result.as_dict())

    prints::

        [['shape', 'SQUARE'], ['posn', 'upper left'], ['color', 'light blue'], ['texture', 'burlap']]
        - color: 'light blue'
        - posn: 'upper left'
        - shape: 'SQUARE'
        - texture: 'burlap'
        SQUARE
        SQUARE
        {'color': 'light blue', 'shape': 'SQUARE', 'posn': 'upper left', 'texture': 'burlap'}
    """
    # each key/value pair becomes one grouped entry; Dict turns the entries
    # into named results keyed by the first token of each group
    entry = Group(key + value)
    return Dict(OneOrMore(entry))

324 

325 

def original_text_for(
    expr: ParserElement, as_string: bool = True, *, asString: bool = True
) -> ParserElement:
    """Helper to return the original, untokenized text for a given
    expression. Useful to restore the parsed fields of an HTML start
    tag into the raw tag text itself, or to revert separate tokens with
    intervening whitespace back to the original matching input text. By
    default, returns a string containing the original parsed text.

    If the optional ``as_string`` argument is passed as
    ``False``, then the return value is
    a :class:`ParseResults` containing any results names that
    were originally matched, and a single token containing the original
    matched text from the input string. So if the expression passed to
    :class:`original_text_for` contains expressions with defined
    results names, you must set ``as_string`` to ``False`` if you
    want to preserve those results name values.

    The ``asString`` pre-PEP8 argument is retained for compatibility,
    but will be removed in a future release.

    Example::

        src = "this is test <b> bold <i>text</i> </b> normal text "
        for tag in ("b", "i"):
            opener, closer = make_html_tags(tag)
            patt = original_text_for(opener + ... + closer)
            print(patt.search_string(src)[0])

    prints::

        ['<b> bold <i>text</i> </b>']
        ['<i>text</i>']
    """
    # honor either the modern or the legacy keyword (both default True)
    asString = asString and as_string

    # markers that record the string location where they match
    locMarker = Empty().set_parse_action(lambda s, loc, t: loc)
    endlocMarker = locMarker.copy()
    # don't skip leading whitespace before recording the end location
    endlocMarker.callPreparse = False
    matchExpr = locMarker("_original_start") + expr + endlocMarker("_original_end")
    if asString:
        # return just the slice of the original input
        extractText = lambda s, l, t: s[t._original_start : t._original_end]
    else:

        def extractText(s, l, t):
            # replace token list with the original slice, popping the
            # internal marker names but keeping user-defined results names
            t[:] = [s[t.pop("_original_start") : t.pop("_original_end")]]

    matchExpr.set_parse_action(extractText)
    # propagate the wrapped expression's ignore expressions
    matchExpr.ignoreExprs = expr.ignoreExprs
    matchExpr.suppress_warning(Diagnostics.warn_ungrouped_named_tokens_in_collection)
    return matchExpr

377 

378 

def ungroup(expr: ParserElement) -> ParserElement:
    """Helper to undo pyparsing's default grouping of And expressions,
    even if all but one are non-empty.
    """

    def take_first(t):
        # unwrap the single grouped result
        return t[0]

    return TokenConverter(expr).add_parse_action(take_first)

384 

385 

def locatedExpr(expr: ParserElement) -> ParserElement:
    """
    .. deprecated:: 3.0.0
       Use the :class:`Located` class instead.

    Helper to decorate a returned token with its starting and ending
    locations in the input string.

    This helper adds the following results names:

    - ``locn_start`` - location where matched expression begins
    - ``locn_end`` - location where matched expression ends
    - ``value`` - the actual parsed results

    Be careful if the input text contains ``<TAB>`` characters, you
    may want to call :class:`ParserElement.parse_with_tabs`

    Example::

        wd = Word(alphas)
        for match in locatedExpr(wd).search_string("ljsdf123lksdjjf123lkkjj1222"):
            print(match)

    prints::

        [[0, 'ljsdf', 5]]
        [[8, 'lksdjjf', 15]]
        [[18, 'lkkjj', 23]]
    """
    # zero-width marker that yields the current parse location
    loc_marker = Empty().set_parse_action(lambda ss, ll, tt: ll)
    start_loc = loc_marker("locn_start")
    # end marker must not skip whitespace, so it reports the true end
    end_loc = loc_marker.copy().leaveWhitespace()("locn_end")
    return Group(start_loc + expr("value") + end_loc)

421 

422 

# define special default value to permit None as a significant value for
# ignore_expr in nested_expr below; a sentinel expression that never matches
_NO_IGNORE_EXPR_GIVEN = NoMatch()

426 

427 

def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
    *,
    ignoreExpr: typing.Optional[ParserElement] = _NO_IGNORE_EXPR_GIVEN,
) -> ParserElement:
    """Helper method for defining nested lists enclosed in opening and
    closing delimiters (``"("`` and ``")"`` are the default).

    Parameters:

    - ``opener`` - opening character for a nested list
      (default= ``"("``); can also be a pyparsing expression
    - ``closer`` - closing character for a nested list
      (default= ``")"``); can also be a pyparsing expression
    - ``content`` - expression for items within the nested lists
      (default= ``None``)
    - ``ignore_expr`` - expression for ignoring opening and closing delimiters
      (default= :class:`quoted_string`)
    - ``ignoreExpr`` - this pre-PEP8 argument is retained for compatibility
      but will be removed in a future release

    If an expression is not provided for the content argument, the
    nested expression will capture all whitespace-delimited content
    between delimiters as a list of separate values.

    Use the ``ignore_expr`` argument to define expressions that may
    contain opening or closing characters that should not be treated as
    opening or closing characters for nesting, such as quoted_string or
    a comment expression. Specify multiple expressions using an
    :class:`Or` or :class:`MatchFirst`. The default is
    :class:`quoted_string`, but if no expressions are to be ignored, then
    pass ``None`` for this argument.

    Example::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print("%(name)s (%(type)s) args: %(args)s" % func)


    prints::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # reconcile the modern ignore_expr with the legacy ignoreExpr keyword;
    # the _NO_IGNORE_EXPR_GIVEN sentinel lets an explicit None remain meaningful
    if ignoreExpr != ignore_expr:
        ignoreExpr = ignore_expr if ignoreExpr is _NO_IGNORE_EXPR_GIVEN else ignoreExpr

    if ignoreExpr is _NO_IGNORE_EXPR_GIVEN:
        ignoreExpr = quoted_string()

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    if content is None:
        # build a default content expression from the delimiters; only
        # possible when both delimiters are plain strings
        if isinstance(opener, str_type) and isinstance(closer, str_type):
            opener = typing.cast(str, opener)
            closer = typing.cast(str, closer)
            if len(opener) == 1 and len(closer) == 1:
                # single-char delimiters: content is any run of characters
                # that are not delimiters or whitespace
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + CharsNotIn(
                                opener + closer + ParserElement.DEFAULT_WHITE_CHARS,
                                exact=1,
                            )
                        )
                    )
                else:
                    content = Combine(
                        Empty()
                        + CharsNotIn(
                            opener + closer + ParserElement.DEFAULT_WHITE_CHARS
                        )
                    )
            else:
                # multi-char delimiters: must test the full delimiter strings
                # with negative lookahead, one character at a time
                if ignoreExpr is not None:
                    content = Combine(
                        OneOrMore(
                            ~ignoreExpr
                            + ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
                else:
                    content = Combine(
                        OneOrMore(
                            ~Literal(opener)
                            + ~Literal(closer)
                            + CharsNotIn(ParserElement.DEFAULT_WHITE_CHARS, exact=1)
                        )
                    )
        else:
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )

    # for these internally-created content expressions, simulate whitespace-skipping
    if ParserElement.DEFAULT_WHITE_CHARS:
        content.set_parse_action(
            lambda t: t[0].strip(ParserElement.DEFAULT_WHITE_CHARS)
        )

    # recursive definition: a nested list is opener + (ignored | nested | content)* + closer
    ret = Forward()
    if ignoreExpr is not None:
        ret <<= Group(
            Suppress(opener) + ZeroOrMore(ignoreExpr | ret | content) + Suppress(closer)
        )
    else:
        ret <<= Group(Suppress(opener) + ZeroOrMore(ret | content) + Suppress(closer))

    ret.set_name(f"nested {opener}{closer} expression")

    # don't override error message from content expressions
    ret.errmsg = None
    return ret

576 

577 

def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions, given a tag name.

    The ``suppress_LT``/``suppress_GT`` defaults are evaluated once at
    definition time, so all generated tag expressions share the same
    ``Suppress`` instances.
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        # HTML tags match caselessly; XML tags match exactly
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        # XML: attribute values must be double-quoted
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        # HTML: attribute values may be quoted or unquoted, and the value
        # (or the whole "=value" part) may be omitted
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        # HTML attribute names are normalized to lowercase
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name(f"<{resname}>")
    # add start<tagname> results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__(
            "start" + "".join(resname.replace(":", " ").title().split()), t.copy()
        )
    )
    closeTag = closeTag(
        "end" + "".join(resname.replace(":", " ").title().split())
    ).set_name(f"</{resname}>")
    # expose the tag name and a convenience SkipTo for the tag body
    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag

634 

635 

def make_html_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for HTML,
    given a tag name. Matches tags in either upper or lower case,
    attributes with namespaces and with quoted or unquoted values.

    Example::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    # delegate to the shared builder in non-XML (HTML, caseless) mode
    open_tag, close_tag = _makeTags(tag_str, False)
    return open_tag, close_tag

661 

662 

def make_xml_tags(
    tag_str: Union[str, ParserElement],
) -> tuple[ParserElement, ParserElement]:
    """Helper to construct opening and closing tag expressions for XML,
    given a tag name. Matches tags only in the given upper/lower case.

    Example: similar to :class:`make_html_tags`
    """
    # delegate to the shared builder in XML (case-sensitive) mode
    open_tag, close_tag = _makeTags(tag_str, True)
    return open_tag, close_tag

672 

673 

any_open_tag: ParserElement
any_close_tag: ParserElement
# expressions matching any HTML open/close tag; tag names start with a letter
# followed by alphanumerics, underscore, or colon
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)

# map of HTML5 entity name (without the trailing ';') -> replacement character
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
_most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
    " ", "|"
)
# pattern is supplied as a zero-arg callable, deferring construction of the
# (large) compressed alternation over all entity names
common_html_entity = Regex(
    lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
).set_name("common HTML entity")

687 

688 

def replace_html_entity(s, l, t):
    """Helper parser action to replace common HTML entities with their special characters"""
    entity_name = t.entity
    return _htmlEntityMap.get(entity_name)

692 

693 

class OpAssoc(Enum):
    """Enumeration of operator associativity
    - used in constructing InfixNotationOperatorSpec for :class:`infix_notation`"""

    LEFT = 1   # left-associative: a op b op c == (a op b) op c
    RIGHT = 2  # right-associative: a op b op c == a op (b op c)

700 

701 

# operator element of an infix_notation operator spec: a single expression or
# string, or - for ternary operators - a 2-tuple of the two separating operators
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# one precedence level for infix_notation:
# (op_expr, num_operands, associativity[, parse_action]) - the trailing
# parse action element is optional
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]

718 

719 

def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> Forward:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy. Operators may be unary
    or binary, left- or right-associative. Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    - ``base_expr`` - expression representing the most basic operand to
      be used in the expression
    - ``op_list`` - list of tuples, one for each operator precedence level
      in the expression grammar; each tuple is of the form ``(op_expr,
      num_operands, right_left_assoc, (optional)parse_action)``, where:

      - ``op_expr`` is the pyparsing expression for the operator; may also
        be a string, which will be converted to a Literal; if ``num_operands``
        is 3, ``op_expr`` is a tuple of two expressions, for the two
        operators separating the 3 terms
      - ``num_operands`` is the number of terms for this operator (must be 1,
        2, or 3)
      - ``right_left_assoc`` is the indicator whether the operator is right
        or left associative, using the pyparsing-defined constants
        ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
      - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the parse action
        tuple member may be omitted); if the parse action is passed
        a tuple or list of functions, this is equivalent to calling
        ``set_parse_action(*fn)``
        (:class:`ParserElement.set_parse_action`)
    - ``lpar`` - expression for matching left-parentheses; if passed as a
      str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
      an expression (such as ``Literal('(')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress('(')``)
    - ``rpar`` - expression for matching right-parentheses; if passed as a
      str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
      an expression (such as ``Literal(')')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress(')')``)

    Example::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            -2--11
            ''', full_dump=False)

    prints::

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        (5+x)*y
        [[[5, '+', 'x'], '*', 'y']]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    # string parens are suppressed; expression parens are kept and grouped
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}")

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(nested_expr)
    else:
        lastExpr = base_expr | nested_expr

    arity: int
    rightLeftAssoc: OpAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement
    # build precedence levels bottom-up: each level's expression is defined
    # in terms of the previous (tighter-binding) level in lastExpr
    for operDef in op_list:
        # pad the spec tuple so a missing parse action becomes None
        opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)
        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                # postfix unary: operand followed by one or more operators
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    # no operator expression: simple juxtaposition
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    # right recursion through thisExpr gives right associativity
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        # if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        thisExpr <<= (matchExpr | lastExpr).setName(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret

915 

916 

def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    .. deprecated:: 3.0.0
        Use the :class:`IndentedBlock` class instead.

    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - ``blockStatementExpr`` - expression defining syntax of statement that
      is repeated within the indented block
    - ``indentStack`` - list created by caller to manage indentation stack
      (multiple ``statementWithIndentedBlock`` expressions within a single
      grammar should share a common ``indentStack``)
    - ``indent`` - boolean indicating whether block must be indented beyond
      the current level; set to ``False`` for block of left-most statements
      (default= ``True``)

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''


        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    # NOTE: the mutable default for ``backup_stacks`` is deliberate here -
    # the same list object is shared across calls, so nested/sibling
    # indentedBlock expressions can cooperate when restoring state on a
    # failed parse.  Snapshot the caller's indent stack so it can be put
    # back if this expression fails partway through a match.
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # Restore the indent stack from the most recent snapshot (invoked
        # from the fail action registered on smExpr below).
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        # Parse action: the current line must start at exactly the current
        # (innermost) indent level to count as a peer statement.
        if l >= len(s):
            # at end of input - nothing left to validate
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                # more deeply indented than the current level - the grammar
                # never asked for a sub-block here
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        # Parse action: the current line must be indented *beyond* the
        # current level; push the new level onto the stack.
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        # Parse action: the current column must match some outer indent
        # level already on the stack; pop back out one level.
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    # NL consumes line ends (and tab/space-only leading whitespace) without
    # emitting tokens; INDENT/PEER/UNDENT are zero-width markers that run the
    # stack-checking parse actions above.
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        # block must be indented relative to the enclosing level
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        # left-most statements: no INDENT required, UNDENT optional
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # allow statements to span lines via trailing backslash continuations
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")

1060 

1061 

# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# rest_of_line matches up to (but not including) the next newline;
# leave_whitespace keeps it from skipping leading spaces first
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
# the (?:\\\n|[^\n]) alternative lets a // comment continue across a
# backslash-newline line continuation
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"


# build list of built-in expressions, for future reference if a global default value
# gets updated
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]

1091 

1092 

1093# compatibility function, superseded by DelimitedList class 

def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """
    .. deprecated:: 3.1.0
        Use the :class:`DelimitedList` class instead.
    """
    # Thin backward-compatibility shim - every argument is forwarded
    # unchanged to DelimitedList, which does all of the real work.
    compat_expr = DelimitedList(
        expr,
        delim,
        combine,
        min,
        max,
        allow_trailing_delim=allow_trailing_delim,
    )
    return compat_expr

1110 

1111 

# Compatibility synonyms
# Pre-PEP8 camelCase names retained for backward compatibility.  The plain
# assignments are simple aliases; the replaced_by_pep8(...) entries wrap the
# new-style callable so that invoking the old name emits a
# DeprecationWarning pointing at its PEP8 replacement.
# fmt: off
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# note: this rebinds the delimited_list function defined above, so calling
# it also raises the deprecation warning
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on