Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

2903 statements  

1# 

2# Secret Labs' Regular Expression Engine core module 

3# 

4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 

5# 

6# This version of the SRE library can be redistributed under CNRI's 

7# Python 1.6 license. For any other use, please contact Secret Labs 

8# AB (info@pythonware.com). 

9# 

10# Portions of this engine have been developed in cooperation with 

11# CNRI. Hewlett-Packard provided funding for 1.6 integration and 

12# other compatibility work. 

13# 

14# 2010-01-16 mrab Python front-end re-written and extended 

15 

16import enum 

17import string 

18import unicodedata 

19from collections import defaultdict 

20 

21from regex import _regex 

22 

# Names re-exported by the top-level regex module (single-letter short forms,
# long-form flag names, the exception type and the Scanner helper).
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]

28 

# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # Use a newline of the pattern's own type (str or bytes).
        nl = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # 1-based line number and column derived from the offset.
            self.lineno = pattern.count(nl, 0, pos) + 1
            self.colno = pos - pattern.rfind(nl, 0, pos)

            message = f"{message} at position {pos}"

            # Only mention line/column for multi-line patterns.
            if nl in pattern:
                message += f" (line {self.lineno}, column {self.colno})"

        Exception.__init__(self, message)

58 

59# The exception for when a positional flag has been turned on in the old 

60# behaviour. 

61class _UnscopedFlagSet(Exception): 

62 pass 

63 

# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    "Internal: the current parse attempt failed; the caller should backtrack."

67 

68# The exception for when there isn't a valid first set. 

69class _FirstSetError(Exception): 

70 pass 

71 

# Flags.
class RegexFlag(enum.IntFlag):
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named member prints as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        # Otherwise decompose the value into its named parts.
        value = self._value_
        negative = value < 0
        if negative:
            value = ~value

        parts = []
        for member in self.__class__:
            if value & member._value_:
                value &= ~member._value_
                parts.append('regex.%s' % member._name_)

        # Any bits left over are shown in hex.
        if value:
            parts.append(hex(value))

        text = '|'.join(parts)

        if negative:
            text = '~(%s)' % text if len(parts) > 1 else '~%s' % text

        return text

    __str__ = object.__str__

123 

# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
# Long-form names first...
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# ...then the single-letter short forms.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

160 

# The version of the behaviour used when the pattern specifies neither V0 nor
# V1.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that the end of the pattern is routed through the
# special-character handling in the parser.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own maps to 0: it only has an effect when combined with
# IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

210 

# The names of the opcodes. Their position in this list (see the enumerate
# loop below) determines their numeric value inside the regex engine.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""

295 

# Define the opcodes in a namespace.
class Namespace:
    "A bare attribute container (holds the opcode values as attributes)."

299 

# Build the opcode namespace: each name in OPCODES becomes an attribute of OP
# whose value is its position in the list (OP.FAILURE == 0, OP.SUCCESS == 1,
# and so on).
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)

303 

304def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): 

305 """Make room in the given cache. 

306 

307 Args: 

308 cache_dict: The cache dictionary to modify. 

309 args_dict: The dictionary of named list args used by patterns. 

310 max_length: Maximum # of entries in cache_dict before it is shrunk. 

311 divisor: Cache will shrink to max_length - 1/divisor*max_length items. 

312 """ 

313 # Toss out a fraction of the entries at random to make room for new ones. 

314 # A random algorithm was chosen as opposed to simply cache_dict.popitem() 

315 # as popitem could penalize the same regular expression repeatedly based 

316 # on its internal hash value. Being random should spread the cache miss 

317 # love around. 

318 cache_keys = tuple(cache_dict.keys()) 

319 overage = len(cache_keys) - max_length 

320 if overage < 0: 

321 # Cache is already within limits. Normally this should not happen 

322 # but it could due to multithreading. 

323 return 

324 

325 number_to_toss = max_length // divisor + overage 

326 

327 # The import is done here to avoid a circular dependency. 

328 import random 

329 if not hasattr(random, 'sample'): 

330 # Do nothing while resolving the circular dependency: 

331 # re->random->warnings->tokenize->string->re 

332 return 

333 

334 for doomed_key in random.sample(cache_keys, number_to_toss): 

335 try: 

336 del cache_dict[doomed_key] 

337 except KeyError: 

338 # Ignore problems if the cache changed from another thread. 

339 pass 

340 

341 # Rebuild the arguments and locale-sensitivity dictionaries. 

342 args_dict.clear() 

343 sensitivity_dict = {} 

344 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict): 

345 args_dict[pattern, pattern_type, flags, default_version, locale] = args 

346 try: 

347 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] 

348 except KeyError: 

349 pass 

350 

351 locale_sensitive.clear() 

352 locale_sensitive.update(sensitivity_dict) 

353 

def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # Fall back to the guessed encoding when none was given explicitly.
    if not (flags & _ALL_ENCODINGS):
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)

361 

def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A cased character has more than one case variant.
    all_cases = _regex.get_all_cases(info.flags, char)
    return len(all_cases) > 1

365 

def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # A cased character has more than one case variant.
    all_cases = _regex.get_all_cases(flags, char)
    return len(all_cases) > 1

369 

def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An absent firstset, or one that matches anything, is useless.
    if not checked or isinstance(checked, AnyAll):
        return []

    # Compile the firstset.
    return checked.compile(reverse)

379 

def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A usable firstset needs a contribution from every alternative.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # A negated character can't be part of a firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding may expand to multiple characters, so give up.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs

410 

411def _flatten_code(code): 

412 "Flattens the code from a list of tuples." 

413 flat_code = [] 

414 for c in code: 

415 flat_code.extend(c) 

416 

417 return flat_code 

418 

def make_case_flags(info):
    "Makes the case flags."
    # Turn off FULLCASE if ASCII is turned on.
    if info.flags & ASCII:
        return info.flags & CASE_FLAGS & ~FULLCASE

    return info.flags & CASE_FLAGS

428 

def make_character(info, value, in_set=False):
    "Makes a character literal."
    # Outside a set, the current case flags apply to the character.
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))

    # A character set is built case-sensitively.
    return Character(value)

436 

def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)

440 

def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)

444 

def make_property(info, prop, in_set):
    "Makes a property."
    # Outside a set, the current case flags apply to the property.
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))

    return prop

451 

def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the "|"-separated alternatives.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single alternative needs no Branch wrapper.
    if len(branches) > 1:
        return Branch(branches)

    return branches[0]

461 

def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None entries act as separators between quantified items; they are
    # stripped before the Sequence is built.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # An inline flag changed the scoped flags; refresh.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)

                    if constraints:
                        # It _is_ a fuzzy constraint.
                        if is_actually_fuzzy(constraints):
                            apply_constraint(source, info, constraints, case_flags,
                              saved_pos, sequence)
                            sequence.append(None)
                        else:
                            # The element was just a literal.
                            sequence.append(Character(ord(ch),
                              case_flags=case_flags))
                    else:
                        # A literal.
                        sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None separators before building the sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)

547 

def is_actually_fuzzy(constraints):
    "Checks whether a fuzzy constraint is actually fuzzy."
    # Zero errors permitted overall means it isn't fuzzy.
    if constraints.get("e") == (0, 0):
        return False

    # Likewise if substitutions, insertions and deletions are each forbidden.
    sid = (constraints.get("s"), constraints.get("i"), constraints.get("d"))
    if sid == ((0, 0), (0, 0), (0, 0)):
        return False

    return True

557 

def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a parsed quantifier to the last element of the sequence."
    element = sequence.pop()
    if element is None:
        # A None separator means the previous item was itself a repeat.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)

589 

def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the last element of the sequence."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # If a group is marked as fuzzy then put all of the fuzzy part in the
    # group.
    if isinstance(element, Group):
        element.subpattern = Fuzzy(element.subpattern, constraints)
        sequence.append(element)
    else:
        sequence.append(Fuzzy(element, constraints))

603 

# The counts for the simple one-character quantifiers.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of "?", "*" or "+".
        return counts

    if ch == "{":
        # Looks like a limited repeated element, eg. 'a{2,3}'; the helper
        # returns None when it turns out not to be one.
        return parse_limited_quantifier(source)

    return None

620 

def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unbounded", which the engine represents separately.
    if count is None:
        return False

    return count >= UNLIMITED

624 

def parse_limited_quantifier(source):
    "Parses a limited quantifier, returning (min, max) or None."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        # A bare "{" with no count isn't a quantifier at all.
        if not min_count:
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    # Without the closing "}" it's not a quantifier.
    if not source.match ("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count

654 

def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present; returns the constraints dict or None."
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; rewind so the caller can treat the
        # "{" some other way.
        source.pos = saved_pos
        return None

    if source.match(":"):
        # An optional test (the characters the fuzziness applies to).
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints

678 

def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    start = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a cost constraint; rewind and try a cost equation instead.
        source.pos = start

        parse_cost_equation(source, constraints)

688 

def parse_cost_constraint(source, constraints):
    "Parses a cost constraint."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()

749 

def parse_cost_limit(source):
    "Parses a cost limit."
    limit_pos = source.pos
    digits = parse_count(source)

    # An empty digit run makes int() fail, which we report below.
    try:
        return int(digits)
    except ValueError:
        pass

    raise error("bad fuzzy cost limit", source.string, limit_pos)

761 

def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only "d", "e", "i" and "s" are valid, and each may appear only once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch

771 

def parse_fuzzy_compare(source):
    "Parses a cost comparator: True for '<=', False for '<', None for neither."
    if source.match("<="):
        return True

    if source.match("<"):
        return False

    return None

780 

def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    # The "+"-separated terms of the equation.
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive ("<") limit is one less than the stated cost.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost

807 

def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coeff = parse_count(source)
    ch = source.get()
    # Only deletion, insertion and substitution costs may appear in a term.
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1

819 

def parse_fuzzy_test(source, info, case_flags):
    "Parses the test of a fuzzy constraint: an escape, '.', a set or a literal."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            # Other special characters aren't valid tests.
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)

845 

def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits at the current
    # position.
    return source.get_while(DIGITS)

849 

def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.

    The opening "(" has already been consumed by the caller.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags end with the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n... or (?-n...: a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...: possibly a verb such as (*PRUNE).
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags end with the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)

941 

def parse_extension(source, info):
    "Parses a Python extension, ie. the part after '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags end with the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)

977 

def parse_comment(source):
    "Parses a comment."
    # Skip characters (honouring escapes) up to the closing ")" or the end of
    # the pattern.
    while True:
        last_pos = source.pos
        ch = source.get(True)

        if ch == ")" or not ch:
            break

        if ch == "\\":
            # The next character is escaped, so consume it as well.
            source.get(True)

    # Step back so that expect() can verify the terminator.
    source.pos = last_pos
    source.expect(")")

    return None

994 

def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    flags_before = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags don't survive past the subpattern.
        info.flags = flags_before
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)

1006 

def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. the part after '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # Otherwise the condition is a group name or number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            # No "|" means an empty "no" branch.
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Scoped flags end with the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional is a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)

1049 

def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a conditional subpattern whose condition is a lookaround."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags end with the lookaround subpattern.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # No "|" means an empty "no" branch.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)

1069 

def parse_atomic(source, info):
    "Parses an atomic subpattern (?>...)."
    # Flag changes inside the group are scoped to it.
    saved_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)

1081 

def parse_common(source, info):
    "Parses a common groups branch."
    # Capture groups in different branches may reuse the same numbers, so
    # every branch is parsed starting from the same initial count and the
    # final count is the maximum reached by any branch.
    start_count = info.group_count
    alternatives = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = start_count
        alternatives.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    return alternatives[0] if len(alternatives) == 1 else Branch(alternatives)

1099 

def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    # (?R) recurses into the whole pattern, which is group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)

1110 

def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    # "+n" counts forwards from the last-defined group; "-n" counts
    # backwards, "-1" being the most recently defined group.
    if ch == "+":
        group = info.group_count + offset
    else:
        group = info.group_count - offset + 1
    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)

1125 

def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    target = parse_name(source)
    source.expect(")")

    return CallGroup(info, target, pos)

1132 

def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch == "V":
            # Version flags are two characters long ("V0" or "V1").
            ch += source.get()
        flag = REGEX_FLAGS.get(ch)
        if flag is None:
            # Not a recognised flag character; put it back and stop.
            source.pos = saved_pos
            break
        flags |= flag

    return flags

1148 

def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Note that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off

1165 

def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags, eg (?i:...)."
    # Flag changes apply only within this group; restore afterwards.
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    if info.flags & (ASCII | LOCALE | UNICODE):
        # NOTE(review): this clears all encoding bits and re-ORs flags_on,
        # so an inherited encoding not present in flags_on is dropped —
        # TODO confirm this is intended.
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the outer flags even if parsing raised.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern

1184 

def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        # (_UnscopedFlagSet is caught by the top-level parse and triggers a
        # restart with the new global flags.)
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # (?flags:...) — a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # (?flags) — positional flags, applying from here onwards.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)

1219 

def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags, which apply from this point onwards."
    info.flags = (info.flags | flags_on) & ~flags_off
    # Verbose mode may have just been switched on or off.
    source.ignore_space = bool(info.flags & VERBOSE)

1224 

def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name, which may be symbolic or (optionally) numeric."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if not name.isdigit():
        # A symbolic name must be a valid Python identifier.
        if not name.isidentifier():
            raise error("bad character in group name", source.string,
              source.pos)
        return name

    # A numeric "name" is accepted only where group numbers are allowed.
    lowest = 0 if allow_group_0 else 1
    if not allow_numeric or int(name) < lowest:
        raise error("bad character in group name", source.string,
          source.pos)

    return name

1243 

def is_octal(string):
    "Checks whether a string consists entirely of octal digits."
    return set(string).issubset(OCT_DIGITS)

1247 

def is_decimal(string):
    "Checks whether a string consists entirely of decimal digits."
    return set(string).issubset(DIGITS)

1251 

def is_hexadecimal(string):
    "Checks whether a string consists entirely of hexadecimal digits."
    return set(string).issubset(HEX_DIGITS)

1255 

def parse_escape(source, info, in_set):
    """Parses an escape sequence after a backslash.

    in_set indicates whether the escape appears inside a character set,
    which disallows group references, anchors and similar escapes.
    """
    # The escaped character itself is read verbatim, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        # CR LF is matched as a unit, hence the atomic branch.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes (eg \d, \w) depend on the encoding in force.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes (eg \n, \t).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)

1338 

def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence, which is either an octal escape or
    a group reference, disambiguated by context and digit count.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Byte-oriented encodings limit the value to one byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)

1371 

def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence of up to 3 digits."
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Put back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int(..., 8) failed: the leading digit was 8 or 9, or a later
        # digit wasn't octal.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)

1392 

def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence of a fixed number of digits."
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    # All digits were validated above, so int() cannot fail here.
    value = int("".join(digits), 16)
    if value < 0x110000:
        return make_character(info, value, in_set)

    # The value lies outside the Unicode codepoint range.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, start_pos)

1415 

def parse_group_ref(source, info):
    "Parses a group reference of the form \\g<name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)

1426 

def parse_string_set(source, info):
    "Parses a named list (string set) reference of the form \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The name must refer to a named list passed in as a keyword argument.
    if name is not None and name in info.kwargs:
        return make_string_set(info, name)

    raise error("undefined named list", source.string, source.pos)

1436 

def parse_named_char(source, info, in_set):
    "Parses a named character, eg \\N{LATIN SMALL LETTER A}."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                # unicodedata.lookup raises KeyError for unknown names.
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; treat "N" as a literal.
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)

1452 

def parse_property(source, info, positive, in_set):
    "Parses a Unicode property, delimited (\\p{...}) or abbreviated (\\pL)."
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # A leading "^" inside the braces negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            # "positive != negate": \P{^...} double negation cancels out.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)

1488 

def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns (prop_name, name): prop_name is None for an unqualified name,
    otherwise the part before the ":" or "=".
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Leave the source positioned just after the last part actually consumed.
    source.pos = saved_pos
    return prop_name, name

1510 

def parse_set(source, info):
    "Parses a character set ([...])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    # Whitespace is significant inside a set, even in verbose mode.
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        if version == VERSION0:
            # Version 0 has no set operators, only implicit union.
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item

1536 

def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    # A union of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetUnion(info, operands)

1546 

def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    # A symmetric difference of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)

1556 

def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    # An intersection of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetInter(info, operands)

1566 

def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    # A difference of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetDiff(info, operands)

1576 

def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    members = [parse_set_member(source, info)]
    while True:
        here = source.pos
        if source.match("]"):
            # End of the set; leave the "]" for the caller to consume.
            source.pos = here
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # A set operator terminates the implicit union in version 1.
            source.pos = here
            break

        members.append(parse_set_member(source, info))

    return members[0] if len(members) == 1 else SetUnion(info, members)

1599 

def parse_set_member(source, info):
    "Parses a member in a character set, possibly a range like a-z."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate one-character range.
        return start

    return Range(start.value, end.value)

1640 

def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))

1678 

def parse_posix_class(source, info):
    "Parses a POSIX character class, eg [:alpha:]."
    negated = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negated, source,
          posix=True)

    # Not terminated by ":]", so it isn't a POSIX class after all.
    raise ParseError()

1687 

def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Named "remainder" rather than "error" so as not to shadow the
    # module's `error` exception class.
    remainder = flt - int_part
    if abs(remainder) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / remainder)

    return int_part * den + num, den

1698 

def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    # Split off the sign first so the rational parts stay non-negative.
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # Omit a denominator of 1.
    return result[ : -2] if result.endswith("/1") else result

1719 

def standardise_name(name):
    "Standardises a property or value name."
    joined = "".join(name)
    try:
        return numeric_to_rational(joined)
    except (ValueError, ZeroDivisionError):
        # Not numeric: drop separators and upper-case for comparison.
        return "".join(ch for ch in joined if ch not in "_- ").upper()

1726 

# POSIX classes which are looked up with a "POSIX" prefix (see
# lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete value set of a binary (yes/no) Unicode property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

1730 

def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property, returning a Property node.

    Tries, in order: an explicit property/value pair; a bare value as a
    GC/script/block value; a binary property name; an "IS"/"IN"-prefixed
    property, script or block name. Raises error if nothing matches.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the negation of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        # POSIX classes have their own variants, eg "POSIXDIGIT".
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property and value ids are packed into a single integer.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    # Both prefixes are 2 characters long, hence value[2 : ] below.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)

1800 

def _compile_replacement(source, pattern, is_unicode):
    """Compiles one escape sequence in a replacement template.

    Returns a pair (is_group, codes): if is_group is True, codes holds a
    group index to substitute; otherwise codes is a list of literal
    codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Bytes templates limit octal values to one byte.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]

1872 

def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)

1884 

def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; backtrack and let the caller decide.
    source.pos = saved_pos
    return None

1901 

def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if not name.isdigit():
        # A named group; unknown names raise IndexError, as re does.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index

1919 

1920# The regular expression is parsed into a syntax tree. The different types of 

1921# node are defined below. 

1922 

INDENT = " "
# Bit flags encoded into compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding id is stored in the bits above the flag bits.
ENCODING_OP_SHIFT = 5

# Text used by the dump() methods for match polarity and case handling.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}

1934 

def make_sequence(items):
    "Wraps a list of nodes in a Sequence unless it's a single node."
    return items[0] if len(items) == 1 else Sequence(items)

1939 

1940# Common base class for all nodes. 

class RegexBase:
    """Common base class for all syntax-tree nodes.

    Nodes are hashable and comparable via _key, which subclasses extend
    with their distinguishing attributes.
    """

    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self when nothing would change; otherwise rebuilds the
        # node with the new flag values.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Default: nothing to fix.
        pass

    def optimise(self, info, reverse):
        # Default: no optimisation possible.
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Subclasses that can compute a first-character set override this.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Default: no required literal string, just the maximum width.
        return self.max_width(), None

2009 

2010# Base class for zero-width nodes. 

class ZeroWidthBase(RegexBase):
    "Base class for nodes that match a position rather than characters."

    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node doesn't consume a character.
        return {None}

    def _compile(self, reverse, fuzzy):
        # The encoding id lives above the boolean flag bits.
        flags = self.encoding << ENCODING_OP_SHIFT
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0

2039 

class Any(RegexBase):
    "Matches a single character (the '.' pattern)."

    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always exactly one character.
        return 1

2058 

class AnyAll(Any):
    """'.' under DOTALL: matches any character including newline."""

    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Fixed attributes; no call to RegexBase.__init__ in the original.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive

2069 

class AnyU(Any):
    """'.' with Unicode line-ending semantics (ANY_U opcodes)."""
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"

2073 

class Atomic(RegexBase):
    """An atomic (no-backtracking) group wrapping a subpattern.

    Most methods simply delegate to the wrapped subpattern; compilation
    brackets the subpattern's code with ATOMIC/END opcodes.
    """

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around nothing is just nothing.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # ATOMIC ... END around the subpattern's code.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

2129 

class Boundary(ZeroWidthBase):
    """Zero-width word-boundary assertion (BOUNDARY opcode)."""
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"

2133 

class Branch(RegexBase):
    """A set of alternatives ('|' in a pattern).

    Optimisation flattens nested branches, hoists common prefixes/suffixes
    out of the alternatives, reduces adjacent single-character branches to
    character sets, and may insert a quick first-character precheck.
    """

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the
                # branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        # Collect the first (or last, when reversed) character of each
        # branch; bail out unless every branch is a case-sensitive Literal.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        # Union of the branches' first-sets; None marks "may match nothing".
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        # BRANCH code NEXT code NEXT ... code END.
        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be grouped
        # together if they are separated only by other branches with a
        # character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # BUG FIX: the final flush previously omitted the 'reverse' argument,
        # which would raise TypeError (_flush_char_prefix takes five
        # parameters, as at the call site above).
        Branch._flush_char_prefix(info, reverse, prefixed, order,
          new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)

2525 

class CallGroup(RegexBase):
    """A subpattern call, e.g. (?1) or (?&name): re-runs a group's pattern."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # May be a number or a name until fix_groups resolves it.
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a group name to its index; validate the reference.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group defined more than once cannot be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        # Rebuild the key now that the group is a resolved index.
        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): this class never sets self.pattern, so this raise
        # presumably triggers an AttributeError first — verify upstream.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # self.call_ref is assigned externally after group calls are resolved.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2571 

class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be targeted by group calls."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <ref> ... END around the wrapped subpattern's code.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code

2580 

class Character(RegexBase):
    """A single character (codepoint), possibly case-insensitive/negated.

    The opcode table maps (case_flags, reverse) to the engine opcode;
    FULLCASE without IGNORECASE behaves like NOCASE.
    """

    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Precompute the full case-folding of the character; may be longer
        # than one character (e.g. sharp s under FULLIGNORECASE).
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Create a copy with different flags (used by with_flags).
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        # Membership test, honouring negation.
        return (ch == self.value) == self.positive

    def max_width(self):
        # The folded form may span multiple codepoints.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self

2654 

class Conditional(RegexBase):
    """A conditional subpattern (?(group)yes|no).

    Which branch runs depends at match time on whether the referenced group
    has matched.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        # May be a number or a name until fix_groups resolves it.
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Returns a new node; both branches optimised independently.
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either branch may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS yes-code [NEXT no-code] END.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2743 

class DefaultBoundary(ZeroWidthBase):
    """Word boundary using the default word-character rules."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

2747 

class DefaultEndOfWord(ZeroWidthBase):
    """End-of-word assertion using the default word-character rules."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

2751 

class DefaultStartOfWord(ZeroWidthBase):
    """Start-of-word assertion using the default word-character rules."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

2755 

class EndOfLine(ZeroWidthBase):
    """End-of-line assertion ('$' under MULTILINE)."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

2759 

class EndOfLineU(EndOfLine):
    """End-of-line assertion with Unicode line-ending semantics."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

2763 

class EndOfString(ZeroWidthBase):
    """Absolute end-of-string assertion (\\Z)."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

2767 

class EndOfStringLine(ZeroWidthBase):
    """End of string, or before a final newline ('$' without MULTILINE)."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

2771 

class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine with Unicode line-ending semantics."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

2775 

class EndOfWord(ZeroWidthBase):
    """End-of-word assertion (\\m/\\M family)."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"

2779 

class Failure(ZeroWidthBase):
    """A node that always fails to match ((*FAIL))."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # Always emits the bare FAILURE opcode; flags are irrelevant.
        return [(OP.FAILURE, )]

2785 

class Fuzzy(RegexBase):
    """A fuzzy-matching region with error constraints.

    ``constraints`` maps error types "d" (deletion), "i" (insertion),
    "s" (substitution) and "e" (any error) to (min, max) pairs, plus an
    optional "cost" equation and "test" subpattern. The defaulting cascade
    in __init__ is order-sensitive.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            # FUZZY_EXT carries an extra test subpattern before the body.
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        # Render the constraints in the pattern's "{...}" syntax for dump().
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)

2918 

class Grapheme(RegexBase):
    """Matches a single extended grapheme cluster (\\X)."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        # A grapheme cluster has no fixed upper codepoint count.
        return UNLIMITED

2933 

class GraphemeBoundary:
    # Not a RegexBase subclass: only compile() is needed, as it is used
    # solely inside Grapheme._compile.
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]

2937 

class GreedyRepeat(RegexBase):
    """A greedy repeat (e.g. '*', '+', '{m,n}').

    ``max_count`` of None means unbounded; subclasses override ``_opcode``
    and ``_op_name`` for other repeat kinds.
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) preserves the subclass (e.g. PossessiveRepeat).
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # Only a fixed-count repeat of an atomic body cannot backtrack.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        # opcode, min, max (UNLIMITED stands in for "no maximum").
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The body may be skipped, so it cannot be required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None

3029 

class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat (e.g. '*+'): a greedy repeat wrapped in ATOMIC."""

    def is_atomic(self):
        # Possessive repeats never give back matched input.
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC ( REPEAT body END ) END.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)

3059 

class Group(RegexBase):
    """A capture group wrapping a subpattern.

    Negative group numbers denote internal (private) groups; see _compile
    for how they are mapped to public numbers.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Filled in elsewhere when the group is the target of a call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        # Record where (and in which direction/fuzziness) this group is
        # defined, then fix up the subpattern.
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # A negative group number is a private group: look up its public
        # number and map the private one into the slots past the visible
        # group count.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group is called (recursed into), wrap it in a CALL_REF.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # Show the public number of a private group.
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Drop the back-reference to the pattern info, presumably to break
        # reference cycles.
        self.info = None

3141 

class Keep(ZeroWidthBase):
    # Zero-width node compiled to the KEEP opcode.
    _opcode = OP.KEEP
    _op_name = "KEEP"

3145 

class LazyRepeat(GreedyRepeat):
    # Identical in structure to GreedyRepeat, but compiled with the
    # LAZY_REPEAT opcode.
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"

3149 

class LookAround(RegexBase):
    """A lookaround assertion: tests the subpattern at the current position
    without consuming any characters.
    """
    # Direction names used by dump(), keyed on the 'behind' flag.
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern is processed in the lookaround's own direction,
        # not the enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround on an empty subpattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive lookaround scanning in the same direction as the
        # match constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # The subpattern is compiled in the lookaround's own direction;
        # fuzziness is signalled through the flags word rather than being
        # passed down.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0

3217 

class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround: (?(?=...)yes|no)."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # All three parts are optimised in the lookaround's direction.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # Layout: CONDITIONAL test NEXT yes [NEXT no] END.  The 'no' branch
        # is omitted when it compiles to nothing.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): 'and' binds tighter than 'or' here, so this reads as
        # (subpattern and yes) or no — confirm that precedence is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        # Neither branch is guaranteed, so no string is required.
        return self.max_width(), None

3302 

class PrecompiledCode(RegexBase):
    """A node wrapping already-generated code; compiling emits it verbatim."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # The stored code is emitted as a single opcode tuple.
        return [tuple(self.code)]

3309 

class Property(RegexBase):
    """A single-character test against a Unicode/POSIX property value.

    The property id is packed into the top 16 bits of 'value' and the
    property's value into the bottom 16 bits (see dump()).
    """
    # Opcode dispatch keyed on (case_flags, reverse); full case-folding
    # shares the simple-folding opcodes here.
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Same property, different polarity/case/zero-width flags.
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding is packed into the upper bits of the flags word.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Unpack the property id (top 16 bits) and value (bottom 16 bits).
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1

3365 

class Prune(ZeroWidthBase):
    # Zero-width node compiled to the PRUNE opcode.
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]

3371 

class Range(RegexBase):
    """An inclusive codepoint range [lower, upper], optionally negated,
    case-insensitive, or zero-width.
    """
    # Opcode dispatch keyed on (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.  Each becomes an
        # alternative multi-character String.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # ascii() renders the characters printably; strip any b/u prefix.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1

3448 

class RefGroup(RegexBase):
    """A backreference to a capture group, given by number or by name."""
    # Opcode dispatch keyed on (case_flags, reverse); full case-folding gets
    # its own FLD opcodes, unlike simple IGN.
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        # The group may be a number or a name; resolve names through the
        # pattern's group index.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group is numeric.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is never assigned in this class —
        # confirm it is provided elsewhere, otherwise this raise would fail
        # with an AttributeError rather than the intended error.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Drop the back-reference to the pattern info, presumably to break
        # reference cycles.
        self.info = None

3497 

class SearchAnchor(ZeroWidthBase):
    # Zero-width node compiled to the SEARCH_ANCHOR opcode.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"

3501 

class Sequence(RegexBase):
    """A concatenation of regex nodes, matched one after the other."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten nested sequences into a single list of items.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a plain character/string node: flush what has been
                # collected, then pack the node itself.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        """Return the possible first items; includes None while every item
        scanned so far may match empty.

        Bug fix: the original reversed self.items *in place* when scanning
        backwards, permanently mutating this node as a side effect of a
        query (contrast _compile, which copies with seq[::-1]).  Iterate
        over a reversed copy instead; the returned set is identical.
        """
        fs = set()
        items = self.items
        if reverse:
            items = items[::-1]
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every item could match empty, so the sequence can too.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching in reverse, the items are compiled in reverse order
        # (from a copy, leaving self.items untouched).
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Convert the collected codepoints into Character/String nodes,
        appending them to 'items' and clearing 'characters'.
        """
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Split into chunks that really need full case-folding and
            # chunks that can use the faster simple folding.
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Find every occurrence of an expanding fold in the folded literal.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folding runs with full-folding chunks.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        """Merge overlapping/adjacent (start, end) chunks after sorting."""
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # The first item that yields a required string determines it; the
        # offsets of the preceding items accumulate.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None

3714 

class SetBase(RegexBase):
    """Base class for character-set nodes (union, intersection, difference,
    symmetric difference).
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Width of a single member match, in characters.
        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild as the same set type with new flags, re-optimised.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # Layout: set opcode, then each member's code, closed by END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        """If full case-folding applies, turn the set into a Branch of the
        set plus the multi-codepoint folded strings it matches.
        """
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set; the widest folded expansion
        # bounds the match width.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Drop the back-reference to the pattern info, presumably to break
        # reference cycles.
        self.info = None

3819 

class SetDiff(SetBase):
    """Set difference: matches the first item but not the second."""
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        # More than two items: fold the tail into a single union so the
        # difference is always binary.
        if len(items) > 2:
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A one-item difference is just that item with our flags.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive

3845 

class SetInter(SetBase):
    """Set intersection: matches only if every item matches."""
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection: flatten.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A one-item intersection is just that item with our flags.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive

3875 

class SetSymDiff(SetBase):
    """Set symmetric difference: matches if an odd number of items match."""
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference: flatten.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A one-item symmetric difference is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-accumulate the membership tests.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive

3908 

class SetUnion(SetBase):
    """Set union: matches if any item matches."""
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union: flatten.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # Anything unioned with "match all" matches all.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?  A property and its negation
        # together match everything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A one-item union collapses to that item; a negative union of a
            # negative item is positive, hence the equality on positivity.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch the plain Character members (grouped by positivity) into
        # compact CHARACTER/STRING entries; everything else compiles itself.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive

3986 

class Skip(ZeroWidthBase):
    # Zero-width node compiled to the SKIP opcode.
    _op_name = "SKIP"
    _opcode = OP.SKIP

3990 

class StartOfLine(ZeroWidthBase):
    # Zero-width node compiled to the START_OF_LINE opcode.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"

3994 

class StartOfLineU(StartOfLine):
    # Variant of StartOfLine compiled to the START_OF_LINE_U opcode.
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"

3998 

class StartOfString(ZeroWidthBase):
    # Zero-width node compiled to the START_OF_STRING opcode.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"

4002 

class StartOfWord(ZeroWidthBase):
    # Zero-width node compiled to the START_OF_WORD opcode.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"

4006 

class String(RegexBase):
    """A literal run of codepoints, with optional case-insensitivity."""
    # Opcode dispatch keyed on (case_flags, reverse); full case-folding uses
    # the dedicated FLD opcodes.
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-fold every character; one codepoint may fold to several.
            folded = []
            for ch in self.characters:
                folded.extend(ord(c) for c in
                  _regex.fold_case(FULL_CASE_FOLDING, chr(ch)))
        else:
            folded = self.characters

        self.folded_characters = tuple(folded)
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the scan direction.
        index = -1 if reverse else 0
        return set([Character(self.characters[index],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() renders the text printably; strip any b/u prefix.
        text = "".join(chr(c) for c in self.characters)
        display = ascii(text).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self

4061 

class Literal(String):
    """A String subclass dumped with the "LITERAL MATCH" label."""

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

4068 

class StringSet(Branch):
    """A set of literal strings supplied via the pattern's keyword
    arguments (a named list), expanded into a Branch of character
    sequences.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) combination, assigning it the
        # next sequential index if it hasn't been seen before.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        # The actual strings come from the keyword arguments passed at
        # compile time.
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if string is not None and isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        # NOTE(review): presumably so that longer strings are preferred over
        # their prefixes when the branches are tried — confirm against
        # Branch's matching order.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Drop the reference to the Info object on teardown, presumably to
        # break a reference cycle.
        self.info = None

4109 

class Source:
    "Scanner for the regular expression source string."

    def __init__(self, string):
        # The scanner works internally on str. Latin-1 maps bytes 0-255
        # one-to-one onto the first 256 codepoints, so a bytes pattern is
        # decoded losslessly; char_type converts a codepoint back to the
        # original input type.
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        # When True (VERBOSE mode), whitespace and "#" comments in the
        # pattern are skipped by the methods below.
        self.ignore_space = False
        # An empty string of the same type (str or bytes) as the input.
        self.sep = string[ : 0]

    def peek(self, override_ignore=False):
        """Return the next character without consuming it, or an empty
        string at the end of the source.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        """Return the next character and advance past it, or return an
        empty string at the end of the source.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Return up to `count` characters, advancing past them. May
        return fewer if the end of the source is reached.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                # Collect characters one at a time, skipping whitespace and
                # comments between them.
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Return the run of characters that are in `test_set` (or not in
        it, when `include` is False), advancing past them.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Advance past the run of characters that are in `test_set` (or
        not in it, when `include` is False), returning nothing.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """If `substring` comes next, advance past it and return True;
        otherwise leave the position unchanged and return False.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Match character by character, skipping whitespace and
                # comments before each one.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        """Advance past `substring`, raising error if it isn't next."""
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Return True if only whitespace/comments (when ignored) or
        nothing remains.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True

4355 

class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Collects parsing state: flags, group numbering/naming, named
        lists, and group-call bookkeeping.

        Fix: the original declared ``kwargs={}``, a mutable default shared
        by every default-constructed Info and exposed via ``self.kwargs``.
        A ``None`` sentinel gives each instance a fresh dict while keeping
        the call signature backward-compatible.
        """
        if kwargs is None:
            kwargs = {}

        # Fill in the default flags for the selected version.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        # Keyword arguments supplied at compile time (used by named lists).
        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}       # group name -> group number
        self.group_name = {}        # group number -> group name
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> list index
        self.open_groups = []       # stack of currently-open group numbers
        self.open_group_count = {}  # group number -> times opened
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}    # negative alias -> real group number

    def open_group(self, name=None):
        """Open a (possibly named) capture group and return its number.

        A reopened nested group gets a negative private alias until a
        proper number can be assigned.
        """
        group = self.group_index.get(name)
        if group is None:
            # Allocate the next unused group number (skipping numbers
            # already taken by named groups when allocating for a name).
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        """Close the most recently opened group."""
        self.open_groups.pop()

    def is_open_group(self, name):
        """Return whether the group referred to by `name` (a name or a
        digit string) is currently open.
        """
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups

4420 

def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Populates info.call_refs (mapping (group, reverse, fuzzy) -> ref index)
    and info.additional_groups (copies of groups/pattern needed with
    different reverse/fuzzy features). Each call node gets its call_ref set.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups

4459 

def _get_required_string(parsed, flags):
    "Gets the required string and related info of a parsed pattern."

    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))

    # No required string: report a zero offset, no characters, no flags.
    if not required:
        return 0, (), 0

    # Mark the node so compilation flags it as required.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags

4480 

class Scanner:
    """A lexical scanner built from a lexicon of (phrase, action) pairs.

    Each phrase is compiled as one alternative (a numbered group) of a
    single compound pattern; the index of the matching alternative selects
    the corresponding action.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern. Group g + 1 wraps
        # the g-th phrase, so m.lastindex identifies the phrase.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scan `string`, returning (results, remainder).

        For each non-empty match, the action of the matching phrase is
        looked up; callables are invoked with (self, matched_text) and
        self.match set to the match object. Non-None action results are
        collected. Scanning stops at the first failure or zero-width match.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would make no progress; stop scanning.
                break
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]

4576 

# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: map each property id to its
# longest known alias and a dict mapping each value id to its longest
# value-name alias (the longest name is used for display).
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)

4590 

# Character escape sequences: pattern escape letter -> literal character.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Encoding selectors passed to lookup_property / the position escapes below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

4604 

# Predefined character set escape sequences (\d, \s, \w etc.), using the
# default encoding.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# ASCII-encoding variants; everything except "h" is overridden.
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})
# Unicode-encoding variants; everything except "h" is overridden.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})

4634 

# Positional (zero-width) escape sequences, using the default encoding.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}
# ASCII-encoding variants of the word-related escapes; "A", "K" and "Z"
# are encoding-independent and inherited unchanged.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})
# Unicode-encoding variants of the word-related escapes.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})

# Positional escape sequences when WORD flag set: the word-related escapes
# use the "default" boundary classes instead.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

4668 

# Regex control verbs: verb name -> node. "F" is a short alias for "FAIL"
# (note they are two distinct Failure instances).
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}