Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 43%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

2896 statements  

1# 

2# Secret Labs' Regular Expression Engine core module 

3# 

4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 

5# 

6# This version of the SRE library can be redistributed under CNRI's 

7# Python 1.6 license. For any other use, please contact Secret Labs 

8# AB (info@pythonware.com). 

9# 

10# Portions of this engine have been developed in cooperation with 

11# CNRI. Hewlett-Packard provided funding for 1.6 integration and 

12# other compatibility work. 

13# 

14# 2010-01-16 mrab Python front-end re-written and extended 

15 

16import enum 

17import string 

18import unicodedata 

19from collections import defaultdict 

20 

21from regex import _regex 

22 

# Public API: the flag constants (short and long spellings), the error class,
# the Scanner and the RegexFlag enum.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]

28 

# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, or None if pos is
            None
        colno: The column number where compilation failed, or None if pos is
            None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or a bytes-like object; pick the matching
        # newline so count/rfind below work on either.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        # Default the location attributes so they always exist; previously
        # they were unset when pattern or pos was None, making attribute
        # access raise AttributeError.
        self.lineno = None
        self.colno = None
        if pattern is not None and pos is not None:
            # Line numbers are 1-based; the column is measured from the last
            # newline before pos (rfind returns -1 when there is none, which
            # makes the column 1-based too).
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Only mention line/column for multi-line patterns.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)

58 

59# The exception for when a positional flag has been turned on in the old 

60# behaviour. 

61class _UnscopedFlagSet(Exception): 

62 pass 

63 

class ParseError(Exception):
    """Raised internally when parsing fails and another interpretation of the
    input should be tried.
    """

67 

68# The exception for when there isn't a valid first set. 

69class _FirstSetError(Exception): 

70 pass 

71 

# Flags.
class RegexFlag(enum.IntFlag):
    # Each flag has a single-letter spelling and a long spelling; the
    # single-letter name is defined first, so it is the canonical enum member
    # and the long name is an alias.
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named flag (or an aliased combination with a name) prints
        # as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        # Otherwise decompose the value into known members, OR-ed together.
        value = self._value_
        members = []
        negative = value < 0

        # For a negative (inverted) value, decompose the complement and wrap
        # the result in "~".
        if negative:
            value = ~value

        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        # Any bits not covered by a member are shown as a hex literal.
        if value:
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Use the plain object __str__ rather than IntFlag's.
    __str__ = object.__str__

123 

# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# Single-letter synonyms for the flags above.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The behaviour used when neither V0 nor V1 is specified.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# The empty string "" is included so that end-of-pattern is treated as
# special by the parser.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own (without IGNORECASE) has no effect.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

210 

# The names of the opcodes. Their position in this whitespace-separated list
# determines their numeric value (see the enumerate loop below), so the order
# must match the engine in _regex.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""

295 

# Define the opcodes in a namespace.
class Namespace:
    """A bare attribute holder for the opcode constants."""


OP = Namespace()
# Each opcode's value is its position in the OPCODES list.
for index, opcode_name in enumerate(OPCODES.split()):
    setattr(OP, opcode_name, index)

303 

def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns.
        locale_sensitive: The dictionary of locale-sensitivity info, rebuilt
            to match the surviving cache entries.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value. Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits. Normally this should not happen
        # but it could due to multithreading.
        return

    number_to_toss = max_length // divisor + overage

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        # re->random->warnings->tokenize->string->re
        return

    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

    # Rebuild the arguments and locale-sensitivity dictionaries, keeping only
    # entries for keys that survived the purge above.
    args_dict.clear()
    sensitivity_dict = {}
    for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
        args_dict[pattern, pattern_type, flags, default_version, locale] = args
        try:
            sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
        except KeyError:
            pass

    locale_sensitive.clear()
    locale_sensitive.update(sensitivity_dict)

353 

def _fold_case(info, string):
    "Folds the case of a string."
    # When no encoding flag is present, fall back to the guessed encoding.
    fold_flags = info.flags
    if not (fold_flags & _ALL_ENCODINGS):
        fold_flags |= info.guess_encoding

    return _regex.fold_case(fold_flags, string)

361 

def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A character is cased when it has more than one case variant.
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1

365 

def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Same check as is_cased_i, but taking the flags directly.
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1

369 

def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    backwards = bool(info.flags & REVERSE)
    checked = _check_firstset(info, backwards, fs)

    # An empty firstset, or one that matches anything, is useless as an
    # optimisation hint.
    if not checked or isinstance(checked, AnyAll):
        return []

    # Compile the firstset.
    return checked.compile(backwards)

379 

def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A None entry means some branch has no usable first item.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # A negated character can't contribute to a firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        # Accumulate the case flags and store the member case-insensitively.
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding makes a simple firstset impossible.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs

410 

411def _flatten_code(code): 

412 "Flattens the code from a list of tuples." 

413 flat_code = [] 

414 for c in code: 

415 flat_code.extend(c) 

416 

417 return flat_code 

418 

def make_case_flags(info):
    "Makes the case flags."
    case_flags = info.flags & CASE_FLAGS

    # Turn off FULLCASE if ASCII is turned on.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE

    return case_flags

428 

def make_character(info, value, in_set=False):
    "Makes a character literal."
    # Outside a set, the current case flags apply to the character.
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))

    # A character set is built case-sensitively.
    return Character(value)

436 

def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)

440 

def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)

444 

def make_property(info, prop, in_set):
    "Makes a property."
    # Outside a set, attach the current case flags to the property.
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))

    return prop

451 

def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect one sequence per alternative, separated by "|".
    alternatives = [parse_sequence(source, info)]
    while source.match("|"):
        alternatives.append(parse_sequence(source, info))

    # A single alternative doesn't need a Branch wrapper.
    if len(alternatives) == 1:
        return alternatives[0]

    return Branch(alternatives)

461 

def parse_sequence(source, info):
    """Parses a sequence, eg. 'abc'.

    Parsing stops (without consuming) at "|", ")" or the end of the pattern.
    The working list uses None entries as separators: apply_quantifier pops
    the previous element, so a None there means there is nothing (or nothing
    further) to repeat. The separators are filtered out at the end.
    """
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag: re-read the (possibly changed)
                    # case flags for the rest of the sequence.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    # Separator: a second quantifier in a row must fail.
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None separators before building the Sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)

545 

def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Applies a quantifier (min_count, max_count) to the last element of the
    sequence, replacing it in place.

    Raises error if there is nothing to repeat or the previous element was
    itself a repeat.
    """
    element = sequence.pop()
    if element is None:
        # A None separator: either the previous element was already
        # quantified, or the sequence is empty.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)

577 

def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies fuzzy constraints to the last element of the sequence."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # If a group is marked as fuzzy then put all of the fuzzy part in the
    # group.
    if isinstance(target, Group):
        target.subpattern = Fuzzy(target.subpattern, constraints)
    else:
        target = Fuzzy(target, constraints)

    sequence.append(target)

591 

_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    # "?", "*" and "+" map directly to fixed count ranges.
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # It's a quantifier.
        return counts

    # "{" may introduce a bounded repeat, eg. 'a{2,3}'.
    if ch == "{":
        limited = parse_limited_quantifier(source)
        if limited:
            return limited

    return None

608 

def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unbounded", which is represented by UNLIMITED itself, not
    # by exceeding it.
    if count is None:
        return False

    return count >= UNLIMITED

612 

def parse_limited_quantifier(source):
    """Parses a limited quantifier such as '{2,3}', '{2,}' or '{2}'.

    Returns (min_count, max_count) where max_count of None means unlimited,
    or None (with the source position rewound) if this isn't a valid limited
    quantifier.
    """
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        # '{n}': an exact count requires the digits to be present.
        if not min_count:
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    # Without the closing "}" this wasn't a quantifier at all.
    if not source.match("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count

642 

def parse_fuzzy(source, info, ch, case_flags):
    """Parses a fuzzy setting, if present.

    Returns the constraints dict, or None (with the source rewound) if the
    "{...}" isn't a fuzzy setting.
    """
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        # One or more comma-separated fuzzy items.
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all.
        source.pos = saved_pos
        return None

    # An optional ":" introduces the fuzzy test element.
    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints

666 

def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    start = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a cost constraint; rewind and try a cost equation instead.
        source.pos = start

        parse_cost_equation(source, constraints)

676 

def parse_cost_constraint(source, constraints):
    """Parses a cost constraint and stores it in constraints as a
    (min_cost, max_cost) pair keyed by the constraint letter.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()

737 

def parse_cost_limit(source):
    "Parses a cost limit."
    start = source.pos
    digits = parse_count(source)

    try:
        return int(digits)
    except ValueError:
        pass

    # There were no digits at all.
    raise error("bad fuzzy cost limit", source.string, start)

749 

def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only "d", "e", "i" and "s" are valid, and each may appear only once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch

759 

def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True for an inclusive limit ("<="), False for an exclusive one ("<"),
    # None when no comparator is present.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None

768 

def parse_cost_equation(source, constraints):
    """Parses a cost equation, eg. '2i+2d+1s<=4', storing it under the "cost"
    key of constraints.
    """
    # Only one cost equation is allowed per fuzzy setting.
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    # One or more "+"-separated terms.
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit ("<") is converted to an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost

795 

def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coefficient = parse_count(source)
    kind = source.get()
    # Only deletion, insertion and substitution terms are valid, once each.
    if kind not in "dis":
        raise ParseError()

    if kind in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[kind] = int(coefficient or 1)

807 

def parse_fuzzy_test(source, info, case_flags):
    """Parses the test element of a fuzzy setting: an escape, ".", a
    character set or a literal character.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            # Other special characters can't act as a test.
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)

833 

def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    digits = source.get_while(DIGITS)
    return digits

837 

def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Flags set inside the group are scoped to it.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n) or (?-n): a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...: possibly a verb such as (*PRUNE).
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[:1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the group are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)

929 

def parse_extension(source, info):
    """Parses a Python extension: (?P<name>...), (?P=name) or (?P>name)."""
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Flags set inside the group are scoped to it.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        # A group can't refer to itself while it's still being parsed.
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)

965 

def parse_comment(source):
    "Parses a comment."
    # Consume characters up to the closing ")", honouring backslash escapes,
    # then rewind so expect() consumes the terminator itself.
    while True:
        start = source.pos
        ch = source.get(True)

        if ch == "\\":
            # Skip the escaped character as well.
            source.get(True)
        elif not ch or ch == ")":
            break

    source.pos = start
    source.expect(")")

    return None

982 

def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    outer_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the lookaround are scoped to it.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)

994 

def parse_conditional(source, info):
    """Parses a conditional subpattern: (?(group)yes|no) or a lookaround
    conditional such as (?(?=...)yes|no).
    """
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # (?(group)...: test whether the group matched.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Flags set inside the conditional are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional is a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)

1037 

def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional subpattern."
    outer_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the lookaround are scoped to it.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)

1057 

def parse_atomic(source, info):
    "Parses an atomic subpattern."
    outer_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the subpattern are scoped to it.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)

1069 

def parse_common(source, info):
    "Parses a common groups branch."
    # Capture groups in different branches re-use the same group numbers.
    initial_count = info.group_count
    highest_count = initial_count
    branches = []
    while True:
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)
        if not source.match("|"):
            break
        # The next branch restarts the numbering.
        info.group_count = initial_count

    info.group_count = highest_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)

1087 

def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    # "(?R...)" recurses into the whole pattern, i.e. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)

1098 

def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    if ch == "+":
        # A forward reference counts on from the current group.
        group = info.group_count + offset
    else:
        group = info.group_count - offset + 1

    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)

1113 

def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    name = parse_name(source)
    source.expect(")")
    return CallGroup(info, name, pos)

1120 

def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    try:
        while True:
            mark = source.pos
            char = source.get()
            if char == "V":
                # A version flag is 2 characters: "V0" or "V1".
                char += source.get()
            flags |= REGEX_FLAGS[char]
    except KeyError:
        # The character isn't a flag, so step back over it.
        source.pos = mark

    return flags

1136 

def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off

1153 

def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags."
    # The flags apply only until the closing ")", so remember the current
    # ones in order to restore them afterwards.
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this ORs in *all* of flags_on after clearing the encoding
    # bits, not just `flags_on & _ALL_ENCODINGS` — confirm that's intended.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the previous flags and whitespace handling.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern

1172 

def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": positional flags, applying from here on.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)

1207 

def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags, which apply from this point onwards."
    info.flags = (info.flags | flags_on) & ~flags_off
    # VERBOSE mode changes how the source skips whitespace.
    source.ignore_space = bool(info.flags & VERBOSE)

1212 

def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name."
    # The name runs up to the closing ")" or ">".
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric name is permitted only where the caller allows it.
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string, source.pos)

    return name

1231 

def is_octal(string):
    "Checks whether a string is octal."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False
    return True

1235 

def is_decimal(string):
    "Checks whether a string is decimal."
    for ch in string:
        if ch not in DIGITS:
            return False
    return True

1239 

def is_hexadecimal(string):
    "Checks whether a string is hexadecimal."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False
    return True

1243 

def parse_escape(source, info, in_set):
    "Parses an escape sequence."
    # Read the escaped character literally, without whitespace-skipping even
    # in VERBOSE mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: "\r\n" or a single line-ending character.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            # Unicode adds NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The lookup table depends on the flags in effect.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # A charset escape (e.g. \d, \w), also dependent on the encoding.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # A single-character escape (e.g. \n).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)

1326 

def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Octal escapes are limited to a byte in ASCII/locale mode.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: put back the character that wasn't a digit.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)

1359 

def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence of up to 3 digits."
    while len(digits) < 3:
        mark = source.pos
        ch = source.get()
        if ch not in OCT_DIGITS:
            # Not an octal digit, so step back over it.
            source.pos = mark
            break
        digits.append(ch)

    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        raise error("bad escape \\%s" % digits[0], source.string, source.pos)

1380 

def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence of a fixed number of digits."
    saved_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        if value < 0x110000:
            # Within the Unicode codepoint range.
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)

1403 

def parse_group_ref(source, info):
    "Parses a group reference of the form <name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)

1414 

def parse_string_set(source, info):
    "Parses a string set reference of the form \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The name must match a keyword argument passed to the pattern.
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)

1424 

def parse_named_char(source, info, in_set):
    "Parses a named character."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)
            return make_character(info, ord(value), in_set)

    # Not a valid named character, so treat it as a literal "N".
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)

1440 

def parse_property(source, info, positive, in_set):
    "Parses a Unicode property."
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # "\p{...}": a delimited property, optionally negated with "^".
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            # "^" inside the braces inverts the sense again.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)

1476 

def parse_property_name(source):
    "Parses a property name, which may be qualified."
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        # Possibly "prop:value" or "prop=value".
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Step back to just after the last part actually consumed.
    source.pos = saved_pos
    return prop_name, name

1498 

def parse_set(source, info):
    "Parses a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    # VERSION1 sets support operators such as "||" and "--".
    parser = parse_set_imp_union if version == VERSION0 else parse_set_union
    try:
        item = parser(source, info)
        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))

1524 

def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    items = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        items.append(parse_set_symm_diff(source, info))

    return items[0] if len(items) == 1 else SetUnion(info, items)

1534 

def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    items = [parse_set_inter(source, info)]
    while source.match("~~"):
        items.append(parse_set_inter(source, info))

    return items[0] if len(items) == 1 else SetSymDiff(info, items)

1544 

def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    items = [parse_set_diff(source, info)]
    while source.match("&&"):
        items.append(parse_set_diff(source, info))

    return items[0] if len(items) == 1 else SetInter(info, items)

1554 

def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    items = [parse_set_imp_union(source, info)]
    while source.match("--"):
        items.append(parse_set_imp_union(source, info))

    return items[0] if len(items) == 1 else SetDiff(info, items)

1564 

def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; leave them for the caller.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)

1587 

def parse_set_member(source, info):
    "Parses a member in a character set, possibly a range like a-z."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate single-character range.
        return start

    return Range(start.value, end.value)

1628 

def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class; reparse it as literals.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))

1666 

def parse_posix_class(source, info):
    "Parses a POSIX character class, e.g. [:alpha:]."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # The caller catches ParseError and backtracks.
        raise ParseError()

    return lookup_property(prop_name, name, not negate, source, posix=True)

1675 

def float_to_rational(flt):
    "Converts a float to a rational pair (numerator, denominator)."
    int_part = int(flt)
    frac = flt - int_part
    if abs(frac) < 0.0001:
        # Close enough to a whole number.
        return int_part, 1

    # Continued-fraction step: recurse on the reciprocal of the fraction.
    den, num = float_to_rational(1.0 / frac)

    return int_part * den + num, den

1686 

def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = "-", numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) > 2:
        raise ValueError()
    if len(parts) == 2:
        value = float(parts[0]) / float(parts[1])
    else:
        value = float(parts[0])
    num, den = float_to_rational(value)

    result = "{}{}/{}".format(sign, num, den)
    # A denominator of 1 is dropped.
    return result[ : -2] if result.endswith("/1") else result

1707 

def standardise_name(name):
    "Standardises a property or value name."
    try:
        # Numeric names are normalised to a rational form.
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Otherwise drop separators and spaces, then uppercase.
        return "".join(ch for ch in name if ch not in "_- ").upper()

1714 

# Class names that get a "POSIX" prefix when looked up as POSIX classes (see
# lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full set of value names of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

1718 

def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    "Looks up a property."
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the inverse of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        # Use the POSIX variant of the class.
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # The property and value ids are packed into a single int.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            # A binary property; value id 1 means "yes".
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            # Both prefixes are 2 characters long.
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)

1788 

def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): is_group is True when the result is a group
    reference, otherwise codes is a list of literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values are limited to a byte for bytes patterns.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence, up to 3 digits.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                # Put back the non-octal character.
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]

1860 

def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)

1872 

def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)
            return ord(value)

    # Not a valid named character; backtrack and report failure.
    source.pos = saved_pos
    return None

1889 

def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference."
    source.expect("<")
    name = parse_name(source, True, True)
    source.expect(">")

    if name.isdigit():
        # A numeric reference must be within the pattern's group count.
        index = int(name)
        if not 0 <= index <= pattern.groups:
            raise error("invalid group reference", source.string, source.pos)
        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")

1907 

1908# The regular expression is parsed into a syntax tree. The different types of 

1909# node are defined below. 

1910 

# Indentation unit used by the nodes' dump methods.
INDENT = "  "
# Bit flags encoded into the compiled opcodes (see the _compile methods).
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored above the boolean flag bits.
ENCODING_OP_SHIFT = 5

# Text fragments used when dumping nodes.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}

1922 

def make_sequence(items):
    "Wraps the items in a Sequence unless there's only one of them."
    return items[0] if len(items) == 1 else Sequence(items)

1927 

1928# Common base class for all nodes. 

class RegexBase:
    "Common base class for all syntax-tree nodes."

    def __init__(self):
        # _key is used for hashing and equality; subclasses extend it with
        # their distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns an equivalent node with the given attributes, reusing this
        # node when nothing would change.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    # The methods below are defaults which subclasses override as needed.

    def fix_groups(self, pattern, reverse, fuzzy):
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node can't provide a first-character set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Default: no required string (None), just the width to skip.
        return self.max_width(), None

1997 

1998# Base class for zero-width nodes. 

class ZeroWidthBase(RegexBase):
    "Base class for zero-width nodes."

    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node consumes no characters.
        return set([None])

    def _compile(self, reverse, fuzzy):
        # Encode the node's properties as bit flags on the opcode.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0

2027 

class Any(RegexBase):
    "Node compiled to OP.ANY (the '.' pattern)."

    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        # The opcode depends on the matching direction.
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Matches exactly one character.
        return 1

2046 

class AnyAll(Any):
    "Variant of Any compiled to OP.ANY_ALL."

    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Doesn't call RegexBase.__init__; sets the attributes directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive

2057 

class AnyU(Any):
    """Variant of Any using the ANY_U opcode (presumably the Unicode
    line-break-aware form of '.' — confirm against the opcode semantics)."""
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"

2061 

class Atomic(RegexBase):
    """An atomic (non-backtracking) group wrapping a single subpattern.

    Nearly every operation simply delegates to the wrapped subpattern;
    compilation brackets the subpattern's code with ATOMIC/END.
    """

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)
        # An empty subpattern makes the atomic wrapper pointless.
        return self.subpattern if self.subpattern.is_empty() else self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        return [(OP.ATOMIC, )] + body + [(OP.END, )]

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        # Compares the wrapped subpatterns, not the base-class key.
        if type(self) is not type(other):
            return False
        return self.subpattern == other.subpattern

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

2117 

class Boundary(ZeroWidthBase):
    """Zero-width word-boundary assertion (BOUNDARY opcode, i.e. \\b)."""
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"

2121 

class Branch(RegexBase):
    """A set of alternatives: "a|b|c".

    `optimise` flattens nested branches, hoists any common prefix/suffix
    out of the alternatives, and collapses adjacent single-character
    alternatives into character sets.

    Bug fixed: the trailing `_flush_char_prefix` call in
    `_merge_common_prefixes` omitted the required `reverse` argument,
    which would raise TypeError whenever a non-character branch did not
    terminate the branch list.
    """

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the
                # branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        """Return a first-character precheck node, or None.

        Only possible when every branch is a case-sensitive Literal; the
        precheck is built from each literal's first (or, reversed, last)
        character.
        """
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        # Union of the branches' first sets.
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The final NEXT becomes the terminating END.
        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        """Whether splitting `items` before index `count` avoids breaking a
        full case-folded character sequence."""
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        """Suffix-side counterpart of `_can_split`."""
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be
        # grouped together if they are separated only by other branches with
        # a character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # Bug fix: this call was missing the `reverse` argument.
        Branch._flush_char_prefix(info, reverse, prefixed, order,
          new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        # A positive, case-sensitive single character.
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches, emptying `prefixed` and `order`.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members, emptying `items`.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        # Is items[i] a positive character with full case-folding flags?
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        # Do these characters together fold to a single expanding character?
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)

2513 

class CallGroup(RegexBase):
    """A call to a capture group, e.g. (?1) or (?&name)."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference to a number and validate it."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # Bug fix: `self.pattern` is never set on CallGroup (nor, apparently,
        # by RegexBase — confirm), so the original attribute access raised
        # AttributeError instead of the intended regex error.
        raise error("group reference not allowed", getattr(self, "pattern",
          None), self.position)

    def _compile(self, reverse, fuzzy):
        # `call_ref` is assigned externally once group calls are resolved.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # The called group can match arbitrarily much.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2559 

class CallRef(RegexBase):
    """Wraps a parsed subpattern with a call reference marker.

    NOTE(review): does not call RegexBase.__init__, so `_key` is never
    set — presumably these nodes are never hashed/compared; confirm.
    """

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        body = self.parsed._compile(reverse, fuzzy)
        return [(OP.CALL_REF, self.ref)] + body + [(OP.END, )]

2568 

class Character(RegexBase):
    """A single character, with optional case-insensitivity and polarity.

    The opcode is selected by (case_flags, reverse); FULLCASE alone maps
    to the case-sensitive opcode, and FULLIGNORECASE maps to the
    ignore-case opcode.
    """
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Precompute the full case-folded form; for positive full-ignorecase
        # characters this may be longer than one codepoint.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy with different polarity/case/zero-width settings.
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # A single character is already minimal.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try
            # matching the multi-character folded string.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        # Polarity-aware comparison against a single codepoint.
        return (ch == self.value) == self.positive

    def max_width(self):
        # May exceed 1 when full case-folding expands the character.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self

2642 

class Conditional(RegexBase):
    """A conditional subpattern: (?(group)yes|no)."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the tested group to a number and fix both arms."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # Bug fix: previously returned None; callers rebind the result
        # (node = node.remove_captures()), as every other node type does.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either arm may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            # Only emit the alternative arm when it compiles to something.
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2731 

class DefaultBoundary(ZeroWidthBase):
    """Word-boundary assertion using the default (non-Unicode) word rules."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

2735 

class DefaultEndOfWord(ZeroWidthBase):
    """End-of-word assertion using the default word rules."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

2739 

class DefaultStartOfWord(ZeroWidthBase):
    """Start-of-word assertion using the default word rules."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

2743 

class EndOfLine(ZeroWidthBase):
    """End-of-line assertion ('$' in MULTILINE mode)."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

2747 

class EndOfLineU(EndOfLine):
    """Unicode-line-break variant of the end-of-line assertion."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

2751 

class EndOfString(ZeroWidthBase):
    """Absolute end-of-string assertion (\\Z)."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

2755 

class EndOfStringLine(ZeroWidthBase):
    """End of string, optionally before a final newline ('$' default)."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

2759 

class EndOfStringLineU(EndOfStringLine):
    """Unicode-line-break variant of EndOfStringLine."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

2763 

class EndOfWord(ZeroWidthBase):
    """End-of-word assertion (\\m..\\M family)."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"

2767 

class Failure(ZeroWidthBase):
    """A node that always fails to match ((?!) / (*FAIL))."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # Unlike other zero-width opcodes, FAILURE takes no flags, so the
        # inherited flag-building _compile is bypassed.
        return [(OP.FAILURE, )]

2773 

class Fuzzy(RegexBase):
    """A fuzzy-matching region: subpattern{constraints}.

    `constraints` maps error kinds ("d"eletion, "i"nsertion,
    "s"ubstitution, "e" for any error) to (min, max) pairs, plus a "cost"
    dict of per-kind coefficients and a "max" total cost.  The
    constructor normalises the dict in place, in a specific order, so
    the defaulting steps below must not be reordered.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            # A fuzzy region with a test pattern uses the extended opcode.
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints back into pattern-like text for dump()."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)

2906 

class Grapheme(RegexBase):
    """Matches one extended grapheme cluster (\\X)."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached.
        # Note that this is the same whether matching forwards or backwards.
        matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))
        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print(INDENT * indent + "GRAPHEME")

    def max_width(self):
        # A cluster has no fixed upper length.
        return UNLIMITED

2921 

class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary; note it is not a RegexBase
    subclass, so it supports only `compile`."""
    def compile(self, reverse, fuzzy):
        # Same opcode forwards and backwards; the argument 1 is its flag.
        return [(OP.GRAPHEME_BOUNDARY, 1)]

2925 

class GreedyRepeat(RegexBase):
    """Greedy repetition of a subpattern: {min,max}, max None = unbounded."""
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        optimised = self.subpattern.optimise(info, reverse)
        # type(self) keeps subclasses (e.g. PossessiveRepeat) intact.
        return type(self)(optimised, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)
        return fs

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        if not body:
            return []

        bound = UNLIMITED if self.max_count is None else self.max_count
        return [(self._opcode, self.min_count, bound)] + body + [(OP.END, )]

    def dump(self, indent, reverse):
        bound = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, bound))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        if type(self) is not type(other):
            return False
        return (self.subpattern, self.min_count, self.max_count) == \
          (other.subpattern, other.min_count, other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED
        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional repeats contribute no required string.
            width = self.subpattern.max_width() * max_count
            return min(width, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        width = self.subpattern.max_width() * max_count
        return min(width, UNLIMITED), None

3017 

class PossessiveRepeat(GreedyRepeat):
    """A possessive repetition: greedy, but never gives anything back."""

    def is_atomic(self):
        # Possessive repeats never backtrack, so they are always atomic.
        return True

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        if not body:
            return []

        bound = UNLIMITED if self.max_count is None else self.max_count
        return ([(OP.ATOMIC, ), (self._opcode, self.min_count, bound)] +
          body + [(OP.END, ), (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        bound = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, bound))

        self.subpattern.dump(indent + 2, reverse)

3047 

class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        # Record where (and in which direction/mode) this group is defined.
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Optimise the body but keep the group wrapper itself.
        return Group(self.info, self.group, self.subpattern.optimise(info,
          reverse))

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping captures dissolves the group, leaving only its body.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        public_group = private_group = self.group
        if private_group < 0:
            # Negative numbers denote private (hidden) groups: map to the
            # public number and to a slot past the normal group range.
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group is the target of a subpattern call, wrap it in a
        # CALL_REF so the caller can find it.
        ref = self.info.call_refs.get((self.group, reverse, fuzzy))
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3129 

class Keep(ZeroWidthBase):
    """The \\K assertion: discards text matched so far from the result."""
    _opcode = OP.KEEP
    _op_name = "KEEP"

3133 

class LazyRepeat(GreedyRepeat):
    """A lazy (minimal-match) repeat; differs from greedy only in opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"

3137 

class LookAround(RegexBase):
    """A lookaround assertion: (?=...), (?!...), (?<=...) or (?<!...)."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The body is matched in the assertion's own direction, not the
        # enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on an empty body always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion looking in the scan direction constrains
        # the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        if type(self) is not type(other):
            return False

        return ((self.behind, self.positive, self.subpattern) ==
          (other.behind, other.positive, other.subpattern))

    def max_width(self):
        # Assertions consume no text.
        return 0

3205 

class LookAroundConditional(RegexBase):
    """A conditional based on a lookaround: (?(?=...)yes|no) and friends.

    The lookaround `subpattern` is tested (in its own direction); `yes_item`
    is matched if it succeeds, otherwise `no_item`.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        # Fix: include 'behind' and 'positive' in the comparison, consistent
        # with LookAround.__eq__; previously two conditionals differing only
        # in direction or polarity compared equal.
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern, self.yes_item, self.no_item) == (other.behind,
          other.positive, other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        # Whichever branch is taken bounds the width; the assertion itself
        # consumes nothing.
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None

3290 

class PrecompiledCode(RegexBase):
    """A node that emits pre-built code verbatim when compiled."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # The stored code is emitted as a single instruction tuple.
        return [tuple(self.code)]

3297 

class Property(RegexBase):
    """A Unicode property test, e.g. \\p{Lu}."""

    # Opcode selection keyed on (case flags, reversed).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding is packed into the high bits of the flags word.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # The property name lives in the high 16 bits, its value in the low.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1

3353 

class Prune(ZeroWidthBase):
    """The (*PRUNE) backtracking-control verb."""
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]

3359 

class Range(RegexBase):
    """An inclusive codepoint range, e.g. [a-z]."""

    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Case-sensitive, negated or inside a set: nothing to expand.
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Full case-folding is possible only in Unicode mode with
        # FULLIGNORECASE.
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Build alternatives for the expanding characters inside this range.
        alternatives = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                alternatives.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not alternatives:
            # No expanding characters: simple case-folding suffices.
            return self

        if len(alternatives) < self.upper - self.lower + 1:
            # The range also contains non-expanding characters, so keep the
            # range itself as the first alternative.
            alternatives.insert(0, self)

        return Branch(alternatives)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1

3436 

class RefGroup(RegexBase):
    """A backreference to a group, by number or by name."""

    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a name to its group number; a numeric string becomes an int.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The key must be refreshed now that the group number is final.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is never assigned on this class —
        # reaching this line would raise AttributeError before the intended
        # error; confirm against the rest of the module.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width is unknown at this point.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3485 

class SearchAnchor(ZeroWidthBase):
    """The \\G anchor: matches at the starting position of the search."""
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"

3489 

class Sequence(RegexBase):
    """A concatenation of regex items, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten nested sequences into a single item list.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                    case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        fs = set()
        items = self.items
        if reverse:
            # Fix: iterate a reversed copy. The previous code called
            # items.reverse(), which permanently mutated self.items as a side
            # effect; _compile and get_required_string use non-mutating
            # seq[::-1] for the same purpose.
            items = items[::-1]
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Convert pending codepoints into Character/String items and clear
        the pending list in place."""
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        """Merge overlapping/adjacent (start, end) chunks after sorting."""
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None

3702 

class SetBase(RegexBase):
    """Base class for character-set operations (union, diff, etc.)."""

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for member in self.items:
            code.extend(member.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for member in self.items:
            member.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Case-sensitive, negated or nested sets need no special handling.
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Full case-folding requires Unicode mode with FULLIGNORECASE.
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Build string alternatives for the folded forms found in this set.
        alternatives = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    alternatives.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not alternatives:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + alternatives)

    def max_width(self):
        # Case-sensitive or negated sets match exactly one codepoint.
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Without full case-folding, width stays 1.
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Collect the folded forms that this set can match.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                seen.add(_regex.fold_case(FULL_CASE_FOLDING, ch))

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3807 

class SetDiff(SetBase):
    """Set difference: matches the first operand but not the second."""

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # A -- B -- C is A -- (B | C).
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        matched = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return matched == self.positive

3833 

class SetInter(SetBase):
    """Set intersection: matches only if every operand matches."""

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # Flatten a nested intersection into this one.
                items.extend(member.items)
            else:
                items.append(member)

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        matched = all(i.matches(ch) for i in self.items)
        return matched == self.positive

3863 

class SetSymDiff(SetBase):
    """Set symmetric difference: matches if an odd number of operands match."""

    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # Flatten a nested symmetric difference into this one.
                items.extend(member.items)
            else:
                items.append(member)

        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the membership of each operand.
        matched = False
        for i in self.items:
            matched = matched != i.matches(ch)

        return matched == self.positive

3896 

class SetUnion(SetBase):
    """Set union: matches if any operand matches."""

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetUnion) and member.positive:
                # Flatten a nested union into this one.
                items.extend(member.items)
            elif isinstance(member, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(member)

        # A property and its complement together cover everything.
        properties = (set(), set())

        for member in items:
            if isinstance(member, Property):
                properties[member.positive].add((member.value,
                  member.case_flags, member.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            only = items[0]
            return only.with_flags(positive=only.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch plain characters (by polarity) into STRING members for a more
        # compact encoding; everything else compiles individually.
        characters, others = defaultdict(list), []
        for member in self.items:
            if isinstance(member, Character):
                characters[member.positive].append(member.value)
            else:
                others.append(member)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for member in others:
            code.extend(member.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        matched = any(i.matches(ch) for i in self.items)
        return matched == self.positive

3974 

class Skip(ZeroWidthBase):
    """The (*SKIP) backtracking-control verb."""
    _op_name = "SKIP"
    _opcode = OP.SKIP

3978 

class StartOfLine(ZeroWidthBase):
    """The ^ anchor in multiline mode: start of a line."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"

3982 

class StartOfLineU(StartOfLine):
    """Start-of-line anchor honouring Unicode line separators."""
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"

3986 

class StartOfString(ZeroWidthBase):
    """The \\A anchor: start of the string."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"

3990 

class StartOfWord(ZeroWidthBase):
    """The \\m anchor: start of a word."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"

3994 

class String(RegexBase):
    """A literal run of codepoints, matched as a unit."""

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-fold the characters so the engine can compare folded text.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set when this string is the pattern's "required" string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the matching direction.
        pos = -1 if reverse else 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self

4049 

class Literal(String):
    "A String subclass whose dump() labels it as LITERAL MATCH."

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

4056 

class StringSet(Branch):
    """A reference to a named list, compiled as a branch of literal
    sequences.

    The list's members come from the pattern's keyword arguments
    (`info.kwargs`), and each member is turned into a sequence of
    Character nodes. Longer members are tried before shorter ones.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) pair so the pattern knows which
        # named lists it uses.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        choices = []

        for string in items:
            # Members may be given as strings or as sequences of code
            # points; normalise to code points.
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Drop the back-reference to the Info object.
        self.info = None

4097 

class Source:
    "Scanner for the regular expression source string."
    def __init__(self, string):
        # Accept either str or bytes patterns; bytes are decoded via
        # Latin-1 so that every byte maps to the code point of the same
        # value, and char_type converts code points back to the pattern's
        # original type.
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        # Current scan position within self.string.
        self.pos = 0
        # When True (VERBOSE mode), whitespace and "#" comments are skipped.
        self.ignore_space = False
        # An empty string of the pattern's original type (str or bytes).
        self.sep = string[ : 0]

    def peek(self, override_ignore=False):
        """Return the next significant character without advancing.

        In VERBOSE mode (unless override_ignore is true), whitespace and
        "#" comments are skipped first. Returns an empty string at the end
        of the pattern.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        """Return the next significant character and advance past it.

        Returns an empty string at the end of the pattern.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Return up to `count` significant characters, advancing past
        them. May return fewer than `count` at the end of the pattern.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Consume and return characters while their membership in
        `test_set` equals `include`. With keep_spaces, whitespace and
        comments are not skipped even in VERBOSE mode.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Advance past characters while their membership in `test_set`
        equals `include`, returning nothing.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """If `substring` occurs next, consume it and return True;
        otherwise return False. On failure the position is left unchanged.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping any interleaved
                # whitespace/comments before each one.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Consume `substring`, raising an error if it isn't next."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Return whether only ignorable characters (if any) remain."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True

4343 

class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        # Use a None sentinel rather than a mutable default argument for
        # `kwargs` (a shared default dict is a latent aliasing hazard).
        if kwargs is None:
            kwargs = {}

        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        # Named-list arguments passed to the pattern.
        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}
        self.group_name = {}
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Allocate (or look up) a group number and mark it as open.

        Returns the group number; a nested reuse of an already-open group
        gets a negative private alias until a proper number is assigned.
        """
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Close the most recently opened group."
        self.open_groups.pop()

    def is_open_group(self, name):
        """Return whether the group named (or numbered) `name` is open.

        Only meaningful in version 0; always False in version 1.
        """
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups

4408 

4409def _check_group_features(info, parsed): 

4410 """Checks whether the reverse and fuzzy features of the group calls match 

4411 the groups which they call. 

4412 """ 

4413 call_refs = {} 

4414 additional_groups = [] 

4415 for call, reverse, fuzzy in info.group_calls: 

4416 # Look up the reference of this group call. 

4417 key = (call.group, reverse, fuzzy) 

4418 ref = call_refs.get(key) 

4419 if ref is None: 

4420 # This group doesn't have a reference yet, so look up its features. 

4421 if call.group == 0: 

4422 # Calling the pattern as a whole. 

4423 rev = bool(info.flags & REVERSE) 

4424 fuz = isinstance(parsed, Fuzzy) 

4425 if (rev, fuz) != (reverse, fuzzy): 

4426 # The pattern as a whole doesn't have the features we want, 

4427 # so we'll need to make a copy of it with the desired 

4428 # features. 

4429 additional_groups.append((CallRef(len(call_refs), parsed), 

4430 reverse, fuzzy)) 

4431 else: 

4432 # Calling a capture group. 

4433 def_info = info.defined_groups[call.group] 

4434 group = def_info[0] 

4435 if def_info[1 : ] != (reverse, fuzzy): 

4436 # The group doesn't have the features we want, so we'll 

4437 # need to make a copy of it with the desired features. 

4438 additional_groups.append((group, reverse, fuzzy)) 

4439 

4440 ref = len(call_refs) 

4441 call_refs[key] = ref 

4442 

4443 call.call_ref = ref 

4444 

4445 info.call_refs = call_refs 

4446 info.additional_groups = additional_groups 

4447 

def _get_required_string(parsed, flags):
    "Gets the required string and related info of a parsed pattern."

    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        # No required string was found.
        return 0, (), 0

    required.required = True

    # An unbounded offset is reported as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags

4468 

class Scanner:
    """A simple lexer built on the regex engine.

    Each lexicon entry is a (phrase, action) pair; the phrases are
    combined into one compound pattern, and scan() applies the action of
    whichever phrase matches at each position.
    """
    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern. Each phrase gets
        # its own group so scan() can tell which one matched (lastindex).
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scan `string`, returning (results, remainder).

        `results` collects the non-None values produced by the actions;
        `remainder` is the unscanned tail of the string.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would never advance, so stop here.
                break
            # lastindex identifies which phrase's group matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                # Callable actions receive the scanner and the matched text;
                # the current match is exposed as self.match.
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]

4564 

# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict, keeping the longest known name
# for each property id and for each of its value ids.
PROPERTY_NAMES = {}
for property_name, (property_id, value_dict) in PROPERTIES.items():
    best_name, value_names = PROPERTY_NAMES.get(property_id, ("", {}))
    PROPERTY_NAMES[property_id] = (max(best_name, property_name, key=len),
      value_names)

    for value_name, value_id in value_dict.items():
        value_names[value_id] = max(value_names.get(value_id, ""),
          value_name, key=len)

4578 

# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Encoding selectors used when building the ASCII/Unicode escape tables
# below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# The same escapes with the encoding forced to ASCII ("h" is inherited
# unchanged from CHARSET_ESCAPES).
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})
# The same escapes with the encoding forced to Unicode.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})

# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}
# Positional escapes with the word-related assertions forced to ASCII.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})
# Positional escapes with the word-related assertions forced to Unicode.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})

# Positional escape sequences when WORD flag set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

# Regex control verbs.
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}