Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 43%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

2835 statements  

1# 

2# Secret Labs' Regular Expression Engine core module 

3# 

4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 

5# 

6# This version of the SRE library can be redistributed under CNRI's 

7# Python 1.6 license. For any other use, please contact Secret Labs 

8# AB (info@pythonware.com). 

9# 

10# Portions of this engine have been developed in cooperation with 

11# CNRI. Hewlett-Packard provided funding for 1.6 integration and 

12# other compatibility work. 

13# 

14# 2010-01-16 mrab Python front-end re-written and extended 

15 

16import enum 

17import string 

18import unicodedata 

19from collections import defaultdict 

20 

21import regex._regex as _regex 

22 

# The public names re-exported by the top-level regex module: the flag
# letters/long names, the exception class, and the Scanner helper.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]

28 

29# The regex exception. 

class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None

    NOTE: lineno and colno are only set when both pattern and pos are given;
    callers must not assume they always exist.
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be str or bytes; pick a matching newline so the
        # count/rfind calls below don't mix types.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # 1-based line number and column of the failure position.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Only mention line/column when the pattern actually spans lines.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)

58 

59# The exception for when a positional flag has been turned on in the old 

60# behaviour. 

class _UnscopedFlagSet(Exception):
    # Internal control-flow exception raised when a positional (global) flag
    # is found mid-pattern under the old (V0) behaviour; never shown to users.
    pass

63 

64# The exception for when parsing fails and we want to try something else. 

class ParseError(Exception):
    # Internal exception used to backtrack: "this parse attempt failed,
    # rewind and try an alternative interpretation".
    pass

67 

68# The exception for when there isn't a valid first set. 

class _FirstSetError(Exception):
    # Internal exception raised when a usable first set cannot be built.
    pass

71 

72# Flags. 

class RegexFlag(enum.IntFlag):
    """The regex compilation flags; each has a 1-letter and a long alias."""
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named flag reprs as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        # Otherwise decompose the value into member flags joined by "|".
        value = self._value_
        members = []
        negative = value < 0

        if negative:
            # Show a negative value as the complement of its bits.
            value = ~value

        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            # Bits left over that match no member.
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Keep the plain int-style str() rather than IntFlag's.
    __str__ = object.__str__

123 

# Re-export every flag member (ASCII, IGNORECASE, ...) as a module global.
globals().update(RegexFlag.__members__)

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used while parsing patterns.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# NOTE: "" is included so that end-of-pattern (source.get() returning "")
# is treated as a special character by the parser.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# Normalisation map: FULLCASE without IGNORECASE has no effect.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

175 

# The names of the opcodes. Order matters: each opcode's number is its
# position in this list (see the enumerate loop below), and must agree with
# the engine in regex._regex.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""

# Define the opcodes in a namespace.
class Namespace:
    # Empty attribute holder; opcode numbers are attached below.
    pass

OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    # e.g. OP.FAILURE == 0, OP.SUCCESS == 1, ...
    setattr(OP, op, i)

268 

269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5): 

270 """Make room in the given cache. 

271 

272 Args: 

273 cache_dict: The cache dictionary to modify. 

274 args_dict: The dictionary of named list args used by patterns. 

275 max_length: Maximum # of entries in cache_dict before it is shrunk. 

276 divisor: Cache will shrink to max_length - 1/divisor*max_length items. 

277 """ 

278 # Toss out a fraction of the entries at random to make room for new ones. 

279 # A random algorithm was chosen as opposed to simply cache_dict.popitem() 

280 # as popitem could penalize the same regular expression repeatedly based 

281 # on its internal hash value. Being random should spread the cache miss 

282 # love around. 

283 cache_keys = tuple(cache_dict.keys()) 

284 overage = len(cache_keys) - max_length 

285 if overage < 0: 

286 # Cache is already within limits. Normally this should not happen 

287 # but it could due to multithreading. 

288 return 

289 

290 number_to_toss = max_length // divisor + overage 

291 

292 # The import is done here to avoid a circular dependency. 

293 import random 

294 if not hasattr(random, 'sample'): 

295 # Do nothing while resolving the circular dependency: 

296 # re->random->warnings->tokenize->string->re 

297 return 

298 

299 for doomed_key in random.sample(cache_keys, number_to_toss): 

300 try: 

301 del cache_dict[doomed_key] 

302 except KeyError: 

303 # Ignore problems if the cache changed from another thread. 

304 pass 

305 

306 # Rebuild the arguments and locale-sensitivity dictionaries. 

307 args_dict.clear() 

308 sensitivity_dict = {} 

309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict): 

310 args_dict[pattern, pattern_type, flags, default_version, locale] = args 

311 try: 

312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern] 

313 except KeyError: 

314 pass 

315 

316 locale_sensitive.clear() 

317 locale_sensitive.update(sensitivity_dict) 

318 

def _fold_case(info, string):
    "Folds the case of a string."
    effective = info.flags
    # Without an explicit encoding flag, fall back to the guessed encoding.
    if not effective & _ALL_ENCODINGS:
        effective |= info.guess_encoding
    return _regex.fold_case(effective, string)

326 

def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A cased character has more than one entry in its case-variant list.
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1

330 

def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Same test as is_cased_i, but taking the flags directly.
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1

334 

def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    fs = _check_firstset(info, reverse, fs)
    # No usable firstset (or it matches everything): return no code.
    if not fs or isinstance(fs, AnyAll):
        return []

    # Compile the firstset.
    return fs.compile(reverse)

344 

def _check_firstset(info, reverse, fs):
    """Checks the firstset for the pattern.

    Returns a set node usable as the firstset, or None when no usable
    firstset can be built (empty set, unknown member, negated character,
    or full case-folding in effect).
    """
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        # Accumulate the case flags and store members case-neutrally.
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding makes a simple firstset unusable.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs

375 

376def _flatten_code(code): 

377 "Flattens the code from a list of tuples." 

378 flat_code = [] 

379 for c in code: 

380 flat_code.extend(c) 

381 

382 return flat_code 

383 

def make_case_flags(info):
    "Makes the case flags."
    case_flags = info.flags & CASE_FLAGS

    # ASCII mode has no full case-folding, so drop FULLCASE there.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE

    return case_flags

393 

def make_character(info, value, in_set=False):
    "Makes a character literal."
    # Inside a set the character is built case-sensitively; outside, it
    # carries the current case flags.
    if in_set:
        return Character(value)
    return Character(value, case_flags=make_case_flags(info))

401 

def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)

405 

def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)

409 

def make_property(info, prop, in_set):
    "Makes a property."
    # Properties inside a set stay as-is; outside, attach the case flags.
    return prop if in_set else prop.with_flags(case_flags=make_case_flags(info))

416 

def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the "|"-separated alternatives.
    alternatives = [parse_sequence(source, info)]
    while source.match("|"):
        alternatives.append(parse_sequence(source, info))

    # A single alternative needs no Branch wrapper.
    return alternatives[0] if len(alternatives) == 1 else Branch(alternatives)

426 

def parse_sequence(source, info):
    """Parses a sequence, eg. 'abc'.

    The sequence list uses None as a sentinel marking "no quantifiable
    element here": it starts with one, and another is pushed after each
    applied quantifier/constraint so that e.g. 'a**' is rejected.
    """
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        # NOTE: "" (end of pattern) is in SPECIAL_CHARS and is handled by
        # the ')|' branch below.
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag: recompute the case flags.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None sentinels before building the node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)

510 

def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Wraps the last element of the sequence in a repeat node.

    counts is the (min, max) pair already parsed; the optional "?"/"+"
    suffix selecting lazy/possessive repetition is consumed here.
    """
    element = sequence.pop()
    if element is None:
        # The sentinel: either the previous element was already quantified
        # ("multiple repeat") or there is nothing before the quantifier.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)

542 

def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Wraps the last element of the sequence in a Fuzzy node."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(target, Group):
        # If a group is marked as fuzzy then put all of the fuzzy part in
        # the group.
        target.subpattern = Fuzzy(target.subpattern, constraints)
    else:
        target = Fuzzy(target, constraints)

    sequence.append(target)

556 

# The (min, max) repeat counts for the shorthand quantifiers.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

558 

def parse_quantifier(source, info, ch):
    """Parses a quantifier.

    Returns a (min, max) pair, or None if ch does not start a quantifier.
    """
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of the shorthand quantifiers "?", "*" or "+".
        return counts

    if ch == "{":
        # Looks like a limited repeated element, eg. 'a{2,3}'; this returns
        # None if it turns out not to be one.
        return parse_limited_quantifier(source)

    return None

573 

def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unlimited" in the (min, max) pairs and is never too big.
    if count is None:
        return False
    return count >= UNLIMITED

577 

def parse_limited_quantifier(source):
    """Parses a limited quantifier such as '{2,3}' (the '{' is already read).

    Returns (min, max) with max None for unlimited, or None (with the
    source position rewound) when the braces don't form a quantifier.
    """
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # '{' followed by neither digits nor ',': not a quantifier.
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    if not source.match ("}"):
        # No closing brace: treat the whole thing as literal text.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count

607 

def parse_fuzzy(source, info, ch, case_flags):
    """Parses a fuzzy setting, if present.

    Returns the constraints dict, or None (with the source rewound) when
    the braces don't contain a fuzzy setting.
    """
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = saved_pos
        return None

    if source.match(":"):
        # An optional test element follows the constraints.
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints

631 

def parse_fuzzy_item(source, constraints):
    """Parses a fuzzy setting item.

    Tries a simple cost constraint first; if that fails, rewinds and
    parses a cost equation instead.
    """
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        source.pos = saved_pos

        parse_cost_equation(source, constraints)

641 

def parse_cost_constraint(source, constraints):
    """Parses a cost constraint, eg. 'e<=2' or '1<=e<=3'.

    Stores a (min_cost, max_cost) pair in constraints; raises ParseError
    when the text isn't a cost constraint (so the caller can backtrack).
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()

702 

def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count returns "" when no digits are present, which is an error
    # here; a non-empty digit string always converts cleanly.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, cost_pos)

714 

def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # A constraint is one of the letters "d", "e", "i" or "s" and may not
    # be repeated within the same fuzzy setting.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch

724 

def parse_fuzzy_compare(source):
    """Parses a cost comparator.

    Returns True for "<=" (inclusive), False for "<" (exclusive), or None
    when no comparator is present.
    """
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None

733 

def parse_cost_equation(source, constraints):
    """Parses a cost equation, eg. '2i+2d+1s<=4'.

    Stores the per-operation coefficients and the maximum total cost under
    constraints["cost"].
    """
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive "<" limit becomes an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost

760 

def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i' (a coefficient and an operation)."
    coeff = parse_count(source)
    ch = source.get()
    # Only deletions, insertions and substitutions may appear in a cost
    # equation, and each at most once.
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1

772 

def parse_fuzzy_test(source, info, case_flags):
    """Parses the test element of a fuzzy setting (after the ':').

    Accepts an escape, '.', a character set or a literal character;
    anything else is an error.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)

798 

def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the run of digits at the current position, "" if none.
    return source.get_while(DIGITS)

802 

def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.

    Dispatches on the characters after '(': '?' introduces the extended
    syntaxes, '*' introduces a verb, and anything else is an unnamed
    capture group.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags end with the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)

892 

def parse_extension(source, info):
    """Parses a Python extension, i.e. the syntaxes introduced by '(?P'."""
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags end with the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)

928 

def parse_comment(source):
    """Parses a comment '(?#...)'.

    Skips to the first unescaped ')' and consumes it; always returns None
    so the caller treats the comment as producing no element.
    """
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            break

        if c == "\\":
            # Skip the escaped character too.
            c = source.get(True)

    # Step back so expect() consumes the ')' (or reports it missing).
    source.pos = saved_pos
    source.expect(")")

    return None

945 

def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround (lookahead or lookbehind, positive or negative)."
    outer_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any scoped flags set inside the lookaround end here.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)

957 

def parse_conditional(source, info):
    """Parses a conditional subpattern '(?(group)yes|no)' or a lookaround
    conditional '(?(?=...)yes|no)'.
    """
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # A group conditional: re-read the name/number from the start.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional matches nothing either way.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)

1000 

def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a lookaround conditional, eg. '(?(?=test)yes|no)'."""
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags from the lookaround end here.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # A missing "no" branch matches the empty string.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)

1020 

def parse_atomic(source, info):
    "Parses an atomic subpattern '(?>...)'."
    outer_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any scoped flags set inside the subpattern end here.
        info.flags = outer_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)

1032 

def parse_common(source, info):
    "Parses a common groups branch, '(?|...)'."
    # Capture group numbers in different branches can reuse the group numbers.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        # Each branch restarts numbering from the same point.
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    # The overall count is that of the branch that used the most groups.
    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)

1050 

def parse_call_group(source, info, ch, pos):
    "Parses a call to a group, eg. '(?R)' or '(?2)'."
    # "R" is the whole-pattern recursion, which is group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)

1061 

def parse_call_named_group(source, info, pos):
    "Parses a call to a named group: (?&name)."
    group_name = parse_name(source)
    source.expect(")")
    return CallGroup(info, group_name, pos)

1068 

def parse_flag_set(source):
    "Parses a set of inline flags, stopping at the first non-flag character."
    flags = 0

    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch == "V":
            # Version flags are two characters: "V0" or "V1".
            ch += source.get()
        try:
            flags |= REGEX_FLAGS[ch]
        except KeyError:
            # Not a flag character; put it back and stop.
            source.pos = saved_pos
            break

    return flags

1084 

def parse_flags(source, info):
    "Parses flags being turned on/off, returning (flags_on, flags_off)."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off

1101 

def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags: (?flags:...)."
    # The flag changes apply only inside this group; restore them afterwards.
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this re-ORs all of flags_on, not just its encoding bits;
    # looks intentional (flags_on contains at most one encoding flag after
    # validation) — confirm against parse_flags_subpattern.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the outer flags even if parsing raised.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern

1120 

def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    # Global flags can never be turned off once set.
    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        # (The caller catches _UnscopedFlagSet and restarts from scratch with
        # the new global flags in effect.)
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # (?flags:...) — a group with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # (?flags) — positional flags affecting the rest of the pattern.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)

1155 

def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags: (?flags) affecting the rest of the pattern."
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)

1160 

def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name, optionally allowing a numeric reference."
    # A name runs up to (but not including) ")" or ">".
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string,
          source.pos)

    return name

1179 

def is_octal(string):
    "Checks whether a string consists entirely of octal digits."
    return not any(ch not in OCT_DIGITS for ch in string)

1183 

def is_decimal(string):
    "Checks whether a string consists entirely of decimal digits."
    return not any(ch not in DIGITS for ch in string)

1187 

def is_hexadecimal(string):
    "Checks whether a string consists entirely of hexadecimal digits."
    return not any(ch not in HEX_DIGITS for ch in string)

1191 

def parse_escape(source, info, in_set):
    "Parses an escape sequence (the backslash has already been consumed)."
    # Read the escape character with whitespace significant, even in VERBOSE
    # mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        # \R matches CR LF as a unit (atomically), or any single line-ending
        # codepoint from the charset above.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The meaning of \b etc. depends on the WORD/ASCII/UNICODE flags.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes (\d, \w, \s, ...) also depend on the encoding flags.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes such as \n and \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        # Unknown alphabetic escapes are errors, not literals.
        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)

1274 

def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence (octal escape or group reference)."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits. Inside a set a digit escape is
        # always octal; outside, only if it starts with "0".
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Byte patterns wrap octal values into a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: push back the last character read.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)

1307 

def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence (up to 3 octal digits)."
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed: the first digit wasn't octal (e.g. \8 or \9), or the
        # sequence was otherwise malformed.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)

1328 

def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence of a fixed number of digits."
    saved_pos = source.pos
    digits = []
    for _ in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    text = ''.join(digits)
    try:
        value = int(text, 16)
    except ValueError:
        value = None

    # Only codepoints below the Unicode limit are valid.
    if value is not None and value < 0x110000:
        return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, text), source.string,
      saved_pos)

1351 

def parse_group_ref(source, info):
    "Parses a group reference: \\g<name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    # A group can't refer to itself while still being parsed.
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)

1362 

def parse_string_set(source, info):
    "Parses a string set reference: \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    # The name must match a keyword argument supplied to the pattern.
    if name is None or name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)

1372 

def parse_named_char(source, info, in_set):
    "Parses a named character: \\N{NAME}."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names.
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; backtrack and treat as a literal "N".
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)

1388 

def _property_encoding(info):
    "Returns the property-encoding constant implied by the current flags."
    if info.flags & ASCII:
        return ASCII_ENCODING
    if info.flags & UNICODE:
        return UNICODE_ENCODING
    return 0

def parse_property(source, info, positive, in_set):
    """Parses a Unicode property: \\p{...}, \\P{...} or abbreviated \\pL.

    'positive' is True for \\p and False for \\P; an explicit "^" inside the
    braces negates again. Falls back to a literal "p"/"P" when the input
    isn't actually a property.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=_property_encoding(info))
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source,
          encoding=_property_encoding(info))
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)

1424 

def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns (prop_name, name): prop_name is None for an unqualified name,
    otherwise the part before ":" or "=".
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            # (saved_pos still points before the ":"/"=", so it is re-read by
            # the caller.)
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind to just after the last character that belongs to the name.
    source.pos = saved_pos
    return prop_name, name

1446 

def parse_set(source, info):
    "Parses a character set: [...] (the opening bracket is already consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is always significant inside a set, even in VERBOSE mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # VERSION1 sets support set operations (||, ~~, &&, --).
        parser = parse_set_imp_union if version == VERSION0 else parse_set_union
        item = parser(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))

1472 

def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    items = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        items.append(parse_set_symm_diff(source, info))

    return items[0] if len(items) == 1 else SetUnion(info, items)

1482 

def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    items = [parse_set_inter(source, info)]
    while source.match("~~"):
        items.append(parse_set_inter(source, info))

    return items[0] if len(items) == 1 else SetSymDiff(info, items)

1492 

def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    items = [parse_set_diff(source, info)]
    while source.match("&&"):
        items.append(parse_set_diff(source, info))

    return items[0] if len(items) == 1 else SetInter(info, items)

1502 

def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    items = [parse_set_imp_union(source, info)]
    while source.match("--"):
        items.append(parse_set_imp_union(source, info))

    return items[0] if len(items) == 1 else SetDiff(info, items)

1512 

def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy]): adjacent members unioned together."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; stop before one so the
            # caller can handle it.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)

1535 

def parse_set_member(source, info):
    "Parses a member in a character set: a single item or a range like a-z."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range like a-a is just the character.
        return start

    return Range(start.value, end.value)

1576 

def parse_set_item(source, info):
    "Parses an item in a character set: escape, POSIX class, nested set, or literal."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class; backtrack and treat "[" normally.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set (VERSION1 only).

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))

1614 

def parse_posix_class(source, info):
    "Parses a POSIX character class: [:name:]."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if not source.match(":]"):
        # Not terminated by ":]", so not a POSIX class after all; the caller
        # backtracks on ParseError.
        raise ParseError()

    return lookup_property(prop_name, name, not negate, source, posix=True)

1623 

def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the fractional
    remainder is below 0.0001.
    """
    int_part = int(flt)
    # Renamed from 'error': the old name shadowed the module's error class.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    # Recurse on the reciprocal of the remainder (continued fraction).
    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den

1634 

def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        # More than one "/" isn't a valid rational.
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # "n/1" is reported as just the integer n.
    return result[ : -2] if result.endswith("/1") else result

1655 

def standardise_name(name):
    "Standardises a property or value name for lookup."
    try:
        # Numeric-valued names are normalised to a rational string.
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Otherwise strip separators and uppercase for comparison.
        return "".join(ch for ch in name if ch not in "_- ").upper()

1662 

# Names that get a "POSIX" prefix when looked up via a [[:...:]] class
# (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full set of value names indicating a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

1666 

def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property, returning a Property node.

    'property' may be None, in which case 'value' alone is resolved through a
    series of fallbacks: GC/script/block value, binary property name, then
    "IS"/"IN"-prefixed script or block names.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the complement of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property ids pack the property in the high 16 bits and the value in
        # the low 16 bits.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            # val_id 1 is the "yes" value of a binary property.
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        # Non-binary property named without a value: match "not value 0".
        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    # (Both prefixes are 2 characters, hence the fixed value[2:] slice.)
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)

1736 

def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): is_group is True when 'codes' holds a group
    index; otherwise 'codes' is a list of literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence (\u/\U only in unicode templates).
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values are wrapped to a byte in bytes templates.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]

1808 

def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)

1820 

def parse_repl_named_char(source):
    "Parses a named character in a replacement string; None if malformed."
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                # unicodedata.lookup raises KeyError for unknown names.
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; backtrack and report no named character.
    source.pos = saved_pos
    return None

1837 

def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if not name.isdigit():
        # A named group; unknown names raise IndexError, as in 're'.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index

1855 

# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the node dump() methods.
INDENT = "  "
# Flag bits packed into compiled opcode tuples.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored above the flag bits.
ENCODING_OP_SHIFT = 5

# Text fragments used by dump() for polarity and case flags.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}

1870 

def make_sequence(items):
    "Wraps a list of nodes in a Sequence, unless it's a single node."
    if len(items) != 1:
        return Sequence(items)
    return items[0]

1875 

1876# Common base class for all nodes. 

class RegexBase:
    """Common base class for all syntax-tree nodes.

    Subclasses override the tree-transformation hooks (optimise,
    pack_characters, remove_captures, ...) and the compilation hooks
    (_compile, max_width, get_firstset, ...) as needed; the defaults here are
    the no-op/leaf behaviours.
    """
    def __init__(self):
        # _key identifies the node for equality/hashing; subclasses extend it
        # with their distinguishing attributes.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self unchanged when nothing differs, otherwise a rebuilt
        # copy (subclasses provide rebuild()).
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise the case flags to a canonical combination.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Leaf nodes have no group references to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default a node can't report its first-character set.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # (offset, required-string); None means no required string.
        return self.max_width(), None

1945 

1946# Base class for zero-width nodes. 

class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Matches without consuming a character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        # Pack polarity, fuzziness, direction and encoding into the opcode
        # flags.
        flags = POSITIVE_OP if self.positive else 0
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0

1975 

class Any(RegexBase):
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # The only flag carried is fuzziness; direction selects the opcode.
        return [(self._opcode[reverse], FUZZY_OP if fuzzy else 0)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1

1994 

class AnyAll(Any):
    # "." with DOTALL semantics (exact matching behaviour is defined by the
    # engine opcode).
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # NOTE: doesn't call Any/RegexBase __init__; sets the standard node
        # attributes directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive

2005 

class AnyU(Any):
    # Unicode variant of ANY (exact matching behaviour is defined by the
    # engine opcode).
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"

2009 

class Atomic(RegexBase):
    """An atomic (non-backtracking) subpattern: (?>...).

    Most hooks simply delegate to the wrapped subpattern.
    """
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An empty atomic group is just an empty sequence.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # The subpattern is bracketed by ATOMIC ... END opcodes.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

class Boundary(ZeroWidthBase):
    # \b: a zero-width word boundary.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"

2069 

class Branch(RegexBase):
    """A set of alternatives: a|b|c.

    The optimiser flattens nested branches, hoists common prefixes and
    suffixes out of the alternatives, and merges adjacent
    single-character alternatives into sets.
    """

    def __init__(self, branches):
        RegexBase.__init__(self)
        self.branches = branches

    def fix_groups(self, pattern, reverse, fuzzy):
        for b in self.branches:
            b.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the
                # branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)

    def _add_precheck(self, info, reverse, branches):
        # A precheck is only possible if every branch is a case-sensitive
        # literal; collect the first (or last, when reversed) character of
        # each.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in
          charset])

    def pack_characters(self, info):
        self.branches = [b.pack_characters(info) for b in self.branches]
        return self

    def remove_captures(self):
        self.branches = [b.remove_captures() for b in self.branches]
        return self

    def is_atomic(self):
        return all(b.is_atomic() for b in self.branches)

    def can_be_affix(self):
        return all(b.can_be_affix() for b in self.branches)

    def contains_group(self):
        return any(b.contains_group() for b in self.branches)

    def get_firstset(self, reverse):
        fs = set()
        for b in self.branches:
            fs |= b.get_firstset(reverse)

        return fs or set([None])

    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # Replace the final NEXT with END.
        code[-1] = (OP.END, )

        return code

    def dump(self, indent, reverse):
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)

    @staticmethod
    def _flatten_branches(info, reverse, branches):
        # Flatten the branches so that there aren't branches of branches.
        new_branches = []
        for b in branches:
            b = b.optimise(info, reverse)
            if isinstance(b, Branch):
                new_branches.extend(b.branches)
            else:
                new_branches.append(b)

        return new_branches

    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches

    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches

    @staticmethod
    def _can_split(items, count):
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True

    @staticmethod
    def _can_split_rev(items, count):
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True

    @staticmethod
    def _merge_common_prefixes(info, reverse, branches):
        # Branches with the same case-sensitive character prefix can be
        # grouped together if they are separated only by other branches with
        # a character prefix.
        prefixed = defaultdict(list)
        order = {}
        new_branches = []
        for b in branches:
            if Branch._is_simple_character(b):
                # Branch starts with a simple character.
                prefixed[b.value].append([b])
                order.setdefault(b.value, len(order))
            elif (isinstance(b, Sequence) and b.items and
              Branch._is_simple_character(b.items[0])):
                # Branch starts with a simple character.
                prefixed[b.items[0].value].append(b.items)
                order.setdefault(b.items[0].value, len(order))
            else:
                Branch._flush_char_prefix(info, reverse, prefixed, order,
                  new_branches)

                new_branches.append(b)

        # BUG FIX: the 'reverse' argument was previously omitted from this
        # final flush, which made the call raise TypeError when reached.
        Branch._flush_char_prefix(info, reverse, prefixed, order,
          new_branches)

        return new_branches

    @staticmethod
    def _is_simple_character(c):
        # A positive Character with no case flags.
        return isinstance(c, Character) and c.positive and not c.case_flags

    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches

    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        # Flush the prefixed branches, preserving first-seen order.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()

    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()

    @staticmethod
    def _is_full_case(items, i):
        # Is the item at index i a positive Character with full
        # case-folding enabled?
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

    @staticmethod
    def _is_folded(items):
        # Do these characters together form the full case-folding of some
        # single character?
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False

    def is_empty(self):
        return all(b.is_empty() for b in self.branches)

    def __eq__(self, other):
        return type(self) is type(other) and self.branches == other.branches

    def max_width(self):
        return max(b.max_width() for b in self.branches)

2461 

class CallGroup(RegexBase):
    """A subpattern (group) call: (?n) or (?&name)."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a name to a group number; validate the number.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        # The group may have changed from a name to a number, so rebuild
        # the key.
        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: this previously referenced the undefined name 'pattern',
        # raising NameError instead of the intended error.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # A called group can match anything up to unlimited width.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2507 

class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be the target of a group call."""

    def __init__(self, ref, parsed):
        self.parsed = parsed
        self.ref = ref

    def _compile(self, reverse, fuzzy):
        body = self.parsed._compile(reverse, fuzzy)
        return [(OP.CALL_REF, self.ref)] + body + [(OP.END, )]

2516 

class Character(RegexBase):
    """A single character, either positive ("a") or negative ("[^a]").

    The opcode depends on the combination of case flags and match
    direction (reverse or not).
    """
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Under full case-folding a single character may fold to several
        # codepoints (e.g. sharp-s folds to "ss").
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy of this node with different flags.
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match its multi-character folded form.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        # May be > 1 when the character expands on full case-folding.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self

2590 

class Conditional(RegexBase):
    """A conditional subpattern: (?(group)yes|no)."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a name to a group number; validate the number.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: previously this returned None, but callers assign the
        # result (node = node.remove_captures()), which would replace this
        # node with None. Return self, as every sibling class does.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

2679 

class DefaultBoundary(ZeroWidthBase):
    # \b using the default (non-Unicode-aware) word definition.
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"

2683 

class DefaultEndOfWord(ZeroWidthBase):
    # End of a word using the default word definition.
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"

2687 

class DefaultStartOfWord(ZeroWidthBase):
    # Start of a word using the default word definition.
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"

2691 

class EndOfLine(ZeroWidthBase):
    # $ in MULTILINE mode: end of a line.
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"

2695 

class EndOfLineU(EndOfLine):
    # End of a line under Unicode line-break rules.
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"

2699 

class EndOfString(ZeroWidthBase):
    # \Z: end of the string.
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"

2703 

class EndOfStringLine(ZeroWidthBase):
    # $ outside MULTILINE mode: end of string, or before a final newline.
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"

2707 

class EndOfStringLineU(EndOfStringLine):
    # Unicode variant of EndOfStringLine.
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"

2711 

class EndOfWord(ZeroWidthBase):
    # \M / end-of-word assertion.
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"

2715 

class Failure(ZeroWidthBase):
    # (*FAIL): always fails to match.
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]

2721 

class Fuzzy(RegexBase):
    """Fuzzy (approximate) matching of a subpattern, e.g. (?:...){e<=2}.

    The constraints dict maps the error types — deletions ("d"),
    insertions ("i"), substitutions ("s") and total errors ("e") — to
    (min, max) pairs, where a max of None means unlimited, plus a "cost"
    dict holding the per-error-type cost coefficients and the overall
    cost limit ("max"). Missing entries are filled in with defaults by
    __init__; the defaulting order below is significant.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the matched width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        # Render the constraints in the pattern's "{...}" syntax for dump().
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)

2854 

class Grapheme(RegexBase):
    """\X: a single extended grapheme cluster."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        # A grapheme cluster can contain any number of codepoints.
        return UNLIMITED

2869 

class GraphemeBoundary:
    # Zero-width assertion at a grapheme-cluster boundary; a minimal node
    # that doesn't inherit from RegexBase (it only needs compile()).
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]

2873 

class GreedyRepeat(RegexBase):
    """A greedy repeat: X*, X+, X{m,n}.

    A max_count of None means unbounded. LazyRepeat and PossessiveRepeat
    subclass this, overriding only the opcode and/or compilation.
    """
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) so that subclasses rebuild as themselves.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # A fixed repeat count can't backtrack on the number of
        # repetitions.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional, so nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None

class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat: X*+, X++, X{m,n}+.

    Compiled as a greedy repeat wrapped in an atomic group, so the engine
    never backtracks into it.
    """

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC ( REPEAT ... END ) END
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)

2995 

class Group(RegexBase):
    """A capture group: (...) or (?P<name>...).

    Negative group numbers denote private (hidden) groups created during
    parsing; self.info.private_groups maps them to their public numbers.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture means the group node itself disappears.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number and a
        # positive private slot.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: this previously read the undefined global
            # 'private_groups', raising NameError; the mapping lives on
            # self.info (as used in _compile above).
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3077 

class Keep(ZeroWidthBase):
    # Zero-width KEEP node: opcode and dump label for the base class.
    _opcode = OP.KEEP
    _op_name = "KEEP"

3081 

class LazyRepeat(GreedyRepeat):
    # Same structure as a greedy repeat; only the opcode differs.
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"

3085 

class LookAround(RegexBase):
    """A lookaround assertion node.

    ``behind`` selects lookbehind vs lookahead; ``positive`` selects the
    assertion's polarity.  The subpattern is always processed in the
    direction implied by ``behind``, independent of the overall scan
    direction.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern runs backwards iff this is a lookbehind.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on an empty pattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion scanned in its own direction constrains
        # the first character; otherwise anything may start the match.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without the fuzzy flag —
        # confirm that fuzziness is intentionally not propagated here.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0

3153 

class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround assertion.

    Matches ``yes_item`` if the lookaround subpattern succeeds (according
    to ``positive``), otherwise ``no_item``.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: callers (e.g. Sequence.remove_captures) assign the return
        # value of remove_captures(), so returning None (as the original
        # implicitly did) would drop this node from the tree.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # Layout: CONDITIONAL test NEXT yes [NEXT no] END.  The "no" branch
        # is omitted when it compiles to nothing.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): due to operator precedence this evaluates as
        # (subpattern and yes_item) or no_item — confirm that is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None

3238 

class PrecompiledCode(RegexBase):
    """A node that wraps already-compiled code and emits it verbatim."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness cannot alter precompiled code.
        frozen = tuple(self.code)
        return [frozen]

3245 

class Property(RegexBase):
    """A character-property test node.

    ``value`` packs the property identifier in the high 16 bits and the
    property value in the low 16 bits (see dump()).
    """

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        # Key used for equality/hashing by RegexBase.
        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Same property, different polarity/case/width flags.
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding occupies its own bit-field within the flags word.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Unpack the property id (high 16 bits) and value (low 16 bits).
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        # NOTE(review): assumes self.encoding is 0 or 1 — confirm.
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1

3301 

class Prune(ZeroWidthBase):
    # Zero-width PRUNE node; overrides _compile because the opcode takes
    # no flag arguments.
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]

3307 

class Range(RegexBase):
    """An inclusive character-codepoint range test [lower..upper]."""

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Key used for equality/hashing by RegexBase.
        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Build a String alternative for each expanding character that lies
        # inside this range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding,
            # so keep the range itself as the first alternative.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # ascii() gives a quoted repr; lstrip removes any b/u prefix.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1

3384 

class RefGroup(RegexBase):
    """A backreference to a capture group (by number or by name)."""

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve a group name or number string to a valid group number.

        Raises error if the name is unknown or the number out of range.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group has been resolved to a number.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # BUG FIX: the original referenced an undefined name 'pattern' here,
        # raising NameError instead of the intended error.  The pattern text
        # isn't available at this point, so pass None.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # A group can match a string of any length.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3433 

class SearchAnchor(ZeroWidthBase):
    # Zero-width anchor at the position where the search started.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"

3437 

class Sequence(RegexBase):
    """A concatenation of pattern nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # A non-character node ends the current run.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate firstsets from the start of the scan direction until a
        # node that cannot match empty (no None in its firstset) is found.
        fs = set()
        items = self.items
        if reverse:
            # NOTE(review): this reverses self.items IN PLACE, permanently
            # reordering the sequence — confirm this mutation is intended.
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every item could match empty, so the sequence can too.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Items are emitted in scan order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Convert the pending run of characters into Character/String nodes
        # and clear the run.
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Find every occurrence of every expanded folding in the literal.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit IGNORECASE literals between the merged full-fold chunks and
        # FULLIGNORECASE literals for the chunks themselves.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping/adjacent (start, end) intervals.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return (offset, node) for the first required string found in
        # scan order, or (total offset, None) if there is none.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None

3650 

class SetBase(RegexBase):
    """Base class for character-set operation nodes (union, diff, ...).

    Subclasses supply ``_opcode`` (keyed by case flags and direction) and
    ``_op_name``, and implement ``matches`` and ``optimise``.
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # A set member matches one character at a time.
        self.char_width = 1

        # Key used for equality/hashing by RegexBase.
        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # Layout: set opcode, then each member's code, then END.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Build a String alternative for each distinct folding of a member
        # character that expands to multiple codepoints.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # The widest match is the longest folding of any member character.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

3755 

class SetDiff(SetBase):
    # Set difference: matches the first item but not the second.
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # A -- B -- C is equivalent to A -- (B | C).
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A difference with a single operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive

3781 

class SetInter(SetBase):
    """Set intersection: matches only characters matched by every member."""

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        """Optimise members, flattening nested positive intersections."""
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # Hoist the members of a nested intersection.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single-member intersection is just that member.
            only = flattened[0]
            return only.with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        matched = all(member.matches(ch) for member in self.items)
        return matched == self.positive

3811 

class SetSymDiff(SetBase):
    """Set symmetric difference: matches characters in an odd number of
    members."""

    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        """Optimise members, flattening nested positive symmetric diffs."""
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # Hoist the members of a nested symmetric difference.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single-member symmetric difference is just that member.
            only = flattened[0]
            return only.with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the member results: true iff an odd number match.
        parity = False
        for member in self.items:
            if member.matches(ch):
                parity = not parity

        return parity == self.positive

3844 

class SetUnion(SetBase):
    """Set union: matches characters matched by any member."""

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union: hoist the nested members.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?  A property and its negation
        # together match anything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A single-member union is just that member, with polarity
            # combined from the member and the union itself.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch Character members together (by polarity) so they can be
        # emitted as a single CHARACTER/STRING op instead of one op each.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive

3922 

class Skip(ZeroWidthBase):
    # Zero-width SKIP node: opcode and dump label for the base class.
    _op_name = "SKIP"
    _opcode = OP.SKIP

3926 

class StartOfLine(ZeroWidthBase):
    # Zero-width start-of-line anchor.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"

3930 

class StartOfLineU(StartOfLine):
    # Start-of-line anchor variant (the "_U" opcode); behaviour otherwise
    # inherited from StartOfLine.
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"

3934 

class StartOfString(ZeroWidthBase):
    # Zero-width start-of-string anchor.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"

3938 

class StartOfWord(ZeroWidthBase):
    # Zero-width start-of-word anchor.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"

3942 

class String(RegexBase):
    """A literal run of characters, stored as a tuple of codepoints."""

    # Opcode selection keyed by (case flags, reverse direction).
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-fold the characters so the engine compares folded text.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True if this string becomes the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the scan direction.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() gives a quoted repr; lstrip removes any b/u prefix.
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self

3997 

class Literal(String):
    """A String produced by literal-text splitting; differs only in how it
    is dumped."""

    def dump(self, indent, reverse):
        text = "".join(map(chr, self.characters))
        # ascii() gives a quoted repr; lstrip removes any b/u prefix.
        display = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

4004 

class StringSet(Branch):
    """A named list of strings (\\L<name>), compiled as a Branch of
    per-string Sequences.

    Note: Branch.__init__ is deliberately not called; self.branches is
    assigned directly at the end of __init__.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) combination, assigning it the
        # next available index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        # The actual strings are supplied as a keyword argument.
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None

4045 

class Source:
    """Scanner for the regular expression source string.

    A bytes pattern is decoded as Latin-1 so that it can be scanned as text;
    char_type rebuilds a unit of the original type from a code point. When
    ignore_space is True (VERBOSE mode) every read method transparently
    skips whitespace and #-comments, using IndexError (end of string) and
    ValueError (comment running to the end, from str.index) as the
    end-of-input signals.
    """
    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Latin-1 maps bytes 0-255 one-to-one onto code points, so the
            # decode is lossless.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty slice of the *original* argument: "" for str input,
        # b"" for bytes input.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Returns the next significant character and advances past it, or
        returns an empty string at the end of the pattern.

        override_ignore: if True, do not skip whitespace/comments even in
            VERBOSE mode.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Returns up to `count` significant characters as a string,
        stopping early at the end of the pattern.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                # Collect characters one at a time, skipping whitespace and
                # comments before each.
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string; return what was
            # collected so far.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Returns the longest run of characters whose membership in
        `test_set` equals `include`, advancing past them.

        keep_spaces: if True, whitespace/comments are significant even in
            VERBOSE mode.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string; the run extends to
                # the end.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Advances past characters whose membership in `test_set` equals
        `include`, returning nothing.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Returns whether `substring` appears at the current position,
        advancing past it if so; the position is unchanged on failure.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping whitespace and
                # comments before each expected character.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Matches `substring` or raises a 'missing' error."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Returns whether only insignificant characters (if any) remain
        before the end of the pattern. Does not move the position.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True

4267 

class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Collects parser state for one compilation.

        flags: initial regex flags; the default flags of the selected
            version are OR'd in.
        char_type: callable turning a code point into one unit of the
            pattern's string type (chr for str, bytes([c]) for bytes).
        kwargs: mapping of named lists supplied to the compile call.
            (Default changed from a shared mutable `{}` to None to avoid
            the mutable-default-argument pitfall; behaviour is unchanged.)
        """
        if kwargs is None:
            kwargs = {}

        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}      # group name -> group number
        self.group_name = {}       # group number -> group name
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []      # stack of currently-open group numbers
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        """Opens a (possibly named) capture group and returns its number.

        Re-opening a group that is already open (a nested named group) gets
        a negative private alias which is mapped back to the real group
        later.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a named
            # group.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the most recently opened group."
        self.open_groups.pop()

    def is_open_group(self, name):
        """Returns whether the group referred to by `name` (a name or a
        decimal number string) is currently open.
        """
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups

4332 

4333def _check_group_features(info, parsed): 

4334 """Checks whether the reverse and fuzzy features of the group calls match 

4335 the groups which they call. 

4336 """ 

4337 call_refs = {} 

4338 additional_groups = [] 

4339 for call, reverse, fuzzy in info.group_calls: 

4340 # Look up the reference of this group call. 

4341 key = (call.group, reverse, fuzzy) 

4342 ref = call_refs.get(key) 

4343 if ref is None: 

4344 # This group doesn't have a reference yet, so look up its features. 

4345 if call.group == 0: 

4346 # Calling the pattern as a whole. 

4347 rev = bool(info.flags & REVERSE) 

4348 fuz = isinstance(parsed, Fuzzy) 

4349 if (rev, fuz) != (reverse, fuzzy): 

4350 # The pattern as a whole doesn't have the features we want, 

4351 # so we'll need to make a copy of it with the desired 

4352 # features. 

4353 additional_groups.append((CallRef(len(call_refs), parsed), 

4354 reverse, fuzzy)) 

4355 else: 

4356 # Calling a capture group. 

4357 def_info = info.defined_groups[call.group] 

4358 group = def_info[0] 

4359 if def_info[1 : ] != (reverse, fuzzy): 

4360 # The group doesn't have the features we want, so we'll 

4361 # need to make a copy of it with the desired features. 

4362 additional_groups.append((group, reverse, fuzzy)) 

4363 

4364 ref = len(call_refs) 

4365 call_refs[key] = ref 

4366 

4367 call.call_ref = ref 

4368 

4369 info.call_refs = call_refs 

4370 info.additional_groups = additional_groups 

4371 

def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, folded characters, case flags); a pattern with no
    required string yields (0, (), 0). Marks the required node so that the
    compiler emits it with the REQUIRED flag.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    required.required = True
    # An unbounded offset is signalled to the matcher as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    # Drop the Unicode case bit when the pattern itself isn't Unicode.
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags

4392 

class Scanner:
    """A simple lexical scanner: each lexicon entry is a (phrase, action)
    pair, compiled into one compound pattern with one group per phrase.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Parse each phrase separately.
        subpatterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            subpatterns.append(parsed.remove_captures())

        # Wrap each phrase in its own group and branch between them.
        info = Info(flags)
        subpatterns = [Group(info, number + 1, subpattern) for number,
          subpattern in enumerate(subpatterns)]
        parsed = Branch(subpatterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Group calls (recursion) aren't supported by the Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile to a list of opcode tuples, then flatten into ints.
        code = _flatten_code(parsed.compile(reverse) + [(OP.SUCCESS, )])

        if not parsed.has_simple_start():
            # Prefix the code with the first set, if one can be built.
            try:
                code = _flatten_code(_compile_firstset(info,
                  parsed.get_firstset(reverse))) + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(subpatterns))

    def scan(self, string):
        """Tokenises `string`, returning (tokens, unmatched remainder).

        For each match, the corresponding lexicon action is either appended
        directly or, if callable, invoked as action(self, matched_text)
        with self.match set to the match object; a None result is dropped.
        """
        tokens = []
        match = self.scanner.scanner(string).match
        start = 0
        while True:
            m = match()
            if not m:
                break
            end = m.end()
            if start == end:
                # A zero-width match would loop forever; stop here.
                break
            action = self.lexicon[m.lastindex - 1][1]
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                tokens.append(action)
            start = end

        return tokens, string[start : ]

4488 

# Get the known properties dict from the C extension: maps a property name
# to (property id, {value name: value id}).
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict.
#
# PROPERTY_NAMES maps a property id to (canonical name, {value id:
# canonical value name}), where the canonical name is the longest of the
# registered aliases.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)

# Character escape sequences.
CHARACTER_ESCAPES = {
    "a": "\a",
    "b": "\b",
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
    "v": "\v",
}

# Encoding selectors passed to lookup_property for the ASCII/Unicode
# variants of the escapes below.
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

# Predefined character set escape sequences (default encoding).
CHARSET_ESCAPES = {
    "d": lookup_property(None, "Digit", True),
    "D": lookup_property(None, "Digit", False),
    "h": lookup_property(None, "Blank", True),
    "s": lookup_property(None, "Space", True),
    "S": lookup_property(None, "Space", False),
    "w": lookup_property(None, "Word", True),
    "W": lookup_property(None, "Word", False),
}

# ASCII/Unicode variants override every entry except \h (Blank).
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
ASCII_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
})
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)
UNICODE_CHARSET_ESCAPES.update({
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
})

# Positional escape sequences (default encoding).
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}
# ASCII/Unicode variants override only the word-related escapes; \A, \K
# and \Z are encoding-independent.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)
ASCII_POSITION_ESCAPES.update({
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
})
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)
UNICODE_POSITION_ESCAPES.update({
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
})

# Positional escape sequences when WORD flag set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
WORD_POSITION_ESCAPES.update({
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
})

# Regex control verbs, e.g. (*FAIL) and (*SKIP).
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),
    "PRUNE": Prune(),
    "SKIP": Skip(),
}