Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21from regex import _regex
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27 "Scanner", "RegexFlag"]
29# The regex exception.
class error(Exception):
    """Raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or a bytes object; pick a matching newline.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Work out where in the pattern the failure occurred so the
            # formatted message can point at it.
            self.lineno = pattern.count(newline, 0, pos) + 1
            self.colno = pos - pattern.rfind(newline, 0, pos)
            message = f"{message} at position {pos}"
            # Only mention line/column for multi-line patterns.
            if newline in pattern:
                message += f" (line {self.lineno}, column {self.colno})"
        Exception.__init__(self, message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
class _UnscopedFlagSet(Exception):
    "Raised when a positional (unscoped) flag is turned on in the old behaviour."
    pass
64# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    "Raised internally when parsing fails and something else should be tried."
    pass
68# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    "Raised internally when there isn't a valid first set."
    pass
72# Flags.
class RegexFlag(enum.IntFlag):
    """The flags understood by the regex module.

    Each flag has a full name and a single-letter alias (the letter is the
    canonical member; the full name is an alias for it).
    """
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named member renders simply as e.g. 'regex.I'.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        negative = value < 0
        if negative:
            value = ~value

        # Decompose the value into known flags plus any leftover bits.
        parts = []
        for member in self.__class__:
            if value & member._value_:
                value &= ~member._value_
                parts.append('regex.%s' % member._name_)

        if value:
            parts.append(hex(value))

        text = '|'.join(parts)

        if negative:
            text = '~(%s)' % text if len(parts) > 1 else '~%s' % text

        return text

    __str__ = object.__str__
124# Put the flags into the module namespace. Being explicit here helps tools like
125# linters and IDEs understand the code better.
# Full-name aliases for the RegexFlag members.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# Single-letter abbreviations for the same flags.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T
# The behaviour version used when none is specified.
DEFAULT_VERSION = VERSION1
_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE
# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}
# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)
# Character classes used while parsing.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that the end of the pattern is handled like a special
# character by the parser.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")
# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8
# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1
# The regular expression flags, keyed by their inline-flag letters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}
# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE
FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}
# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
211# The names of the opcodes.
212OPCODES = """
213FAILURE
214SUCCESS
215ANY
216ANY_ALL
217ANY_ALL_REV
218ANY_REV
219ANY_U
220ANY_U_REV
221ATOMIC
222BOUNDARY
223BRANCH
224CALL_REF
225CHARACTER
226CHARACTER_IGN
227CHARACTER_IGN_REV
228CHARACTER_REV
229CONDITIONAL
230DEFAULT_BOUNDARY
231DEFAULT_END_OF_WORD
232DEFAULT_START_OF_WORD
233END
234END_OF_LINE
235END_OF_LINE_U
236END_OF_STRING
237END_OF_STRING_LINE
238END_OF_STRING_LINE_U
239END_OF_WORD
240FUZZY
241GRAPHEME_BOUNDARY
242GREEDY_REPEAT
243GROUP
244GROUP_CALL
245GROUP_EXISTS
246KEEP
247LAZY_REPEAT
248LOOKAROUND
249NEXT
250PROPERTY
251PROPERTY_IGN
252PROPERTY_IGN_REV
253PROPERTY_REV
254PRUNE
255RANGE
256RANGE_IGN
257RANGE_IGN_REV
258RANGE_REV
259REF_GROUP
260REF_GROUP_FLD
261REF_GROUP_FLD_REV
262REF_GROUP_IGN
263REF_GROUP_IGN_REV
264REF_GROUP_REV
265SEARCH_ANCHOR
266SET_DIFF
267SET_DIFF_IGN
268SET_DIFF_IGN_REV
269SET_DIFF_REV
270SET_INTER
271SET_INTER_IGN
272SET_INTER_IGN_REV
273SET_INTER_REV
274SET_SYM_DIFF
275SET_SYM_DIFF_IGN
276SET_SYM_DIFF_IGN_REV
277SET_SYM_DIFF_REV
278SET_UNION
279SET_UNION_IGN
280SET_UNION_IGN_REV
281SET_UNION_REV
282SKIP
283START_OF_LINE
284START_OF_LINE_U
285START_OF_STRING
286START_OF_WORD
287STRING
288STRING_FLD
289STRING_FLD_REV
290STRING_IGN
291STRING_IGN_REV
292STRING_REV
293FUZZY_EXT
294"""
296# Define the opcodes in a namespace.
class Namespace:
    "A bare attribute container; OP gets one attribute per opcode name."
    pass
# OP.<NAME> is the integer opcode; values follow the order of OPCODES.
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
304def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
305 """Make room in the given cache.
307 Args:
308 cache_dict: The cache dictionary to modify.
309 args_dict: The dictionary of named list args used by patterns.
310 max_length: Maximum # of entries in cache_dict before it is shrunk.
311 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
312 """
313 # Toss out a fraction of the entries at random to make room for new ones.
314 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
315 # as popitem could penalize the same regular expression repeatedly based
316 # on its internal hash value. Being random should spread the cache miss
317 # love around.
318 cache_keys = tuple(cache_dict.keys())
319 overage = len(cache_keys) - max_length
320 if overage < 0:
321 # Cache is already within limits. Normally this should not happen
322 # but it could due to multithreading.
323 return
325 number_to_toss = max_length // divisor + overage
327 # The import is done here to avoid a circular dependency.
328 import random
329 if not hasattr(random, 'sample'):
330 # Do nothing while resolving the circular dependency:
331 # re->random->warnings->tokenize->string->re
332 return
334 for doomed_key in random.sample(cache_keys, number_to_toss):
335 try:
336 del cache_dict[doomed_key]
337 except KeyError:
338 # Ignore problems if the cache changed from another thread.
339 pass
341 # Rebuild the arguments and locale-sensitivity dictionaries.
342 args_dict.clear()
343 sensitivity_dict = {}
344 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
345 args_dict[pattern, pattern_type, flags, default_version, locale] = args
346 try:
347 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
348 except KeyError:
349 pass
351 locale_sensitive.clear()
352 locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string using the pattern's encoding flags."
    flags = info.flags
    if not flags & _ALL_ENCODINGS:
        # No explicit encoding flag was given; use the guessed encoding.
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    "Checks whether a character is cased, using the flags in `info`."
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
def is_cased_f(flags, char):
    "Checks whether a character is cased, using explicit flags."
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An empty or match-anything firstset gives no useful optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []

    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern; returns an optimised set or None."
    # A None member means some branch could start with anything, so there's
    # no usable firstset.
    if not fs or None in fs:
        return None
    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            return None
#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))
    # Full case-folding with case-ignoring is too complex for a firstset.
    if case_flags == (FULLCASE | IGNORECASE):
        return None
    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)
    return fs
411def _flatten_code(code):
412 "Flattens the code from a list of tuples."
413 flat_code = []
414 for c in code:
415 flat_code.extend(c)
417 return flat_code
def make_case_flags(info):
    "Derives the case flags from the current pattern flags."
    case_flags = info.flags & CASE_FLAGS

    # Full case-folding doesn't apply when ASCII is turned on.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE

    return case_flags
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))

    # A character set is built case-sensitively; the set itself carries any
    # case flags.
    return Character(value)
def make_ref_group(info, name, position):
    "Makes a reference to a group, carrying the current case flags."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
def make_string_set(info, name):
    "Makes a string set, carrying the current case flags."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
def make_property(info, prop, in_set):
    "Makes a property."
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))

    # Inside a set the property keeps its own flags.
    return prop
def _parse_pattern(source, info):
    "Parses a pattern: one or more '|'-separated sequences, eg. 'a|b|c'."
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single branch needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_sequence(source, info):
    """Parses a sequence, eg. 'abc'.

    A None placeholder is appended after each quantified or fuzzy element so
    that a following quantifier can't apply to it again; the placeholders are
    filtered out before the Sequence is built.
    """
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag; the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        if is_actually_fuzzy(constraints):
                            apply_constraint(source, info, constraints,
                              case_flags, saved_pos, sequence)
                            sequence.append(None)
                        else:
                            # The element was just a literal.
                            sequence.append(Character(ord(ch),
                              case_flags=case_flags))
                    else:
                        # A literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A special character with no other meaning here (eg. "#"
                # outside VERBOSE mode) is just a literal.  Without this
                # branch such characters would be silently dropped.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # An ordinary literal character.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
def is_actually_fuzzy(constraints):
    "Checks whether a parsed fuzzy constraint permits any errors at all."
    # No errors allowed overall?
    if constraints.get("e") == (0, 0):
        return False

    # Substitutions, insertions and deletions all individually forbidden?
    zero = (0, 0)
    if (constraints.get("s"), constraints.get("i"), constraints.get("d")) == (zero, zero, zero):
        return False

    return True
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier (min, max counts) to the last element of a sequence."
    element = sequence.pop()
    if element is None:
        # A None placeholder means the previous element was already
        # quantified or fuzzy.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Wraps the last element of the sequence in a Fuzzy node."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(element, Group):
        # A fuzzy group keeps the fuzziness inside the group, so the group
        # still captures whatever the fuzzy part matched.
        element.subpattern = Fuzzy(element.subpattern, constraints)
        sequence.append(element)
    else:
        sequence.append(Fuzzy(element, constraints))
# Shorthand quantifiers mapped to (min, max); None means no upper bound.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier, returning (min, max) counts or None."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of the shorthand quantifiers: "?", "*" or "+".
        return counts

    if ch == "{":
        # Could be a limited repeated element, eg. 'a{2,3}'.
        return parse_limited_quantifier(source)

    return None
def is_above_limit(count):
    "Checks whether a repeat count exceeds what the engine can encode."
    if count is None:
        # No limit at all.
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. '{2,3}'; returns (min, max) or None."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # Not a quantifier after all, eg. '{}'.
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_count)

    if not source.match ("}"):
        # No closing brace, so treat it as a literal.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present; returns the constraints dict or None."
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        # One or more comma-separated fuzzy items.
        while True:
            parse_fuzzy_item(source, constraints)
            if not source.match(","):
                break
    except ParseError:
        # It wasn't a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses one item of a fuzzy setting."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple cost constraint; try it as a cost equation instead.
        source.pos = saved_pos
        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint, eg. 'e<=2' or '1<=i<=3'."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses the numeric limit of a fuzzy cost."
    limit_pos = source.pos
    digits = parse_count(source)

    # get_while only returns digit characters, so int() can fail only when
    # there were no digits at all.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, limit_pos)
def parse_constraint(source, constraints, ch):
    "Parses a constraint name ('d', 'e', 'i' or 's'), rejecting duplicates."
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator: True for '<=', False for '<', None if absent."
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    # One or more '+'-separated terms.
    while True:
        parse_cost_term(source, cost)
        if not source.match("+"):
            break

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit ('<') excludes the stated cost itself.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
def parse_cost_term(source, cost):
    "Parses one term of a cost equation, eg. '2i'."
    coefficient = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient means 1.
    cost[ch] = int(coefficient or 1)
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test (after ':') of a fuzzy setting: a set or literal element."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            # No other special character is a valid fuzzy test.
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # End of the pattern.
        raise error("expected character set", source.string, saved_pos)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of digit characters.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.  Returns None when the construct consumed input but produced
    no element (eg. a comment or inline flags).
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags must not leak out of the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n... or (?-n...: a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags must not leak out of the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension: (?P<name>...), (?P=name) or (?P>name)."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags must not leak out of the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
    "Parses a (?#...) comment; always returns None."
    while True:
        saved_pos = source.pos
        ch = source.get(True)

        if ch == ")" or not ch:
            break

        if ch == "\\":
            # The next character is escaped, so skip it too.
            source.get(True)

    # Reposition on the ")" (or end) so that expect() reports properly.
    source.pos = saved_pos
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround: (?=...), (?!...), (?<=...) or (?<!...)."
    saved_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags set inside the lookaround must not leak out.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, body)
def parse_conditional(source, info):
    "Parses a conditional subpattern: (?(group)yes|no) or a lookaround form."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore any scoped flags changed inside the branches.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # A conditional whose branches are both empty matches nothing either way.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a conditional subpattern whose test is a lookaround."
    saved_flags = info.flags
    try:
        lookaround = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags set inside the lookaround must not leak out.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    no_branch = parse_sequence(source, info) if source.match("|") else Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, lookaround, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic (non-backtracking) subpattern: (?>...)."
    saved_flags = info.flags
    try:
        body = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags set inside the subpattern must not leak out.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(body)
def parse_common(source, info):
    "Parses a common/reset groups branch: (?|...)."
    # Each branch re-uses the same capture group numbers, so reset the count
    # at the start of every branch and keep the highest count reached.
    start_count = info.group_count
    branches = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = start_count
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group: (?R), (?0), (?1), ..."
    # (?R) is a synonym for calling the whole pattern, ie. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group: (?+n...) or (?-n...)."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    if ch == "+":
        group = info.group_count + offset
    else:
        # -1 refers to the most recently numbered group.
        group = info.group_count - offset + 1
    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group: (?&name) or (?P>name)."
    name = parse_name(source)
    source.expect(")")

    return CallGroup(info, name, pos)
def parse_flag_set(source):
    "Parses a set of inline flags, eg. 'imsx'; returns the combined flags."
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are two characters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character: put it back and stop.
        source.pos = saved_pos

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off; returns (flags_on, flags_off)."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags, e.g. "(?i:...)".

    The flag changes apply only until the closing ")", so the previous
    flags are saved and restored.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this clears all inherited encoding bits and keeps only
    # those present in flags_on — confirm this is the intended resolution
    # when the outer scope already had an encoding flag.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the outer scope's flags even if parsing fails.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.

    Raises _UnscopedFlagSet to force a reparse of the whole pattern when
    a global flag is turned on mid-pattern.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": positional flags applying from here on.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    """Applies inline flags of the form (?flags) from this point onward."""
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name and validates it.

    Raises error if the name is empty, is numeric when numbers aren't
    allowed (or below the minimum group number), or isn't an identifier.
    """
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        min_group = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= min_group
    else:
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string,
          source.pos)

    return name
def is_octal(string):
    """Checks whether a string contains only octal digits."""
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
def is_decimal(string):
    """Checks whether a string contains only decimal digits."""
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
def is_hexadecimal(string):
    """Checks whether a string contains only hexadecimal digits."""
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
def parse_escape(source, info, in_set):
    """Parses an escape sequence and returns a parse-tree node.

    `in_set` is True when parsing inside a character set, where some
    escapes (group references, anchors, \\L, \\R, \\X, positional
    escapes) aren't allowed.
    """
    # Read the escaped character with whitespace kept significant, even
    # in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: CR LF as a unit, or any single line-ending
        # character, matched atomically.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The active flags select which positional-escape table applies.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes, again chosen by the encoding flags.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape: an octal escape or a group reference.

    In a set, or starting with "0", it's always octal; otherwise 3 octal
    digits make an octal escape and 1-2 digits name a group.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Narrow encodings are limited to a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference: push back the last character read.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits.

    `digits` holds the digit(s) already read (the first of which may
    not actually be octal, e.g. \\8 inside a set).
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first non-octal character.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # The digits weren't a valid octal number.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses exactly `expected_len` hex digits and builds a character.

    Raises error if a digit is missing or the value is outside the
    Unicode range.
    """
    saved_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        value = None

    if value is not None and value < 0x110000:
        return make_character(info, value, in_set)

    # Bad hex escape (out of the Unicode range).
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
def parse_group_ref(source, info):
    """Parses a group reference of the form <name> or <number>."""
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")

    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
def parse_string_set(source, info):
    """Parses a named-list (string set) reference, e.g. \\L<name>."""
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")

    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
def parse_named_char(source, info, in_set):
    """Parses \\N{...}; falls back to a literal "N" if malformed."""
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                codepoint = unicodedata.lookup(name)
                return make_character(info, ord(codepoint), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat the "N" as a literal.
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property escape, \\p{...} or \\pX.

    `positive` is False for \\P.  Falls back to a literal "p"/"P" if a
    valid property doesn't follow.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # Braced form, possibly negated: \p{^name} or \p{name=value}.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
        # No closing "}": fall through to the literal fallback below.
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    """Parses a property name, which may be qualified.

    Returns (prop_name, name): prop_name is None for an unqualified name
    such as "Lu", and set for "Script=Latin" or "Script:Latin".
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Back up past anything not consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    """Parses a character set "[...]", honouring the regex version."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    source.ignore_space = False
    # A leading "^" negates the set.
    negate = source.match("^")
    try:
        # Version 0 has no set operators, so members form an implicit union.
        parser = parse_set_imp_union if version == VERSION0 else parse_set_union
        item = parser(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
def parse_set_union(source, info):
    """Parses a set union ([x||y]) of symmetric differences."""
    members = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        members.append(parse_set_symm_diff(source, info))

    return members[0] if len(members) == 1 else SetUnion(info, members)
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y]) of intersections."""
    members = [parse_set_inter(source, info)]
    while source.match("~~"):
        members.append(parse_set_inter(source, info))

    return members[0] if len(members) == 1 else SetSymDiff(info, members)
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y]) of differences."""
    members = [parse_set_diff(source, info)]
    while source.match("&&"):
        members.append(parse_set_diff(source, info))

    return members[0] if len(members) == 1 else SetInter(info, members)
def parse_set_diff(source, info):
    """Parses a set difference ([x--y]) of implicit unions."""
    members = [parse_set_imp_union(source, info)]
    while source.match("--"):
        members.append(parse_set_imp_union(source, info))

    return members[0] if len(members) == 1 else SetDiff(info, members)
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]).

    Members are parsed until "]" or, in version 1, a set operator.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set: push the "]" back for the caller to consume.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators; push the operator back.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a range."""
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range such as a-a is just the character.
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a
    nested set (version 1 only), or a literal character."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
def parse_posix_class(source, info):
    """Parses a POSIX class "[:name:]"; raises ParseError if it isn't one."""
    negated = source.match("^")
    qualifier, class_name = parse_property_name(source)
    if not source.match(":]"):
        raise ParseError()

    return lookup_property(qualifier, class_name, not negated, source,
      posix=True)
def float_to_rational(flt):
    """Converts a float to a (numerator, denominator) pair.

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible.  May raise ZeroDivisionError for
    pathological inputs; callers catch it (see standardise_name).
    """
    int_part = int(flt)
    # Renamed from `error`, which shadowed this module's `error`
    # exception class.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
def numeric_to_rational(numeric):
    """Converts a numeric string to "num/den" form, if possible.

    Accepts an optional leading "-" and either "x" or "x/y"; raises
    ValueError for anything else.  A denominator of 1 is omitted.
    """
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = f"{sign}{num}/{den}"
    if result.endswith("/1"):
        result = result[ : -2]

    return result
def standardise_name(name):
    """Standardises a property or value name for table lookup.

    Numeric names become rationals; other names are uppercased with
    underscores, hyphens and spaces removed.
    """
    text = "".join(name)
    try:
        return numeric_to_rational(text)
    except (ValueError, ZeroDivisionError):
        return "".join(ch for ch in text if ch not in "_- ").upper()
# POSIX classes whose names clash with property names; lookup_property
# gives them a "POSIX" prefix.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full set of value names of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property and returns a Property node.

    Resolution order: explicit property=value pair; a GC, script or
    block value; a binary property name; an "IS"-prefixed binary
    property; an "IS"/"IN"-prefixed script or block.  Raises error if
    nothing matches.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the complement of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        # Use the POSIX-specific variant of the class.
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Ids are packed as (property << 16) | value.
        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix;
    # value[2 : ] strips the 2-character prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    """Compiles one escape sequence in a replacement template.

    Returns (is_group, codes): is_group is True when codes holds a group
    index rather than literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group preference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Bytes replacements are limited to a single byte per octal escape.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence, up to 3 digits.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    """Parses exactly `expected_len` hex digits in a replacement string."""
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
def parse_repl_named_char(source):
    """Parses \\N{...} in a replacement; returns None if malformed."""
    saved_pos = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character.
    source.pos = saved_pos
    return None
def compile_repl_group(source, pattern):
    """Compiles a group reference in a replacement template to an index."""
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")

    if not name.isdigit():
        # A named group.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    index = int(name)
    if not 0 <= index <= pattern.groups:
        raise error("invalid group reference", source.string, source.pos)

    return index
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the nodes' dump() methods.
INDENT = " "
# Bit flags packed into compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# Encoding bits are shifted above the flag bits.
ENCODING_OP_SHIFT = 5

# Human-readable labels used by dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    """Wraps a list of nodes in a Sequence, unless it's a single node."""
    return items[0] if len(items) == 1 else Sequence(items)
1940# Common base class for all nodes.
class RegexBase:
    """Common base class for all parse-tree nodes.

    Provides default (mostly no-op) implementations that subclasses
    override.  `_key` is the identity used for hashing and equality.
    """
    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self if nothing changes, otherwise a rebuilt copy with
        # the requested flag values.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Default: nothing to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Default: no usable firstset can be computed.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        return self.max_width(), None
2010# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for zero-width (position-matching) nodes."""
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Matches no characters, so the firstset is "anything".
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        # Zero-width by definition.
        return 0
class Any(RegexBase):
    """The "." atom (opcode OP.ANY)."""
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always exactly one character.
        return 1
class AnyAll(Any):
    """The "." atom compiled with opcode OP.ANY_ALL."""
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Doesn't call Any.__init__; sets the key attributes directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
class AnyU(Any):
    """The "." atom compiled with opcode OP.ANY_U."""
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    """An atomic (non-backtracking) group around a subpattern."""
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic group around an empty subpattern is just the
        # subpattern.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Wrap the compiled subpattern in ATOMIC ... END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    # NOTE(review): defining __eq__ without __hash__ makes instances
    # unhashable in Python 3 (RegexBase.__hash__ is masked) — confirm
    # Atomic nodes are never used as dict keys or set members.
    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class Boundary(ZeroWidthBase):
    """A word-boundary assertion (positive flag selects \\b vs \\B)."""
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2134class Branch(RegexBase):
    def __init__(self, branches):
        RegexBase.__init__(self)
        # The alternatives, in order of preference.
        self.branches = branches
2139 def fix_groups(self, pattern, reverse, fuzzy):
2140 for b in self.branches:
2141 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        """Optimises the branch node.

        Flattens nested branches, hoists a common prefix (or suffix when
        reversed), reduces single-character branches to a set, and adds
        a firstset precheck where possible.
        """
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    # The precheck goes on the side we match first.
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        """Builds a firstset precheck if every branch is a caseless literal.

        Returns None (via a bare return or explicitly) when no precheck
        can be built.
        """
        charset = set()
        # The character checked is the first (or last, when reversed).
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                # A non-literal branch: no precheck is possible.
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2193 def pack_characters(self, info):
2194 self.branches = [b.pack_characters(info) for b in self.branches]
2195 return self
2197 def remove_captures(self):
2198 self.branches = [b.remove_captures() for b in self.branches]
2199 return self
2201 def is_atomic(self):
2202 return all(b.is_atomic() for b in self.branches)
2204 def can_be_affix(self):
2205 return all(b.can_be_affix() for b in self.branches)
2207 def contains_group(self):
2208 return any(b.contains_group() for b in self.branches)
2210 def get_firstset(self, reverse):
2211 fs = set()
2212 for b in self.branches:
2213 fs |= b.get_firstset(reverse)
2215 return fs or set([None])
    def _compile(self, reverse, fuzzy):
        # An empty branch list compiles to nothing.
        if not self.branches:
            return []

        # BRANCH alt1 NEXT alt2 NEXT ... altN END
        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The final NEXT is replaced by END to close the branch.
        code[-1] = (OP.END, )

        return code
2230 def dump(self, indent, reverse):
2231 print("{}BRANCH".format(INDENT * indent))
2232 self.branches[0].dump(indent + 1, reverse)
2233 for b in self.branches[1 : ]:
2234 print("{}OR".format(INDENT * indent))
2235 b.dump(indent + 1, reverse)
2237 @staticmethod
2238 def _flatten_branches(info, reverse, branches):
2239 # Flatten the branches so that there aren't branches of branches.
2240 new_branches = []
2241 for b in branches:
2242 b = b.optimise(info, reverse)
2243 if isinstance(b, Branch):
2244 new_branches.extend(b.branches)
2245 else:
2246 new_branches.append(b)
2248 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        """Hoists the longest common prefix out of the branches.

        Returns (prefix_items, new_branches).
        """
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches without the hoisted prefix.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        """Hoist the longest common trailing items out of the branches.

        Mirror image of _split_common_prefix; returns (suffix_items,
        new_branches).
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? Scanned with negative indices,
        # from -1 backwards.
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
          in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Return whether 'items' may be split after the first 'count' items
        without breaking a run of characters that full-case-fold together.
        """
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Like _can_split, but 'count' is measured from the end of 'items'
        (used when hoisting a common suffix).
        """
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2385 @staticmethod
2386 def _merge_common_prefixes(info, reverse, branches):
2387 # Branches with the same case-sensitive character prefix can be grouped
2388 # together if they are separated only by other branches with a
2389 # character prefix.
2390 prefixed = defaultdict(list)
2391 order = {}
2392 new_branches = []
2393 for b in branches:
2394 if Branch._is_simple_character(b):
2395 # Branch starts with a simple character.
2396 prefixed[b.value].append([b])
2397 order.setdefault(b.value, len(order))
2398 elif (isinstance(b, Sequence) and b.items and
2399 Branch._is_simple_character(b.items[0])):
2400 # Branch starts with a simple character.
2401 prefixed[b.items[0].value].append(b.items)
2402 order.setdefault(b.items[0].value, len(order))
2403 else:
2404 Branch._flush_char_prefix(info, reverse, prefixed, order,
2405 new_branches)
2407 new_branches.append(b)
2409 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2411 return new_branches
2413 @staticmethod
2414 def _is_simple_character(c):
2415 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Collapse runs of single-char branches with the same case flags
        into set nodes; returns the new list of branches.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Collect the item with its case flags stripped; the flags
                # are re-applied when the set is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the collected same-prefix branches into 'new_branches'.

        'prefixed' maps a leading character value to the item lists of the
        branches starting with it; 'order' records first-seen order. Both are
        cleared afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is needed.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
    @staticmethod
    def _flush_set_members(info, reverse, items, case_flags, new_branches):
        """Combine the collected single-char items into one node (a lone item
        is kept as-is), re-apply 'case_flags', append it to 'new_branches'
        and clear 'items'.
        """
        # Flush the set members.
        if not items:
            return

        if len(items) == 1:
            item = list(items)[0]
        else:
            item = SetUnion(info, list(items)).optimise(info, reverse)

        new_branches.append(item.with_flags(case_flags=case_flags))

        items.clear()
2486 @staticmethod
2487 def _is_full_case(items, i):
2488 if not 0 <= i < len(items):
2489 return False
2491 item = items[i]
2492 return (isinstance(item, Character) and item.positive and
2493 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
    @staticmethod
    def _is_folded(items):
        """Return whether the case-folded concatenation of 'items' equals the
        full case-folding of some single character (i.e. the items form one
        folded unit that must not be split).
        """
        if len(items) < 2:
            return False

        # Only positive, case-insensitive Character nodes can participate.
        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False
2517 def is_empty(self):
2518 return all(b.is_empty() for b in self.branches)
2520 def __eq__(self, other):
2521 return type(self) is type(other) and self.branches == other.branches
2523 def max_width(self):
2524 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A subpattern call node: re-applies a group's pattern by reference."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # Group name or number; resolved to an int index in fix_groups.
        self.group = group
        # Position within the pattern string, used for error reporting.
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference to a group number and register this
        call with the pattern info for later compilation.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            # Not numeric: look it up as a named group.
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group opened more than once can't be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is never assigned in this class;
        # confirm RegexBase provides it, otherwise this raises AttributeError
        # instead of the intended error.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # 'call_ref' is assigned externally (via info.group_calls) before
        # this node is compiled.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # The called group could match anything, so assume unlimited width.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """Associates a call reference number with a parsed subpattern."""

    def __init__(self, ref, parsed):
        # Note: RegexBase.__init__ is deliberately not called, matching the
        # original implementation.
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <ref> ... END brackets the compiled subpattern.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
class Character(RegexBase):
    """A node that matches (or, if negative, rejects) a single codepoint."""

    # Opcode selected by (case flags, reversed?); full-case variants compile
    # to the same opcodes as their simple counterparts.
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Under full case-folding one character may fold to several
        # codepoints; keep the folded string for width and compile checks.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # A single character can't be simplified further.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try
            # matching the multi-codepoint folded form.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Does codepoint 'ch' satisfy this node (respecting polarity)?"""
        return (ch == self.value) == self.positive

    def max_width(self):
        # More than 1 if the character expands on full case-folding.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        # Cached for use by the compiler when building the required string.
        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern: matches 'yes_item' if the referenced group
    has matched, otherwise 'no_item'.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        # Group name or number; resolved to an int index in fix_groups.
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        # Position within the pattern string, used for error reporting.
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and fix up both alternatives."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: previously returned None, but callers rebind the node to
        # this method's result, as every other node class here does.
        return self

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative could start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS <group> <yes-code> [NEXT <no-code>] END
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled with OP.DEFAULT_BOUNDARY."""

    _op_name = "DEFAULT_BOUNDARY"
    _opcode = OP.DEFAULT_BOUNDARY
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled with OP.DEFAULT_END_OF_WORD."""

    _op_name = "DEFAULT_END_OF_WORD"
    _opcode = OP.DEFAULT_END_OF_WORD
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled with OP.DEFAULT_START_OF_WORD."""

    _op_name = "DEFAULT_START_OF_WORD"
    _opcode = OP.DEFAULT_START_OF_WORD
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_LINE."""

    _op_name = "END_OF_LINE"
    _opcode = OP.END_OF_LINE
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled with the Unicode opcode OP.END_OF_LINE_U."""

    _op_name = "END_OF_LINE_U"
    _opcode = OP.END_OF_LINE_U
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_STRING."""

    _op_name = "END_OF_STRING"
    _opcode = OP.END_OF_STRING
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_STRING_LINE."""

    _op_name = "END_OF_STRING_LINE"
    _opcode = OP.END_OF_STRING_LINE
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled with OP.END_OF_STRING_LINE_U."""

    _op_name = "END_OF_STRING_LINE_U"
    _opcode = OP.END_OF_STRING_LINE_U
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled with OP.END_OF_WORD."""

    _op_name = "END_OF_WORD"
    _opcode = OP.END_OF_WORD
class Failure(ZeroWidthBase):
    """Zero-width node that always emits the FAILURE opcode."""

    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness don't change the emitted instruction.
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """Wraps a subpattern for fuzzy (approximate) matching.

    'constraints' maps error types ("d"eletions, "i"nsertions,
    "s"ubstitutions, "e"rrors overall) to (min, max) pairs, plus an optional
    "cost" equation and an optional "test" subpattern. Missing entries are
    filled with defaults in __init__.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in the pattern's textual form, e.g.
        'i<=2,2i+1s<=5'.
        """
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches one or more characters up to a grapheme-cluster boundary."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        # A grapheme cluster has no fixed upper length.
        return UNLIMITED
class GraphemeBoundary:
    # Zero-width grapheme-boundary check. Not a RegexBase subclass: it only
    # provides 'compile' for use inside Grapheme's generated matcher.
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern; min_count..max_count repetitions.

    max_count is None for an unbounded repeat. Subclasses override _opcode
    and _op_name (and possibly _compile) for lazy/possessive variants.
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        # None means unbounded.
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) preserves the greedy/lazy/possessive subclass.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # Only a fixed-count repeat of an atomic subpattern is atomic.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # An optional repeat contributes no required string, only an
            # offset bound.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A repeat that never gives back: the repeat is wrapped in ATOMIC."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # One END closes the repeat, the other closes the ATOMIC wrapper.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        # Group number; negative numbers denote private (hidden) groups.
        self.group = group
        self.subpattern = subpattern

        # Assigned externally when this group is the target of a call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # The group itself is dropped; only its contents remain.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number and a
        # slot above the public group range.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        # If this group is called via (?n)/(?&name), bracket it with CALL_REF.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width node compiled with OP.KEEP."""

    _op_name = "KEEP"
    _opcode = OP.KEEP
class LazyRepeat(GreedyRepeat):
    """A repeat that reuses GreedyRepeat's machinery with the lazy opcode."""

    _op_name = "LAZY_REPEAT"
    _opcode = OP.LAZY_REPEAT
class LookAround(RegexBase):
    """A lookahead/lookbehind assertion around a subpattern."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern is matched in the lookaround's own direction, not
        # the enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        # A positive lookaround for nothing always succeeds.
        if self.positive and subpattern.is_empty():
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion scanning in the same direction as the
        # match constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without the 'fuzzy'
        # argument (compile's default applies); confirm this is intentional.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround: picks 'yes_item' if the
    lookaround subpattern matches, otherwise 'no_item'.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # All three parts are optimised for the lookaround's direction.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()
        # BUG FIX: previously returned None, but callers rebind the node to
        # this method's result, as every other node class here does.
        return self

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL <test-code> NEXT <yes-code> [NEXT <no-code>] END
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # Parentheses make the existing 'and'/'or' precedence explicit.
        # NOTE(review): '(sub and yes) or no' is what the original computed;
        # 'sub and (yes or no)' may have been intended — confirm upstream.
        return ((self.subpattern.is_empty() and self.yes_item.is_empty()) or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """Holds a pre-built instruction; emitted verbatim when compiled."""

    # NOTE(review): RegexBase.__init__ is not called here, so this node has
    # no _key; confirm it is never hashed/compared like the other nodes.
    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        # Direction and fuzziness were decided when the code was built.
        return [tuple(self.code)]
class Property(RegexBase):
    """A node matching a single character by Unicode property value."""

    # Opcode selected by (case flags, reversed?); full-case variants compile
    # to the same opcodes as their simple counterparts.
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # Packed property id: high bits select the property, low 16 bits the
        # value (see dump below).
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        # Shifted into the opcode flags; dump treats it as 0 or 1 (ASCII).
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        # A property test can't be simplified further.
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Unpack property name and value name from the packed id.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Does codepoint 'ch' satisfy this node (respecting polarity)?"""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    "The (*PRUNE) backtracking-control verb."
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands; direction and fuzziness are irrelevant.
        return [(OP.PRUNE, )]
class Range(RegexBase):
    "An inclusive range of codepoints, e.g. a-z inside a set."
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        # lower and upper are inclusive codepoint bounds.
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Make a copy with different polarity/case/zero-width settings.
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        """Expand the range for full case-folding where necessary.

        Characters in the range that fold to multiple codepoints are pulled
        out as String alternatives (so they can match their folded forms);
        the remainder of the range keeps simple case-folding.
        """
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one codepoint.
        return 1
class RefGroup(RegexBase):
    r"A backreference to a capture group, e.g. \1 or (?P=name)."
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        # group may still be a name or a digit string here; fix_groups
        # resolves it to a group number.
        self.group = group
        # Where the reference appeared in the pattern, for error reporting.
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the reference to a group number.

        Raises error for an unknown group name or an out-of-range number.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # Rebuild the key now that the group is a number.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # NOTE(review): self.pattern is never assigned on this class, so
        # reaching this line would likely raise AttributeError before the
        # intended error -- confirm (possibly should be a parameter or
        # self.info-derived).
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    r"The search anchor \G (matches where the search started)."
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
class Sequence(RegexBase):
    "A sequence of regex items matched one after another."

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        # Delegate group fixing to every member.
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        # Pending codepoints not yet flushed, all sharing case_flags.
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a packable item: flush what we have, then recurse.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def get_firstset(self, reverse):
        # Union the members' first-sets until one member can't be empty
        # (i.e. its first-set doesn't contain None).
        # NOTE(review): items aliases self.items, so items.reverse() below
        # reverses self.items in place -- confirm this side effect is
        # intended.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every member could match emptily.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Compile members in match order (reversed when matching backwards).
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Turn the pending codepoints into Character/String items.

        Clears `characters` in place afterwards.
        """
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Full case-folding may need the literal split into chunks.
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Find every occurrence of each expanded folding in the folded
        # literal; these spans must keep full case-folding.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit alternating IGNORECASE / FULLIGNORECASE literals.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping or touching (start, end) spans after sorting.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the first member (in match order) that supplies a required
        # string, with the accumulated offset to reach it.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    "Base class for the character-set operation nodes (union, diff, ...)."

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Make a copy with different flags and re-optimise it.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The set opcode is followed by its members and terminated by END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        """Wrap the set in a Branch when full case-folding applies.

        Members of the set that fold to multiple codepoints become String
        alternatives so they can match their folded forms.
        """
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Usually 1 codepoint, but full case-folding can make a set member
        # match a longer folded form.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    "A set difference: the first item minus the remaining items."
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        if len(items) > 2:
            # A -- B -- C is equivalent to A -- (B | C).
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A difference with one operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # In the first set but not in the second.
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
class SetInter(SetBase):
    "A set intersection of its items."
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection: flatten it.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # An intersection with one operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive
class SetSymDiff(SetBase):
    "A symmetric difference of its items (membership in an odd number)."
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference: flatten it.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A symmetric difference with one operand is just that operand.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the memberships: True iff an odd number of items match.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
class SetUnion(SetBase):
    "A set union of its items."
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union: flatten it.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        # A property and its negation together cover every codepoint.
        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A union with one operand is that operand, with the polarity
            # combined (negative union of a negative item is positive).
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch the member characters into CHARACTER/STRING operands, which
        # the matcher can test more cheaply than individual items.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    "The (*SKIP) backtracking-control verb."
    _op_name = "SKIP"
    _opcode = OP.SKIP
class StartOfLine(ZeroWidthBase):
    "The start-of-line anchor ^ (MULTILINE behaviour)."
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
class StartOfLineU(StartOfLine):
    "Start-of-line anchor recognising Unicode line separators (WORD flag)."
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
class StartOfString(ZeroWidthBase):
    r"The start-of-string anchor \A."
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
class StartOfWord(ZeroWidthBase):
    r"The start-of-word boundary \m."
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
class String(RegexBase):
    "A string of codepoints to be matched literally."
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # NOTE: RegexBase.__init__ is not called here; _key is set directly
        # below.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form used for matching.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True if this becomes the pattern's "required string".
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character in match order (last of the string when
        # matching backwards).
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self
class Literal(String):
    "A String produced by case-folding splitting; dumps as LITERAL MATCH."

    def dump(self, indent, reverse):
        # Render the codepoints as text, then as a printable repr without
        # any leading "b"/"u" prefix characters.
        text = "".join(map(chr, self.characters))
        display = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    "A named list (\\L<name>), matched as a branch of its member strings."

    def __init__(self, info, name, case_flags=NOCASE):
        # NOTE: Branch.__init__ is not called; self.branches is built
        # directly below.
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            # Allocate the next index for this named list.
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        # NOTE(review): index and fold_flags are computed but not used in
        # this method -- presumably leftovers from an earlier version;
        # confirm before removing.
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    """Scanner for the regular expression source string.

    Bytes patterns are decoded via latin-1 so all scanning works on str;
    when ignore_space is True (VERBOSE mode), whitespace and #-comments are
    skipped. End-of-string is signalled by IndexError from indexing and
    ValueError from index() when a comment runs to the end; the methods
    below rely on catching exactly those.
    """

    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # latin-1 maps each byte to the codepoint of the same ordinal,
            # so the round-trip is lossless.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the same type as the original pattern.
        self.sep = string[ : 0]

    def peek(self, override_ignore=False):
        "Returns the next significant character without consuming it."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        "Returns the next significant character, consuming it."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        "Returns up to `count` significant characters, consuming them."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Returns the run of characters (not) in test_set, consuming them.

        include=True collects characters in test_set; include=False collects
        those not in it. keep_spaces=True disables VERBOSE-mode skipping.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        "Consumes the run of characters (not) in test_set, discarding them."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Consumes substring if it comes next, returning whether it did.

        The position is advanced only on a successful match.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping ignorables
                # before each expected character.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        "Consumes substring, raising error if it isn't next."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Returns whether only ignorable characters (if any) remain."
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs=None):
        """Initialise compile-time state for one pattern.

        flags: the requested flag bits; the defaults for the selected
            behaviour version are OR-ed in.
        char_type: chr for str patterns, a bytes factory for bytes patterns.
        kwargs: named lists passed to compile().  Previously this defaulted
            to a shared mutable dict ({}), which all Info instances without
            an explicit argument would alias; a None sentinel with a fresh
            dict per instance is backward-compatible and avoids accidental
            cross-instance sharing.
        """
        if kwargs is None:
            kwargs = {}

        # Fold in the default flags for the selected behaviour version.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0        # number of capture groups allocated
        self.group_index = {}       # group name -> group number
        self.group_name = {}        # group number -> group name
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> list index
        self.open_groups = []       # stack of currently open groups
        self.open_group_count = {}  # group number -> times opened
        self.defined_groups = {}    # group number -> (parsed, reverse, fuzzy)
        self.group_calls = []
        self.private_groups = {}    # negative alias -> real group number

    def open_group(self, name=None):
        """Allocate (or re-use) a group number for a newly opened group.

        A named group re-opened while still open gets a temporary negative
        alias until a proper positive number can be assigned.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a name.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        "Closes the innermost open group."
        self.open_groups.pop()

    def is_open_group(self, name):
        """Returns whether the named/numbered group is currently open.

        In version 1, a group reference can refer to an open group, so we
        just pretend the group isn't open.
        """
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Each distinct (group, reverse, fuzzy) combination gets a call ref; when
    the called group (or the whole pattern, for group 0) doesn't already
    have the requested features, a copy with those features is queued in
    info.additional_groups.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, folded characters, case flags); the offset is -1 when
    it is unbounded, and the characters are empty when there is no required
    string.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        # No required string could be determined.
        return 0, (), 0

    # Mark the node so its compiled form carries the REQUIRED flag.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
class Scanner:
    """A lexical scanner built from a lexicon of (pattern, action) pairs.

    Each phrase in the lexicon is parsed (capture groups are forbidden within
    a phrase) and the phrases are combined into one big alternation, each
    wrapped in its own group so that ``lastindex`` identifies which phrase
    matched. ``scan`` then repeatedly matches that alternation against a
    string and invokes the matching phrase's action.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern, one group per phrase.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans the string, returning (list of action results, remainder).

        Scanning stops at the first position where no phrase matches, or
        where a phrase matches the empty string (which would otherwise loop
        forever); the unscanned remainder of the string is returned alongside
        the collected results.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # Zero-width match; stop to avoid an infinite loop.
                break
            action = self.lexicon[m.lastindex - 1][1]
            # Use callable() rather than hasattr(action, '__call__'): calling
            # an object goes through its type, so an instance-level
            # '__call__' attribute would pass the hasattr() test and then
            # fail when actually invoked below.
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: each property id maps to its
# longest known name together with a dict from value id to longest value
# name. On a tie in length the first name seen is kept, matching
# max(..., key=len) semantics.
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    longest_name, value_names = PROPERTY_NAMES.setdefault(prop_id, ("", {}))
    if len(prop_name) > len(longest_name):
        PROPERTY_NAMES[prop_id] = prop_name, value_names

    for val_name, val_id in values.items():
        if len(val_name) > len(value_names.get(val_id, "")):
            value_names[val_id] = val_name
# Character escape sequences: escape letter -> control character.
CHARACTER_ESCAPES = dict(
    a="\a",  # bell
    b="\b",  # backspace
    f="\f",  # form feed
    n="\n",  # newline
    r="\r",  # carriage return
    t="\t",  # horizontal tab
    v="\v",  # vertical tab
)
# Encoding selectors for the encoding-specific escape tables below.
ASCII_ENCODING, UNICODE_ENCODING = 1, 2
# Predefined character set escape sequences (\d, \D, \h, \s, \S, \w, \W),
# built from the corresponding properties.
CHARSET_ESCAPES = {
    letter: lookup_property(None, prop, positive)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("h", "Blank", True),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
}
# Encoding-specific variants of the charset escapes. "h" (Blank) has no
# encoding-specific form here, so it is inherited unchanged from
# CHARSET_ESCAPES; the other entries are overridden.
ASCII_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    **{letter: lookup_property(None, prop, positive,
                               encoding=ASCII_ENCODING)
       for letter, prop, positive in (
           ("d", "Digit", True), ("D", "Digit", False),
           ("s", "Space", True), ("S", "Space", False),
           ("w", "Word", True), ("W", "Word", False),
       )},
}
UNICODE_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    **{letter: lookup_property(None, prop, positive,
                               encoding=UNICODE_ENCODING)
       for letter, prop, positive in (
           ("d", "Digit", True), ("D", "Digit", False),
           ("s", "Space", True), ("S", "Space", False),
           ("w", "Word", True), ("W", "Word", False),
       )},
}
# Positional escape sequences: escape letter -> zero-width assertion node.
POSITION_ESCAPES = {
    "A": StartOfString(),   # \A
    "b": Boundary(),        # \b: word boundary
    "B": Boundary(False),   # \B: not a word boundary
    "K": Keep(),            # \K
    "m": StartOfWord(),     # \m
    "M": EndOfWord(),       # \M
    "Z": EndOfString(),     # \Z
}
# Encoding-specific variants of the positional escapes. Only the
# word-related assertions (\b, \B, \m, \M) differ; the rest are inherited
# unchanged from POSITION_ESCAPES.
ASCII_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
}
UNICODE_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
}
# Positional escape sequences when the WORD flag is set: the word-related
# assertions use the default word-boundary rules instead.
WORD_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
}
# Regex control verbs, written in patterns as (*FAIL), (*PRUNE), (*SKIP).
# Note that "FAIL" and "F" deliberately get separate Failure() instances.
VERBS = {
    "FAIL": Failure(),
    "F": Failure(),  # (*F) is shorthand for (*FAIL).
    "PRUNE": Prune(),
    "SKIP": Skip(),
}