Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 44%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21from regex import _regex
# The public names re-exported by the top-level regex module: every flag (both
# long and single-letter forms), the error class, Scanner and RegexFlag.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
# The regex exception.
class error(Exception):
    """Raised when a regular expression pattern cannot be compiled.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # Use a bytes newline when the pattern is a bytes object.
        nl = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Line/column are derived from the last newline before `pos`.
            self.lineno = 1 + pattern.count(nl, 0, pos)
            self.colno = pos - pattern.rfind(nl, 0, pos)

            message = f"{message} at position {pos}"

            # Only mention line/column for multi-line patterns.
            if nl in pattern:
                message += f" (line {self.lineno}, column {self.colno})"

        Exception.__init__(self, message)
# The exception for when a positional flag has been turned on in the old
# behaviour. It's raised with the new global flags as its argument (see
# parse_flags_subpattern) so the caller can reparse the whole pattern.
class _UnscopedFlagSet(Exception):
    pass
# The exception for when parsing fails and we want to try something else.
# Internal only: it's always caught and never escapes to the caller.
class ParseError(Exception):
    pass
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    pass
# Flags.
class RegexFlag(enum.IntFlag):
    """The option flags, combinable with ``|``.

    Each long name has a single-letter (or two-character, for the version
    flags) alias, eg. ``I`` for ``IGNORECASE``.
    """

    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named member renders simply as "regex.NAME".
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        # For an inverted value, decompose its complement instead.
        if negative:
            value = ~value

        # Pick off each known flag present; anything left over is shown in
        # hex.
        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            members.append(hex(value))

        res = '|'.join(members)

        # Re-apply the inversion, parenthesising multi-member results.
        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # str() falls back to the default object behaviour (which uses __repr__).
    __str__ = object.__str__
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# The single-letter aliases.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The behaviour used when neither V0 nor V1 is given.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser (ASCII only).
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern (source.get() returning "") takes the
# special-character path in parse_sequence.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")
# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own has no effect: it only modifies IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
# The names of the opcodes. The position of a name in this list is its
# numeric opcode value (see the enumerate loop below), so the order must
# match the engine's and must not be changed.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
# Define the opcodes in a namespace.
class Namespace:
    """An empty attribute container; opcode names are set on an instance
    below."""
    pass
OP = Namespace()
for i, op in enumerate(OPCODES.split()):
    # Each opcode's numeric value is its position in the OPCODES list.
    setattr(OP, op, i)
304def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
305 """Make room in the given cache.
307 Args:
308 cache_dict: The cache dictionary to modify.
309 args_dict: The dictionary of named list args used by patterns.
310 max_length: Maximum # of entries in cache_dict before it is shrunk.
311 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
312 """
313 # Toss out a fraction of the entries at random to make room for new ones.
314 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
315 # as popitem could penalize the same regular expression repeatedly based
316 # on its internal hash value. Being random should spread the cache miss
317 # love around.
318 cache_keys = tuple(cache_dict.keys())
319 overage = len(cache_keys) - max_length
320 if overage < 0:
321 # Cache is already within limits. Normally this should not happen
322 # but it could due to multithreading.
323 return
325 number_to_toss = max_length // divisor + overage
327 # The import is done here to avoid a circular dependency.
328 import random
329 if not hasattr(random, 'sample'):
330 # Do nothing while resolving the circular dependency:
331 # re->random->warnings->tokenize->string->re
332 return
334 for doomed_key in random.sample(cache_keys, number_to_toss):
335 try:
336 del cache_dict[doomed_key]
337 except KeyError:
338 # Ignore problems if the cache changed from another thread.
339 pass
341 # Rebuild the arguments and locale-sensitivity dictionaries.
342 args_dict.clear()
343 sensitivity_dict = {}
344 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
345 args_dict[pattern, pattern_type, flags, default_version, locale] = args
346 try:
347 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
348 except KeyError:
349 pass
351 locale_sensitive.clear()
352 locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # With no explicit encoding flag, fall back to the guessed encoding.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    "Checks whether a character is cased (has more than one case variant)."
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1
def is_cased_f(flags, char):
    "Checks whether a character is cased (has more than one case variant)."
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An absent firstset, or one that matches any character, is useless.
    if not checked or isinstance(checked, AnyAll):
        return []
    # Compile the firstset.
    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for item in fs:
        # A negated character can't contribute to a firstset.
        if isinstance(item, Character) and not item.positive:
            return None
        case_flags |= item.case_flags
        members.add(item.with_flags(case_flags=NOCASE))

    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    union = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    return union.optimise(info, reverse, in_set=True)
411def _flatten_code(code):
412 "Flattens the code from a list of tuples."
413 flat_code = []
414 for c in code:
415 flat_code.extend(c)
417 return flat_code
def make_case_flags(info):
    "Makes the case flags."
    # FULLCASE is a Unicode notion, so it's dropped when ASCII is on.
    if info.flags & ASCII:
        return info.flags & CASE_FLAGS & ~FULLCASE
    return info.flags & CASE_FLAGS
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        return Character(value, case_flags=make_case_flags(info))
    # A character set is built case-sensitively.
    return Character(value)
def make_ref_group(info, name, position):
    "Makes a case-aware reference to a group."
    flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=flags)
def make_string_set(info, name):
    "Makes a case-aware string set."
    flags = make_case_flags(info)
    return StringSet(info, name, case_flags=flags)
def make_property(info, prop, in_set):
    "Makes a property."
    # Inside a set the property is used as-is; outside it gets case flags.
    if not in_set:
        return prop.with_flags(case_flags=make_case_flags(info))
    return prop
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single branch needs no Branch wrapper.
    if len(branches) > 1:
        return Branch(branches)
    return branches[0]
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None is a sentinel meaning "nothing quantifiable here": it starts the
    # sequence and is appended after each quantifier/constraint, so
    # apply_quantifier can detect "nothing to repeat"/"multiple repeat".
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        # Note: "" (end of pattern) is in SPECIAL_CHARS and is also a
        # substring of ")|", so end-of-pattern breaks out of the loop below.
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was inline flags; they may change the case flags
                    # for the rest of the sequence.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None sentinels before building the sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier to the last element of the sequence."
    target = sequence.pop()
    if target is None:
        # The sentinel: the quantifier follows nothing, or another
        # quantifier.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(target, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    suffix = source.get()
    if suffix == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeat_class = LazyRepeat
    elif suffix == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeat_class = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat; put the character back.
        source.pos = saved_pos
        repeat_class = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number
    # of repeats is fixed at 1.
    if not target.is_empty() and (min_count, max_count) != (1, 1):
        target = repeat_class(target, min_count, max_count)

    sequence.append(target)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the last element of the sequence."
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # If a group is marked as fuzzy then put all of the fuzzy part in the
    # group.
    if isinstance(target, Group):
        target.subpattern = Fuzzy(target.subpattern, constraints)
        sequence.append(target)
    else:
        sequence.append(Fuzzy(target, constraints))
# The simple quantifiers and their (min, max) repeat counts; None means
# unbounded.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier, returning its (min, max) counts or None."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of "?", "*" or "+".
        return counts

    if ch == "{":
        # Possibly a bounded repeat, eg. 'a{2,3}'.
        bounded = parse_limited_quantifier(source)
        if bounded:
            return bounded

    return None
def is_above_limit(count):
    "Checks whether a count is above the maximum the engine supports."
    if count is None:
        # None means unbounded, which is represented, not exceeded.
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier such as '{2,3}' (the '{' is already read)."
    saved_pos = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    elif min_count:
        # A single count: exactly that many repeats.
        min_count = max_count = int(min_count)
    else:
        # Neither a count nor a comma: not a quantifier after all.
        source.pos = saved_pos
        return None

    if not source.match("}"):
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        while True:
            parse_fuzzy_item(source, constraints)
            if not source.match(","):
                break
    except ParseError:
        # Not a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a simple cost constraint; try a cost equation instead.
        source.pos = saved_pos
        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    """Parses a cost constraint, storing it into `constraints` as a
    (min_cost, max_cost) pair keyed by the constraint letter. Raises
    ParseError if this isn't a cost constraint at all."""
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits? Exclusive bounds are normalised to
        # inclusive ones.
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)

    # int() fails on an empty digit string.
    value = None
    try:
        value = int(digits)
    except ValueError:
        pass

    if value is None:
        raise error("bad fuzzy cost limit", source.string, cost_pos)

    return value
def parse_constraint(source, constraints, ch):
    "Parses a constraint letter."
    # Valid constraints are "d", "e", "i" or "s", each usable only once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()
    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True -> inclusive ("<="), False -> exclusive ("<"), None -> absent.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}
    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # An exclusive limit is normalised to an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coeff = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch not in SPECIAL_CHARS:
        if ch:
            # A literal.
            return Character(ord(ch), case_flags=case_flags)
        raise error("expected character set", source.string, saved_pos)
    if ch == "\\":
        # An escape sequence outside a set.
        return parse_escape(source, info, False)
    if ch == ".":
        # Any character.
        if info.flags & DOTALL:
            return AnyAll()
        if info.flags & WORD:
            return AnyU()
        return Any()
    if ch == "[":
        # A character set.
        return parse_set(source, info)
    raise error("expected character set", source.string, saved_pos)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of ASCII digits at the current
    # position, as a string.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    # NOTE(review): the True argument to get() presumably overrides
    # verbose-mode whitespace skipping — confirm against Source.get.
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Scoped flags end with the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...: possibly a verb such as (*PRUNE).
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Scoped flags end with the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension, ie. the part after '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Scoped flags end with the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        # A group can't refer to itself while it's still being parsed.
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
    "Parses a comment, ie. '(?#...)'. Comments produce no pattern element."
    while True:
        saved_pos = source.pos
        ch = source.get(True)

        # Stop at the closing ")" or at the end of the pattern.
        if not ch or ch == ")":
            break

        if ch == "\\":
            # Skip the escaped character as well.
            source.get(True)

    # Put back the terminator so expect() can verify it's ")".
    source.pos = saved_pos
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any flags scoped inside the lookaround end here.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. the part after '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # Not a lookaround: the condition is a group name or number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore the flags even if parsing fails part-way through.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty: the conditional is a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a lookaround conditional, eg. '(?(?=test)yes|no)'. The
    lookaround subpattern has already been opened by the caller."""
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # The lookaround's scoped flags end with it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags scoped inside the atomic group don't escape it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
def parse_common(source, info):
    "Parses a common groups branch, ie. '(?|...)'."
    # Capture group numbers in different branches can reuse the group
    # numbers, so each branch restarts from the same initial count.
    initial_count = info.group_count
    branches = [parse_sequence(source, info)]
    highest_count = info.group_count
    while source.match("|"):
        info.group_count = initial_count
        branches.append(parse_sequence(source, info))
        highest_count = max(highest_count, info.group_count)

    info.group_count = highest_count
    source.expect(")")

    if len(branches) > 1:
        return Branch(branches)
    return branches[0]
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    if ch == "R":
        # (?R) calls the whole pattern, ie. group 0.
        group = "0"
    else:
        group = ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    name = parse_name(source)
    source.expect(")")
    return CallGroup(info, name, pos)
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    # The loop stops when a character isn't a flag key: the REGEX_FLAGS
    # lookup raises KeyError and the character is put back.
    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # The version flags are two characters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        source.pos = saved_pos

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    flags_off = 0
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags, eg. '(?i:...)'."
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): this tests info.flags rather than flags_on, and re-ORs
    # the whole of flags_on rather than only its encoding bits, so an
    # encoding flag inherited from the enclosing scope is cleared even when
    # the scoped flags set no new encoding (the engine then falls back to
    # the guessed encoding — see _fold_case). Verify against upstream.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # The scoped flags end with the subpattern.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's an inline flags, then None is returned.

    Raises _UnscopedFlagSet if a new global flag is turned on, so the caller
    can reparse the whole pattern with it applied.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A scoped-flags subpattern, eg. '(?i:...)'.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Positional flags, eg. '(?i)'.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    """Applies positional (non-scoped) inline flags to the pattern."""
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    # Verbose mode affects how the rest of the pattern is read.
    source.ignore_space = bool(new_flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name, which may be numeric if allowed."""
    # Collect everything up to (but not including) the closing ")" or ">".
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric name is valid only when explicitly permitted; group 0
        # (the whole match) only when additionally permitted.
        lowest = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= lowest
    else:
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string,
          source.pos)

    return name
def is_octal(string):
    """Checks whether a string consists entirely of octal digits."""
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
def is_decimal(string):
    """Checks whether a string consists entirely of decimal digits."""
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
def is_hexadecimal(string):
    """Checks whether a string consists entirely of hexadecimal digits."""
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
def parse_escape(source, info, in_set):
    "Parses an escape sequence."
    # The character after the backslash must be read verbatim, even in
    # verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: either CR LF as a unit or any single line-ending
        # codepoint.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Charset escapes (e.g. \d, \w) depend on the encoding in effect.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Single-character escapes (e.g. \n, \t).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    "Parses a numeric escape sequence."
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            # Byte patterns wrap at 0xFF; otherwise at 0x1FF (max 3 octal
            # digits).
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference; put back the character that wasn't a digit.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    "Parses an octal escape sequence."
    # `digits` already contains the first character; collect up to 3 in all.
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Put back the character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # The first character wasn't an octal digit, so the whole escape is
        # invalid.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    # Note: `type` (the escape letter, used in error messages) shadows the
    # builtin of the same name within this function.
    "Parses a hex escape sequence."
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # The codepoint must be within the Unicode range.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
def parse_group_ref(source, info):
    """Parses a group reference of the form <name> or <number>."""
    source.expect("<")
    name_pos = source.pos
    group_name = parse_name(source, True)
    source.expect(">")
    # Referring back to a group that hasn't been closed yet is an error.
    if info.is_open_group(group_name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, group_name, name_pos)
def parse_string_set(source, info):
    """Parses a string set reference of the form <name>."""
    source.expect("<")
    list_name = parse_name(source, True)
    source.expect(">")
    # The named list must have been supplied as a keyword argument.
    if list_name is None or list_name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, list_name)
def parse_named_char(source, info, in_set):
    "Parses a named character."
    saved_pos = source.pos
    if source.match("{"):
        # Looks like \N{...}; the name may contain spaces.
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                value = unicodedata.lookup(name)
                return make_character(info, ord(value), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed \N{...}; treat "N" as a literal.
    source.pos = saved_pos
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    "Parses a Unicode property."
    # `positive` is False when the escape letter was "P" rather than "p".
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # Delimited form, e.g. \p{Letter}; "^" negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    "Parses a property name, which may be qualified."
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        # Looks like a qualified name, e.g. "Script: Latin".
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Put back anything not consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    "Parses a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # Version 1 sets support the explicit set operators (||, ~~, etc.).
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
def parse_set_union(source, info):
    """Parses a set union ([x||y])."""
    members = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        members.append(parse_set_symm_diff(source, info))

    # A union of one item is just that item.
    return members[0] if len(members) == 1 else SetUnion(info, members)
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y])."""
    members = [parse_set_inter(source, info)]
    while source.match("~~"):
        members.append(parse_set_inter(source, info))

    # A symmetric difference of one item is just that item.
    return members[0] if len(members) == 1 else SetSymDiff(info, members)
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y])."""
    members = [parse_set_diff(source, info)]
    while source.match("&&"):
        members.append(parse_set_diff(source, info))

    # An intersection of one item is just that item.
    return members[0] if len(members) == 1 else SetInter(info, members)
def parse_set_diff(source, info):
    """Parses a set difference ([x--y])."""
    members = [parse_set_imp_union(source, info)]
    while source.match("--"):
        members.append(parse_set_imp_union(source, info))

    # A difference of one item is just that item.
    return members[0] if len(members) == 1 else SetDiff(info, members)
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
def parse_set_member(source, info):
    "Parses a member in a character set."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a single positive character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range of one character.
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    "Parses an item in a character set."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    # A plain literal character.
    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
def parse_posix_class(source, info):
    """Parses a POSIX character class, e.g. [:alpha:]."""
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not terminated correctly, so it isn't a POSIX class after all.
    raise ParseError()
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping when the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Named `frac_part` to avoid shadowing the module-level `error` exception
    # class (the original local was called `error`).
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
def numeric_to_rational(numeric):
    """Converts a numeric string to a rational string, if possible."""
    # Strip a leading minus sign, re-attaching it to the result later.
    sign = ""
    if numeric[ : 1] == "-":
        sign = "-"
        numeric = numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        # More than one "/" can't be a rational.
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # Omit a denominator of 1.
    return result[ : -2] if result.endswith("/1") else result
def standardise_name(name):
    """Standardises a property or value name for lookup."""
    try:
        # Numeric values are canonicalised as rationals.
        return numeric_to_rational("".join(name))
    except (ValueError, ZeroDivisionError):
        # Otherwise: uppercase, with spaces, hyphens and underscores removed.
        kept = (ch for ch in name if ch not in "_- ")
        return "".join(kept).upper()
# POSIX classes which get a "POSIX" prefix on lookup because their regex
# definitions differ from the like-named Unicode properties.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete value set of a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    "Looks up a property."
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the complement of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    # Some POSIX classes have their own definitions, distinct from the
    # like-named properties.
    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    # Returns (is_group_reference, list_of_codepoints_or_group_index).
    "Compiles a replacement template escape sequence."
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Byte replacement templates wrap octal values at 0xFF.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    """Parses a hex escape sequence in a replacement string."""
    digits = ""
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, digits),
              source.string, source.pos)
        digits += ch

    return int(digits, 16)
def parse_repl_named_char(source):
    """Parses a named character in a replacement string.

    Returns the codepoint, or None if it isn't a well-formed \\N{...}.
    """
    saved_pos = source.pos
    if source.match("{"):
        char_name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a named character; back up and let the caller handle it.
    source.pos = saved_pos
    return None
def compile_repl_group(source, pattern):
    """Compiles a replacement template group reference, returning its index."""
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric reference must lie within the pattern's group range
        # (0 refers to the whole match).
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
1891# The regular expression is parsed into a syntax tree. The different types of
1892# node are defined below.
1894INDENT = " "
# Bit flags or'd into opcodes when compiling nodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored in the bits above the boolean flags.
ENCODING_OP_SHIFT = 5

# Text used by the nodes' dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    """Wraps a list of nodes in a Sequence, unless there's only one."""
    return items[0] if len(items) == 1 else Sequence(items)
# Common base class for all nodes.
class RegexBase:
    def __init__(self):
        # The key used for equality and hashing; subclasses extend it.
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns this node with the given flags changed, or self unchanged
        # if nothing would differ.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical case-flag combination.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    # The methods below are defaults which subclasses override as needed.

    def fix_groups(self, pattern, reverse, fuzzy):
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default the set of possible first items can't be determined.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # By default there's no required string, only the node's width.
        return self.max_width(), None
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node consumes no character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        # The encoding occupies the bits above the boolean flags.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        # Zero-width by definition.
        return 0
class Any(RegexBase):
    # A node matching any single character (the "." pattern).
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        # The opcode depends on the match direction.
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1
class AnyAll(Any):
    # The DOTALL variant of Any.
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Doesn't call the base __init__; sets the attributes and key
        # directly.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
class AnyU(Any):
    # The Unicode variant of Any (presumably differs in which line endings
    # are excluded -- see the ANY_U opcode's implementation).
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    # A node wrapping a subpattern as an atomic group; most operations
    # delegate to the subpattern.
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around an empty subpattern is redundant.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Bracket the subpattern's code between ATOMIC and END opcodes.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class Boundary(ZeroWidthBase):
    # A word-boundary assertion.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2105class Branch(RegexBase):
2106 def __init__(self, branches):
2107 RegexBase.__init__(self)
2108 self.branches = branches
2110 def fix_groups(self, pattern, reverse, fuzzy):
2111 for b in self.branches:
2112 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        if not self.branches:
            # No alternatives at all: an empty sequence.
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    # The precheck goes on the side from which matching starts.
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        # Builds a quick first-character check for the branches, or returns
        # None/nothing if one can't be built.
        charset = set()
        # The relevant character is the last one when matching in reverse.
        pos = -1 if reverse else 0

        for branch in branches:
            # Only case-sensitive literals are simple enough to precheck.
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2164 def pack_characters(self, info):
2165 self.branches = [b.pack_characters(info) for b in self.branches]
2166 return self
2168 def remove_captures(self):
2169 self.branches = [b.remove_captures() for b in self.branches]
2170 return self
2172 def is_atomic(self):
2173 return all(b.is_atomic() for b in self.branches)
2175 def can_be_affix(self):
2176 return all(b.can_be_affix() for b in self.branches)
2178 def contains_group(self):
2179 return any(b.contains_group() for b in self.branches)
2181 def get_firstset(self, reverse):
2182 fs = set()
2183 for b in self.branches:
2184 fs |= b.get_firstset(reverse)
2186 return fs or set([None])
    def _compile(self, reverse, fuzzy):
        if not self.branches:
            return []

        # Alternatives are separated by NEXT opcodes.
        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The trailing NEXT becomes the terminating END.
        code[-1] = (OP.END, )

        return code
    def dump(self, indent, reverse):
        # Dump the alternatives, separated by "OR".
        print("{}BRANCH".format(INDENT * indent))
        self.branches[0].dump(indent + 1, reverse)
        for b in self.branches[1 : ]:
            print("{}OR".format(INDENT * indent))
            b.dump(indent + 1, reverse)
2208 @staticmethod
2209 def _flatten_branches(info, reverse, branches):
2210 # Flatten the branches so that there aren't branches of branches.
2211 new_branches = []
2212 for b in branches:
2213 b = b.optimise(info, reverse)
2214 if isinstance(b, Branch):
2215 new_branches.extend(b.branches)
2216 else:
2217 new_branches.append(b)
2219 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? (Scanned with negative indices,
        # from the end towards the start.)
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Return whether `items` may be split before index `count` without
        breaking a run of fully case-folded characters."""
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Reverse-direction counterpart of _can_split: return whether `items`
        may be split `count` items from the end without breaking a run of
        fully case-folded characters."""
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2356 @staticmethod
2357 def _merge_common_prefixes(info, reverse, branches):
2358 # Branches with the same case-sensitive character prefix can be grouped
2359 # together if they are separated only by other branches with a
2360 # character prefix.
2361 prefixed = defaultdict(list)
2362 order = {}
2363 new_branches = []
2364 for b in branches:
2365 if Branch._is_simple_character(b):
2366 # Branch starts with a simple character.
2367 prefixed[b.value].append([b])
2368 order.setdefault(b.value, len(order))
2369 elif (isinstance(b, Sequence) and b.items and
2370 Branch._is_simple_character(b.items[0])):
2371 # Branch starts with a simple character.
2372 prefixed[b.items[0].value].append(b.items)
2373 order.setdefault(b.items[0].value, len(order))
2374 else:
2375 Branch._flush_char_prefix(info, reverse, prefixed, order,
2376 new_branches)
2378 new_branches.append(b)
2380 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2382 return new_branches
2384 @staticmethod
2385 def _is_simple_character(c):
2386 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Collapse runs of single-item branches into character sets.

        Adjacent branches that are a Character, Property or set and share the
        same case flags are merged into one set via _flush_set_members.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Strip the case flags here; they are re-applied to the merged
                # set when it is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the collected same-prefix branches into `new_branches`.

        `prefixed` maps a character value to the item lists of the branches
        that start with it; `order` preserves first-seen ordering.  Both are
        cleared afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        # Keep everything after the shared first character.
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # A branch that is only the prefix character makes the
                        # tail optional; one empty alternative is enough.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2442 @staticmethod
2443 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2444 # Flush the set members.
2445 if not items:
2446 return
2448 if len(items) == 1:
2449 item = list(items)[0]
2450 else:
2451 item = SetUnion(info, list(items)).optimise(info, reverse)
2453 new_branches.append(item.with_flags(case_flags=case_flags))
2455 items.clear()
2457 @staticmethod
2458 def _is_full_case(items, i):
2459 if not 0 <= i < len(items):
2460 return False
2462 item = items[i]
2463 return (isinstance(item, Character) and item.positive and
2464 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2466 @staticmethod
2467 def _is_folded(items):
2468 if len(items) < 2:
2469 return False
2471 for i in items:
2472 if (not isinstance(i, Character) or not i.positive or not
2473 i.case_flags):
2474 return False
2476 folded = "".join(chr(i.value) for i in items)
2477 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2479 # Get the characters which expand to multiple codepoints on folding.
2480 expanding_chars = _regex.get_expand_on_folding()
2482 for c in expanding_chars:
2483 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2484 return True
2486 return False
2488 def is_empty(self):
2489 return all(b.is_empty() for b in self.branches)
2491 def __eq__(self, other):
2492 return type(self) is type(other) and self.branches == other.branches
2494 def max_width(self):
2495 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A call to a capture group's subpattern, e.g. (?1) or (?&name)."""

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # May be a name or a number until fix_groups resolves it.
        self.group = group
        # Position in the pattern, for error reporting.
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the reference to a group number and validate it.

        Raises `error` for invalid, unknown or ambiguous references.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            # Not numeric: look the name up in the pattern's group index.
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group opened more than once cannot be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): this reads self.pattern, which is never assigned in
        # this class -- confirm the attribute is provided elsewhere.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # self.call_ref is assigned externally once the called group's code
        # has been given a call ref.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        # The width of the called subpattern isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """Wraps a parsed subpattern so it can be the target of group calls."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        head = [(OP.CALL_REF, self.ref)]
        body = self.parsed._compile(reverse, fuzzy)
        return head + body + [(OP.END, )]
class Character(RegexBase):
    """A single codepoint, possibly negated, case-insensitive or zero-width."""

    # Opcode selection keyed on (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            # Pre-compute the full case-folding; it may be longer than one
            # character (e.g. sharp s folds to "ss").
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding: match either the
            # single character or its multi-character folding.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Return whether codepoint `ch` matches, honouring negation."""
        return (ch == self.value) == self.positive

    def max_width(self):
        # The folded form may span several codepoints.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern, (?(group)yes|no), keyed on a group match."""

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        # Position in the pattern, for error reporting.
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference, then fix up both alternatives."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            # Only emit the alternative separator when the "no" side has code.
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_BOUNDARY opcode."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_END_OF_WORD opcode."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the DEFAULT_START_OF_WORD opcode."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_LINE opcode."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to the END_OF_LINE_U (Unicode) opcode."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_STRING opcode."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_STRING_LINE opcode."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to the END_OF_STRING_LINE_U opcode."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to the END_OF_WORD opcode."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width item that always fails to match (emits OP.FAILURE)."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A fuzzy-matching region with constraints on error counts and cost.

    `constraints` maps "d" (deletions), "i" (insertions), "s" (substitutions)
    and "e" (any error) to (min, max) pairs — max is None for unlimited —
    plus a "cost" dict of per-error coefficients and a "max" cost bound.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself compiled as fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of d, i, s, e.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        # An optional "test" subpattern selects the extended fuzzy opcode.
        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        """Format the constraints roughly as they'd appear in a pattern."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches a single extended grapheme cluster."""

    def _compile(self, reverse, fuzzy):
        # Match at least one character, lazily extended until a grapheme
        # boundary is reached; the same pattern works in both directions.
        matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))
        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print(INDENT * indent + "GRAPHEME")

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary check.

    Not a RegexBase subclass; it provides only the compile() method that
    Grapheme._compile needs.
    """
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern, e.g. a*, a+, a{m,n}.

    `max_count` is None for an unbounded repeat.  Subclasses override the
    opcode (LazyRepeat) or the compilation (PossessiveRepeat).
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps subclasses (LazyRepeat, PossessiveRepeat) intact.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        # Only a fixed-count repeat of an atomic subpattern is atomic.
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            # An empty subpattern compiles to nothing at all.
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional, so nothing is required; report only the offset bound.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A repeat that never gives back what it has matched (atomic repeat)."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        body = self.subpattern.compile(reverse, fuzzy)
        if not body:
            return []

        limit = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, limit)

        # Wrap the repeat in an ATOMIC block.
        return ([(OP.ATOMIC, ), repeat] + body + [(OP.END, ), (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group wrapping a subpattern."""

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Set externally when this group is the target of a group call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Negative group numbers denote private groups; map to the public
        # number and to an index past the normal groups.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            # This group is callable; wrap its code in a CALL_REF block.
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width item compiled to the KEEP opcode."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """GreedyRepeat variant compiled with the LAZY_REPEAT opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookahead/lookbehind assertion, positive or negative."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern matches in the assertion's own direction, not the
        # enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive assertion on an empty subpattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion scanning in the same direction as the
        # match constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): the subpattern is compiled without passing `fuzzy` --
        # presumably compile() defaults it; confirm that is intended.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
class LookAroundConditional(RegexBase):
    """A conditional subpattern keyed on a lookaround, e.g. (?(?=test)yes|no)."""

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # All three parts are optimised for the lookaround's direction.
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            # Only emit the alternative separator when the "no" side has code.
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): `and` binds tighter than `or`, so this evaluates as
        # (subpattern and yes) or no -- confirm that precedence is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """Wraps a pre-built list of code words, emitted as a single tuple."""

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A property test for a single codepoint, e.g. \\p{...}."""

    # Opcode selection keyed on (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # `value` packs the property id in the high 16 bits and its value in
        # the low 16 bits (see dump()).
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zero-width settings."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Return whether codepoint `ch` has the property, honouring negation."""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    """Zero-width item compiled to the PRUNE opcode."""
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        return [(OP.PRUNE, )]
class Range(RegexBase):
    """An inclusive codepoint range [lower-upper], possibly negated and/or
    case-insensitive.
    """
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        # lower/upper are inclusive codepoints.
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Copy with different positivity/case/zero-width settings.
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # When full case-folding applies, expand characters whose folding is
        # multi-codepoint into explicit string alternatives.

        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        # Emit a single RANGE opcode with the modifier flags.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # lstrip("bu") drops any bytes/unicode literal prefix from ascii().
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        # Membership test, inverted for a negated range.
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one character.
        return 1
class RefGroup(RegexBase):
    """A backreference to a capture group, by number or by name.

    The group is stored as given (possibly a name) until fix_groups()
    resolves it to a group number.
    """
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        # position is the offset in the pattern, used for error reports.
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve a name/numeric-string reference to a group number.

        Raises error for an unknown name or an out-of-range number.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The group may have changed from a name to a number, so rebuild
        # the identity key.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # Backreferences are meaningless once captures are removed.
        # Bug fix: this previously passed the non-existent attribute
        # `self.pattern`, which raised AttributeError instead of the
        # intended parse error. No pattern text is available here, so
        # pass None (error() accepts a None pattern).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    """Zero-width anchor emitting the SEARCH_ANCHOR opcode (\G)."""
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
class Sequence(RegexBase):
    """A concatenation of regex nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        # Propagate group resolution to every child.
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        # Adjacent positive, non-zero-width Character nodes (and existing
        # String/Literal nodes) with compatible case flags are merged into
        # single String nodes via _flush_characters.
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless the neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not packable: flush what we have and recurse into the child.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Union child first-sets until one of them cannot match empty
        # (i.e. no longer contains None).
        # NOTE(review): `items.reverse()` mutates self.items in place when
        # reverse is True — confirm no later consumer depends on the
        # original order.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        # Every child could match empty.
        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Children are emitted in reverse order when compiling a reverse
        # pattern.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Turn the accumulated codepoints into Character/String nodes,
        appending them to `items` and clearing `characters` in place.
        """
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Split into full-casefold and simple-casefold chunks.
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every (start, end) span where an expanding folding occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Emit IGNORECASE literals between the merged spans and
        # FULLIGNORECASE literals inside them.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        """Merge overlapping/adjacent (start, end) spans; sorts in place."""
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the offset to, and node of, the first required string
        # found among the children, accumulating preceding offsets.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    """Base class for character-set nodes (union, intersection, difference,
    symmetric difference). Subclasses supply _opcode, _op_name and matches().
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild as the same subclass and re-optimise immediately.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # Emit the set opcode, then each member, terminated by END.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # If full case-folding applies, add string alternatives for the set
        # members whose folding expands to multiple codepoints.

        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Usually 1, but with full case-folding a member may match a folded
        # multi-codepoint string.

        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    """Set difference: matches the first item but none of the others."""

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        members = self.items

        # A difference of more than 2 items equals the first item minus the
        # union of all the rest.
        if len(members) > 2:
            members = [members[0], SetUnion(info, members[1 : ])]

        # A one-item difference is just that item.
        if len(members) == 1:
            only = members[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True)
          for m in members)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # In the difference iff in the first member but not the second.
        in_diff = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return in_diff == self.positive
class SetInter(SetBase):
    """Set intersection: matches only when every member matches."""

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for member in self.items:
            opt = member.optimise(info, reverse, in_set=True)
            if isinstance(opt, SetInter) and opt.positive:
                # Intersection in intersection: absorb the nested operands.
                flattened.extend(opt.items)
            else:
                flattened.append(opt)

        # A one-item intersection is just that item.
        if len(flattened) == 1:
            only = flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        in_all = all(member.matches(ch) for member in self.items)
        return in_all == self.positive
class SetSymDiff(SetBase):
    """Set symmetric difference: matches when an odd number of members
    match.
    """
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference.
                items.extend(m.items)
            else:
                items.append(m)

        # A one-item symmetric difference is just that item.
        if len(items) == 1:
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR the membership results of all the members.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
class SetUnion(SetBase):
    """Set union: matches when any member matches."""

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A union containing "match anything" matches anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        # A property and its negation together also match anything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            # A one-item union is just that item; combine the positivity of
            # the union with the item's own.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Split the members into plain characters (grouped by positivity)
        # and everything else.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        # Emit each character group as a single CHARACTER or STRING operand.
        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    """The (*SKIP) backtracking-control verb."""
    _op_name = "SKIP"
    _opcode = OP.SKIP
class StartOfLine(ZeroWidthBase):
    """Zero-width anchor emitting the START_OF_LINE opcode (^)."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
class StartOfLineU(StartOfLine):
    """StartOfLine variant emitting the START_OF_LINE_U opcode
    (presumably the Unicode line-ending form — confirm in _regex).
    """
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
class StartOfString(ZeroWidthBase):
    """Zero-width anchor emitting the START_OF_STRING opcode (\A)."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
class StartOfWord(ZeroWidthBase):
    """Zero-width anchor emitting the START_OF_WORD opcode (\m)."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
class String(RegexBase):
    """A literal run of codepoints, optionally case-insensitive."""

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # characters is a sequence of codepoints.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form for compilation.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True when this is the pattern's "required string".
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first-set is the first (or, reversed, the last) character.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        # lstrip("bu") drops any bytes/unicode literal prefix from ascii().
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        return 0, self
class Literal(String):
    """A String produced from a literal run; differs only in debug output."""

    def dump(self, indent, reverse):
        text = "".join(map(chr, self.characters))
        # lstrip("bu") drops any bytes/unicode literal prefix from ascii().
        shown = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, shown,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    """A named list reference (\L<name>): a branch over the alternative
    strings supplied via the keyword arguments, longest first.
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) pair so the pattern object knows
        # which named lists are used.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # NOTE(review): `index` and `fold_flags` are computed but not used
        # in this method — confirm whether they are vestigial.
        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        # Build one Character sequence per member string.
        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    "Scanner for the regular expression source string."
    # When ignore_space is True (VERBOSE mode), whitespace and #-comments
    # are skipped between significant characters. End-of-string is detected
    # via IndexError; a comment running to end-of-string via ValueError
    # (from str.index).
    def __init__(self, string):
        # Bytes patterns are decoded as Latin-1 so positions map 1:1.
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the original type.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Return the next significant character, or an empty string at the
        end of the source.
        """
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Return up to `count` significant characters as a string."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        """Consume and return characters while their membership in
        `test_set` equals `include`.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Like get_while, but discard the consumed characters."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """If the source continues with `substring`, consume it and return
        True; otherwise leave the position unchanged and return False.
        """
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Match character by character, skipping whitespace and
                # comments before each one.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        """Consume `substring` or raise error if it isn't next."""
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Return True if only whitespace/comments (if ignored) remain."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): the mutable default `kwargs={}` is shared across
        # calls; it appears to be read-only here, but confirm.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}        # name -> group number
        self.group_name = {}         # group number -> name
        self.char_type = char_type
        self.named_lists_used = {}   # (name, case_flags) -> index
        self.open_groups = []        # stack of currently-open groups
        self.open_group_count = {}   # group number -> open count
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}     # negative alias -> real group number

    def open_group(self, name=None):
        """Allocate (or reuse, for a named group) a group number and push
        it onto the open-group stack. Returns the (possibly aliased)
        group number.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a named group.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        # Pop the most recently opened group.
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Fills in info.call_refs and info.additional_groups, and assigns each
    group call its call_ref index.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Return (offset, characters, case_flags) for the parsed pattern's
    required string, or (0, (), 0) if it has none.
    """
    searching_reversed = bool(flags & REVERSE)
    req_offset, required = parsed.get_required_string(searching_reversed)

    if not required:
        return 0, (), 0

    # Mark the node so compilation flags it as required.
    required.required = True

    # An unbounded offset is reported as -1.
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
class Scanner:
    """A lexical scanner: compiles a lexicon of (pattern, action) pairs into
    one alternation and tokenises strings with it.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern.
        # Each phrase becomes group g+1 so lastindex identifies the rule.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Tokenise `string`.

        Returns (results, remainder): the list of non-None action results
        and the unconsumed tail of the string.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            # Stop on a zero-width match to avoid an infinite loop.
            if i == j:
                break
            # lastindex identifies which phrase matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
# Get the known properties dict from the C extension: maps each property
# name/alias to (property id, {value name: value id}).
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: property id -> (longest name,
# {value id: longest value name}). When names tie in length, the first one
# kept wins (max returns its first maximal argument).
PROPERTY_NAMES = {}
for full_name, (ident, value_map) in PROPERTIES.items():
    best_name, names_by_value = PROPERTY_NAMES.get(ident, ("", {}))
    PROPERTY_NAMES[ident] = (max(best_name, full_name, key=len),
      names_by_value)

    for value_name, value_ident in value_map.items():
        names_by_value[value_ident] = max(names_by_value.get(value_ident,
          ""), value_name, key=len)
# Character escape sequences: maps the letter after the backslash to the
# control character it denotes.
CHARACTER_ESCAPES = dict(
    a="\a",  # bell
    b="\b",  # backspace
    f="\f",  # form feed
    n="\n",  # newline
    r="\r",  # carriage return
    t="\t",  # horizontal tab
    v="\v",  # vertical tab
)
# Encoding selectors for escape-sequence semantics (ASCII-only vs Unicode).
ASCII_ENCODING, UNICODE_ENCODING = 1, 2
# Predefined character set escape sequences (\d, \s, \w, etc.), using the
# default encoding; lowercase letters are the positive sets, uppercase the
# negated ones.
CHARSET_ESCAPES = {
    ch: lookup_property(None, prop, positive)
    for ch, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("h", "Blank", True),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
}
# ASCII-restricted versions of the character set escapes; \h has no
# ASCII-specific form, so the default entry is inherited unchanged.
ASCII_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),
}
# Unicode versions of the character set escapes; \h has no Unicode-specific
# form, so the default entry is inherited unchanged.
UNICODE_CHARSET_ESCAPES = {
    **CHARSET_ESCAPES,
    "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),
    "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),
    "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),
    "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),
    "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),
    "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),
}
# Positional (zero-width) escape sequences: anchors and boundaries.
POSITION_ESCAPES = dict(
    A=StartOfString(),
    b=Boundary(),
    B=Boundary(False),   # negated word boundary
    K=Keep(),
    m=StartOfWord(),
    M=EndOfWord(),
    Z=EndOfString(),
)
# ASCII-restricted positional escapes; \A, \K and \Z are encoding-neutral
# and inherited unchanged.
ASCII_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=ASCII_ENCODING),
    "B": Boundary(False, encoding=ASCII_ENCODING),
    "m": StartOfWord(encoding=ASCII_ENCODING),
    "M": EndOfWord(encoding=ASCII_ENCODING),
}
# Unicode positional escapes; \A, \K and \Z are encoding-neutral and
# inherited unchanged.
UNICODE_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": Boundary(encoding=UNICODE_ENCODING),
    "B": Boundary(False, encoding=UNICODE_ENCODING),
    "m": StartOfWord(encoding=UNICODE_ENCODING),
    "M": EndOfWord(encoding=UNICODE_ENCODING),
}
# Positional escape sequences used when the WORD flag is set: word-boundary
# escapes switch to the "default word boundary" rules.
WORD_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
}
# Regex control verbs, e.g. (*FAIL)/(*F), (*PRUNE), (*SKIP). "F" is an
# alias for "FAIL"; each entry deliberately gets its own node instance.
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)