Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 26%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21from regex import _regex
23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
27 "Scanner", "RegexFlag"]
29# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or a bytes object; pick a matching newline.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Line and column are 1-based. rfind returns -1 when there is no
            # preceding newline, which makes colno == pos + 1.
            self.lineno = 1 + pattern.count(newline, 0, pos)
            self.colno = pos - pattern.rfind(newline, 0, pos)
            message = f"{message} at position {pos}"
            if newline in pattern:
                message += f" (line {self.lineno}, column {self.colno})"
        super().__init__(message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
class _UnscopedFlagSet(Exception):
    """Raised when a global flag is turned on positionally under the old
    behaviour; raised with the new global flags as its argument so the
    caller can reparse the pattern (see parse_flags_subpattern)."""
    pass
64# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    """Raised internally when parsing fails and another interpretation of
    the text should be tried instead."""
    pass
68# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    """Raised when there isn't a valid first set."""
    pass
72# Flags.
class RegexFlag(enum.IntFlag):
    """The regex flags. Each long name (eg. IGNORECASE) has a one- or
    two-letter alias (eg. I) with the same value."""
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A single named flag prints as, eg., 'regex.I'.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        if negative:
            # Show a negative value as the complement of its positive
            # counterpart, eg. ~(regex.I|regex.M).
            value = ~value

        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        if value:
            # Any remaining bits don't correspond to a named flag.
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # Use the plain object str, not IntFlag's.
    __str__ = object.__str__
# Put the flags into the module namespace. Being explicit here helps tools like
# linters and IDEs understand the code better.
ASCII = RegexFlag.ASCII
BESTMATCH = RegexFlag.BESTMATCH
DEBUG = RegexFlag.DEBUG
DOTALL = RegexFlag.DOTALL
ENHANCEMATCH = RegexFlag.ENHANCEMATCH
FULLCASE = RegexFlag.FULLCASE
IGNORECASE = RegexFlag.IGNORECASE
LOCALE = RegexFlag.LOCALE
MULTILINE = RegexFlag.MULTILINE
POSIX = RegexFlag.POSIX
REVERSE = RegexFlag.REVERSE
TEMPLATE = RegexFlag.TEMPLATE
UNICODE = RegexFlag.UNICODE
VERBOSE = RegexFlag.VERBOSE
VERSION0 = RegexFlag.VERSION0
VERSION1 = RegexFlag.VERSION1
WORD = RegexFlag.WORD
# The short aliases.
A = RegexFlag.A
B = RegexFlag.B
D = RegexFlag.D
E = RegexFlag.E
F = RegexFlag.F
I = RegexFlag.I
L = RegexFlag.L
M = RegexFlag.M
P = RegexFlag.P
R = RegexFlag.R
S = RegexFlag.S
U = RegexFlag.U
V0 = RegexFlag.V0
V1 = RegexFlag.V1
W = RegexFlag.W
X = RegexFlag.X
T = RegexFlag.T

# The default behaviour version.
DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# The empty string marks the end of the pattern as "special".
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own (without IGNORECASE) is treated as no case flags.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
211# The names of the opcodes.
212OPCODES = """
213FAILURE
214SUCCESS
215ANY
216ANY_ALL
217ANY_ALL_REV
218ANY_REV
219ANY_U
220ANY_U_REV
221ATOMIC
222BOUNDARY
223BRANCH
224CALL_REF
225CHARACTER
226CHARACTER_IGN
227CHARACTER_IGN_REV
228CHARACTER_REV
229CONDITIONAL
230DEFAULT_BOUNDARY
231DEFAULT_END_OF_WORD
232DEFAULT_START_OF_WORD
233END
234END_OF_LINE
235END_OF_LINE_U
236END_OF_STRING
237END_OF_STRING_LINE
238END_OF_STRING_LINE_U
239END_OF_WORD
240FUZZY
241GRAPHEME_BOUNDARY
242GREEDY_REPEAT
243GROUP
244GROUP_CALL
245GROUP_EXISTS
246KEEP
247LAZY_REPEAT
248LOOKAROUND
249NEXT
250PROPERTY
251PROPERTY_IGN
252PROPERTY_IGN_REV
253PROPERTY_REV
254PRUNE
255RANGE
256RANGE_IGN
257RANGE_IGN_REV
258RANGE_REV
259REF_GROUP
260REF_GROUP_FLD
261REF_GROUP_FLD_REV
262REF_GROUP_IGN
263REF_GROUP_IGN_REV
264REF_GROUP_REV
265SEARCH_ANCHOR
266SET_DIFF
267SET_DIFF_IGN
268SET_DIFF_IGN_REV
269SET_DIFF_REV
270SET_INTER
271SET_INTER_IGN
272SET_INTER_IGN_REV
273SET_INTER_REV
274SET_SYM_DIFF
275SET_SYM_DIFF_IGN
276SET_SYM_DIFF_IGN_REV
277SET_SYM_DIFF_REV
278SET_UNION
279SET_UNION_IGN
280SET_UNION_IGN_REV
281SET_UNION_REV
282SKIP
283START_OF_LINE
284START_OF_LINE_U
285START_OF_STRING
286START_OF_WORD
287STRING
288STRING_FLD
289STRING_FLD_REV
290STRING_IGN
291STRING_IGN_REV
292STRING_REV
293FUZZY_EXT
294"""
# Define the opcodes in a namespace.
class Namespace:
    """An empty namespace to hold the opcode constants."""
    pass

OP = Namespace()
# Attach each opcode name with its index, eg. OP.FAILURE == 0.
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
    """Make room in the given cache.

    Args:
        cache_dict: The cache dictionary to modify.
        args_dict: The dictionary of named list args used by patterns;
          rebuilt to match the entries that survive the shrink.
        locale_sensitive: The dictionary of locale-sensitivity entries,
          keyed by (pattern_type, pattern); also rebuilt to match.
        max_length: Maximum # of entries in cache_dict before it is shrunk.
        divisor: Cache will shrink to max_length - 1/divisor*max_length items.
    """
    # Toss out a fraction of the entries at random to make room for new ones.
    # A random algorithm was chosen as opposed to simply cache_dict.popitem()
    # as popitem could penalize the same regular expression repeatedly based
    # on its internal hash value. Being random should spread the cache miss
    # love around.
    cache_keys = tuple(cache_dict.keys())
    overage = len(cache_keys) - max_length
    if overage < 0:
        # Cache is already within limits. Normally this should not happen
        # but it could due to multithreading.
        return

    number_to_toss = max_length // divisor + overage

    # The import is done here to avoid a circular dependency.
    import random
    if not hasattr(random, 'sample'):
        # Do nothing while resolving the circular dependency:
        #  re->random->warnings->tokenize->string->re
        return

    for doomed_key in random.sample(cache_keys, number_to_toss):
        try:
            del cache_dict[doomed_key]
        except KeyError:
            # Ignore problems if the cache changed from another thread.
            pass

    # Rebuild the arguments and locale-sensitivity dictionaries.
    args_dict.clear()
    sensitivity_dict = {}
    for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
        args_dict[pattern, pattern_type, flags, default_version, locale] = args
        try:
            sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
        except KeyError:
            pass

    locale_sensitive.clear()
    locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string."
    # When none of the encoding flags (ASCII/LOCALE/UNICODE) has been set
    # explicitly, fall back to the guessed encoding.
    flags = info.flags
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding
    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    """Checks whether a character is cased, taking the flags from `info`.

    Delegates to is_cased_f so that the two helpers cannot drift apart;
    they differ only in where the flags come from.
    """
    return is_cased_f(info.flags, char)
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # A character is cased if it has more than one case variant.
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An empty or match-anything firstset is useless for optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []
    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    """Checks the firstset for the pattern, returning an optimised set node
    or None if no usable firstset can be built."""
    # A None member means some alternative contributed no usable firstset.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        # Store members case-insensitively; the union carries the flags.
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding would make the firstset unreliable, so give up.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
411def _flatten_code(code):
412 "Flattens the code from a list of tuples."
413 flat_code = []
414 for c in code:
415 flat_code.extend(c)
417 return flat_code
def make_case_flags(info):
    "Makes the case flags."
    case_flags = info.flags & CASE_FLAGS
    # Full case-folding is a Unicode feature, so turn it off under ASCII.
    if info.flags & ASCII:
        case_flags &= ~FULLCASE
    return case_flags
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if not in_set:
        # Outside a set the literal carries the current case flags.
        return Character(value, case_flags=make_case_flags(info))
    # A character set is built case-sensitively.
    return Character(value)
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
def make_property(info, prop, in_set):
    "Makes a property."
    if not in_set:
        # Outside a set the property carries the current case flags.
        return prop.with_flags(case_flags=make_case_flags(info))
    return prop
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect one sequence per alternative, separated by "|".
    alternatives = [parse_sequence(source, info)]
    while source.match("|"):
        alternatives.append(parse_sequence(source, info))

    # A lone alternative doesn't need a Branch wrapper.
    if len(alternatives) > 1:
        return Branch(alternatives)
    return alternatives[0]
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None is used as a sentinel marking positions where a quantifier would
    # have nothing to repeat; the sentinels are stripped at the end.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # Inline flags were parsed; they may change the case
                    # flags for the rest of the sequence.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Strip the None sentinels before building the sequence node.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Applies a quantifier to the last element of `sequence` in place.

    Raises error if there's nothing to repeat or the previous element was
    itself a quantifier.
    """
    element = sequence.pop()
    if element is None:
        # The None sentinel is on top: either the previous item was a
        # quantifier, or the sequence is empty.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the last element of `sequence` in place."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    if isinstance(element, Group):
        # If a group is marked as fuzzy then put all of the fuzzy part in
        # the group.
        element.subpattern = Fuzzy(element.subpattern, constraints)
    else:
        element = Fuzzy(element, constraints)

    sequence.append(element)
# The quantifiers with fixed counts: optional, zero-or-more, one-or-more.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier, returning (min, max) counts, or None."
    # "?", "*" and "+" have fixed counts.
    counts = _QUANTIFIERS.get(ch)
    if counts:
        return counts

    # "{" may start a counted repeat, eg. 'a{2,3}'.
    if ch == "{":
        return parse_limited_quantifier(source)

    return None
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    if count is None:
        # An unbounded count is represented by None.
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. the '{2,3}' of 'a{2,3}'."
    saved_pos = source.pos
    min_digits = parse_count(source)
    if source.match(","):
        max_digits = parse_count(source)

        # No minimum means 0 and no maximum means unlimited.
        min_count = int(min_digits) if min_digits else 0
        max_count = int(max_digits) if max_digits else None
    else:
        # A single count with no comma, eg. '{3}'.
        if not min_digits:
            source.pos = saved_pos
            return None

        min_count = max_count = int(min_digits)

    if not source.match("}"):
        # Not a quantifier after all; back out.
        source.pos = saved_pos
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, saved_pos)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          saved_pos)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    """Parses a fuzzy setting, if present.

    Returns a dict of constraints, or None if what follows isn't a fuzzy
    setting (the caller then treats the text as literals).
    """
    saved_pos = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; back out.
        source.pos = saved_pos
        return None

    if source.match(":"):
        # An optional test pattern follows the constraints.
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    item_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # It wasn't a cost constraint; rewind and try a cost equation.
        source.pos = item_pos
        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint, eg. 'e<=2' or '1<=e<=2'."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses a cost limit."
    cost_pos = source.pos
    digits = parse_count(source)
    # parse_count returns a (possibly empty) run of decimal digits, so the
    # conversion can only fail when no digits were present.
    if digits.isdigit():
        return int(digits)
    raise error("bad fuzzy cost limit", source.string, cost_pos)
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only "d", "e", "i" and "s" are valid, and each at most once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()
    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator."
    # True for an inclusive limit ("<="), False for an exclusive one ("<"),
    # None when no comparator is present.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # Inclusive or exclusive limit?
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
def parse_cost_term(source, cost):
    "Parses a cost equation term, eg. '2i'."
    coeff = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()
    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coeff) if coeff else 1
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # The end of the pattern was reached.
        raise error("expected character set", source.string, saved_pos)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits at the current
    # position.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Any inline flags are scoped to the group.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)
        if (ch == "+" or ch == "-") and source.peek() in DIGITS:
            # (?+n)... or (?-n)...: a relative call to a group.
            return parse_rel_call_group(source, info, ch, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension, ie. the text after '(?P'."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Any inline flags are scoped to the group.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
def parse_comment(source):
    "Parses a comment, ie. the text after '(?#'. Returns None."
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            break

        if c == "\\":
            # Skip the escaped character as well.
            c = source.get(True)

    # Rewind to before the last character read, then require the closing ")".
    source.pos = saved_pos
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    """Parses a lookaround.

    `behind` is True for a lookbehind; `positive` is False for a negative
    lookaround.
    """
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the lookaround.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
    "Parses a conditional subpattern, ie. the text after '(?('."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

        source.pos = saved_pos
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # Otherwise the condition is a group name or number.
    source.pos = saved_pos
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Any inline flags are scoped to the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
def parse_lookaround_conditional(source, info, behind, positive):
    """Parses a conditional subpattern whose condition is a lookaround.

    `behind` is True for a lookbehind; `positive` is False for a negative
    lookaround.
    """
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the lookaround.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic subpattern, ie. the text after '(?>'."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Any inline flags are scoped to the atomic group.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
def parse_common(source, info):
    "Parses a common groups branch, ie. the text after '(?|'."
    # Capture group numbers in different branches can reuse the group numbers.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        # Each branch restarts numbering from the initial count.
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    # "R" recurses into the whole pattern, ie. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)
    source.expect(")")
    return CallGroup(info, group, pos)
def parse_rel_call_group(source, info, ch, pos):
    "Parses a relative call to a group, eg. '(?+2)'."
    digits = source.get_while(DIGITS)
    if not digits:
        raise error("missing relative group number", source.string, source.pos)

    offset = int(digits)
    if ch == "+":
        # Count forwards from the number of groups seen so far.
        group = info.group_count + offset
    else:
        # Count backwards; an offset of 1 refers to the current group count.
        group = info.group_count - offset + 1

    if group <= 0:
        raise error("invalid relative group number", source.string, source.pos)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    group_name = parse_name(source)
    source.expect(")")
    return CallGroup(info, group_name, pos)
def parse_flag_set(source):
    "Parses a set of inline flags, returning them OR'ed together."
    flags = 0

    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            if ch == "V":
                # The version flags are two characters: "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # The character isn't a flag; rewind to before it.
        source.pos = saved_pos

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off, returning (flags_on, flags_off)."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags, eg. the '...' of '(?i:...)'."
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set.
    # NOTE(review): the test uses info.flags, so an encoding flag inherited
    # from the enclosing scope is cleared even when flags_on turns on no
    # encoding flag of its own — confirm whether `flags_on & _ALL_ENCODINGS`
    # was intended for both the test and the OR below.
    if info.flags & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the flags of the enclosing scope.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # "(?flags:...)": a subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # "(?flags)": positional inline flags.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags, which apply to the rest of the pattern."
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name, optionally allowing a group number."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric "name" is only valid where group numbers are accepted.
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string, source.pos)

    return name
def is_octal(string):
    "Checks whether a string is octal."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False
    return True
def is_decimal(string):
    "Checks whether a string is decimal."
    for ch in string:
        if ch not in DIGITS:
            return False
    return True
def is_hexadecimal(string):
    "Checks whether a string is hexadecimal."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False
    return True
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has already been consumed).

    in_set: whether we're inside a character set, which disables the
    group-reference, anchor and positional escapes.
    """
    # Read the escaped character literally, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending: CRLF as a unit, or any single line-ending character.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            # Unicode adds NEL, LINE SEPARATOR and PARAGRAPH SEPARATOR.
            charset.extend([0x85, 0x2028, 0x2029])

        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The escape tables are checked in priority order: WORD beats
            # ASCII beats UNICODE beats the default.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # A character-class escape such as \d, \s or \w.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # A single-character escape such as \n or \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence, which is either an octal escape or a
    group reference, disambiguated by the digits that follow.
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            # Narrow encodings only keep the low byte.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference; put back the character that wasn't a digit.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits.

    digits: a list already containing the first digit read by the caller.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Put back the character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() failed, so the first digit must have been 8 or 9.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses a hex escape sequence of exactly expected_len digits.

    esc/type: the escape letter (e.g. "x", "u"), used in error messages.
    """
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # Only codepoints within the Unicode range are accepted.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
def parse_group_ref(source, info):
    "Parses a group reference, e.g. \\g<name>."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")

    # A group can't be referenced while it's still being parsed.
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
def parse_string_set(source, info):
    "Parses a string set reference, e.g. \\L<name>."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")

    # The named list must have been supplied as a keyword argument.
    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
def parse_named_char(source, info, in_set):
    "Parses a named character, e.g. \\N{LATIN SMALL LETTER A}."
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                codepoint = unicodedata.lookup(name)
                return make_character(info, ord(codepoint), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; treat the "N" as a literal.
    source.pos = start
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, e.g. \\p{Letter} or \\pL.

    positive: True for \\p, False for \\P.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # A braced property, possibly negated with "^", e.g. \p{^Letter}.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            if info.flags & ASCII:
                encoding = ASCII_ENCODING
            elif info.flags & UNICODE:
                encoding = UNICODE_ENCODING
            else:
                encoding = 0

            # \P{^...} double-negates back to positive.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        if info.flags & ASCII:
            encoding = ASCII_ENCODING
        elif info.flags & UNICODE:
            encoding = UNICODE_ENCODING
        else:
            encoding = 0

        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    """Parses a property name, which may be qualified, e.g. "Script=Latin".

    Returns (prop_name, name); prop_name is None for an unqualified name.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Rewind past anything that wasn't consumed as part of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    "Parses a character set (the opening '[' has already been consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant inside a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # VERSION0 has no set operators; VERSION1 supports ||, ~~, && and --.
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    # A union of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    # A difference of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    # An intersection of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetInter(info, operands)
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    # A difference of a single operand is just that operand.
    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy]): members until ']' or a set operator."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
def parse_set_member(source, info):
    "Parses a member in a character set: a single item or a range like a-z."
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    # A degenerate range such as a-a is just the character.
    if start.value == end.value:
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    "Parses an item in a character set: escape, POSIX class, nested set or char."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    # Nested sets are only supported in VERSION1 behaviour.
    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    return Character(ord(ch))
def parse_posix_class(source, info):
    "Parses a POSIX character class, e.g. [[:alpha:]]."
    negated = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negated, source,
          posix=True)

    # Not terminated by ":]", so it isn't a POSIX class after all.
    raise ParseError()
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Renamed from 'error', which shadowed this module's 'error' exception
    # class within the function body.
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    # Recurse on the reciprocal of the fractional part.
    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    sign = ""
    if numeric[ : 1] == "-":
        sign, numeric = numeric[0], numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    # Suppress a denominator of 1.
    if result.endswith("/1"):
        return result[ : -2]

    return result
def standardise_name(name):
    "Standardises a property or value name."
    joined = "".join(name)
    try:
        return numeric_to_rational(joined)
    except (ValueError, ZeroDivisionError):
        # Not numeric: drop underscores, hyphens and spaces, then uppercase.
        return "".join(ch for ch in joined if ch not in "_- ").upper()
# POSIX classes whose names clash with Unicode properties; in POSIX mode they
# are given a "POSIX" prefix before lookup.
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The value names accepted by binary (yes/no) Unicode properties.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property by name, returning a Property node.

    property: the property name, or None if only a value/name is given.
    value:    the property value (or an unqualified property name).
    positive: whether the test is positive or negated.
    source:   the pattern source, used only for error positions.
    posix:    whether POSIX class names take precedence.
    encoding: the encoding flag attached to the resulting Property.
    Raises error if the property or value is unknown.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            # Strip the actual prefix rather than a hard-coded 2 characters,
            # so this stays correct if a prefix of another length is added.
            val_id = value_dict.get(value[len(prefix) : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): is_group is True when codes holds a group
    index, False when it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        # \u and \U escapes are only valid in unicode replacement strings.
        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Bytes replacements only keep the low byte of an octal escape.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference; put back the last character read.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
def parse_repl_named_char(source):
    "Parses a named character in a replacement string."
    start = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character reference.
    source.pos = start
    return None
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric reference; group 0 is the whole match.
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the dump() methods.
INDENT = " "
# Bit flags packed into compiled opcodes.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value is stored in the bits above the flags.
ENCODING_OP_SHIFT = 5

# Text fragments used by the dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    "Wraps a list of nodes in a Sequence unless it has exactly one item."
    return items[0] if len(items) == 1 else Sequence(items)
# Common base class for all nodes.
class RegexBase:
    """Base class of all syntax-tree nodes.

    Subclasses override the no-op defaults below; _key identifies a node for
    equality and hashing and is extended by subclasses.
    """

    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        # Returns self unchanged, or a rebuilt copy if any flag differs.
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            # Normalise to a canonical combination of case flags.
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Default: no group references to fix.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # Default: no usable first-set can be computed for this node.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Default: no required string; the first item is the offset.
        return self.max_width(), None
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        # positive: whether this is a matching or non-matching assertion.
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Zero-width: consumes no character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        # The encoding occupies the bits above the flag bits.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
class Any(RegexBase):
    """A node matching any character except a newline (".")."""
    # Opcodes keyed by whether matching is reversed.
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always matches exactly one character.
        return 1
class AnyAll(Any):
    """A node matching any character, including newlines (DOTALL ".")."""
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"

    def __init__(self):
        # Set directly instead of via RegexBase.__init__.
        self.positive = True
        self.zerowidth = False
        self.case_flags = 0

        self._key = self.__class__, self.positive
class AnyU(Any):
    # "." under Unicode line-ending rules; only the opcodes differ from Any.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    """A node wrapping a subpattern in an atomic (non-backtracking) group."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic group around nothing is just nothing.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # The subpattern's code is bracketed by ATOMIC...END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
class Boundary(ZeroWidthBase):
    # A word-boundary assertion (\b); behaviour comes from ZeroWidthBase.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2122class Branch(RegexBase):
    def __init__(self, branches):
        # branches: list of alternative subpattern nodes, tried in order.
        RegexBase.__init__(self)
        self.branches = branches
2127 def fix_groups(self, pattern, reverse, fuzzy):
2128 for b in self.branches:
2129 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        # An empty branch matches nothing.
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        # When matching in reverse, the suffix is what's shared at the start.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        # Collects the first (or last, when reversed) character of every
        # branch so a single membership test can reject non-matches early.
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            # Only caseless literal branches allow a reliable precheck.
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2181 def pack_characters(self, info):
2182 self.branches = [b.pack_characters(info) for b in self.branches]
2183 return self
2185 def remove_captures(self):
2186 self.branches = [b.remove_captures() for b in self.branches]
2187 return self
2189 def is_atomic(self):
2190 return all(b.is_atomic() for b in self.branches)
2192 def can_be_affix(self):
2193 return all(b.can_be_affix() for b in self.branches)
2195 def contains_group(self):
2196 return any(b.contains_group() for b in self.branches)
2198 def get_firstset(self, reverse):
2199 fs = set()
2200 for b in self.branches:
2201 fs |= b.get_firstset(reverse)
2203 return fs or set([None])
    def _compile(self, reverse, fuzzy):
        # An empty branch compiles to nothing.
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The trailing NEXT is replaced by END to close the branch op.
        code[-1] = (OP.END, )

        return code
2218 def dump(self, indent, reverse):
2219 print("{}BRANCH".format(INDENT * indent))
2220 self.branches[0].dump(indent + 1, reverse)
2221 for b in self.branches[1 : ]:
2222 print("{}OR".format(INDENT * indent))
2223 b.dump(indent + 1, reverse)
2225 @staticmethod
2226 def _flatten_branches(info, reverse, branches):
2227 # Flatten the branches so that there aren't branches of branches.
2228 new_branches = []
2229 for b in branches:
2230 b = b.optimise(info, reverse)
2231 if isinstance(b, Branch):
2232 new_branches.extend(b.branches)
2233 else:
2234 new_branches.append(b)
2236 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        # Common leading items can be moved out of the branches.
        # Returns (prefix_items, remaining_branches).
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        """Split the longest common trailing items off the branches.

        Mirror image of _split_common_prefix; returns (suffix_items,
        new_branches), with suffix_items == [] when nothing was split.
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix?  Scan backwards from the end
        # using negative indices.
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
          in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Return whether splitting `items` after the first `count` items
        would not break a run of full-case-folded characters."""
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Return whether splitting `items` `count` items from the end
        would not break a run of full-case-folded characters."""
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2373 @staticmethod
2374 def _merge_common_prefixes(info, reverse, branches):
2375 # Branches with the same case-sensitive character prefix can be grouped
2376 # together if they are separated only by other branches with a
2377 # character prefix.
2378 prefixed = defaultdict(list)
2379 order = {}
2380 new_branches = []
2381 for b in branches:
2382 if Branch._is_simple_character(b):
2383 # Branch starts with a simple character.
2384 prefixed[b.value].append([b])
2385 order.setdefault(b.value, len(order))
2386 elif (isinstance(b, Sequence) and b.items and
2387 Branch._is_simple_character(b.items[0])):
2388 # Branch starts with a simple character.
2389 prefixed[b.items[0].value].append(b.items)
2390 order.setdefault(b.items[0].value, len(order))
2391 else:
2392 Branch._flush_char_prefix(info, reverse, prefixed, order,
2393 new_branches)
2395 new_branches.append(b)
2397 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2399 return new_branches
2401 @staticmethod
2402 def _is_simple_character(c):
2403 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Collapse runs of single-item branches into character sets.

        Adjacent branches that are a Character, Property or set and share
        the same case flags are gathered and flushed as one SetUnion.
        Returns the new list of branches.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Stored without case flags; they're re-applied on flush.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the branches collected in `prefixed` onto `new_branches`.

        Branches sharing a first character become one Sequence of that
        character followed by a Branch of the remainders.  `prefixed` and
        `order` are cleared afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        # Emit groups in first-seen order of their leading character.
        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is needed.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2459 @staticmethod
2460 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2461 # Flush the set members.
2462 if not items:
2463 return
2465 if len(items) == 1:
2466 item = list(items)[0]
2467 else:
2468 item = SetUnion(info, list(items)).optimise(info, reverse)
2470 new_branches.append(item.with_flags(case_flags=case_flags))
2472 items.clear()
    @staticmethod
    def _is_full_case(items, i):
        """Return whether items[i] is a positive Character with full
        case-folding; False when i is out of range."""
        if not 0 <= i < len(items):
            return False

        item = items[i]
        return (isinstance(item, Character) and item.positive and
          (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
    @staticmethod
    def _is_folded(items):
        """Return whether `items` (>= 2 case-insensitive positive
        Characters) together fold to the same string as some single
        character that expands on full case-folding."""
        if len(items) < 2:
            return False

        for i in items:
            if (not isinstance(i, Character) or not i.positive or not
              i.case_flags):
                return False

        folded = "".join(chr(i.value) for i in items)
        folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        for c in expanding_chars:
            if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
                return True

        return False
2505 def is_empty(self):
2506 return all(b.is_empty() for b in self.branches)
2508 def __eq__(self, other):
2509 return type(self) is type(other) and self.branches == other.branches
2511 def max_width(self):
2512 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A call to a group, e.g. (?1) or (?&name): matches the group's
    subpattern recursively."""
    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        # Group number or name; resolved to a number in fix_groups.
        self.group = group
        # Position in the pattern, for error reporting.
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and register this call."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # NOTE(review): 'self.pattern' is never assigned on this class —
        # looks like it should be the pattern passed elsewhere; confirm.
        raise error("group reference not allowed", self.pattern, self.position)

    def _compile(self, reverse, fuzzy):
        # self.call_ref is assigned externally once group calls have been
        # resolved.
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """Wraps a parsed subpattern with a call reference so it can be the
    target of group calls."""
    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF, subpattern code, END.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
class Character(RegexBase):
    """A single code point, possibly negated, case-insensitive and/or
    zero-width."""
    # Opcode selected by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Pre-compute the full case-folding; it may expand to more than
        # one character (e.g. sharp s -> "ss").
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zerowidth settings."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match the folded string.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Return whether code point `ch` matches this node."""
        return (ch == self.value) == self.positive

    def max_width(self):
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """Conditional pattern (?(group)yes|no): matches yes_item when the
    group has matched, otherwise no_item."""
    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference (name or number) to a number."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either side may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS, yes-code, [NEXT, no-code], END.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiled to OP.DEFAULT_BOUNDARY."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to OP.DEFAULT_END_OF_WORD."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to OP.DEFAULT_START_OF_WORD."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_LINE."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to OP.END_OF_LINE_U."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_STRING."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_STRING_LINE."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to OP.END_OF_STRING_LINE_U."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiled to OP.END_OF_WORD."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width node that always compiles to a FAILURE opcode."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A fuzzy-matched subpattern with constraints on the numbers/costs of
    deletions ('d'), insertions ('i') and substitutions ('s'), plus the
    generic error count ('e')."""
    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of d, i, s, e.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in pattern syntax for dump()."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches a single grapheme cluster (\\X)."""
    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached. Note
        # that this is the same whether matching forwards or backwards.
        grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),
          GraphemeBoundary()]))

        return grapheme_matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary assertion."""
    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """Greedy repetition: subpattern{min_count,max_count}.

    max_count is None for an unbounded repeat.  Subclasses override the
    opcode for lazy/possessive variants.
    """
    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) preserves the greedy/lazy/possessive subclass.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The repeat is optional, so nothing is required.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """Possessive repetition: a greedy repeat wrapped in an atomic group,
    so it never gives back what it has matched."""
    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC, repeat, subpattern, END (repeat), END (atomic).
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group: (subpattern)."""
    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        # Set externally if this group is the target of a group call.
        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Removing captures discards the group wrapper entirely.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # A negative group number denotes a private (hidden) group.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            # This group is called elsewhere, so wrap it in a CALL_REF.
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width assertion compiled to OP.KEEP (\\K)."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """Lazy (non-greedy) repetition; same structure as GreedyRepeat but
    compiled with the LAZY_REPEAT opcode."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookaround assertion: (?=...), (?!...), (?<=...) or (?<!...)."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # A lookbehind scans backwards, regardless of the outer direction.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround at an empty pattern always succeeds.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only useful when the assertion scans in the same direction as
        # the match.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Zero-width: consumes no characters.
        return 0
class LookAroundConditional(RegexBase):
    """Conditional on a lookaround: (?(?=test)yes|no)."""
    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL, test, NEXT, yes-code, [NEXT, no-code], END.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): 'and' binds tighter than 'or' here, so this is
        # (sub and yes) or no — confirm the grouping is intentional.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """Wraps an already-built code sequence so it can appear in the parse
    tree; emitted unchanged as a single instruction tuple."""
    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A Unicode property test, e.g. \\p{Lu}, possibly negated,
    case-insensitive and/or zero-width."""
    # Opcode selected by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        RegexBase.__init__(self)
        # Encoded property: (property id << 16) | value id (see dump()).
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy with the given polarity/case/zerowidth settings."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Return whether code point `ch` matches this node."""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    "The (*PRUNE) backtracking-control verb."

    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands.
        return [(OP.PRUNE, )]
class Range(RegexBase):
    "A node matching any codepoint in an inclusive range, e.g. from [a-z]."

    # Opcode selected by (case_flags, reversed); simple case-folding uses the
    # _IGN variants, full case-folding is handled in optimise() instead.
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Returns a copy of this range with different flags.
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Expands the range into a branch when full case-folding would make
        # some of its members match multi-codepoint sequences.

        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the range.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # lstrip("bu") drops any bytes/unicode prefix from the repr.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        # A range always matches exactly one codepoint.
        return 1
class RefGroup(RegexBase):
    "A back-reference to a capture group, e.g. \\1 or (?P=name)."

    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group  # May still be a name; resolved in fix_groups().
        self.position = position  # Position in the pattern, for error reports.
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolves a named group reference to its group number and checks
        that the number is valid.

        Raises error if the group is unknown or out of range.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The group may have changed from a name to a number, so rebuild the
        # cache key.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # A group reference can't survive capture removal. BUG FIX: this
        # previously passed `self.pattern`, an attribute that RefGroup never
        # sets, so callers got an AttributeError instead of the intended
        # `error`. The pattern text isn't available here, so pass None
        # (error() accepts a None pattern).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here, so be pessimistic.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    "Zero-width assertion at the starting position of the search."

    _op_name = "SEARCH_ANCHOR"
    _opcode = OP.SEARCH_ANCHOR
class Sequence(RegexBase):
    "A sequence of regex nodes matched one after another."

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        # Consecutive Character/String/Literal members that share case flags
        # are accumulated in `characters` and flushed as one String.
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # A non-character member ends the current run.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def get_firstset(self, reverse):
        # Accumulates member firstsets until one member can't be empty (its
        # firstset contains no None).
        fs = set()
        items = self.items
        if reverse:
            # NOTE(review): this reverses self.items in place (`items` is not
            # a copy) — presumably harmless because it's only used during
            # compilation, but worth confirming.
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Members are emitted in reverse order when matching backwards.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Converts the accumulated codepoints into Character/String nodes,
        # appending them to `items`, and empties the accumulator.
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every span of the folded string that matches an expanding
        # character's folded form.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folding literals (between the spans) with
        # full-folding literals (the spans themselves).
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merges overlapping or touching (start, end) spans.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Returns the offset of the first member that reports a required
        # string, together with that string (or None).
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    "Base class for the character-set nodes (union, intersection, etc.)."

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Returns a re-optimised copy of this set with different flags.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The members are emitted between the set opcode and an END marker.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Expands the set into a branch when full case-folding would make
        # some members match multi-codepoint sequences.

        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # The widest match is the longest folded expansion.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    "Set difference: matches the first member but none of the rest."

    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        members = self.items
        if len(members) > 2:
            # Fold all of the subtracted members into a single union so that
            # the difference always has exactly two operands.
            members = [members[0], SetUnion(info, members[1 : ])]

        if len(members) == 1:
            only = members[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth)
            return only.optimise(info, reverse, in_set)

        self.items = tuple(member.optimise(info, reverse, in_set=True)
          for member in members)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = (self.items[0].matches(ch) and
          not self.items[1].matches(ch))
        return m == self.positive
class SetInter(SetBase):
    "Set intersection: matches only when every member matches."

    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetInter) and member.positive:
                # A positive intersection nested in an intersection can be
                # merged into its parent.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single member stands on its own, inheriting this set's flags.
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(member.matches(ch) for member in self.items)
        return m == self.positive
class SetSymDiff(SetBase):
    "Set symmetric difference: matches when an odd number of members match."

    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for member in self.items:
            member = member.optimise(info, reverse, in_set=True)
            if isinstance(member, SetSymDiff) and member.positive:
                # A positive symmetric difference nested in another one can
                # be merged into its parent.
                flattened.extend(member.items)
            else:
                flattened.append(member)

        if len(flattened) == 1:
            # A single member stands on its own, inheriting this set's flags.
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR of the members' results: true when an odd number of them match.
        hits = sum(bool(member.matches(ch)) for member in self.items)
        return (hits % 2 == 1) == self.positive
class SetUnion(SetBase):
    "Set union: matches when any of its members matches."

    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # A member that matches anything makes the whole union match
                # anything.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties?
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            # A property and its complement together match anything.
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Group single characters by polarity so they can be emitted as one
        # CHARACTER/STRING member instead of many.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    "The (*SKIP) backtracking-control verb."

    _opcode = OP.SKIP
    _op_name = "SKIP"
class StartOfLine(ZeroWidthBase):
    "Zero-width assertion matching at the start of a line."

    _op_name = "START_OF_LINE"
    _opcode = OP.START_OF_LINE
class StartOfLineU(StartOfLine):
    "A StartOfLine variant compiled to the START_OF_LINE_U opcode."

    _op_name = "START_OF_LINE_U"
    _opcode = OP.START_OF_LINE_U
class StartOfString(ZeroWidthBase):
    "Zero-width assertion matching at the start of the string."

    _op_name = "START_OF_STRING"
    _opcode = OP.START_OF_STRING
class StartOfWord(ZeroWidthBase):
    "Zero-width assertion matching at the start of a word."

    _op_name = "START_OF_WORD"
    _opcode = OP.START_OF_WORD
class String(RegexBase):
    "A node matching a literal sequence of codepoints."

    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form of the string.
            folded_characters = [ord(c) for char in self.characters
              for c in _regex.fold_case(FULL_CASE_FOLDING, chr(char))]
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the direction of matching.
        pos = -1 if reverse else 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # The string itself is the required string, at offset 0.
        return 0, self
class Literal(String):
    "A String shown distinctly (as a literal match) when dumping."

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    """A named list reference expanded into a branch of literal alternatives
    taken from info.kwargs[name].
    """

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this named list (keyed by name and case flags), assigning
        # it the next available index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # CLEANUP: removed the unused locals `index`, `encoding`, `fold_flags`
        # and the redundant `case_flags` alias that the previous version
        # computed but never used.
        items = self.info.kwargs[self.name]

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=self.case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    "Scanner for the regular expression source string."

    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Bytes patterns are scanned through a Latin-1 view of the bytes.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False  # True when whitespace/comments are skipped.
        self.sep = string[ : 0]    # Empty str or bytes, matching the input.

    def peek(self, override_ignore=False):
        # Returns the next significant character without consuming it, or an
        # empty string at the end of the input.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            return string[pos]
        except IndexError:
            # We've reached the end of the string.
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            return string[ : 0]

    def get(self, override_ignore=False):
        # Returns the next significant character and consumes it, or an empty
        # string at the end of the input.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        # Returns up to `count` significant characters, consuming them.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True, keep_spaces=False):
        # Consumes and returns the run of characters whose membership of
        # `test_set` equals `include`.
        string = self.string
        pos = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        # Like get_while, but discards the characters instead of returning
        # them.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        # Consumes `substring` if it appears next, returning whether it did.
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                # Compare character by character, skipping whitespace and
                # comments before each one.
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        # Consumes `substring`, raising error if it isn't next.
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        # Returns whether only whitespace/comments (if ignored) remain.
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): `kwargs` has a mutable default; it's only stored and
        # read here, never mutated — presumably safe, but worth confirming.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0        # Number of capture groups so far.
        self.group_index = {}       # Group name -> group number.
        self.group_name = {}        # Group number -> group name.
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> list index.
        self.open_groups = []       # Stack of currently open groups.
        self.open_group_count = {}  # Group number -> times opened.
        self.defined_groups = {}
        self.group_calls = []       # (call, reverse, fuzzy) tuples.
        self.private_groups = {}    # Negative alias -> real group number.

    def open_group(self, name=None):
        # Opens a (possibly named) group and returns its group number, or a
        # private negative alias for a nested reuse of the same group.
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Sets info.call_refs and info.additional_groups as side effects.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Returns (offset, characters, case_flags) describing the required
    string of a parsed pattern, or (0, (), 0) if there isn't one.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    # Mark the node so that compilation flags it as required.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
class Scanner:
    """A simple lexer built on the regex engine.

    The lexicon is a sequence of (phrase, action) pairs; all phrases are
    compiled into a single alternation, each wrapped in its own group so
    that the matching phrase can be identified.
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Parse every phrase, stripping capture groups (they aren't
        # allowed within individual phrases).
        subpatterns = []
        for phrase, _action in lexicon:
            src = Source(phrase)
            phrase_info = Info(flags, src.char_type)
            src.ignore_space = bool(phrase_info.flags & VERBOSE)
            parsed = _parse_pattern(src, phrase_info)
            if not src.at_end():
                raise error("unbalanced parenthesis", src.string,
                  src.pos)

            subpatterns.append(parsed.remove_captures())

        # Wrap each phrase in its own numbered group and branch them
        # together into one compound pattern.
        info = Info(flags)
        groups = [Group(info, number, sub)
          for number, sub in enumerate(subpatterns, 1)]
        parsed = Branch(groups)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse).pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Group calls (recursion) aren't supported by the Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              src.string, src.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile to a list of tuples, then flatten into a list of ints.
        code = _flatten_code(parsed.compile(reverse) + [(OP.SUCCESS, )])

        if not parsed.has_simple_start():
            # Prefix the code with the first set, when one can be built.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                code = _flatten_code(fs_code) + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation but aren't
        # needed by the PatternObject itself; conversely, global flags like
        # LOCALE don't affect the code generation but are needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(groups))

    def scan(self, string):
        """Scans the string, returning (tokens, remainder) where remainder is
        the unmatched tail of the string."""
        tokens = []
        match = self.scanner.scanner(string).match
        pos = 0
        while True:
            m = match()
            if not m:
                break
            end = m.end()
            if pos == end:
                # Zero-width match: no progress was made, so stop.
                break
            action = self.lexicon[m.lastindex - 1][1]
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                tokens.append(action)
            pos = end

        return tokens, string[pos : ]
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict, keeping the longest name seen
# for each property id and each value id (ties keep the earlier name).
PROPERTY_NAMES = {}
for full_name, (ident, value_map) in PROPERTIES.items():
    best_name, inverse_values = PROPERTY_NAMES.get(ident, ("", {}))
    PROPERTY_NAMES[ident] = max(best_name, full_name, key=len), inverse_values

    for value_name, value_id in value_map.items():
        inverse_values[value_id] = max(inverse_values.get(value_id, ""),
          value_name, key=len)
# Character escape sequences: each escape letter mapped to the control
# character it denotes.
CHARACTER_ESCAPES = dict(zip("abfnrtv", "\a\b\f\n\r\t\v"))
# Encoding selectors used by the escape/assertion tables below.
ASCII_ENCODING, UNICODE_ENCODING = 1, 2
# Predefined character set escape sequences.
CHARSET_ESCAPES = {
    letter: lookup_property(None, prop, positive)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("h", "Blank", True),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
}

# The same sets with an explicit ASCII encoding ("h" has no encoded form).
ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES, **{
    letter: lookup_property(None, prop, positive, encoding=ASCII_ENCODING)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
})

# The same sets with an explicit Unicode encoding.
UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES, **{
    letter: lookup_property(None, prop, positive, encoding=UNICODE_ENCODING)
    for letter, prop, positive in (
        ("d", "Digit", True),
        ("D", "Digit", False),
        ("s", "Space", True),
        ("S", "Space", False),
        ("w", "Word", True),
        ("W", "Word", False),
    )
})
# Positional escape sequences.
POSITION_ESCAPES = dict(
    A=StartOfString(),
    b=Boundary(),
    B=Boundary(False),
    K=Keep(),
    m=StartOfWord(),
    M=EndOfWord(),
    Z=EndOfString(),
)

# ASCII-encoded versions of the word-related assertions.
ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES,
    b=Boundary(encoding=ASCII_ENCODING),
    B=Boundary(False, encoding=ASCII_ENCODING),
    m=StartOfWord(encoding=ASCII_ENCODING),
    M=EndOfWord(encoding=ASCII_ENCODING),
)

# Unicode-encoded versions of the word-related assertions.
UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES,
    b=Boundary(encoding=UNICODE_ENCODING),
    B=Boundary(False, encoding=UNICODE_ENCODING),
    m=StartOfWord(encoding=UNICODE_ENCODING),
    M=EndOfWord(encoding=UNICODE_ENCODING),
)

# Positional escape sequences used when the WORD flag is set.
WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES,
    b=DefaultBoundary(),
    B=DefaultBoundary(False),
    m=DefaultStartOfWord(),
    M=DefaultEndOfWord(),
)
# Regex control verbs ("FAIL" and "F" are synonyms; each entry gets its
# own instance).
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)