Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/regex-2023.3.23-py3.8-linux-x86_64.egg/regex/_regex_core.py: 87%
2773 statements
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-26 06:34 +0000
« prev ^ index » next coverage.py v7.2.2, created at 2023-03-26 06:34 +0000
1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
16import enum
17import string
18import unicodedata
19from collections import defaultdict
21import regex._regex as _regex
# Names re-exported by the top-level regex module.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be a str or bytes; pick a matching newline.
        nl = "\n" if isinstance(pattern, str) else b"\n"
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        if pattern is not None and pos is not None:
            # Translate the flat position into line/column coordinates.
            self.lineno = pattern.count(nl, 0, pos) + 1
            self.colno = pos - pattern.rfind(nl, 0, pos)

            message = f"{message} at position {pos}"

            if nl in pattern:
                message = f"{message} (line {self.lineno}, column {self.colno})"

        Exception.__init__(self, message)
59# The exception for when a positional flag has been turned on in the old
60# behaviour.
61class _UnscopedFlagSet(Exception):
62 pass
# Raised when parsing fails and another interpretation should be tried.
class ParseError(Exception):
    pass
68# The exception for when there isn't a valid first set.
69class _FirstSetError(Exception):
70 pass
# Flags.
class RegexFlag(enum.IntFlag):
    A = ASCII = 0x80          # Assume ASCII locale.
    B = BESTMATCH = 0x1000    # Best fuzzy match.
    D = DEBUG = 0x200         # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the
                              # first fuzzy match.
    F = FULLCASE = 0x4000     # Unicode full case-folding.
    I = IGNORECASE = 0x2      # Ignore case.
    L = LOCALE = 0x4          # Assume current 8-bit locale.
    M = MULTILINE = 0x8       # Make anchors look for newline.
    P = POSIX = 0x10000       # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400       # Search backwards.
    S = DOTALL = 0x10         # Make dot match newline.
    U = UNICODE = 0x20        # Assume Unicode locale.
    V0 = VERSION0 = 0x2000    # Old legacy behaviour.
    V1 = VERSION1 = 0x100     # New enhanced behaviour.
    W = WORD = 0x800          # Default Unicode word breaks.
    X = VERBOSE = 0x40        # Ignore whitespace and comments.
    T = TEMPLATE = 0x1        # Template (present because re module has it).

    def __repr__(self):
        # A named flag prints as a plain attribute access.
        if self._name_ is not None:
            return "regex." + self._name_

        value = self._value_
        inverted = value < 0

        if inverted:
            value = ~value

        # Decompose the value into known members, in definition order.
        parts = []
        for member in self.__class__:
            if value & member._value_:
                value &= ~member._value_
                parts.append("regex." + member._name_)

        if value:
            # Any unnamed residue is shown in hex.
            parts.append(hex(value))

        text = "|".join(parts)

        if inverted:
            text = "~(%s)" % text if len(parts) > 1 else "~%s" % text

        return text

    __str__ = object.__str__
# Publish the flag members (A, ASCII, I, IGNORECASE, ...) at module level.
globals().update(RegexFlag.__members__)

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The masks for the flags: global flags apply to the whole pattern; scoped
# flags can be turned on and off within it.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# "" is included so that end-of-pattern takes the special-character path.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The inline regular expression flag characters.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
# The names of the opcodes, one per line; the engine identifies each opcode
# by its index in this list (see the OP namespace built below).
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
# A bare attribute container; the opcodes are defined in one of these.
class Namespace:
    pass
# Expose each opcode as OP.<NAME> = <index in OPCODES>.
OP = Namespace()
vars(OP).update({name: code for code, name in enumerate(OPCODES.split())})
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
290 number_to_toss = max_length // divisor + overage
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
def _fold_case(info, string):
    "Folds the case of a string."
    flags = info.flags
    # Without an explicit encoding flag, fall back to the guessed one.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
def is_cased_i(info, char):
    "Checks whether a character is cased."
    # A character with more than one case form counts as cased.
    cases = _regex.get_all_cases(info.flags, char)
    return len(cases) > 1
def is_cased_f(flags, char):
    "Checks whether a character is cased."
    # Same as is_cased_i, but takes the flags directly.
    cases = _regex.get_all_cases(flags, char)
    return len(cases) > 1
def _compile_firstset(info, fs):
    "Compiles the firstset for the pattern."
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    if not checked:
        # No usable firstset.
        return []

    # Compile the verified firstset.
    return checked.compile(reverse)
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    if not fs or None in fs:
        return None

    # For simplicity, no firstset is built when case is fully ignored.
    members = set()
    case_flags = NOCASE
    for item in fs:
        if isinstance(item, Character) and not item.positive:
            return None

        case_flags |= item.case_flags
        members.add(item.with_flags(case_flags=NOCASE))

    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    firstset = SetUnion(info, list(members), case_flags=case_flags &
      ~FULLCASE, zerowidth=True)
    firstset = firstset.optimise(info, reverse, in_set=True)

    return firstset
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
382 return flat_code
def make_case_flags(info):
    "Makes the case flags."
    flags = info.flags & CASE_FLAGS

    # FULLCASE is meaningless under ASCII, so drop it.
    if info.flags & ASCII:
        flags &= ~FULLCASE

    return flags
def make_character(info, value, in_set=False):
    "Makes a character literal."
    if in_set:
        # A character set is built case-sensitively.
        return Character(value)

    return Character(value, case_flags=make_case_flags(info))
def make_ref_group(info, name, position):
    "Makes a group reference."
    case_flags = make_case_flags(info)
    return RefGroup(info, name, position, case_flags=case_flags)
def make_string_set(info, name):
    "Makes a string set."
    case_flags = make_case_flags(info)
    return StringSet(info, name, case_flags=case_flags)
def make_property(info, prop, in_set):
    "Makes a property."
    if in_set:
        # Properties inside a set keep their own casing.
        return prop

    return prop.with_flags(case_flags=make_case_flags(info))
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single branch needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # None is used as a placeholder so quantifiers can detect "nothing to
    # repeat"; placeholders are stripped before returning.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        start = source.pos
        ch = source.get()
        if ch not in SPECIAL_CHARS:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))
            continue

        if ch in ")|":
            # The end of a sequence. At the end of the pattern ch is "".
            source.pos = start
            break
        elif ch == "\\":
            # An escape sequence outside a set.
            sequence.append(parse_escape(source, info, False))
        elif ch == "(":
            # A parenthesised subpattern or a flag.
            element = parse_paren(source, info)
            if element is None:
                # Inline flags were parsed; refresh the case flags.
                case_flags = make_case_flags(info)
            else:
                sequence.append(element)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                sequence.append(AnyAll())
            elif info.flags & WORD:
                sequence.append(AnyU())
            else:
                sequence.append(Any())
        elif ch == "[":
            # A character set.
            sequence.append(parse_set(source, info))
        elif ch == "^":
            # The start of a line or the string.
            if info.flags & MULTILINE:
                sequence.append(StartOfLineU() if info.flags & WORD else
                  StartOfLine())
            else:
                sequence.append(StartOfString())
        elif ch == "$":
            # The end of a line or the string.
            if info.flags & MULTILINE:
                sequence.append(EndOfLineU() if info.flags & WORD else
                  EndOfLine())
            else:
                sequence.append(EndOfStringLineU() if info.flags & WORD else
                  EndOfStringLine())
        elif ch in "?*+{":
            # Looks like a quantifier.
            counts = parse_quantifier(source, info, ch)
            if counts:
                # It _is_ a quantifier.
                apply_quantifier(source, info, counts, case_flags, ch, start,
                  sequence)
                sequence.append(None)
            else:
                # Not a quantifier. Maybe it's a fuzzy constraint.
                constraints = parse_fuzzy(source, info, ch, case_flags)
                if constraints:
                    # It _is_ a fuzzy constraint.
                    apply_constraint(source, info, constraints, case_flags,
                      start, sequence)
                    sequence.append(None)
                else:
                    # The element was just a literal after all.
                    sequence.append(Character(ord(ch),
                      case_flags=case_flags))
        else:
            # A literal (eg. "#" outside verbose mode).
            sequence.append(Character(ord(ch), case_flags=case_flags))

    return Sequence([item for item in sequence if item is not None])
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    "Applies a quantifier to the preceding element of a sequence."
    element = sequence.pop()
    if element is None:
        # The placeholder means there was no preceding element.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    start = source.pos
    suffix = source.get()
    if suffix == "?":
        # The "?" suffix means it's a lazy repeat.
        repeat_class = LazyRepeat
    elif suffix == "+":
        # The "+" suffix means it's a possessive repeat.
        repeat_class = PossessiveRepeat
    else:
        # No suffix means it's a greedy repeat.
        source.pos = start
        repeat_class = GreedyRepeat

    # Skip the quantifier if it wraps a zero-width item or repeats exactly
    # once.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeat_class(element, min_count, max_count)

    sequence.append(element)
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    "Applies a fuzzy constraint to the preceding element of a sequence."
    element = sequence.pop()
    if element is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # A group marked as fuzzy absorbs the fuzziness into its subpattern.
    if isinstance(element, Group):
        element.subpattern = Fuzzy(element.subpattern, constraints)
        sequence.append(element)
    else:
        sequence.append(Fuzzy(element, constraints))
557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
def parse_quantifier(source, info, ch):
    "Parses a quantifier."
    counts = _QUANTIFIERS.get(ch)
    if counts:
        # One of "?", "*" or "+".
        return counts

    if ch == "{":
        # Could be a bounded repeat, eg. 'a{2,3}'.
        counts = parse_limited_quantifier(source)
        if counts:
            return counts

    return None
def is_above_limit(count):
    "Checks whether a count is above the maximum."
    # None means "unlimited" in the (min, max) sense and is never above.
    if count is None:
        return False
    return count >= UNLIMITED
def parse_limited_quantifier(source):
    "Parses a limited quantifier, eg. '{2,3}' (the '{' is already consumed)."
    start = source.pos
    min_count = parse_count(source)
    if source.match(","):
        max_count = parse_count(source)

        # An empty minimum means 0; an empty maximum means unlimited.
        min_count = int(min_count or 0)
        max_count = int(max_count) if max_count else None
    else:
        if not min_count:
            # Not a quantifier at all; back out.
            source.pos = start
            return None

        min_count = max_count = int(min_count)

    if not source.match("}"):
        source.pos = start
        return None

    if is_above_limit(min_count) or is_above_limit(max_count):
        raise error("repeat count too big", source.string, start)

    if max_count is not None and min_count > max_count:
        raise error("min repeat greater than max repeat", source.string,
          start)

    return min_count, max_count
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    start = source.pos

    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all.
        source.pos = start
        return None

    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    start = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a plain constraint; retry as a cost equation.
        source.pos = start

        parse_cost_equation(source, constraints)
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint."
    start = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        upper_inclusive = parse_fuzzy_compare(source)

        if upper_inclusive is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            upper = parse_cost_limit(source)

            # An exclusive limit is one less than the inclusive one.
            if not upper_inclusive:
                upper -= 1

            if upper < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, upper
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = start

        # Minimum cost.
        cost_pos = source.pos
        lower = parse_cost_limit(source)

        lower_inclusive = parse_fuzzy_compare(source)
        if lower_inclusive is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        upper_inclusive = parse_fuzzy_compare(source)
        if upper_inclusive is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        upper = parse_cost_limit(source)

        # Turn exclusive limits into inclusive ones.
        if not lower_inclusive:
            lower += 1
        if not upper_inclusive:
            upper -= 1

        if not 0 <= lower <= upper:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = lower, upper
    else:
        raise ParseError()
def parse_cost_limit(source):
    "Parses a cost limit."
    start = source.pos
    digits = parse_count(source)

    try:
        return int(digits)
    except ValueError:
        # Fall through to the error below (empty or non-numeric).
        pass

    raise error("bad fuzzy cost limit", source.string, start)
def parse_constraint(source, constraints, ch):
    "Parses a constraint."
    # Only the constraint kinds "d", "e", "i" and "s" are valid, once each.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
def parse_fuzzy_compare(source):
    "Parses a cost comparator; True if inclusive, False if exclusive."
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
def parse_cost_equation(source, constraints):
    "Parses a cost equation."
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    equation = {}

    parse_cost_term(source, equation)
    while source.match("+"):
        parse_cost_term(source, equation)

    inclusive = parse_fuzzy_compare(source)
    if inclusive is None:
        raise ParseError()

    limit = int(parse_count(source))

    # An exclusive limit is one less than the inclusive one.
    if not inclusive:
        limit -= 1

    if limit < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    equation["max"] = limit

    constraints["cost"] = equation
def parse_cost_term(source, cost):
    "Parses a cost equation term."
    coefficient = parse_count(source)
    ch = source.get()
    if ch not in "dis":
        raise ParseError()

    if ch in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[ch] = int(coefficient or 1)
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    start = source.pos
    ch = source.get()
    if ch not in SPECIAL_CHARS:
        # A literal. ("" is in SPECIAL_CHARS, so ch is non-empty here.)
        return Character(ord(ch), case_flags=case_flags)

    if ch == "\\":
        # An escape sequence outside a set.
        return parse_escape(source, info, False)
    if ch == ".":
        # Any character.
        if info.flags & DOTALL:
            return AnyAll()
        if info.flags & WORD:
            return AnyU()
        return Any()
    if ch == "[":
        # A character set.
        return parse_set(source, info)

    raise error("expected character set", source.string, start)
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits at the position.
    return source.get_while(DIGITS)
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    start = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        start_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            start_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = start_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, start_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, start_2)

        # (?...: probably a flags subpattern.
        source.pos = start_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        start_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, start_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = start
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
def parse_extension(source, info):
    "Parses a Python extension."
    start = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              start)

        return make_ref_group(info, name, start)
    if ch in (">", "&"):
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, start)

    source.pos = start
    raise error("unknown extension", source.string, start)
def parse_comment(source):
    "Parses a comment, consuming everything up to the closing ')'."
    while True:
        start = source.pos
        ch = source.get(True)

        if not ch or ch == ")":
            break

        if ch == "\\":
            # A backslash escapes the next character, even ")".
            ch = source.get(True)

    # Back up so that expect() consumes the ")" (or reports its absence).
    source.pos = start
    source.expect(")")

    return None
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the lookaround are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
def parse_conditional(source, info):
    "Parses a conditional subpattern."
    saved_flags = info.flags
    start = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False,
              ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True,
                  ch == "=")

        source.pos = start
        raise error("expected lookaround conditional", source.string,
          source.pos)

    # (?(group)...: a group-existence conditional.
    source.pos = start
    try:
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            no_branch = Sequence()

        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, start)
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional subpattern."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags set inside the atomic group are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
def parse_common(source, info):
    "Parses a common groups branch."
    # Capture groups in different branches can reuse the same group numbers.
    initial_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_count = info.group_count
    while source.match("|"):
        info.group_count = initial_count
        branches.append(parse_sequence(source, info))
        final_count = max(final_count, info.group_count)

    info.group_count = final_count
    source.expect(")")

    return branches[0] if len(branches) == 1 else Branch(branches)
def parse_call_group(source, info, ch, pos):
    "Parses a call to a group."
    if ch == "R":
        # (?R) calls group 0, ie. the whole pattern.
        group = "0"
    else:
        group = ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    group = parse_name(source)
    source.expect(")")

    return CallGroup(info, group, pos)
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    try:
        while True:
            start = source.pos
            ch = source.get()
            if ch == "V":
                # Version flags are two characters, eg. "V0" or "V1".
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character; back up to just before it.
        source.pos = start

    return flags
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
def parse_subpattern(source, info, flags_on, flags_off):
    "Parses a subpattern with scoped flags."
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off
    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the flags that were in effect outside this scope.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # From here on only scoped flags remain.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A subpattern with scoped flags.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Inline positional flags.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
def parse_positional_flags(source, info, flags_on, flags_off):
    "Parses positional flags."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
    if version == VERSION0:
        # Positional flags are global and can only be turned on.
        if flags_off:
            raise error("bad inline flags: cannot turn flags off",
              source.string, source.pos)

        new_global_flags = flags_on & ~info.global_flags
        if new_global_flags:
            info.global_flags |= new_global_flags

            # A global has been turned on, so reparse the pattern.
            raise _UnscopedFlagSet(info.global_flags)
    else:
        # Under VERSION1, positional flags apply from here to the end of the
        # enclosing scope.
        info.flags = (info.flags | flags_on) & ~flags_off

    source.ignore_space = bool(info.flags & VERBOSE)
def parse_name(source, allow_numeric=False, allow_group_0=False):
    "Parses a group name (or, optionally, a numeric group reference)."
    name = source.get_while(set(")>"), include=False)

    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric reference is accepted only where explicitly allowed.
        min_group = 0 if allow_group_0 else 1
        if not allow_numeric or int(name) < min_group:
            raise error("bad character in group name", source.string,
              source.pos)
    elif not name.isidentifier():
        raise error("bad character in group name", source.string, source.pos)

    return name
def is_octal(string):
    "Checks whether a string consists entirely of octal digits."
    for ch in string:
        if ch not in OCT_DIGITS:
            return False

    return True
def is_decimal(string):
    "Checks whether a string consists entirely of decimal digits."
    for ch in string:
        if ch not in DIGITS:
            return False

    return True
def is_hexadecimal(string):
    "Checks whether a string consists entirely of hexadecimal digits."
    for ch in string:
        if ch not in HEX_DIGITS:
            return False

    return True
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has already been consumed).

    in_set is True when parsing inside a character set, where group
    references, anchors and positional escapes aren't permitted.
    """
    # Read the escaped character verbatim, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            # The WORD flag changes which positional escapes are available.
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # A character-set escape such as \d or \w.
        value = CHARSET_ESCAPES.get(ch)
        if value:
            return value

        # A single-character escape such as \n or \t.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence: either an octal escape or a group
    reference (group references aren't allowed inside a set or after \\0).
    """
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            if encoding == ASCII or encoding == LOCALE:
                # Byte-oriented behaviours wrap the value to a single byte.
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    # Push back the last character read; it isn't part of the reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits; 'digits' holds the
    digit(s) already consumed.
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first character that wasn't an octal digit.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # int() can fail only because of the caller-supplied leading digit(s);
        # the loop above accepts octal digits only.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    "Parses a hex escape sequence with a fixed number of hex digits."
    start_pos = source.pos
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, start_pos)
        digits.append(ch)

    text = "".join(digits)
    try:
        codepoint = int(text, 16)
    except ValueError:
        pass
    else:
        # Only codepoints within the Unicode range are acceptable.
        if codepoint < 0x110000:
            return make_character(info, codepoint, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, text), source.string,
      start_pos)
def parse_group_ref(source, info):
    "Parses a group reference of the form \"<name>\"."
    source.expect("<")
    name_pos = source.pos
    name = parse_name(source, True)
    source.expect(">")
    if info.is_open_group(name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, name, name_pos)
def parse_string_set(source, info):
    "Parses a string set reference, \"\\L<name>\"."
    source.expect("<")
    name = parse_name(source, True)
    source.expect(">")
    known = name is not None and name in info.kwargs
    if not known:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, name)
def parse_named_char(source, info, in_set):
    "Parses a named character, \"\\N{...}\"."
    start = source.pos
    if source.match("{"):
        name = source.get_while(NAMED_CHAR_PART)
        if source.match("}"):
            try:
                char = unicodedata.lookup(name)
                return make_character(info, ord(char), in_set)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character, so treat it as a literal "N".
    source.pos = start
    return make_character(info, ord("N"), in_set)
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, \\p{...} or \\P{...}, or an abbreviated form
    such as \\pL. 'positive' is False for \\P.
    """
    saved_pos = source.pos
    ch = source.get()
    if ch == "{":
        # A leading "^" inside the braces also negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source)
        return make_property(info, prop, in_set)

    # Not a property (note: an unclosed "{...}" also falls through to here),
    # so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
def parse_property_name(source):
    """Parses a property name, which may be qualified, returning a
    (property_name, value_name) pair; property_name is None if the name is
    unqualified.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Push back anything read past the end of the name.
    source.pos = saved_pos
    return prop_name, name
def parse_set(source, info):
    "Parses a character set (the \"[\" has already been consumed)."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    saved_ignore = source.ignore_space
    source.ignore_space = False
    # A leading "^" negates the whole set.
    negate = source.match("^")
    # VERSION1 supports set operators; VERSION0 has implicit union only.
    parse_top = parse_set_imp_union if version == VERSION0 else parse_set_union
    try:
        item = parse_top(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    return item.with_flags(case_flags=make_case_flags(info))
def parse_set_union(source, info):
    "Parses a set union ([x||y])."
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
def parse_set_symm_diff(source, info):
    "Parses a set symmetric difference ([x~~y])."
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
def parse_set_inter(source, info):
    "Parses a set intersection ([x&&y])."
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    return operands[0] if len(operands) == 1 else SetInter(info, operands)
def parse_set_diff(source, info):
    "Parses a set difference ([x--y])."
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
def parse_set_imp_union(source, info):
    "Parses a set implicit union ([xy])."
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    members = [parse_set_member(source, info)]
    while True:
        here = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = here
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators, which terminate an implicit
            # union.
            source.pos = here
            break

        members.append(parse_set_member(source, info))

    return members[0] if len(members) == 1 else SetUnion(info, members)
def parse_set_member(source, info):
    """Parses a member in a character set: either a single item or a range
    such as "a-z".
    """
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A degenerate range of a single character.
        return start

    return Range(start.value, end.value)
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a nested
    set (VERSION1 only), or a literal character.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))
def parse_posix_class(source, info):
    "Parses a POSIX character class, \"[:name:]\"."
    negate = source.match("^")
    prop_name, name = parse_property_name(source)
    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not delimited correctly; the caller falls back to a literal parse.
    raise ParseError()
def float_to_rational(flt):
    "Converts a float to a rational pair (numerator, denominator)."
    whole = int(flt)
    remainder = flt - whole
    if abs(remainder) < 0.0001:
        # Close enough to an integer.
        return whole, 1

    # Continued-fraction step: recurse on the reciprocal of the remainder.
    den, num = float_to_rational(1.0 / remainder)

    return whole * den + num, den
def numeric_to_rational(numeric):
    "Converts a numeric string to a rational string, if possible."
    if numeric.startswith("-"):
        sign, numeric = "-", numeric[1 : ]
    else:
        sign = ""

    parts = numeric.split("/")
    if len(parts) == 2:
        # An explicit fraction.
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    elif len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)
    if result.endswith("/1"):
        # An integral value; drop the denominator.
        result = result[ : -2]

    return result
def standardise_name(name):
    "Standardises a property or value name."
    flat = "".join(name)
    try:
        return numeric_to_rational(flat)
    except (ValueError, ZeroDivisionError):
        # Not numeric: strip spacing/punctuation and uppercase it.
        return "".join(ch for ch in flat if ch not in "_- ").upper()
# POSIX classes that have a distinct "POSIX..." property variant when looked
# up via [[:name:]] (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The full value set of a binary ("yes/no") property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
def lookup_property(property, value, positive, source=None, posix=False):
    """Looks up a property, returning a Property node.

    'property' may be None for an unqualified name, in which case 'value' is
    tried as a GC/script/block value, a binary property, or an "Is..."/"In..."
    prefixed name. 'posix' selects the POSIX variants of some classes.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        # "Assigned" is the complement of "Unassigned".
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        # Use the POSIX variant of the class.
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        # Property ids are packed as (property << 16) | value.
        return Property((prop_id << 16) | val_id, positive)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            # A binary property; value 1 means "yes".
            return Property((prop_id << 16) | 1, positive)

        # A non-binary property named without a value matches "not the
        # default value" (value 0), hence the inverted 'positive'.
        return Property(prop_id << 16, not positive)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            val_id = value_dict.get(value[2 : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
def _compile_replacement(source, pattern, is_unicode):
    """Compiles a replacement template escape sequence.

    Returns (is_group, codes): if is_group is True, codes holds a group index;
    otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    if isinstance(source.sep, bytes):
        # A bytes template wraps octal values to a single byte.
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
def parse_repl_hex_escape(source, expected_len, type):
    "Parses a hex escape sequence in a replacement string."
    digits = []
    while len(digits) < expected_len:
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, source.pos)
        digits.append(ch)

    return int("".join(digits), 16)
def parse_repl_named_char(source):
    "Parses a named character, \"\\N{...}\", in a replacement string."
    start = source.pos
    if source.match("{"):
        name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                found = unicodedata.lookup(name)
                return ord(found)
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    # Not a well-formed named character; backtrack.
    source.pos = start
    return None
def compile_repl_group(source, pattern):
    "Compiles a replacement template group reference, returning its index."
    source.expect("<")
    name = parse_name(source, True, True)

    source.expect(">")
    if name.isdigit():
        # A numeric group reference.
        index = int(name)
        if index < 0 or index > pattern.groups:
            raise error("invalid group reference", source.string, source.pos)

        return index

    # A named group reference.
    try:
        return pattern.groupindex[name]
    except KeyError:
        raise IndexError("unknown group")
# The regular expression is parsed into a syntax tree. The different types of
# node are defined below.

# Indentation unit used by the node dump() methods.
INDENT = " "
# Bit flags attached to opcodes in the compiled pattern.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10

# Text fragments used when dumping nodes.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
def make_sequence(items):
    "Wraps a list of nodes in a Sequence, unless it's a single node."
    if len(items) == 1:
        return items[0]

    return Sequence(items)
# Common base class for all nodes.
class RegexBase:
    """Base class for all parse-tree nodes.

    The methods below are defaults; subclasses override the ones that apply
    to them. Equality and hashing are based on _key, which subclasses extend
    with their distinguishing attributes.
    """
    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns a copy of this node with the given flags applied, or self
        if nothing would change. Reads self.positive/case_flags/zerowidth,
        which the flag-bearing subclasses define.
        """
        if positive is None:
            positive = self.positive
        else:
            positive = bool(positive)
        if case_flags is None:
            case_flags = self.case_flags
        else:
            case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]
        if zerowidth is None:
            zerowidth = self.zerowidth
        else:
            zerowidth = bool(zerowidth)

        if (positive == self.positive and case_flags == self.case_flags and
          zerowidth == self.zerowidth):
            return self

        return self.rebuild(positive, case_flags, zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        # Fixes up group references; nothing to do by default.
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        # By default the first-set can't be determined.
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        # Returns (max offset, required string); None means no required
        # string is known for this node.
        return self.max_width(), None
1922# Base class for zero-width nodes.
# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for nodes that match at a position without consuming any
    characters.
    """
    def __init__(self, positive=True):
        RegexBase.__init__(self)
        self.positive = bool(positive)

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # Zero-width, so it doesn't constrain the first character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = POSITIVE_OP if self.positive else 0
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP
        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive]))

    def max_width(self):
        return 0
class Any(RegexBase):
    "The \".\" node."
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        return 1
# \".\" variant matching any character at all — presumably used under DOTALL;
# confirm against the parser.
class AnyAll(Any):
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"
# \".\" variant with Unicode line-ending semantics (NOTE(review): exact
# newline handling is defined by the ANY_U opcode — confirm there).
class AnyU(Any):
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
class Atomic(RegexBase):
    """An atomic group, \"(?>...)\". Most methods simply delegate to the
    wrapped subpattern.
    """
    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An empty subpattern needs no atomic wrapper.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        # Wrap the compiled subpattern in ATOMIC ... END.
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
# The word-boundary assertion.
class Boundary(ZeroWidthBase):
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2037class Branch(RegexBase):
    def __init__(self, branches):
        # A node matching any one of the alternative subpatterns.
        RegexBase.__init__(self)
        self.branches = branches
2042 def fix_groups(self, pattern, reverse, fuzzy):
2043 for b in self.branches:
2044 b.fix_groups(pattern, reverse, fuzzy)
    def optimise(self, info, reverse):
        """Optimises the branch node: flattens nested branches, hoists any
        common prefix/suffix, and tries to reduce the branches to a set.
        """
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    # The precheck goes on the side the match starts from.
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
    def _add_precheck(self, info, reverse, branches):
        """Builds a quick first-character precheck for the branches, or
        returns None if one can't be built (any branch that isn't a
        case-sensitive Literal rules it out).
        """
        charset = set()
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2096 def pack_characters(self, info):
2097 self.branches = [b.pack_characters(info) for b in self.branches]
2098 return self
2100 def remove_captures(self):
2101 self.branches = [b.remove_captures() for b in self.branches]
2102 return self
2104 def is_atomic(self):
2105 return all(b.is_atomic() for b in self.branches)
2107 def can_be_affix(self):
2108 return all(b.can_be_affix() for b in self.branches)
2110 def contains_group(self):
2111 return any(b.contains_group() for b in self.branches)
2113 def get_firstset(self, reverse):
2114 fs = set()
2115 for b in self.branches:
2116 fs |= b.get_firstset(reverse)
2118 return fs or set([None])
2120 def _compile(self, reverse, fuzzy):
2121 code = [(OP.BRANCH, )]
2122 for b in self.branches:
2123 code.extend(b.compile(reverse, fuzzy))
2124 code.append((OP.NEXT, ))
2126 code[-1] = (OP.END, )
2128 return code
2130 def dump(self, indent, reverse):
2131 print("{}BRANCH".format(INDENT * indent))
2132 self.branches[0].dump(indent + 1, reverse)
2133 for b in self.branches[1 : ]:
2134 print("{}OR".format(INDENT * indent))
2135 b.dump(indent + 1, reverse)
2137 @staticmethod
2138 def _flatten_branches(info, reverse, branches):
2139 # Flatten the branches so that there aren't branches of branches.
2140 new_branches = []
2141 for b in branches:
2142 b = b.optimise(info, reverse)
2143 if isinstance(b, Branch):
2144 new_branches.extend(b.branches)
2145 else:
2146 new_branches.append(b)
2148 return new_branches
    @staticmethod
    def _split_common_prefix(info, branches):
        """Splits the longest common prefix out of the branches, returning
        (prefix_items, new_branches).
        """
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
    @staticmethod
    def _split_common_suffix(info, branches):
        """Splits the longest common suffix out of the branches, returning
        (suffix_items, new_branches).
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? (Scanned from the end using
        # negative indices.)
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
    @staticmethod
    def _can_split(items, count):
        """Checks whether 'items' can be split at index 'count' without
        breaking a character sequence that takes part in full case-folding.
        """
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
    @staticmethod
    def _can_split_rev(items, count):
        """Like _can_split, but for a split 'count' items from the end of
        'items' (used when splitting a common suffix).
        """
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2285 @staticmethod
2286 def _merge_common_prefixes(info, reverse, branches):
2287 # Branches with the same case-sensitive character prefix can be grouped
2288 # together if they are separated only by other branches with a
2289 # character prefix.
2290 prefixed = defaultdict(list)
2291 order = {}
2292 new_branches = []
2293 for b in branches:
2294 if Branch._is_simple_character(b):
2295 # Branch starts with a simple character.
2296 prefixed[b.value].append([b])
2297 order.setdefault(b.value, len(order))
2298 elif (isinstance(b, Sequence) and b.items and
2299 Branch._is_simple_character(b.items[0])):
2300 # Branch starts with a simple character.
2301 prefixed[b.items[0].value].append(b.items)
2302 order.setdefault(b.items[0].value, len(order))
2303 else:
2304 Branch._flush_char_prefix(info, reverse, prefixed, order,
2305 new_branches)
2307 new_branches.append(b)
2309 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2311 return new_branches
2313 @staticmethod
2314 def _is_simple_character(c):
2315 return isinstance(c, Character) and c.positive and not c.case_flags
    @staticmethod
    def _reduce_to_set(info, reverse, branches):
        """Merge runs of single-character branches into set unions.

        Consecutive Character/Property/set branches with the same case
        flags are pooled and flushed as one set member list; any other
        branch (or a change of case flags) flushes the pool first.
        Returns the new list of branches.
        """
        # Can the branches be reduced to a set?
        new_branches = []
        items = set()
        case_flags = NOCASE
        for b in branches:
            if isinstance(b, (Character, Property, SetBase)):
                # Branch starts with a single character.
                if b.case_flags != case_flags:
                    # Different case sensitivity, so flush.
                    Branch._flush_set_members(info, reverse, items, case_flags,
                      new_branches)

                    case_flags = b.case_flags

                # Pool the member case-insensitively; the current flags are
                # re-applied when the pool is flushed.
                items.add(b.with_flags(case_flags=NOCASE))
            else:
                Branch._flush_set_members(info, reverse, items, case_flags,
                  new_branches)

                new_branches.append(b)

        # Flush whatever is left in the pool.
        Branch._flush_set_members(info, reverse, items, case_flags,
          new_branches)

        return new_branches
2345 @staticmethod
2346 def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
2347 # Flush the prefixed branches.
2348 if not prefixed:
2349 return
2351 for value, branches in sorted(prefixed.items(), key=lambda pair:
2352 order[pair[0]]):
2353 if len(branches) == 1:
2354 new_branches.append(make_sequence(branches[0]))
2355 else:
2356 subbranches = []
2357 optional = False
2358 for b in branches:
2359 if len(b) > 1:
2360 subbranches.append(make_sequence(b[1 : ]))
2361 elif not optional:
2362 subbranches.append(Sequence())
2363 optional = True
2365 sequence = Sequence([Character(value), Branch(subbranches)])
2366 new_branches.append(sequence.optimise(info, reverse))
2368 prefixed.clear()
2369 order.clear()
2371 @staticmethod
2372 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2373 # Flush the set members.
2374 if not items:
2375 return
2377 if len(items) == 1:
2378 item = list(items)[0]
2379 else:
2380 item = SetUnion(info, list(items)).optimise(info, reverse)
2382 new_branches.append(item.with_flags(case_flags=case_flags))
2384 items.clear()
2386 @staticmethod
2387 def _is_full_case(items, i):
2388 if not 0 <= i < len(items):
2389 return False
2391 item = items[i]
2392 return (isinstance(item, Character) and item.positive and
2393 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2395 @staticmethod
2396 def _is_folded(items):
2397 if len(items) < 2:
2398 return False
2400 for i in items:
2401 if (not isinstance(i, Character) or not i.positive or not
2402 i.case_flags):
2403 return False
2405 folded = "".join(chr(i.value) for i in items)
2406 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2408 # Get the characters which expand to multiple codepoints on folding.
2409 expanding_chars = _regex.get_expand_on_folding()
2411 for c in expanding_chars:
2412 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2413 return True
2415 return False
2417 def is_empty(self):
2418 return all(b.is_empty() for b in self.branches)
2420 def __eq__(self, other):
2421 return type(self) is type(other) and self.branches == other.branches
2423 def max_width(self):
2424 return max(b.max_width() for b in self.branches)
class CallGroup(RegexBase):
    """A node which calls (re-enters) a capture group's subpattern.

    `group` may initially be a name or a number; fix_groups() resolves it
    to a group number and registers the call on the pattern info.
    """

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve the group, which may be a number or a name.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        # A group opened more than once can't be called unambiguously.
        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: this previously referenced an undefined name `pattern`,
        # raising NameError instead of the intended regex error. The
        # pattern text isn't available here, so pass None (error() accepts
        # a pattern of None).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class CallRef(RegexBase):
    """A parsed subpattern bracketed with a call reference so that group
    calls can jump to it.
    """

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        # CALL_REF <subpattern> END.
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
class Character(RegexBase):
    """A node which matches a single codepoint.

    `value` is the codepoint, `positive` is False for a negated match,
    `case_flags` selects case sensitivity, and `zerowidth` marks a node
    compiled with the zero-width flag.
    """

    # Opcode selected by (case flags, reversed?). Full case-folding uses
    # the same opcodes as simple case-folding here; the expansion is
    # handled separately in _compile().
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Precompute the full case-folding of the character; used both when
        # compiling and for the width calculation.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy with different positive/case/zero-width flags.
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match its folded string.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (ch == self.value) == self.positive

    def max_width(self):
        # A full case-folded character may match several codepoints.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
class Conditional(RegexBase):
    """A conditional subpattern: matches `yes_item` if the given group has
    matched, otherwise `no_item`.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve the group, which may be a number or a name.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either alternative may start the match.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS <yes> [NEXT <no>] END; NEXT is omitted when the 'no'
        # alternative compiles to nothing.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class DefaultBoundary(ZeroWidthBase):
    """Zero-width node compiled to the DEFAULT_BOUNDARY opcode."""

    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width node compiled to the DEFAULT_END_OF_WORD opcode."""

    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width node compiled to the DEFAULT_START_OF_WORD opcode."""

    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
class EndOfLine(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_LINE opcode."""

    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
class EndOfLineU(EndOfLine):
    """EndOfLine variant compiled to the END_OF_LINE_U opcode."""

    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
class EndOfString(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_STRING opcode."""

    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
class EndOfStringLine(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_STRING_LINE opcode."""

    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
class EndOfStringLineU(EndOfStringLine):
    """EndOfStringLine variant compiled to the END_OF_STRING_LINE_U
    opcode."""

    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
class EndOfWord(ZeroWidthBase):
    """Zero-width node compiled to the END_OF_WORD opcode."""

    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
class Failure(ZeroWidthBase):
    """Zero-width node compiled to a single FAILURE opcode."""

    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        # Unlike the other zero-width nodes, FAILURE takes no flags.
        return [(OP.FAILURE, )]
class Fuzzy(RegexBase):
    """A fuzzy-matching subpattern.

    `constraints` maps each error type — "d", "i", "s" and the generic
    "e" — to a (min, max) pair (max of None meaning unlimited), and "cost"
    to a dict of per-type cost coefficients plus a "max" total cost. The
    constructor fills in all defaults; an optional "test" entry supplies a
    test subpattern compiled with the extended opcode.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy subpattern is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits.
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            # FUZZY_EXT <test> NEXT <subpattern> END.
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        # FUZZY <subpattern> END.
        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        return UNLIMITED

    def _constraints_to_string(self):
        # Build a human-readable form of the constraints for dump().
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
class Grapheme(RegexBase):
    """Matches a whole grapheme: one or more characters, up to a grapheme
    boundary.
    """

    def _compile(self, reverse, fuzzy):
        # Lazily consume at least one character until a grapheme boundary
        # is reached; this works identically forwards and backwards.
        one_or_more = LazyRepeat(AnyAll(), 1, None)
        matcher = Atomic(Sequence([one_or_more, GraphemeBoundary()]))
        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
class GraphemeBoundary:
    """Zero-width item compiled to the GRAPHEME_BOUNDARY opcode."""

    def compile(self, reverse, fuzzy):
        return [(OP.GRAPHEME_BOUNDARY, 1)]
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern, min_count..max_count times.

    A `max_count` of None means unbounded. Subclasses override `_opcode`
    and `_op_name` (lazy) or `_compile` (possessive).
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) keeps the subclass (lazy/possessive) on rebuild.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat may match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            # An empty body compiles to nothing at all.
            return []

        # <repeat opcode> <subpattern> END.
        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # The repeat is optional, so nothing is required; report the
            # width that may be skipped.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
class PossessiveRepeat(GreedyRepeat):
    """A possessive repeat: a greedy repeat wrapped in an atomic group."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            # An empty body compiles to nothing at all.
            return []

        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        # ATOMIC <repeat opcode> <subpattern> END END.
        return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
class Group(RegexBase):
    """A capture group around a subpattern.

    `group` is the group number; negative numbers are private groups which
    map to public numbers via info.private_groups.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # If this group is called elsewhere, bracket it with a call ref.
        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        public_group = private_group = self.group
        if private_group < 0:
            # A private group; map it to its public number.
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        code += ([(OP.GROUP, int(not reverse), private_group, public_group)] +
          self.subpattern.compile(reverse, fuzzy) + [(OP.END, )])

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: this was `private_groups[group]`, an undefined name
            # (NameError); look the public number up on the pattern info,
            # exactly as _compile() does.
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
class Keep(ZeroWidthBase):
    """Zero-width node compiled to the KEEP opcode."""

    _opcode = OP.KEEP
    _op_name = "KEEP"
class LazyRepeat(GreedyRepeat):
    """A lazy repeat: same structure as GreedyRepeat, different opcode."""

    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
class LookAround(RegexBase):
    """A lookaround assertion: lookahead or lookbehind, positive or
    negative.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern matches in the assertion's own direction, not the
        # enclosing pattern's.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround on an empty subpattern is a no-op.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        # Only a positive assertion looking in the direction of the match
        # constrains the first character.
        if self.positive and self.behind == reverse:
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # LOOKAROUND <subpattern> END; the subpattern is compiled in the
        # assertion's own direction.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Assertions consume no characters.
        return 0
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround assertion: matches
    `yes_item` if the assertion succeeds, otherwise `no_item`.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL <test> NEXT <yes> [NEXT <no>] END; the test is
        # compiled in the assertion's own direction.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): operator precedence makes this
        # (subpattern and yes_item) or no_item; compare Conditional.is_empty
        # which requires both items. Confirm whether that asymmetry is
        # intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
class PrecompiledCode(RegexBase):
    """A node wrapping an already-built opcode argument list; compiling
    just emits it as a single tuple.
    """

    def __init__(self, code):
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
class Property(RegexBase):
    """A node which matches a codepoint by a Unicode property value.

    `value` packs the property id into the high 16 bits and the value id
    into the low 16 bits (see dump()).
    """

    # Opcode selected by (case flags, reversed?).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Return a copy with different positive/case/zero-width flags.
        return Property(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # Decode the packed property id/value for display.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
class Prune(ZeroWidthBase):
    """Zero-width node compiled to a single PRUNE opcode."""

    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # Unlike the other zero-width nodes, PRUNE takes no flags.
        return [(OP.PRUNE, )]
3267class Range(RegexBase):
3268 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
3269 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
3270 (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
3271 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
3272 _op_name = "RANGE"
3274 def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
3275 zerowidth=False):
3276 RegexBase.__init__(self)
3277 self.lower = lower
3278 self.upper = upper
3279 self.positive = bool(positive)
3280 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
3281 self.zerowidth = bool(zerowidth)
3283 self._key = (self.__class__, self.lower, self.upper, self.positive,
3284 self.case_flags, self.zerowidth)
3286 def rebuild(self, positive, case_flags, zerowidth):
3287 return Range(self.lower, self.upper, positive, case_flags, zerowidth)
3289 def optimise(self, info, reverse, in_set=False):
3290 # Is the range case-sensitive?
3291 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
3292 return self
3294 # Is full case-folding possible?
3295 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
3296 FULLIGNORECASE):
3297 return self
3299 # Get the characters which expand to multiple codepoints on folding.
3300 expanding_chars = _regex.get_expand_on_folding()
3302 # Get the folded characters in the range.
3303 items = []
3304 for ch in expanding_chars:
3305 if self.lower <= ord(ch) <= self.upper:
3306 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
3307 items.append(String([ord(c) for c in folded],
3308 case_flags=self.case_flags))
3310 if not items:
3311 # We can fall back to simple case-folding.
3312 return self
3314 if len(items) < self.upper - self.lower + 1:
3315 # Not all the characters are covered by the full case-folding.
3316 items.insert(0, self)
3318 return Branch(items)
3320 def _compile(self, reverse, fuzzy):
3321 flags = 0
3322 if self.positive:
3323 flags |= POSITIVE_OP
3324 if self.zerowidth:
3325 flags |= ZEROWIDTH_OP
3326 if fuzzy:
3327 flags |= FUZZY_OP
3328 return [(self._opcode[self.case_flags, reverse], flags, self.lower,
3329 self.upper)]
3331 def dump(self, indent, reverse):
3332 display_lower = ascii(chr(self.lower)).lstrip("bu")
3333 display_upper = ascii(chr(self.upper)).lstrip("bu")
3334 print("{}RANGE {} {} {}{}".format(INDENT * indent,
3335 POS_TEXT[self.positive], display_lower, display_upper,
3336 CASE_TEXT[self.case_flags]))
3338 def matches(self, ch):
3339 return (self.lower <= ch <= self.upper) == self.positive
    def max_width(self):
        # A range always matches exactly one codepoint.
        return 1
class RefGroup(RegexBase):
    """A backreference to a capture group, by number or by name.

    The group is resolved to a number in fix_groups; until then
    self.group may be a name string.
    """
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        # Position in the pattern, kept for error reporting.
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolves a group name to its number and validates the reference.

        Raises error if the group is unknown or out of range.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The key must be refreshed now that the group is a number.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # BUG FIX: this previously raised with the undefined name `pattern`,
        # so callers got a NameError instead of the intended `error`. There
        # is no pattern string available in this scope, so pass None (the
        # error class accepts a None pattern).
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here, so be conservative.
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SearchAnchor(ZeroWidthBase):
    """Zero-width assertion anchored at the start of the search attempt."""
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
class Sequence(RegexBase):
    """A concatenation of parsed nodes, matched one after another."""

    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        # Delegate group fixing to every member.
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        # Adjacent Character/String/Literal nodes with compatible case flags
        # are accumulated in `characters` and flushed into a single node
        # whenever the case flags change or a non-character node is seen.
        items = []
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither the
                    # previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless the neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a packable node: flush pending characters, then let the
                # node pack its own children.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Union the firstsets of leading members until one of them cannot
        # match empty (None in the set marks "may match empty").
        # NOTE(review): `items.reverse()` mutates self.items in place since
        # `items` aliases it — presumably harmless because each direction is
        # computed once, but verify before reusing a parsed tree.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # For reverse matching the members are emitted in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        """Converts the accumulated codepoints into Character/String nodes,
        appending them to `items`, and clears the accumulator."""
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        # Clear in place so the caller's list is emptied too.
        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record every (start, end) span where an expanding fold occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Alternate simple-folded and fully-folded literals over the spans.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        """Merges overlapping/adjacent (start, end) spans after sorting."""
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Returns the offset of, and the first member that provides, a
        # required string; offsets of earlier members are accumulated.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
class SetBase(RegexBase):
    """Base class for character-set nodes (union, intersection, etc.).

    Subclasses supply _opcode/_op_name and implement matches()/optimise().
    """

    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild with new flags, then re-optimise the result.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The set opcode is followed by its members and ended with OP.END.
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        """If full case-folding applies, branches out members whose folding
        expands to multiple codepoints; otherwise returns self."""
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        # The widest fully-folded expansion bounds the match width.
        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class SetDiff(SetBase):
    """Set difference: matches the first item but not the second."""
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = self.items
        # A -- B -- C ... is equivalent to A -- (B | C | ...).
        if len(items) > 2:
            items = [items[0], SetUnion(info, items[1 : ])]

        if len(items) == 1:
            # A difference of one item is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(m.optimise(info, reverse, in_set=True) for m in
          items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = self.items[0].matches(ch) and not self.items[1].matches(ch)
        return m == self.positive
class SetInter(SetBase):
    """Set intersection: matches only codepoints matched by every item."""
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetInter) and m.positive:
                # Intersection in intersection.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # An intersection of one item is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        m = all(i.matches(ch) for i in self.items)
        return m == self.positive
class SetSymDiff(SetBase):
    """Set symmetric difference: matches codepoints in an odd number of
    the items."""
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetSymDiff) and m.positive:
                # Symmetric difference in symmetric difference.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A symmetric difference of one item is just that item.
            return items[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # XOR-fold the member results: true iff an odd number match.
        m = False
        for i in self.items:
            m = m != i.matches(ch)

        return m == self.positive
class SetUnion(SetBase):
    """Set union: matches codepoints matched by any of the items."""
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            else:
                items.append(m)

        if len(items) == 1:
            # A union of one item is just that item; the positivity of the
            # union and the item combine.
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Batch plain Character members into CHARACTER/STRING sub-ops,
        # grouped by positivity; other members compile themselves.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
class Skip(ZeroWidthBase):
    """Zero-width node for the (*SKIP) control verb."""
    _op_name = "SKIP"
    _opcode = OP.SKIP
class StartOfLine(ZeroWidthBase):
    """Zero-width assertion for the start of a line."""
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
class StartOfLineU(StartOfLine):
    """Variant of StartOfLine using the START_OF_LINE_U opcode."""
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
class StartOfString(ZeroWidthBase):
    """Zero-width assertion for the start of the string."""
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
class StartOfWord(ZeroWidthBase):
    """Zero-width assertion for the start of a word."""
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
class String(RegexBase):
    """A literal string of codepoints.

    With full case-insensitivity the characters are pre-folded once at
    construction time; the compiled opcode uses the folded form.
    """
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        # NOTE(review): unlike most nodes, RegexBase.__init__ is not called
        # here — presumably intentional since _key is set below; confirm.
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-compute the fully case-folded form.
            folded_characters = []
            for char in self.characters:
                folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded_characters.extend(ord(c) for c in folded)
        else:
            folded_characters = self.characters

        self.folded_characters = tuple(folded_characters)
        # Set to True when this is the pattern's "required string".
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The firstset is the string's first (or, reversed, last) character.
        if reverse:
            pos = -1
        else:
            pos = 0
        return set([Character(self.characters[pos],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        return [(self._opcode[self.case_flags, reverse], flags,
          len(self.folded_characters)) + self.folded_characters]

    def dump(self, indent, reverse):
        display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # A string is its own required string, at offset 0.
        return 0, self
class Literal(String):
    """A String originating from a literal run in the pattern; it differs
    from String only in how it is dumped."""

    def dump(self, indent, reverse):
        text = "".join(map(chr, self.characters))
        shown = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, shown,
          CASE_TEXT[self.case_flags]))
class StringSet(Branch):
    """A named list of strings (\\L<name>), compiled as a Branch of
    character sequences, longest alternative first."""

    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) pair with the pattern's Info so
        # the same list/case combination gets a single index.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference cycle with the Info object.
        self.info = None
class Source:
    """Scanner for the regular expression source string.

    Maintains a current position (`pos`) and, when `ignore_space` is set
    (VERBOSE mode), transparently skips whitespace and '#' comments.
    End-of-string and comment-to-end-of-string conditions are handled via
    IndexError/ValueError from indexing and str.index respectively.
    """

    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Bytes patterns are scanned as latin-1 text; char_type converts
            # a codepoint back to the pattern's native type.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        self.ignore_space = False
        # An empty string/bytes of the pattern's own type.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        """Returns the next character, or an empty string at the end."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space and not override_ignore:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    else:
                        break

            ch = string[pos]
            self.pos = pos + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = pos
            return string[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return string[ : 0]

    def get_many(self, count=1):
        """Returns up to `count` characters (fewer at the end)."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                substring = []

                while len(substring) < count:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    substring.append(string[pos])
                    pos += 1

                substring = "".join(substring)
            else:
                substring = string[pos : pos + count]
                pos += len(substring)

            self.pos = pos
            return substring
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
            return "".join(substring)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)
            return "".join(substring)

    def get_while(self, test_set, include=True):
        """Returns the run of characters whose membership of `test_set`
        equals `include`, advancing past them."""
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                substring = []

                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        substring.append(string[pos])
                        pos += 1
                    else:
                        break

                self.pos = pos
            except IndexError:
                # We've reached the end of the string.
                self.pos = len(string)
            except ValueError:
                # The comment extended to the end of the string.
                self.pos = len(string)

            return "".join(substring)
        else:
            try:
                while (string[pos] in test_set) == include:
                    pos += 1

                substring = string[self.pos : pos]

                self.pos = pos

                return substring
            except IndexError:
                # We've reached the end of the string.
                substring = string[self.pos : pos]

                self.pos = pos

                return substring

    def skip_while(self, test_set, include=True):
        """Like get_while, but discards the characters."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        # Skip over the whitespace.
                        pos += 1
                    elif string[pos] == "#":
                        # Skip over the comment to the end of the line.
                        pos = string.index("\n", pos)
                    elif (string[pos] in test_set) == include:
                        pos += 1
                    else:
                        break
            else:
                while (string[pos] in test_set) == include:
                    pos += 1

            self.pos = pos
        except IndexError:
            # We've reached the end of the string.
            self.pos = len(string)
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(string)

    def match(self, substring):
        """Returns whether `substring` comes next; advances only on a
        successful match."""
        string = self.string
        pos = self.pos

        if self.ignore_space:
            try:
                for c in substring:
                    while True:
                        if string[pos].isspace():
                            # Skip over the whitespace.
                            pos += 1
                        elif string[pos] == "#":
                            # Skip over the comment to the end of the line.
                            pos = string.index("\n", pos)
                        else:
                            break

                    if string[pos] != c:
                        return False

                    pos += 1

                self.pos = pos

                return True
            except IndexError:
                # We've reached the end of the string.
                return False
            except ValueError:
                # The comment extended to the end of the string.
                return False
        else:
            if not string.startswith(substring, pos):
                return False

            self.pos = pos + len(substring)

            return True

    def expect(self, substring):
        """Matches `substring` or raises error."""
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        """Returns whether only whitespace/comments (if ignored) remain."""
        string = self.string
        pos = self.pos

        try:
            if self.ignore_space:
                while True:
                    if string[pos].isspace():
                        pos += 1
                    elif string[pos] == "#":
                        pos = string.index("\n", pos)
                    else:
                        break

            return pos >= len(string)
        except IndexError:
            # We've reached the end of the string.
            return True
        except ValueError:
            # The comment extended to the end of the string.
            return True
class Info:
    """Info about the regular expression.

    Tracks flags, group numbering/naming, currently open groups, and the
    bookkeeping needed for group calls and named lists during parsing.
    """

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): mutable default `kwargs={}` — only read in the
        # visible code (e.g. StringSet), but verify it is never mutated.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        self.group_count = 0
        self.group_index = {}       # name -> group number
        self.group_name = {}        # group number -> name
        self.char_type = char_type
        self.named_lists_used = {}  # (name, case_flags) -> index
        self.open_groups = []       # stack of currently open group numbers
        self.open_group_count = {}  # group number -> times opened
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}    # negative alias -> real group number

    def open_group(self, name=None):
        """Opens a (possibly named) group and returns its number.

        A re-opened (nested) named group gets a temporary negative alias
        which is resolved to a real number later.
        """
        group = self.group_index.get(name)
        if group is None:
            # Find the next group number not already taken by a name.
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.

    Populates info.call_refs (mapping (group, reverse, fuzzy) -> call ref)
    and info.additional_groups (copies of groups needed with different
    reverse/fuzzy features), and assigns each call its call_ref.
    """
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
def _get_required_string(parsed, flags):
    """Returns (offset, characters, case_flags) describing the required
    string of a parsed pattern, or (0, (), 0) if there isn't one."""
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    # Mark the node so compilation emits it with the REQUIRED flag.
    required.required = True

    # An unbounded offset is signalled to the matcher as -1.
    offset = -1 if req_offset >= UNLIMITED else req_offset

    case_flags = required.case_flags
    if not (flags & UNICODE):
        case_flags &= ~UNICODE

    return offset, required.folded_characters, case_flags
class Scanner:
    """A lexical scanner: pairs of (pattern phrase, action).

    The phrases are compiled into a single branched pattern, one capture
    group per phrase; scan() applies the action of whichever phrase
    matched (identified via m.lastindex).
    """

    def __init__(self, lexicon, flags=0):
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                  source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
          info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
              source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
          code, {}, {}, {}, [], req_offset, req_chars, req_flags,
          len(patterns))

    def scan(self, string):
        """Scans `string`, returning (list of action results, remainder).

        Stops at the first non-match or zero-width match.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would loop forever; stop.
                break
            # m.lastindex identifies which phrase matched.
            action = self.lexicon[m.lastindex - 1][1]
            if hasattr(action, '__call__'):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: prop_id -> (longest name,
# {value_id -> longest value name}).
PROPERTY_NAMES = {}
for prop_name, (prop_id, values) in PROPERTIES.items():
    name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))
    # Prefer the longest (most descriptive) alias for each property.
    name = max(name, prop_name, key=len)
    PROPERTY_NAMES[prop_id] = name, prop_values

    for val_name, val_id in values.items():
        prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,
          key=len)
4450# Character escape sequences.
4451CHARACTER_ESCAPES = {
4452 "a": "\a",
4453 "b": "\b",
4454 "f": "\f",
4455 "n": "\n",
4456 "r": "\r",
4457 "t": "\t",
4458 "v": "\v",
4459}
4461# Predefined character set escape sequences.
4462CHARSET_ESCAPES = {
4463 "d": lookup_property(None, "Digit", True),
4464 "D": lookup_property(None, "Digit", False),
4465 "h": lookup_property(None, "Blank", True),
4466 "s": lookup_property(None, "Space", True),
4467 "S": lookup_property(None, "Space", False),
4468 "w": lookup_property(None, "Word", True),
4469 "W": lookup_property(None, "Word", False),
4470}
4472# Positional escape sequences.
4473POSITION_ESCAPES = {
4474 "A": StartOfString(),
4475 "b": Boundary(),
4476 "B": Boundary(False),
4477 "K": Keep(),
4478 "m": StartOfWord(),
4479 "M": EndOfWord(),
4480 "Z": EndOfString(),
4481}
4483# Positional escape sequences when WORD flag set.
4484WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)
4485WORD_POSITION_ESCAPES.update({
4486 "b": DefaultBoundary(),
4487 "B": DefaultBoundary(False),
4488 "m": DefaultStartOfWord(),
4489 "M": DefaultEndOfWord(),
4490})
4492# Regex control verbs.
4493VERBS = {
4494 "FAIL": Failure(),
4495 "F": Failure(),
4496 "PRUNE": Prune(),
4497 "SKIP": Skip(),
4498}