Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/regex/_regex

2# Secret Labs' Regular Expression Engine core module

6# This version of the SRE library can be redistributed under CNRI's

7# Python 1.6 license. For any other use, please contact Secret Labs

8# AB (info@pythonware.com).

10# Portions of this engine have been developed in cooperation with

11# CNRI. Hewlett-Packard provided funding for 1.6 integration and

12# other compatibility work.

13#

14# 2010-01-16 mrab Python front-end re-written and extended

16import enum

17import string

18import unicodedata

19from collections import defaultdict

21import regex._regex as _regex

23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",

24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",

25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",

26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",

27 "Scanner", "RegexFlag"]

29# The regex exception.

30class error(Exception):

31 """Exception raised for invalid regular expressions.

33 Attributes:

35 msg: The unformatted error message

36 pattern: The regular expression pattern

37 pos: The position in the pattern where compilation failed, or None

38 lineno: The line number where compilation failed, unless pos is None

39 colno: The column number where compilation failed, unless pos is None

40 """

42 def __init__(self, message, pattern=None, pos=None):

43 newline = '\n' if isinstance(pattern, str) else b'\n'

44 self.msg = message

45 self.pattern = pattern

46 self.pos = pos

47 if pattern is not None and pos is not None:

48 self.lineno = pattern.count(newline, 0, pos) + 1

49 self.colno = pos - pattern.rfind(newline, 0, pos)

51 message = "{} at position {}".format(message, pos)

53 if newline in pattern:

54 message += " (line {}, column {})".format(self.lineno,

55 self.colno)

57 Exception.__init__(self, message)

59# The exception for when a positional flag has been turned on in the old

60# behaviour.

61class _UnscopedFlagSet(Exception):

62 pass

64# The exception for when parsing fails and we want to try something else.

65class ParseError(Exception):

66 pass

68# The exception for when there isn't a valid first set.

69class _FirstSetError(Exception):

70 pass

72# Flags.

73class RegexFlag(enum.IntFlag):

74 A = ASCII = 0x80 # Assume ASCII locale.

75 B = BESTMATCH = 0x1000 # Best fuzzy match.

76 D = DEBUG = 0x200 # Print parsed pattern.

77 E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first

78 # fuzzy match.

79 F = FULLCASE = 0x4000 # Unicode full case-folding.

80 I = IGNORECASE = 0x2 # Ignore case.

81 L = LOCALE = 0x4 # Assume current 8-bit locale.

82 M = MULTILINE = 0x8 # Make anchors look for newline.

83 P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).

84 R = REVERSE = 0x400 # Search backwards.

85 S = DOTALL = 0x10 # Make dot match newline.

86 U = UNICODE = 0x20 # Assume Unicode locale.

87 V0 = VERSION0 = 0x2000 # Old legacy behaviour.

88 V1 = VERSION1 = 0x100 # New enhanced behaviour.

89 W = WORD = 0x800 # Default Unicode word breaks.

90 X = VERBOSE = 0x40 # Ignore whitespace and comments.

91 T = TEMPLATE = 0x1 # Template (present because re module has it).

93 def __repr__(self):

94 if self._name_ is not None:

95 return 'regex.%s' % self._name_

97 value = self._value_

98 members = []

99 negative = value < 0

100

101 if negative:

102 value = ~value

103

104 for m in self.__class__:

105 if value & m._value_:

106 value &= ~m._value_

107 members.append('regex.%s' % m._name_)

108

109 if value:

110 members.append(hex(value))

111

112 res = '|'.join(members)

113

114 if negative:

115 if len(members) > 1:

116 res = '~(%s)' % res

117 else:

118 res = '~%s' % res

119

120 return res

121

122 __str__ = object.__str__

123

124globals().update(RegexFlag.__members__)

125

126DEFAULT_VERSION = VERSION1

127

128_ALL_VERSIONS = VERSION0 | VERSION1

129_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

130

131# The default flags for the various versions.

132DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

133

134# The mask for the flags.

136 REVERSE)

138 _ALL_ENCODINGS)

139

140ALPHA = frozenset(string.ascii_letters)

141DIGITS = frozenset(string.digits)

142ALNUM = ALPHA | DIGITS

143OCT_DIGITS = frozenset(string.octdigits)

144HEX_DIGITS = frozenset(string.hexdigits)

145SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])

146NAMED_CHAR_PART = ALNUM | frozenset(" -")

147PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")

148SET_OPS = ("||", "~~", "&&", "--")

149

150# The width of the code words inside the regex engine.

151BYTES_PER_CODE = _regex.get_code_size()

152BITS_PER_CODE = BYTES_PER_CODE * 8

153

154# The repeat count which represents infinity.

155UNLIMITED = (1 << BITS_PER_CODE) - 1

156

157# The regular expression flags.

158REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,

159 "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,

160 "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":

161 VERBOSE}

162

163# The case flags.

164CASE_FLAGS = FULLCASE | IGNORECASE

165NOCASE = 0

166FULLIGNORECASE = FULLCASE | IGNORECASE

167

168FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

169

170CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,

171 FULLIGNORECASE: FULLIGNORECASE}

172

173# The number of digits in hexadecimal escapes.

174HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

175

176# The names of the opcodes.

177OPCODES = """

178FAILURE

179SUCCESS

180ANY

181ANY_ALL

182ANY_ALL_REV

183ANY_REV

184ANY_U

185ANY_U_REV

186ATOMIC

187BOUNDARY

188BRANCH

189CALL_REF

190CHARACTER

191CHARACTER_IGN

192CHARACTER_IGN_REV

193CHARACTER_REV

194CONDITIONAL

195DEFAULT_BOUNDARY

196DEFAULT_END_OF_WORD

197DEFAULT_START_OF_WORD

198END

199END_OF_LINE

200END_OF_LINE_U

201END_OF_STRING

202END_OF_STRING_LINE

203END_OF_STRING_LINE_U

204END_OF_WORD

205FUZZY

206GRAPHEME_BOUNDARY

207GREEDY_REPEAT

208GROUP

209GROUP_CALL

210GROUP_EXISTS

211KEEP

212LAZY_REPEAT

213LOOKAROUND

214NEXT

215PROPERTY

216PROPERTY_IGN

217PROPERTY_IGN_REV

218PROPERTY_REV

219PRUNE

220RANGE

221RANGE_IGN

222RANGE_IGN_REV

223RANGE_REV

224REF_GROUP

225REF_GROUP_FLD

226REF_GROUP_FLD_REV

227REF_GROUP_IGN

228REF_GROUP_IGN_REV

229REF_GROUP_REV

230SEARCH_ANCHOR

231SET_DIFF

232SET_DIFF_IGN

233SET_DIFF_IGN_REV

234SET_DIFF_REV

235SET_INTER

236SET_INTER_IGN

237SET_INTER_IGN_REV

238SET_INTER_REV

239SET_SYM_DIFF

240SET_SYM_DIFF_IGN

241SET_SYM_DIFF_IGN_REV

242SET_SYM_DIFF_REV

243SET_UNION

244SET_UNION_IGN

245SET_UNION_IGN_REV

246SET_UNION_REV

247SKIP

248START_OF_LINE

249START_OF_LINE_U

250START_OF_STRING

251START_OF_WORD

252STRING

253STRING_FLD

254STRING_FLD_REV

255STRING_IGN

256STRING_IGN_REV

257STRING_REV

258FUZZY_EXT

259"""

260

261# Define the opcodes in a namespace.

262class Namespace:

263 pass

264

265OP = Namespace()

266for i, op in enumerate(OPCODES.split()):

267 setattr(OP, op, i)

268

269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):

270 """Make room in the given cache.

271

272 Args:

273 cache_dict: The cache dictionary to modify.

274 args_dict: The dictionary of named list args used by patterns.

275 max_length: Maximum # of entries in cache_dict before it is shrunk.

276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.

277 """

278 # Toss out a fraction of the entries at random to make room for new ones.

279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()

280 # as popitem could penalize the same regular expression repeatedly based

281 # on its internal hash value. Being random should spread the cache miss

282 # love around.

283 cache_keys = tuple(cache_dict.keys())

284 overage = len(cache_keys) - max_length

285 if overage < 0:

286 # Cache is already within limits. Normally this should not happen

287 # but it could due to multithreading.

288 return

289

290 number_to_toss = max_length // divisor + overage

291

292 # The import is done here to avoid a circular dependency.

293 import random

294 if not hasattr(random, 'sample'):

295 # Do nothing while resolving the circular dependency:

296 # re->random->warnings->tokenize->string->re

297 return

298

299 for doomed_key in random.sample(cache_keys, number_to_toss):

300 try:

301 del cache_dict[doomed_key]

302 except KeyError:

303 # Ignore problems if the cache changed from another thread.

304 pass

305

306 # Rebuild the arguments and locale-sensitivity dictionaries.

307 args_dict.clear()

308 sensitivity_dict = {}

309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):

310 args_dict[pattern, pattern_type, flags, default_version, locale] = args

311 try:

312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]

313 except KeyError:

314 pass

315

316 locale_sensitive.clear()

317 locale_sensitive.update(sensitivity_dict)

318

319def _fold_case(info, string):

320 "Folds the case of a string."

321 flags = info.flags

322 if (flags & _ALL_ENCODINGS) == 0:

323 flags |= info.guess_encoding

324

325 return _regex.fold_case(flags, string)

326

327def is_cased_i(info, char):

328 "Checks whether a character is cased."

329 return len(_regex.get_all_cases(info.flags, char)) > 1

330

331def is_cased_f(flags, char):

332 "Checks whether a character is cased."

333 return len(_regex.get_all_cases(flags, char)) > 1

334

335def _compile_firstset(info, fs):

336 "Compiles the firstset for the pattern."

337 reverse = bool(info.flags & REVERSE)

338 fs = _check_firstset(info, reverse, fs)

339 if not fs:

340 return []

341

342 # Compile the firstset.

343 return fs.compile(reverse)

344

345def _check_firstset(info, reverse, fs):

346 "Checks the firstset for the pattern."

347 if not fs or None in fs:

348 return None

349

350 # If we ignore the case, for simplicity we won't build a firstset.

351 members = set()

352 case_flags = NOCASE

353 for i in fs:

354 if isinstance(i, Character) and not i.positive:

355 return None

356

357# if i.case_flags:

358# if isinstance(i, Character):

359# if is_cased_i(info, i.value):

360# return []

361# elif isinstance(i, SetBase):

362# return []

363 case_flags |= i.case_flags

364 members.add(i.with_flags(case_flags=NOCASE))

365

366 if case_flags == (FULLCASE | IGNORECASE):

367 return None

368

369 # Build the firstset.

370 fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,

371 zerowidth=True)

372 fs = fs.optimise(info, reverse, in_set=True)

373

374 return fs

375

376def _flatten_code(code):

377 "Flattens the code from a list of tuples."

378 flat_code = []

379 for c in code:

380 flat_code.extend(c)

381

382 return flat_code

383

384def make_case_flags(info):

385 "Makes the case flags."

386 flags = info.flags & CASE_FLAGS

387

388 # Turn off FULLCASE if ASCII is turned on.

389 if info.flags & ASCII:

390 flags &= ~FULLCASE

391

392 return flags

393

394def make_character(info, value, in_set=False):

395 "Makes a character literal."

396 if in_set:

397 # A character set is built case-sensitively.

398 return Character(value)

399

400 return Character(value, case_flags=make_case_flags(info))

401

402def make_ref_group(info, name, position):

403 "Makes a group reference."

404 return RefGroup(info, name, position, case_flags=make_case_flags(info))

405

406def make_string_set(info, name):

407 "Makes a string set."

408 return StringSet(info, name, case_flags=make_case_flags(info))

409

410def make_property(info, prop, in_set):

411 "Makes a property."

412 if in_set:

413 return prop

414

415 return prop.with_flags(case_flags=make_case_flags(info))

416

417def _parse_pattern(source, info):

418 "Parses a pattern, eg. 'a|b|c'."

419 branches = [parse_sequence(source, info)]

420 while source.match("|"):

421 branches.append(parse_sequence(source, info))

422

423 if len(branches) == 1:

424 return branches[0]

425 return Branch(branches)

426

427def parse_sequence(source, info):

428 "Parses a sequence, eg. 'abc'."

429 sequence = [None]

430 case_flags = make_case_flags(info)

431 while True:

432 saved_pos = source.pos

433 ch = source.get()

434 if ch in SPECIAL_CHARS:

435 if ch in ")|":

436 # The end of a sequence. At the end of the pattern ch is "".

437 source.pos = saved_pos

438 break

439 elif ch == "\\":

440 # An escape sequence outside a set.

441 sequence.append(parse_escape(source, info, False))

442 elif ch == "(":

443 # A parenthesised subpattern or a flag.

444 element = parse_paren(source, info)

445 if element is None:

446 case_flags = make_case_flags(info)

447 else:

448 sequence.append(element)

449 elif ch == ".":

450 # Any character.

451 if info.flags & DOTALL:

452 sequence.append(AnyAll())

453 elif info.flags & WORD:

454 sequence.append(AnyU())

455 else:

456 sequence.append(Any())

457 elif ch == "[":

458 # A character set.

459 sequence.append(parse_set(source, info))

460 elif ch == "^":

461 # The start of a line or the string.

462 if info.flags & MULTILINE:

463 if info.flags & WORD:

464 sequence.append(StartOfLineU())

465 else:

466 sequence.append(StartOfLine())

467 else:

468 sequence.append(StartOfString())

469 elif ch == "$":

470 # The end of a line or the string.

471 if info.flags & MULTILINE:

472 if info.flags & WORD:

473 sequence.append(EndOfLineU())

474 else:

475 sequence.append(EndOfLine())

476 else:

477 if info.flags & WORD:

478 sequence.append(EndOfStringLineU())

479 else:

480 sequence.append(EndOfStringLine())

481 elif ch in "?*+{":

482 # Looks like a quantifier.

483 counts = parse_quantifier(source, info, ch)

484 if counts:

485 # It _is_ a quantifier.

486 apply_quantifier(source, info, counts, case_flags, ch,

487 saved_pos, sequence)

488 sequence.append(None)

489 else:

490 # It's not a quantifier. Maybe it's a fuzzy constraint.

491 constraints = parse_fuzzy(source, info, ch, case_flags)

492 if constraints:

493 # It _is_ a fuzzy constraint.

494 apply_constraint(source, info, constraints, case_flags,

495 saved_pos, sequence)

496 sequence.append(None)

497 else:

498 # The element was just a literal.

499 sequence.append(Character(ord(ch),

500 case_flags=case_flags))

501 else:

502 # A literal.

503 sequence.append(Character(ord(ch), case_flags=case_flags))

504 else:

505 # A literal.

506 sequence.append(Character(ord(ch), case_flags=case_flags))

507

508 sequence = [item for item in sequence if item is not None]

509 return Sequence(sequence)

510

511def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,

512 sequence):

513 element = sequence.pop()

514 if element is None:

515 if sequence:

516 raise error("multiple repeat", source.string, saved_pos)

517 raise error("nothing to repeat", source.string, saved_pos)

518

519 if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):

520 raise error("multiple repeat", source.string, saved_pos)

521

522 min_count, max_count = counts

523 saved_pos = source.pos

524 ch = source.get()

525 if ch == "?":

526 # The "?" suffix that means it's a lazy repeat.

527 repeated = LazyRepeat

528 elif ch == "+":

529 # The "+" suffix that means it's a possessive repeat.

530 repeated = PossessiveRepeat

531 else:

532 # No suffix means that it's a greedy repeat.

533 source.pos = saved_pos

534 repeated = GreedyRepeat

535

536 # Ignore the quantifier if it applies to a zero-width item or the number of

537 # repeats is fixed at 1.

538 if not element.is_empty() and (min_count != 1 or max_count != 1):

539 element = repeated(element, min_count, max_count)

540

541 sequence.append(element)

542

543def apply_constraint(source, info, constraints, case_flags, saved_pos,

544 sequence):

545 element = sequence.pop()

546 if element is None:

547 raise error("nothing for fuzzy constraint", source.string, saved_pos)

548

549 # If a group is marked as fuzzy then put all of the fuzzy part in the

550 # group.

551 if isinstance(element, Group):

552 element.subpattern = Fuzzy(element.subpattern, constraints)

553 sequence.append(element)

554 else:

555 sequence.append(Fuzzy(element, constraints))

556

557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

558

559def parse_quantifier(source, info, ch):

560 "Parses a quantifier."

561 q = _QUANTIFIERS.get(ch)

562 if q:

563 # It's a quantifier.

564 return q

565

566 if ch == "{":

567 # Looks like a limited repeated element, eg. 'a{2,3}'.

568 counts = parse_limited_quantifier(source)

569 if counts:

570 return counts

571

572 return None

573

574def is_above_limit(count):

575 "Checks whether a count is above the maximum."

576 return count is not None and count >= UNLIMITED

577

578def parse_limited_quantifier(source):

579 "Parses a limited quantifier."

580 saved_pos = source.pos

581 min_count = parse_count(source)

582 if source.match(","):

583 max_count = parse_count(source)

584

585 # No minimum means 0 and no maximum means unlimited.

586 min_count = int(min_count or 0)

587 max_count = int(max_count) if max_count else None

588 else:

589 if not min_count:

590 source.pos = saved_pos

591 return None

592

593 min_count = max_count = int(min_count)

594

595 if not source.match ("}"):

596 source.pos = saved_pos

597 return None

598

599 if is_above_limit(min_count) or is_above_limit(max_count):

600 raise error("repeat count too big", source.string, saved_pos)

601

602 if max_count is not None and min_count > max_count:

603 raise error("min repeat greater than max repeat", source.string,

604 saved_pos)

605

606 return min_count, max_count

607

608def parse_fuzzy(source, info, ch, case_flags):

609 "Parses a fuzzy setting, if present."

610 saved_pos = source.pos

611

612 if ch != "{":

613 return None

614

615 constraints = {}

616 try:

617 parse_fuzzy_item(source, constraints)

618 while source.match(","):

619 parse_fuzzy_item(source, constraints)

620 except ParseError:

621 source.pos = saved_pos

622 return None

623

624 if source.match(":"):

625 constraints["test"] = parse_fuzzy_test(source, info, case_flags)

626

627 if not source.match("}"):

628 raise error("expected }", source.string, source.pos)

629

630 return constraints

631

632def parse_fuzzy_item(source, constraints):

633 "Parses a fuzzy setting item."

634 saved_pos = source.pos

635 try:

636 parse_cost_constraint(source, constraints)

637 except ParseError:

638 source.pos = saved_pos

639

640 parse_cost_equation(source, constraints)

641

642def parse_cost_constraint(source, constraints):

643 "Parses a cost constraint."

644 saved_pos = source.pos

645 ch = source.get()

646 if ch in ALPHA:

647 # Syntax: constraint [("<=" | "<") cost]

648 constraint = parse_constraint(source, constraints, ch)

649

650 max_inc = parse_fuzzy_compare(source)

651

652 if max_inc is None:

653 # No maximum cost.

654 constraints[constraint] = 0, None

655 else:

656 # There's a maximum cost.

657 cost_pos = source.pos

658 max_cost = parse_cost_limit(source)

659

660 # Inclusive or exclusive limit?

661 if not max_inc:

662 max_cost -= 1

663

664 if max_cost < 0:

665 raise error("bad fuzzy cost limit", source.string, cost_pos)

666

667 constraints[constraint] = 0, max_cost

668 elif ch in DIGITS:

669 # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost

670 source.pos = saved_pos

671

672 # Minimum cost.

673 cost_pos = source.pos

674 min_cost = parse_cost_limit(source)

675

676 min_inc = parse_fuzzy_compare(source)

677 if min_inc is None:

678 raise ParseError()

679

680 constraint = parse_constraint(source, constraints, source.get())

681

682 max_inc = parse_fuzzy_compare(source)

683 if max_inc is None:

684 raise ParseError()

685

686 # Maximum cost.

687 cost_pos = source.pos

688 max_cost = parse_cost_limit(source)

689

690 # Inclusive or exclusive limits?

691 if not min_inc:

692 min_cost += 1

693 if not max_inc:

694 max_cost -= 1

695

696 if not 0 <= min_cost <= max_cost:

697 raise error("bad fuzzy cost limit", source.string, cost_pos)

698

699 constraints[constraint] = min_cost, max_cost

700 else:

701 raise ParseError()

702

703def parse_cost_limit(source):

704 "Parses a cost limit."

705 cost_pos = source.pos

706 digits = parse_count(source)

707

708 try:

709 return int(digits)

710 except ValueError:

711 pass

712

713 raise error("bad fuzzy cost limit", source.string, cost_pos)

714

715def parse_constraint(source, constraints, ch):

716 "Parses a constraint."

717 if ch not in "deis":

718 raise ParseError()

719

720 if ch in constraints:

721 raise ParseError()

722

723 return ch

724

725def parse_fuzzy_compare(source):

726 "Parses a cost comparator."

727 if source.match("<="):

728 return True

729 elif source.match("<"):

730 return False

731 else:

732 return None

733

734def parse_cost_equation(source, constraints):

735 "Parses a cost equation."

736 if "cost" in constraints:

737 raise error("more than one cost equation", source.string, source.pos)

738

739 cost = {}

740

741 parse_cost_term(source, cost)

742 while source.match("+"):

743 parse_cost_term(source, cost)

744

745 max_inc = parse_fuzzy_compare(source)

746 if max_inc is None:

747 raise ParseError()

748

749 max_cost = int(parse_count(source))

750

751 if not max_inc:

752 max_cost -= 1

753

754 if max_cost < 0:

755 raise error("bad fuzzy cost limit", source.string, source.pos)

756

757 cost["max"] = max_cost

758

759 constraints["cost"] = cost

760

761def parse_cost_term(source, cost):

762 "Parses a cost equation term."

763 coeff = parse_count(source)

764 ch = source.get()

765 if ch not in "dis":

766 raise ParseError()

767

768 if ch in cost:

769 raise error("repeated fuzzy cost", source.string, source.pos)

770

771 cost[ch] = int(coeff or 1)

772

773def parse_fuzzy_test(source, info, case_flags):

774 saved_pos = source.pos

775 ch = source.get()

776 if ch in SPECIAL_CHARS:

777 if ch == "\\":

778 # An escape sequence outside a set.

779 return parse_escape(source, info, False)

780 elif ch == ".":

781 # Any character.

782 if info.flags & DOTALL:

783 return AnyAll()

784 elif info.flags & WORD:

785 return AnyU()

786 else:

787 return Any()

788 elif ch == "[":

789 # A character set.

790 return parse_set(source, info)

791 else:

792 raise error("expected character set", source.string, saved_pos)

793 elif ch:

794 # A literal.

795 return Character(ord(ch), case_flags=case_flags)

796 else:

797 raise error("expected character set", source.string, saved_pos)

798

799def parse_count(source):

800 "Parses a quantifier's count, which can be empty."

801 return source.get_while(DIGITS)

802

803def parse_paren(source, info):

804 """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an

805 inline flag.

806 """

807 saved_pos = source.pos

808 ch = source.get(True)

809 if ch == "?":

810 # (?...

811 saved_pos_2 = source.pos

812 ch = source.get(True)

813 if ch == "<":

814 # (?<...

815 saved_pos_3 = source.pos

816 ch = source.get()

817 if ch in ("=", "!"):

818 # (?<=... or (?<!...: lookbehind.

819 return parse_lookaround(source, info, True, ch == "=")

820

821 # (?<...: a named capture group.

822 source.pos = saved_pos_3

823 name = parse_name(source)

824 group = info.open_group(name)

825 source.expect(">")

826 saved_flags = info.flags

827 try:

828 subpattern = _parse_pattern(source, info)

829 source.expect(")")

830 finally:

831 info.flags = saved_flags

832 source.ignore_space = bool(info.flags & VERBOSE)

833

834 info.close_group()

835 return Group(info, group, subpattern)

836 if ch in ("=", "!"):

837 # (?=... or (?!...: lookahead.

838 return parse_lookaround(source, info, False, ch == "=")

839 if ch == "P":

840 # (?P...: a Python extension.

841 return parse_extension(source, info)

842 if ch == "#":

843 # (?#...: a comment.

844 return parse_comment(source)

845 if ch == "(":

846 # (?(...: a conditional subpattern.

847 return parse_conditional(source, info)

848 if ch == ">":

849 # (?>...: an atomic subpattern.

850 return parse_atomic(source, info)

851 if ch == "|":

852 # (?|...: a common/reset groups branch.

853 return parse_common(source, info)

854 if ch == "R" or "0" <= ch <= "9":

855 # (?R...: probably a call to a group.

856 return parse_call_group(source, info, ch, saved_pos_2)

857 if ch == "&":

858 # (?&...: a call to a named group.

859 return parse_call_named_group(source, info, saved_pos_2)

860

861 # (?...: probably a flags subpattern.

862 source.pos = saved_pos_2

863 return parse_flags_subpattern(source, info)

864

865 if ch == "*":

866 # (*...

867 saved_pos_2 = source.pos

868 word = source.get_while(set(")>"), include=False)

869 if word[ : 1].isalpha():

870 verb = VERBS.get(word)

871 if not verb:

872 raise error("unknown verb", source.string, saved_pos_2)

873

874 source.expect(")")

875

876 return verb

877

878 # (...: an unnamed capture group.

879 source.pos = saved_pos

880 group = info.open_group()

881 saved_flags = info.flags

882 try:

883 subpattern = _parse_pattern(source, info)

884 source.expect(")")

885 finally:

886 info.flags = saved_flags

887 source.ignore_space = bool(info.flags & VERBOSE)

888

889 info.close_group()

890

891 return Group(info, group, subpattern)

892

893def parse_extension(source, info):

894 "Parses a Python extension."

895 saved_pos = source.pos

896 ch = source.get()

897 if ch == "<":

898 # (?P<...: a named capture group.

899 name = parse_name(source)

900 group = info.open_group(name)

901 source.expect(">")

902 saved_flags = info.flags

903 try:

904 subpattern = _parse_pattern(source, info)

905 source.expect(")")

906 finally:

907 info.flags = saved_flags

908 source.ignore_space = bool(info.flags & VERBOSE)

909

910 info.close_group()

911

912 return Group(info, group, subpattern)

913 if ch == "=":

914 # (?P=...: a named group reference.

915 name = parse_name(source, allow_numeric=True)

916 source.expect(")")

917 if info.is_open_group(name):

918 raise error("cannot refer to an open group", source.string,

919 saved_pos)

920

921 return make_ref_group(info, name, saved_pos)

922 if ch == ">" or ch == "&":

923 # (?P>...: a call to a group.

924 return parse_call_named_group(source, info, saved_pos)

925

926 source.pos = saved_pos

927 raise error("unknown extension", source.string, saved_pos)

928

929def parse_comment(source):

930 "Parses a comment."

931 while True:

932 saved_pos = source.pos

933 c = source.get(True)

934

935 if not c or c == ")":

936 break

937

938 if c == "\\":

939 c = source.get(True)

940

941 source.pos = saved_pos

942 source.expect(")")

943

944 return None

945

946def parse_lookaround(source, info, behind, positive):

947 "Parses a lookaround."

948 saved_flags = info.flags

949 try:

950 subpattern = _parse_pattern(source, info)

951 source.expect(")")

952 finally:

953 info.flags = saved_flags

954 source.ignore_space = bool(info.flags & VERBOSE)

955

956 return LookAround(behind, positive, subpattern)

957

958def parse_conditional(source, info):

959 "Parses a conditional subpattern."

960 saved_flags = info.flags

961 saved_pos = source.pos

962 ch = source.get()

963 if ch == "?":

964 # (?(?...

965 ch = source.get()

966 if ch in ("=", "!"):

967 # (?(?=... or (?(?!...: lookahead conditional.

968 return parse_lookaround_conditional(source, info, False, ch == "=")

969 if ch == "<":

970 # (?(?<...

971 ch = source.get()

972 if ch in ("=", "!"):

973 # (?(?<=... or (?(?<!...: lookbehind conditional.

974 return parse_lookaround_conditional(source, info, True, ch ==

975 "=")

976

977 source.pos = saved_pos

978 raise error("expected lookaround conditional", source.string,

979 source.pos)

980

981 source.pos = saved_pos

982 try:

983 group = parse_name(source, True)

984 source.expect(")")

985 yes_branch = parse_sequence(source, info)

986 if source.match("|"):

987 no_branch = parse_sequence(source, info)

988 else:

989 no_branch = Sequence()

990

991 source.expect(")")

992 finally:

993 info.flags = saved_flags

994 source.ignore_space = bool(info.flags & VERBOSE)

995

996 if yes_branch.is_empty() and no_branch.is_empty():

997 return Sequence()

998

999 return Conditional(info, group, yes_branch, no_branch, saved_pos)

1000

1001def parse_lookaround_conditional(source, info, behind, positive):

1002 saved_flags = info.flags

1003 try:

1004 subpattern = _parse_pattern(source, info)

1005 source.expect(")")

1006 finally:

1007 info.flags = saved_flags

1008 source.ignore_space = bool(info.flags & VERBOSE)

1009

1010 yes_branch = parse_sequence(source, info)

1011 if source.match("|"):

1012 no_branch = parse_sequence(source, info)

1013 else:

1014 no_branch = Sequence()

1015

1016 source.expect(")")

1017

1018 return LookAroundConditional(behind, positive, subpattern, yes_branch,

1019 no_branch)

1020

1021def parse_atomic(source, info):

1022 "Parses an atomic subpattern."

1023 saved_flags = info.flags

1024 try:

1025 subpattern = _parse_pattern(source, info)

1026 source.expect(")")

1027 finally:

1028 info.flags = saved_flags

1029 source.ignore_space = bool(info.flags & VERBOSE)

1030

1031 return Atomic(subpattern)

1032

1033def parse_common(source, info):

1034 "Parses a common groups branch."

1035 # Capture group numbers in different branches can reuse the group numbers.

1036 initial_group_count = info.group_count

1037 branches = [parse_sequence(source, info)]

1038 final_group_count = info.group_count

1039 while source.match("|"):

1040 info.group_count = initial_group_count

1041 branches.append(parse_sequence(source, info))

1042 final_group_count = max(final_group_count, info.group_count)

1043

1044 info.group_count = final_group_count

1045 source.expect(")")

1046

1047 if len(branches) == 1:

1048 return branches[0]

1049 return Branch(branches)

1050

1051def parse_call_group(source, info, ch, pos):

1052 "Parses a call to a group."

1053 if ch == "R":

1054 group = "0"

1055 else:

1056 group = ch + source.get_while(DIGITS)

1057

1058 source.expect(")")

1059

1060 return CallGroup(info, group, pos)

1061

1062def parse_call_named_group(source, info, pos):

1063 "Parses a call to a named group."

1064 group = parse_name(source)

1065 source.expect(")")

1066

1067 return CallGroup(info, group, pos)

1068

1069def parse_flag_set(source):

1070 "Parses a set of inline flags."

1071 flags = 0

1072

1073 try:

1074 while True:

1075 saved_pos = source.pos

1076 ch = source.get()

1077 if ch == "V":

1078 ch += source.get()

1079 flags |= REGEX_FLAGS[ch]

1080 except KeyError:

1081 source.pos = saved_pos

1082

1083 return flags

1084

1085def parse_flags(source, info):

1086 "Parses flags being turned on/off."

1087 flags_on = parse_flag_set(source)

1088 if source.match("-"):

1089 flags_off = parse_flag_set(source)

1090 if not flags_off:

1091 raise error("bad inline flags: no flags after '-'", source.string,

1092 source.pos)

1093 else:

1094 flags_off = 0

1095

1096 if flags_on & LOCALE:

1097 # Remember that this pattern as an inline locale flag.

1098 info.inline_locale = True

1099

1100 return flags_on, flags_off

1101

1102def parse_subpattern(source, info, flags_on, flags_off):

1103 "Parses a subpattern with scoped flags."

1104 saved_flags = info.flags

1105 info.flags = (info.flags | flags_on) & ~flags_off

1106 source.ignore_space = bool(info.flags & VERBOSE)

1107 try:

1108 subpattern = _parse_pattern(source, info)

1109 source.expect(")")

1110 finally:

1111 info.flags = saved_flags

1112 source.ignore_space = bool(info.flags & VERBOSE)

1113

1114 return subpattern

1115

1116def parse_flags_subpattern(source, info):

1117 """Parses a flags subpattern. It could be inline flags or a subpattern

1118 possibly with local flags. If it's a subpattern, then that's returned;

1119 if it's a inline flags, then None is returned.

1120 """

1121 flags_on, flags_off = parse_flags(source, info)

1122

1123 if flags_off & GLOBAL_FLAGS:

1124 raise error("bad inline flags: cannot turn off global flag",

1125 source.string, source.pos)

1126

1127 if flags_on & flags_off:

1128 raise error("bad inline flags: flag turned on and off", source.string,

1129 source.pos)

1130

1131 # Handle flags which are global in all regex behaviours.

1132 new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS

1133 if new_global_flags:

1134 info.global_flags |= new_global_flags

1135

1136 # A global has been turned on, so reparse the pattern.

1137 raise _UnscopedFlagSet(info.global_flags)

1138

1139 # Ensure that from now on we have only scoped flags.

1140 flags_on &= ~GLOBAL_FLAGS

1141

1142 if source.match(":"):

1143 return parse_subpattern(source, info, flags_on, flags_off)

1144

1145 if source.match(")"):

1146 parse_positional_flags(source, info, flags_on, flags_off)

1147 return None

1148

1149 raise error("unknown extension", source.string, source.pos)

1150

1151def parse_positional_flags(source, info, flags_on, flags_off):

1152 "Parses positional flags."

1153 info.flags = (info.flags | flags_on) & ~flags_off

1154 source.ignore_space = bool(info.flags & VERBOSE)

1155

1156def parse_name(source, allow_numeric=False, allow_group_0=False):

1157 "Parses a name."

1158 name = source.get_while(set(")>"), include=False)

1159

1160 if not name:

1161 raise error("missing group name", source.string, source.pos)

1162

1163 if name.isdigit():

1164 min_group = 0 if allow_group_0 else 1

1165 if not allow_numeric or int(name) < min_group:

1166 raise error("bad character in group name", source.string,

1167 source.pos)

1168 else:

1169 if not name.isidentifier():

1170 raise error("bad character in group name", source.string,

1171 source.pos)

1172

1173 return name

1174

1175def is_octal(string):

1176 "Checks whether a string is octal."

1177 return all(ch in OCT_DIGITS for ch in string)

1178

1179def is_decimal(string):

1180 "Checks whether a string is decimal."

1181 return all(ch in DIGITS for ch in string)

1182

1183def is_hexadecimal(string):

1184 "Checks whether a string is hexadecimal."

1185 return all(ch in HEX_DIGITS for ch in string)

1186

1187def parse_escape(source, info, in_set):

1188 "Parses an escape sequence."

1189 saved_ignore = source.ignore_space

1190 source.ignore_space = False

1191 ch = source.get()

1192 source.ignore_space = saved_ignore

1193 if not ch:

1194 # A backslash at the end of the pattern.

1195 raise error("bad escape (end of pattern)", source.string, source.pos)

1196 if ch in HEX_ESCAPES:

1197 # A hexadecimal escape sequence.

1198 return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)

1199 elif ch == "g" and not in_set:

1200 # A group reference.

1201 saved_pos = source.pos

1202 try:

1203 return parse_group_ref(source, info)

1204 except error:

1205 # Invalid as a group reference, so assume it's a literal.

1206 source.pos = saved_pos

1207

1208 return make_character(info, ord(ch), in_set)

1209 elif ch == "G" and not in_set:

1210 # A search anchor.

1211 return SearchAnchor()

1212 elif ch == "L" and not in_set:

1213 # A string set.

1214 return parse_string_set(source, info)

1215 elif ch == "N":

1216 # A named codepoint.

1217 return parse_named_char(source, info, in_set)

1218 elif ch in "pP":

1219 # A Unicode property, positive or negative.

1220 return parse_property(source, info, ch == "p", in_set)

1221 elif ch == "R" and not in_set:

1222 # A line ending.

1223 charset = [0x0A, 0x0B, 0x0C, 0x0D]

1224 if info.guess_encoding == UNICODE:

1225 charset.extend([0x85, 0x2028, 0x2029])

1226

1227 return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)

1228 for c in charset])]))

1229 elif ch == "X" and not in_set:

1230 # A grapheme cluster.

1231 return Grapheme()

1232 elif ch in ALPHA:

1233 # An alphabetic escape sequence.

1234 # Positional escapes aren't allowed inside a character set.

1235 if not in_set:

1236 if info.flags & WORD:

1237 value = WORD_POSITION_ESCAPES.get(ch)

1238 else:

1239 value = POSITION_ESCAPES.get(ch)

1240

1241 if value:

1242 return value

1243

1244 value = CHARSET_ESCAPES.get(ch)

1245 if value:

1246 return value

1247

1248 value = CHARACTER_ESCAPES.get(ch)

1249 if value:

1250 return Character(ord(value))

1251

1252 raise error("bad escape \\%s" % ch, source.string, source.pos)

1253 elif ch in DIGITS:

1254 # A numeric escape sequence.

1255 return parse_numeric_escape(source, info, ch, in_set)

1256 else:

1257 # A literal.

1258 return make_character(info, ord(ch), in_set)

1259

1260def parse_numeric_escape(source, info, ch, in_set):

1261 "Parses a numeric escape sequence."

1262 if in_set or ch == "0":

1263 # Octal escape sequence, max 3 digits.

1264 return parse_octal_escape(source, info, [ch], in_set)

1265

1266 # At least 1 digit, so either octal escape or group.

1267 digits = ch

1268 saved_pos = source.pos

1269 ch = source.get()

1270 if ch in DIGITS:

1271 # At least 2 digits, so either octal escape or group.

1272 digits += ch

1273 saved_pos = source.pos

1274 ch = source.get()

1275 if is_octal(digits) and ch in OCT_DIGITS:

1276 # 3 octal digits, so octal escape sequence.

1277 encoding = info.flags & _ALL_ENCODINGS

1278 if encoding == ASCII or encoding == LOCALE:

1279 octal_mask = 0xFF

1280 else:

1281 octal_mask = 0x1FF

1282

1283 value = int(digits + ch, 8) & octal_mask

1284 return make_character(info, value)

1285

1286 # Group reference.

1287 source.pos = saved_pos

1288 if info.is_open_group(digits):

1289 raise error("cannot refer to an open group", source.string, source.pos)

1290

1291 return make_ref_group(info, digits, source.pos)

1292

1293def parse_octal_escape(source, info, digits, in_set):

1294 "Parses an octal escape sequence."

1295 saved_pos = source.pos

1296 ch = source.get()

1297 while len(digits) < 3 and ch in OCT_DIGITS:

1298 digits.append(ch)

1299 saved_pos = source.pos

1300 ch = source.get()

1301

1302 source.pos = saved_pos

1303 try:

1304 value = int("".join(digits), 8)

1305 return make_character(info, value, in_set)

1306 except ValueError:

1307 if digits[0] in OCT_DIGITS:

1308 raise error("incomplete escape \\%s" % ''.join(digits),

1309 source.string, source.pos)

1310 else:

1311 raise error("bad escape \\%s" % digits[0], source.string,

1312 source.pos)

1313

1314def parse_hex_escape(source, info, esc, expected_len, in_set, type):

1315 "Parses a hex escape sequence."

1316 saved_pos = source.pos

1317 digits = []

1318 for i in range(expected_len):

1319 ch = source.get()

1320 if ch not in HEX_DIGITS:

1321 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),

1322 source.string, saved_pos)

1323 digits.append(ch)

1324

1325 try:

1326 value = int("".join(digits), 16)

1327 except ValueError:

1328 pass

1329 else:

1330 if value < 0x110000:

1331 return make_character(info, value, in_set)

1332

1333 # Bad hex escape.

1334 raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),

1335 source.string, saved_pos)

1336

1337def parse_group_ref(source, info):

1338 "Parses a group reference."

1339 source.expect("<")

1340 saved_pos = source.pos

1341 name = parse_name(source, True)

1342 source.expect(">")

1343 if info.is_open_group(name):

1344 raise error("cannot refer to an open group", source.string, source.pos)

1345

1346 return make_ref_group(info, name, saved_pos)

1347

1348def parse_string_set(source, info):

1349 "Parses a string set reference."

1350 source.expect("<")

1351 name = parse_name(source, True)

1352 source.expect(">")

1353 if name is None or name not in info.kwargs:

1354 raise error("undefined named list", source.string, source.pos)

1355

1356 return make_string_set(info, name)

1357

1358def parse_named_char(source, info, in_set):

1359 "Parses a named character."

1360 saved_pos = source.pos

1361 if source.match("{"):

1362 name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)

1363 if source.match("}"):

1364 try:

1365 value = unicodedata.lookup(name)

1366 return make_character(info, ord(value), in_set)

1367 except KeyError:

1368 raise error("undefined character name", source.string,

1369 source.pos)

1370

1371 source.pos = saved_pos

1372 return make_character(info, ord("N"), in_set)

1373

1374def parse_property(source, info, positive, in_set):

1375 "Parses a Unicode property."

1376 saved_pos = source.pos

1377 ch = source.get()

1378 if ch == "{":

1379 negate = source.match("^")

1380 prop_name, name = parse_property_name(source)

1381 if source.match("}"):

1382 # It's correctly delimited.

1383 prop = lookup_property(prop_name, name, positive != negate, source)

1384 return make_property(info, prop, in_set)

1385 elif ch and ch in "CLMNPSZ":

1386 # An abbreviated property, eg \pL.

1387 prop = lookup_property(None, ch, positive, source)

1388 return make_property(info, prop, in_set)

1389

1390 # Not a property, so treat as a literal "p" or "P".

1391 source.pos = saved_pos

1392 ch = "p" if positive else "P"

1393 return make_character(info, ord(ch), in_set)

1394

1395def parse_property_name(source):

1396 "Parses a property name, which may be qualified."

1397 name = source.get_while(PROPERTY_NAME_PART)

1398 saved_pos = source.pos

1399

1400 ch = source.get()

1401 if ch and ch in ":=":

1402 prop_name = name

1403 name = source.get_while(ALNUM | set(" &_-./")).strip()

1404

1405 if name:

1406 # Name after the ":" or "=", so it's a qualified name.

1407 saved_pos = source.pos

1408 else:

1409 # No name after the ":" or "=", so assume it's an unqualified name.

1410 prop_name, name = None, prop_name

1411 else:

1412 prop_name = None

1413

1414 source.pos = saved_pos

1415 return prop_name, name

1416

1417def parse_set(source, info):

1418 "Parses a character set."

1419 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1420

1421 saved_ignore = source.ignore_space

1422 source.ignore_space = False

1423 # Negative set?

1424 negate = source.match("^")

1425 try:

1426 if version == VERSION0:

1427 item = parse_set_imp_union(source, info)

1428 else:

1429 item = parse_set_union(source, info)

1430

1431 if not source.match("]"):

1432 raise error("missing ]", source.string, source.pos)

1433 finally:

1434 source.ignore_space = saved_ignore

1435

1436 if negate:

1437 item = item.with_flags(positive=not item.positive)

1438

1439 item = item.with_flags(case_flags=make_case_flags(info))

1440

1441 return item

1442

1443def parse_set_union(source, info):

1444 "Parses a set union ([x||y])."

1445 items = [parse_set_symm_diff(source, info)]

1446 while source.match("||"):

1447 items.append(parse_set_symm_diff(source, info))

1448

1449 if len(items) == 1:

1450 return items[0]

1451 return SetUnion(info, items)

1452

1453def parse_set_symm_diff(source, info):

1454 "Parses a set symmetric difference ([x~~y])."

1455 items = [parse_set_inter(source, info)]

1456 while source.match("~~"):

1457 items.append(parse_set_inter(source, info))

1458

1459 if len(items) == 1:

1460 return items[0]

1461 return SetSymDiff(info, items)

1462

1463def parse_set_inter(source, info):

1464 "Parses a set intersection ([x&&y])."

1465 items = [parse_set_diff(source, info)]

1466 while source.match("&&"):

1467 items.append(parse_set_diff(source, info))

1468

1469 if len(items) == 1:

1470 return items[0]

1471 return SetInter(info, items)

1472

1473def parse_set_diff(source, info):

1474 "Parses a set difference ([x--y])."

1475 items = [parse_set_imp_union(source, info)]

1476 while source.match("--"):

1477 items.append(parse_set_imp_union(source, info))

1478

1479 if len(items) == 1:

1480 return items[0]

1481 return SetDiff(info, items)

1482

1483def parse_set_imp_union(source, info):

1484 "Parses a set implicit union ([xy])."

1485 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1486

1487 items = [parse_set_member(source, info)]

1488 while True:

1489 saved_pos = source.pos

1490 if source.match("]"):

1491 # End of the set.

1492 source.pos = saved_pos

1493 break

1494

1495 if version == VERSION1 and any(source.match(op) for op in SET_OPS):

1496 # The new behaviour has set operators.

1497 source.pos = saved_pos

1498 break

1499

1500 items.append(parse_set_member(source, info))

1501

1502 if len(items) == 1:

1503 return items[0]

1504 return SetUnion(info, items)

1505

1506def parse_set_member(source, info):

1507 "Parses a member in a character set."

1508 # Parse a set item.

1509 start = parse_set_item(source, info)

1510 saved_pos1 = source.pos

1511 if (not isinstance(start, Character) or not start.positive or not

1512 source.match("-")):

1513 # It's not the start of a range.

1514 return start

1515

1516 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1517

1518 # It looks like the start of a range of characters.

1519 saved_pos2 = source.pos

1520 if version == VERSION1 and source.match("-"):

1521 # It's actually the set difference operator '--', so return the

1522 # character.

1523 source.pos = saved_pos1

1524 return start

1525

1526 if source.match("]"):

1527 # We've reached the end of the set, so return both the character and

1528 # hyphen.

1529 source.pos = saved_pos2

1530 return SetUnion(info, [start, Character(ord("-"))])

1531

1532 # Parse a set item.

1533 end = parse_set_item(source, info)

1534 if not isinstance(end, Character) or not end.positive:

1535 # It's not a range, so return the character, hyphen and property.

1536 return SetUnion(info, [start, Character(ord("-")), end])

1537

1538 # It _is_ a range.

1539 if start.value > end.value:

1540 raise error("bad character range", source.string, source.pos)

1541

1542 if start.value == end.value:

1543 return start

1544

1545 return Range(start.value, end.value)

1546

1547def parse_set_item(source, info):

1548 "Parses an item in a character set."

1549 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1550

1551 if source.match("\\"):

1552 # An escape sequence in a set.

1553 return parse_escape(source, info, True)

1554

1555 saved_pos = source.pos

1556 if source.match("[:"):

1557 # Looks like a POSIX character class.

1558 try:

1559 return parse_posix_class(source, info)

1560 except ParseError:

1561 # Not a POSIX character class.

1562 source.pos = saved_pos

1563

1564 if version == VERSION1 and source.match("["):

1565 # It's the start of a nested set.

1566

1567 # Negative set?

1568 negate = source.match("^")

1569 item = parse_set_union(source, info)

1570

1571 if not source.match("]"):

1572 raise error("missing ]", source.string, source.pos)

1573

1574 if negate:

1575 item = item.with_flags(positive=not item.positive)

1576

1577 return item

1578

1579 ch = source.get()

1580 if not ch:

1581 raise error("unterminated character set", source.string, source.pos)

1582

1583 return Character(ord(ch))

1584

1585def parse_posix_class(source, info):

1586 "Parses a POSIX character class."

1587 negate = source.match("^")

1588 prop_name, name = parse_property_name(source)

1589 if not source.match(":]"):

1590 raise ParseError()

1591

1592 return lookup_property(prop_name, name, not negate, source, posix=True)

1593

1594def float_to_rational(flt):

1595 "Converts a float to a rational pair."

1596 int_part = int(flt)

1597 error = flt - int_part

1598 if abs(error) < 0.0001:

1599 return int_part, 1

1600

1601 den, num = float_to_rational(1.0 / error)

1602

1603 return int_part * den + num, den

1604

1605def numeric_to_rational(numeric):

1606 "Converts a numeric string to a rational string, if possible."

1607 if numeric[ : 1] == "-":

1608 sign, numeric = numeric[0], numeric[1 : ]

1609 else:

1610 sign = ""

1611

1612 parts = numeric.split("/")

1613 if len(parts) == 2:

1614 num, den = float_to_rational(float(parts[0]) / float(parts[1]))

1615 elif len(parts) == 1:

1616 num, den = float_to_rational(float(parts[0]))

1617 else:

1618 raise ValueError()

1619

1620 result = "{}{}/{}".format(sign, num, den)

1621 if result.endswith("/1"):

1622 return result[ : -2]

1623

1624 return result

1625

1626def standardise_name(name):

1627 "Standardises a property or value name."

1628 try:

1629 return numeric_to_rational("".join(name))

1630 except (ValueError, ZeroDivisionError):

1631 return "".join(ch for ch in name if ch not in "_- ").upper()

1632

1633_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

1634

1635_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

1636

1637def lookup_property(property, value, positive, source=None, posix=False):

1638 "Looks up a property."

1639 # Normalise the names (which may still be lists).

1640 property = standardise_name(property) if property else None

1641 value = standardise_name(value)

1642

1643 if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):

1644 property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

1645

1646 if posix and not property and value.upper() in _POSIX_CLASSES:

1647 value = 'POSIX' + value

1648

1649 if property:

1650 # Both the property and the value are provided.

1651 prop = PROPERTIES.get(property)

1652 if not prop:

1653 if not source:

1654 raise error("unknown property")

1655

1656 raise error("unknown property", source.string, source.pos)

1657

1658 prop_id, value_dict = prop

1659 val_id = value_dict.get(value)

1660 if val_id is None:

1661 if not source:

1662 raise error("unknown property value")

1663

1664 raise error("unknown property value", source.string, source.pos)

1665

1666 return Property((prop_id << 16) | val_id, positive)

1667

1668 # Only the value is provided.

1669 # It might be the name of a GC, script or block value.

1670 for property in ("GC", "SCRIPT", "BLOCK"):

1671 prop_id, value_dict = PROPERTIES.get(property)

1672 val_id = value_dict.get(value)

1673 if val_id is not None:

1674 return Property((prop_id << 16) | val_id, positive)

1675

1676 # It might be the name of a binary property.

1677 prop = PROPERTIES.get(value)

1678 if prop:

1679 prop_id, value_dict = prop

1680 if set(value_dict) == _BINARY_VALUES:

1681 return Property((prop_id << 16) | 1, positive)

1682

1683 return Property(prop_id << 16, not positive)

1684

1685 # It might be the name of a binary property starting with a prefix.

1686 if value.startswith("IS"):

1687 prop = PROPERTIES.get(value[2 : ])

1688 if prop:

1689 prop_id, value_dict = prop

1690 if "YES" in value_dict:

1691 return Property((prop_id << 16) | 1, positive)

1692

1693 # It might be the name of a script or block starting with a prefix.

1694 for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):

1695 if value.startswith(prefix):

1696 prop_id, value_dict = PROPERTIES.get(property)

1697 val_id = value_dict.get(value[2 : ])

1698 if val_id is not None:

1699 return Property((prop_id << 16) | val_id, positive)

1700

1701 # Unknown property.

1702 if not source:

1703 raise error("unknown property")

1704

1705 raise error("unknown property", source.string, source.pos)

1706

1707def _compile_replacement(source, pattern, is_unicode):

1708 "Compiles a replacement template escape sequence."

1709 ch = source.get()

1710 if ch in ALPHA:

1711 # An alphabetic escape sequence.

1712 value = CHARACTER_ESCAPES.get(ch)

1713 if value:

1714 return False, [ord(value)]

1715

1716 if ch in HEX_ESCAPES and (ch == "x" or is_unicode):

1717 # A hexadecimal escape sequence.

1718 return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

1719

1720 if ch == "g":

1721 # A group preference.

1722 return True, [compile_repl_group(source, pattern)]

1723

1724 if ch == "N" and is_unicode:

1725 # A named character.

1726 value = parse_repl_named_char(source)

1727 if value is not None:

1728 return False, [value]

1729

1730 raise error("bad escape \\%s" % ch, source.string, source.pos)

1731

1732 if isinstance(source.sep, bytes):

1733 octal_mask = 0xFF

1734 else:

1735 octal_mask = 0x1FF

1736

1737 if ch == "0":

1738 # An octal escape sequence.

1739 digits = ch

1740 while len(digits) < 3:

1741 saved_pos = source.pos

1742 ch = source.get()

1743 if ch not in OCT_DIGITS:

1744 source.pos = saved_pos

1745 break

1746 digits += ch

1747

1748 return False, [int(digits, 8) & octal_mask]

1749

1750 if ch in DIGITS:

1751 # Either an octal escape sequence (3 digits) or a group reference (max

1752 # 2 digits).

1753 digits = ch

1754 saved_pos = source.pos

1755 ch = source.get()

1756 if ch in DIGITS:

1757 digits += ch

1758 saved_pos = source.pos

1759 ch = source.get()

1760 if ch and is_octal(digits + ch):

1761 # An octal escape sequence.

1762 return False, [int(digits + ch, 8) & octal_mask]

1763

1764 # A group reference.

1765 source.pos = saved_pos

1766 return True, [int(digits)]

1767

1768 if ch == "\\":

1769 # An escaped backslash is a backslash.

1770 return False, [ord("\\")]

1771

1772 if not ch:

1773 # A trailing backslash.

1774 raise error("bad escape (end of pattern)", source.string, source.pos)

1775

1776 # An escaped non-backslash is a backslash followed by the literal.

1777 return False, [ord("\\"), ord(ch)]

1778

1779def parse_repl_hex_escape(source, expected_len, type):

1780 "Parses a hex escape sequence in a replacement string."

1781 digits = []

1782 for i in range(expected_len):

1783 ch = source.get()

1784 if ch not in HEX_DIGITS:

1785 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),

1786 source.string, source.pos)

1787 digits.append(ch)

1788

1789 return int("".join(digits), 16)

1790

1791def parse_repl_named_char(source):

1792 "Parses a named character in a replacement string."

1793 saved_pos = source.pos

1794 if source.match("{"):

1795 name = source.get_while(ALPHA | set(" "))

1796

1797 if source.match("}"):

1798 try:

1799 value = unicodedata.lookup(name)

1800 return ord(value)

1801 except KeyError:

1802 raise error("undefined character name", source.string,

1803 source.pos)

1804

1805 source.pos = saved_pos

1806 return None

1807

1808def compile_repl_group(source, pattern):

1809 "Compiles a replacement template group reference."

1810 source.expect("<")

1811 name = parse_name(source, True, True)

1812

1813 source.expect(">")

1814 if name.isdigit():

1815 index = int(name)

1816 if not 0 <= index <= pattern.groups:

1817 raise error("invalid group reference", source.string, source.pos)

1818

1819 return index

1820

1821 try:

1822 return pattern.groupindex[name]

1823 except KeyError:

1824 raise IndexError("unknown group")

1825

1826# The regular expression is parsed into a syntax tree. The different types of

1827# node are defined below.

1828

1829INDENT = " "

1830POSITIVE_OP = 0x1

1831ZEROWIDTH_OP = 0x2

1832FUZZY_OP = 0x4

1833REVERSE_OP = 0x8

1834REQUIRED_OP = 0x10

1835

1836POS_TEXT = {False: "NON-MATCH", True: "MATCH"}

1837CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",

1838 FULLIGNORECASE: " FULL_IGNORE_CASE"}

1839

1840def make_sequence(items):

1841 if len(items) == 1:

1842 return items[0]

1843 return Sequence(items)

1844

1845# Common base class for all nodes.

1846class RegexBase:

1847 def __init__(self):

1848 self._key = self.__class__

1849

1850 def with_flags(self, positive=None, case_flags=None, zerowidth=None):

1851 if positive is None:

1852 positive = self.positive

1853 else:

1854 positive = bool(positive)

1855 if case_flags is None:

1856 case_flags = self.case_flags

1857 else:

1858 case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]

1859 if zerowidth is None:

1860 zerowidth = self.zerowidth

1861 else:

1862 zerowidth = bool(zerowidth)

1863

1864 if (positive == self.positive and case_flags == self.case_flags and

1865 zerowidth == self.zerowidth):

1866 return self

1867

1868 return self.rebuild(positive, case_flags, zerowidth)

1869

1870 def fix_groups(self, pattern, reverse, fuzzy):

1871 pass

1872

1873 def optimise(self, info, reverse):

1874 return self

1875

1876 def pack_characters(self, info):

1877 return self

1878

1879 def remove_captures(self):

1880 return self

1881

1882 def is_atomic(self):

1883 return True

1884

1885 def can_be_affix(self):

1886 return True

1887

1888 def contains_group(self):

1889 return False

1890

1891 def get_firstset(self, reverse):

1892 raise _FirstSetError()

1893

1894 def has_simple_start(self):

1895 return False

1896

1897 def compile(self, reverse=False, fuzzy=False):

1898 return self._compile(reverse, fuzzy)

1899

1900 def is_empty(self):

1901 return False

1902

1903 def __hash__(self):

1904 return hash(self._key)

1905

1906 def __eq__(self, other):

1907 return type(self) is type(other) and self._key == other._key

1908

1909 def __ne__(self, other):

1910 return not self.__eq__(other)

1911

1912 def get_required_string(self, reverse):

1913 return self.max_width(), None

1914

1915# Base class for zero-width nodes.

1916class ZeroWidthBase(RegexBase):

1917 def __init__(self, positive=True):

1918 RegexBase.__init__(self)

1919 self.positive = bool(positive)

1920

1921 self._key = self.__class__, self.positive

1922

1923 def get_firstset(self, reverse):

1924 return set([None])

1925

1926 def _compile(self, reverse, fuzzy):

1927 flags = 0

1928 if self.positive:

1929 flags |= POSITIVE_OP

1930 if fuzzy:

1931 flags |= FUZZY_OP

1932 if reverse:

1933 flags |= REVERSE_OP

1934 return [(self._opcode, flags)]

1935

1936 def dump(self, indent, reverse):

1937 print("{}{} {}".format(INDENT * indent, self._op_name,

1938 POS_TEXT[self.positive]))

1939

1940 def max_width(self):

1941 return 0

1942

1943class Any(RegexBase):

1944 _opcode = {False: OP.ANY, True: OP.ANY_REV}

1945 _op_name = "ANY"

1946

1947 def has_simple_start(self):

1948 return True

1949

1950 def _compile(self, reverse, fuzzy):

1951 flags = 0

1952 if fuzzy:

1953 flags |= FUZZY_OP

1954 return [(self._opcode[reverse], flags)]

1955

1956 def dump(self, indent, reverse):

1957 print("{}{}".format(INDENT * indent, self._op_name))

1958

1959 def max_width(self):

1960 return 1

1961

1962class AnyAll(Any):

1963 _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}

1964 _op_name = "ANY_ALL"

1965

1966class AnyU(Any):

1967 _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}

1968 _op_name = "ANY_U"

1969

1970class Atomic(RegexBase):

1971 def __init__(self, subpattern):

1972 RegexBase.__init__(self)

1973 self.subpattern = subpattern

1974

1975 def fix_groups(self, pattern, reverse, fuzzy):

1976 self.subpattern.fix_groups(pattern, reverse, fuzzy)

1977

1978 def optimise(self, info, reverse):

1979 self.subpattern = self.subpattern.optimise(info, reverse)

1980

1981 if self.subpattern.is_empty():

1982 return self.subpattern

1983 return self

1984

1985 def pack_characters(self, info):

1986 self.subpattern = self.subpattern.pack_characters(info)

1987 return self

1988

1989 def remove_captures(self):

1990 self.subpattern = self.subpattern.remove_captures()

1991 return self

1992

1993 def can_be_affix(self):

1994 return self.subpattern.can_be_affix()

1995

1996 def contains_group(self):

1997 return self.subpattern.contains_group()

1998

1999 def get_firstset(self, reverse):

2000 return self.subpattern.get_firstset(reverse)

2001

2002 def has_simple_start(self):

2003 return self.subpattern.has_simple_start()

2004

2005 def _compile(self, reverse, fuzzy):

2006 return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +

2007 [(OP.END, )])

2008

2009 def dump(self, indent, reverse):

2010 print("{}ATOMIC".format(INDENT * indent))

2011 self.subpattern.dump(indent + 1, reverse)

2012

2013 def is_empty(self):

2014 return self.subpattern.is_empty()

2015

2016 def __eq__(self, other):

2017 return (type(self) is type(other) and self.subpattern ==

2018 other.subpattern)

2019

2020 def max_width(self):

2021 return self.subpattern.max_width()

2022

2023 def get_required_string(self, reverse):

2024 return self.subpattern.get_required_string(reverse)

2025

2026class Boundary(ZeroWidthBase):

2027 _opcode = OP.BOUNDARY

2028 _op_name = "BOUNDARY"

2029

2030class Branch(RegexBase):

2031 def __init__(self, branches):

2032 RegexBase.__init__(self)

2033 self.branches = branches

2034

2035 def fix_groups(self, pattern, reverse, fuzzy):

2036 for b in self.branches:

2037 b.fix_groups(pattern, reverse, fuzzy)

2038

2039 def optimise(self, info, reverse):

2040 if not self.branches:

2041 return Sequence([])

2042

2043 # Flatten branches within branches.

2044 branches = Branch._flatten_branches(info, reverse, self.branches)

2045

2046 # Move any common prefix or suffix out of the branches.

2047 if reverse:

2048 suffix, branches = Branch._split_common_suffix(info, branches)

2049 prefix = []

2050 else:

2051 prefix, branches = Branch._split_common_prefix(info, branches)

2052 suffix = []

2053

2054 # Try to reduce adjacent single-character branches to sets.

2055 branches = Branch._reduce_to_set(info, reverse, branches)

2056

2057 if len(branches) > 1:

2058 sequence = [Branch(branches)]

2059

2060 if not prefix or not suffix:

2061 # We might be able to add a quick precheck before the branches.

2062 firstset = self._add_precheck(info, reverse, branches)

2063

2064 if firstset:

2065 if reverse:

2066 sequence.append(firstset)

2067 else:

2068 sequence.insert(0, firstset)

2069 else:

2070 sequence = branches

2071

2072 return make_sequence(prefix + sequence + suffix)

2073

2074 def _add_precheck(self, info, reverse, branches):

2075 charset = set()

2076 pos = -1 if reverse else 0

2077

2078 for branch in branches:

2079 if type(branch) is Literal and branch.case_flags == NOCASE:

2080 charset.add(branch.characters[pos])

2081 else:

2082 return

2083

2084 if not charset:

2085 return None

2086

2087 return _check_firstset(info, reverse, [Character(c) for c in charset])

2088

2089 def pack_characters(self, info):

2090 self.branches = [b.pack_characters(info) for b in self.branches]

2091 return self

2092

2093 def remove_captures(self):

2094 self.branches = [b.remove_captures() for b in self.branches]

2095 return self

2096

2097 def is_atomic(self):

2098 return all(b.is_atomic() for b in self.branches)

2099

2100 def can_be_affix(self):

2101 return all(b.can_be_affix() for b in self.branches)

2102

2103 def contains_group(self):

2104 return any(b.contains_group() for b in self.branches)

2105

2106 def get_firstset(self, reverse):

2107 fs = set()

2108 for b in self.branches:

2109 fs |= b.get_firstset(reverse)

2110

2111 return fs or set([None])

2112

2113 def _compile(self, reverse, fuzzy):

2114 if not self.branches:

2115 return []

2116

2117 code = [(OP.BRANCH, )]

2118 for b in self.branches:

2119 code.extend(b.compile(reverse, fuzzy))

2120 code.append((OP.NEXT, ))

2121

2122 code[-1] = (OP.END, )

2123

2124 return code

2125

2126 def dump(self, indent, reverse):

2127 print("{}BRANCH".format(INDENT * indent))

2128 self.branches[0].dump(indent + 1, reverse)

2129 for b in self.branches[1 : ]:

2130 print("{}OR".format(INDENT * indent))

2131 b.dump(indent + 1, reverse)

2132

2133 @staticmethod

2134 def _flatten_branches(info, reverse, branches):

2135 # Flatten the branches so that there aren't branches of branches.

2136 new_branches = []

2137 for b in branches:

2138 b = b.optimise(info, reverse)

2139 if isinstance(b, Branch):

2140 new_branches.extend(b.branches)

2141 else:

2142 new_branches.append(b)

2143

2144 return new_branches

2145

2146 @staticmethod

2147 def _split_common_prefix(info, branches):

2148 # Common leading items can be moved out of the branches.

2149 # Get the items in the branches.

2150 alternatives = []

2151 for b in branches:

2152 if isinstance(b, Sequence):

2153 alternatives.append(b.items)

2154 else:

2155 alternatives.append([b])

2156

2157 # What is the maximum possible length of the prefix?

2158 max_count = min(len(a) for a in alternatives)

2159

2160 # What is the longest common prefix?

2161 prefix = alternatives[0]

2162 pos = 0

2163 end_pos = max_count

2164 while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==

2165 prefix[pos] for a in alternatives):

2166 pos += 1

2167 count = pos

2168

2169 if info.flags & UNICODE:

2170 # We need to check that we're not splitting a sequence of

2171 # characters which could form part of full case-folding.

2172 count = pos

2173 while count > 0 and not all(Branch._can_split(a, count) for a in

2174 alternatives):

2175 count -= 1

2176

2177 # No common prefix is possible.

2178 if count == 0:

2179 return [], branches

2180

2181 # Rebuild the branches.

2182 new_branches = []

2183 for a in alternatives:

2184 new_branches.append(make_sequence(a[count : ]))

2185

2186 return prefix[ : count], new_branches

2187

2188 @staticmethod

2189 def _split_common_suffix(info, branches):

2190 # Common trailing items can be moved out of the branches.

2191 # Get the items in the branches.

2192 alternatives = []

2193 for b in branches:

2194 if isinstance(b, Sequence):

2195 alternatives.append(b.items)

2196 else:

2197 alternatives.append([b])

2198

2199 # What is the maximum possible length of the suffix?

2200 max_count = min(len(a) for a in alternatives)

2201

2202 # What is the longest common suffix?

2203 suffix = alternatives[0]

2204 pos = -1

2205 end_pos = -1 - max_count

2206 while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==

2207 suffix[pos] for a in alternatives):

2208 pos -= 1

2209 count = -1 - pos

2210

2211 if info.flags & UNICODE:

2212 # We need to check that we're not splitting a sequence of

2213 # characters which could form part of full case-folding.

2214 while count > 0 and not all(Branch._can_split_rev(a, count) for a

2215 in alternatives):

2216 count -= 1

2217

2218 # No common suffix is possible.

2219 if count == 0:

2220 return [], branches

2221

2222 # Rebuild the branches.

2223 new_branches = []

2224 for a in alternatives:

2225 new_branches.append(make_sequence(a[ : -count]))

2226

2227 return suffix[-count : ], new_branches

2228

2229 @staticmethod

2230 def _can_split(items, count):

2231 # Check the characters either side of the proposed split.

2232 if not Branch._is_full_case(items, count - 1):

2233 return True

2234

2235 if not Branch._is_full_case(items, count):

2236 return True

2237

2238 # Check whether a 1-1 split would be OK.

2239 if Branch._is_folded(items[count - 1 : count + 1]):

2240 return False

2241

2242 # Check whether a 1-2 split would be OK.

2243 if (Branch._is_full_case(items, count + 2) and

2244 Branch._is_folded(items[count - 1 : count + 2])):

2245 return False

2246

2247 # Check whether a 2-1 split would be OK.

2248 if (Branch._is_full_case(items, count - 2) and

2249 Branch._is_folded(items[count - 2 : count + 1])):

2250 return False

2251

2252 return True

2253

2254 @staticmethod

2255 def _can_split_rev(items, count):

2256 end = len(items)

2257

2258 # Check the characters either side of the proposed split.

2259 if not Branch._is_full_case(items, end - count):

2260 return True

2261

2262 if not Branch._is_full_case(items, end - count - 1):

2263 return True

2264

2265 # Check whether a 1-1 split would be OK.

2266 if Branch._is_folded(items[end - count - 1 : end - count + 1]):

2267 return False

2268

2269 # Check whether a 1-2 split would be OK.

2270 if (Branch._is_full_case(items, end - count + 2) and

2271 Branch._is_folded(items[end - count - 1 : end - count + 2])):

2272 return False

2273

2274 # Check whether a 2-1 split would be OK.

2275 if (Branch._is_full_case(items, end - count - 2) and

2276 Branch._is_folded(items[end - count - 2 : end - count + 1])):

2277 return False

2278

2279 return True

2280

2281 @staticmethod

2282 def _merge_common_prefixes(info, reverse, branches):

2283 # Branches with the same case-sensitive character prefix can be grouped

2284 # together if they are separated only by other branches with a

2285 # character prefix.

2286 prefixed = defaultdict(list)

2287 order = {}

2288 new_branches = []

2289 for b in branches:

2290 if Branch._is_simple_character(b):

2291 # Branch starts with a simple character.

2292 prefixed[b.value].append([b])

2293 order.setdefault(b.value, len(order))

2294 elif (isinstance(b, Sequence) and b.items and

2295 Branch._is_simple_character(b.items[0])):

2296 # Branch starts with a simple character.

2297 prefixed[b.items[0].value].append(b.items)

2298 order.setdefault(b.items[0].value, len(order))

2299 else:

2300 Branch._flush_char_prefix(info, reverse, prefixed, order,

2301 new_branches)

2302

2303 new_branches.append(b)

2304

2305 Branch._flush_char_prefix(info, prefixed, order, new_branches)

2306

2307 return new_branches

2308

2309 @staticmethod

2310 def _is_simple_character(c):

2311 return isinstance(c, Character) and c.positive and not c.case_flags

2312

2313 @staticmethod

2314 def _reduce_to_set(info, reverse, branches):

2315 # Can the branches be reduced to a set?

2316 new_branches = []

2317 items = set()

2318 case_flags = NOCASE

2319 for b in branches:

2320 if isinstance(b, (Character, Property, SetBase)):

2321 # Branch starts with a single character.

2322 if b.case_flags != case_flags:

2323 # Different case sensitivity, so flush.

2324 Branch._flush_set_members(info, reverse, items, case_flags,

2325 new_branches)

2326

2327 case_flags = b.case_flags

2328

2329 items.add(b.with_flags(case_flags=NOCASE))

2330 else:

2331 Branch._flush_set_members(info, reverse, items, case_flags,

2332 new_branches)

2333

2334 new_branches.append(b)

2335

2336 Branch._flush_set_members(info, reverse, items, case_flags,

2337 new_branches)

2338

2339 return new_branches

2340

2341 @staticmethod

2342 def _flush_char_prefix(info, reverse, prefixed, order, new_branches):

2343 # Flush the prefixed branches.

2344 if not prefixed:

2345 return

2346

2347 for value, branches in sorted(prefixed.items(), key=lambda pair:

2348 order[pair[0]]):

2349 if len(branches) == 1:

2350 new_branches.append(make_sequence(branches[0]))

2351 else:

2352 subbranches = []

2353 optional = False

2354 for b in branches:

2355 if len(b) > 1:

2356 subbranches.append(make_sequence(b[1 : ]))

2357 elif not optional:

2358 subbranches.append(Sequence())

2359 optional = True

2360

2361 sequence = Sequence([Character(value), Branch(subbranches)])

2362 new_branches.append(sequence.optimise(info, reverse))

2363

2364 prefixed.clear()

2365 order.clear()

2366

2367 @staticmethod

2368 def _flush_set_members(info, reverse, items, case_flags, new_branches):

2369 # Flush the set members.

2370 if not items:

2371 return

2372

2373 if len(items) == 1:

2374 item = list(items)[0]

2375 else:

2376 item = SetUnion(info, list(items)).optimise(info, reverse)

2377

2378 new_branches.append(item.with_flags(case_flags=case_flags))

2379

2380 items.clear()

2381

2382 @staticmethod

2383 def _is_full_case(items, i):

2384 if not 0 <= i < len(items):

2385 return False

2386

2387 item = items[i]

2388 return (isinstance(item, Character) and item.positive and

2389 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

2390

2391 @staticmethod

2392 def _is_folded(items):

2393 if len(items) < 2:

2394 return False

2395

2396 for i in items:

2397 if (not isinstance(i, Character) or not i.positive or not

2398 i.case_flags):

2399 return False

2400

2401 folded = "".join(chr(i.value) for i in items)

2402 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

2403

2404 # Get the characters which expand to multiple codepoints on folding.

2405 expanding_chars = _regex.get_expand_on_folding()

2406

2407 for c in expanding_chars:

2408 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):

2409 return True

2410

2411 return False

2412

2413 def is_empty(self):

2414 return all(b.is_empty() for b in self.branches)

2415

2416 def __eq__(self, other):

2417 return type(self) is type(other) and self.branches == other.branches

2418

2419 def max_width(self):

2420 return max(b.max_width() for b in self.branches)

2421

2422class CallGroup(RegexBase):

2423 def __init__(self, info, group, position):

2424 RegexBase.__init__(self)

2425 self.info = info

2426 self.group = group

2427 self.position = position

2428

2429 self._key = self.__class__, self.group

2430

2431 def fix_groups(self, pattern, reverse, fuzzy):

2432 try:

2433 self.group = int(self.group)

2434 except ValueError:

2435 try:

2436 self.group = self.info.group_index[self.group]

2437 except KeyError:

2438 raise error("invalid group reference", pattern, self.position)

2439

2440 if not 0 <= self.group <= self.info.group_count:

2441 raise error("unknown group", pattern, self.position)

2442

2443 if self.group > 0 and self.info.open_group_count[self.group] > 1:

2444 raise error("ambiguous group reference", pattern, self.position)

2445

2446 self.info.group_calls.append((self, reverse, fuzzy))

2447

2448 self._key = self.__class__, self.group

2449

2450 def remove_captures(self):

2451 raise error("group reference not allowed", pattern, self.position)

2452

2453 def _compile(self, reverse, fuzzy):

2454 return [(OP.GROUP_CALL, self.call_ref)]

2455

2456 def dump(self, indent, reverse):

2457 print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

2458

2459 def __eq__(self, other):

2460 return type(self) is type(other) and self.group == other.group

2461

2462 def max_width(self):

2463 return UNLIMITED

2464

2465 def __del__(self):

2466 self.info = None

2467

2468class CallRef(RegexBase):

2469 def __init__(self, ref, parsed):

2470 self.ref = ref

2471 self.parsed = parsed

2472

2473 def _compile(self, reverse, fuzzy):

2474 return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse,

2475 fuzzy) + [(OP.END, )])

2476

2477class Character(RegexBase):

2478 _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):

2479 OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,

2480 False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,

2481 True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,

2482 (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

2483

2484 def __init__(self, value, positive=True, case_flags=NOCASE,

2485 zerowidth=False):

2486 RegexBase.__init__(self)

2487 self.value = value

2488 self.positive = bool(positive)

2489 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

2490 self.zerowidth = bool(zerowidth)

2491

2492 if (self.positive and (self.case_flags & FULLIGNORECASE) ==

2493 FULLIGNORECASE):

2494 self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))

2495 else:

2496 self.folded = chr(self.value)

2497

2498 self._key = (self.__class__, self.value, self.positive,

2499 self.case_flags, self.zerowidth)

2500

2501 def rebuild(self, positive, case_flags, zerowidth):

2502 return Character(self.value, positive, case_flags, zerowidth)

2503

2504 def optimise(self, info, reverse, in_set=False):

2505 return self

2506

2507 def get_firstset(self, reverse):

2508 return set([self])

2509

2510 def has_simple_start(self):

2511 return True

2512

2513 def _compile(self, reverse, fuzzy):

2514 flags = 0

2515 if self.positive:

2516 flags |= POSITIVE_OP

2517 if self.zerowidth:

2518 flags |= ZEROWIDTH_OP

2519 if fuzzy:

2520 flags |= FUZZY_OP

2521

2522 code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,

2523 self.value])

2524

2525 if len(self.folded) > 1:

2526 # The character expands on full case-folding.

2527 code = Branch([code, String([ord(c) for c in self.folded],

2528 case_flags=self.case_flags)])

2529

2530 return code.compile(reverse, fuzzy)

2531

2532 def dump(self, indent, reverse):

2533 display = ascii(chr(self.value)).lstrip("bu")

2534 print("{}CHARACTER {} {}{}".format(INDENT * indent,

2535 POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

2536

2537 def matches(self, ch):

2538 return (ch == self.value) == self.positive

2539

2540 def max_width(self):

2541 return len(self.folded)

2542

2543 def get_required_string(self, reverse):

2544 if not self.positive:

2545 return 1, None

2546

2547 self.folded_characters = tuple(ord(c) for c in self.folded)

2548

2549 return 0, self

2550

2551class Conditional(RegexBase):

2552 def __init__(self, info, group, yes_item, no_item, position):

2553 RegexBase.__init__(self)

2554 self.info = info

2555 self.group = group

2556 self.yes_item = yes_item

2557 self.no_item = no_item

2558 self.position = position

2559

2560 def fix_groups(self, pattern, reverse, fuzzy):

2561 try:

2562 self.group = int(self.group)

2563 except ValueError:

2564 try:

2565 self.group = self.info.group_index[self.group]

2566 except KeyError:

2567 if self.group == 'DEFINE':

2568 # 'DEFINE' is a special name unless there's a group with

2569 # that name.

2570 self.group = 0

2571 else:

2572 raise error("unknown group", pattern, self.position)

2573

2574 if not 0 <= self.group <= self.info.group_count:

2575 raise error("invalid group reference", pattern, self.position)

2576

2577 self.yes_item.fix_groups(pattern, reverse, fuzzy)

2578 self.no_item.fix_groups(pattern, reverse, fuzzy)

2579

2580 def optimise(self, info, reverse):

2581 yes_item = self.yes_item.optimise(info, reverse)

2582 no_item = self.no_item.optimise(info, reverse)

2583

2584 return Conditional(info, self.group, yes_item, no_item, self.position)

2585

2586 def pack_characters(self, info):

2587 self.yes_item = self.yes_item.pack_characters(info)

2588 self.no_item = self.no_item.pack_characters(info)

2589 return self

2590

2591 def remove_captures(self):

2592 self.yes_item = self.yes_item.remove_captures()

2593 self.no_item = self.no_item.remove_captures()

2594

2595 def is_atomic(self):

2596 return self.yes_item.is_atomic() and self.no_item.is_atomic()

2597

2598 def can_be_affix(self):

2599 return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

2600

2601 def contains_group(self):

2602 return self.yes_item.contains_group() or self.no_item.contains_group()

2603

2604 def get_firstset(self, reverse):

2605 return (self.yes_item.get_firstset(reverse) |

2606 self.no_item.get_firstset(reverse))

2607

2608 def _compile(self, reverse, fuzzy):

2609 code = [(OP.GROUP_EXISTS, self.group)]

2610 code.extend(self.yes_item.compile(reverse, fuzzy))

2611 add_code = self.no_item.compile(reverse, fuzzy)

2612 if add_code:

2613 code.append((OP.NEXT, ))

2614 code.extend(add_code)

2615

2616 code.append((OP.END, ))

2617

2618 return code

2619

2620 def dump(self, indent, reverse):

2621 print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))

2622 self.yes_item.dump(indent + 1, reverse)

2623 if not self.no_item.is_empty():

2624 print("{}OR".format(INDENT * indent))

2625 self.no_item.dump(indent + 1, reverse)

2626

2627 def is_empty(self):

2628 return self.yes_item.is_empty() and self.no_item.is_empty()

2629

2630 def __eq__(self, other):

2631 return type(self) is type(other) and (self.group, self.yes_item,

2632 self.no_item) == (other.group, other.yes_item, other.no_item)

2633

2634 def max_width(self):

2635 return max(self.yes_item.max_width(), self.no_item.max_width())

2636

2637 def __del__(self):

2638 self.info = None

2639

2640class DefaultBoundary(ZeroWidthBase):

2641 _opcode = OP.DEFAULT_BOUNDARY

2642 _op_name = "DEFAULT_BOUNDARY"

2643

2644class DefaultEndOfWord(ZeroWidthBase):

2645 _opcode = OP.DEFAULT_END_OF_WORD

2646 _op_name = "DEFAULT_END_OF_WORD"

2647

2648class DefaultStartOfWord(ZeroWidthBase):

2649 _opcode = OP.DEFAULT_START_OF_WORD

2650 _op_name = "DEFAULT_START_OF_WORD"

2651

2652class EndOfLine(ZeroWidthBase):

2653 _opcode = OP.END_OF_LINE

2654 _op_name = "END_OF_LINE"

2655

2656class EndOfLineU(EndOfLine):

2657 _opcode = OP.END_OF_LINE_U

2658 _op_name = "END_OF_LINE_U"

2659

2660class EndOfString(ZeroWidthBase):

2661 _opcode = OP.END_OF_STRING

2662 _op_name = "END_OF_STRING"

2663

2664class EndOfStringLine(ZeroWidthBase):

2665 _opcode = OP.END_OF_STRING_LINE

2666 _op_name = "END_OF_STRING_LINE"

2667

2668class EndOfStringLineU(EndOfStringLine):

2669 _opcode = OP.END_OF_STRING_LINE_U

2670 _op_name = "END_OF_STRING_LINE_U"

2671

2672class EndOfWord(ZeroWidthBase):

2673 _opcode = OP.END_OF_WORD

2674 _op_name = "END_OF_WORD"

2675

2676class Failure(ZeroWidthBase):

2677 _op_name = "FAILURE"

2678

2679 def _compile(self, reverse, fuzzy):

2680 return [(OP.FAILURE, )]

2681

2682class Fuzzy(RegexBase):

2683 def __init__(self, subpattern, constraints=None):

2684 RegexBase.__init__(self)

2685 if constraints is None:

2686 constraints = {}

2687 self.subpattern = subpattern

2688 self.constraints = constraints

2689

2690 # If an error type is mentioned in the cost equation, then its maximum

2691 # defaults to unlimited.

2692 if "cost" in constraints:

2693 for e in "dis":

2694 if e in constraints["cost"]:

2695 constraints.setdefault(e, (0, None))

2696

2697 # If any error type is mentioned, then all the error maxima default to

2698 # 0, otherwise they default to unlimited.

2699 if set(constraints) & set("dis"):

2700 for e in "dis":

2701 constraints.setdefault(e, (0, 0))

2702 else:

2703 for e in "dis":

2704 constraints.setdefault(e, (0, None))

2705

2706 # The maximum of the generic error type defaults to unlimited.

2707 constraints.setdefault("e", (0, None))

2708

2709 # The cost equation defaults to equal costs. Also, the cost of any

2710 # error type not mentioned in the cost equation defaults to 0.

2711 if "cost" in constraints:

2712 for e in "dis":

2713 constraints["cost"].setdefault(e, 0)

2714 else:

2715 constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":

2716 constraints["e"][1]}

2717

2718 def fix_groups(self, pattern, reverse, fuzzy):

2719 self.subpattern.fix_groups(pattern, reverse, True)

2720

2721 def pack_characters(self, info):

2722 self.subpattern = self.subpattern.pack_characters(info)

2723 return self

2724

2725 def remove_captures(self):

2726 self.subpattern = self.subpattern.remove_captures()

2727 return self

2728

2729 def is_atomic(self):

2730 return self.subpattern.is_atomic()

2731

2732 def contains_group(self):

2733 return self.subpattern.contains_group()

2734

2735 def _compile(self, reverse, fuzzy):

2736 # The individual limits.

2737 arguments = []

2738 for e in "dise":

2739 v = self.constraints[e]

2740 arguments.append(v[0])

2741 arguments.append(UNLIMITED if v[1] is None else v[1])

2742

2743 # The coeffs of the cost equation.

2744 for e in "dis":

2745 arguments.append(self.constraints["cost"][e])

2746

2747 # The maximum of the cost equation.

2748 v = self.constraints["cost"]["max"]

2749 arguments.append(UNLIMITED if v is None else v)

2750

2751 flags = 0

2752 if reverse:

2753 flags |= REVERSE_OP

2754

2755 test = self.constraints.get("test")

2756

2757 if test:

2758 return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +

2759 test.compile(reverse, True) + [(OP.NEXT,)] +

2760 self.subpattern.compile(reverse, True) + [(OP.END,)])

2761

2762 return ([(OP.FUZZY, flags) + tuple(arguments)] +

2763 self.subpattern.compile(reverse, True) + [(OP.END,)])

2764

2765 def dump(self, indent, reverse):

2766 constraints = self._constraints_to_string()

2767 if constraints:

2768 constraints = " " + constraints

2769 print("{}FUZZY{}".format(INDENT * indent, constraints))

2770 self.subpattern.dump(indent + 1, reverse)

2771

2772 def is_empty(self):

2773 return self.subpattern.is_empty()

2774

2775 def __eq__(self, other):

2776 return (type(self) is type(other) and self.subpattern ==

2777 other.subpattern and self.constraints == other.constraints)

2778

2779 def max_width(self):

2780 return UNLIMITED

2781

2782 def _constraints_to_string(self):

2783 constraints = []

2784

2785 for name in "ids":

2786 min, max = self.constraints[name]

2787 if max == 0:

2788 continue

2789

2790 con = ""

2791

2792 if min > 0:

2793 con = "{}<=".format(min)

2794

2795 con += name

2796

2797 if max is not None:

2798 con += "<={}".format(max)

2799

2800 constraints.append(con)

2801

2802 cost = []

2803 for name in "ids":

2804 coeff = self.constraints["cost"][name]

2805 if coeff > 0:

2806 cost.append("{}{}".format(coeff, name))

2807

2808 limit = self.constraints["cost"]["max"]

2809 if limit is not None and limit > 0:

2810 cost = "{}<={}".format("+".join(cost), limit)

2811 constraints.append(cost)

2812

2813 return ",".join(constraints)

2814

2815class Grapheme(RegexBase):

2816 def _compile(self, reverse, fuzzy):

2817 # Match at least 1 character until a grapheme boundary is reached. Note

2818 # that this is the same whether matching forwards or backwards.

2819 grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),

2820 GraphemeBoundary()]))

2821

2822 return grapheme_matcher.compile(reverse, fuzzy)

2823

2824 def dump(self, indent, reverse):

2825 print("{}GRAPHEME".format(INDENT * indent))

2826

2827 def max_width(self):

2828 return UNLIMITED

2829

2830class GraphemeBoundary:

2831 def compile(self, reverse, fuzzy):

2832 return [(OP.GRAPHEME_BOUNDARY, 1)]

2833

2834class GreedyRepeat(RegexBase):

2835 _opcode = OP.GREEDY_REPEAT

2836 _op_name = "GREEDY_REPEAT"

2837

2838 def __init__(self, subpattern, min_count, max_count):

2839 RegexBase.__init__(self)

2840 self.subpattern = subpattern

2841 self.min_count = min_count

2842 self.max_count = max_count

2843

2844 def fix_groups(self, pattern, reverse, fuzzy):

2845 self.subpattern.fix_groups(pattern, reverse, fuzzy)

2846

2847 def optimise(self, info, reverse):

2848 subpattern = self.subpattern.optimise(info, reverse)

2849

2850 return type(self)(subpattern, self.min_count, self.max_count)

2851

2852 def pack_characters(self, info):

2853 self.subpattern = self.subpattern.pack_characters(info)

2854 return self

2855

2856 def remove_captures(self):

2857 self.subpattern = self.subpattern.remove_captures()

2858 return self

2859

2860 def is_atomic(self):

2861 return self.min_count == self.max_count and self.subpattern.is_atomic()

2862

2863 def can_be_affix(self):

2864 return False

2865

2866 def contains_group(self):

2867 return self.subpattern.contains_group()

2868

2869 def get_firstset(self, reverse):

2870 fs = self.subpattern.get_firstset(reverse)

2871 if self.min_count == 0:

2872 fs.add(None)

2873

2874 return fs

2875

2876 def _compile(self, reverse, fuzzy):

2877 repeat = [self._opcode, self.min_count]

2878 if self.max_count is None:

2879 repeat.append(UNLIMITED)

2880 else:

2881 repeat.append(self.max_count)

2882

2883 subpattern = self.subpattern.compile(reverse, fuzzy)

2884 if not subpattern:

2885 return []

2886

2887 return ([tuple(repeat)] + subpattern + [(OP.END, )])

2888

2889 def dump(self, indent, reverse):

2890 if self.max_count is None:

2891 limit = "INF"

2892 else:

2893 limit = self.max_count

2894 print("{}{} {} {}".format(INDENT * indent, self._op_name,

2895 self.min_count, limit))

2896

2897 self.subpattern.dump(indent + 1, reverse)

2898

2899 def is_empty(self):

2900 return self.subpattern.is_empty()

2901

2902 def __eq__(self, other):

2903 return type(self) is type(other) and (self.subpattern, self.min_count,

2904 self.max_count) == (other.subpattern, other.min_count,

2905 other.max_count)

2906

2907 def max_width(self):

2908 if self.max_count is None:

2909 return UNLIMITED

2910

2911 return self.subpattern.max_width() * self.max_count

2912

2913 def get_required_string(self, reverse):

2914 max_count = UNLIMITED if self.max_count is None else self.max_count

2915 if self.min_count == 0:

2916 w = self.subpattern.max_width() * max_count

2917 return min(w, UNLIMITED), None

2918

2919 ofs, req = self.subpattern.get_required_string(reverse)

2920 if req:

2921 return ofs, req

2922

2923 w = self.subpattern.max_width() * max_count

2924 return min(w, UNLIMITED), None

2925

2926class PossessiveRepeat(GreedyRepeat):

2927 def is_atomic(self):

2928 return True

2929

2930 def _compile(self, reverse, fuzzy):

2931 subpattern = self.subpattern.compile(reverse, fuzzy)

2932 if not subpattern:

2933 return []

2934

2935 repeat = [self._opcode, self.min_count]

2936 if self.max_count is None:

2937 repeat.append(UNLIMITED)

2938 else:

2939 repeat.append(self.max_count)

2940

2941 return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),

2942 (OP.END, )])

2943

2944 def dump(self, indent, reverse):

2945 print("{}ATOMIC".format(INDENT * indent))

2946

2947 if self.max_count is None:

2948 limit = "INF"

2949 else:

2950 limit = self.max_count

2951 print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,

2952 self.min_count, limit))

2953

2954 self.subpattern.dump(indent + 2, reverse)

2955

2956class Group(RegexBase):

2957 def __init__(self, info, group, subpattern):

2958 RegexBase.__init__(self)

2959 self.info = info

2960 self.group = group

2961 self.subpattern = subpattern

2962

2963 self.call_ref = None

2964

2965 def fix_groups(self, pattern, reverse, fuzzy):

2966 self.info.defined_groups[self.group] = (self, reverse, fuzzy)

2967 self.subpattern.fix_groups(pattern, reverse, fuzzy)

2968

2969 def optimise(self, info, reverse):

2970 subpattern = self.subpattern.optimise(info, reverse)

2971

2972 return Group(self.info, self.group, subpattern)

2973

2974 def pack_characters(self, info):

2975 self.subpattern = self.subpattern.pack_characters(info)

2976 return self

2977

2978 def remove_captures(self):

2979 return self.subpattern.remove_captures()

2980

2981 def is_atomic(self):

2982 return self.subpattern.is_atomic()

2983

2984 def can_be_affix(self):

2985 return False

2986

2987 def contains_group(self):

2988 return True

2989

2990 def get_firstset(self, reverse):

2991 return self.subpattern.get_firstset(reverse)

2992

2993 def has_simple_start(self):

2994 return self.subpattern.has_simple_start()

2995

2996 def _compile(self, reverse, fuzzy):

2997 code = []

2998

2999 public_group = private_group = self.group

3000 if private_group < 0:

3001 public_group = self.info.private_groups[private_group]

3002 private_group = self.info.group_count - private_group

3003

3004 key = self.group, reverse, fuzzy

3005 ref = self.info.call_refs.get(key)

3006 if ref is not None:

3007 code += [(OP.CALL_REF, ref)]

3008

3009 code += [(OP.GROUP, int(not reverse), private_group, public_group)]

3010 code += self.subpattern.compile(reverse, fuzzy)

3011 code += [(OP.END, )]

3012

3013 if ref is not None:

3014 code += [(OP.END, )]

3015

3016 return code

3017

3018 def dump(self, indent, reverse):

3019 group = self.group

3020 if group < 0:

3021 group = private_groups[group]

3022 print("{}GROUP {}".format(INDENT * indent, group))

3023 self.subpattern.dump(indent + 1, reverse)

3024

3025 def __eq__(self, other):

3026 return (type(self) is type(other) and (self.group, self.subpattern) ==

3027 (other.group, other.subpattern))

3028

3029 def max_width(self):

3030 return self.subpattern.max_width()

3031

3032 def get_required_string(self, reverse):

3033 return self.subpattern.get_required_string(reverse)

3034

3035 def __del__(self):

3036 self.info = None

3037

3038class Keep(ZeroWidthBase):

3039 _opcode = OP.KEEP

3040 _op_name = "KEEP"

3041

3042class LazyRepeat(GreedyRepeat):

3043 _opcode = OP.LAZY_REPEAT

3044 _op_name = "LAZY_REPEAT"

3045

3046class LookAround(RegexBase):

3047 _dir_text = {False: "AHEAD", True: "BEHIND"}

3048

3049 def __init__(self, behind, positive, subpattern):

3050 RegexBase.__init__(self)

3051 self.behind = bool(behind)

3052 self.positive = bool(positive)

3053 self.subpattern = subpattern

3054

3055 def fix_groups(self, pattern, reverse, fuzzy):

3056 self.subpattern.fix_groups(pattern, self.behind, fuzzy)

3057

3058 def optimise(self, info, reverse):

3059 subpattern = self.subpattern.optimise(info, self.behind)

3060 if self.positive and subpattern.is_empty():

3061 return subpattern

3062

3063 return LookAround(self.behind, self.positive, subpattern)

3064

3065 def pack_characters(self, info):

3066 self.subpattern = self.subpattern.pack_characters(info)

3067 return self

3068

3069 def remove_captures(self):

3070 return self.subpattern.remove_captures()

3071

3072 def is_atomic(self):

3073 return self.subpattern.is_atomic()

3074

3075 def can_be_affix(self):

3076 return self.subpattern.can_be_affix()

3077

3078 def contains_group(self):

3079 return self.subpattern.contains_group()

3080

3081 def get_firstset(self, reverse):

3082 if self.positive and self.behind == reverse:

3083 return self.subpattern.get_firstset(reverse)

3084

3085 return set([None])

3086

3087 def _compile(self, reverse, fuzzy):

3088 flags = 0

3089 if self.positive:

3090 flags |= POSITIVE_OP

3091 if fuzzy:

3092 flags |= FUZZY_OP

3093 if reverse:

3094 flags |= REVERSE_OP

3095

3096 return ([(OP.LOOKAROUND, flags, int(not self.behind))] +

3097 self.subpattern.compile(self.behind) + [(OP.END, )])

3098

3099 def dump(self, indent, reverse):

3100 print("{}LOOK{} {}".format(INDENT * indent,

3101 self._dir_text[self.behind], POS_TEXT[self.positive]))

3102 self.subpattern.dump(indent + 1, self.behind)

3103

3104 def is_empty(self):

3105 return self.positive and self.subpattern.is_empty()

3106

3107 def __eq__(self, other):

3108 return type(self) is type(other) and (self.behind, self.positive,

3109 self.subpattern) == (other.behind, other.positive, other.subpattern)

3110

3111 def max_width(self):

3112 return 0

3113

3114class LookAroundConditional(RegexBase):

3115 _dir_text = {False: "AHEAD", True: "BEHIND"}

3116

3117 def __init__(self, behind, positive, subpattern, yes_item, no_item):

3118 RegexBase.__init__(self)

3119 self.behind = bool(behind)

3120 self.positive = bool(positive)

3121 self.subpattern = subpattern

3122 self.yes_item = yes_item

3123 self.no_item = no_item

3124

3125 def fix_groups(self, pattern, reverse, fuzzy):

3126 self.subpattern.fix_groups(pattern, reverse, fuzzy)

3127 self.yes_item.fix_groups(pattern, reverse, fuzzy)

3128 self.no_item.fix_groups(pattern, reverse, fuzzy)

3129

3130 def optimise(self, info, reverse):

3131 subpattern = self.subpattern.optimise(info, self.behind)

3132 yes_item = self.yes_item.optimise(info, self.behind)

3133 no_item = self.no_item.optimise(info, self.behind)

3134

3135 return LookAroundConditional(self.behind, self.positive, subpattern,

3136 yes_item, no_item)

3137

3138 def pack_characters(self, info):

3139 self.subpattern = self.subpattern.pack_characters(info)

3140 self.yes_item = self.yes_item.pack_characters(info)

3141 self.no_item = self.no_item.pack_characters(info)

3142 return self

3143

3144 def remove_captures(self):

3145 self.subpattern = self.subpattern.remove_captures()

3146 self.yes_item = self.yes_item.remove_captures()

3147 self.no_item = self.no_item.remove_captures()

3148

3149 def is_atomic(self):

3150 return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and

3151 self.no_item.is_atomic())

3152

3153 def can_be_affix(self):

3154 return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()

3155 and self.no_item.can_be_affix())

3156

3157 def contains_group(self):

3158 return (self.subpattern.contains_group() or

3159 self.yes_item.contains_group() or self.no_item.contains_group())

3160

3161 def _compile(self, reverse, fuzzy):

3162 code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]

3163 code.extend(self.subpattern.compile(self.behind, fuzzy))

3164 code.append((OP.NEXT, ))

3165 code.extend(self.yes_item.compile(reverse, fuzzy))

3166 add_code = self.no_item.compile(reverse, fuzzy)

3167 if add_code:

3168 code.append((OP.NEXT, ))

3169 code.extend(add_code)

3170

3171 code.append((OP.END, ))

3172

3173 return code

3174

3175 def dump(self, indent, reverse):

3176 print("{}CONDITIONAL {} {}".format(INDENT * indent,

3177 self._dir_text[self.behind], POS_TEXT[self.positive]))

3178 self.subpattern.dump(indent + 1, self.behind)

3179 print("{}EITHER".format(INDENT * indent))

3180 self.yes_item.dump(indent + 1, reverse)

3181 if not self.no_item.is_empty():

3182 print("{}OR".format(INDENT * indent))

3183 self.no_item.dump(indent + 1, reverse)

3184

3185 def is_empty(self):

3186 return (self.subpattern.is_empty() and self.yes_item.is_empty() or

3187 self.no_item.is_empty())

3188

3189 def __eq__(self, other):

3190 return type(self) is type(other) and (self.subpattern, self.yes_item,

3191 self.no_item) == (other.subpattern, other.yes_item, other.no_item)

3192

3193 def max_width(self):

3194 return max(self.yes_item.max_width(), self.no_item.max_width())

3195

3196 def get_required_string(self, reverse):

3197 return self.max_width(), None

3198

3199class PrecompiledCode(RegexBase):

3200 def __init__(self, code):

3201 self.code = code

3202

3203 def _compile(self, reverse, fuzzy):

3204 return [tuple(self.code)]

3205

3206class Property(RegexBase):

3207 _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):

3208 OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):

3209 OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):

3210 OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,

3211 True): OP.PROPERTY_IGN_REV}

3212

3213 def __init__(self, value, positive=True, case_flags=NOCASE,

3214 zerowidth=False):

3215 RegexBase.__init__(self)

3216 self.value = value

3217 self.positive = bool(positive)

3218 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3219 self.zerowidth = bool(zerowidth)

3220

3221 self._key = (self.__class__, self.value, self.positive,

3222 self.case_flags, self.zerowidth)

3223

3224 def rebuild(self, positive, case_flags, zerowidth):

3225 return Property(self.value, positive, case_flags, zerowidth)

3226

3227 def optimise(self, info, reverse, in_set=False):

3228 return self

3229

3230 def get_firstset(self, reverse):

3231 return set([self])

3232

3233 def has_simple_start(self):

3234 return True

3235

3236 def _compile(self, reverse, fuzzy):

3237 flags = 0

3238 if self.positive:

3239 flags |= POSITIVE_OP

3240 if self.zerowidth:

3241 flags |= ZEROWIDTH_OP

3242 if fuzzy:

3243 flags |= FUZZY_OP

3244 return [(self._opcode[self.case_flags, reverse], flags, self.value)]

3245

3246 def dump(self, indent, reverse):

3247 prop = PROPERTY_NAMES[self.value >> 16]

3248 name, value = prop[0], prop[1][self.value & 0xFFFF]

3249 print("{}PROPERTY {} {}:{}{}".format(INDENT * indent,

3250 POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags]))

3251

3252 def matches(self, ch):

3253 return _regex.has_property_value(self.value, ch) == self.positive

3254

3255 def max_width(self):

3256 return 1

3257

3258class Prune(ZeroWidthBase):

3259 _op_name = "PRUNE"

3260

3261 def _compile(self, reverse, fuzzy):

3262 return [(OP.PRUNE, )]

3263

3264class Range(RegexBase):

3265 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,

3266 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,

3267 (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,

3268 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}

3269 _op_name = "RANGE"

3270

3271 def __init__(self, lower, upper, positive=True, case_flags=NOCASE,

3272 zerowidth=False):

3273 RegexBase.__init__(self)

3274 self.lower = lower

3275 self.upper = upper

3276 self.positive = bool(positive)

3277 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3278 self.zerowidth = bool(zerowidth)

3279

3280 self._key = (self.__class__, self.lower, self.upper, self.positive,

3281 self.case_flags, self.zerowidth)

3282

3283 def rebuild(self, positive, case_flags, zerowidth):

3284 return Range(self.lower, self.upper, positive, case_flags, zerowidth)

3285

3286 def optimise(self, info, reverse, in_set=False):

3287 # Is the range case-sensitive?

3288 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:

3289 return self

3290

3291 # Is full case-folding possible?

3292 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=

3293 FULLIGNORECASE):

3294 return self

3295

3296 # Get the characters which expand to multiple codepoints on folding.

3297 expanding_chars = _regex.get_expand_on_folding()

3298

3299 # Get the folded characters in the range.

3300 items = []

3301 for ch in expanding_chars:

3302 if self.lower <= ord(ch) <= self.upper:

3303 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)

3304 items.append(String([ord(c) for c in folded],

3305 case_flags=self.case_flags))

3306

3307 if not items:

3308 # We can fall back to simple case-folding.

3309 return self

3310

3311 if len(items) < self.upper - self.lower + 1:

3312 # Not all the characters are covered by the full case-folding.

3313 items.insert(0, self)

3314

3315 return Branch(items)

3316

3317 def _compile(self, reverse, fuzzy):

3318 flags = 0

3319 if self.positive:

3320 flags |= POSITIVE_OP

3321 if self.zerowidth:

3322 flags |= ZEROWIDTH_OP

3323 if fuzzy:

3324 flags |= FUZZY_OP

3325 return [(self._opcode[self.case_flags, reverse], flags, self.lower,

3326 self.upper)]

3327

3328 def dump(self, indent, reverse):

3329 display_lower = ascii(chr(self.lower)).lstrip("bu")

3330 display_upper = ascii(chr(self.upper)).lstrip("bu")

3331 print("{}RANGE {} {} {}{}".format(INDENT * indent,

3332 POS_TEXT[self.positive], display_lower, display_upper,

3333 CASE_TEXT[self.case_flags]))

3334

3335 def matches(self, ch):

3336 return (self.lower <= ch <= self.upper) == self.positive

3337

3338 def max_width(self):

3339 return 1

3340

3341class RefGroup(RegexBase):

3342 _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):

3343 OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,

3344 False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,

3345 True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,

3346 (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

3347

3348 def __init__(self, info, group, position, case_flags=NOCASE):

3349 RegexBase.__init__(self)

3350 self.info = info

3351 self.group = group

3352 self.position = position

3353 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3354

3355 self._key = self.__class__, self.group, self.case_flags

3356

3357 def fix_groups(self, pattern, reverse, fuzzy):

3358 try:

3359 self.group = int(self.group)

3360 except ValueError:

3361 try:

3362 self.group = self.info.group_index[self.group]

3363 except KeyError:

3364 raise error("unknown group", pattern, self.position)

3365

3366 if not 1 <= self.group <= self.info.group_count:

3367 raise error("invalid group reference", pattern, self.position)

3368

3369 self._key = self.__class__, self.group, self.case_flags

3370

3371 def remove_captures(self):

3372 raise error("group reference not allowed", pattern, self.position)

3373

3374 def _compile(self, reverse, fuzzy):

3375 flags = 0

3376 if fuzzy:

3377 flags |= FUZZY_OP

3378 return [(self._opcode[self.case_flags, reverse], flags, self.group)]

3379

3380 def dump(self, indent, reverse):

3381 print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,

3382 CASE_TEXT[self.case_flags]))

3383

3384 def max_width(self):

3385 return UNLIMITED

3386

3387 def __del__(self):

3388 self.info = None

3389

3390class SearchAnchor(ZeroWidthBase):

3391 _opcode = OP.SEARCH_ANCHOR

3392 _op_name = "SEARCH_ANCHOR"

3393

3394class Sequence(RegexBase):

3395 def __init__(self, items=None):

3396 RegexBase.__init__(self)

3397 if items is None:

3398 items = []

3399

3400 self.items = items

3401

3402 def fix_groups(self, pattern, reverse, fuzzy):

3403 for s in self.items:

3404 s.fix_groups(pattern, reverse, fuzzy)

3405

3406 def optimise(self, info, reverse):

3407 # Flatten the sequences.

3408 items = []

3409 for s in self.items:

3410 s = s.optimise(info, reverse)

3411 if isinstance(s, Sequence):

3412 items.extend(s.items)

3413 else:

3414 items.append(s)

3415

3416 return make_sequence(items)

3417

3418 def pack_characters(self, info):

3419 "Packs sequences of characters into strings."

3420 items = []

3421 characters = []

3422 case_flags = NOCASE

3423 for s in self.items:

3424 if type(s) is Character and s.positive and not s.zerowidth:

3425 if s.case_flags != case_flags:

3426 # Different case sensitivity, so flush, unless neither the

3427 # previous nor the new character are cased.

3428 if s.case_flags or is_cased_i(info, s.value):

3429 Sequence._flush_characters(info, characters,

3430 case_flags, items)

3431

3432 case_flags = s.case_flags

3433

3434 characters.append(s.value)

3435 elif type(s) is String or type(s) is Literal:

3436 if s.case_flags != case_flags:

3437 # Different case sensitivity, so flush, unless the neither

3438 # the previous nor the new string are cased.

3439 if s.case_flags or any(is_cased_i(info, c) for c in

3440 characters):

3441 Sequence._flush_characters(info, characters,

3442 case_flags, items)

3443

3444 case_flags = s.case_flags

3445

3446 characters.extend(s.characters)

3447 else:

3448 Sequence._flush_characters(info, characters, case_flags, items)

3449

3450 items.append(s.pack_characters(info))

3451

3452 Sequence._flush_characters(info, characters, case_flags, items)

3453

3454 return make_sequence(items)

3455

3456 def remove_captures(self):

3457 self.items = [s.remove_captures() for s in self.items]

3458 return self

3459

3460 def is_atomic(self):

3461 return all(s.is_atomic() for s in self.items)

3462

3463 def can_be_affix(self):

3464 return False

3465

3466 def contains_group(self):

3467 return any(s.contains_group() for s in self.items)

3468

3469 def get_firstset(self, reverse):

3470 fs = set()

3471 items = self.items

3472 if reverse:

3473 items.reverse()

3474 for s in items:

3475 fs |= s.get_firstset(reverse)

3476 if None not in fs:

3477 return fs

3478 fs.discard(None)

3479

3480 return fs | set([None])

3481

3482 def has_simple_start(self):

3483 return bool(self.items) and self.items[0].has_simple_start()

3484

3485 def _compile(self, reverse, fuzzy):

3486 seq = self.items

3487 if reverse:

3488 seq = seq[::-1]

3489

3490 code = []

3491 for s in seq:

3492 code.extend(s.compile(reverse, fuzzy))

3493

3494 return code

3495

3496 def dump(self, indent, reverse):

3497 for s in self.items:

3498 s.dump(indent, reverse)

3499

3500 @staticmethod

3501 def _flush_characters(info, characters, case_flags, items):

3502 if not characters:

3503 return

3504

3505 # Disregard case_flags if all of the characters are case-less.

3506 if case_flags & IGNORECASE:

3507 if not any(is_cased_i(info, c) for c in characters):

3508 case_flags = NOCASE

3509

3510 if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:

3511 literals = Sequence._fix_full_casefold(characters)

3512

3513 for item in literals:

3514 chars = item.characters

3515

3516 if len(chars) == 1:

3517 items.append(Character(chars[0], case_flags=item.case_flags))

3518 else:

3519 items.append(String(chars, case_flags=item.case_flags))

3520 else:

3521 if len(characters) == 1:

3522 items.append(Character(characters[0], case_flags=case_flags))

3523 else:

3524 items.append(String(characters, case_flags=case_flags))

3525

3526 characters[:] = []

3527

3528 @staticmethod

3529 def _fix_full_casefold(characters):

3530 # Split a literal needing full case-folding into chunks that need it

3531 # and chunks that can use simple case-folding, which is faster.

3532 expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in

3533 _regex.get_expand_on_folding()]

3534 string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)

3535 for c in characters)).lower()

3536 chunks = []

3537

3538 for e in expanded:

3539 found = string.find(e)

3540

3541 while found >= 0:

3542 chunks.append((found, found + len(e)))

3543 found = string.find(e, found + 1)

3544

3545 pos = 0

3546 literals = []

3547

3548 for start, end in Sequence._merge_chunks(chunks):

3549 if pos < start:

3550 literals.append(Literal(characters[pos : start],

3551 case_flags=IGNORECASE))

3552

3553 literals.append(Literal(characters[start : end],

3554 case_flags=FULLIGNORECASE))

3555 pos = end

3556

3557 if pos < len(characters):

3558 literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

3559

3560 return literals

3561

3562 @staticmethod

3563 def _merge_chunks(chunks):

3564 if len(chunks) < 2:

3565 return chunks

3566

3567 chunks.sort()

3568

3569 start, end = chunks[0]

3570 new_chunks = []

3571

3572 for s, e in chunks[1 : ]:

3573 if s <= end:

3574 end = max(end, e)

3575 else:

3576 new_chunks.append((start, end))

3577 start, end = s, e

3578

3579 new_chunks.append((start, end))

3580

3581 return new_chunks

3582

3583 def is_empty(self):

3584 return all(i.is_empty() for i in self.items)

3585

3586 def __eq__(self, other):

3587 return type(self) is type(other) and self.items == other.items

3588

3589 def max_width(self):

3590 return sum(s.max_width() for s in self.items)

3591

3592 def get_required_string(self, reverse):

3593 seq = self.items

3594 if reverse:

3595 seq = seq[::-1]

3596

3597 offset = 0

3598

3599 for s in seq:

3600 ofs, req = s.get_required_string(reverse)

3601 offset += ofs

3602 if req:

3603 return offset, req

3604

3605 return offset, None

3606

3607class SetBase(RegexBase):

3608 def __init__(self, info, items, positive=True, case_flags=NOCASE,

3609 zerowidth=False):

3610 RegexBase.__init__(self)

3611 self.info = info

3612 self.items = tuple(items)

3613 self.positive = bool(positive)

3614 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3615 self.zerowidth = bool(zerowidth)

3616

3617 self.char_width = 1

3618

3619 self._key = (self.__class__, self.items, self.positive,

3620 self.case_flags, self.zerowidth)

3621

3622 def rebuild(self, positive, case_flags, zerowidth):

3623 return type(self)(self.info, self.items, positive, case_flags,

3624 zerowidth).optimise(self.info, False)

3625

3626 def get_firstset(self, reverse):

3627 return set([self])

3628

3629 def has_simple_start(self):

3630 return True

3631

3632 def _compile(self, reverse, fuzzy):

3633 flags = 0

3634 if self.positive:

3635 flags |= POSITIVE_OP

3636 if self.zerowidth:

3637 flags |= ZEROWIDTH_OP

3638 if fuzzy:

3639 flags |= FUZZY_OP

3640 code = [(self._opcode[self.case_flags, reverse], flags)]

3641 for m in self.items:

3642 code.extend(m.compile())

3643

3644 code.append((OP.END, ))

3645

3646 return code

3647

3648 def dump(self, indent, reverse):

3649 print("{}{} {}{}".format(INDENT * indent, self._op_name,

3650 POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))

3651 for i in self.items:

3652 i.dump(indent + 1, reverse)

3653

3654 def _handle_case_folding(self, info, in_set):

3655 # Is the set case-sensitive?

3656 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:

3657 return self

3658

3659 # Is full case-folding possible?

3660 if (not (self.info.flags & UNICODE) or (self.case_flags &

3661 FULLIGNORECASE) != FULLIGNORECASE):

3662 return self

3663

3664 # Get the characters which expand to multiple codepoints on folding.

3665 expanding_chars = _regex.get_expand_on_folding()

3666

3667 # Get the folded characters in the set.

3668 items = []

3669 seen = set()

3670 for ch in expanding_chars:

3671 if self.matches(ord(ch)):

3672 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)

3673 if folded not in seen:

3674 items.append(String([ord(c) for c in folded],

3675 case_flags=self.case_flags))

3676 seen.add(folded)

3677

3678 if not items:

3679 # We can fall back to simple case-folding.

3680 return self

3681

3682 return Branch([self] + items)

3683

3684 def max_width(self):

3685 # Is the set case-sensitive?

3686 if not self.positive or not (self.case_flags & IGNORECASE):

3687 return 1

3688

3689 # Is full case-folding possible?

3690 if (not (self.info.flags & UNICODE) or (self.case_flags &

3691 FULLIGNORECASE) != FULLIGNORECASE):

3692 return 1

3693

3694 # Get the characters which expand to multiple codepoints on folding.

3695 expanding_chars = _regex.get_expand_on_folding()

3696

3697 # Get the folded characters in the set.

3698 seen = set()

3699 for ch in expanding_chars:

3700 if self.matches(ord(ch)):

3701 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)

3702 seen.add(folded)

3703

3704 if not seen:

3705 return 1

3706

3707 return max(len(folded) for folded in seen)

3708

3709 def __del__(self):

3710 self.info = None

3711

3712class SetDiff(SetBase):

3713 _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):

3714 OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):

3715 OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):

3716 OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,

3717 True): OP.SET_DIFF_IGN_REV}

3718 _op_name = "SET_DIFF"

3719

3720 def optimise(self, info, reverse, in_set=False):

3721 items = self.items

3722 if len(items) > 2:

3723 items = [items[0], SetUnion(info, items[1 : ])]

3724

3725 if len(items) == 1:

3726 return items[0].with_flags(case_flags=self.case_flags,

3727 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3728

3729 self.items = tuple(m.optimise(info, reverse, in_set=True) for m in

3730 items)

3731

3732 return self._handle_case_folding(info, in_set)

3733

3734 def matches(self, ch):

3735 m = self.items[0].matches(ch) and not self.items[1].matches(ch)

3736 return m == self.positive

3737

3738class SetInter(SetBase):

3739 _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):

3740 OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,

3741 False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,

3742 True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,

3743 (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}

3744 _op_name = "SET_INTER"

3745

3746 def optimise(self, info, reverse, in_set=False):

3747 items = []

3748 for m in self.items:

3749 m = m.optimise(info, reverse, in_set=True)

3750 if isinstance(m, SetInter) and m.positive:

3751 # Intersection in intersection.

3752 items.extend(m.items)

3753 else:

3754 items.append(m)

3755

3756 if len(items) == 1:

3757 return items[0].with_flags(case_flags=self.case_flags,

3758 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3759

3760 self.items = tuple(items)

3761

3762 return self._handle_case_folding(info, in_set)

3763

3764 def matches(self, ch):

3765 m = all(i.matches(ch) for i in self.items)

3766 return m == self.positive

3767

3768class SetSymDiff(SetBase):

3769 _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):

3770 OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,

3771 False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,

3772 (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):

3773 OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}

3774 _op_name = "SET_SYM_DIFF"

3775

3776 def optimise(self, info, reverse, in_set=False):

3777 items = []

3778 for m in self.items:

3779 m = m.optimise(info, reverse, in_set=True)

3780 if isinstance(m, SetSymDiff) and m.positive:

3781 # Symmetric difference in symmetric difference.

3782 items.extend(m.items)

3783 else:

3784 items.append(m)

3785

3786 if len(items) == 1:

3787 return items[0].with_flags(case_flags=self.case_flags,

3788 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3789

3790 self.items = tuple(items)

3791

3792 return self._handle_case_folding(info, in_set)

3793

3794 def matches(self, ch):

3795 m = False

3796 for i in self.items:

3797 m = m != i.matches(ch)

3798

3799 return m == self.positive

3800

3801class SetUnion(SetBase):

3802 _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):

3803 OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,

3804 False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,

3805 True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,

3806 (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}

3807 _op_name = "SET_UNION"

3808

3809 def optimise(self, info, reverse, in_set=False):

3810 items = []

3811 for m in self.items:

3812 m = m.optimise(info, reverse, in_set=True)

3813 if isinstance(m, SetUnion) and m.positive:

3814 # Union in union.

3815 items.extend(m.items)

3816 else:

3817 items.append(m)

3818

3819 if len(items) == 1:

3820 i = items[0]

3821 return i.with_flags(positive=i.positive == self.positive,

3822 case_flags=self.case_flags,

3823 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3824

3825 self.items = tuple(items)

3826

3827 return self._handle_case_folding(info, in_set)

3828

3829 def _compile(self, reverse, fuzzy):

3830 flags = 0

3831 if self.positive:

3832 flags |= POSITIVE_OP

3833 if self.zerowidth:

3834 flags |= ZEROWIDTH_OP

3835 if fuzzy:

3836 flags |= FUZZY_OP

3837

3838 characters, others = defaultdict(list), []

3839 for m in self.items:

3840 if isinstance(m, Character):

3841 characters[m.positive].append(m.value)

3842 else:

3843 others.append(m)

3844

3845 code = [(self._opcode[self.case_flags, reverse], flags)]

3846

3847 for positive, values in characters.items():

3848 flags = 0

3849 if positive:

3850 flags |= POSITIVE_OP

3851 if len(values) == 1:

3852 code.append((OP.CHARACTER, flags, values[0]))

3853 else:

3854 code.append((OP.STRING, flags, len(values)) + tuple(values))

3855

3856 for m in others:

3857 code.extend(m.compile())

3858

3859 code.append((OP.END, ))

3860

3861 return code

3862

3863 def matches(self, ch):

3864 m = any(i.matches(ch) for i in self.items)

3865 return m == self.positive

3866

3867class Skip(ZeroWidthBase):

3868 _op_name = "SKIP"

3869 _opcode = OP.SKIP

3870

3871class StartOfLine(ZeroWidthBase):

3872 _opcode = OP.START_OF_LINE

3873 _op_name = "START_OF_LINE"

3874

3875class StartOfLineU(StartOfLine):

3876 _opcode = OP.START_OF_LINE_U

3877 _op_name = "START_OF_LINE_U"

3878

3879class StartOfString(ZeroWidthBase):

3880 _opcode = OP.START_OF_STRING

3881 _op_name = "START_OF_STRING"

3882

3883class StartOfWord(ZeroWidthBase):

3884 _opcode = OP.START_OF_WORD

3885 _op_name = "START_OF_WORD"

3886

3887class String(RegexBase):

3888 _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,

3889 (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,

3890 (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,

3891 (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):

3892 OP.STRING_FLD_REV}

3893

3894 def __init__(self, characters, case_flags=NOCASE):

3895 self.characters = tuple(characters)

3896 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3897

3898 if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:

3899 folded_characters = []

3900 for char in self.characters:

3901 folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))

3902 folded_characters.extend(ord(c) for c in folded)

3903 else:

3904 folded_characters = self.characters

3905

3906 self.folded_characters = tuple(folded_characters)

3907 self.required = False

3908

3909 self._key = self.__class__, self.characters, self.case_flags

3910

3911 def get_firstset(self, reverse):

3912 if reverse:

3913 pos = -1

3914 else:

3915 pos = 0

3916 return set([Character(self.characters[pos],

3917 case_flags=self.case_flags)])

3918

3919 def has_simple_start(self):

3920 return True

3921

3922 def _compile(self, reverse, fuzzy):

3923 flags = 0

3924 if fuzzy:

3925 flags |= FUZZY_OP

3926 if self.required:

3927 flags |= REQUIRED_OP

3928 return [(self._opcode[self.case_flags, reverse], flags,

3929 len(self.folded_characters)) + self.folded_characters]

3930

3931 def dump(self, indent, reverse):

3932 display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")

3933 print("{}STRING {}{}".format(INDENT * indent, display,

3934 CASE_TEXT[self.case_flags]))

3935

3936 def max_width(self):

3937 return len(self.folded_characters)

3938

3939 def get_required_string(self, reverse):

3940 return 0, self

3941

3942class Literal(String):

3943 def dump(self, indent, reverse):

3944 literal = ''.join(chr(c) for c in self.characters)

3945 display = ascii(literal).lstrip("bu")

3946 print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,

3947 CASE_TEXT[self.case_flags]))

3948

3949class StringSet(Branch):

3950 def __init__(self, info, name, case_flags=NOCASE):

3951 self.info = info

3952 self.name = name

3953 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3954

3955 self._key = self.__class__, self.name, self.case_flags

3956

3957 self.set_key = (name, self.case_flags)

3958 if self.set_key not in info.named_lists_used:

3959 info.named_lists_used[self.set_key] = len(info.named_lists_used)

3960

3961 index = self.info.named_lists_used[self.set_key]

3962 items = self.info.kwargs[self.name]

3963

3964 case_flags = self.case_flags

3965

3966 encoding = self.info.flags & _ALL_ENCODINGS

3967 fold_flags = encoding | case_flags

3968

3969 choices = []

3970

3971 for string in items:

3972 if isinstance(string, str):

3973 string = [ord(c) for c in string]

3974

3975 choices.append([Character(c, case_flags=case_flags) for c in

3976 string])

3977

3978 # Sort from longest to shortest.

3979 choices.sort(key=len, reverse=True)

3980

3981 self.branches = [Sequence(choice) for choice in choices]

3982

3983 def dump(self, indent, reverse):

3984 print("{}STRING_SET {}{}".format(INDENT * indent, self.name,

3985 CASE_TEXT[self.case_flags]))

3986

3987 def __del__(self):

3988 self.info = None

3989

3990class Source:

3991 "Scanner for the regular expression source string."

3992 def __init__(self, string):

3993 if isinstance(string, str):

3994 self.string = string

3995 self.char_type = chr

3996 else:

3997 self.string = string.decode("latin-1")

3998 self.char_type = lambda c: bytes([c])

3999

4000 self.pos = 0

4001 self.ignore_space = False

4002 self.sep = string[ : 0]

4003

4004 def get(self, override_ignore=False):

4005 string = self.string

4006 pos = self.pos

4007

4008 try:

4009 if self.ignore_space and not override_ignore:

4010 while True:

4011 if string[pos].isspace():

4012 # Skip over the whitespace.

4013 pos += 1

4014 elif string[pos] == "#":

4015 # Skip over the comment to the end of the line.

4016 pos = string.index("\n", pos)

4017 else:

4018 break

4019

4020 ch = string[pos]

4021 self.pos = pos + 1

4022 return ch

4023 except IndexError:

4024 # We've reached the end of the string.

4025 self.pos = pos

4026 return string[ : 0]

4027 except ValueError:

4028 # The comment extended to the end of the string.

4029 self.pos = len(string)

4030 return string[ : 0]

4031

4032 def get_many(self, count=1):

4033 string = self.string

4034 pos = self.pos

4035

4036 try:

4037 if self.ignore_space:

4038 substring = []

4039

4040 while len(substring) < count:

4041 while True:

4042 if string[pos].isspace():

4043 # Skip over the whitespace.

4044 pos += 1

4045 elif string[pos] == "#":

4046 # Skip over the comment to the end of the line.

4047 pos = string.index("\n", pos)

4048 else:

4049 break

4050

4051 substring.append(string[pos])

4052 pos += 1

4053

4054 substring = "".join(substring)

4055 else:

4056 substring = string[pos : pos + count]

4057 pos += len(substring)

4058

4059 self.pos = pos

4060 return substring

4061 except IndexError:

4062 # We've reached the end of the string.

4063 self.pos = len(string)

4064 return "".join(substring)

4065 except ValueError:

4066 # The comment extended to the end of the string.

4067 self.pos = len(string)

4068 return "".join(substring)

4069

4070 def get_while(self, test_set, include=True, keep_spaces=False):

4071 string = self.string

4072 pos = self.pos

4073

4074 if self.ignore_space and not keep_spaces:

4075 try:

4076 substring = []

4077

4078 while True:

4079 if string[pos].isspace():

4080 # Skip over the whitespace.

4081 pos += 1

4082 elif string[pos] == "#":

4083 # Skip over the comment to the end of the line.

4084 pos = string.index("\n", pos)

4085 elif (string[pos] in test_set) == include:

4086 substring.append(string[pos])

4087 pos += 1

4088 else:

4089 break

4090

4091 self.pos = pos

4092 except IndexError:

4093 # We've reached the end of the string.

4094 self.pos = len(string)

4095 except ValueError:

4096 # The comment extended to the end of the string.

4097 self.pos = len(string)

4098

4099 return "".join(substring)

4100 else:

4101 try:

4102 while (string[pos] in test_set) == include:

4103 pos += 1

4104

4105 substring = string[self.pos : pos]

4106

4107 self.pos = pos

4108

4109 return substring

4110 except IndexError:

4111 # We've reached the end of the string.

4112 substring = string[self.pos : pos]

4113

4114 self.pos = pos

4115

4116 return substring

4117

4118 def skip_while(self, test_set, include=True):

4119 string = self.string

4120 pos = self.pos

4121

4122 try:

4123 if self.ignore_space:

4124 while True:

4125 if string[pos].isspace():

4126 # Skip over the whitespace.

4127 pos += 1

4128 elif string[pos] == "#":

4129 # Skip over the comment to the end of the line.

4130 pos = string.index("\n", pos)

4131 elif (string[pos] in test_set) == include:

4132 pos += 1

4133 else:

4134 break

4135 else:

4136 while (string[pos] in test_set) == include:

4137 pos += 1

4138

4139 self.pos = pos

4140 except IndexError:

4141 # We've reached the end of the string.

4142 self.pos = len(string)

4143 except ValueError:

4144 # The comment extended to the end of the string.

4145 self.pos = len(string)

4146

4147 def match(self, substring):

4148 string = self.string

4149 pos = self.pos

4150

4151 if self.ignore_space:

4152 try:

4153 for c in substring:

4154 while True:

4155 if string[pos].isspace():

4156 # Skip over the whitespace.

4157 pos += 1

4158 elif string[pos] == "#":

4159 # Skip over the comment to the end of the line.

4160 pos = string.index("\n", pos)

4161 else:

4162 break

4163

4164 if string[pos] != c:

4165 return False

4166

4167 pos += 1

4168

4169 self.pos = pos

4170

4171 return True

4172 except IndexError:

4173 # We've reached the end of the string.

4174 return False

4175 except ValueError:

4176 # The comment extended to the end of the string.

4177 return False

4178 else:

4179 if not string.startswith(substring, pos):

4180 return False

4181

4182 self.pos = pos + len(substring)

4183

4184 return True

4185

4186 def expect(self, substring):

4187 if not self.match(substring):

4188 raise error("missing {}".format(substring), self.string, self.pos)

4189

4190 def at_end(self):

4191 string = self.string

4192 pos = self.pos

4193

4194 try:

4195 if self.ignore_space:

4196 while True:

4197 if string[pos].isspace():

4198 pos += 1

4199 elif string[pos] == "#":

4200 pos = string.index("\n", pos)

4201 else:

4202 break

4203

4204 return pos >= len(string)

4205 except IndexError:

4206 # We've reached the end of the string.

4207 return True

4208 except ValueError:

4209 # The comment extended to the end of the string.

4210 return True

4211

4212class Info:

4213 "Info about the regular expression."

4214

4215 def __init__(self, flags=0, char_type=None, kwargs={}):

4216 flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]

4217 self.flags = flags

4218 self.global_flags = flags

4219 self.inline_locale = False

4220

4221 self.kwargs = kwargs

4222

4223 self.group_count = 0

4224 self.group_index = {}

4225 self.group_name = {}

4226 self.char_type = char_type

4227 self.named_lists_used = {}

4228 self.open_groups = []

4229 self.open_group_count = {}

4230 self.defined_groups = {}

4231 self.group_calls = []

4232 self.private_groups = {}

4233

4234 def open_group(self, name=None):

4235 group = self.group_index.get(name)

4236 if group is None:

4237 while True:

4238 self.group_count += 1

4239 if name is None or self.group_count not in self.group_name:

4240 break

4241

4242 group = self.group_count

4243 if name:

4244 self.group_index[name] = group

4245 self.group_name[group] = name

4246

4247 if group in self.open_groups:

4248 # We have a nested named group. We'll assign it a private group

4249 # number, initially negative until we can assign a proper

4250 # (positive) number.

4251 group_alias = -(len(self.private_groups) + 1)

4252 self.private_groups[group_alias] = group

4253 group = group_alias

4254

4255 self.open_groups.append(group)

4256 self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

4257

4258 return group

4259

4260 def close_group(self):

4261 self.open_groups.pop()

4262

4263 def is_open_group(self, name):

4264 # In version 1, a group reference can refer to an open group. We'll

4265 # just pretend the group isn't open.

4266 version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION

4267 if version == VERSION1:

4268 return False

4269

4270 if name.isdigit():

4271 group = int(name)

4272 else:

4273 group = self.group_index.get(name)

4274

4275 return group in self.open_groups

4276

4277def _check_group_features(info, parsed):

4278 """Checks whether the reverse and fuzzy features of the group calls match

4279 the groups which they call.

4280 """

4281 call_refs = {}

4282 additional_groups = []

4283 for call, reverse, fuzzy in info.group_calls:

4284 # Look up the reference of this group call.

4285 key = (call.group, reverse, fuzzy)

4286 ref = call_refs.get(key)

4287 if ref is None:

4288 # This group doesn't have a reference yet, so look up its features.

4289 if call.group == 0:

4290 # Calling the pattern as a whole.

4291 rev = bool(info.flags & REVERSE)

4292 fuz = isinstance(parsed, Fuzzy)

4293 if (rev, fuz) != (reverse, fuzzy):

4294 # The pattern as a whole doesn't have the features we want,

4295 # so we'll need to make a copy of it with the desired

4296 # features.

4297 additional_groups.append((CallRef(len(call_refs), parsed),

4298 reverse, fuzzy))

4299 else:

4300 # Calling a capture group.

4301 def_info = info.defined_groups[call.group]

4302 group = def_info[0]

4303 if def_info[1 : ] != (reverse, fuzzy):

4304 # The group doesn't have the features we want, so we'll

4305 # need to make a copy of it with the desired features.

4306 additional_groups.append((group, reverse, fuzzy))

4307

4308 ref = len(call_refs)

4309 call_refs[key] = ref

4310

4311 call.call_ref = ref

4312

4313 info.call_refs = call_refs

4314 info.additional_groups = additional_groups

4315

4316def _get_required_string(parsed, flags):

4317 "Gets the required string and related info of a parsed pattern."

4318

4319 req_offset, required = parsed.get_required_string(bool(flags & REVERSE))

4320 if required:

4321 required.required = True

4322 if req_offset >= UNLIMITED:

4323 req_offset = -1

4324

4325 req_flags = required.case_flags

4326 if not (flags & UNICODE):

4327 req_flags &= ~UNICODE

4328

4329 req_chars = required.folded_characters

4330 else:

4331 req_offset = 0

4332 req_chars = ()

4333 req_flags = 0

4334

4335 return req_offset, req_chars, req_flags

4336

4337class Scanner:

4338 def __init__(self, lexicon, flags=0):

4339 self.lexicon = lexicon

4340

4341 # Combine phrases into a compound pattern.

4342 patterns = []

4343 for phrase, action in lexicon:

4344 # Parse the regular expression.

4345 source = Source(phrase)

4346 info = Info(flags, source.char_type)

4347 source.ignore_space = bool(info.flags & VERBOSE)

4348 parsed = _parse_pattern(source, info)

4349 if not source.at_end():

4350 raise error("unbalanced parenthesis", source.string,

4351 source.pos)

4352

4353 # We want to forbid capture groups within each phrase.

4354 patterns.append(parsed.remove_captures())

4355

4356 # Combine all the subpatterns into one pattern.

4357 info = Info(flags)

4358 patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]

4359 parsed = Branch(patterns)

4360

4361 # Optimise the compound pattern.

4362 reverse = bool(info.flags & REVERSE)

4363 parsed = parsed.optimise(info, reverse)

4364 parsed = parsed.pack_characters(info)

4365

4366 # Get the required string.

4367 req_offset, req_chars, req_flags = _get_required_string(parsed,

4368 info.flags)

4369

4370 # Check the features of the groups.

4371 _check_group_features(info, parsed)

4372

4373 # Complain if there are any group calls. They are not supported by the

4374 # Scanner class.

4375 if info.call_refs:

4376 raise error("recursive regex not supported by Scanner",

4377 source.string, source.pos)

4378

4379 reverse = bool(info.flags & REVERSE)

4380

4381 # Compile the compound pattern. The result is a list of tuples.

4382 code = parsed.compile(reverse) + [(OP.SUCCESS, )]

4383

4384 # Flatten the code into a list of ints.

4385 code = _flatten_code(code)

4386

4387 if not parsed.has_simple_start():

4388 # Get the first set, if possible.

4389 try:

4390 fs_code = _compile_firstset(info, parsed.get_firstset(reverse))

4391 fs_code = _flatten_code(fs_code)

4392 code = fs_code + code

4393 except _FirstSetError:

4394 pass

4395

4396 # Check the global flags for conflicts.

4397 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

4398 if version not in (0, VERSION0, VERSION1):

4399 raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

4400

4401 # Create the PatternObject.

4402 #

4403 # Local flags like IGNORECASE affect the code generation, but aren't

4404 # needed by the PatternObject itself. Conversely, global flags like

4405 # LOCALE _don't_ affect the code generation but _are_ needed by the

4406 # PatternObject.

4407 self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,

4408 code, {}, {}, {}, [], req_offset, req_chars, req_flags,

4409 len(patterns))

4410

4411 def scan(self, string):

4412 result = []

4413 append = result.append

4414 match = self.scanner.scanner(string).match

4415 i = 0

4416 while True:

4417 m = match()

4418 if not m:

4419 break

4420 j = m.end()

4421 if i == j:

4422 break

4423 action = self.lexicon[m.lastindex - 1][1]

4424 if hasattr(action, '__call__'):

4425 self.match = m

4426 action = action(self, m.group())

4427 if action is not None:

4428 append(action)

4429 i = j

4430

4431 return result, string[i : ]

4432

4433# Get the known properties dict.

4434PROPERTIES = _regex.get_properties()

4435

4436# Build the inverse of the properties dict.

4437PROPERTY_NAMES = {}

4438for prop_name, (prop_id, values) in PROPERTIES.items():

4439 name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))

4440 name = max(name, prop_name, key=len)

4441 PROPERTY_NAMES[prop_id] = name, prop_values

4442

4443 for val_name, val_id in values.items():

4444 prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,

4445 key=len)

4446

4447# Character escape sequences.

4448CHARACTER_ESCAPES = {

4449 "a": "\a",

4450 "b": "\b",

4451 "f": "\f",

4452 "n": "\n",

4453 "r": "\r",

4454 "t": "\t",

4455 "v": "\v",

4456}

4457

4458# Predefined character set escape sequences.

4459CHARSET_ESCAPES = {

4460 "d": lookup_property(None, "Digit", True),

4461 "D": lookup_property(None, "Digit", False),

4462 "h": lookup_property(None, "Blank", True),

4463 "s": lookup_property(None, "Space", True),

4464 "S": lookup_property(None, "Space", False),

4465 "w": lookup_property(None, "Word", True),

4466 "W": lookup_property(None, "Word", False),

4467}

4468

4469# Positional escape sequences.

4470POSITION_ESCAPES = {

4471 "A": StartOfString(),

4472 "b": Boundary(),

4473 "B": Boundary(False),

4474 "K": Keep(),

4475 "m": StartOfWord(),

4476 "M": EndOfWord(),

4477 "Z": EndOfString(),

4478}

4479

4480# Positional escape sequences when WORD flag set.

4481WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)

4482WORD_POSITION_ESCAPES.update({

4483 "b": DefaultBoundary(),

4484 "B": DefaultBoundary(False),

4485 "m": DefaultStartOfWord(),

4486 "M": DefaultEndOfWord(),

4487})

4488

4489# Regex control verbs.

4490VERBS = {

4491 "FAIL": Failure(),

4492 "F": Failure(),

4493 "PRUNE": Prune(),

4494 "SKIP": Skip(),

4495}

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/regex/_regex_core.py: 51%

2774 statements