1#
2# Secret Labs' Regular Expression Engine core module
3#
4# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
5#
6# This version of the SRE library can be redistributed under CNRI's
7# Python 1.6 license. For any other use, please contact Secret Labs
8# AB (info@pythonware.com).
9#
10# Portions of this engine have been developed in cooperation with
11# CNRI. Hewlett-Packard provided funding for 1.6 integration and
12# other compatibility work.
13#
14# 2010-01-16 mrab Python front-end re-written and extended
15
16import enum
17import string
18import unicodedata
19from collections import defaultdict
20
21import regex._regex as _regex
22
# Public names re-exported by the top-level `regex` module.
__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",
  "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",
  "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",
  "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",
  "Scanner", "RegexFlag"]
28
# The regex exception.
class error(Exception):
    """Exception raised for invalid regular expressions.

    Attributes:

        msg: The unformatted error message
        pattern: The regular expression pattern
        pos: The position in the pattern where compilation failed, or None
        lineno: The line number where compilation failed, unless pos is None
        colno: The column number where compilation failed, unless pos is None
    """

    def __init__(self, message, pattern=None, pos=None):
        # The pattern may be str or bytes; choose a matching newline so the
        # count/rfind calls below compare like with like.
        newline = '\n' if isinstance(pattern, str) else b'\n'
        self.msg = message
        self.pattern = pattern
        self.pos = pos
        # NOTE: lineno/colno exist on the instance only when both pattern
        # and pos were supplied.
        if pattern is not None and pos is not None:
            self.lineno = pattern.count(newline, 0, pos) + 1
            # rfind returns -1 when pos is on the first line, which makes
            # colno 1-based there as well.
            self.colno = pos - pattern.rfind(newline, 0, pos)

            message = "{} at position {}".format(message, pos)

            # Mention line/column only for multi-line patterns.
            if newline in pattern:
                message += " (line {}, column {})".format(self.lineno,
                  self.colno)

        Exception.__init__(self, message)
58
# The exception for when a positional flag has been turned on in the old
# behaviour.
class _UnscopedFlagSet(Exception):
    # Raised during parsing to force a reparse with the global flag applied.
    pass
63
# The exception for when parsing fails and we want to try something else.
class ParseError(Exception):
    # Internal backtracking signal; never propagated to callers.
    pass
67
# The exception for when there isn't a valid first set.
class _FirstSetError(Exception):
    # Raised when a usable firstset cannot be built for the pattern.
    pass
71
# Flags.
class RegexFlag(enum.IntFlag):
    A = ASCII = 0x80 # Assume ASCII locale.
    B = BESTMATCH = 0x1000 # Best fuzzy match.
    D = DEBUG = 0x200 # Print parsed pattern.
    E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first
                              # fuzzy match.
    F = FULLCASE = 0x4000 # Unicode full case-folding.
    I = IGNORECASE = 0x2 # Ignore case.
    L = LOCALE = 0x4 # Assume current 8-bit locale.
    M = MULTILINE = 0x8 # Make anchors look for newline.
    P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).
    R = REVERSE = 0x400 # Search backwards.
    S = DOTALL = 0x10 # Make dot match newline.
    U = UNICODE = 0x20 # Assume Unicode locale.
    V0 = VERSION0 = 0x2000 # Old legacy behaviour.
    V1 = VERSION1 = 0x100 # New enhanced behaviour.
    W = WORD = 0x800 # Default Unicode word breaks.
    X = VERBOSE = 0x40 # Ignore whitespace and comments.
    T = TEMPLATE = 0x1 # Template (present because re module has it).

    def __repr__(self):
        # A single named member reprs as its canonical name, eg.
        # 'regex.IGNORECASE'.
        if self._name_ is not None:
            return 'regex.%s' % self._name_

        value = self._value_
        members = []
        negative = value < 0

        # For a negative (inverted) value, decompose its complement and
        # wrap the result in '~' at the end.
        if negative:
            value = ~value

        # Pull out each defined flag that is present in the value.
        for m in self.__class__:
            if value & m._value_:
                value &= ~m._value_
                members.append('regex.%s' % m._name_)

        # Any remaining bits have no named flag; render them as hex.
        if value:
            members.append(hex(value))

        res = '|'.join(members)

        if negative:
            if len(members) > 1:
                res = '~(%s)' % res
            else:
                res = '~%s' % res

        return res

    # IntFlag.__str__ would render differently; use the plain object
    # behaviour (which falls back to __repr__).
    __str__ = object.__str__
123
# Make the flag members (ASCII, IGNORECASE, ...) available at module level.
globals().update(RegexFlag.__members__)

DEFAULT_VERSION = VERSION1

_ALL_VERSIONS = VERSION0 | VERSION1
_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

# The default flags for the various versions.
DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

# The mask for the flags.
GLOBAL_FLAGS = (_ALL_VERSIONS | BESTMATCH | DEBUG | ENHANCEMATCH | POSIX |
  REVERSE)
SCOPED_FLAGS = (FULLCASE | IGNORECASE | MULTILINE | DOTALL | WORD | VERBOSE |
  _ALL_ENCODINGS)

# Character classes used by the parser.
ALPHA = frozenset(string.ascii_letters)
DIGITS = frozenset(string.digits)
ALNUM = ALPHA | DIGITS
OCT_DIGITS = frozenset(string.octdigits)
HEX_DIGITS = frozenset(string.hexdigits)
# Characters with special meaning outside a set; "" marks end-of-pattern.
SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])
NAMED_CHAR_PART = ALNUM | frozenset(" -")
PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")
# The binary set operators available inside character sets.
SET_OPS = ("||", "~~", "&&", "--")

# The width of the code words inside the regex engine.
BYTES_PER_CODE = _regex.get_code_size()
BITS_PER_CODE = BYTES_PER_CODE * 8

# The repeat count which represents infinity.
UNLIMITED = (1 << BITS_PER_CODE) - 1

# The regular expression flags, keyed by their inline-flag letter.
REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,
  "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,
  "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":
  VERBOSE}

# The case flags.
CASE_FLAGS = FULLCASE | IGNORECASE
NOCASE = 0
FULLIGNORECASE = FULLCASE | IGNORECASE

FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

# FULLCASE on its own has no effect; it only modifies IGNORECASE.
CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,
  FULLIGNORECASE: FULLIGNORECASE}

# The number of digits in hexadecimal escapes.
HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}
175
# The names of the opcodes.
# Each name is assigned its index in this list (see the enumerate loop
# below); presumably the ordering must match the opcode numbering in the
# _regex C engine -- confirm against the engine before reordering.
OPCODES = """
FAILURE
SUCCESS
ANY
ANY_ALL
ANY_ALL_REV
ANY_REV
ANY_U
ANY_U_REV
ATOMIC
BOUNDARY
BRANCH
CALL_REF
CHARACTER
CHARACTER_IGN
CHARACTER_IGN_REV
CHARACTER_REV
CONDITIONAL
DEFAULT_BOUNDARY
DEFAULT_END_OF_WORD
DEFAULT_START_OF_WORD
END
END_OF_LINE
END_OF_LINE_U
END_OF_STRING
END_OF_STRING_LINE
END_OF_STRING_LINE_U
END_OF_WORD
FUZZY
GRAPHEME_BOUNDARY
GREEDY_REPEAT
GROUP
GROUP_CALL
GROUP_EXISTS
KEEP
LAZY_REPEAT
LOOKAROUND
NEXT
PROPERTY
PROPERTY_IGN
PROPERTY_IGN_REV
PROPERTY_REV
PRUNE
RANGE
RANGE_IGN
RANGE_IGN_REV
RANGE_REV
REF_GROUP
REF_GROUP_FLD
REF_GROUP_FLD_REV
REF_GROUP_IGN
REF_GROUP_IGN_REV
REF_GROUP_REV
SEARCH_ANCHOR
SET_DIFF
SET_DIFF_IGN
SET_DIFF_IGN_REV
SET_DIFF_REV
SET_INTER
SET_INTER_IGN
SET_INTER_IGN_REV
SET_INTER_REV
SET_SYM_DIFF
SET_SYM_DIFF_IGN
SET_SYM_DIFF_IGN_REV
SET_SYM_DIFF_REV
SET_UNION
SET_UNION_IGN
SET_UNION_IGN_REV
SET_UNION_REV
SKIP
START_OF_LINE
START_OF_LINE_U
START_OF_STRING
START_OF_WORD
STRING
STRING_FLD
STRING_FLD_REV
STRING_IGN
STRING_IGN_REV
STRING_REV
FUZZY_EXT
"""
260
# Define the opcodes in a namespace.
class Namespace:
    # An empty attribute holder; the opcode names are set on OP below.
    pass

OP = Namespace()
# Assign each opcode name its index, eg. OP.FAILURE == 0, OP.SUCCESS == 1.
for i, op in enumerate(OPCODES.split()):
    setattr(OP, op, i)
268
269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):
270 """Make room in the given cache.
271
272 Args:
273 cache_dict: The cache dictionary to modify.
274 args_dict: The dictionary of named list args used by patterns.
275 max_length: Maximum # of entries in cache_dict before it is shrunk.
276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.
277 """
278 # Toss out a fraction of the entries at random to make room for new ones.
279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()
280 # as popitem could penalize the same regular expression repeatedly based
281 # on its internal hash value. Being random should spread the cache miss
282 # love around.
283 cache_keys = tuple(cache_dict.keys())
284 overage = len(cache_keys) - max_length
285 if overage < 0:
286 # Cache is already within limits. Normally this should not happen
287 # but it could due to multithreading.
288 return
289
290 number_to_toss = max_length // divisor + overage
291
292 # The import is done here to avoid a circular dependency.
293 import random
294 if not hasattr(random, 'sample'):
295 # Do nothing while resolving the circular dependency:
296 # re->random->warnings->tokenize->string->re
297 return
298
299 for doomed_key in random.sample(cache_keys, number_to_toss):
300 try:
301 del cache_dict[doomed_key]
302 except KeyError:
303 # Ignore problems if the cache changed from another thread.
304 pass
305
306 # Rebuild the arguments and locale-sensitivity dictionaries.
307 args_dict.clear()
308 sensitivity_dict = {}
309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):
310 args_dict[pattern, pattern_type, flags, default_version, locale] = args
311 try:
312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]
313 except KeyError:
314 pass
315
316 locale_sensitive.clear()
317 locale_sensitive.update(sensitivity_dict)
318
def _fold_case(info, string):
    """Return *string* case-folded according to the pattern's flags."""
    flags = info.flags
    # With no explicit encoding flag, fall back to the guessed encoding.
    if not flags & _ALL_ENCODINGS:
        flags |= info.guess_encoding

    return _regex.fold_case(flags, string)
326
def is_cased_i(info, char):
    """Report whether *char* has more than one case variant."""
    variants = _regex.get_all_cases(info.flags, char)
    return len(variants) > 1
330
def is_cased_f(flags, char):
    """Report whether *char* has more than one case variant under *flags*."""
    variants = _regex.get_all_cases(flags, char)
    return len(variants) > 1
334
def _compile_firstset(info, fs):
    """Compile the firstset for the pattern, or return [] if unusable."""
    reverse = bool(info.flags & REVERSE)
    checked = _check_firstset(info, reverse, fs)
    # An absent or match-anything firstset offers no optimisation.
    if not checked or isinstance(checked, AnyAll):
        return []

    return checked.compile(reverse)
344
def _check_firstset(info, reverse, fs):
    "Checks the firstset for the pattern."
    # A None member means some branch had no usable first item.
    if not fs or None in fs:
        return None

    # If we ignore the case, for simplicity we won't build a firstset.
    members = set()
    case_flags = NOCASE
    for i in fs:
        # Negated characters can't contribute to a simple firstset.
        if isinstance(i, Character) and not i.positive:
            return None

#        if i.case_flags:
#            if isinstance(i, Character):
#                if is_cased_i(info, i.value):
#                    return []
#            elif isinstance(i, SetBase):
#                return []
        case_flags |= i.case_flags
        members.add(i.with_flags(case_flags=NOCASE))

    # Full case-folding can't be represented in a simple firstset.
    if case_flags == (FULLCASE | IGNORECASE):
        return None

    # Build the firstset.
    fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,
      zerowidth=True)
    fs = fs.optimise(info, reverse, in_set=True)

    return fs
375
376def _flatten_code(code):
377 "Flattens the code from a list of tuples."
378 flat_code = []
379 for c in code:
380 flat_code.extend(c)
381
382 return flat_code
383
def make_case_flags(info):
    """Return the pattern's case flags for the current parse state."""
    # Full case-folding is a Unicode notion, so it's dropped in ASCII mode.
    if info.flags & ASCII:
        return info.flags & CASE_FLAGS & ~FULLCASE
    return info.flags & CASE_FLAGS
393
def make_character(info, value, in_set=False):
    """Make a character literal node for codepoint *value*."""
    if in_set:
        # Inside a set the character is built case-sensitively; the set
        # itself carries the case flags.
        return Character(value)

    return Character(value, case_flags=make_case_flags(info))
401
def make_ref_group(info, name, position):
    "Makes a group reference."
    # position is the offset in the pattern, kept for error reporting.
    return RefGroup(info, name, position, case_flags=make_case_flags(info))
405
def make_string_set(info, name):
    "Makes a string set."
    # name refers to a named list supplied to the compile call.
    return StringSet(info, name, case_flags=make_case_flags(info))
409
def make_property(info, prop, in_set):
    """Make a property node, applying case flags when outside a set."""
    # Inside a set the property is used as-is; the set handles case.
    if in_set:
        return prop
    return prop.with_flags(case_flags=make_case_flags(info))
416
def _parse_pattern(source, info):
    "Parses a pattern, eg. 'a|b|c'."
    # Collect the alternatives separated by '|'.
    branches = [parse_sequence(source, info)]
    while source.match("|"):
        branches.append(parse_sequence(source, info))

    # A single alternative needs no Branch wrapper.
    return branches[0] if len(branches) == 1 else Branch(branches)
426
def parse_sequence(source, info):
    "Parses a sequence, eg. 'abc'."
    # The sequence is seeded with a None marker so that a leading
    # quantifier is detected as having nothing to repeat.
    sequence = [None]
    case_flags = make_case_flags(info)
    while True:
        saved_pos = source.pos
        ch = source.get()
        if ch in SPECIAL_CHARS:
            if ch in ")|":
                # The end of a sequence. At the end of the pattern ch is "".
                source.pos = saved_pos
                break
            elif ch == "\\":
                # An escape sequence outside a set.
                sequence.append(parse_escape(source, info, False))
            elif ch == "(":
                # A parenthesised subpattern or a flag.
                element = parse_paren(source, info)
                if element is None:
                    # It was an inline flag; the case flags may have changed.
                    case_flags = make_case_flags(info)
                else:
                    sequence.append(element)
            elif ch == ".":
                # Any character.
                if info.flags & DOTALL:
                    sequence.append(AnyAll())
                elif info.flags & WORD:
                    sequence.append(AnyU())
                else:
                    sequence.append(Any())
            elif ch == "[":
                # A character set.
                sequence.append(parse_set(source, info))
            elif ch == "^":
                # The start of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(StartOfLineU())
                    else:
                        sequence.append(StartOfLine())
                else:
                    sequence.append(StartOfString())
            elif ch == "$":
                # The end of a line or the string.
                if info.flags & MULTILINE:
                    if info.flags & WORD:
                        sequence.append(EndOfLineU())
                    else:
                        sequence.append(EndOfLine())
                else:
                    if info.flags & WORD:
                        sequence.append(EndOfStringLineU())
                    else:
                        sequence.append(EndOfStringLine())
            elif ch in "?*+{":
                # Looks like a quantifier.
                counts = parse_quantifier(source, info, ch)
                if counts:
                    # It _is_ a quantifier.
                    apply_quantifier(source, info, counts, case_flags, ch,
                      saved_pos, sequence)
                    # The marker stops a following quantifier from
                    # re-quantifying this element.
                    sequence.append(None)
                else:
                    # It's not a quantifier. Maybe it's a fuzzy constraint.
                    constraints = parse_fuzzy(source, info, ch, case_flags)
                    if constraints:
                        # It _is_ a fuzzy constraint.
                        apply_constraint(source, info, constraints, case_flags,
                          saved_pos, sequence)
                        sequence.append(None)
                    else:
                        # The element was just a literal.
                        sequence.append(Character(ord(ch),
                          case_flags=case_flags))
            else:
                # A literal.
                sequence.append(Character(ord(ch), case_flags=case_flags))
        else:
            # A literal.
            sequence.append(Character(ord(ch), case_flags=case_flags))

    # Drop the None markers before building the Sequence.
    sequence = [item for item in sequence if item is not None]
    return Sequence(sequence)
510
def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,
  sequence):
    """Applies a quantifier to the last element of the sequence.

    counts is the (min, max) pair already parsed; saved_pos is the
    position of the quantifier character, used for error reporting.
    """
    element = sequence.pop()
    if element is None:
        # The previous entry was a marker, so the quantifier has no target.
        if sequence:
            raise error("multiple repeat", source.string, saved_pos)
        raise error("nothing to repeat", source.string, saved_pos)

    if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):
        raise error("multiple repeat", source.string, saved_pos)

    min_count, max_count = counts
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # The "?" suffix that means it's a lazy repeat.
        repeated = LazyRepeat
    elif ch == "+":
        # The "+" suffix that means it's a possessive repeat.
        repeated = PossessiveRepeat
    else:
        # No suffix means that it's a greedy repeat.
        source.pos = saved_pos
        repeated = GreedyRepeat

    # Ignore the quantifier if it applies to a zero-width item or the number of
    # repeats is fixed at 1.
    if not element.is_empty() and (min_count != 1 or max_count != 1):
        element = repeated(element, min_count, max_count)

    sequence.append(element)
542
def apply_constraint(source, info, constraints, case_flags, saved_pos,
  sequence):
    """Apply a fuzzy constraint to the last element of the sequence."""
    target = sequence.pop()
    if target is None:
        raise error("nothing for fuzzy constraint", source.string, saved_pos)

    # When a group is marked as fuzzy, the fuzziness is kept inside the
    # group itself.
    if isinstance(target, Group):
        target.subpattern = Fuzzy(target.subpattern, constraints)
        sequence.append(target)
    else:
        sequence.append(Fuzzy(target, constraints))
556
# Quantifier shorthands mapped to their (min, max) counts; None = unbounded.
_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}
558
def parse_quantifier(source, info, ch):
    """Parse a quantifier, returning its (min, max) counts or None."""
    # The single-character shorthands: '?', '*', '+'.
    counts = _QUANTIFIERS.get(ch)
    if counts:
        return counts

    if ch != "{":
        return None

    # Looks like a bounded repeat, eg. 'a{2,3}'; returns None if it isn't.
    return parse_limited_quantifier(source)
573
def is_above_limit(count):
    """Check whether a repeat count exceeds the engine's maximum."""
    if count is None:
        # None means unbounded, which the engine represents specially.
        return False
    return count >= UNLIMITED
577
def parse_limited_quantifier(source):
    """Parse a bounded quantifier such as '{2,3}'; None if it isn't one."""
    start = source.pos
    lo_digits = parse_count(source)

    if source.match(","):
        hi_digits = parse_count(source)

        # An absent minimum means 0; an absent maximum means unlimited.
        lo = int(lo_digits or 0)
        hi = int(hi_digits) if hi_digits else None
    elif lo_digits:
        # A single count, eg. '{3}'.
        lo = hi = int(lo_digits)
    else:
        # Not a quantifier after all.
        source.pos = start
        return None

    if not source.match("}"):
        # No closing brace; treat it as a literal.
        source.pos = start
        return None

    if is_above_limit(lo) or is_above_limit(hi):
        raise error("repeat count too big", source.string, start)

    if hi is not None and lo > hi:
        raise error("min repeat greater than max repeat", source.string,
          start)

    return lo, hi
607
def parse_fuzzy(source, info, ch, case_flags):
    "Parses a fuzzy setting, if present."
    saved_pos = source.pos

    # Fuzzy settings use the brace syntax, eg. '{e<=2}'.
    if ch != "{":
        return None

    constraints = {}
    try:
        parse_fuzzy_item(source, constraints)
        while source.match(","):
            parse_fuzzy_item(source, constraints)
    except ParseError:
        # Not a fuzzy setting after all; backtrack.
        source.pos = saved_pos
        return None

    # An optional test pattern may follow after ':'.
    if source.match(":"):
        constraints["test"] = parse_fuzzy_test(source, info, case_flags)

    if not source.match("}"):
        raise error("expected }", source.string, source.pos)

    return constraints
631
def parse_fuzzy_item(source, constraints):
    "Parses a fuzzy setting item."
    saved_pos = source.pos
    try:
        parse_cost_constraint(source, constraints)
    except ParseError:
        # Not a cost constraint; backtrack and try a cost equation instead.
        source.pos = saved_pos

        parse_cost_equation(source, constraints)
641
def parse_cost_constraint(source, constraints):
    "Parses a cost constraint, eg. 'e<=2' or '1<=e<=2'."
    saved_pos = source.pos
    ch = source.get()
    if ch in ALPHA:
        # Syntax: constraint [("<=" | "<") cost]
        constraint = parse_constraint(source, constraints, ch)

        max_inc = parse_fuzzy_compare(source)

        if max_inc is None:
            # No maximum cost.
            constraints[constraint] = 0, None
        else:
            # There's a maximum cost.
            cost_pos = source.pos
            max_cost = parse_cost_limit(source)

            # Inclusive or exclusive limit?
            if not max_inc:
                max_cost -= 1

            if max_cost < 0:
                raise error("bad fuzzy cost limit", source.string, cost_pos)

            constraints[constraint] = 0, max_cost
    elif ch in DIGITS:
        # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost
        source.pos = saved_pos

        # Minimum cost.
        cost_pos = source.pos
        min_cost = parse_cost_limit(source)

        min_inc = parse_fuzzy_compare(source)
        if min_inc is None:
            raise ParseError()

        constraint = parse_constraint(source, constraints, source.get())

        max_inc = parse_fuzzy_compare(source)
        if max_inc is None:
            raise ParseError()

        # Maximum cost.
        cost_pos = source.pos
        max_cost = parse_cost_limit(source)

        # Inclusive or exclusive limits?
        if not min_inc:
            min_cost += 1
        if not max_inc:
            max_cost -= 1

        # The range must still be sane after adjusting strict comparisons.
        if not 0 <= min_cost <= max_cost:
            raise error("bad fuzzy cost limit", source.string, cost_pos)

        constraints[constraint] = min_cost, max_cost
    else:
        raise ParseError()
702
def parse_cost_limit(source):
    """Parse a numeric fuzzy cost limit."""
    cost_pos = source.pos
    digits = parse_count(source)

    # parse_count returns only decimal digits, so non-emptiness is enough
    # for int() to succeed.
    if digits:
        return int(digits)

    raise error("bad fuzzy cost limit", source.string, cost_pos)
714
def parse_constraint(source, constraints, ch):
    """Validate a constraint letter, rejecting unknown or duplicate ones."""
    # Valid letters are d, e, i, s; each may appear at most once.
    if ch not in "deis" or ch in constraints:
        raise ParseError()

    return ch
724
def parse_fuzzy_compare(source):
    """Parse '<=' or '<': True if inclusive, False if strict, else None."""
    # Try the longer operator first so '<=' isn't read as '<'.
    if source.match("<="):
        return True
    if source.match("<"):
        return False
    return None
733
def parse_cost_equation(source, constraints):
    "Parses a cost equation, eg. '2i+2d+1s<=4'."
    # Only one cost equation is allowed per fuzzy setting.
    if "cost" in constraints:
        raise error("more than one cost equation", source.string, source.pos)

    cost = {}

    parse_cost_term(source, cost)
    while source.match("+"):
        parse_cost_term(source, cost)

    max_inc = parse_fuzzy_compare(source)
    if max_inc is None:
        raise ParseError()

    max_cost = int(parse_count(source))

    # A strict '<' limit is converted to an inclusive one.
    if not max_inc:
        max_cost -= 1

    if max_cost < 0:
        raise error("bad fuzzy cost limit", source.string, source.pos)

    cost["max"] = max_cost

    constraints["cost"] = cost
760
def parse_cost_term(source, cost):
    """Parse one term of a cost equation into the *cost* dict."""
    coefficient = parse_count(source)
    term = source.get()
    if term not in "dis":
        raise ParseError()

    if term in cost:
        raise error("repeated fuzzy cost", source.string, source.pos)

    # A missing coefficient defaults to 1.
    cost[term] = int(coefficient) if coefficient else 1
772
def parse_fuzzy_test(source, info, case_flags):
    "Parses the test pattern of a fuzzy setting."
    saved_pos = source.pos
    ch = source.get()
    if ch in SPECIAL_CHARS:
        if ch == "\\":
            # An escape sequence outside a set.
            return parse_escape(source, info, False)
        elif ch == ".":
            # Any character.
            if info.flags & DOTALL:
                return AnyAll()
            elif info.flags & WORD:
                return AnyU()
            else:
                return Any()
        elif ch == "[":
            # A character set.
            return parse_set(source, info)
        else:
            raise error("expected character set", source.string, saved_pos)
    elif ch:
        # A literal.
        return Character(ord(ch), case_flags=case_flags)
    else:
        # The end of the pattern was reached.
        raise error("expected character set", source.string, saved_pos)
798
def parse_count(source):
    "Parses a quantifier's count, which can be empty."
    # Returns the (possibly empty) run of decimal digits as a string.
    return source.get_while(DIGITS)
802
def parse_paren(source, info):
    """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an
    inline flag.
    """
    saved_pos = source.pos
    ch = source.get(True)
    if ch == "?":
        # (?...
        saved_pos_2 = source.pos
        ch = source.get(True)
        if ch == "<":
            # (?<...
            saved_pos_3 = source.pos
            ch = source.get()
            if ch in ("=", "!"):
                # (?<=... or (?<!...: lookbehind.
                return parse_lookaround(source, info, True, ch == "=")

            # (?<...: a named capture group.
            source.pos = saved_pos_3
            name = parse_name(source)
            group = info.open_group(name)
            source.expect(">")
            saved_flags = info.flags
            try:
                subpattern = _parse_pattern(source, info)
                source.expect(")")
            finally:
                # Flags changed inside the group are scoped to it.
                info.flags = saved_flags
                source.ignore_space = bool(info.flags & VERBOSE)

            info.close_group()
            return Group(info, group, subpattern)
        if ch in ("=", "!"):
            # (?=... or (?!...: lookahead.
            return parse_lookaround(source, info, False, ch == "=")
        if ch == "P":
            # (?P...: a Python extension.
            return parse_extension(source, info)
        if ch == "#":
            # (?#...: a comment.
            return parse_comment(source)
        if ch == "(":
            # (?(...: a conditional subpattern.
            return parse_conditional(source, info)
        if ch == ">":
            # (?>...: an atomic subpattern.
            return parse_atomic(source, info)
        if ch == "|":
            # (?|...: a common/reset groups branch.
            return parse_common(source, info)
        if ch == "R" or "0" <= ch <= "9":
            # (?R...: probably a call to a group.
            return parse_call_group(source, info, ch, saved_pos_2)
        if ch == "&":
            # (?&...: a call to a named group.
            return parse_call_named_group(source, info, saved_pos_2)

        # (?...: probably a flags subpattern.
        source.pos = saved_pos_2
        return parse_flags_subpattern(source, info)

    if ch == "*":
        # (*...
        saved_pos_2 = source.pos
        word = source.get_while(set(")>"), include=False)
        if word[ : 1].isalpha():
            # A verb, eg. (*PRUNE).
            verb = VERBS.get(word)
            if not verb:
                raise error("unknown verb", source.string, saved_pos_2)

            source.expect(")")

            return verb

    # (...: an unnamed capture group.
    source.pos = saved_pos
    group = info.open_group()
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags changed inside the group are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    info.close_group()

    return Group(info, group, subpattern)
892
def parse_extension(source, info):
    "Parses a Python extension."
    saved_pos = source.pos
    ch = source.get()
    if ch == "<":
        # (?P<...: a named capture group.
        name = parse_name(source)
        group = info.open_group(name)
        source.expect(">")
        saved_flags = info.flags
        try:
            subpattern = _parse_pattern(source, info)
            source.expect(")")
        finally:
            # Flags changed inside the group are scoped to it.
            info.flags = saved_flags
            source.ignore_space = bool(info.flags & VERBOSE)

        info.close_group()

        return Group(info, group, subpattern)
    if ch == "=":
        # (?P=...: a named group reference.
        name = parse_name(source, allow_numeric=True)
        source.expect(")")
        # A group can't be referenced while it's still being parsed.
        if info.is_open_group(name):
            raise error("cannot refer to an open group", source.string,
              saved_pos)

        return make_ref_group(info, name, saved_pos)
    if ch == ">" or ch == "&":
        # (?P>...: a call to a group.
        return parse_call_named_group(source, info, saved_pos)

    source.pos = saved_pos
    raise error("unknown extension", source.string, saved_pos)
928
def parse_comment(source):
    "Parses a comment."
    while True:
        saved_pos = source.pos
        c = source.get(True)

        if not c or c == ")":
            # End of the comment (or of the pattern).
            break

        if c == "\\":
            # A backslash escapes the next character, even ')'.
            c = source.get(True)

    # Back up so that expect() can verify (and consume) the ')'.
    source.pos = saved_pos
    source.expect(")")

    return None
945
def parse_lookaround(source, info, behind, positive):
    "Parses a lookaround."
    # Flags changed inside the lookaround are scoped to it.
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return LookAround(behind, positive, subpattern)
957
def parse_conditional(source, info):
    "Parses a conditional subpattern."
    saved_flags = info.flags
    saved_pos = source.pos
    ch = source.get()
    if ch == "?":
        # (?(?...
        ch = source.get()
        if ch in ("=", "!"):
            # (?(?=... or (?(?!...: lookahead conditional.
            return parse_lookaround_conditional(source, info, False, ch == "=")
        if ch == "<":
            # (?(?<...
            ch = source.get()
            if ch in ("=", "!"):
                # (?(?<=... or (?(?<!...: lookbehind conditional.
                return parse_lookaround_conditional(source, info, True, ch ==
                  "=")

            source.pos = saved_pos
            raise error("expected lookaround conditional", source.string,
              source.pos)

    source.pos = saved_pos
    try:
        # The condition is a group name or number.
        group = parse_name(source, True)
        source.expect(")")
        yes_branch = parse_sequence(source, info)
        if source.match("|"):
            no_branch = parse_sequence(source, info)
        else:
            # No alternative branch; default to an empty sequence.
            no_branch = Sequence()

        source.expect(")")
    finally:
        # Restore the flags scoped to the conditional.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    # Both branches empty makes the conditional a no-op.
    if yes_branch.is_empty() and no_branch.is_empty():
        return Sequence()

    return Conditional(info, group, yes_branch, no_branch, saved_pos)
1000
def parse_lookaround_conditional(source, info, behind, positive):
    "Parses a lookaround conditional, eg. '(?(?=...)...|...)'."
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Flags changed inside the lookaround are scoped to it.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    yes_branch = parse_sequence(source, info)
    if source.match("|"):
        no_branch = parse_sequence(source, info)
    else:
        # No alternative branch; default to an empty sequence.
        no_branch = Sequence()

    source.expect(")")

    return LookAroundConditional(behind, positive, subpattern, yes_branch,
      no_branch)
1020
def parse_atomic(source, info):
    "Parses an atomic subpattern."
    # Flags changed inside the atomic group are scoped to it.
    saved_flags = info.flags
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return Atomic(subpattern)
1032
def parse_common(source, info):
    "Parses a common groups branch, eg. '(?|...|...)'."
    # Capture group numbers in different branches can reuse the group numbers.
    initial_group_count = info.group_count
    branches = [parse_sequence(source, info)]
    final_group_count = info.group_count
    while source.match("|"):
        # Each branch restarts numbering from the initial count.
        info.group_count = initial_group_count
        branches.append(parse_sequence(source, info))
        final_group_count = max(final_group_count, info.group_count)

    # The construct as a whole uses the highest count of any branch.
    info.group_count = final_group_count
    source.expect(")")

    if len(branches) == 1:
        return branches[0]
    return Branch(branches)
1050
def parse_call_group(source, info, ch, pos):
    """Parse a call to a numbered group, eg. '(?1)'; '(?R)' calls group 0."""
    # "R" means recursion into the whole pattern, ie. group 0.
    group = "0" if ch == "R" else ch + source.get_while(DIGITS)

    source.expect(")")

    return CallGroup(info, group, pos)
1061
def parse_call_named_group(source, info, pos):
    "Parses a call to a named group."
    # pos is the position of the call, kept for error reporting.
    group = parse_name(source)
    source.expect(")")

    return CallGroup(info, group, pos)
1068
def parse_flag_set(source):
    "Parses a set of inline flags."
    flags = 0

    # Accumulate flag letters until one isn't a valid flag; the KeyError
    # from the lookup terminates the loop.
    try:
        while True:
            saved_pos = source.pos
            ch = source.get()
            # "V" must be followed by a version digit ("V0" or "V1").
            if ch == "V":
                ch += source.get()
            flags |= REGEX_FLAGS[ch]
    except KeyError:
        # Not a flag character; put it back and stop.
        source.pos = saved_pos

    return flags
1084
def parse_flags(source, info):
    "Parses flags being turned on/off."
    flags_on = parse_flag_set(source)
    if source.match("-"):
        flags_off = parse_flag_set(source)
        if not flags_off:
            raise error("bad inline flags: no flags after '-'", source.string,
              source.pos)
    else:
        flags_off = 0

    if flags_on & LOCALE:
        # Remember that this pattern has an inline locale flag.
        info.inline_locale = True

    return flags_on, flags_off
1101
def parse_subpattern(source, info, flags_on, flags_off):
    """Parses a subpattern with scoped flags, eg. '(?i:...)'.

    The flags in flags_on/flags_off apply only inside the subpattern; the
    enclosing flags are restored afterwards.
    """
    saved_flags = info.flags
    info.flags = (info.flags | flags_on) & ~flags_off

    # Ensure that there aren't multiple encoding flags set: a newly
    # supplied encoding flag replaces any inherited one.  (This must test
    # flags_on, not info.flags, otherwise an inherited encoding flag would
    # be wiped even when the subpattern sets no encoding of its own.)
    if flags_on & (ASCII | LOCALE | UNICODE):
        info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

    source.ignore_space = bool(info.flags & VERBOSE)
    try:
        subpattern = _parse_pattern(source, info)
        source.expect(")")
    finally:
        # Restore the enclosing scope's flags.
        info.flags = saved_flags
        source.ignore_space = bool(info.flags & VERBOSE)

    return subpattern
1120
def parse_flags_subpattern(source, info):
    """Parses a flags subpattern. It could be inline flags or a subpattern
    possibly with local flags. If it's a subpattern, then that's returned;
    if it's a inline flags, then None is returned.
    """
    flags_on, flags_off = parse_flags(source, info)

    if flags_off & GLOBAL_FLAGS:
        raise error("bad inline flags: cannot turn off global flag",
          source.string, source.pos)

    if flags_on & flags_off:
        raise error("bad inline flags: flag turned on and off", source.string,
          source.pos)

    # Handle flags which are global in all regex behaviours.
    new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS
    if new_global_flags:
        info.global_flags |= new_global_flags

        # A global has been turned on, so reparse the pattern.
        raise _UnscopedFlagSet(info.global_flags)

    # Ensure that from now on we have only scoped flags.
    flags_on &= ~GLOBAL_FLAGS

    if source.match(":"):
        # A subpattern with scoped flags, eg. '(?i:...)'.
        return parse_subpattern(source, info, flags_on, flags_off)

    if source.match(")"):
        # Inline positional flags, eg. '(?i)'.
        parse_positional_flags(source, info, flags_on, flags_off)
        return None

    raise error("unknown extension", source.string, source.pos)
1155
def parse_positional_flags(source, info, flags_on, flags_off):
    """Parses positional flags, applying them from this point onwards."""
    new_flags = (info.flags | flags_on) & ~flags_off
    info.flags = new_flags
    source.ignore_space = bool(new_flags & VERBOSE)
1160
def parse_name(source, allow_numeric=False, allow_group_0=False):
    """Parses a group name, which may be numeric if that's permitted."""
    name = source.get_while(set(")>"), include=False)
    if not name:
        raise error("missing group name", source.string, source.pos)

    if name.isdigit():
        # A numeric name must be allowed and not below the minimum group.
        lowest = 0 if allow_group_0 else 1
        valid = allow_numeric and int(name) >= lowest
    else:
        # A symbolic name must be a valid identifier.
        valid = name.isidentifier()

    if not valid:
        raise error("bad character in group name", source.string, source.pos)

    return name
1179
def is_octal(string):
    """Checks whether every character of a string is an octal digit."""
    return frozenset(string).issubset(OCT_DIGITS)
1183
def is_decimal(string):
    """Checks whether every character of a string is a decimal digit."""
    return frozenset(string).issubset(DIGITS)
1187
def is_hexadecimal(string):
    """Checks whether every character of a string is a hex digit."""
    return frozenset(string).issubset(HEX_DIGITS)
1191
def parse_escape(source, info, in_set):
    """Parses an escape sequence (the backslash has been consumed).

    in_set: whether the escape appears inside a character set; several
    escapes (\\g, \\G, \\L, \\R, \\X and positional escapes) aren't allowed
    there.
    """
    # The character after the backslash is always significant, even in
    # verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    ch = source.get()
    source.ignore_space = saved_ignore
    if not ch:
        # A backslash at the end of the pattern.
        raise error("bad escape (end of pattern)", source.string, source.pos)
    if ch in HEX_ESCAPES:
        # A hexadecimal escape sequence.
        return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)
    elif ch == "g" and not in_set:
        # A group reference.
        saved_pos = source.pos
        try:
            return parse_group_ref(source, info)
        except error:
            # Invalid as a group reference, so assume it's a literal.
            source.pos = saved_pos

        return make_character(info, ord(ch), in_set)
    elif ch == "G" and not in_set:
        # A search anchor.
        return SearchAnchor()
    elif ch == "L" and not in_set:
        # A string set.
        return parse_string_set(source, info)
    elif ch == "N":
        # A named codepoint.
        return parse_named_char(source, info, in_set)
    elif ch in "pP":
        # A Unicode property, positive or negative.
        return parse_property(source, info, ch == "p", in_set)
    elif ch == "R" and not in_set:
        # A line ending.
        charset = [0x0A, 0x0B, 0x0C, 0x0D]
        if info.guess_encoding == UNICODE:
            charset.extend([0x85, 0x2028, 0x2029])

        # CR LF is matched as a unit; the match is atomic so it can't be
        # split by backtracking.
        return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)
          for c in charset])]))
    elif ch == "X" and not in_set:
        # A grapheme cluster.
        return Grapheme()
    elif ch in ALPHA:
        # An alphabetic escape sequence.
        # Positional escapes aren't allowed inside a character set.
        if not in_set:
            if info.flags & WORD:
                value = WORD_POSITION_ESCAPES.get(ch)
            elif info.flags & ASCII:
                value = ASCII_POSITION_ESCAPES.get(ch)
            elif info.flags & UNICODE:
                value = UNICODE_POSITION_ESCAPES.get(ch)
            else:
                value = POSITION_ESCAPES.get(ch)

            if value:
                return value

        # Then try the charset escapes (e.g. \d, \w), per encoding.
        if info.flags & ASCII:
            value = ASCII_CHARSET_ESCAPES.get(ch)
        elif info.flags & UNICODE:
            value = UNICODE_CHARSET_ESCAPES.get(ch)
        else:
            value = CHARSET_ESCAPES.get(ch)

        if value:
            return value

        # Finally, the single-character escapes (e.g. \n, \t).
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return Character(ord(value))

        raise error("bad escape \\%s" % ch, source.string, source.pos)
    elif ch in DIGITS:
        # A numeric escape sequence.
        return parse_numeric_escape(source, info, ch, in_set)
    else:
        # A literal.
        return make_character(info, ord(ch), in_set)
1274
def parse_numeric_escape(source, info, ch, in_set):
    """Parses a numeric escape sequence: either an octal escape or a group
    reference (group references aren't possible inside a set)."""
    if in_set or ch == "0":
        # Octal escape sequence, max 3 digits.
        return parse_octal_escape(source, info, [ch], in_set)

    # At least 1 digit, so either octal escape or group.
    digits = ch
    saved_pos = source.pos
    ch = source.get()
    if ch in DIGITS:
        # At least 2 digits, so either octal escape or group.
        digits += ch
        saved_pos = source.pos
        ch = source.get()
        if is_octal(digits) and ch in OCT_DIGITS:
            # 3 octal digits, so octal escape sequence.
            encoding = info.flags & _ALL_ENCODINGS
            # The value is masked to a byte except in Unicode mode.
            if encoding == ASCII or encoding == LOCALE:
                octal_mask = 0xFF
            else:
                octal_mask = 0x1FF

            value = int(digits + ch, 8) & octal_mask
            return make_character(info, value)

    # Group reference.
    source.pos = saved_pos
    if info.is_open_group(digits):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, digits, source.pos)
1307
def parse_octal_escape(source, info, digits, in_set):
    """Parses an octal escape sequence of at most 3 digits.

    digits: a list already holding the first character (which may turn out
    not to be an octal digit).
    """
    saved_pos = source.pos
    ch = source.get()
    while len(digits) < 3 and ch in OCT_DIGITS:
        digits.append(ch)
        saved_pos = source.pos
        ch = source.get()

    # Push back the first character that wasn't consumed.
    source.pos = saved_pos
    try:
        value = int("".join(digits), 8)
        return make_character(info, value, in_set)
    except ValueError:
        # The digits weren't all octal.
        if digits[0] in OCT_DIGITS:
            raise error("incomplete escape \\%s" % ''.join(digits),
              source.string, source.pos)
        else:
            raise error("bad escape \\%s" % digits[0], source.string,
              source.pos)
1328
def parse_hex_escape(source, info, esc, expected_len, in_set, type):
    """Parses a hex escape sequence with a fixed number of digits.

    esc and type are used only for the error messages. (NOTE(review): the
    parameter `type` shadows the builtin; renaming it would change the
    function's interface, so it's left as is.)
    """
    saved_pos = source.pos
    digits = []
    for i in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),
              source.string, saved_pos)
        digits.append(ch)

    try:
        value = int("".join(digits), 16)
    except ValueError:
        pass
    else:
        # Only codepoints below 0x110000 are valid.
        if value < 0x110000:
            return make_character(info, value, in_set)

    # Bad hex escape.
    raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),
      source.string, saved_pos)
1351
def parse_group_ref(source, info):
    """Parses a group reference of the form "<name>"."""
    source.expect("<")
    name_pos = source.pos
    group_name = parse_name(source, True)
    source.expect(">")

    # A group can't be referenced while it's still being parsed.
    if info.is_open_group(group_name):
        raise error("cannot refer to an open group", source.string, source.pos)

    return make_ref_group(info, group_name, name_pos)
1362
def parse_string_set(source, info):
    """Parses a string set reference of the form "<name>"."""
    source.expect("<")
    list_name = parse_name(source, True)
    source.expect(">")

    # The name must be one of the named lists passed in as keyword args.
    if list_name is None or list_name not in info.kwargs:
        raise error("undefined named list", source.string, source.pos)

    return make_string_set(info, list_name)
1372
def parse_named_char(source, info, in_set):
    """Parses a named character, e.g. \\N{LATIN SMALL LETTER A}.

    Falls back to a literal "N" if the name isn't correctly delimited.
    """
    start_pos = source.pos
    if source.match("{"):
        char_name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)
        if source.match("}"):
            try:
                codepoint = ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

            return make_character(info, codepoint, in_set)

    # Not a valid named character, so treat the "N" as a literal.
    source.pos = start_pos
    return make_character(info, ord("N"), in_set)
1388
def parse_property(source, info, positive, in_set):
    """Parses a Unicode property, either delimited (\\p{name}) or
    abbreviated (e.g. \\pL).

    positive: whether it was introduced by \\p (True) or \\P (False).
    in_set: whether the property appears inside a character set.

    Falls back to a literal "p"/"P" if no valid property follows.
    """
    saved_pos = source.pos

    # The encoding in effect comes from the current flags; it applies to
    # both the delimited and the abbreviated forms, so compute it once.
    if info.flags & ASCII:
        encoding = ASCII_ENCODING
    elif info.flags & UNICODE:
        encoding = UNICODE_ENCODING
    else:
        encoding = 0

    ch = source.get()
    if ch == "{":
        # "^" inside the braces negates the property.
        negate = source.match("^")
        prop_name, name = parse_property_name(source)
        if source.match("}"):
            # It's correctly delimited.
            prop = lookup_property(prop_name, name, positive != negate, source,
              encoding=encoding)
            return make_property(info, prop, in_set)
    elif ch and ch in "CLMNPSZ":
        # An abbreviated property, eg \pL.
        prop = lookup_property(None, ch, positive, source, encoding=encoding)
        return make_property(info, prop, in_set)

    # Not a property, so treat as a literal "p" or "P".
    source.pos = saved_pos
    ch = "p" if positive else "P"
    return make_character(info, ord(ch), in_set)
1424
def parse_property_name(source):
    """Parses a property name, which may be qualified ("name=value" or
    "name:value").

    Returns a pair (prop_name, name); prop_name is None for an unqualified
    name. The source position is left just after the last part consumed.
    """
    name = source.get_while(PROPERTY_NAME_PART)
    saved_pos = source.pos

    ch = source.get()
    if ch and ch in ":=":
        prop_name = name
        name = source.get_while(ALNUM | set(" &_-./")).strip()

        if name:
            # Name after the ":" or "=", so it's a qualified name.
            saved_pos = source.pos
        else:
            # No name after the ":" or "=", so assume it's an unqualified name.
            prop_name, name = None, prop_name
    else:
        prop_name = None

    # Back up to just after the part that was actually accepted.
    source.pos = saved_pos
    return prop_name, name
1446
def parse_set(source, info):
    """Parses a character set (the opening "[" has been consumed)."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # Whitespace is significant within a set, even in verbose mode.
    saved_ignore = source.ignore_space
    source.ignore_space = False
    # Negative set?
    negate = source.match("^")
    try:
        # VERSION0 has no set operators; VERSION1 supports ||, ~~, && and --.
        if version == VERSION0:
            item = parse_set_imp_union(source, info)
        else:
            item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)
    finally:
        source.ignore_space = saved_ignore

    if negate:
        item = item.with_flags(positive=not item.positive)

    item = item.with_flags(case_flags=make_case_flags(info))

    return item
1472
def parse_set_union(source, info):
    """Parses a set union ([x||y])."""
    operands = [parse_set_symm_diff(source, info)]
    while source.match("||"):
        operands.append(parse_set_symm_diff(source, info))

    # A union of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetUnion(info, operands)
1482
def parse_set_symm_diff(source, info):
    """Parses a set symmetric difference ([x~~y])."""
    operands = [parse_set_inter(source, info)]
    while source.match("~~"):
        operands.append(parse_set_inter(source, info))

    # A symmetric difference of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetSymDiff(info, operands)
1492
def parse_set_inter(source, info):
    """Parses a set intersection ([x&&y])."""
    operands = [parse_set_diff(source, info)]
    while source.match("&&"):
        operands.append(parse_set_diff(source, info))

    # An intersection of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetInter(info, operands)
1502
def parse_set_diff(source, info):
    """Parses a set difference ([x--y])."""
    operands = [parse_set_imp_union(source, info)]
    while source.match("--"):
        operands.append(parse_set_imp_union(source, info))

    # A difference of one operand is just that operand.
    return operands[0] if len(operands) == 1 else SetDiff(info, operands)
1512
def parse_set_imp_union(source, info):
    """Parses a set implicit union ([xy]).

    Stops (without consuming) at the closing "]" or, in VERSION1, at any set
    operator.
    """
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    items = [parse_set_member(source, info)]
    while True:
        saved_pos = source.pos
        if source.match("]"):
            # End of the set.
            source.pos = saved_pos
            break

        if version == VERSION1 and any(source.match(op) for op in SET_OPS):
            # The new behaviour has set operators.
            source.pos = saved_pos
            break

        items.append(parse_set_member(source, info))

    if len(items) == 1:
        return items[0]
    return SetUnion(info, items)
1535
def parse_set_member(source, info):
    """Parses a member in a character set: a single item or a range a-b."""
    # Parse a set item.
    start = parse_set_item(source, info)
    saved_pos1 = source.pos
    # Only a positive single character can start a range.
    if (not isinstance(start, Character) or not start.positive or not
      source.match("-")):
        # It's not the start of a range.
        return start

    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    # It looks like the start of a range of characters.
    saved_pos2 = source.pos
    if version == VERSION1 and source.match("-"):
        # It's actually the set difference operator '--', so return the
        # character.
        source.pos = saved_pos1
        return start

    if source.match("]"):
        # We've reached the end of the set, so return both the character and
        # hyphen.
        source.pos = saved_pos2
        return SetUnion(info, [start, Character(ord("-"))])

    # Parse a set item.
    end = parse_set_item(source, info)
    if not isinstance(end, Character) or not end.positive:
        # It's not a range, so return the character, hyphen and property.
        return SetUnion(info, [start, Character(ord("-")), end])

    # It _is_ a range.
    if start.value > end.value:
        raise error("bad character range", source.string, source.pos)

    if start.value == end.value:
        # A range of one character is just that character.
        return start

    return Range(start.value, end.value)
1576
def parse_set_item(source, info):
    """Parses an item in a character set: an escape, a POSIX class, a nested
    set (VERSION1 only), or a literal character."""
    version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

    if source.match("\\"):
        # An escape sequence in a set.
        return parse_escape(source, info, True)

    saved_pos = source.pos
    if source.match("[:"):
        # Looks like a POSIX character class.
        try:
            return parse_posix_class(source, info)
        except ParseError:
            # Not a POSIX character class.
            source.pos = saved_pos

    if version == VERSION1 and source.match("["):
        # It's the start of a nested set.

        # Negative set?
        negate = source.match("^")
        item = parse_set_union(source, info)

        if not source.match("]"):
            raise error("missing ]", source.string, source.pos)

        if negate:
            item = item.with_flags(positive=not item.positive)

        return item

    ch = source.get()
    if not ch:
        raise error("unterminated character set", source.string, source.pos)

    # A literal character.
    return Character(ord(ch))
1614
def parse_posix_class(source, info):
    """Parses a POSIX character class such as [:alpha:]."""
    negate = source.match("^")
    prop_name, name = parse_property_name(source)

    if source.match(":]"):
        return lookup_property(prop_name, name, not negate, source, posix=True)

    # Not terminated correctly, so it isn't a POSIX class after all.
    raise ParseError()
1623
def float_to_rational(flt):
    """Converts a float to a rational pair (numerator, denominator).

    Uses a continued-fraction expansion, stopping once the remaining
    fractional part is negligible (< 0.0001).
    """
    int_part = int(flt)
    # Named `frac_part` so it doesn't shadow the module-level `error`
    # exception class (the original local was called `error`).
    frac_part = flt - int_part
    if abs(frac_part) < 0.0001:
        return int_part, 1

    den, num = float_to_rational(1.0 / frac_part)

    return int_part * den + num, den
1634
def numeric_to_rational(numeric):
    """Converts a numeric string to a rational string, if possible.

    Raises ValueError if the string contains more than one "/".
    """
    sign = ""
    if numeric[ : 1] == "-":
        sign = "-"
        numeric = numeric[1 : ]

    parts = numeric.split("/")
    if len(parts) == 1:
        num, den = float_to_rational(float(parts[0]))
    elif len(parts) == 2:
        num, den = float_to_rational(float(parts[0]) / float(parts[1]))
    else:
        raise ValueError()

    result = "{}{}/{}".format(sign, num, den)

    # A denominator of 1 is left implicit.
    return result[ : -2] if result.endswith("/1") else result
1655
def standardise_name(name):
    """Standardises a property or value name for lookup."""
    text = "".join(name)
    try:
        # A numeric name is normalised to a rational string.
        return numeric_to_rational(text)
    except (ValueError, ZeroDivisionError):
        # Otherwise drop the separators and uppercase it.
        return "".join(ch for ch in text if ch not in "_- ").upper()
1662
# Class names that are given a "POSIX" prefix when looked up as a POSIX
# class (see lookup_property).
_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

# The complete set of value names that identifies a binary (yes/no) property.
_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())
1666
def lookup_property(property, value, positive, source=None, posix=False, encoding=0):
    """Looks up a property, returning a Property node.

    Args:
        property: the property name, or None if only a value was given.
        value: the property value (or the unqualified name).
        positive: whether the property is positive or negated.
        source: the pattern source, used only for error positions.
        posix: whether the name appeared in a POSIX class ([:name:]).
        encoding: the encoding value to attach to the Property node.

    Raises:
        error: if the property or its value is unknown.
    """
    # Normalise the names (which may still be lists).
    property = standardise_name(property) if property else None
    value = standardise_name(value)

    # "Assigned" is the negation of "Unassigned".
    if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):
        property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

    if posix and not property and value.upper() in _POSIX_CLASSES:
        value = 'POSIX' + value

    if property:
        # Both the property and the value are provided.
        prop = PROPERTIES.get(property)
        if not prop:
            if not source:
                raise error("unknown property")

            raise error("unknown property", source.string, source.pos)

        prop_id, value_dict = prop
        val_id = value_dict.get(value)
        if val_id is None:
            if not source:
                raise error("unknown property value")

            raise error("unknown property value", source.string, source.pos)

        return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Only the value is provided.
    # It might be the name of a GC, script or block value.
    for property in ("GC", "SCRIPT", "BLOCK"):
        prop_id, value_dict = PROPERTIES.get(property)
        val_id = value_dict.get(value)
        if val_id is not None:
            return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # It might be the name of a binary property.
    prop = PROPERTIES.get(value)
    if prop:
        prop_id, value_dict = prop
        if set(value_dict) == _BINARY_VALUES:
            return Property((prop_id << 16) | 1, positive, encoding=encoding)

        return Property(prop_id << 16, not positive, encoding=encoding)

    # It might be the name of a binary property starting with a prefix.
    if value.startswith("IS"):
        prop = PROPERTIES.get(value[2 : ])
        if prop:
            prop_id, value_dict = prop
            if "YES" in value_dict:
                return Property((prop_id << 16) | 1, positive, encoding=encoding)

    # It might be the name of a script or block starting with a prefix.
    for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):
        if value.startswith(prefix):
            prop_id, value_dict = PROPERTIES.get(property)
            # Strip the prefix by its length, not a hard-coded 2.
            val_id = value_dict.get(value[len(prefix) : ])
            if val_id is not None:
                return Property((prop_id << 16) | val_id, positive, encoding=encoding)

    # Unknown property.
    if not source:
        raise error("unknown property")

    raise error("unknown property", source.string, source.pos)
1736
def _compile_replacement(source, pattern, is_unicode):
    """Compiles one escape sequence in a replacement template.

    Returns a pair (is_group, codes): if is_group is True, codes holds a
    single group index; otherwise it holds literal codepoints.
    """
    ch = source.get()
    if ch in ALPHA:
        # An alphabetic escape sequence.
        value = CHARACTER_ESCAPES.get(ch)
        if value:
            return False, [ord(value)]

        if ch in HEX_ESCAPES and (ch == "x" or is_unicode):
            # A hexadecimal escape sequence.
            return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

        if ch == "g":
            # A group reference.
            return True, [compile_repl_group(source, pattern)]

        if ch == "N" and is_unicode:
            # A named character.
            value = parse_repl_named_char(source)
            if value is not None:
                return False, [value]

        raise error("bad escape \\%s" % ch, source.string, source.pos)

    # Octal values are masked to a byte for a bytes pattern.
    if isinstance(source.sep, bytes):
        octal_mask = 0xFF
    else:
        octal_mask = 0x1FF

    if ch == "0":
        # An octal escape sequence.
        digits = ch
        while len(digits) < 3:
            saved_pos = source.pos
            ch = source.get()
            if ch not in OCT_DIGITS:
                source.pos = saved_pos
                break
            digits += ch

        return False, [int(digits, 8) & octal_mask]

    if ch in DIGITS:
        # Either an octal escape sequence (3 digits) or a group reference (max
        # 2 digits).
        digits = ch
        saved_pos = source.pos
        ch = source.get()
        if ch in DIGITS:
            digits += ch
            saved_pos = source.pos
            ch = source.get()
            if ch and is_octal(digits + ch):
                # An octal escape sequence.
                return False, [int(digits + ch, 8) & octal_mask]

        # A group reference.
        source.pos = saved_pos
        return True, [int(digits)]

    if ch == "\\":
        # An escaped backslash is a backslash.
        return False, [ord("\\")]

    if not ch:
        # A trailing backslash.
        raise error("bad escape (end of pattern)", source.string, source.pos)

    # An escaped non-backslash is a backslash followed by the literal.
    return False, [ord("\\"), ord(ch)]
1808
def parse_repl_hex_escape(source, expected_len, type):
    """Parses a hex escape sequence in a replacement string, returning the
    codepoint."""
    digits = ""
    for _ in range(expected_len):
        ch = source.get()
        if ch not in HEX_DIGITS:
            raise error("incomplete escape \\%s%s" % (type, digits),
              source.string, source.pos)
        digits += ch

    return int(digits, 16)
1820
def parse_repl_named_char(source):
    """Parses a named character in a replacement string.

    Returns the codepoint, or None if it isn't a delimited named character.
    """
    start_pos = source.pos
    if source.match("{"):
        char_name = source.get_while(ALPHA | set(" "))

        if source.match("}"):
            try:
                return ord(unicodedata.lookup(char_name))
            except KeyError:
                raise error("undefined character name", source.string,
                  source.pos)

    source.pos = start_pos
    return None
1837
def compile_repl_group(source, pattern):
    """Compiles a replacement template group reference, returning the group
    index."""
    source.expect("<")
    name = parse_name(source, True, True)
    source.expect(">")

    if not name.isdigit():
        # A named group.
        try:
            return pattern.groupindex[name]
        except KeyError:
            raise IndexError("unknown group")

    # A numeric group.
    index = int(name)
    if 0 <= index <= pattern.groups:
        return index

    raise error("invalid group reference", source.string, source.pos)
1855
1856# The regular expression is parsed into a syntax tree. The different types of
1857# node are defined below.
1858
# Indentation unit used by the nodes' dump() methods.
INDENT = "  "
# Flag bits attached to opcodes in the compiled pattern.
POSITIVE_OP = 0x1
ZEROWIDTH_OP = 0x2
FUZZY_OP = 0x4
REVERSE_OP = 0x8
REQUIRED_OP = 0x10
# The encoding value occupies the bits above the flag bits.
ENCODING_OP_SHIFT = 5

# Strings used by the dump() methods.
POS_TEXT = {False: "NON-MATCH", True: "MATCH"}
CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",
  FULLIGNORECASE: " FULL_IGNORE_CASE"}
1870
def make_sequence(items):
    """Makes a sequence node, collapsing a single item to the item itself."""
    return items[0] if len(items) == 1 else Sequence(items)
1875
1876# Common base class for all nodes.
class RegexBase:
    """Common base class for all parse-tree nodes.

    Equality and hashing are based on `_key`, which subclasses extend with
    their distinguishing attributes.
    """

    def __init__(self):
        self._key = self.__class__

    def with_flags(self, positive=None, case_flags=None, zerowidth=None):
        """Returns this node with the given flags applied, rebuilding it
        only if a flag actually changes (None means "keep as is")."""
        new_positive = self.positive if positive is None else bool(positive)

        if case_flags is None:
            new_case_flags = self.case_flags
        else:
            new_case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]

        new_zerowidth = (self.zerowidth if zerowidth is None else
          bool(zerowidth))

        if (new_positive == self.positive and new_case_flags ==
          self.case_flags and new_zerowidth == self.zerowidth):
            return self

        return self.rebuild(new_positive, new_case_flags, new_zerowidth)

    def fix_groups(self, pattern, reverse, fuzzy):
        pass

    def optimise(self, info, reverse):
        return self

    def pack_characters(self, info):
        return self

    def remove_captures(self):
        return self

    def is_atomic(self):
        return True

    def can_be_affix(self):
        return True

    def contains_group(self):
        return False

    def get_firstset(self, reverse):
        raise _FirstSetError()

    def has_simple_start(self):
        return False

    def compile(self, reverse=False, fuzzy=False):
        return self._compile(reverse, fuzzy)

    def is_empty(self):
        return False

    def __hash__(self):
        return hash(self._key)

    def __eq__(self, other):
        return type(self) is type(other) and self._key == other._key

    def __ne__(self, other):
        return not self.__eq__(other)

    def get_required_string(self, reverse):
        return self.max_width(), None
1945
1946# Base class for zero-width nodes.
class ZeroWidthBase(RegexBase):
    """Base class for nodes that match a position rather than characters."""

    def __init__(self, positive=True, encoding=0):
        RegexBase.__init__(self)
        self.positive = bool(positive)
        self.encoding = encoding

        self._key = self.__class__, self.positive

    def get_firstset(self, reverse):
        # A zero-width node doesn't consume a character.
        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = self.encoding << ENCODING_OP_SHIFT
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        return [(self._opcode, flags)]

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

    def max_width(self):
        return 0
1975
class Any(RegexBase):
    """A node matching any single character (compiled to OP.ANY)."""
    _opcode = {False: OP.ANY, True: OP.ANY_REV}
    _op_name = "ANY"

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = FUZZY_OP if fuzzy else 0
        return [(self._opcode[reverse], flags)]

    def dump(self, indent, reverse):
        print("{}{}".format(INDENT * indent, self._op_name))

    def max_width(self):
        # Always matches exactly one character.
        return 1
1994
class AnyAll(Any):
    # A variant of Any compiled with the ANY_ALL opcodes.
    _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}
    _op_name = "ANY_ALL"
1998
class AnyU(Any):
    # A variant of Any compiled with the ANY_U opcodes.
    _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}
    _op_name = "ANY_U"
2002
class Atomic(RegexBase):
    """A node wrapping a subpattern so that it's matched atomically
    (compiled between OP.ATOMIC and OP.END)."""

    def __init__(self, subpattern):
        RegexBase.__init__(self)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        self.subpattern = self.subpattern.optimise(info, reverse)

        # An atomic wrapper around an empty subpattern is pointless.
        if self.subpattern.is_empty():
            return self.subpattern
        return self

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +
          [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern)

    # Defining __eq__ implicitly sets __hash__ to None in Python 3; keep
    # instances hashable by inheriting the base implementation explicitly.
    __hash__ = RegexBase.__hash__

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)
2058
class Boundary(ZeroWidthBase):
    # A zero-width boundary assertion, compiled to OP.BOUNDARY.
    _opcode = OP.BOUNDARY
    _op_name = "BOUNDARY"
2062
2063class Branch(RegexBase):
    def __init__(self, branches):
        # branches: the list of alternative subpattern nodes.
        RegexBase.__init__(self)
        self.branches = branches
2067
2068 def fix_groups(self, pattern, reverse, fuzzy):
2069 for b in self.branches:
2070 b.fix_groups(pattern, reverse, fuzzy)
2071
    def optimise(self, info, reverse):
        """Optimises the branches: flattens nested branches, factors out a
        common prefix/suffix, reduces single-character branches to sets, and
        possibly adds a firstset precheck."""
        if not self.branches:
            return Sequence([])

        # Flatten branches within branches.
        branches = Branch._flatten_branches(info, reverse, self.branches)

        # Move any common prefix or suffix out of the branches.
        if reverse:
            suffix, branches = Branch._split_common_suffix(info, branches)
            prefix = []
        else:
            prefix, branches = Branch._split_common_prefix(info, branches)
            suffix = []

        # Try to reduce adjacent single-character branches to sets.
        branches = Branch._reduce_to_set(info, reverse, branches)

        if len(branches) > 1:
            sequence = [Branch(branches)]

            # NOTE(review): one of prefix/suffix is always [] above, so this
            # condition is always true — confirm whether checking both was
            # intended.
            if not prefix or not suffix:
                # We might be able to add a quick precheck before the branches.
                firstset = self._add_precheck(info, reverse, branches)

                if firstset:
                    if reverse:
                        sequence.append(firstset)
                    else:
                        sequence.insert(0, firstset)
        else:
            sequence = branches

        return make_sequence(prefix + sequence + suffix)
2106
    def _add_precheck(self, info, reverse, branches):
        """Builds a single-character firstset precheck if every branch is a
        caseless Literal; otherwise returns None."""
        charset = set()
        # The significant character is the last one when matching in reverse.
        pos = -1 if reverse else 0

        for branch in branches:
            if type(branch) is Literal and branch.case_flags == NOCASE:
                charset.add(branch.characters[pos])
            else:
                return

        if not charset:
            return None

        return _check_firstset(info, reverse, [Character(c) for c in charset])
2121
2122 def pack_characters(self, info):
2123 self.branches = [b.pack_characters(info) for b in self.branches]
2124 return self
2125
2126 def remove_captures(self):
2127 self.branches = [b.remove_captures() for b in self.branches]
2128 return self
2129
2130 def is_atomic(self):
2131 return all(b.is_atomic() for b in self.branches)
2132
2133 def can_be_affix(self):
2134 return all(b.can_be_affix() for b in self.branches)
2135
2136 def contains_group(self):
2137 return any(b.contains_group() for b in self.branches)
2138
2139 def get_firstset(self, reverse):
2140 fs = set()
2141 for b in self.branches:
2142 fs |= b.get_firstset(reverse)
2143
2144 return fs or set([None])
2145
    def _compile(self, reverse, fuzzy):
        """Compiles to a BRANCH opcode with the alternatives separated by
        NEXT and terminated by END."""
        if not self.branches:
            return []

        code = [(OP.BRANCH, )]
        for b in self.branches:
            code.extend(b.compile(reverse, fuzzy))
            code.append((OP.NEXT, ))

        # The final NEXT is replaced by the terminating END.
        code[-1] = (OP.END, )

        return code
2158
2159 def dump(self, indent, reverse):
2160 print("{}BRANCH".format(INDENT * indent))
2161 self.branches[0].dump(indent + 1, reverse)
2162 for b in self.branches[1 : ]:
2163 print("{}OR".format(INDENT * indent))
2164 b.dump(indent + 1, reverse)
2165
2166 @staticmethod
2167 def _flatten_branches(info, reverse, branches):
2168 # Flatten the branches so that there aren't branches of branches.
2169 new_branches = []
2170 for b in branches:
2171 b = b.optimise(info, reverse)
2172 if isinstance(b, Branch):
2173 new_branches.extend(b.branches)
2174 else:
2175 new_branches.append(b)
2176
2177 return new_branches
2178
    @staticmethod
    def _split_common_prefix(info, branches):
        """Returns (prefix, new_branches): the common leading items moved
        out of the branches, and the branches with those items removed."""
        # Common leading items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the prefix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common prefix?
        prefix = alternatives[0]
        pos = 0
        end_pos = max_count
        while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==
          prefix[pos] for a in alternatives):
            pos += 1
        count = pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            count = pos
            while count > 0 and not all(Branch._can_split(a, count) for a in
              alternatives):
                count -= 1

        # No common prefix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[count : ]))

        return prefix[ : count], new_branches
2220
    @staticmethod
    def _split_common_suffix(info, branches):
        """Split off the longest common trailing items shared by all branches.

        Mirror image of _split_common_prefix. Returns (suffix_items,
        new_branches); the suffix is empty if nothing could be moved out.
        """
        # Common trailing items can be moved out of the branches.
        # Get the items in the branches.
        alternatives = []
        for b in branches:
            if isinstance(b, Sequence):
                alternatives.append(b.items)
            else:
                alternatives.append([b])

        # What is the maximum possible length of the suffix?
        max_count = min(len(a) for a in alternatives)

        # What is the longest common suffix? (Scanned with negative indices,
        # from the end towards the start.)
        suffix = alternatives[0]
        pos = -1
        end_pos = -1 - max_count
        while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==
          suffix[pos] for a in alternatives):
            pos -= 1
        count = -1 - pos

        if info.flags & UNICODE:
            # We need to check that we're not splitting a sequence of
            # characters which could form part of full case-folding.
            while count > 0 and not all(Branch._can_split_rev(a, count) for a
              in alternatives):
                count -= 1

        # No common suffix is possible.
        if count == 0:
            return [], branches

        # Rebuild the branches.
        new_branches = []
        for a in alternatives:
            new_branches.append(make_sequence(a[ : -count]))

        return suffix[-count : ], new_branches
2261
    @staticmethod
    def _can_split(items, count):
        """Can 'items' be split after 'count' items without breaking full
        case-folding?

        A split is unsafe when the full-case-folded characters straddling the
        split point together fold to a single expanding character.
        """
        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, count - 1):
            return True

        if not Branch._is_full_case(items, count):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[count - 1 : count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, count + 2) and
          Branch._is_folded(items[count - 1 : count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, count - 2) and
          Branch._is_folded(items[count - 2 : count + 1])):
            return False

        return True
2286
    @staticmethod
    def _can_split_rev(items, count):
        """Reversed counterpart of _can_split: can 'items' be split 'count'
        items from the end without breaking full case-folding?
        """
        end = len(items)

        # Check the characters either side of the proposed split.
        if not Branch._is_full_case(items, end - count):
            return True

        if not Branch._is_full_case(items, end - count - 1):
            return True

        # Check whether a 1-1 split would be OK.
        if Branch._is_folded(items[end - count - 1 : end - count + 1]):
            return False

        # Check whether a 1-2 split would be OK.
        if (Branch._is_full_case(items, end - count + 2) and
          Branch._is_folded(items[end - count - 1 : end - count + 2])):
            return False

        # Check whether a 2-1 split would be OK.
        if (Branch._is_full_case(items, end - count - 2) and
          Branch._is_folded(items[end - count - 2 : end - count + 1])):
            return False

        return True
2313
2314 @staticmethod
2315 def _merge_common_prefixes(info, reverse, branches):
2316 # Branches with the same case-sensitive character prefix can be grouped
2317 # together if they are separated only by other branches with a
2318 # character prefix.
2319 prefixed = defaultdict(list)
2320 order = {}
2321 new_branches = []
2322 for b in branches:
2323 if Branch._is_simple_character(b):
2324 # Branch starts with a simple character.
2325 prefixed[b.value].append([b])
2326 order.setdefault(b.value, len(order))
2327 elif (isinstance(b, Sequence) and b.items and
2328 Branch._is_simple_character(b.items[0])):
2329 # Branch starts with a simple character.
2330 prefixed[b.items[0].value].append(b.items)
2331 order.setdefault(b.items[0].value, len(order))
2332 else:
2333 Branch._flush_char_prefix(info, reverse, prefixed, order,
2334 new_branches)
2335
2336 new_branches.append(b)
2337
2338 Branch._flush_char_prefix(info, prefixed, order, new_branches)
2339
2340 return new_branches
2341
2342 @staticmethod
2343 def _is_simple_character(c):
2344 return isinstance(c, Character) and c.positive and not c.case_flags
2345
2346 @staticmethod
2347 def _reduce_to_set(info, reverse, branches):
2348 # Can the branches be reduced to a set?
2349 new_branches = []
2350 items = set()
2351 case_flags = NOCASE
2352 for b in branches:
2353 if isinstance(b, (Character, Property, SetBase)):
2354 # Branch starts with a single character.
2355 if b.case_flags != case_flags:
2356 # Different case sensitivity, so flush.
2357 Branch._flush_set_members(info, reverse, items, case_flags,
2358 new_branches)
2359
2360 case_flags = b.case_flags
2361
2362 items.add(b.with_flags(case_flags=NOCASE))
2363 else:
2364 Branch._flush_set_members(info, reverse, items, case_flags,
2365 new_branches)
2366
2367 new_branches.append(b)
2368
2369 Branch._flush_set_members(info, reverse, items, case_flags,
2370 new_branches)
2371
2372 return new_branches
2373
    @staticmethod
    def _flush_char_prefix(info, reverse, prefixed, order, new_branches):
        """Emit the branches collected by _merge_common_prefixes.

        Branches sharing a leading character are rebuilt as that character
        followed by a branch of their tails; singletons pass through
        unchanged. Clears 'prefixed' and 'order' afterwards.
        """
        # Flush the prefixed branches.
        if not prefixed:
            return

        # Keep the order in which each prefix character was first seen.
        for value, branches in sorted(prefixed.items(), key=lambda pair:
          order[pair[0]]):
            if len(branches) == 1:
                new_branches.append(make_sequence(branches[0]))
            else:
                subbranches = []
                optional = False
                for b in branches:
                    if len(b) > 1:
                        subbranches.append(make_sequence(b[1 : ]))
                    elif not optional:
                        # Only one empty alternative is ever needed.
                        subbranches.append(Sequence())
                        optional = True

                sequence = Sequence([Character(value), Branch(subbranches)])
                new_branches.append(sequence.optimise(info, reverse))

        prefixed.clear()
        order.clear()
2399
2400 @staticmethod
2401 def _flush_set_members(info, reverse, items, case_flags, new_branches):
2402 # Flush the set members.
2403 if not items:
2404 return
2405
2406 if len(items) == 1:
2407 item = list(items)[0]
2408 else:
2409 item = SetUnion(info, list(items)).optimise(info, reverse)
2410
2411 new_branches.append(item.with_flags(case_flags=case_flags))
2412
2413 items.clear()
2414
2415 @staticmethod
2416 def _is_full_case(items, i):
2417 if not 0 <= i < len(items):
2418 return False
2419
2420 item = items[i]
2421 return (isinstance(item, Character) and item.positive and
2422 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)
2423
2424 @staticmethod
2425 def _is_folded(items):
2426 if len(items) < 2:
2427 return False
2428
2429 for i in items:
2430 if (not isinstance(i, Character) or not i.positive or not
2431 i.case_flags):
2432 return False
2433
2434 folded = "".join(chr(i.value) for i in items)
2435 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)
2436
2437 # Get the characters which expand to multiple codepoints on folding.
2438 expanding_chars = _regex.get_expand_on_folding()
2439
2440 for c in expanding_chars:
2441 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):
2442 return True
2443
2444 return False
2445
2446 def is_empty(self):
2447 return all(b.is_empty() for b in self.branches)
2448
2449 def __eq__(self, other):
2450 return type(self) is type(other) and self.branches == other.branches
2451
2452 def max_width(self):
2453 return max(b.max_width() for b in self.branches)
2454
class CallGroup(RegexBase):
    """A call (subpattern recursion) to a capture group.

    'group' starts out as a name or number from the pattern and is resolved
    to a group number by fix_groups.
    """

    def __init__(self, info, group, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.position = position  # position in the pattern, for errors

        self._key = self.__class__, self.group

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference; raises error if it is invalid,
        out of range, or ambiguous.
        """
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("invalid group reference", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("unknown group", pattern, self.position)

        if self.group > 0 and self.info.open_group_count[self.group] > 1:
            raise error("ambiguous group reference", pattern, self.position)

        self.info.group_calls.append((self, reverse, fuzzy))

        self._key = self.__class__, self.group

    def remove_captures(self):
        # BUG FIX: 'pattern' was an undefined name here, so this raised
        # NameError instead of the intended error. The pattern text isn't
        # available at this point, so pass None.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        return [(OP.GROUP_CALL, self.call_ref)]

    def dump(self, indent, reverse):
        print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

    def __eq__(self, other):
        return type(self) is type(other) and self.group == other.group

    def max_width(self):
        return UNLIMITED

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2500
class CallRef(RegexBase):
    """Wraps a parsed subpattern which is the target of a group call."""

    def __init__(self, ref, parsed):
        self.ref = ref
        self.parsed = parsed

    def _compile(self, reverse, fuzzy):
        code = [(OP.CALL_REF, self.ref)]
        code.extend(self.parsed._compile(reverse, fuzzy))
        code.append((OP.END, ))
        return code
2509
class Character(RegexBase):
    """A single literal character, possibly negated, case-insensitive and/or
    zero-width.
    """

    # Opcode selection keyed by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):
      OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,
      False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,
      True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,
      (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False):
        """'value' is the codepoint; positive=False negates the match."""
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        # Pre-compute the full case-folding; it's longer than 1 character
        # for characters which expand on full case-folding.
        if (self.positive and (self.case_flags & FULLIGNORECASE) ==
          FULLIGNORECASE):
            self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))
        else:
            self.folded = chr(self.value)

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy of this node with different flags."""
        return Character(self.value, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,
          self.value])

        if len(self.folded) > 1:
            # The character expands on full case-folding, so also try to
            # match the expanded character sequence.
            code = Branch([code, String([ord(c) for c in self.folded],
              case_flags=self.case_flags)])

        return code.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        display = ascii(chr(self.value)).lstrip("bu")
        print("{}CHARACTER {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        """Does codepoint 'ch' match this node?"""
        return (ch == self.value) == self.positive

    def max_width(self):
        # The folded form may be more than one character wide.
        return len(self.folded)

    def get_required_string(self, reverse):
        if not self.positive:
            return 1, None

        self.folded_characters = tuple(ord(c) for c in self.folded)

        return 0, self
2583
class Conditional(RegexBase):
    """A conditional subpattern: matches yes_item if the referenced group has
    matched, otherwise no_item.
    """

    def __init__(self, info, group, yes_item, no_item, position):
        RegexBase.__init__(self)
        self.info = info
        self.group = group  # name or number; resolved by fix_groups
        self.yes_item = yes_item
        self.no_item = no_item
        self.position = position  # position in the pattern, for errors

    def fix_groups(self, pattern, reverse, fuzzy):
        """Resolve the group reference and fix up both arms."""
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                if self.group == 'DEFINE':
                    # 'DEFINE' is a special name unless there's a group with
                    # that name.
                    self.group = 0
                else:
                    raise error("unknown group", pattern, self.position)

        if not 0 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        yes_item = self.yes_item.optimise(info, reverse)
        no_item = self.no_item.optimise(info, reverse)

        return Conditional(info, self.group, yes_item, no_item, self.position)

    def pack_characters(self, info):
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return self.yes_item.is_atomic() and self.no_item.is_atomic()

    def can_be_affix(self):
        return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

    def contains_group(self):
        return self.yes_item.contains_group() or self.no_item.contains_group()

    def get_firstset(self, reverse):
        # Either arm could match first.
        return (self.yes_item.get_firstset(reverse) |
          self.no_item.get_firstset(reverse))

    def _compile(self, reverse, fuzzy):
        # GROUP_EXISTS yes [NEXT no] END; the 'no' arm is omitted if empty.
        code = [(OP.GROUP_EXISTS, self.group)]
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        return self.yes_item.is_empty() and self.no_item.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.group, self.yes_item,
          self.no_item) == (other.group, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
2672
class DefaultBoundary(ZeroWidthBase):
    """Zero-width assertion compiling to OP.DEFAULT_BOUNDARY."""
    _opcode = OP.DEFAULT_BOUNDARY
    _op_name = "DEFAULT_BOUNDARY"
2676
class DefaultEndOfWord(ZeroWidthBase):
    """Zero-width assertion compiling to OP.DEFAULT_END_OF_WORD."""
    _opcode = OP.DEFAULT_END_OF_WORD
    _op_name = "DEFAULT_END_OF_WORD"
2680
class DefaultStartOfWord(ZeroWidthBase):
    """Zero-width assertion compiling to OP.DEFAULT_START_OF_WORD."""
    _opcode = OP.DEFAULT_START_OF_WORD
    _op_name = "DEFAULT_START_OF_WORD"
2684
class EndOfLine(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_LINE."""
    _opcode = OP.END_OF_LINE
    _op_name = "END_OF_LINE"
2688
class EndOfLineU(EndOfLine):
    """Variant of EndOfLine compiling to OP.END_OF_LINE_U."""
    _opcode = OP.END_OF_LINE_U
    _op_name = "END_OF_LINE_U"
2692
class EndOfString(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_STRING."""
    _opcode = OP.END_OF_STRING
    _op_name = "END_OF_STRING"
2696
class EndOfStringLine(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_STRING_LINE."""
    _opcode = OP.END_OF_STRING_LINE
    _op_name = "END_OF_STRING_LINE"
2700
class EndOfStringLineU(EndOfStringLine):
    """Variant of EndOfStringLine compiling to OP.END_OF_STRING_LINE_U."""
    _opcode = OP.END_OF_STRING_LINE_U
    _op_name = "END_OF_STRING_LINE_U"
2704
class EndOfWord(ZeroWidthBase):
    """Zero-width assertion compiling to OP.END_OF_WORD."""
    _opcode = OP.END_OF_WORD
    _op_name = "END_OF_WORD"
2708
class Failure(ZeroWidthBase):
    """Zero-width node which always fails to match (OP.FAILURE)."""
    _op_name = "FAILURE"

    def _compile(self, reverse, fuzzy):
        return [(OP.FAILURE, )]
2714
class Fuzzy(RegexBase):
    """Applies fuzzy (approximate) matching to a subpattern.

    'constraints' maps the error types "d" (deletion), "i" (insertion),
    "s" (substitution) and "e" (any error) to (min, max) pairs, where None
    means unlimited, plus "cost" for the cost equation and optionally "test"
    for a test pattern.
    """

    def __init__(self, subpattern, constraints=None):
        RegexBase.__init__(self)
        if constraints is None:
            constraints = {}
        self.subpattern = subpattern
        self.constraints = constraints

        # If an error type is mentioned in the cost equation, then its maximum
        # defaults to unlimited.
        if "cost" in constraints:
            for e in "dis":
                if e in constraints["cost"]:
                    constraints.setdefault(e, (0, None))

        # If any error type is mentioned, then all the error maxima default to
        # 0, otherwise they default to unlimited.
        if set(constraints) & set("dis"):
            for e in "dis":
                constraints.setdefault(e, (0, 0))
        else:
            for e in "dis":
                constraints.setdefault(e, (0, None))

        # The maximum of the generic error type defaults to unlimited.
        constraints.setdefault("e", (0, None))

        # The cost equation defaults to equal costs. Also, the cost of any
        # error type not mentioned in the cost equation defaults to 0.
        if "cost" in constraints:
            for e in "dis":
                constraints["cost"].setdefault(e, 0)
        else:
            constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":
              constraints["e"][1]}

    def fix_groups(self, pattern, reverse, fuzzy):
        # Everything inside a fuzzy region is itself fuzzy.
        self.subpattern.fix_groups(pattern, reverse, True)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def contains_group(self):
        return self.subpattern.contains_group()

    def _compile(self, reverse, fuzzy):
        # The individual limits: (min, max) for each of "d", "i", "s", "e".
        arguments = []
        for e in "dise":
            v = self.constraints[e]
            arguments.append(v[0])
            arguments.append(UNLIMITED if v[1] is None else v[1])

        # The coeffs of the cost equation.
        for e in "dis":
            arguments.append(self.constraints["cost"][e])

        # The maximum of the cost equation.
        v = self.constraints["cost"]["max"]
        arguments.append(UNLIMITED if v is None else v)

        flags = 0
        if reverse:
            flags |= REVERSE_OP

        test = self.constraints.get("test")

        if test:
            return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +
              test.compile(reverse, True) + [(OP.NEXT,)] +
              self.subpattern.compile(reverse, True) + [(OP.END,)])

        return ([(OP.FUZZY, flags) + tuple(arguments)] +
          self.subpattern.compile(reverse, True) + [(OP.END,)])

    def dump(self, indent, reverse):
        constraints = self._constraints_to_string()
        if constraints:
            constraints = " " + constraints
        print("{}FUZZY{}".format(INDENT * indent, constraints))
        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return (type(self) is type(other) and self.subpattern ==
          other.subpattern and self.constraints == other.constraints)

    def max_width(self):
        # Insertions make the width unbounded.
        return UNLIMITED

    def _constraints_to_string(self):
        """Render the constraints in the pattern's "{...}" syntax for dump."""
        constraints = []

        for name in "ids":
            min, max = self.constraints[name]
            if max == 0:
                continue

            con = ""

            if min > 0:
                con = "{}<=".format(min)

            con += name

            if max is not None:
                con += "<={}".format(max)

            constraints.append(con)

        cost = []
        for name in "ids":
            coeff = self.constraints["cost"][name]
            if coeff > 0:
                cost.append("{}{}".format(coeff, name))

        limit = self.constraints["cost"]["max"]
        if limit is not None and limit > 0:
            cost = "{}<={}".format("+".join(cost), limit)
            constraints.append(cost)

        return ",".join(constraints)
2847
class Grapheme(RegexBase):
    """Matches a whole grapheme: characters up to a grapheme boundary."""

    def _compile(self, reverse, fuzzy):
        # Match at least 1 character until a grapheme boundary is reached.
        # Note that this is the same whether matching forwards or backwards.
        character_run = LazyRepeat(AnyAll(), 1, None)
        matcher = Atomic(Sequence([character_run, GraphemeBoundary()]))

        return matcher.compile(reverse, fuzzy)

    def dump(self, indent, reverse):
        print("{}GRAPHEME".format(INDENT * indent))

    def max_width(self):
        return UNLIMITED
2862
class GraphemeBoundary:
    """Zero-width grapheme-cluster boundary; used by Grapheme."""

    def compile(self, reverse, fuzzy):
        # NOTE(review): the argument 1 presumably means 'positive' — confirm
        # against the engine's opcode handling.
        return [(OP.GRAPHEME_BOUNDARY, 1)]
2866
class GreedyRepeat(RegexBase):
    """A greedy repeat of a subpattern: min_count..max_count times, where
    max_count of None means unlimited.
    """

    _opcode = OP.GREEDY_REPEAT
    _op_name = "GREEDY_REPEAT"

    def __init__(self, subpattern, min_count, max_count):
        RegexBase.__init__(self)
        self.subpattern = subpattern
        self.min_count = min_count
        self.max_count = max_count  # None means unlimited

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        # type(self) so subclasses (LazyRepeat, PossessiveRepeat) are
        # preserved.
        return type(self)(subpattern, self.min_count, self.max_count)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        return self

    def is_atomic(self):
        return self.min_count == self.max_count and self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        fs = self.subpattern.get_firstset(reverse)
        if self.min_count == 0:
            # The repeat can match nothing at all.
            fs.add(None)

        return fs

    def _compile(self, reverse, fuzzy):
        repeat = [self._opcode, self.min_count]
        if self.max_count is None:
            repeat.append(UNLIMITED)
        else:
            repeat.append(self.max_count)

        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        return ([tuple(repeat)] + subpattern + [(OP.END, )])

    def dump(self, indent, reverse):
        if self.max_count is None:
            limit = "INF"
        else:
            limit = self.max_count
        print("{}{} {} {}".format(INDENT * indent, self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 1, reverse)

    def is_empty(self):
        return self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.min_count,
          self.max_count) == (other.subpattern, other.min_count,
          other.max_count)

    def max_width(self):
        if self.max_count is None:
            return UNLIMITED

        return self.subpattern.max_width() * self.max_count

    def get_required_string(self, reverse):
        max_count = UNLIMITED if self.max_count is None else self.max_count
        if self.min_count == 0:
            # Optional, so nothing is required; the offset is capped at
            # UNLIMITED.
            w = self.subpattern.max_width() * max_count
            return min(w, UNLIMITED), None

        ofs, req = self.subpattern.get_required_string(reverse)
        if req:
            return ofs, req

        w = self.subpattern.max_width() * max_count
        return min(w, UNLIMITED), None
2958
class PossessiveRepeat(GreedyRepeat):
    """A greedy repeat which never gives back what it has matched."""

    def is_atomic(self):
        return True

    def _compile(self, reverse, fuzzy):
        subpattern = self.subpattern.compile(reverse, fuzzy)
        if not subpattern:
            return []

        max_count = UNLIMITED if self.max_count is None else self.max_count
        repeat = (self._opcode, self.min_count, max_count)

        # The repeat is wrapped in an ATOMIC ... END pair.
        return ([(OP.ATOMIC, ), repeat] + subpattern + [(OP.END, ),
          (OP.END, )])

    def dump(self, indent, reverse):
        print("{}ATOMIC".format(INDENT * indent))

        limit = "INF" if self.max_count is None else self.max_count
        print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,
          self.min_count, limit))

        self.subpattern.dump(indent + 2, reverse)
2988
class Group(RegexBase):
    """A capture group wrapping a subpattern.

    Negative group numbers denote private (hidden) groups which are mapped
    to their public numbers via info.private_groups.
    """

    def __init__(self, info, group, subpattern):
        RegexBase.__init__(self)
        self.info = info
        self.group = group
        self.subpattern = subpattern

        self.call_ref = None

    def fix_groups(self, pattern, reverse, fuzzy):
        self.info.defined_groups[self.group] = (self, reverse, fuzzy)
        self.subpattern.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, reverse)

        return Group(self.info, self.group, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        # Dropping the capture leaves just the subpattern.
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return False

    def contains_group(self):
        return True

    def get_firstset(self, reverse):
        return self.subpattern.get_firstset(reverse)

    def has_simple_start(self):
        return self.subpattern.has_simple_start()

    def _compile(self, reverse, fuzzy):
        code = []

        # Map a private (negative) group number to its public number and a
        # positive private slot.
        public_group = private_group = self.group
        if private_group < 0:
            public_group = self.info.private_groups[private_group]
            private_group = self.info.group_count - private_group

        key = self.group, reverse, fuzzy
        ref = self.info.call_refs.get(key)
        if ref is not None:
            code += [(OP.CALL_REF, ref)]

        code += [(OP.GROUP, int(not reverse), private_group, public_group)]
        code += self.subpattern.compile(reverse, fuzzy)
        code += [(OP.END, )]

        if ref is not None:
            code += [(OP.END, )]

        return code

    def dump(self, indent, reverse):
        group = self.group
        if group < 0:
            # BUG FIX: 'private_groups' was an undefined bare name here,
            # raising NameError; mirror _compile and look it up on self.info.
            group = self.info.private_groups[group]
        print("{}GROUP {}".format(INDENT * indent, group))
        self.subpattern.dump(indent + 1, reverse)

    def __eq__(self, other):
        return (type(self) is type(other) and (self.group, self.subpattern) ==
          (other.group, other.subpattern))

    def max_width(self):
        return self.subpattern.max_width()

    def get_required_string(self, reverse):
        return self.subpattern.get_required_string(reverse)

    def __del__(self):
        # Break the reference cycle with the pattern info.
        self.info = None
3070
class Keep(ZeroWidthBase):
    """Zero-width node compiling to OP.KEEP."""
    _opcode = OP.KEEP
    _op_name = "KEEP"
3074
class LazyRepeat(GreedyRepeat):
    """A repeat which matches as few times as possible (OP.LAZY_REPEAT)."""
    _opcode = OP.LAZY_REPEAT
    _op_name = "LAZY_REPEAT"
3078
class LookAround(RegexBase):
    """A lookaround assertion: lookahead (behind=False) or lookbehind
    (behind=True), positive or negative.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern

    def fix_groups(self, pattern, reverse, fuzzy):
        # The subpattern is matched in the direction of the lookaround, not
        # of the enclosing pattern.
        self.subpattern.fix_groups(pattern, self.behind, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        if self.positive and subpattern.is_empty():
            # A positive lookaround for nothing always succeeds; drop it.
            return subpattern

        return LookAround(self.behind, self.positive, subpattern)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        return self

    def remove_captures(self):
        return self.subpattern.remove_captures()

    def is_atomic(self):
        return self.subpattern.is_atomic()

    def can_be_affix(self):
        return self.subpattern.can_be_affix()

    def contains_group(self):
        return self.subpattern.contains_group()

    def get_firstset(self, reverse):
        if self.positive and self.behind == reverse:
            # The assertion looks in the same direction as the match, so its
            # first-set constrains the match's first character.
            return self.subpattern.get_firstset(reverse)

        return set([None])

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if fuzzy:
            flags |= FUZZY_OP
        if reverse:
            flags |= REVERSE_OP

        # NOTE(review): 'fuzzy' isn't forwarded to the subpattern here —
        # presumably it relies on compile()'s default; confirm.
        return ([(OP.LOOKAROUND, flags, int(not self.behind))] +
          self.subpattern.compile(self.behind) + [(OP.END, )])

    def dump(self, indent, reverse):
        print("{}LOOK{} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)

    def is_empty(self):
        return self.positive and self.subpattern.is_empty()

    def __eq__(self, other):
        return type(self) is type(other) and (self.behind, self.positive,
          self.subpattern) == (other.behind, other.positive, other.subpattern)

    def max_width(self):
        # Zero-width: consumes nothing in the enclosing pattern.
        return 0
3146
class LookAroundConditional(RegexBase):
    """A conditional whose test is a lookaround: matches yes_item if the
    lookaround succeeds, otherwise no_item.
    """

    _dir_text = {False: "AHEAD", True: "BEHIND"}

    def __init__(self, behind, positive, subpattern, yes_item, no_item):
        RegexBase.__init__(self)
        self.behind = bool(behind)
        self.positive = bool(positive)
        self.subpattern = subpattern
        self.yes_item = yes_item
        self.no_item = no_item

    def fix_groups(self, pattern, reverse, fuzzy):
        self.subpattern.fix_groups(pattern, reverse, fuzzy)
        self.yes_item.fix_groups(pattern, reverse, fuzzy)
        self.no_item.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        subpattern = self.subpattern.optimise(info, self.behind)
        yes_item = self.yes_item.optimise(info, self.behind)
        no_item = self.no_item.optimise(info, self.behind)

        return LookAroundConditional(self.behind, self.positive, subpattern,
          yes_item, no_item)

    def pack_characters(self, info):
        self.subpattern = self.subpattern.pack_characters(info)
        self.yes_item = self.yes_item.pack_characters(info)
        self.no_item = self.no_item.pack_characters(info)
        return self

    def remove_captures(self):
        self.subpattern = self.subpattern.remove_captures()
        self.yes_item = self.yes_item.remove_captures()
        self.no_item = self.no_item.remove_captures()

    def is_atomic(self):
        return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and
          self.no_item.is_atomic())

    def can_be_affix(self):
        return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()
          and self.no_item.can_be_affix())

    def contains_group(self):
        return (self.subpattern.contains_group() or
          self.yes_item.contains_group() or self.no_item.contains_group())

    def _compile(self, reverse, fuzzy):
        # CONDITIONAL test NEXT yes [NEXT no] END.
        code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]
        code.extend(self.subpattern.compile(self.behind, fuzzy))
        code.append((OP.NEXT, ))
        code.extend(self.yes_item.compile(reverse, fuzzy))
        add_code = self.no_item.compile(reverse, fuzzy)
        if add_code:
            code.append((OP.NEXT, ))
            code.extend(add_code)

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}CONDITIONAL {} {}".format(INDENT * indent,
          self._dir_text[self.behind], POS_TEXT[self.positive]))
        self.subpattern.dump(indent + 1, self.behind)
        print("{}EITHER".format(INDENT * indent))
        self.yes_item.dump(indent + 1, reverse)
        if not self.no_item.is_empty():
            print("{}OR".format(INDENT * indent))
            self.no_item.dump(indent + 1, reverse)

    def is_empty(self):
        # NOTE(review): 'and' binds tighter than 'or', so this is
        # (subpattern and yes) or no — confirm the grouping is intended.
        return (self.subpattern.is_empty() and self.yes_item.is_empty() or
          self.no_item.is_empty())

    def __eq__(self, other):
        return type(self) is type(other) and (self.subpattern, self.yes_item,
          self.no_item) == (other.subpattern, other.yes_item, other.no_item)

    def max_width(self):
        return max(self.yes_item.max_width(), self.no_item.max_width())

    def get_required_string(self, reverse):
        return self.max_width(), None
3231
class PrecompiledCode(RegexBase):
    """Wraps an already-built instruction so it can be spliced into the
    compiled output.
    """

    def __init__(self, code):
        # A flat list forming a single instruction tuple.
        self.code = code

    def _compile(self, reverse, fuzzy):
        return [tuple(self.code)]
3238
class Property(RegexBase):
    """A Unicode property test, e.g. \\p{Lu}, possibly negated and/or
    case-insensitive.
    """

    # Opcode selection keyed by (case_flags, reverse).
    _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):
      OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):
      OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):
      OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,
      True): OP.PROPERTY_IGN_REV}

    def __init__(self, value, positive=True, case_flags=NOCASE,
      zerowidth=False, encoding=0):
        """'value' packs the property and its value (see dump); 'encoding'
        selects e.g. ASCII semantics.
        """
        RegexBase.__init__(self)
        self.value = value
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)
        self.encoding = encoding

        self._key = (self.__class__, self.value, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        """Return a copy of this node with different flags."""
        return Property(self.value, positive, case_flags, zerowidth,
          self.encoding)

    def optimise(self, info, reverse, in_set=False):
        return self

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        # The encoding occupies its own field within the flags.
        flags |= self.encoding << ENCODING_OP_SHIFT
        return [(self._opcode[self.case_flags, reverse], flags, self.value)]

    def dump(self, indent, reverse):
        # The property id is in the high 16 bits, its value in the low 16.
        prop = PROPERTY_NAMES[self.value >> 16]
        name, value = prop[0], prop[1][self.value & 0xFFFF]
        print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,
          POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],
          ["", " ASCII"][self.encoding]))

    def matches(self, ch):
        """Does codepoint 'ch' match this property test?"""
        return _regex.has_property_value(self.value, ch) == self.positive

    def max_width(self):
        return 1
3294
class Prune(ZeroWidthBase):
    # The (*PRUNE) backtracking-control node.
    _op_name = "PRUNE"

    def _compile(self, reverse, fuzzy):
        # PRUNE takes no operands.
        return [(OP.PRUNE, )]
3300
class Range(RegexBase):
    "A node which matches a range of characters, with inclusive bounds."
    _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,
      (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,
      (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,
      (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}
    _op_name = "RANGE"

    def __init__(self, lower, upper, positive=True, case_flags=NOCASE,
      zerowidth=False):
        # lower and upper are codepoints; both ends are inclusive.
        RegexBase.__init__(self)
        self.lower = lower
        self.upper = upper
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self._key = (self.__class__, self.lower, self.upper, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        return Range(self.lower, self.upper, positive, case_flags, zerowidth)

    def optimise(self, info, reverse, in_set=False):
        # Is the range case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=
          FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # For each expanding character inside the range, add a String node
        # matching its full case-folded expansion.
        items = []
        for ch in expanding_chars:
            if self.lower <= ord(ch) <= self.upper:
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                items.append(String([ord(c) for c in folded],
                  case_flags=self.case_flags))

        if not items:
            # We can fall back to simple case-folding.
            return self

        if len(items) < self.upper - self.lower + 1:
            # Not all the characters are covered by the full case-folding,
            # so keep the original range as an alternative.
            items.insert(0, self)

        return Branch(items)

    def _compile(self, reverse, fuzzy):
        # Pack the option bits into the flags operand.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.lower,
          self.upper)]

    def dump(self, indent, reverse):
        # ascii() gives a quoted, escaped form; strip any b/u prefix.
        display_lower = ascii(chr(self.lower)).lstrip("bu")
        display_upper = ascii(chr(self.upper)).lstrip("bu")
        print("{}RANGE {} {} {}{}".format(INDENT * indent,
          POS_TEXT[self.positive], display_lower, display_upper,
          CASE_TEXT[self.case_flags]))

    def matches(self, ch):
        return (self.lower <= ch <= self.upper) == self.positive

    def max_width(self):
        return 1
3377
class RefGroup(RegexBase):
    "A node which matches a backreference to a capture group."
    _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):
      OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,
      False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,
      True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,
      (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

    def __init__(self, info, group, position, case_flags=NOCASE):
        RegexBase.__init__(self)
        self.info = info
        # group may still be a name at this point; fix_groups resolves it.
        self.group = group
        self.position = position
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.group, self.case_flags

    def fix_groups(self, pattern, reverse, fuzzy):
        # Resolve a named reference to its group number and validate that
        # the number refers to an existing capture group.
        try:
            self.group = int(self.group)
        except ValueError:
            try:
                self.group = self.info.group_index[self.group]
            except KeyError:
                raise error("unknown group", pattern, self.position)

        if not 1 <= self.group <= self.info.group_count:
            raise error("invalid group reference", pattern, self.position)

        # The key must be refreshed now that the group number is known.
        self._key = self.__class__, self.group, self.case_flags

    def remove_captures(self):
        # Bug fix: this previously referenced an undefined name "pattern",
        # so callers got a NameError instead of the intended error. The
        # pattern isn't available in this scope; error() accepts None.
        raise error("group reference not allowed", None, self.position)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        return [(self._opcode[self.case_flags, reverse], flags, self.group)]

    def dump(self, indent, reverse):
        print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        # The referenced group's width isn't known here.
        return UNLIMITED

    def __del__(self):
        # Break the reference to the Info object to avoid a reference cycle.
        self.info = None
3426
class SearchAnchor(ZeroWidthBase):
    # Zero-width anchor at the starting position of the search.
    _opcode = OP.SEARCH_ANCHOR
    _op_name = "SEARCH_ANCHOR"
3430
class Sequence(RegexBase):
    "A node which matches a sequence of other nodes, one after another."
    def __init__(self, items=None):
        RegexBase.__init__(self)
        if items is None:
            items = []

        self.items = items

    def fix_groups(self, pattern, reverse, fuzzy):
        for s in self.items:
            s.fix_groups(pattern, reverse, fuzzy)

    def optimise(self, info, reverse):
        # Flatten the sequences.
        items = []
        for s in self.items:
            s = s.optimise(info, reverse)
            if isinstance(s, Sequence):
                items.extend(s.items)
            else:
                items.append(s)

        return make_sequence(items)

    def pack_characters(self, info):
        "Packs sequences of characters into strings."
        items = []
        # Pending single codepoints, flushed into Character/String nodes
        # whenever the case flags change or a non-character node is seen.
        characters = []
        case_flags = NOCASE
        for s in self.items:
            if type(s) is Character and s.positive and not s.zerowidth:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new character are cased.
                    if s.case_flags or is_cased_i(info, s.value):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.append(s.value)
            elif type(s) is String or type(s) is Literal:
                if s.case_flags != case_flags:
                    # Different case sensitivity, so flush, unless neither
                    # the previous nor the new string are cased.
                    if s.case_flags or any(is_cased_i(info, c) for c in
                      characters):
                        Sequence._flush_characters(info, characters,
                          case_flags, items)

                        case_flags = s.case_flags

                characters.extend(s.characters)
            else:
                # Not a packable node: flush the pending characters and
                # recurse into the node itself.
                Sequence._flush_characters(info, characters, case_flags, items)

                items.append(s.pack_characters(info))

        Sequence._flush_characters(info, characters, case_flags, items)

        return make_sequence(items)

    def remove_captures(self):
        self.items = [s.remove_captures() for s in self.items]
        return self

    def is_atomic(self):
        return all(s.is_atomic() for s in self.items)

    def can_be_affix(self):
        return False

    def contains_group(self):
        return any(s.contains_group() for s in self.items)

    def get_firstset(self, reverse):
        # Accumulate firstsets while None is present (None is discarded on
        # each step and re-added at the end if it's never resolved).
        # NOTE(review): when reverse is true this calls items.reverse() on
        # self.items itself, mutating the node in place — presumably
        # harmless at this stage of compilation, but confirm.
        fs = set()
        items = self.items
        if reverse:
            items.reverse()
        for s in items:
            fs |= s.get_firstset(reverse)
            if None not in fs:
                return fs
            fs.discard(None)

        return fs | set([None])

    def has_simple_start(self):
        return bool(self.items) and self.items[0].has_simple_start()

    def _compile(self, reverse, fuzzy):
        # When matching in reverse, the items are compiled in reverse order.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        code = []
        for s in seq:
            code.extend(s.compile(reverse, fuzzy))

        return code

    def dump(self, indent, reverse):
        for s in self.items:
            s.dump(indent, reverse)

    @staticmethod
    def _flush_characters(info, characters, case_flags, items):
        # Turn the pending codepoints into Character/String/Literal nodes
        # and append them to items; clears the pending list.
        if not characters:
            return

        # Disregard case_flags if all of the characters are case-less.
        if case_flags & IGNORECASE:
            if not any(is_cased_i(info, c) for c in characters):
                case_flags = NOCASE

        if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            literals = Sequence._fix_full_casefold(characters)

            for item in literals:
                chars = item.characters

                if len(chars) == 1:
                    items.append(Character(chars[0], case_flags=item.case_flags))
                else:
                    items.append(String(chars, case_flags=item.case_flags))
        else:
            if len(characters) == 1:
                items.append(Character(characters[0], case_flags=case_flags))
            else:
                items.append(String(characters, case_flags=case_flags))

        characters[:] = []

    @staticmethod
    def _fix_full_casefold(characters):
        # Split a literal needing full case-folding into chunks that need it
        # and chunks that can use simple case-folding, which is faster.
        expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in
          _regex.get_expand_on_folding()]
        string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)
          for c in characters)).lower()
        chunks = []

        # Record the (start, end) spans where any folded expansion occurs.
        for e in expanded:
            found = string.find(e)

            while found >= 0:
                chunks.append((found, found + len(e)))
                found = string.find(e, found + 1)

        pos = 0
        literals = []

        # Spans needing full folding become FULLIGNORECASE literals; the
        # gaps between them only need simple IGNORECASE.
        for start, end in Sequence._merge_chunks(chunks):
            if pos < start:
                literals.append(Literal(characters[pos : start],
                  case_flags=IGNORECASE))

            literals.append(Literal(characters[start : end],
              case_flags=FULLIGNORECASE))
            pos = end

        if pos < len(characters):
            literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

        return literals

    @staticmethod
    def _merge_chunks(chunks):
        # Merge overlapping or touching (start, end) spans; sorts in place.
        if len(chunks) < 2:
            return chunks

        chunks.sort()

        start, end = chunks[0]
        new_chunks = []

        for s, e in chunks[1 : ]:
            if s <= end:
                end = max(end, e)
            else:
                new_chunks.append((start, end))
                start, end = s, e

        new_chunks.append((start, end))

        return new_chunks

    def is_empty(self):
        return all(i.is_empty() for i in self.items)

    def __eq__(self, other):
        return type(self) is type(other) and self.items == other.items

    def max_width(self):
        return sum(s.max_width() for s in self.items)

    def get_required_string(self, reverse):
        # Return the offset of, and the first node that reports, a required
        # string; the offset accumulates the widths of the nodes before it.
        seq = self.items
        if reverse:
            seq = seq[::-1]

        offset = 0

        for s in seq:
            ofs, req = s.get_required_string(reverse)
            offset += ofs
            if req:
                return offset, req

        return offset, None
3643
class SetBase(RegexBase):
    "Base class for the set-operation nodes (union, intersection, etc.)."
    def __init__(self, info, items, positive=True, case_flags=NOCASE,
      zerowidth=False):
        RegexBase.__init__(self)
        self.info = info
        self.items = tuple(items)
        self.positive = bool(positive)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]
        self.zerowidth = bool(zerowidth)

        self.char_width = 1

        self._key = (self.__class__, self.items, self.positive,
          self.case_flags, self.zerowidth)

    def rebuild(self, positive, case_flags, zerowidth):
        # Rebuild with new flags and re-optimise the result.
        return type(self)(self.info, self.items, positive, case_flags,
          zerowidth).optimise(self.info, False)

    def get_firstset(self, reverse):
        return set([self])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        # Emit the set header, then the compiled members, then END.
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP
        code = [(self._opcode[self.case_flags, reverse], flags)]
        for m in self.items:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def dump(self, indent, reverse):
        print("{}{} {}{}".format(INDENT * indent, self._op_name,
          POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))
        for i in self.items:
            i.dump(indent + 1, reverse)

    def _handle_case_folding(self, info, in_set):
        # Wrap the set in a Branch with String alternatives for members
        # whose full case-folding expands to multiple codepoints.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE) or in_set:
            return self

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return self

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        items = []
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                if folded not in seen:
                    items.append(String([ord(c) for c in folded],
                      case_flags=self.case_flags))
                    seen.add(folded)

        if not items:
            # We can fall back to simple case-folding.
            return self

        return Branch([self] + items)

    def max_width(self):
        # Normally one codepoint, but under full case-folding a member can
        # match a multi-codepoint folded expansion.
        # Is the set case-sensitive?
        if not self.positive or not (self.case_flags & IGNORECASE):
            return 1

        # Is full case-folding possible?
        if (not (self.info.flags & UNICODE) or (self.case_flags &
          FULLIGNORECASE) != FULLIGNORECASE):
            return 1

        # Get the characters which expand to multiple codepoints on folding.
        expanding_chars = _regex.get_expand_on_folding()

        # Get the folded characters in the set.
        seen = set()
        for ch in expanding_chars:
            if self.matches(ord(ch)):
                folded = _regex.fold_case(FULL_CASE_FOLDING, ch)
                seen.add(folded)

        if not seen:
            return 1

        return max(len(folded) for folded in seen)

    def __del__(self):
        # Break the reference to the Info object to avoid a reference cycle.
        self.info = None
3748
class SetDiff(SetBase):
    "Set difference: matches the first item but not the second."
    _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):
      OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):
      OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):
      OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,
      True): OP.SET_DIFF_IGN_REV}
    _op_name = "SET_DIFF"

    def optimise(self, info, reverse, in_set=False):
        operands = self.items
        # More than two operands: subtract the union of the rest from the
        # first.
        if len(operands) > 2:
            operands = [operands[0], SetUnion(info, operands[1 : ])]

        # A single operand: the difference degenerates to that operand.
        if len(operands) == 1:
            return operands[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(op.optimise(info, reverse, in_set=True) for op in
          operands)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        in_first = self.items[0].matches(ch)
        in_second = self.items[1].matches(ch)
        return (in_first and not in_second) == self.positive
3774
class SetInter(SetBase):
    "Set intersection: matches only what all of its items match."
    _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):
      OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,
      False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,
      True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,
      (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}
    _op_name = "SET_INTER"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for operand in self.items:
            operand = operand.optimise(info, reverse, in_set=True)
            if isinstance(operand, SetInter) and operand.positive:
                # A nested positive intersection can be folded into this one.
                flattened.extend(operand.items)
            else:
                flattened.append(operand)

        # A single operand: the intersection degenerates to that operand.
        if len(flattened) == 1:
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        return all(i.matches(ch) for i in self.items) == self.positive
3804
class SetSymDiff(SetBase):
    "Set symmetric difference: matches what an odd number of items match."
    _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):
      OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,
      False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,
      (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):
      OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}
    _op_name = "SET_SYM_DIFF"

    def optimise(self, info, reverse, in_set=False):
        flattened = []
        for operand in self.items:
            operand = operand.optimise(info, reverse, in_set=True)
            if isinstance(operand, SetSymDiff) and operand.positive:
                # A nested positive symmetric difference can be folded into
                # this one.
                flattened.extend(operand.items)
            else:
                flattened.append(operand)

        # A single operand: the operation degenerates to that operand.
        if len(flattened) == 1:
            return flattened[0].with_flags(case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(flattened)

        return self._handle_case_folding(info, in_set)

    def matches(self, ch):
        # Toggle parity once for each item that matches.
        parity = False
        for i in self.items:
            if i.matches(ch):
                parity = not parity

        return parity == self.positive
3837
class SetUnion(SetBase):
    "Set union: matches what any of its items match."
    _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):
      OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,
      False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,
      True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,
      (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}
    _op_name = "SET_UNION"

    def optimise(self, info, reverse, in_set=False):
        items = []
        for m in self.items:
            m = m.optimise(info, reverse, in_set=True)
            if isinstance(m, SetUnion) and m.positive:
                # Union in union.
                items.extend(m.items)
            elif isinstance(m, AnyAll):
                # Anything unioned with "match all" matches all.
                return AnyAll()
            else:
                items.append(m)

        # Are there complementary properties? A property and its negation
        # together match everything.
        properties = (set(), set())

        for m in items:
            if isinstance(m, Property):
                properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

        if properties[0] & properties[1]:
            return AnyAll()

        if len(items) == 1:
            i = items[0]
            return i.with_flags(positive=i.positive == self.positive,
              case_flags=self.case_flags,
              zerowidth=self.zerowidth).optimise(info, reverse, in_set)

        self.items = tuple(items)

        return self._handle_case_folding(info, in_set)

    def _compile(self, reverse, fuzzy):
        flags = 0
        if self.positive:
            flags |= POSITIVE_OP
        if self.zerowidth:
            flags |= ZEROWIDTH_OP
        if fuzzy:
            flags |= FUZZY_OP

        # Group single characters by polarity so they can be emitted as one
        # CHARACTER or STRING operand per polarity instead of one each.
        characters, others = defaultdict(list), []
        for m in self.items:
            if isinstance(m, Character):
                characters[m.positive].append(m.value)
            else:
                others.append(m)

        code = [(self._opcode[self.case_flags, reverse], flags)]

        for positive, values in characters.items():
            flags = 0
            if positive:
                flags |= POSITIVE_OP
            if len(values) == 1:
                code.append((OP.CHARACTER, flags, values[0]))
            else:
                code.append((OP.STRING, flags, len(values)) + tuple(values))

        for m in others:
            code.extend(m.compile())

        code.append((OP.END, ))

        return code

    def matches(self, ch):
        m = any(i.matches(ch) for i in self.items)
        return m == self.positive
3915
class Skip(ZeroWidthBase):
    # The (*SKIP) backtracking-control node.
    _op_name = "SKIP"
    _opcode = OP.SKIP
3919
class StartOfLine(ZeroWidthBase):
    # Zero-width assertion for the start of a line.
    _opcode = OP.START_OF_LINE
    _op_name = "START_OF_LINE"
3923
class StartOfLineU(StartOfLine):
    # Start-of-line assertion using the Unicode line-break rules.
    _opcode = OP.START_OF_LINE_U
    _op_name = "START_OF_LINE_U"
3927
class StartOfString(ZeroWidthBase):
    # Zero-width assertion for the start of the string.
    _opcode = OP.START_OF_STRING
    _op_name = "START_OF_STRING"
3931
class StartOfWord(ZeroWidthBase):
    # Zero-width assertion for the start of a word.
    _opcode = OP.START_OF_WORD
    _op_name = "START_OF_WORD"
3935
class String(RegexBase):
    "A node which matches a sequence of codepoints as a literal string."
    _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,
      (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,
      (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,
      (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):
      OP.STRING_FLD_REV}

    def __init__(self, characters, case_flags=NOCASE):
        self.characters = tuple(characters)
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:
            # Pre-fold each character; a full case-fold may expand a single
            # codepoint into several.
            folded = []
            for char in self.characters:
                expansion = _regex.fold_case(FULL_CASE_FOLDING, chr(char))
                folded.extend(ord(c) for c in expansion)
        else:
            folded = self.characters

        self.folded_characters = tuple(folded)
        # Set to True later if this is the pattern's required string.
        self.required = False

        self._key = self.__class__, self.characters, self.case_flags

    def get_firstset(self, reverse):
        # The first character depends on the matching direction.
        index = -1 if reverse else 0
        return set([Character(self.characters[index],
          case_flags=self.case_flags)])

    def has_simple_start(self):
        return True

    def _compile(self, reverse, fuzzy):
        flags = 0
        if fuzzy:
            flags |= FUZZY_OP
        if self.required:
            flags |= REQUIRED_OP
        opcode = self._opcode[self.case_flags, reverse]
        return [(opcode, flags, len(self.folded_characters)) +
          self.folded_characters]

    def dump(self, indent, reverse):
        # ascii() gives a quoted, escaped form; strip any b/u prefix.
        literal = "".join(chr(c) for c in self.characters)
        display = ascii(literal).lstrip("bu")
        print("{}STRING {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))

    def max_width(self):
        return len(self.folded_characters)

    def get_required_string(self, reverse):
        # The string itself is the required string, at offset 0.
        return 0, self
3990
class Literal(String):
    "A String subclass that dumps with a distinct LITERAL MATCH label."
    def dump(self, indent, reverse):
        text = ''.join(chr(c) for c in self.characters)
        # ascii() gives a quoted, escaped form; strip any b/u prefix.
        display = ascii(text).lstrip("bu")
        print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,
          CASE_TEXT[self.case_flags]))
3997
class StringSet(Branch):
    "A node which matches any one of a named list of strings."
    def __init__(self, info, name, case_flags=NOCASE):
        self.info = info
        # name refers to a keyword argument supplied to the compiler; the
        # actual strings are looked up in info.kwargs.
        self.name = name
        self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

        self._key = self.__class__, self.name, self.case_flags

        # Register this (name, case_flags) combination, assigning it an
        # index the first time it's seen.
        self.set_key = (name, self.case_flags)
        if self.set_key not in info.named_lists_used:
            info.named_lists_used[self.set_key] = len(info.named_lists_used)

        # NOTE(review): index and fold_flags appear to be unused below.
        index = self.info.named_lists_used[self.set_key]
        items = self.info.kwargs[self.name]

        case_flags = self.case_flags

        encoding = self.info.flags & _ALL_ENCODINGS
        fold_flags = encoding | case_flags

        choices = []

        # Each member becomes a list of Character nodes; str members are
        # converted to codepoint lists first.
        for string in items:
            if isinstance(string, str):
                string = [ord(c) for c in string]

            choices.append([Character(c, case_flags=case_flags) for c in
              string])

        # Sort from longest to shortest.
        choices.sort(key=len, reverse=True)

        self.branches = [Sequence(choice) for choice in choices]

    def dump(self, indent, reverse):
        print("{}STRING_SET {}{}".format(INDENT * indent, self.name,
          CASE_TEXT[self.case_flags]))

    def __del__(self):
        # Break the reference to the Info object to avoid a reference cycle.
        self.info = None
4038
class Source:
    "Scanner for the regular expression source string."
    def __init__(self, string):
        if isinstance(string, str):
            self.string = string
            self.char_type = chr
        else:
            # Work on a str internally; Latin-1 maps each byte to the
            # codepoint with the same value.
            self.string = string.decode("latin-1")
            self.char_type = lambda c: bytes([c])

        self.pos = 0
        # When True (verbose mode), whitespace and "#" comments are skipped.
        self.ignore_space = False
        # An empty string/bytes of the same type as the input.
        self.sep = string[ : 0]

    def get(self, override_ignore=False):
        "Returns the next character, or an empty string at the end."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space and not override_ignore:
                # In verbose mode, skip whitespace and "#" comments.
                while True:
                    if text[index].isspace():
                        index += 1
                    elif text[index] == "#":
                        index = text.index("\n", index)
                    else:
                        break

            ch = text[index]
            self.pos = index + 1
            return ch
        except IndexError:
            # We've reached the end of the string.
            self.pos = index
            return text[ : 0]
        except ValueError:
            # The comment extended to the end of the string.
            self.pos = len(text)
            return text[ : 0]

    def get_many(self, count=1):
        "Returns up to count characters."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space:
                collected = []

                while len(collected) < count:
                    # Skip whitespace and "#" comments before each character.
                    while True:
                        if text[index].isspace():
                            index += 1
                        elif text[index] == "#":
                            index = text.index("\n", index)
                        else:
                            break

                    collected.append(text[index])
                    index += 1

                collected = "".join(collected)
            else:
                collected = text[index : index + count]
                index += len(collected)

            self.pos = index
            return collected
        except (IndexError, ValueError):
            # Either the string or a trailing comment ran out; return what
            # was gathered so far.
            self.pos = len(text)
            return "".join(collected)

    def get_while(self, test_set, include=True, keep_spaces=False):
        "Returns the run of characters that are (or aren't) in test_set."
        text = self.string
        index = self.pos

        if self.ignore_space and not keep_spaces:
            try:
                collected = []

                while True:
                    if text[index].isspace():
                        # Skip over the whitespace.
                        index += 1
                    elif text[index] == "#":
                        # Skip over the comment to the end of the line.
                        index = text.index("\n", index)
                    elif (text[index] in test_set) == include:
                        collected.append(text[index])
                        index += 1
                    else:
                        break

                self.pos = index
            except (IndexError, ValueError):
                # End of string, or a comment ran to the end of it.
                self.pos = len(text)

            return "".join(collected)
        else:
            try:
                while (text[index] in test_set) == include:
                    index += 1
            except IndexError:
                # We've reached the end of the string.
                pass

            result = text[self.pos : index]
            self.pos = index
            return result

    def skip_while(self, test_set, include=True):
        "Skips characters that are (or aren't) in test_set."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space:
                while True:
                    if text[index].isspace():
                        # Skip over the whitespace.
                        index += 1
                    elif text[index] == "#":
                        # Skip over the comment to the end of the line.
                        index = text.index("\n", index)
                    elif (text[index] in test_set) == include:
                        index += 1
                    else:
                        break
            else:
                while (text[index] in test_set) == include:
                    index += 1

            self.pos = index
        except (IndexError, ValueError):
            # End of string, or a comment ran to the end of it.
            self.pos = len(text)

    def match(self, substring):
        "Advances past substring if it's next in the source; returns bool."
        text = self.string
        index = self.pos

        if not self.ignore_space:
            if not text.startswith(substring, index):
                return False

            self.pos = index + len(substring)
            return True

        try:
            for wanted in substring:
                # Skip whitespace and "#" comments before each character.
                while True:
                    if text[index].isspace():
                        index += 1
                    elif text[index] == "#":
                        index = text.index("\n", index)
                    else:
                        break

                if text[index] != wanted:
                    return False

                index += 1

            self.pos = index
            return True
        except (IndexError, ValueError):
            # Ran off the end of the string (or a trailing comment).
            return False

    def expect(self, substring):
        "Like match, but raises error if substring isn't next."
        if not self.match(substring):
            raise error("missing {}".format(substring), self.string, self.pos)

    def at_end(self):
        "Returns whether only ignorable text (if any) remains."
        text = self.string
        index = self.pos

        try:
            if self.ignore_space:
                while True:
                    if text[index].isspace():
                        index += 1
                    elif text[index] == "#":
                        index = text.index("\n", index)
                    else:
                        break

            return index >= len(text)
        except (IndexError, ValueError):
            # End of string, or a comment ran to the end of it.
            return True
4260
class Info:
    "Info about the regular expression."

    def __init__(self, flags=0, char_type=None, kwargs={}):
        # NOTE(review): the mutable default for kwargs is shared across
        # calls — presumably it's only ever read, never mutated; confirm.
        # Fill in the version-specific default flags.
        flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]
        self.flags = flags
        self.global_flags = flags
        self.inline_locale = False

        self.kwargs = kwargs

        # Capture-group bookkeeping: numbers, names, and which groups are
        # currently open during parsing.
        self.group_count = 0
        self.group_index = {}
        self.group_name = {}
        self.char_type = char_type
        self.named_lists_used = {}
        self.open_groups = []
        self.open_group_count = {}
        self.defined_groups = {}
        self.group_calls = []
        self.private_groups = {}

    def open_group(self, name=None):
        # Returns the group number for a (possibly named) group being
        # opened; allocates a new number if the name is unknown.
        group = self.group_index.get(name)
        if group is None:
            while True:
                self.group_count += 1
                if name is None or self.group_count not in self.group_name:
                    break

            group = self.group_count
            if name:
                self.group_index[name] = group
                self.group_name[group] = name

        if group in self.open_groups:
            # We have a nested named group. We'll assign it a private group
            # number, initially negative until we can assign a proper
            # (positive) number.
            group_alias = -(len(self.private_groups) + 1)
            self.private_groups[group_alias] = group
            group = group_alias

        self.open_groups.append(group)
        self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

        return group

    def close_group(self):
        self.open_groups.pop()

    def is_open_group(self, name):
        # In version 1, a group reference can refer to an open group. We'll
        # just pretend the group isn't open.
        version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version == VERSION1:
            return False

        if name.isdigit():
            group = int(name)
        else:
            group = self.group_index.get(name)

        return group in self.open_groups
4325
def _check_group_features(info, parsed):
    """Checks whether the reverse and fuzzy features of the group calls match
    the groups which they call.
    """
    # Maps (group, reverse, fuzzy) to a call-reference index; calls needing
    # a variant of a group that doesn't exist get a copy queued in
    # additional_groups.
    call_refs = {}
    additional_groups = []
    for call, reverse, fuzzy in info.group_calls:
        # Look up the reference of this group call.
        key = (call.group, reverse, fuzzy)
        ref = call_refs.get(key)
        if ref is None:
            # This group doesn't have a reference yet, so look up its features.
            if call.group == 0:
                # Calling the pattern as a whole.
                rev = bool(info.flags & REVERSE)
                fuz = isinstance(parsed, Fuzzy)
                if (rev, fuz) != (reverse, fuzzy):
                    # The pattern as a whole doesn't have the features we want,
                    # so we'll need to make a copy of it with the desired
                    # features.
                    additional_groups.append((CallRef(len(call_refs), parsed),
                      reverse, fuzzy))
            else:
                # Calling a capture group.
                def_info = info.defined_groups[call.group]
                group = def_info[0]
                if def_info[1 : ] != (reverse, fuzzy):
                    # The group doesn't have the features we want, so we'll
                    # need to make a copy of it with the desired features.
                    additional_groups.append((group, reverse, fuzzy))

            ref = len(call_refs)
            call_refs[key] = ref

        call.call_ref = ref

    info.call_refs = call_refs
    info.additional_groups = additional_groups
4364
def _get_required_string(parsed, flags):
    """Gets the required string and related info of a parsed pattern.

    Returns (offset, folded_characters, case_flags); when the pattern has
    no required string, returns (0, (), 0). An unbounded offset is
    reported as -1.
    """
    req_offset, required = parsed.get_required_string(bool(flags & REVERSE))
    if not required:
        return 0, (), 0

    # Mark the node so the compiler knows it's the required string.
    required.required = True
    if req_offset >= UNLIMITED:
        req_offset = -1

    req_flags = required.case_flags
    if not (flags & UNICODE):
        req_flags &= ~UNICODE

    return req_offset, required.folded_characters, req_flags
4385
class Scanner:
    """A lexical scanner built from a lexicon of (phrase, action) pairs.

    Each phrase is a regex pattern (capture groups are forbidden within a
    phrase); the phrases are combined into one compound pattern with one
    group per phrase. 'scan' repeatedly matches the compound pattern
    against a string, applying the action of whichever phrase matched.
    """

    def __init__(self, lexicon, flags=0):
        """Compiles the lexicon into a compound scanner pattern.

        lexicon: sequence of (phrase, action) pairs; an action is either a
            callable invoked as action(scanner, matched_text) or a plain
            value to emit.
        flags: regex flags applied when parsing every phrase.

        Raises error for a malformed phrase or for group calls (recursion),
        which the Scanner class doesn't support.
        """
        self.lexicon = lexicon

        # Combine phrases into a compound pattern.
        patterns = []
        for phrase, action in lexicon:
            # Parse the regular expression.
            source = Source(phrase)
            info = Info(flags, source.char_type)
            source.ignore_space = bool(info.flags & VERBOSE)
            parsed = _parse_pattern(source, info)
            if not source.at_end():
                raise error("unbalanced parenthesis", source.string,
                    source.pos)

            # We want to forbid capture groups within each phrase.
            patterns.append(parsed.remove_captures())

        # Combine all the subpatterns into one pattern, one numbered group
        # per phrase so 'scan' can tell which phrase matched.
        info = Info(flags)
        patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]
        parsed = Branch(patterns)

        # Optimise the compound pattern.
        reverse = bool(info.flags & REVERSE)
        parsed = parsed.optimise(info, reverse)
        parsed = parsed.pack_characters(info)

        # Get the required string.
        req_offset, req_chars, req_flags = _get_required_string(parsed,
            info.flags)

        # Check the features of the groups.
        _check_group_features(info, parsed)

        # Complain if there are any group calls. They are not supported by the
        # Scanner class.
        if info.call_refs:
            raise error("recursive regex not supported by Scanner",
                source.string, source.pos)

        reverse = bool(info.flags & REVERSE)

        # Compile the compound pattern. The result is a list of tuples.
        code = parsed.compile(reverse) + [(OP.SUCCESS, )]

        # Flatten the code into a list of ints.
        code = _flatten_code(code)

        if not parsed.has_simple_start():
            # Get the first set, if possible, and prepend it as a matching
            # hint for the engine.
            try:
                fs_code = _compile_firstset(info, parsed.get_firstset(reverse))
                fs_code = _flatten_code(fs_code)
                code = fs_code + code
            except _FirstSetError:
                pass

        # Check the global flags for conflicts.
        version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION
        if version not in (0, VERSION0, VERSION1):
            raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

        # Create the PatternObject.
        #
        # Local flags like IGNORECASE affect the code generation, but aren't
        # needed by the PatternObject itself. Conversely, global flags like
        # LOCALE _don't_ affect the code generation but _are_ needed by the
        # PatternObject.
        self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,
            code, {}, {}, {}, [], req_offset, req_chars, req_flags,
            len(patterns))

    def scan(self, string):
        """Scans 'string', returning (tokens, remainder).

        'tokens' is the list of non-None action results in match order;
        'remainder' is the unscanned tail of the string (non-empty if
        scanning stopped at a failed or zero-width match). While a callable
        action runs, the current match object is available as self.match.
        """
        result = []
        append = result.append
        match = self.scanner.scanner(string).match
        i = 0
        while True:
            m = match()
            if not m:
                break
            j = m.end()
            if i == j:
                # A zero-width match would never advance; stop scanning.
                break
            action = self.lexicon[m.lastindex - 1][1]
            # Use the modern callable() idiom rather than probing for
            # '__call__' with hasattr().
            if callable(action):
                self.match = m
                action = action(self, m.group())
            if action is not None:
                append(action)
            i = j

        return result, string[i : ]
4481
# Get the known properties dict.
PROPERTIES = _regex.get_properties()

# Build the inverse of the properties dict: for each property id, keep the
# longest known name for the property and for each of its values.
PROPERTY_NAMES = {}
for _prop_name, (_prop_id, _values) in PROPERTIES.items():
    _best_name, _value_names = PROPERTY_NAMES.get(_prop_id, ("", {}))
    if len(_prop_name) > len(_best_name):
        _best_name = _prop_name
    PROPERTY_NAMES[_prop_id] = _best_name, _value_names

    for _val_name, _val_id in _values.items():
        if len(_val_name) > len(_value_names.get(_val_id, "")):
            _value_names[_val_id] = _val_name
        else:
            _value_names.setdefault(_val_id, "")
4495
# Character escape sequences: \a, \b, \f, \n, \r, \t, \v map to their
# C-style control characters.
CHARACTER_ESCAPES = dict(zip("abfnrtv", "\a\b\f\n\r\t\v"))
4506
ASCII_ENCODING = 1
UNICODE_ENCODING = 2

# Predefined character set escape sequences: \d/\D, \s/\S, \w/\W in their
# positive/negative forms, plus \h (horizontal whitespace, positive only).
CHARSET_ESCAPES = {"h": lookup_property(None, "Blank", True)}
for _letter, _prop in (("d", "Digit"), ("s", "Space"), ("w", "Word")):
    CHARSET_ESCAPES[_letter] = lookup_property(None, _prop, True)
    CHARSET_ESCAPES[_letter.upper()] = lookup_property(None, _prop, False)

def _encoded_charset_escapes(encoding):
    # Build the charset escapes for a specific encoding; \h has no
    # encoding-specific form, so it keeps its default.
    escapes = dict(CHARSET_ESCAPES)
    for letter, prop in (("d", "Digit"), ("s", "Space"), ("w", "Word")):
        escapes[letter] = lookup_property(None, prop, True,
          encoding=encoding)
        escapes[letter.upper()] = lookup_property(None, prop, False,
          encoding=encoding)
    return escapes

ASCII_CHARSET_ESCAPES = _encoded_charset_escapes(ASCII_ENCODING)
UNICODE_CHARSET_ESCAPES = _encoded_charset_escapes(UNICODE_ENCODING)
4539
# Positional escape sequences.
POSITION_ESCAPES = {
    "A": StartOfString(),
    "b": Boundary(),
    "B": Boundary(False),
    "K": Keep(),
    "m": StartOfWord(),
    "M": EndOfWord(),
    "Z": EndOfString(),
}

def _encoded_position_escapes(encoding):
    # \A, \K and \Z are encoding-independent; only the word-related
    # escapes (\b, \B, \m, \M) change with the encoding.
    escapes = dict(POSITION_ESCAPES)
    escapes["b"] = Boundary(encoding=encoding)
    escapes["B"] = Boundary(False, encoding=encoding)
    escapes["m"] = StartOfWord(encoding=encoding)
    escapes["M"] = EndOfWord(encoding=encoding)
    return escapes

ASCII_POSITION_ESCAPES = _encoded_position_escapes(ASCII_ENCODING)
UNICODE_POSITION_ESCAPES = _encoded_position_escapes(UNICODE_ENCODING)
4564
# Positional escape sequences when WORD flag set: the word-related escapes
# switch to the "default word boundary" behaviour.
WORD_POSITION_ESCAPES = {
    **POSITION_ESCAPES,
    "b": DefaultBoundary(),
    "B": DefaultBoundary(False),
    "m": DefaultStartOfWord(),
    "M": DefaultEndOfWord(),
}
4573
# Regex control verbs, as used in (*FAIL), (*PRUNE), etc.; "F" is a
# synonym for "FAIL".
VERBS = dict(
    FAIL=Failure(),
    F=Failure(),
    PRUNE=Prune(),
    SKIP=Skip(),
)