Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex

2# Secret Labs' Regular Expression Engine core module

6# This version of the SRE library can be redistributed under CNRI's

7# Python 1.6 license. For any other use, please contact Secret Labs

8# AB (info@pythonware.com).

10# Portions of this engine have been developed in cooperation with

11# CNRI. Hewlett-Packard provided funding for 1.6 integration and

12# other compatibility work.

13#

14# 2010-01-16 mrab Python front-end re-written and extended

16import enum

17import string

18import unicodedata

19from collections import defaultdict

21import regex._regex as _regex

23__all__ = ["A", "ASCII", "B", "BESTMATCH", "D", "DEBUG", "E", "ENHANCEMATCH",

24 "F", "FULLCASE", "I", "IGNORECASE", "L", "LOCALE", "M", "MULTILINE", "P",

25 "POSIX", "R", "REVERSE", "S", "DOTALL", "T", "TEMPLATE", "U", "UNICODE",

26 "V0", "VERSION0", "V1", "VERSION1", "W", "WORD", "X", "VERBOSE", "error",

27 "Scanner", "RegexFlag"]

29# The regex exception.

30class error(Exception):

31 """Exception raised for invalid regular expressions.

33 Attributes:

35 msg: The unformatted error message

36 pattern: The regular expression pattern

37 pos: The position in the pattern where compilation failed, or None

38 lineno: The line number where compilation failed, unless pos is None

39 colno: The column number where compilation failed, unless pos is None

40 """

42 def __init__(self, message, pattern=None, pos=None):

43 newline = '\n' if isinstance(pattern, str) else b'\n'

44 self.msg = message

45 self.pattern = pattern

46 self.pos = pos

47 if pattern is not None and pos is not None:

48 self.lineno = pattern.count(newline, 0, pos) + 1

49 self.colno = pos - pattern.rfind(newline, 0, pos)

51 message = "{} at position {}".format(message, pos)

53 if newline in pattern:

54 message += " (line {}, column {})".format(self.lineno,

55 self.colno)

57 Exception.__init__(self, message)

59# The exception for when a positional flag has been turned on in the old

60# behaviour.

61class _UnscopedFlagSet(Exception):

62 pass

64# The exception for when parsing fails and we want to try something else.

65class ParseError(Exception):

66 pass

68# The exception for when there isn't a valid first set.

69class _FirstSetError(Exception):

70 pass

72# Flags.

73class RegexFlag(enum.IntFlag):

74 A = ASCII = 0x80 # Assume ASCII locale.

75 B = BESTMATCH = 0x1000 # Best fuzzy match.

76 D = DEBUG = 0x200 # Print parsed pattern.

77 E = ENHANCEMATCH = 0x8000 # Attempt to improve the fit after finding the first

78 # fuzzy match.

79 F = FULLCASE = 0x4000 # Unicode full case-folding.

80 I = IGNORECASE = 0x2 # Ignore case.

81 L = LOCALE = 0x4 # Assume current 8-bit locale.

82 M = MULTILINE = 0x8 # Make anchors look for newline.

83 P = POSIX = 0x10000 # POSIX-style matching (leftmost longest).

84 R = REVERSE = 0x400 # Search backwards.

85 S = DOTALL = 0x10 # Make dot match newline.

86 U = UNICODE = 0x20 # Assume Unicode locale.

87 V0 = VERSION0 = 0x2000 # Old legacy behaviour.

88 V1 = VERSION1 = 0x100 # New enhanced behaviour.

89 W = WORD = 0x800 # Default Unicode word breaks.

90 X = VERBOSE = 0x40 # Ignore whitespace and comments.

91 T = TEMPLATE = 0x1 # Template (present because re module has it).

93 def __repr__(self):

94 if self._name_ is not None:

95 return 'regex.%s' % self._name_

97 value = self._value_

98 members = []

99 negative = value < 0

100

101 if negative:

102 value = ~value

103

104 for m in self.__class__:

105 if value & m._value_:

106 value &= ~m._value_

107 members.append('regex.%s' % m._name_)

108

109 if value:

110 members.append(hex(value))

111

112 res = '|'.join(members)

113

114 if negative:

115 if len(members) > 1:

116 res = '~(%s)' % res

117 else:

118 res = '~%s' % res

119

120 return res

121

122 __str__ = object.__str__

123

124globals().update(RegexFlag.__members__)

125

126DEFAULT_VERSION = VERSION1

127

128_ALL_VERSIONS = VERSION0 | VERSION1

129_ALL_ENCODINGS = ASCII | LOCALE | UNICODE

130

131# The default flags for the various versions.

132DEFAULT_FLAGS = {VERSION0: 0, VERSION1: FULLCASE}

133

134# The mask for the flags.

136 REVERSE)

138 _ALL_ENCODINGS)

139

140ALPHA = frozenset(string.ascii_letters)

141DIGITS = frozenset(string.digits)

142ALNUM = ALPHA | DIGITS

143OCT_DIGITS = frozenset(string.octdigits)

144HEX_DIGITS = frozenset(string.hexdigits)

145SPECIAL_CHARS = frozenset("()|?*+{^$.[\\#") | frozenset([""])

146NAMED_CHAR_PART = ALNUM | frozenset(" -")

147PROPERTY_NAME_PART = ALNUM | frozenset(" &_-.")

148SET_OPS = ("||", "~~", "&&", "--")

149

150# The width of the code words inside the regex engine.

151BYTES_PER_CODE = _regex.get_code_size()

152BITS_PER_CODE = BYTES_PER_CODE * 8

153

154# The repeat count which represents infinity.

155UNLIMITED = (1 << BITS_PER_CODE) - 1

156

157# The regular expression flags.

158REGEX_FLAGS = {"a": ASCII, "b": BESTMATCH, "e": ENHANCEMATCH, "f": FULLCASE,

159 "i": IGNORECASE, "L": LOCALE, "m": MULTILINE, "p": POSIX, "r": REVERSE,

160 "s": DOTALL, "u": UNICODE, "V0": VERSION0, "V1": VERSION1, "w": WORD, "x":

161 VERBOSE}

162

163# The case flags.

164CASE_FLAGS = FULLCASE | IGNORECASE

165NOCASE = 0

166FULLIGNORECASE = FULLCASE | IGNORECASE

167

168FULL_CASE_FOLDING = UNICODE | FULLIGNORECASE

169

170CASE_FLAGS_COMBINATIONS = {0: 0, FULLCASE: 0, IGNORECASE: IGNORECASE,

171 FULLIGNORECASE: FULLIGNORECASE}

172

173# The number of digits in hexadecimal escapes.

174HEX_ESCAPES = {"x": 2, "u": 4, "U": 8}

175

176# The names of the opcodes.

177OPCODES = """

178FAILURE

179SUCCESS

180ANY

181ANY_ALL

182ANY_ALL_REV

183ANY_REV

184ANY_U

185ANY_U_REV

186ATOMIC

187BOUNDARY

188BRANCH

189CALL_REF

190CHARACTER

191CHARACTER_IGN

192CHARACTER_IGN_REV

193CHARACTER_REV

194CONDITIONAL

195DEFAULT_BOUNDARY

196DEFAULT_END_OF_WORD

197DEFAULT_START_OF_WORD

198END

199END_OF_LINE

200END_OF_LINE_U

201END_OF_STRING

202END_OF_STRING_LINE

203END_OF_STRING_LINE_U

204END_OF_WORD

205FUZZY

206GRAPHEME_BOUNDARY

207GREEDY_REPEAT

208GROUP

209GROUP_CALL

210GROUP_EXISTS

211KEEP

212LAZY_REPEAT

213LOOKAROUND

214NEXT

215PROPERTY

216PROPERTY_IGN

217PROPERTY_IGN_REV

218PROPERTY_REV

219PRUNE

220RANGE

221RANGE_IGN

222RANGE_IGN_REV

223RANGE_REV

224REF_GROUP

225REF_GROUP_FLD

226REF_GROUP_FLD_REV

227REF_GROUP_IGN

228REF_GROUP_IGN_REV

229REF_GROUP_REV

230SEARCH_ANCHOR

231SET_DIFF

232SET_DIFF_IGN

233SET_DIFF_IGN_REV

234SET_DIFF_REV

235SET_INTER

236SET_INTER_IGN

237SET_INTER_IGN_REV

238SET_INTER_REV

239SET_SYM_DIFF

240SET_SYM_DIFF_IGN

241SET_SYM_DIFF_IGN_REV

242SET_SYM_DIFF_REV

243SET_UNION

244SET_UNION_IGN

245SET_UNION_IGN_REV

246SET_UNION_REV

247SKIP

248START_OF_LINE

249START_OF_LINE_U

250START_OF_STRING

251START_OF_WORD

252STRING

253STRING_FLD

254STRING_FLD_REV

255STRING_IGN

256STRING_IGN_REV

257STRING_REV

258FUZZY_EXT

259"""

260

261# Define the opcodes in a namespace.

262class Namespace:

263 pass

264

265OP = Namespace()

266for i, op in enumerate(OPCODES.split()):

267 setattr(OP, op, i)

268

269def _shrink_cache(cache_dict, args_dict, locale_sensitive, max_length, divisor=5):

270 """Make room in the given cache.

271

272 Args:

273 cache_dict: The cache dictionary to modify.

274 args_dict: The dictionary of named list args used by patterns.

275 max_length: Maximum # of entries in cache_dict before it is shrunk.

276 divisor: Cache will shrink to max_length - 1/divisor*max_length items.

277 """

278 # Toss out a fraction of the entries at random to make room for new ones.

279 # A random algorithm was chosen as opposed to simply cache_dict.popitem()

280 # as popitem could penalize the same regular expression repeatedly based

281 # on its internal hash value. Being random should spread the cache miss

282 # love around.

283 cache_keys = tuple(cache_dict.keys())

284 overage = len(cache_keys) - max_length

285 if overage < 0:

286 # Cache is already within limits. Normally this should not happen

287 # but it could due to multithreading.

288 return

289

290 number_to_toss = max_length // divisor + overage

291

292 # The import is done here to avoid a circular dependency.

293 import random

294 if not hasattr(random, 'sample'):

295 # Do nothing while resolving the circular dependency:

296 # re->random->warnings->tokenize->string->re

297 return

298

299 for doomed_key in random.sample(cache_keys, number_to_toss):

300 try:

301 del cache_dict[doomed_key]

302 except KeyError:

303 # Ignore problems if the cache changed from another thread.

304 pass

305

306 # Rebuild the arguments and locale-sensitivity dictionaries.

307 args_dict.clear()

308 sensitivity_dict = {}

309 for pattern, pattern_type, flags, args, default_version, locale in tuple(cache_dict):

310 args_dict[pattern, pattern_type, flags, default_version, locale] = args

311 try:

312 sensitivity_dict[pattern_type, pattern] = locale_sensitive[pattern_type, pattern]

313 except KeyError:

314 pass

315

316 locale_sensitive.clear()

317 locale_sensitive.update(sensitivity_dict)

318

319def _fold_case(info, string):

320 "Folds the case of a string."

321 flags = info.flags

322 if (flags & _ALL_ENCODINGS) == 0:

323 flags |= info.guess_encoding

324

325 return _regex.fold_case(flags, string)

326

327def is_cased_i(info, char):

328 "Checks whether a character is cased."

329 return len(_regex.get_all_cases(info.flags, char)) > 1

330

331def is_cased_f(flags, char):

332 "Checks whether a character is cased."

333 return len(_regex.get_all_cases(flags, char)) > 1

334

335def _compile_firstset(info, fs):

336 "Compiles the firstset for the pattern."

337 reverse = bool(info.flags & REVERSE)

338 fs = _check_firstset(info, reverse, fs)

339 if not fs or isinstance(fs, AnyAll):

340 return []

341

342 # Compile the firstset.

343 return fs.compile(reverse)

344

345def _check_firstset(info, reverse, fs):

346 "Checks the firstset for the pattern."

347 if not fs or None in fs:

348 return None

349

350 # If we ignore the case, for simplicity we won't build a firstset.

351 members = set()

352 case_flags = NOCASE

353 for i in fs:

354 if isinstance(i, Character) and not i.positive:

355 return None

356

357# if i.case_flags:

358# if isinstance(i, Character):

359# if is_cased_i(info, i.value):

360# return []

361# elif isinstance(i, SetBase):

362# return []

363 case_flags |= i.case_flags

364 members.add(i.with_flags(case_flags=NOCASE))

365

366 if case_flags == (FULLCASE | IGNORECASE):

367 return None

368

369 # Build the firstset.

370 fs = SetUnion(info, list(members), case_flags=case_flags & ~FULLCASE,

371 zerowidth=True)

372 fs = fs.optimise(info, reverse, in_set=True)

373

374 return fs

375

376def _flatten_code(code):

377 "Flattens the code from a list of tuples."

378 flat_code = []

379 for c in code:

380 flat_code.extend(c)

381

382 return flat_code

383

384def make_case_flags(info):

385 "Makes the case flags."

386 flags = info.flags & CASE_FLAGS

387

388 # Turn off FULLCASE if ASCII is turned on.

389 if info.flags & ASCII:

390 flags &= ~FULLCASE

391

392 return flags

393

394def make_character(info, value, in_set=False):

395 "Makes a character literal."

396 if in_set:

397 # A character set is built case-sensitively.

398 return Character(value)

399

400 return Character(value, case_flags=make_case_flags(info))

401

402def make_ref_group(info, name, position):

403 "Makes a group reference."

404 return RefGroup(info, name, position, case_flags=make_case_flags(info))

405

406def make_string_set(info, name):

407 "Makes a string set."

408 return StringSet(info, name, case_flags=make_case_flags(info))

409

410def make_property(info, prop, in_set):

411 "Makes a property."

412 if in_set:

413 return prop

414

415 return prop.with_flags(case_flags=make_case_flags(info))

416

417def _parse_pattern(source, info):

418 "Parses a pattern, eg. 'a|b|c'."

419 branches = [parse_sequence(source, info)]

420 while source.match("|"):

421 branches.append(parse_sequence(source, info))

422

423 if len(branches) == 1:

424 return branches[0]

425 return Branch(branches)

426

427def parse_sequence(source, info):

428 "Parses a sequence, eg. 'abc'."

429 sequence = [None]

430 case_flags = make_case_flags(info)

431 while True:

432 saved_pos = source.pos

433 ch = source.get()

434 if ch in SPECIAL_CHARS:

435 if ch in ")|":

436 # The end of a sequence. At the end of the pattern ch is "".

437 source.pos = saved_pos

438 break

439 elif ch == "\\":

440 # An escape sequence outside a set.

441 sequence.append(parse_escape(source, info, False))

442 elif ch == "(":

443 # A parenthesised subpattern or a flag.

444 element = parse_paren(source, info)

445 if element is None:

446 case_flags = make_case_flags(info)

447 else:

448 sequence.append(element)

449 elif ch == ".":

450 # Any character.

451 if info.flags & DOTALL:

452 sequence.append(AnyAll())

453 elif info.flags & WORD:

454 sequence.append(AnyU())

455 else:

456 sequence.append(Any())

457 elif ch == "[":

458 # A character set.

459 sequence.append(parse_set(source, info))

460 elif ch == "^":

461 # The start of a line or the string.

462 if info.flags & MULTILINE:

463 if info.flags & WORD:

464 sequence.append(StartOfLineU())

465 else:

466 sequence.append(StartOfLine())

467 else:

468 sequence.append(StartOfString())

469 elif ch == "$":

470 # The end of a line or the string.

471 if info.flags & MULTILINE:

472 if info.flags & WORD:

473 sequence.append(EndOfLineU())

474 else:

475 sequence.append(EndOfLine())

476 else:

477 if info.flags & WORD:

478 sequence.append(EndOfStringLineU())

479 else:

480 sequence.append(EndOfStringLine())

481 elif ch in "?*+{":

482 # Looks like a quantifier.

483 counts = parse_quantifier(source, info, ch)

484 if counts:

485 # It _is_ a quantifier.

486 apply_quantifier(source, info, counts, case_flags, ch,

487 saved_pos, sequence)

488 sequence.append(None)

489 else:

490 # It's not a quantifier. Maybe it's a fuzzy constraint.

491 constraints = parse_fuzzy(source, info, ch, case_flags)

492 if constraints:

493 # It _is_ a fuzzy constraint.

494 apply_constraint(source, info, constraints, case_flags,

495 saved_pos, sequence)

496 sequence.append(None)

497 else:

498 # The element was just a literal.

499 sequence.append(Character(ord(ch),

500 case_flags=case_flags))

501 else:

502 # A literal.

503 sequence.append(Character(ord(ch), case_flags=case_flags))

504 else:

505 # A literal.

506 sequence.append(Character(ord(ch), case_flags=case_flags))

507

508 sequence = [item for item in sequence if item is not None]

509 return Sequence(sequence)

510

511def apply_quantifier(source, info, counts, case_flags, ch, saved_pos,

512 sequence):

513 element = sequence.pop()

514 if element is None:

515 if sequence:

516 raise error("multiple repeat", source.string, saved_pos)

517 raise error("nothing to repeat", source.string, saved_pos)

518

519 if isinstance(element, (GreedyRepeat, LazyRepeat, PossessiveRepeat)):

520 raise error("multiple repeat", source.string, saved_pos)

521

522 min_count, max_count = counts

523 saved_pos = source.pos

524 ch = source.get()

525 if ch == "?":

526 # The "?" suffix that means it's a lazy repeat.

527 repeated = LazyRepeat

528 elif ch == "+":

529 # The "+" suffix that means it's a possessive repeat.

530 repeated = PossessiveRepeat

531 else:

532 # No suffix means that it's a greedy repeat.

533 source.pos = saved_pos

534 repeated = GreedyRepeat

535

536 # Ignore the quantifier if it applies to a zero-width item or the number of

537 # repeats is fixed at 1.

538 if not element.is_empty() and (min_count != 1 or max_count != 1):

539 element = repeated(element, min_count, max_count)

540

541 sequence.append(element)

542

543def apply_constraint(source, info, constraints, case_flags, saved_pos,

544 sequence):

545 element = sequence.pop()

546 if element is None:

547 raise error("nothing for fuzzy constraint", source.string, saved_pos)

548

549 # If a group is marked as fuzzy then put all of the fuzzy part in the

550 # group.

551 if isinstance(element, Group):

552 element.subpattern = Fuzzy(element.subpattern, constraints)

553 sequence.append(element)

554 else:

555 sequence.append(Fuzzy(element, constraints))

556

557_QUANTIFIERS = {"?": (0, 1), "*": (0, None), "+": (1, None)}

558

559def parse_quantifier(source, info, ch):

560 "Parses a quantifier."

561 q = _QUANTIFIERS.get(ch)

562 if q:

563 # It's a quantifier.

564 return q

565

566 if ch == "{":

567 # Looks like a limited repeated element, eg. 'a{2,3}'.

568 counts = parse_limited_quantifier(source)

569 if counts:

570 return counts

571

572 return None

573

574def is_above_limit(count):

575 "Checks whether a count is above the maximum."

576 return count is not None and count >= UNLIMITED

577

578def parse_limited_quantifier(source):

579 "Parses a limited quantifier."

580 saved_pos = source.pos

581 min_count = parse_count(source)

582 if source.match(","):

583 max_count = parse_count(source)

584

585 # No minimum means 0 and no maximum means unlimited.

586 min_count = int(min_count or 0)

587 max_count = int(max_count) if max_count else None

588 else:

589 if not min_count:

590 source.pos = saved_pos

591 return None

592

593 min_count = max_count = int(min_count)

594

595 if not source.match ("}"):

596 source.pos = saved_pos

597 return None

598

599 if is_above_limit(min_count) or is_above_limit(max_count):

600 raise error("repeat count too big", source.string, saved_pos)

601

602 if max_count is not None and min_count > max_count:

603 raise error("min repeat greater than max repeat", source.string,

604 saved_pos)

605

606 return min_count, max_count

607

608def parse_fuzzy(source, info, ch, case_flags):

609 "Parses a fuzzy setting, if present."

610 saved_pos = source.pos

611

612 if ch != "{":

613 return None

614

615 constraints = {}

616 try:

617 parse_fuzzy_item(source, constraints)

618 while source.match(","):

619 parse_fuzzy_item(source, constraints)

620 except ParseError:

621 source.pos = saved_pos

622 return None

623

624 if source.match(":"):

625 constraints["test"] = parse_fuzzy_test(source, info, case_flags)

626

627 if not source.match("}"):

628 raise error("expected }", source.string, source.pos)

629

630 return constraints

631

632def parse_fuzzy_item(source, constraints):

633 "Parses a fuzzy setting item."

634 saved_pos = source.pos

635 try:

636 parse_cost_constraint(source, constraints)

637 except ParseError:

638 source.pos = saved_pos

639

640 parse_cost_equation(source, constraints)

641

642def parse_cost_constraint(source, constraints):

643 "Parses a cost constraint."

644 saved_pos = source.pos

645 ch = source.get()

646 if ch in ALPHA:

647 # Syntax: constraint [("<=" | "<") cost]

648 constraint = parse_constraint(source, constraints, ch)

649

650 max_inc = parse_fuzzy_compare(source)

651

652 if max_inc is None:

653 # No maximum cost.

654 constraints[constraint] = 0, None

655 else:

656 # There's a maximum cost.

657 cost_pos = source.pos

658 max_cost = parse_cost_limit(source)

659

660 # Inclusive or exclusive limit?

661 if not max_inc:

662 max_cost -= 1

663

664 if max_cost < 0:

665 raise error("bad fuzzy cost limit", source.string, cost_pos)

666

667 constraints[constraint] = 0, max_cost

668 elif ch in DIGITS:

669 # Syntax: cost ("<=" | "<") constraint ("<=" | "<") cost

670 source.pos = saved_pos

671

672 # Minimum cost.

673 cost_pos = source.pos

674 min_cost = parse_cost_limit(source)

675

676 min_inc = parse_fuzzy_compare(source)

677 if min_inc is None:

678 raise ParseError()

679

680 constraint = parse_constraint(source, constraints, source.get())

681

682 max_inc = parse_fuzzy_compare(source)

683 if max_inc is None:

684 raise ParseError()

685

686 # Maximum cost.

687 cost_pos = source.pos

688 max_cost = parse_cost_limit(source)

689

690 # Inclusive or exclusive limits?

691 if not min_inc:

692 min_cost += 1

693 if not max_inc:

694 max_cost -= 1

695

696 if not 0 <= min_cost <= max_cost:

697 raise error("bad fuzzy cost limit", source.string, cost_pos)

698

699 constraints[constraint] = min_cost, max_cost

700 else:

701 raise ParseError()

702

703def parse_cost_limit(source):

704 "Parses a cost limit."

705 cost_pos = source.pos

706 digits = parse_count(source)

707

708 try:

709 return int(digits)

710 except ValueError:

711 pass

712

713 raise error("bad fuzzy cost limit", source.string, cost_pos)

714

715def parse_constraint(source, constraints, ch):

716 "Parses a constraint."

717 if ch not in "deis":

718 raise ParseError()

719

720 if ch in constraints:

721 raise ParseError()

722

723 return ch

724

725def parse_fuzzy_compare(source):

726 "Parses a cost comparator."

727 if source.match("<="):

728 return True

729 elif source.match("<"):

730 return False

731 else:

732 return None

733

734def parse_cost_equation(source, constraints):

735 "Parses a cost equation."

736 if "cost" in constraints:

737 raise error("more than one cost equation", source.string, source.pos)

738

739 cost = {}

740

741 parse_cost_term(source, cost)

742 while source.match("+"):

743 parse_cost_term(source, cost)

744

745 max_inc = parse_fuzzy_compare(source)

746 if max_inc is None:

747 raise ParseError()

748

749 max_cost = int(parse_count(source))

750

751 if not max_inc:

752 max_cost -= 1

753

754 if max_cost < 0:

755 raise error("bad fuzzy cost limit", source.string, source.pos)

756

757 cost["max"] = max_cost

758

759 constraints["cost"] = cost

760

761def parse_cost_term(source, cost):

762 "Parses a cost equation term."

763 coeff = parse_count(source)

764 ch = source.get()

765 if ch not in "dis":

766 raise ParseError()

767

768 if ch in cost:

769 raise error("repeated fuzzy cost", source.string, source.pos)

770

771 cost[ch] = int(coeff or 1)

772

773def parse_fuzzy_test(source, info, case_flags):

774 saved_pos = source.pos

775 ch = source.get()

776 if ch in SPECIAL_CHARS:

777 if ch == "\\":

778 # An escape sequence outside a set.

779 return parse_escape(source, info, False)

780 elif ch == ".":

781 # Any character.

782 if info.flags & DOTALL:

783 return AnyAll()

784 elif info.flags & WORD:

785 return AnyU()

786 else:

787 return Any()

788 elif ch == "[":

789 # A character set.

790 return parse_set(source, info)

791 else:

792 raise error("expected character set", source.string, saved_pos)

793 elif ch:

794 # A literal.

795 return Character(ord(ch), case_flags=case_flags)

796 else:

797 raise error("expected character set", source.string, saved_pos)

798

799def parse_count(source):

800 "Parses a quantifier's count, which can be empty."

801 return source.get_while(DIGITS)

802

803def parse_paren(source, info):

804 """Parses a parenthesised subpattern or a flag. Returns FLAGS if it's an

805 inline flag.

806 """

807 saved_pos = source.pos

808 ch = source.get(True)

809 if ch == "?":

810 # (?...

811 saved_pos_2 = source.pos

812 ch = source.get(True)

813 if ch == "<":

814 # (?<...

815 saved_pos_3 = source.pos

816 ch = source.get()

817 if ch in ("=", "!"):

818 # (?<=... or (?<!...: lookbehind.

819 return parse_lookaround(source, info, True, ch == "=")

820

821 # (?<...: a named capture group.

822 source.pos = saved_pos_3

823 name = parse_name(source)

824 group = info.open_group(name)

825 source.expect(">")

826 saved_flags = info.flags

827 try:

828 subpattern = _parse_pattern(source, info)

829 source.expect(")")

830 finally:

831 info.flags = saved_flags

832 source.ignore_space = bool(info.flags & VERBOSE)

833

834 info.close_group()

835 return Group(info, group, subpattern)

836 if ch in ("=", "!"):

837 # (?=... or (?!...: lookahead.

838 return parse_lookaround(source, info, False, ch == "=")

839 if ch == "P":

840 # (?P...: a Python extension.

841 return parse_extension(source, info)

842 if ch == "#":

843 # (?#...: a comment.

844 return parse_comment(source)

845 if ch == "(":

846 # (?(...: a conditional subpattern.

847 return parse_conditional(source, info)

848 if ch == ">":

849 # (?>...: an atomic subpattern.

850 return parse_atomic(source, info)

851 if ch == "|":

852 # (?|...: a common/reset groups branch.

853 return parse_common(source, info)

854 if ch == "R" or "0" <= ch <= "9":

855 # (?R...: probably a call to a group.

856 return parse_call_group(source, info, ch, saved_pos_2)

857 if ch == "&":

858 # (?&...: a call to a named group.

859 return parse_call_named_group(source, info, saved_pos_2)

860

861 # (?...: probably a flags subpattern.

862 source.pos = saved_pos_2

863 return parse_flags_subpattern(source, info)

864

865 if ch == "*":

866 # (*...

867 saved_pos_2 = source.pos

868 word = source.get_while(set(")>"), include=False)

869 if word[ : 1].isalpha():

870 verb = VERBS.get(word)

871 if not verb:

872 raise error("unknown verb", source.string, saved_pos_2)

873

874 source.expect(")")

875

876 return verb

877

878 # (...: an unnamed capture group.

879 source.pos = saved_pos

880 group = info.open_group()

881 saved_flags = info.flags

882 try:

883 subpattern = _parse_pattern(source, info)

884 source.expect(")")

885 finally:

886 info.flags = saved_flags

887 source.ignore_space = bool(info.flags & VERBOSE)

888

889 info.close_group()

890

891 return Group(info, group, subpattern)

892

893def parse_extension(source, info):

894 "Parses a Python extension."

895 saved_pos = source.pos

896 ch = source.get()

897 if ch == "<":

898 # (?P<...: a named capture group.

899 name = parse_name(source)

900 group = info.open_group(name)

901 source.expect(">")

902 saved_flags = info.flags

903 try:

904 subpattern = _parse_pattern(source, info)

905 source.expect(")")

906 finally:

907 info.flags = saved_flags

908 source.ignore_space = bool(info.flags & VERBOSE)

909

910 info.close_group()

911

912 return Group(info, group, subpattern)

913 if ch == "=":

914 # (?P=...: a named group reference.

915 name = parse_name(source, allow_numeric=True)

916 source.expect(")")

917 if info.is_open_group(name):

918 raise error("cannot refer to an open group", source.string,

919 saved_pos)

920

921 return make_ref_group(info, name, saved_pos)

922 if ch == ">" or ch == "&":

923 # (?P>...: a call to a group.

924 return parse_call_named_group(source, info, saved_pos)

925

926 source.pos = saved_pos

927 raise error("unknown extension", source.string, saved_pos)

928

929def parse_comment(source):

930 "Parses a comment."

931 while True:

932 saved_pos = source.pos

933 c = source.get(True)

934

935 if not c or c == ")":

936 break

937

938 if c == "\\":

939 c = source.get(True)

940

941 source.pos = saved_pos

942 source.expect(")")

943

944 return None

945

946def parse_lookaround(source, info, behind, positive):

947 "Parses a lookaround."

948 saved_flags = info.flags

949 try:

950 subpattern = _parse_pattern(source, info)

951 source.expect(")")

952 finally:

953 info.flags = saved_flags

954 source.ignore_space = bool(info.flags & VERBOSE)

955

956 return LookAround(behind, positive, subpattern)

957

958def parse_conditional(source, info):

959 "Parses a conditional subpattern."

960 saved_flags = info.flags

961 saved_pos = source.pos

962 ch = source.get()

963 if ch == "?":

964 # (?(?...

965 ch = source.get()

966 if ch in ("=", "!"):

967 # (?(?=... or (?(?!...: lookahead conditional.

968 return parse_lookaround_conditional(source, info, False, ch == "=")

969 if ch == "<":

970 # (?(?<...

971 ch = source.get()

972 if ch in ("=", "!"):

973 # (?(?<=... or (?(?<!...: lookbehind conditional.

974 return parse_lookaround_conditional(source, info, True, ch ==

975 "=")

976

977 source.pos = saved_pos

978 raise error("expected lookaround conditional", source.string,

979 source.pos)

980

981 source.pos = saved_pos

982 try:

983 group = parse_name(source, True)

984 source.expect(")")

985 yes_branch = parse_sequence(source, info)

986 if source.match("|"):

987 no_branch = parse_sequence(source, info)

988 else:

989 no_branch = Sequence()

990

991 source.expect(")")

992 finally:

993 info.flags = saved_flags

994 source.ignore_space = bool(info.flags & VERBOSE)

995

996 if yes_branch.is_empty() and no_branch.is_empty():

997 return Sequence()

998

999 return Conditional(info, group, yes_branch, no_branch, saved_pos)

1000

1001def parse_lookaround_conditional(source, info, behind, positive):

1002 saved_flags = info.flags

1003 try:

1004 subpattern = _parse_pattern(source, info)

1005 source.expect(")")

1006 finally:

1007 info.flags = saved_flags

1008 source.ignore_space = bool(info.flags & VERBOSE)

1009

1010 yes_branch = parse_sequence(source, info)

1011 if source.match("|"):

1012 no_branch = parse_sequence(source, info)

1013 else:

1014 no_branch = Sequence()

1015

1016 source.expect(")")

1017

1018 return LookAroundConditional(behind, positive, subpattern, yes_branch,

1019 no_branch)

1020

1021def parse_atomic(source, info):

1022 "Parses an atomic subpattern."

1023 saved_flags = info.flags

1024 try:

1025 subpattern = _parse_pattern(source, info)

1026 source.expect(")")

1027 finally:

1028 info.flags = saved_flags

1029 source.ignore_space = bool(info.flags & VERBOSE)

1030

1031 return Atomic(subpattern)

1032

1033def parse_common(source, info):

1034 "Parses a common groups branch."

1035 # Capture group numbers in different branches can reuse the group numbers.

1036 initial_group_count = info.group_count

1037 branches = [parse_sequence(source, info)]

1038 final_group_count = info.group_count

1039 while source.match("|"):

1040 info.group_count = initial_group_count

1041 branches.append(parse_sequence(source, info))

1042 final_group_count = max(final_group_count, info.group_count)

1043

1044 info.group_count = final_group_count

1045 source.expect(")")

1046

1047 if len(branches) == 1:

1048 return branches[0]

1049 return Branch(branches)

1050

1051def parse_call_group(source, info, ch, pos):

1052 "Parses a call to a group."

1053 if ch == "R":

1054 group = "0"

1055 else:

1056 group = ch + source.get_while(DIGITS)

1057

1058 source.expect(")")

1059

1060 return CallGroup(info, group, pos)

1061

1062def parse_call_named_group(source, info, pos):

1063 "Parses a call to a named group."

1064 group = parse_name(source)

1065 source.expect(")")

1066

1067 return CallGroup(info, group, pos)

1068

1069def parse_flag_set(source):

1070 "Parses a set of inline flags."

1071 flags = 0

1072

1073 try:

1074 while True:

1075 saved_pos = source.pos

1076 ch = source.get()

1077 if ch == "V":

1078 ch += source.get()

1079 flags |= REGEX_FLAGS[ch]

1080 except KeyError:

1081 source.pos = saved_pos

1082

1083 return flags

1084

1085def parse_flags(source, info):

1086 "Parses flags being turned on/off."

1087 flags_on = parse_flag_set(source)

1088 if source.match("-"):

1089 flags_off = parse_flag_set(source)

1090 if not flags_off:

1091 raise error("bad inline flags: no flags after '-'", source.string,

1092 source.pos)

1093 else:

1094 flags_off = 0

1095

1096 if flags_on & LOCALE:

1097 # Remember that this pattern as an inline locale flag.

1098 info.inline_locale = True

1099

1100 return flags_on, flags_off

1101

1102def parse_subpattern(source, info, flags_on, flags_off):

1103 "Parses a subpattern with scoped flags."

1104 saved_flags = info.flags

1105 info.flags = (info.flags | flags_on) & ~flags_off

1106

1107 # Ensure that there aren't multiple encoding flags set.

1108 if info.flags & (ASCII | LOCALE | UNICODE):

1109 info.flags = (info.flags & ~_ALL_ENCODINGS) | flags_on

1110

1111 source.ignore_space = bool(info.flags & VERBOSE)

1112 try:

1113 subpattern = _parse_pattern(source, info)

1114 source.expect(")")

1115 finally:

1116 info.flags = saved_flags

1117 source.ignore_space = bool(info.flags & VERBOSE)

1118

1119 return subpattern

1120

1121def parse_flags_subpattern(source, info):

1122 """Parses a flags subpattern. It could be inline flags or a subpattern

1123 possibly with local flags. If it's a subpattern, then that's returned;

1124 if it's a inline flags, then None is returned.

1125 """

1126 flags_on, flags_off = parse_flags(source, info)

1127

1128 if flags_off & GLOBAL_FLAGS:

1129 raise error("bad inline flags: cannot turn off global flag",

1130 source.string, source.pos)

1131

1132 if flags_on & flags_off:

1133 raise error("bad inline flags: flag turned on and off", source.string,

1134 source.pos)

1135

1136 # Handle flags which are global in all regex behaviours.

1137 new_global_flags = (flags_on & ~info.global_flags) & GLOBAL_FLAGS

1138 if new_global_flags:

1139 info.global_flags |= new_global_flags

1140

1141 # A global has been turned on, so reparse the pattern.

1142 raise _UnscopedFlagSet(info.global_flags)

1143

1144 # Ensure that from now on we have only scoped flags.

1145 flags_on &= ~GLOBAL_FLAGS

1146

1147 if source.match(":"):

1148 return parse_subpattern(source, info, flags_on, flags_off)

1149

1150 if source.match(")"):

1151 parse_positional_flags(source, info, flags_on, flags_off)

1152 return None

1153

1154 raise error("unknown extension", source.string, source.pos)

1155

1156def parse_positional_flags(source, info, flags_on, flags_off):

1157 "Parses positional flags."

1158 info.flags = (info.flags | flags_on) & ~flags_off

1159 source.ignore_space = bool(info.flags & VERBOSE)

1160

1161def parse_name(source, allow_numeric=False, allow_group_0=False):

1162 "Parses a name."

1163 name = source.get_while(set(")>"), include=False)

1164

1165 if not name:

1166 raise error("missing group name", source.string, source.pos)

1167

1168 if name.isdigit():

1169 min_group = 0 if allow_group_0 else 1

1170 if not allow_numeric or int(name) < min_group:

1171 raise error("bad character in group name", source.string,

1172 source.pos)

1173 else:

1174 if not name.isidentifier():

1175 raise error("bad character in group name", source.string,

1176 source.pos)

1177

1178 return name

1179

1180def is_octal(string):

1181 "Checks whether a string is octal."

1182 return all(ch in OCT_DIGITS for ch in string)

1183

1184def is_decimal(string):

1185 "Checks whether a string is decimal."

1186 return all(ch in DIGITS for ch in string)

1187

1188def is_hexadecimal(string):

1189 "Checks whether a string is hexadecimal."

1190 return all(ch in HEX_DIGITS for ch in string)

1191

1192def parse_escape(source, info, in_set):

1193 "Parses an escape sequence."

1194 saved_ignore = source.ignore_space

1195 source.ignore_space = False

1196 ch = source.get()

1197 source.ignore_space = saved_ignore

1198 if not ch:

1199 # A backslash at the end of the pattern.

1200 raise error("bad escape (end of pattern)", source.string, source.pos)

1201 if ch in HEX_ESCAPES:

1202 # A hexadecimal escape sequence.

1203 return parse_hex_escape(source, info, ch, HEX_ESCAPES[ch], in_set, ch)

1204 elif ch == "g" and not in_set:

1205 # A group reference.

1206 saved_pos = source.pos

1207 try:

1208 return parse_group_ref(source, info)

1209 except error:

1210 # Invalid as a group reference, so assume it's a literal.

1211 source.pos = saved_pos

1212

1213 return make_character(info, ord(ch), in_set)

1214 elif ch == "G" and not in_set:

1215 # A search anchor.

1216 return SearchAnchor()

1217 elif ch == "L" and not in_set:

1218 # A string set.

1219 return parse_string_set(source, info)

1220 elif ch == "N":

1221 # A named codepoint.

1222 return parse_named_char(source, info, in_set)

1223 elif ch in "pP":

1224 # A Unicode property, positive or negative.

1225 return parse_property(source, info, ch == "p", in_set)

1226 elif ch == "R" and not in_set:

1227 # A line ending.

1228 charset = [0x0A, 0x0B, 0x0C, 0x0D]

1229 if info.guess_encoding == UNICODE:

1230 charset.extend([0x85, 0x2028, 0x2029])

1231

1232 return Atomic(Branch([String([0x0D, 0x0A]), SetUnion(info, [Character(c)

1233 for c in charset])]))

1234 elif ch == "X" and not in_set:

1235 # A grapheme cluster.

1236 return Grapheme()

1237 elif ch in ALPHA:

1238 # An alphabetic escape sequence.

1239 # Positional escapes aren't allowed inside a character set.

1240 if not in_set:

1241 if info.flags & WORD:

1242 value = WORD_POSITION_ESCAPES.get(ch)

1243 elif info.flags & ASCII:

1244 value = ASCII_POSITION_ESCAPES.get(ch)

1245 elif info.flags & UNICODE:

1246 value = UNICODE_POSITION_ESCAPES.get(ch)

1247 else:

1248 value = POSITION_ESCAPES.get(ch)

1249

1250 if value:

1251 return value

1252

1253 if info.flags & ASCII:

1254 value = ASCII_CHARSET_ESCAPES.get(ch)

1255 elif info.flags & UNICODE:

1256 value = UNICODE_CHARSET_ESCAPES.get(ch)

1257 else:

1258 value = CHARSET_ESCAPES.get(ch)

1259

1260 if value:

1261 return value

1262

1263 value = CHARACTER_ESCAPES.get(ch)

1264 if value:

1265 return Character(ord(value))

1266

1267 raise error("bad escape \\%s" % ch, source.string, source.pos)

1268 elif ch in DIGITS:

1269 # A numeric escape sequence.

1270 return parse_numeric_escape(source, info, ch, in_set)

1271 else:

1272 # A literal.

1273 return make_character(info, ord(ch), in_set)

1274

1275def parse_numeric_escape(source, info, ch, in_set):

1276 "Parses a numeric escape sequence."

1277 if in_set or ch == "0":

1278 # Octal escape sequence, max 3 digits.

1279 return parse_octal_escape(source, info, [ch], in_set)

1280

1281 # At least 1 digit, so either octal escape or group.

1282 digits = ch

1283 saved_pos = source.pos

1284 ch = source.get()

1285 if ch in DIGITS:

1286 # At least 2 digits, so either octal escape or group.

1287 digits += ch

1288 saved_pos = source.pos

1289 ch = source.get()

1290 if is_octal(digits) and ch in OCT_DIGITS:

1291 # 3 octal digits, so octal escape sequence.

1292 encoding = info.flags & _ALL_ENCODINGS

1293 if encoding == ASCII or encoding == LOCALE:

1294 octal_mask = 0xFF

1295 else:

1296 octal_mask = 0x1FF

1297

1298 value = int(digits + ch, 8) & octal_mask

1299 return make_character(info, value)

1300

1301 # Group reference.

1302 source.pos = saved_pos

1303 if info.is_open_group(digits):

1304 raise error("cannot refer to an open group", source.string, source.pos)

1305

1306 return make_ref_group(info, digits, source.pos)

1307

1308def parse_octal_escape(source, info, digits, in_set):

1309 "Parses an octal escape sequence."

1310 saved_pos = source.pos

1311 ch = source.get()

1312 while len(digits) < 3 and ch in OCT_DIGITS:

1313 digits.append(ch)

1314 saved_pos = source.pos

1315 ch = source.get()

1316

1317 source.pos = saved_pos

1318 try:

1319 value = int("".join(digits), 8)

1320 return make_character(info, value, in_set)

1321 except ValueError:

1322 if digits[0] in OCT_DIGITS:

1323 raise error("incomplete escape \\%s" % ''.join(digits),

1324 source.string, source.pos)

1325 else:

1326 raise error("bad escape \\%s" % digits[0], source.string,

1327 source.pos)

1328

1329def parse_hex_escape(source, info, esc, expected_len, in_set, type):

1330 "Parses a hex escape sequence."

1331 saved_pos = source.pos

1332 digits = []

1333 for i in range(expected_len):

1334 ch = source.get()

1335 if ch not in HEX_DIGITS:

1336 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),

1337 source.string, saved_pos)

1338 digits.append(ch)

1339

1340 try:

1341 value = int("".join(digits), 16)

1342 except ValueError:

1343 pass

1344 else:

1345 if value < 0x110000:

1346 return make_character(info, value, in_set)

1347

1348 # Bad hex escape.

1349 raise error("bad hex escape \\%s%s" % (esc, ''.join(digits)),

1350 source.string, saved_pos)

1351

1352def parse_group_ref(source, info):

1353 "Parses a group reference."

1354 source.expect("<")

1355 saved_pos = source.pos

1356 name = parse_name(source, True)

1357 source.expect(">")

1358 if info.is_open_group(name):

1359 raise error("cannot refer to an open group", source.string, source.pos)

1360

1361 return make_ref_group(info, name, saved_pos)

1362

1363def parse_string_set(source, info):

1364 "Parses a string set reference."

1365 source.expect("<")

1366 name = parse_name(source, True)

1367 source.expect(">")

1368 if name is None or name not in info.kwargs:

1369 raise error("undefined named list", source.string, source.pos)

1370

1371 return make_string_set(info, name)

1372

1373def parse_named_char(source, info, in_set):

1374 "Parses a named character."

1375 saved_pos = source.pos

1376 if source.match("{"):

1377 name = source.get_while(NAMED_CHAR_PART, keep_spaces=True)

1378 if source.match("}"):

1379 try:

1380 value = unicodedata.lookup(name)

1381 return make_character(info, ord(value), in_set)

1382 except KeyError:

1383 raise error("undefined character name", source.string,

1384 source.pos)

1385

1386 source.pos = saved_pos

1387 return make_character(info, ord("N"), in_set)

1388

1389def parse_property(source, info, positive, in_set):

1390 "Parses a Unicode property."

1391 saved_pos = source.pos

1392 ch = source.get()

1393 if ch == "{":

1394 negate = source.match("^")

1395 prop_name, name = parse_property_name(source)

1396 if source.match("}"):

1397 # It's correctly delimited.

1398 if info.flags & ASCII:

1399 encoding = ASCII_ENCODING

1400 elif info.flags & UNICODE:

1401 encoding = UNICODE_ENCODING

1402 else:

1403 encoding = 0

1404

1405 prop = lookup_property(prop_name, name, positive != negate, source,

1406 encoding=encoding)

1407 return make_property(info, prop, in_set)

1408 elif ch and ch in "CLMNPSZ":

1409 # An abbreviated property, eg \pL.

1410 if info.flags & ASCII:

1411 encoding = ASCII_ENCODING

1412 elif info.flags & UNICODE:

1413 encoding = UNICODE_ENCODING

1414 else:

1415 encoding = 0

1416

1417 prop = lookup_property(None, ch, positive, source, encoding=encoding)

1418 return make_property(info, prop, in_set)

1419

1420 # Not a property, so treat as a literal "p" or "P".

1421 source.pos = saved_pos

1422 ch = "p" if positive else "P"

1423 return make_character(info, ord(ch), in_set)

1424

1425def parse_property_name(source):

1426 "Parses a property name, which may be qualified."

1427 name = source.get_while(PROPERTY_NAME_PART)

1428 saved_pos = source.pos

1429

1430 ch = source.get()

1431 if ch and ch in ":=":

1432 prop_name = name

1433 name = source.get_while(ALNUM | set(" &_-./")).strip()

1434

1435 if name:

1436 # Name after the ":" or "=", so it's a qualified name.

1437 saved_pos = source.pos

1438 else:

1439 # No name after the ":" or "=", so assume it's an unqualified name.

1440 prop_name, name = None, prop_name

1441 else:

1442 prop_name = None

1443

1444 source.pos = saved_pos

1445 return prop_name, name

1446

1447def parse_set(source, info):

1448 "Parses a character set."

1449 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1450

1451 saved_ignore = source.ignore_space

1452 source.ignore_space = False

1453 # Negative set?

1454 negate = source.match("^")

1455 try:

1456 if version == VERSION0:

1457 item = parse_set_imp_union(source, info)

1458 else:

1459 item = parse_set_union(source, info)

1460

1461 if not source.match("]"):

1462 raise error("missing ]", source.string, source.pos)

1463 finally:

1464 source.ignore_space = saved_ignore

1465

1466 if negate:

1467 item = item.with_flags(positive=not item.positive)

1468

1469 item = item.with_flags(case_flags=make_case_flags(info))

1470

1471 return item

1472

1473def parse_set_union(source, info):

1474 "Parses a set union ([x||y])."

1475 items = [parse_set_symm_diff(source, info)]

1476 while source.match("||"):

1477 items.append(parse_set_symm_diff(source, info))

1478

1479 if len(items) == 1:

1480 return items[0]

1481 return SetUnion(info, items)

1482

1483def parse_set_symm_diff(source, info):

1484 "Parses a set symmetric difference ([x~~y])."

1485 items = [parse_set_inter(source, info)]

1486 while source.match("~~"):

1487 items.append(parse_set_inter(source, info))

1488

1489 if len(items) == 1:

1490 return items[0]

1491 return SetSymDiff(info, items)

1492

1493def parse_set_inter(source, info):

1494 "Parses a set intersection ([x&&y])."

1495 items = [parse_set_diff(source, info)]

1496 while source.match("&&"):

1497 items.append(parse_set_diff(source, info))

1498

1499 if len(items) == 1:

1500 return items[0]

1501 return SetInter(info, items)

1502

1503def parse_set_diff(source, info):

1504 "Parses a set difference ([x--y])."

1505 items = [parse_set_imp_union(source, info)]

1506 while source.match("--"):

1507 items.append(parse_set_imp_union(source, info))

1508

1509 if len(items) == 1:

1510 return items[0]

1511 return SetDiff(info, items)

1512

1513def parse_set_imp_union(source, info):

1514 "Parses a set implicit union ([xy])."

1515 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1516

1517 items = [parse_set_member(source, info)]

1518 while True:

1519 saved_pos = source.pos

1520 if source.match("]"):

1521 # End of the set.

1522 source.pos = saved_pos

1523 break

1524

1525 if version == VERSION1 and any(source.match(op) for op in SET_OPS):

1526 # The new behaviour has set operators.

1527 source.pos = saved_pos

1528 break

1529

1530 items.append(parse_set_member(source, info))

1531

1532 if len(items) == 1:

1533 return items[0]

1534 return SetUnion(info, items)

1535

1536def parse_set_member(source, info):

1537 "Parses a member in a character set."

1538 # Parse a set item.

1539 start = parse_set_item(source, info)

1540 saved_pos1 = source.pos

1541 if (not isinstance(start, Character) or not start.positive or not

1542 source.match("-")):

1543 # It's not the start of a range.

1544 return start

1545

1546 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1547

1548 # It looks like the start of a range of characters.

1549 saved_pos2 = source.pos

1550 if version == VERSION1 and source.match("-"):

1551 # It's actually the set difference operator '--', so return the

1552 # character.

1553 source.pos = saved_pos1

1554 return start

1555

1556 if source.match("]"):

1557 # We've reached the end of the set, so return both the character and

1558 # hyphen.

1559 source.pos = saved_pos2

1560 return SetUnion(info, [start, Character(ord("-"))])

1561

1562 # Parse a set item.

1563 end = parse_set_item(source, info)

1564 if not isinstance(end, Character) or not end.positive:

1565 # It's not a range, so return the character, hyphen and property.

1566 return SetUnion(info, [start, Character(ord("-")), end])

1567

1568 # It _is_ a range.

1569 if start.value > end.value:

1570 raise error("bad character range", source.string, source.pos)

1571

1572 if start.value == end.value:

1573 return start

1574

1575 return Range(start.value, end.value)

1576

1577def parse_set_item(source, info):

1578 "Parses an item in a character set."

1579 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

1580

1581 if source.match("\\"):

1582 # An escape sequence in a set.

1583 return parse_escape(source, info, True)

1584

1585 saved_pos = source.pos

1586 if source.match("[:"):

1587 # Looks like a POSIX character class.

1588 try:

1589 return parse_posix_class(source, info)

1590 except ParseError:

1591 # Not a POSIX character class.

1592 source.pos = saved_pos

1593

1594 if version == VERSION1 and source.match("["):

1595 # It's the start of a nested set.

1596

1597 # Negative set?

1598 negate = source.match("^")

1599 item = parse_set_union(source, info)

1600

1601 if not source.match("]"):

1602 raise error("missing ]", source.string, source.pos)

1603

1604 if negate:

1605 item = item.with_flags(positive=not item.positive)

1606

1607 return item

1608

1609 ch = source.get()

1610 if not ch:

1611 raise error("unterminated character set", source.string, source.pos)

1612

1613 return Character(ord(ch))

1614

1615def parse_posix_class(source, info):

1616 "Parses a POSIX character class."

1617 negate = source.match("^")

1618 prop_name, name = parse_property_name(source)

1619 if not source.match(":]"):

1620 raise ParseError()

1621

1622 return lookup_property(prop_name, name, not negate, source, posix=True)

1623

1624def float_to_rational(flt):

1625 "Converts a float to a rational pair."

1626 int_part = int(flt)

1627 error = flt - int_part

1628 if abs(error) < 0.0001:

1629 return int_part, 1

1630

1631 den, num = float_to_rational(1.0 / error)

1632

1633 return int_part * den + num, den

1634

1635def numeric_to_rational(numeric):

1636 "Converts a numeric string to a rational string, if possible."

1637 if numeric[ : 1] == "-":

1638 sign, numeric = numeric[0], numeric[1 : ]

1639 else:

1640 sign = ""

1641

1642 parts = numeric.split("/")

1643 if len(parts) == 2:

1644 num, den = float_to_rational(float(parts[0]) / float(parts[1]))

1645 elif len(parts) == 1:

1646 num, den = float_to_rational(float(parts[0]))

1647 else:

1648 raise ValueError()

1649

1650 result = "{}{}/{}".format(sign, num, den)

1651 if result.endswith("/1"):

1652 return result[ : -2]

1653

1654 return result

1655

1656def standardise_name(name):

1657 "Standardises a property or value name."

1658 try:

1659 return numeric_to_rational("".join(name))

1660 except (ValueError, ZeroDivisionError):

1661 return "".join(ch for ch in name if ch not in "_- ").upper()

1662

1663_POSIX_CLASSES = set('ALNUM DIGIT PUNCT XDIGIT'.split())

1664

1665_BINARY_VALUES = set('YES Y NO N TRUE T FALSE F'.split())

1666

1667def lookup_property(property, value, positive, source=None, posix=False, encoding=0):

1668 "Looks up a property."

1669 # Normalise the names (which may still be lists).

1670 property = standardise_name(property) if property else None

1671 value = standardise_name(value)

1672

1673 if (property, value) == ("GENERALCATEGORY", "ASSIGNED"):

1674 property, value, positive = "GENERALCATEGORY", "UNASSIGNED", not positive

1675

1676 if posix and not property and value.upper() in _POSIX_CLASSES:

1677 value = 'POSIX' + value

1678

1679 if property:

1680 # Both the property and the value are provided.

1681 prop = PROPERTIES.get(property)

1682 if not prop:

1683 if not source:

1684 raise error("unknown property")

1685

1686 raise error("unknown property", source.string, source.pos)

1687

1688 prop_id, value_dict = prop

1689 val_id = value_dict.get(value)

1690 if val_id is None:

1691 if not source:

1692 raise error("unknown property value")

1693

1694 raise error("unknown property value", source.string, source.pos)

1695

1696 return Property((prop_id << 16) | val_id, positive, encoding=encoding)

1697

1698 # Only the value is provided.

1699 # It might be the name of a GC, script or block value.

1700 for property in ("GC", "SCRIPT", "BLOCK"):

1701 prop_id, value_dict = PROPERTIES.get(property)

1702 val_id = value_dict.get(value)

1703 if val_id is not None:

1704 return Property((prop_id << 16) | val_id, positive, encoding=encoding)

1705

1706 # It might be the name of a binary property.

1707 prop = PROPERTIES.get(value)

1708 if prop:

1709 prop_id, value_dict = prop

1710 if set(value_dict) == _BINARY_VALUES:

1711 return Property((prop_id << 16) | 1, positive, encoding=encoding)

1712

1713 return Property(prop_id << 16, not positive, encoding=encoding)

1714

1715 # It might be the name of a binary property starting with a prefix.

1716 if value.startswith("IS"):

1717 prop = PROPERTIES.get(value[2 : ])

1718 if prop:

1719 prop_id, value_dict = prop

1720 if "YES" in value_dict:

1721 return Property((prop_id << 16) | 1, positive, encoding=encoding)

1722

1723 # It might be the name of a script or block starting with a prefix.

1724 for prefix, property in (("IS", "SCRIPT"), ("IN", "BLOCK")):

1725 if value.startswith(prefix):

1726 prop_id, value_dict = PROPERTIES.get(property)

1727 val_id = value_dict.get(value[2 : ])

1728 if val_id is not None:

1729 return Property((prop_id << 16) | val_id, positive, encoding=encoding)

1730

1731 # Unknown property.

1732 if not source:

1733 raise error("unknown property")

1734

1735 raise error("unknown property", source.string, source.pos)

1736

1737def _compile_replacement(source, pattern, is_unicode):

1738 "Compiles a replacement template escape sequence."

1739 ch = source.get()

1740 if ch in ALPHA:

1741 # An alphabetic escape sequence.

1742 value = CHARACTER_ESCAPES.get(ch)

1743 if value:

1744 return False, [ord(value)]

1745

1746 if ch in HEX_ESCAPES and (ch == "x" or is_unicode):

1747 # A hexadecimal escape sequence.

1748 return False, [parse_repl_hex_escape(source, HEX_ESCAPES[ch], ch)]

1749

1750 if ch == "g":

1751 # A group preference.

1752 return True, [compile_repl_group(source, pattern)]

1753

1754 if ch == "N" and is_unicode:

1755 # A named character.

1756 value = parse_repl_named_char(source)

1757 if value is not None:

1758 return False, [value]

1759

1760 raise error("bad escape \\%s" % ch, source.string, source.pos)

1761

1762 if isinstance(source.sep, bytes):

1763 octal_mask = 0xFF

1764 else:

1765 octal_mask = 0x1FF

1766

1767 if ch == "0":

1768 # An octal escape sequence.

1769 digits = ch

1770 while len(digits) < 3:

1771 saved_pos = source.pos

1772 ch = source.get()

1773 if ch not in OCT_DIGITS:

1774 source.pos = saved_pos

1775 break

1776 digits += ch

1777

1778 return False, [int(digits, 8) & octal_mask]

1779

1780 if ch in DIGITS:

1781 # Either an octal escape sequence (3 digits) or a group reference (max

1782 # 2 digits).

1783 digits = ch

1784 saved_pos = source.pos

1785 ch = source.get()

1786 if ch in DIGITS:

1787 digits += ch

1788 saved_pos = source.pos

1789 ch = source.get()

1790 if ch and is_octal(digits + ch):

1791 # An octal escape sequence.

1792 return False, [int(digits + ch, 8) & octal_mask]

1793

1794 # A group reference.

1795 source.pos = saved_pos

1796 return True, [int(digits)]

1797

1798 if ch == "\\":

1799 # An escaped backslash is a backslash.

1800 return False, [ord("\\")]

1801

1802 if not ch:

1803 # A trailing backslash.

1804 raise error("bad escape (end of pattern)", source.string, source.pos)

1805

1806 # An escaped non-backslash is a backslash followed by the literal.

1807 return False, [ord("\\"), ord(ch)]

1808

1809def parse_repl_hex_escape(source, expected_len, type):

1810 "Parses a hex escape sequence in a replacement string."

1811 digits = []

1812 for i in range(expected_len):

1813 ch = source.get()

1814 if ch not in HEX_DIGITS:

1815 raise error("incomplete escape \\%s%s" % (type, ''.join(digits)),

1816 source.string, source.pos)

1817 digits.append(ch)

1818

1819 return int("".join(digits), 16)

1820

1821def parse_repl_named_char(source):

1822 "Parses a named character in a replacement string."

1823 saved_pos = source.pos

1824 if source.match("{"):

1825 name = source.get_while(ALPHA | set(" "))

1826

1827 if source.match("}"):

1828 try:

1829 value = unicodedata.lookup(name)

1830 return ord(value)

1831 except KeyError:

1832 raise error("undefined character name", source.string,

1833 source.pos)

1834

1835 source.pos = saved_pos

1836 return None

1837

1838def compile_repl_group(source, pattern):

1839 "Compiles a replacement template group reference."

1840 source.expect("<")

1841 name = parse_name(source, True, True)

1842

1843 source.expect(">")

1844 if name.isdigit():

1845 index = int(name)

1846 if not 0 <= index <= pattern.groups:

1847 raise error("invalid group reference", source.string, source.pos)

1848

1849 return index

1850

1851 try:

1852 return pattern.groupindex[name]

1853 except KeyError:

1854 raise IndexError("unknown group")

1855

1856# The regular expression is parsed into a syntax tree. The different types of

1857# node are defined below.

1858

1859INDENT = " "

1860POSITIVE_OP = 0x1

1861ZEROWIDTH_OP = 0x2

1862FUZZY_OP = 0x4

1863REVERSE_OP = 0x8

1864REQUIRED_OP = 0x10

1865ENCODING_OP_SHIFT = 5

1866

1867POS_TEXT = {False: "NON-MATCH", True: "MATCH"}

1868CASE_TEXT = {NOCASE: "", IGNORECASE: " SIMPLE_IGNORE_CASE", FULLCASE: "",

1869 FULLIGNORECASE: " FULL_IGNORE_CASE"}

1870

1871def make_sequence(items):

1872 if len(items) == 1:

1873 return items[0]

1874 return Sequence(items)

1875

1876# Common base class for all nodes.

1877class RegexBase:

1878 def __init__(self):

1879 self._key = self.__class__

1880

1881 def with_flags(self, positive=None, case_flags=None, zerowidth=None):

1882 if positive is None:

1883 positive = self.positive

1884 else:

1885 positive = bool(positive)

1886 if case_flags is None:

1887 case_flags = self.case_flags

1888 else:

1889 case_flags = CASE_FLAGS_COMBINATIONS[case_flags & CASE_FLAGS]

1890 if zerowidth is None:

1891 zerowidth = self.zerowidth

1892 else:

1893 zerowidth = bool(zerowidth)

1894

1895 if (positive == self.positive and case_flags == self.case_flags and

1896 zerowidth == self.zerowidth):

1897 return self

1898

1899 return self.rebuild(positive, case_flags, zerowidth)

1900

1901 def fix_groups(self, pattern, reverse, fuzzy):

1902 pass

1903

1904 def optimise(self, info, reverse):

1905 return self

1906

1907 def pack_characters(self, info):

1908 return self

1909

1910 def remove_captures(self):

1911 return self

1912

1913 def is_atomic(self):

1914 return True

1915

1916 def can_be_affix(self):

1917 return True

1918

1919 def contains_group(self):

1920 return False

1921

1922 def get_firstset(self, reverse):

1923 raise _FirstSetError()

1924

1925 def has_simple_start(self):

1926 return False

1927

1928 def compile(self, reverse=False, fuzzy=False):

1929 return self._compile(reverse, fuzzy)

1930

1931 def is_empty(self):

1932 return False

1933

1934 def __hash__(self):

1935 return hash(self._key)

1936

1937 def __eq__(self, other):

1938 return type(self) is type(other) and self._key == other._key

1939

1940 def __ne__(self, other):

1941 return not self.__eq__(other)

1942

1943 def get_required_string(self, reverse):

1944 return self.max_width(), None

1945

1946# Base class for zero-width nodes.

1947class ZeroWidthBase(RegexBase):

1948 def __init__(self, positive=True, encoding=0):

1949 RegexBase.__init__(self)

1950 self.positive = bool(positive)

1951 self.encoding = encoding

1952

1953 self._key = self.__class__, self.positive

1954

1955 def get_firstset(self, reverse):

1956 return set([None])

1957

1958 def _compile(self, reverse, fuzzy):

1959 flags = 0

1960 if self.positive:

1961 flags |= POSITIVE_OP

1962 if fuzzy:

1963 flags |= FUZZY_OP

1964 if reverse:

1965 flags |= REVERSE_OP

1966 flags |= self.encoding << ENCODING_OP_SHIFT

1967 return [(self._opcode, flags)]

1968

1969 def dump(self, indent, reverse):

1970 print("{}{} {}{}".format(INDENT * indent, self._op_name,

1971 POS_TEXT[self.positive], ["", " ASCII"][self.encoding]))

1972

1973 def max_width(self):

1974 return 0

1975

1976class Any(RegexBase):

1977 _opcode = {False: OP.ANY, True: OP.ANY_REV}

1978 _op_name = "ANY"

1979

1980 def has_simple_start(self):

1981 return True

1982

1983 def _compile(self, reverse, fuzzy):

1984 flags = 0

1985 if fuzzy:

1986 flags |= FUZZY_OP

1987 return [(self._opcode[reverse], flags)]

1988

1989 def dump(self, indent, reverse):

1990 print("{}{}".format(INDENT * indent, self._op_name))

1991

1992 def max_width(self):

1993 return 1

1994

1995class AnyAll(Any):

1996 _opcode = {False: OP.ANY_ALL, True: OP.ANY_ALL_REV}

1997 _op_name = "ANY_ALL"

1998

1999 def __init__(self):

2000 self.positive = True

2001 self.zerowidth = False

2002 self.case_flags = 0

2003

2004class AnyU(Any):

2005 _opcode = {False: OP.ANY_U, True: OP.ANY_U_REV}

2006 _op_name = "ANY_U"

2007

2008class Atomic(RegexBase):

2009 def __init__(self, subpattern):

2010 RegexBase.__init__(self)

2011 self.subpattern = subpattern

2012

2013 def fix_groups(self, pattern, reverse, fuzzy):

2014 self.subpattern.fix_groups(pattern, reverse, fuzzy)

2015

2016 def optimise(self, info, reverse):

2017 self.subpattern = self.subpattern.optimise(info, reverse)

2018

2019 if self.subpattern.is_empty():

2020 return self.subpattern

2021 return self

2022

2023 def pack_characters(self, info):

2024 self.subpattern = self.subpattern.pack_characters(info)

2025 return self

2026

2027 def remove_captures(self):

2028 self.subpattern = self.subpattern.remove_captures()

2029 return self

2030

2031 def can_be_affix(self):

2032 return self.subpattern.can_be_affix()

2033

2034 def contains_group(self):

2035 return self.subpattern.contains_group()

2036

2037 def get_firstset(self, reverse):

2038 return self.subpattern.get_firstset(reverse)

2039

2040 def has_simple_start(self):

2041 return self.subpattern.has_simple_start()

2042

2043 def _compile(self, reverse, fuzzy):

2044 return ([(OP.ATOMIC, )] + self.subpattern.compile(reverse, fuzzy) +

2045 [(OP.END, )])

2046

2047 def dump(self, indent, reverse):

2048 print("{}ATOMIC".format(INDENT * indent))

2049 self.subpattern.dump(indent + 1, reverse)

2050

2051 def is_empty(self):

2052 return self.subpattern.is_empty()

2053

2054 def __eq__(self, other):

2055 return (type(self) is type(other) and self.subpattern ==

2056 other.subpattern)

2057

2058 def max_width(self):

2059 return self.subpattern.max_width()

2060

2061 def get_required_string(self, reverse):

2062 return self.subpattern.get_required_string(reverse)

2063

2064class Boundary(ZeroWidthBase):

2065 _opcode = OP.BOUNDARY

2066 _op_name = "BOUNDARY"

2067

2068class Branch(RegexBase):

2069 def __init__(self, branches):

2070 RegexBase.__init__(self)

2071 self.branches = branches

2072

2073 def fix_groups(self, pattern, reverse, fuzzy):

2074 for b in self.branches:

2075 b.fix_groups(pattern, reverse, fuzzy)

2076

2077 def optimise(self, info, reverse):

2078 if not self.branches:

2079 return Sequence([])

2080

2081 # Flatten branches within branches.

2082 branches = Branch._flatten_branches(info, reverse, self.branches)

2083

2084 # Move any common prefix or suffix out of the branches.

2085 if reverse:

2086 suffix, branches = Branch._split_common_suffix(info, branches)

2087 prefix = []

2088 else:

2089 prefix, branches = Branch._split_common_prefix(info, branches)

2090 suffix = []

2091

2092 # Try to reduce adjacent single-character branches to sets.

2093 branches = Branch._reduce_to_set(info, reverse, branches)

2094

2095 if len(branches) > 1:

2096 sequence = [Branch(branches)]

2097

2098 if not prefix or not suffix:

2099 # We might be able to add a quick precheck before the branches.

2100 firstset = self._add_precheck(info, reverse, branches)

2101

2102 if firstset:

2103 if reverse:

2104 sequence.append(firstset)

2105 else:

2106 sequence.insert(0, firstset)

2107 else:

2108 sequence = branches

2109

2110 return make_sequence(prefix + sequence + suffix)

2111

2112 def _add_precheck(self, info, reverse, branches):

2113 charset = set()

2114 pos = -1 if reverse else 0

2115

2116 for branch in branches:

2117 if type(branch) is Literal and branch.case_flags == NOCASE:

2118 charset.add(branch.characters[pos])

2119 else:

2120 return

2121

2122 if not charset:

2123 return None

2124

2125 return _check_firstset(info, reverse, [Character(c) for c in charset])

2126

2127 def pack_characters(self, info):

2128 self.branches = [b.pack_characters(info) for b in self.branches]

2129 return self

2130

2131 def remove_captures(self):

2132 self.branches = [b.remove_captures() for b in self.branches]

2133 return self

2134

2135 def is_atomic(self):

2136 return all(b.is_atomic() for b in self.branches)

2137

2138 def can_be_affix(self):

2139 return all(b.can_be_affix() for b in self.branches)

2140

2141 def contains_group(self):

2142 return any(b.contains_group() for b in self.branches)

2143

2144 def get_firstset(self, reverse):

2145 fs = set()

2146 for b in self.branches:

2147 fs |= b.get_firstset(reverse)

2148

2149 return fs or set([None])

2150

2151 def _compile(self, reverse, fuzzy):

2152 if not self.branches:

2153 return []

2154

2155 code = [(OP.BRANCH, )]

2156 for b in self.branches:

2157 code.extend(b.compile(reverse, fuzzy))

2158 code.append((OP.NEXT, ))

2159

2160 code[-1] = (OP.END, )

2161

2162 return code

2163

2164 def dump(self, indent, reverse):

2165 print("{}BRANCH".format(INDENT * indent))

2166 self.branches[0].dump(indent + 1, reverse)

2167 for b in self.branches[1 : ]:

2168 print("{}OR".format(INDENT * indent))

2169 b.dump(indent + 1, reverse)

2170

2171 @staticmethod

2172 def _flatten_branches(info, reverse, branches):

2173 # Flatten the branches so that there aren't branches of branches.

2174 new_branches = []

2175 for b in branches:

2176 b = b.optimise(info, reverse)

2177 if isinstance(b, Branch):

2178 new_branches.extend(b.branches)

2179 else:

2180 new_branches.append(b)

2181

2182 return new_branches

2183

2184 @staticmethod

2185 def _split_common_prefix(info, branches):

2186 # Common leading items can be moved out of the branches.

2187 # Get the items in the branches.

2188 alternatives = []

2189 for b in branches:

2190 if isinstance(b, Sequence):

2191 alternatives.append(b.items)

2192 else:

2193 alternatives.append([b])

2194

2195 # What is the maximum possible length of the prefix?

2196 max_count = min(len(a) for a in alternatives)

2197

2198 # What is the longest common prefix?

2199 prefix = alternatives[0]

2200 pos = 0

2201 end_pos = max_count

2202 while pos < end_pos and prefix[pos].can_be_affix() and all(a[pos] ==

2203 prefix[pos] for a in alternatives):

2204 pos += 1

2205 count = pos

2206

2207 if info.flags & UNICODE:

2208 # We need to check that we're not splitting a sequence of

2209 # characters which could form part of full case-folding.

2210 count = pos

2211 while count > 0 and not all(Branch._can_split(a, count) for a in

2212 alternatives):

2213 count -= 1

2214

2215 # No common prefix is possible.

2216 if count == 0:

2217 return [], branches

2218

2219 # Rebuild the branches.

2220 new_branches = []

2221 for a in alternatives:

2222 new_branches.append(make_sequence(a[count : ]))

2223

2224 return prefix[ : count], new_branches

2225

2226 @staticmethod

2227 def _split_common_suffix(info, branches):

2228 # Common trailing items can be moved out of the branches.

2229 # Get the items in the branches.

2230 alternatives = []

2231 for b in branches:

2232 if isinstance(b, Sequence):

2233 alternatives.append(b.items)

2234 else:

2235 alternatives.append([b])

2236

2237 # What is the maximum possible length of the suffix?

2238 max_count = min(len(a) for a in alternatives)

2239

2240 # What is the longest common suffix?

2241 suffix = alternatives[0]

2242 pos = -1

2243 end_pos = -1 - max_count

2244 while pos > end_pos and suffix[pos].can_be_affix() and all(a[pos] ==

2245 suffix[pos] for a in alternatives):

2246 pos -= 1

2247 count = -1 - pos

2248

2249 if info.flags & UNICODE:

2250 # We need to check that we're not splitting a sequence of

2251 # characters which could form part of full case-folding.

2252 while count > 0 and not all(Branch._can_split_rev(a, count) for a

2253 in alternatives):

2254 count -= 1

2255

2256 # No common suffix is possible.

2257 if count == 0:

2258 return [], branches

2259

2260 # Rebuild the branches.

2261 new_branches = []

2262 for a in alternatives:

2263 new_branches.append(make_sequence(a[ : -count]))

2264

2265 return suffix[-count : ], new_branches

2266

2267 @staticmethod

2268 def _can_split(items, count):

2269 # Check the characters either side of the proposed split.

2270 if not Branch._is_full_case(items, count - 1):

2271 return True

2272

2273 if not Branch._is_full_case(items, count):

2274 return True

2275

2276 # Check whether a 1-1 split would be OK.

2277 if Branch._is_folded(items[count - 1 : count + 1]):

2278 return False

2279

2280 # Check whether a 1-2 split would be OK.

2281 if (Branch._is_full_case(items, count + 2) and

2282 Branch._is_folded(items[count - 1 : count + 2])):

2283 return False

2284

2285 # Check whether a 2-1 split would be OK.

2286 if (Branch._is_full_case(items, count - 2) and

2287 Branch._is_folded(items[count - 2 : count + 1])):

2288 return False

2289

2290 return True

2291

2292 @staticmethod

2293 def _can_split_rev(items, count):

2294 end = len(items)

2295

2296 # Check the characters either side of the proposed split.

2297 if not Branch._is_full_case(items, end - count):

2298 return True

2299

2300 if not Branch._is_full_case(items, end - count - 1):

2301 return True

2302

2303 # Check whether a 1-1 split would be OK.

2304 if Branch._is_folded(items[end - count - 1 : end - count + 1]):

2305 return False

2306

2307 # Check whether a 1-2 split would be OK.

2308 if (Branch._is_full_case(items, end - count + 2) and

2309 Branch._is_folded(items[end - count - 1 : end - count + 2])):

2310 return False

2311

2312 # Check whether a 2-1 split would be OK.

2313 if (Branch._is_full_case(items, end - count - 2) and

2314 Branch._is_folded(items[end - count - 2 : end - count + 1])):

2315 return False

2316

2317 return True

2318

2319 @staticmethod

2320 def _merge_common_prefixes(info, reverse, branches):

2321 # Branches with the same case-sensitive character prefix can be grouped

2322 # together if they are separated only by other branches with a

2323 # character prefix.

2324 prefixed = defaultdict(list)

2325 order = {}

2326 new_branches = []

2327 for b in branches:

2328 if Branch._is_simple_character(b):

2329 # Branch starts with a simple character.

2330 prefixed[b.value].append([b])

2331 order.setdefault(b.value, len(order))

2332 elif (isinstance(b, Sequence) and b.items and

2333 Branch._is_simple_character(b.items[0])):

2334 # Branch starts with a simple character.

2335 prefixed[b.items[0].value].append(b.items)

2336 order.setdefault(b.items[0].value, len(order))

2337 else:

2338 Branch._flush_char_prefix(info, reverse, prefixed, order,

2339 new_branches)

2340

2341 new_branches.append(b)

2342

2343 Branch._flush_char_prefix(info, prefixed, order, new_branches)

2344

2345 return new_branches

2346

2347 @staticmethod

2348 def _is_simple_character(c):

2349 return isinstance(c, Character) and c.positive and not c.case_flags

2350

2351 @staticmethod

2352 def _reduce_to_set(info, reverse, branches):

2353 # Can the branches be reduced to a set?

2354 new_branches = []

2355 items = set()

2356 case_flags = NOCASE

2357 for b in branches:

2358 if isinstance(b, (Character, Property, SetBase)):

2359 # Branch starts with a single character.

2360 if b.case_flags != case_flags:

2361 # Different case sensitivity, so flush.

2362 Branch._flush_set_members(info, reverse, items, case_flags,

2363 new_branches)

2364

2365 case_flags = b.case_flags

2366

2367 items.add(b.with_flags(case_flags=NOCASE))

2368 else:

2369 Branch._flush_set_members(info, reverse, items, case_flags,

2370 new_branches)

2371

2372 new_branches.append(b)

2373

2374 Branch._flush_set_members(info, reverse, items, case_flags,

2375 new_branches)

2376

2377 return new_branches

2378

2379 @staticmethod

2380 def _flush_char_prefix(info, reverse, prefixed, order, new_branches):

2381 # Flush the prefixed branches.

2382 if not prefixed:

2383 return

2384

2385 for value, branches in sorted(prefixed.items(), key=lambda pair:

2386 order[pair[0]]):

2387 if len(branches) == 1:

2388 new_branches.append(make_sequence(branches[0]))

2389 else:

2390 subbranches = []

2391 optional = False

2392 for b in branches:

2393 if len(b) > 1:

2394 subbranches.append(make_sequence(b[1 : ]))

2395 elif not optional:

2396 subbranches.append(Sequence())

2397 optional = True

2398

2399 sequence = Sequence([Character(value), Branch(subbranches)])

2400 new_branches.append(sequence.optimise(info, reverse))

2401

2402 prefixed.clear()

2403 order.clear()

2404

2405 @staticmethod

2406 def _flush_set_members(info, reverse, items, case_flags, new_branches):

2407 # Flush the set members.

2408 if not items:

2409 return

2410

2411 if len(items) == 1:

2412 item = list(items)[0]

2413 else:

2414 item = SetUnion(info, list(items)).optimise(info, reverse)

2415

2416 new_branches.append(item.with_flags(case_flags=case_flags))

2417

2418 items.clear()

2419

2420 @staticmethod

2421 def _is_full_case(items, i):

2422 if not 0 <= i < len(items):

2423 return False

2424

2425 item = items[i]

2426 return (isinstance(item, Character) and item.positive and

2427 (item.case_flags & FULLIGNORECASE) == FULLIGNORECASE)

2428

2429 @staticmethod

2430 def _is_folded(items):

2431 if len(items) < 2:

2432 return False

2433

2434 for i in items:

2435 if (not isinstance(i, Character) or not i.positive or not

2436 i.case_flags):

2437 return False

2438

2439 folded = "".join(chr(i.value) for i in items)

2440 folded = _regex.fold_case(FULL_CASE_FOLDING, folded)

2441

2442 # Get the characters which expand to multiple codepoints on folding.

2443 expanding_chars = _regex.get_expand_on_folding()

2444

2445 for c in expanding_chars:

2446 if folded == _regex.fold_case(FULL_CASE_FOLDING, c):

2447 return True

2448

2449 return False

2450

2451 def is_empty(self):

2452 return all(b.is_empty() for b in self.branches)

2453

2454 def __eq__(self, other):

2455 return type(self) is type(other) and self.branches == other.branches

2456

2457 def max_width(self):

2458 return max(b.max_width() for b in self.branches)

2459

2460class CallGroup(RegexBase):

2461 def __init__(self, info, group, position):

2462 RegexBase.__init__(self)

2463 self.info = info

2464 self.group = group

2465 self.position = position

2466

2467 self._key = self.__class__, self.group

2468

2469 def fix_groups(self, pattern, reverse, fuzzy):

2470 try:

2471 self.group = int(self.group)

2472 except ValueError:

2473 try:

2474 self.group = self.info.group_index[self.group]

2475 except KeyError:

2476 raise error("invalid group reference", pattern, self.position)

2477

2478 if not 0 <= self.group <= self.info.group_count:

2479 raise error("unknown group", pattern, self.position)

2480

2481 if self.group > 0 and self.info.open_group_count[self.group] > 1:

2482 raise error("ambiguous group reference", pattern, self.position)

2483

2484 self.info.group_calls.append((self, reverse, fuzzy))

2485

2486 self._key = self.__class__, self.group

2487

2488 def remove_captures(self):

2489 raise error("group reference not allowed", pattern, self.position)

2490

2491 def _compile(self, reverse, fuzzy):

2492 return [(OP.GROUP_CALL, self.call_ref)]

2493

2494 def dump(self, indent, reverse):

2495 print("{}GROUP_CALL {}".format(INDENT * indent, self.group))

2496

2497 def __eq__(self, other):

2498 return type(self) is type(other) and self.group == other.group

2499

2500 def max_width(self):

2501 return UNLIMITED

2502

2503 def __del__(self):

2504 self.info = None

2505

2506class CallRef(RegexBase):

2507 def __init__(self, ref, parsed):

2508 self.ref = ref

2509 self.parsed = parsed

2510

2511 def _compile(self, reverse, fuzzy):

2512 return ([(OP.CALL_REF, self.ref)] + self.parsed._compile(reverse,

2513 fuzzy) + [(OP.END, )])

2514

2515class Character(RegexBase):

2516 _opcode = {(NOCASE, False): OP.CHARACTER, (IGNORECASE, False):

2517 OP.CHARACTER_IGN, (FULLCASE, False): OP.CHARACTER, (FULLIGNORECASE,

2518 False): OP.CHARACTER_IGN, (NOCASE, True): OP.CHARACTER_REV, (IGNORECASE,

2519 True): OP.CHARACTER_IGN_REV, (FULLCASE, True): OP.CHARACTER_REV,

2520 (FULLIGNORECASE, True): OP.CHARACTER_IGN_REV}

2521

2522 def __init__(self, value, positive=True, case_flags=NOCASE,

2523 zerowidth=False):

2524 RegexBase.__init__(self)

2525 self.value = value

2526 self.positive = bool(positive)

2527 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

2528 self.zerowidth = bool(zerowidth)

2529

2530 if (self.positive and (self.case_flags & FULLIGNORECASE) ==

2531 FULLIGNORECASE):

2532 self.folded = _regex.fold_case(FULL_CASE_FOLDING, chr(self.value))

2533 else:

2534 self.folded = chr(self.value)

2535

2536 self._key = (self.__class__, self.value, self.positive,

2537 self.case_flags, self.zerowidth)

2538

2539 def rebuild(self, positive, case_flags, zerowidth):

2540 return Character(self.value, positive, case_flags, zerowidth)

2541

2542 def optimise(self, info, reverse, in_set=False):

2543 return self

2544

2545 def get_firstset(self, reverse):

2546 return set([self])

2547

2548 def has_simple_start(self):

2549 return True

2550

2551 def _compile(self, reverse, fuzzy):

2552 flags = 0

2553 if self.positive:

2554 flags |= POSITIVE_OP

2555 if self.zerowidth:

2556 flags |= ZEROWIDTH_OP

2557 if fuzzy:

2558 flags |= FUZZY_OP

2559

2560 code = PrecompiledCode([self._opcode[self.case_flags, reverse], flags,

2561 self.value])

2562

2563 if len(self.folded) > 1:

2564 # The character expands on full case-folding.

2565 code = Branch([code, String([ord(c) for c in self.folded],

2566 case_flags=self.case_flags)])

2567

2568 return code.compile(reverse, fuzzy)

2569

2570 def dump(self, indent, reverse):

2571 display = ascii(chr(self.value)).lstrip("bu")

2572 print("{}CHARACTER {} {}{}".format(INDENT * indent,

2573 POS_TEXT[self.positive], display, CASE_TEXT[self.case_flags]))

2574

2575 def matches(self, ch):

2576 return (ch == self.value) == self.positive

2577

2578 def max_width(self):

2579 return len(self.folded)

2580

2581 def get_required_string(self, reverse):

2582 if not self.positive:

2583 return 1, None

2584

2585 self.folded_characters = tuple(ord(c) for c in self.folded)

2586

2587 return 0, self

2588

2589class Conditional(RegexBase):

2590 def __init__(self, info, group, yes_item, no_item, position):

2591 RegexBase.__init__(self)

2592 self.info = info

2593 self.group = group

2594 self.yes_item = yes_item

2595 self.no_item = no_item

2596 self.position = position

2597

2598 def fix_groups(self, pattern, reverse, fuzzy):

2599 try:

2600 self.group = int(self.group)

2601 except ValueError:

2602 try:

2603 self.group = self.info.group_index[self.group]

2604 except KeyError:

2605 if self.group == 'DEFINE':

2606 # 'DEFINE' is a special name unless there's a group with

2607 # that name.

2608 self.group = 0

2609 else:

2610 raise error("unknown group", pattern, self.position)

2611

2612 if not 0 <= self.group <= self.info.group_count:

2613 raise error("invalid group reference", pattern, self.position)

2614

2615 self.yes_item.fix_groups(pattern, reverse, fuzzy)

2616 self.no_item.fix_groups(pattern, reverse, fuzzy)

2617

2618 def optimise(self, info, reverse):

2619 yes_item = self.yes_item.optimise(info, reverse)

2620 no_item = self.no_item.optimise(info, reverse)

2621

2622 return Conditional(info, self.group, yes_item, no_item, self.position)

2623

2624 def pack_characters(self, info):

2625 self.yes_item = self.yes_item.pack_characters(info)

2626 self.no_item = self.no_item.pack_characters(info)

2627 return self

2628

2629 def remove_captures(self):

2630 self.yes_item = self.yes_item.remove_captures()

2631 self.no_item = self.no_item.remove_captures()

2632

2633 def is_atomic(self):

2634 return self.yes_item.is_atomic() and self.no_item.is_atomic()

2635

2636 def can_be_affix(self):

2637 return self.yes_item.can_be_affix() and self.no_item.can_be_affix()

2638

2639 def contains_group(self):

2640 return self.yes_item.contains_group() or self.no_item.contains_group()

2641

2642 def get_firstset(self, reverse):

2643 return (self.yes_item.get_firstset(reverse) |

2644 self.no_item.get_firstset(reverse))

2645

2646 def _compile(self, reverse, fuzzy):

2647 code = [(OP.GROUP_EXISTS, self.group)]

2648 code.extend(self.yes_item.compile(reverse, fuzzy))

2649 add_code = self.no_item.compile(reverse, fuzzy)

2650 if add_code:

2651 code.append((OP.NEXT, ))

2652 code.extend(add_code)

2653

2654 code.append((OP.END, ))

2655

2656 return code

2657

2658 def dump(self, indent, reverse):

2659 print("{}GROUP_EXISTS {}".format(INDENT * indent, self.group))

2660 self.yes_item.dump(indent + 1, reverse)

2661 if not self.no_item.is_empty():

2662 print("{}OR".format(INDENT * indent))

2663 self.no_item.dump(indent + 1, reverse)

2664

2665 def is_empty(self):

2666 return self.yes_item.is_empty() and self.no_item.is_empty()

2667

2668 def __eq__(self, other):

2669 return type(self) is type(other) and (self.group, self.yes_item,

2670 self.no_item) == (other.group, other.yes_item, other.no_item)

2671

2672 def max_width(self):

2673 return max(self.yes_item.max_width(), self.no_item.max_width())

2674

2675 def __del__(self):

2676 self.info = None

2677

2678class DefaultBoundary(ZeroWidthBase):

2679 _opcode = OP.DEFAULT_BOUNDARY

2680 _op_name = "DEFAULT_BOUNDARY"

2681

2682class DefaultEndOfWord(ZeroWidthBase):

2683 _opcode = OP.DEFAULT_END_OF_WORD

2684 _op_name = "DEFAULT_END_OF_WORD"

2685

2686class DefaultStartOfWord(ZeroWidthBase):

2687 _opcode = OP.DEFAULT_START_OF_WORD

2688 _op_name = "DEFAULT_START_OF_WORD"

2689

2690class EndOfLine(ZeroWidthBase):

2691 _opcode = OP.END_OF_LINE

2692 _op_name = "END_OF_LINE"

2693

2694class EndOfLineU(EndOfLine):

2695 _opcode = OP.END_OF_LINE_U

2696 _op_name = "END_OF_LINE_U"

2697

2698class EndOfString(ZeroWidthBase):

2699 _opcode = OP.END_OF_STRING

2700 _op_name = "END_OF_STRING"

2701

2702class EndOfStringLine(ZeroWidthBase):

2703 _opcode = OP.END_OF_STRING_LINE

2704 _op_name = "END_OF_STRING_LINE"

2705

2706class EndOfStringLineU(EndOfStringLine):

2707 _opcode = OP.END_OF_STRING_LINE_U

2708 _op_name = "END_OF_STRING_LINE_U"

2709

2710class EndOfWord(ZeroWidthBase):

2711 _opcode = OP.END_OF_WORD

2712 _op_name = "END_OF_WORD"

2713

2714class Failure(ZeroWidthBase):

2715 _op_name = "FAILURE"

2716

2717 def _compile(self, reverse, fuzzy):

2718 return [(OP.FAILURE, )]

2719

2720class Fuzzy(RegexBase):

2721 def __init__(self, subpattern, constraints=None):

2722 RegexBase.__init__(self)

2723 if constraints is None:

2724 constraints = {}

2725 self.subpattern = subpattern

2726 self.constraints = constraints

2727

2728 # If an error type is mentioned in the cost equation, then its maximum

2729 # defaults to unlimited.

2730 if "cost" in constraints:

2731 for e in "dis":

2732 if e in constraints["cost"]:

2733 constraints.setdefault(e, (0, None))

2734

2735 # If any error type is mentioned, then all the error maxima default to

2736 # 0, otherwise they default to unlimited.

2737 if set(constraints) & set("dis"):

2738 for e in "dis":

2739 constraints.setdefault(e, (0, 0))

2740 else:

2741 for e in "dis":

2742 constraints.setdefault(e, (0, None))

2743

2744 # The maximum of the generic error type defaults to unlimited.

2745 constraints.setdefault("e", (0, None))

2746

2747 # The cost equation defaults to equal costs. Also, the cost of any

2748 # error type not mentioned in the cost equation defaults to 0.

2749 if "cost" in constraints:

2750 for e in "dis":

2751 constraints["cost"].setdefault(e, 0)

2752 else:

2753 constraints["cost"] = {"d": 1, "i": 1, "s": 1, "max":

2754 constraints["e"][1]}

2755

2756 def fix_groups(self, pattern, reverse, fuzzy):

2757 self.subpattern.fix_groups(pattern, reverse, True)

2758

2759 def pack_characters(self, info):

2760 self.subpattern = self.subpattern.pack_characters(info)

2761 return self

2762

2763 def remove_captures(self):

2764 self.subpattern = self.subpattern.remove_captures()

2765 return self

2766

2767 def is_atomic(self):

2768 return self.subpattern.is_atomic()

2769

2770 def contains_group(self):

2771 return self.subpattern.contains_group()

2772

2773 def _compile(self, reverse, fuzzy):

2774 # The individual limits.

2775 arguments = []

2776 for e in "dise":

2777 v = self.constraints[e]

2778 arguments.append(v[0])

2779 arguments.append(UNLIMITED if v[1] is None else v[1])

2780

2781 # The coeffs of the cost equation.

2782 for e in "dis":

2783 arguments.append(self.constraints["cost"][e])

2784

2785 # The maximum of the cost equation.

2786 v = self.constraints["cost"]["max"]

2787 arguments.append(UNLIMITED if v is None else v)

2788

2789 flags = 0

2790 if reverse:

2791 flags |= REVERSE_OP

2792

2793 test = self.constraints.get("test")

2794

2795 if test:

2796 return ([(OP.FUZZY_EXT, flags) + tuple(arguments)] +

2797 test.compile(reverse, True) + [(OP.NEXT,)] +

2798 self.subpattern.compile(reverse, True) + [(OP.END,)])

2799

2800 return ([(OP.FUZZY, flags) + tuple(arguments)] +

2801 self.subpattern.compile(reverse, True) + [(OP.END,)])

2802

2803 def dump(self, indent, reverse):

2804 constraints = self._constraints_to_string()

2805 if constraints:

2806 constraints = " " + constraints

2807 print("{}FUZZY{}".format(INDENT * indent, constraints))

2808 self.subpattern.dump(indent + 1, reverse)

2809

2810 def is_empty(self):

2811 return self.subpattern.is_empty()

2812

2813 def __eq__(self, other):

2814 return (type(self) is type(other) and self.subpattern ==

2815 other.subpattern and self.constraints == other.constraints)

2816

2817 def max_width(self):

2818 return UNLIMITED

2819

2820 def _constraints_to_string(self):

2821 constraints = []

2822

2823 for name in "ids":

2824 min, max = self.constraints[name]

2825 if max == 0:

2826 continue

2827

2828 con = ""

2829

2830 if min > 0:

2831 con = "{}<=".format(min)

2832

2833 con += name

2834

2835 if max is not None:

2836 con += "<={}".format(max)

2837

2838 constraints.append(con)

2839

2840 cost = []

2841 for name in "ids":

2842 coeff = self.constraints["cost"][name]

2843 if coeff > 0:

2844 cost.append("{}{}".format(coeff, name))

2845

2846 limit = self.constraints["cost"]["max"]

2847 if limit is not None and limit > 0:

2848 cost = "{}<={}".format("+".join(cost), limit)

2849 constraints.append(cost)

2850

2851 return ",".join(constraints)

2852

2853class Grapheme(RegexBase):

2854 def _compile(self, reverse, fuzzy):

2855 # Match at least 1 character until a grapheme boundary is reached. Note

2856 # that this is the same whether matching forwards or backwards.

2857 grapheme_matcher = Atomic(Sequence([LazyRepeat(AnyAll(), 1, None),

2858 GraphemeBoundary()]))

2859

2860 return grapheme_matcher.compile(reverse, fuzzy)

2861

2862 def dump(self, indent, reverse):

2863 print("{}GRAPHEME".format(INDENT * indent))

2864

2865 def max_width(self):

2866 return UNLIMITED

2867

2868class GraphemeBoundary:

2869 def compile(self, reverse, fuzzy):

2870 return [(OP.GRAPHEME_BOUNDARY, 1)]

2871

2872class GreedyRepeat(RegexBase):

2873 _opcode = OP.GREEDY_REPEAT

2874 _op_name = "GREEDY_REPEAT"

2875

2876 def __init__(self, subpattern, min_count, max_count):

2877 RegexBase.__init__(self)

2878 self.subpattern = subpattern

2879 self.min_count = min_count

2880 self.max_count = max_count

2881

2882 def fix_groups(self, pattern, reverse, fuzzy):

2883 self.subpattern.fix_groups(pattern, reverse, fuzzy)

2884

2885 def optimise(self, info, reverse):

2886 subpattern = self.subpattern.optimise(info, reverse)

2887

2888 return type(self)(subpattern, self.min_count, self.max_count)

2889

2890 def pack_characters(self, info):

2891 self.subpattern = self.subpattern.pack_characters(info)

2892 return self

2893

2894 def remove_captures(self):

2895 self.subpattern = self.subpattern.remove_captures()

2896 return self

2897

2898 def is_atomic(self):

2899 return self.min_count == self.max_count and self.subpattern.is_atomic()

2900

2901 def can_be_affix(self):

2902 return False

2903

2904 def contains_group(self):

2905 return self.subpattern.contains_group()

2906

2907 def get_firstset(self, reverse):

2908 fs = self.subpattern.get_firstset(reverse)

2909 if self.min_count == 0:

2910 fs.add(None)

2911

2912 return fs

2913

2914 def _compile(self, reverse, fuzzy):

2915 repeat = [self._opcode, self.min_count]

2916 if self.max_count is None:

2917 repeat.append(UNLIMITED)

2918 else:

2919 repeat.append(self.max_count)

2920

2921 subpattern = self.subpattern.compile(reverse, fuzzy)

2922 if not subpattern:

2923 return []

2924

2925 return ([tuple(repeat)] + subpattern + [(OP.END, )])

2926

2927 def dump(self, indent, reverse):

2928 if self.max_count is None:

2929 limit = "INF"

2930 else:

2931 limit = self.max_count

2932 print("{}{} {} {}".format(INDENT * indent, self._op_name,

2933 self.min_count, limit))

2934

2935 self.subpattern.dump(indent + 1, reverse)

2936

2937 def is_empty(self):

2938 return self.subpattern.is_empty()

2939

2940 def __eq__(self, other):

2941 return type(self) is type(other) and (self.subpattern, self.min_count,

2942 self.max_count) == (other.subpattern, other.min_count,

2943 other.max_count)

2944

2945 def max_width(self):

2946 if self.max_count is None:

2947 return UNLIMITED

2948

2949 return self.subpattern.max_width() * self.max_count

2950

2951 def get_required_string(self, reverse):

2952 max_count = UNLIMITED if self.max_count is None else self.max_count

2953 if self.min_count == 0:

2954 w = self.subpattern.max_width() * max_count

2955 return min(w, UNLIMITED), None

2956

2957 ofs, req = self.subpattern.get_required_string(reverse)

2958 if req:

2959 return ofs, req

2960

2961 w = self.subpattern.max_width() * max_count

2962 return min(w, UNLIMITED), None

2963

2964class PossessiveRepeat(GreedyRepeat):

2965 def is_atomic(self):

2966 return True

2967

2968 def _compile(self, reverse, fuzzy):

2969 subpattern = self.subpattern.compile(reverse, fuzzy)

2970 if not subpattern:

2971 return []

2972

2973 repeat = [self._opcode, self.min_count]

2974 if self.max_count is None:

2975 repeat.append(UNLIMITED)

2976 else:

2977 repeat.append(self.max_count)

2978

2979 return ([(OP.ATOMIC, ), tuple(repeat)] + subpattern + [(OP.END, ),

2980 (OP.END, )])

2981

2982 def dump(self, indent, reverse):

2983 print("{}ATOMIC".format(INDENT * indent))

2984

2985 if self.max_count is None:

2986 limit = "INF"

2987 else:

2988 limit = self.max_count

2989 print("{}{} {} {}".format(INDENT * (indent + 1), self._op_name,

2990 self.min_count, limit))

2991

2992 self.subpattern.dump(indent + 2, reverse)

2993

2994class Group(RegexBase):

2995 def __init__(self, info, group, subpattern):

2996 RegexBase.__init__(self)

2997 self.info = info

2998 self.group = group

2999 self.subpattern = subpattern

3000

3001 self.call_ref = None

3002

3003 def fix_groups(self, pattern, reverse, fuzzy):

3004 self.info.defined_groups[self.group] = (self, reverse, fuzzy)

3005 self.subpattern.fix_groups(pattern, reverse, fuzzy)

3006

3007 def optimise(self, info, reverse):

3008 subpattern = self.subpattern.optimise(info, reverse)

3009

3010 return Group(self.info, self.group, subpattern)

3011

3012 def pack_characters(self, info):

3013 self.subpattern = self.subpattern.pack_characters(info)

3014 return self

3015

3016 def remove_captures(self):

3017 return self.subpattern.remove_captures()

3018

3019 def is_atomic(self):

3020 return self.subpattern.is_atomic()

3021

3022 def can_be_affix(self):

3023 return False

3024

3025 def contains_group(self):

3026 return True

3027

3028 def get_firstset(self, reverse):

3029 return self.subpattern.get_firstset(reverse)

3030

3031 def has_simple_start(self):

3032 return self.subpattern.has_simple_start()

3033

3034 def _compile(self, reverse, fuzzy):

3035 code = []

3036

3037 public_group = private_group = self.group

3038 if private_group < 0:

3039 public_group = self.info.private_groups[private_group]

3040 private_group = self.info.group_count - private_group

3041

3042 key = self.group, reverse, fuzzy

3043 ref = self.info.call_refs.get(key)

3044 if ref is not None:

3045 code += [(OP.CALL_REF, ref)]

3046

3047 code += [(OP.GROUP, int(not reverse), private_group, public_group)]

3048 code += self.subpattern.compile(reverse, fuzzy)

3049 code += [(OP.END, )]

3050

3051 if ref is not None:

3052 code += [(OP.END, )]

3053

3054 return code

3055

3056 def dump(self, indent, reverse):

3057 group = self.group

3058 if group < 0:

3059 group = private_groups[group]

3060 print("{}GROUP {}".format(INDENT * indent, group))

3061 self.subpattern.dump(indent + 1, reverse)

3062

3063 def __eq__(self, other):

3064 return (type(self) is type(other) and (self.group, self.subpattern) ==

3065 (other.group, other.subpattern))

3066

3067 def max_width(self):

3068 return self.subpattern.max_width()

3069

3070 def get_required_string(self, reverse):

3071 return self.subpattern.get_required_string(reverse)

3072

3073 def __del__(self):

3074 self.info = None

3075

3076class Keep(ZeroWidthBase):

3077 _opcode = OP.KEEP

3078 _op_name = "KEEP"

3079

3080class LazyRepeat(GreedyRepeat):

3081 _opcode = OP.LAZY_REPEAT

3082 _op_name = "LAZY_REPEAT"

3083

3084class LookAround(RegexBase):

3085 _dir_text = {False: "AHEAD", True: "BEHIND"}

3086

3087 def __init__(self, behind, positive, subpattern):

3088 RegexBase.__init__(self)

3089 self.behind = bool(behind)

3090 self.positive = bool(positive)

3091 self.subpattern = subpattern

3092

3093 def fix_groups(self, pattern, reverse, fuzzy):

3094 self.subpattern.fix_groups(pattern, self.behind, fuzzy)

3095

3096 def optimise(self, info, reverse):

3097 subpattern = self.subpattern.optimise(info, self.behind)

3098 if self.positive and subpattern.is_empty():

3099 return subpattern

3100

3101 return LookAround(self.behind, self.positive, subpattern)

3102

3103 def pack_characters(self, info):

3104 self.subpattern = self.subpattern.pack_characters(info)

3105 return self

3106

3107 def remove_captures(self):

3108 return self.subpattern.remove_captures()

3109

3110 def is_atomic(self):

3111 return self.subpattern.is_atomic()

3112

3113 def can_be_affix(self):

3114 return self.subpattern.can_be_affix()

3115

3116 def contains_group(self):

3117 return self.subpattern.contains_group()

3118

3119 def get_firstset(self, reverse):

3120 if self.positive and self.behind == reverse:

3121 return self.subpattern.get_firstset(reverse)

3122

3123 return set([None])

3124

3125 def _compile(self, reverse, fuzzy):

3126 flags = 0

3127 if self.positive:

3128 flags |= POSITIVE_OP

3129 if fuzzy:

3130 flags |= FUZZY_OP

3131 if reverse:

3132 flags |= REVERSE_OP

3133

3134 return ([(OP.LOOKAROUND, flags, int(not self.behind))] +

3135 self.subpattern.compile(self.behind) + [(OP.END, )])

3136

3137 def dump(self, indent, reverse):

3138 print("{}LOOK{} {}".format(INDENT * indent,

3139 self._dir_text[self.behind], POS_TEXT[self.positive]))

3140 self.subpattern.dump(indent + 1, self.behind)

3141

3142 def is_empty(self):

3143 return self.positive and self.subpattern.is_empty()

3144

3145 def __eq__(self, other):

3146 return type(self) is type(other) and (self.behind, self.positive,

3147 self.subpattern) == (other.behind, other.positive, other.subpattern)

3148

3149 def max_width(self):

3150 return 0

3151

3152class LookAroundConditional(RegexBase):

3153 _dir_text = {False: "AHEAD", True: "BEHIND"}

3154

3155 def __init__(self, behind, positive, subpattern, yes_item, no_item):

3156 RegexBase.__init__(self)

3157 self.behind = bool(behind)

3158 self.positive = bool(positive)

3159 self.subpattern = subpattern

3160 self.yes_item = yes_item

3161 self.no_item = no_item

3162

3163 def fix_groups(self, pattern, reverse, fuzzy):

3164 self.subpattern.fix_groups(pattern, reverse, fuzzy)

3165 self.yes_item.fix_groups(pattern, reverse, fuzzy)

3166 self.no_item.fix_groups(pattern, reverse, fuzzy)

3167

3168 def optimise(self, info, reverse):

3169 subpattern = self.subpattern.optimise(info, self.behind)

3170 yes_item = self.yes_item.optimise(info, self.behind)

3171 no_item = self.no_item.optimise(info, self.behind)

3172

3173 return LookAroundConditional(self.behind, self.positive, subpattern,

3174 yes_item, no_item)

3175

3176 def pack_characters(self, info):

3177 self.subpattern = self.subpattern.pack_characters(info)

3178 self.yes_item = self.yes_item.pack_characters(info)

3179 self.no_item = self.no_item.pack_characters(info)

3180 return self

3181

3182 def remove_captures(self):

3183 self.subpattern = self.subpattern.remove_captures()

3184 self.yes_item = self.yes_item.remove_captures()

3185 self.no_item = self.no_item.remove_captures()

3186

3187 def is_atomic(self):

3188 return (self.subpattern.is_atomic() and self.yes_item.is_atomic() and

3189 self.no_item.is_atomic())

3190

3191 def can_be_affix(self):

3192 return (self.subpattern.can_be_affix() and self.yes_item.can_be_affix()

3193 and self.no_item.can_be_affix())

3194

3195 def contains_group(self):

3196 return (self.subpattern.contains_group() or

3197 self.yes_item.contains_group() or self.no_item.contains_group())

3198

3199 def _compile(self, reverse, fuzzy):

3200 code = [(OP.CONDITIONAL, int(self.positive), int(not self.behind))]

3201 code.extend(self.subpattern.compile(self.behind, fuzzy))

3202 code.append((OP.NEXT, ))

3203 code.extend(self.yes_item.compile(reverse, fuzzy))

3204 add_code = self.no_item.compile(reverse, fuzzy)

3205 if add_code:

3206 code.append((OP.NEXT, ))

3207 code.extend(add_code)

3208

3209 code.append((OP.END, ))

3210

3211 return code

3212

3213 def dump(self, indent, reverse):

3214 print("{}CONDITIONAL {} {}".format(INDENT * indent,

3215 self._dir_text[self.behind], POS_TEXT[self.positive]))

3216 self.subpattern.dump(indent + 1, self.behind)

3217 print("{}EITHER".format(INDENT * indent))

3218 self.yes_item.dump(indent + 1, reverse)

3219 if not self.no_item.is_empty():

3220 print("{}OR".format(INDENT * indent))

3221 self.no_item.dump(indent + 1, reverse)

3222

3223 def is_empty(self):

3224 return (self.subpattern.is_empty() and self.yes_item.is_empty() or

3225 self.no_item.is_empty())

3226

3227 def __eq__(self, other):

3228 return type(self) is type(other) and (self.subpattern, self.yes_item,

3229 self.no_item) == (other.subpattern, other.yes_item, other.no_item)

3230

3231 def max_width(self):

3232 return max(self.yes_item.max_width(), self.no_item.max_width())

3233

3234 def get_required_string(self, reverse):

3235 return self.max_width(), None

3236

3237class PrecompiledCode(RegexBase):

3238 def __init__(self, code):

3239 self.code = code

3240

3241 def _compile(self, reverse, fuzzy):

3242 return [tuple(self.code)]

3243

3244class Property(RegexBase):

3245 _opcode = {(NOCASE, False): OP.PROPERTY, (IGNORECASE, False):

3246 OP.PROPERTY_IGN, (FULLCASE, False): OP.PROPERTY, (FULLIGNORECASE, False):

3247 OP.PROPERTY_IGN, (NOCASE, True): OP.PROPERTY_REV, (IGNORECASE, True):

3248 OP.PROPERTY_IGN_REV, (FULLCASE, True): OP.PROPERTY_REV, (FULLIGNORECASE,

3249 True): OP.PROPERTY_IGN_REV}

3250

3251 def __init__(self, value, positive=True, case_flags=NOCASE,

3252 zerowidth=False, encoding=0):

3253 RegexBase.__init__(self)

3254 self.value = value

3255 self.positive = bool(positive)

3256 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3257 self.zerowidth = bool(zerowidth)

3258 self.encoding = encoding

3259

3260 self._key = (self.__class__, self.value, self.positive,

3261 self.case_flags, self.zerowidth)

3262

3263 def rebuild(self, positive, case_flags, zerowidth):

3264 return Property(self.value, positive, case_flags, zerowidth,

3265 self.encoding)

3266

3267 def optimise(self, info, reverse, in_set=False):

3268 return self

3269

3270 def get_firstset(self, reverse):

3271 return set([self])

3272

3273 def has_simple_start(self):

3274 return True

3275

3276 def _compile(self, reverse, fuzzy):

3277 flags = 0

3278 if self.positive:

3279 flags |= POSITIVE_OP

3280 if self.zerowidth:

3281 flags |= ZEROWIDTH_OP

3282 if fuzzy:

3283 flags |= FUZZY_OP

3284 flags |= self.encoding << ENCODING_OP_SHIFT

3285 return [(self._opcode[self.case_flags, reverse], flags, self.value)]

3286

3287 def dump(self, indent, reverse):

3288 prop = PROPERTY_NAMES[self.value >> 16]

3289 name, value = prop[0], prop[1][self.value & 0xFFFF]

3290 print("{}PROPERTY {} {}:{}{}{}".format(INDENT * indent,

3291 POS_TEXT[self.positive], name, value, CASE_TEXT[self.case_flags],

3292 ["", " ASCII"][self.encoding]))

3293

3294 def matches(self, ch):

3295 return _regex.has_property_value(self.value, ch) == self.positive

3296

3297 def max_width(self):

3298 return 1

3299

3300class Prune(ZeroWidthBase):

3301 _op_name = "PRUNE"

3302

3303 def _compile(self, reverse, fuzzy):

3304 return [(OP.PRUNE, )]

3305

3306class Range(RegexBase):

3307 _opcode = {(NOCASE, False): OP.RANGE, (IGNORECASE, False): OP.RANGE_IGN,

3308 (FULLCASE, False): OP.RANGE, (FULLIGNORECASE, False): OP.RANGE_IGN,

3309 (NOCASE, True): OP.RANGE_REV, (IGNORECASE, True): OP.RANGE_IGN_REV,

3310 (FULLCASE, True): OP.RANGE_REV, (FULLIGNORECASE, True): OP.RANGE_IGN_REV}

3311 _op_name = "RANGE"

3312

3313 def __init__(self, lower, upper, positive=True, case_flags=NOCASE,

3314 zerowidth=False):

3315 RegexBase.__init__(self)

3316 self.lower = lower

3317 self.upper = upper

3318 self.positive = bool(positive)

3319 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3320 self.zerowidth = bool(zerowidth)

3321

3322 self._key = (self.__class__, self.lower, self.upper, self.positive,

3323 self.case_flags, self.zerowidth)

3324

3325 def rebuild(self, positive, case_flags, zerowidth):

3326 return Range(self.lower, self.upper, positive, case_flags, zerowidth)

3327

3328 def optimise(self, info, reverse, in_set=False):

3329 # Is the range case-sensitive?

3330 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:

3331 return self

3332

3333 # Is full case-folding possible?

3334 if (not (info.flags & UNICODE) or (self.case_flags & FULLIGNORECASE) !=

3335 FULLIGNORECASE):

3336 return self

3337

3338 # Get the characters which expand to multiple codepoints on folding.

3339 expanding_chars = _regex.get_expand_on_folding()

3340

3341 # Get the folded characters in the range.

3342 items = []

3343 for ch in expanding_chars:

3344 if self.lower <= ord(ch) <= self.upper:

3345 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)

3346 items.append(String([ord(c) for c in folded],

3347 case_flags=self.case_flags))

3348

3349 if not items:

3350 # We can fall back to simple case-folding.

3351 return self

3352

3353 if len(items) < self.upper - self.lower + 1:

3354 # Not all the characters are covered by the full case-folding.

3355 items.insert(0, self)

3356

3357 return Branch(items)

3358

3359 def _compile(self, reverse, fuzzy):

3360 flags = 0

3361 if self.positive:

3362 flags |= POSITIVE_OP

3363 if self.zerowidth:

3364 flags |= ZEROWIDTH_OP

3365 if fuzzy:

3366 flags |= FUZZY_OP

3367 return [(self._opcode[self.case_flags, reverse], flags, self.lower,

3368 self.upper)]

3369

3370 def dump(self, indent, reverse):

3371 display_lower = ascii(chr(self.lower)).lstrip("bu")

3372 display_upper = ascii(chr(self.upper)).lstrip("bu")

3373 print("{}RANGE {} {} {}{}".format(INDENT * indent,

3374 POS_TEXT[self.positive], display_lower, display_upper,

3375 CASE_TEXT[self.case_flags]))

3376

3377 def matches(self, ch):

3378 return (self.lower <= ch <= self.upper) == self.positive

3379

3380 def max_width(self):

3381 return 1

3382

3383class RefGroup(RegexBase):

3384 _opcode = {(NOCASE, False): OP.REF_GROUP, (IGNORECASE, False):

3385 OP.REF_GROUP_IGN, (FULLCASE, False): OP.REF_GROUP, (FULLIGNORECASE,

3386 False): OP.REF_GROUP_FLD, (NOCASE, True): OP.REF_GROUP_REV, (IGNORECASE,

3387 True): OP.REF_GROUP_IGN_REV, (FULLCASE, True): OP.REF_GROUP_REV,

3388 (FULLIGNORECASE, True): OP.REF_GROUP_FLD_REV}

3389

3390 def __init__(self, info, group, position, case_flags=NOCASE):

3391 RegexBase.__init__(self)

3392 self.info = info

3393 self.group = group

3394 self.position = position

3395 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3396

3397 self._key = self.__class__, self.group, self.case_flags

3398

3399 def fix_groups(self, pattern, reverse, fuzzy):

3400 try:

3401 self.group = int(self.group)

3402 except ValueError:

3403 try:

3404 self.group = self.info.group_index[self.group]

3405 except KeyError:

3406 raise error("unknown group", pattern, self.position)

3407

3408 if not 1 <= self.group <= self.info.group_count:

3409 raise error("invalid group reference", pattern, self.position)

3410

3411 self._key = self.__class__, self.group, self.case_flags

3412

3413 def remove_captures(self):

3414 raise error("group reference not allowed", pattern, self.position)

3415

3416 def _compile(self, reverse, fuzzy):

3417 flags = 0

3418 if fuzzy:

3419 flags |= FUZZY_OP

3420 return [(self._opcode[self.case_flags, reverse], flags, self.group)]

3421

3422 def dump(self, indent, reverse):

3423 print("{}REF_GROUP {}{}".format(INDENT * indent, self.group,

3424 CASE_TEXT[self.case_flags]))

3425

3426 def max_width(self):

3427 return UNLIMITED

3428

3429 def __del__(self):

3430 self.info = None

3431

3432class SearchAnchor(ZeroWidthBase):

3433 _opcode = OP.SEARCH_ANCHOR

3434 _op_name = "SEARCH_ANCHOR"

3435

3436class Sequence(RegexBase):

3437 def __init__(self, items=None):

3438 RegexBase.__init__(self)

3439 if items is None:

3440 items = []

3441

3442 self.items = items

3443

3444 def fix_groups(self, pattern, reverse, fuzzy):

3445 for s in self.items:

3446 s.fix_groups(pattern, reverse, fuzzy)

3447

3448 def optimise(self, info, reverse):

3449 # Flatten the sequences.

3450 items = []

3451 for s in self.items:

3452 s = s.optimise(info, reverse)

3453 if isinstance(s, Sequence):

3454 items.extend(s.items)

3455 else:

3456 items.append(s)

3457

3458 return make_sequence(items)

3459

3460 def pack_characters(self, info):

3461 "Packs sequences of characters into strings."

3462 items = []

3463 characters = []

3464 case_flags = NOCASE

3465 for s in self.items:

3466 if type(s) is Character and s.positive and not s.zerowidth:

3467 if s.case_flags != case_flags:

3468 # Different case sensitivity, so flush, unless neither the

3469 # previous nor the new character are cased.

3470 if s.case_flags or is_cased_i(info, s.value):

3471 Sequence._flush_characters(info, characters,

3472 case_flags, items)

3473

3474 case_flags = s.case_flags

3475

3476 characters.append(s.value)

3477 elif type(s) is String or type(s) is Literal:

3478 if s.case_flags != case_flags:

3479 # Different case sensitivity, so flush, unless the neither

3480 # the previous nor the new string are cased.

3481 if s.case_flags or any(is_cased_i(info, c) for c in

3482 characters):

3483 Sequence._flush_characters(info, characters,

3484 case_flags, items)

3485

3486 case_flags = s.case_flags

3487

3488 characters.extend(s.characters)

3489 else:

3490 Sequence._flush_characters(info, characters, case_flags, items)

3491

3492 items.append(s.pack_characters(info))

3493

3494 Sequence._flush_characters(info, characters, case_flags, items)

3495

3496 return make_sequence(items)

3497

3498 def remove_captures(self):

3499 self.items = [s.remove_captures() for s in self.items]

3500 return self

3501

3502 def is_atomic(self):

3503 return all(s.is_atomic() for s in self.items)

3504

3505 def can_be_affix(self):

3506 return False

3507

3508 def contains_group(self):

3509 return any(s.contains_group() for s in self.items)

3510

3511 def get_firstset(self, reverse):

3512 fs = set()

3513 items = self.items

3514 if reverse:

3515 items.reverse()

3516 for s in items:

3517 fs |= s.get_firstset(reverse)

3518 if None not in fs:

3519 return fs

3520 fs.discard(None)

3521

3522 return fs | set([None])

3523

3524 def has_simple_start(self):

3525 return bool(self.items) and self.items[0].has_simple_start()

3526

3527 def _compile(self, reverse, fuzzy):

3528 seq = self.items

3529 if reverse:

3530 seq = seq[::-1]

3531

3532 code = []

3533 for s in seq:

3534 code.extend(s.compile(reverse, fuzzy))

3535

3536 return code

3537

3538 def dump(self, indent, reverse):

3539 for s in self.items:

3540 s.dump(indent, reverse)

3541

3542 @staticmethod

3543 def _flush_characters(info, characters, case_flags, items):

3544 if not characters:

3545 return

3546

3547 # Disregard case_flags if all of the characters are case-less.

3548 if case_flags & IGNORECASE:

3549 if not any(is_cased_i(info, c) for c in characters):

3550 case_flags = NOCASE

3551

3552 if (case_flags & FULLIGNORECASE) == FULLIGNORECASE:

3553 literals = Sequence._fix_full_casefold(characters)

3554

3555 for item in literals:

3556 chars = item.characters

3557

3558 if len(chars) == 1:

3559 items.append(Character(chars[0], case_flags=item.case_flags))

3560 else:

3561 items.append(String(chars, case_flags=item.case_flags))

3562 else:

3563 if len(characters) == 1:

3564 items.append(Character(characters[0], case_flags=case_flags))

3565 else:

3566 items.append(String(characters, case_flags=case_flags))

3567

3568 characters[:] = []

3569

3570 @staticmethod

3571 def _fix_full_casefold(characters):

3572 # Split a literal needing full case-folding into chunks that need it

3573 # and chunks that can use simple case-folding, which is faster.

3574 expanded = [_regex.fold_case(FULL_CASE_FOLDING, c) for c in

3575 _regex.get_expand_on_folding()]

3576 string = _regex.fold_case(FULL_CASE_FOLDING, ''.join(chr(c)

3577 for c in characters)).lower()

3578 chunks = []

3579

3580 for e in expanded:

3581 found = string.find(e)

3582

3583 while found >= 0:

3584 chunks.append((found, found + len(e)))

3585 found = string.find(e, found + 1)

3586

3587 pos = 0

3588 literals = []

3589

3590 for start, end in Sequence._merge_chunks(chunks):

3591 if pos < start:

3592 literals.append(Literal(characters[pos : start],

3593 case_flags=IGNORECASE))

3594

3595 literals.append(Literal(characters[start : end],

3596 case_flags=FULLIGNORECASE))

3597 pos = end

3598

3599 if pos < len(characters):

3600 literals.append(Literal(characters[pos : ], case_flags=IGNORECASE))

3601

3602 return literals

3603

3604 @staticmethod

3605 def _merge_chunks(chunks):

3606 if len(chunks) < 2:

3607 return chunks

3608

3609 chunks.sort()

3610

3611 start, end = chunks[0]

3612 new_chunks = []

3613

3614 for s, e in chunks[1 : ]:

3615 if s <= end:

3616 end = max(end, e)

3617 else:

3618 new_chunks.append((start, end))

3619 start, end = s, e

3620

3621 new_chunks.append((start, end))

3622

3623 return new_chunks

3624

3625 def is_empty(self):

3626 return all(i.is_empty() for i in self.items)

3627

3628 def __eq__(self, other):

3629 return type(self) is type(other) and self.items == other.items

3630

3631 def max_width(self):

3632 return sum(s.max_width() for s in self.items)

3633

3634 def get_required_string(self, reverse):

3635 seq = self.items

3636 if reverse:

3637 seq = seq[::-1]

3638

3639 offset = 0

3640

3641 for s in seq:

3642 ofs, req = s.get_required_string(reverse)

3643 offset += ofs

3644 if req:

3645 return offset, req

3646

3647 return offset, None

3648

3649class SetBase(RegexBase):

3650 def __init__(self, info, items, positive=True, case_flags=NOCASE,

3651 zerowidth=False):

3652 RegexBase.__init__(self)

3653 self.info = info

3654 self.items = tuple(items)

3655 self.positive = bool(positive)

3656 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3657 self.zerowidth = bool(zerowidth)

3658

3659 self.char_width = 1

3660

3661 self._key = (self.__class__, self.items, self.positive,

3662 self.case_flags, self.zerowidth)

3663

3664 def rebuild(self, positive, case_flags, zerowidth):

3665 return type(self)(self.info, self.items, positive, case_flags,

3666 zerowidth).optimise(self.info, False)

3667

3668 def get_firstset(self, reverse):

3669 return set([self])

3670

3671 def has_simple_start(self):

3672 return True

3673

3674 def _compile(self, reverse, fuzzy):

3675 flags = 0

3676 if self.positive:

3677 flags |= POSITIVE_OP

3678 if self.zerowidth:

3679 flags |= ZEROWIDTH_OP

3680 if fuzzy:

3681 flags |= FUZZY_OP

3682 code = [(self._opcode[self.case_flags, reverse], flags)]

3683 for m in self.items:

3684 code.extend(m.compile())

3685

3686 code.append((OP.END, ))

3687

3688 return code

3689

3690 def dump(self, indent, reverse):

3691 print("{}{} {}{}".format(INDENT * indent, self._op_name,

3692 POS_TEXT[self.positive], CASE_TEXT[self.case_flags]))

3693 for i in self.items:

3694 i.dump(indent + 1, reverse)

3695

3696 def _handle_case_folding(self, info, in_set):

3697 # Is the set case-sensitive?

3698 if not self.positive or not (self.case_flags & IGNORECASE) or in_set:

3699 return self

3700

3701 # Is full case-folding possible?

3702 if (not (self.info.flags & UNICODE) or (self.case_flags &

3703 FULLIGNORECASE) != FULLIGNORECASE):

3704 return self

3705

3706 # Get the characters which expand to multiple codepoints on folding.

3707 expanding_chars = _regex.get_expand_on_folding()

3708

3709 # Get the folded characters in the set.

3710 items = []

3711 seen = set()

3712 for ch in expanding_chars:

3713 if self.matches(ord(ch)):

3714 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)

3715 if folded not in seen:

3716 items.append(String([ord(c) for c in folded],

3717 case_flags=self.case_flags))

3718 seen.add(folded)

3719

3720 if not items:

3721 # We can fall back to simple case-folding.

3722 return self

3723

3724 return Branch([self] + items)

3725

3726 def max_width(self):

3727 # Is the set case-sensitive?

3728 if not self.positive or not (self.case_flags & IGNORECASE):

3729 return 1

3730

3731 # Is full case-folding possible?

3732 if (not (self.info.flags & UNICODE) or (self.case_flags &

3733 FULLIGNORECASE) != FULLIGNORECASE):

3734 return 1

3735

3736 # Get the characters which expand to multiple codepoints on folding.

3737 expanding_chars = _regex.get_expand_on_folding()

3738

3739 # Get the folded characters in the set.

3740 seen = set()

3741 for ch in expanding_chars:

3742 if self.matches(ord(ch)):

3743 folded = _regex.fold_case(FULL_CASE_FOLDING, ch)

3744 seen.add(folded)

3745

3746 if not seen:

3747 return 1

3748

3749 return max(len(folded) for folded in seen)

3750

3751 def __del__(self):

3752 self.info = None

3753

3754class SetDiff(SetBase):

3755 _opcode = {(NOCASE, False): OP.SET_DIFF, (IGNORECASE, False):

3756 OP.SET_DIFF_IGN, (FULLCASE, False): OP.SET_DIFF, (FULLIGNORECASE, False):

3757 OP.SET_DIFF_IGN, (NOCASE, True): OP.SET_DIFF_REV, (IGNORECASE, True):

3758 OP.SET_DIFF_IGN_REV, (FULLCASE, True): OP.SET_DIFF_REV, (FULLIGNORECASE,

3759 True): OP.SET_DIFF_IGN_REV}

3760 _op_name = "SET_DIFF"

3761

3762 def optimise(self, info, reverse, in_set=False):

3763 items = self.items

3764 if len(items) > 2:

3765 items = [items[0], SetUnion(info, items[1 : ])]

3766

3767 if len(items) == 1:

3768 return items[0].with_flags(case_flags=self.case_flags,

3769 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3770

3771 self.items = tuple(m.optimise(info, reverse, in_set=True) for m in

3772 items)

3773

3774 return self._handle_case_folding(info, in_set)

3775

3776 def matches(self, ch):

3777 m = self.items[0].matches(ch) and not self.items[1].matches(ch)

3778 return m == self.positive

3779

3780class SetInter(SetBase):

3781 _opcode = {(NOCASE, False): OP.SET_INTER, (IGNORECASE, False):

3782 OP.SET_INTER_IGN, (FULLCASE, False): OP.SET_INTER, (FULLIGNORECASE,

3783 False): OP.SET_INTER_IGN, (NOCASE, True): OP.SET_INTER_REV, (IGNORECASE,

3784 True): OP.SET_INTER_IGN_REV, (FULLCASE, True): OP.SET_INTER_REV,

3785 (FULLIGNORECASE, True): OP.SET_INTER_IGN_REV}

3786 _op_name = "SET_INTER"

3787

3788 def optimise(self, info, reverse, in_set=False):

3789 items = []

3790 for m in self.items:

3791 m = m.optimise(info, reverse, in_set=True)

3792 if isinstance(m, SetInter) and m.positive:

3793 # Intersection in intersection.

3794 items.extend(m.items)

3795 else:

3796 items.append(m)

3797

3798 if len(items) == 1:

3799 return items[0].with_flags(case_flags=self.case_flags,

3800 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3801

3802 self.items = tuple(items)

3803

3804 return self._handle_case_folding(info, in_set)

3805

3806 def matches(self, ch):

3807 m = all(i.matches(ch) for i in self.items)

3808 return m == self.positive

3809

3810class SetSymDiff(SetBase):

3811 _opcode = {(NOCASE, False): OP.SET_SYM_DIFF, (IGNORECASE, False):

3812 OP.SET_SYM_DIFF_IGN, (FULLCASE, False): OP.SET_SYM_DIFF, (FULLIGNORECASE,

3813 False): OP.SET_SYM_DIFF_IGN, (NOCASE, True): OP.SET_SYM_DIFF_REV,

3814 (IGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV, (FULLCASE, True):

3815 OP.SET_SYM_DIFF_REV, (FULLIGNORECASE, True): OP.SET_SYM_DIFF_IGN_REV}

3816 _op_name = "SET_SYM_DIFF"

3817

3818 def optimise(self, info, reverse, in_set=False):

3819 items = []

3820 for m in self.items:

3821 m = m.optimise(info, reverse, in_set=True)

3822 if isinstance(m, SetSymDiff) and m.positive:

3823 # Symmetric difference in symmetric difference.

3824 items.extend(m.items)

3825 else:

3826 items.append(m)

3827

3828 if len(items) == 1:

3829 return items[0].with_flags(case_flags=self.case_flags,

3830 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3831

3832 self.items = tuple(items)

3833

3834 return self._handle_case_folding(info, in_set)

3835

3836 def matches(self, ch):

3837 m = False

3838 for i in self.items:

3839 m = m != i.matches(ch)

3840

3841 return m == self.positive

3842

3843class SetUnion(SetBase):

3844 _opcode = {(NOCASE, False): OP.SET_UNION, (IGNORECASE, False):

3845 OP.SET_UNION_IGN, (FULLCASE, False): OP.SET_UNION, (FULLIGNORECASE,

3846 False): OP.SET_UNION_IGN, (NOCASE, True): OP.SET_UNION_REV, (IGNORECASE,

3847 True): OP.SET_UNION_IGN_REV, (FULLCASE, True): OP.SET_UNION_REV,

3848 (FULLIGNORECASE, True): OP.SET_UNION_IGN_REV}

3849 _op_name = "SET_UNION"

3850

3851 def optimise(self, info, reverse, in_set=False):

3852 items = []

3853 for m in self.items:

3854 m = m.optimise(info, reverse, in_set=True)

3855 if isinstance(m, SetUnion) and m.positive:

3856 # Union in union.

3857 items.extend(m.items)

3858 elif isinstance(m, AnyAll):

3859 return AnyAll()

3860 else:

3861 items.append(m)

3862

3863 # Are there complementary properties?

3864 properties = (set(), set())

3865

3866 for m in items:

3867 if isinstance(m, Property):

3868 properties[m.positive].add((m.value, m.case_flags, m.zerowidth))

3869

3870 if properties[0] & properties[1]:

3871 return AnyAll()

3872

3873 if len(items) == 1:

3874 i = items[0]

3875 return i.with_flags(positive=i.positive == self.positive,

3876 case_flags=self.case_flags,

3877 zerowidth=self.zerowidth).optimise(info, reverse, in_set)

3878

3879 self.items = tuple(items)

3880

3881 return self._handle_case_folding(info, in_set)

3882

3883 def _compile(self, reverse, fuzzy):

3884 flags = 0

3885 if self.positive:

3886 flags |= POSITIVE_OP

3887 if self.zerowidth:

3888 flags |= ZEROWIDTH_OP

3889 if fuzzy:

3890 flags |= FUZZY_OP

3891

3892 characters, others = defaultdict(list), []

3893 for m in self.items:

3894 if isinstance(m, Character):

3895 characters[m.positive].append(m.value)

3896 else:

3897 others.append(m)

3898

3899 code = [(self._opcode[self.case_flags, reverse], flags)]

3900

3901 for positive, values in characters.items():

3902 flags = 0

3903 if positive:

3904 flags |= POSITIVE_OP

3905 if len(values) == 1:

3906 code.append((OP.CHARACTER, flags, values[0]))

3907 else:

3908 code.append((OP.STRING, flags, len(values)) + tuple(values))

3909

3910 for m in others:

3911 code.extend(m.compile())

3912

3913 code.append((OP.END, ))

3914

3915 return code

3916

3917 def matches(self, ch):

3918 m = any(i.matches(ch) for i in self.items)

3919 return m == self.positive

3920

3921class Skip(ZeroWidthBase):

3922 _op_name = "SKIP"

3923 _opcode = OP.SKIP

3924

3925class StartOfLine(ZeroWidthBase):

3926 _opcode = OP.START_OF_LINE

3927 _op_name = "START_OF_LINE"

3928

3929class StartOfLineU(StartOfLine):

3930 _opcode = OP.START_OF_LINE_U

3931 _op_name = "START_OF_LINE_U"

3932

3933class StartOfString(ZeroWidthBase):

3934 _opcode = OP.START_OF_STRING

3935 _op_name = "START_OF_STRING"

3936

3937class StartOfWord(ZeroWidthBase):

3938 _opcode = OP.START_OF_WORD

3939 _op_name = "START_OF_WORD"

3940

3941class String(RegexBase):

3942 _opcode = {(NOCASE, False): OP.STRING, (IGNORECASE, False): OP.STRING_IGN,

3943 (FULLCASE, False): OP.STRING, (FULLIGNORECASE, False): OP.STRING_FLD,

3944 (NOCASE, True): OP.STRING_REV, (IGNORECASE, True): OP.STRING_IGN_REV,

3945 (FULLCASE, True): OP.STRING_REV, (FULLIGNORECASE, True):

3946 OP.STRING_FLD_REV}

3947

3948 def __init__(self, characters, case_flags=NOCASE):

3949 self.characters = tuple(characters)

3950 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

3951

3952 if (self.case_flags & FULLIGNORECASE) == FULLIGNORECASE:

3953 folded_characters = []

3954 for char in self.characters:

3955 folded = _regex.fold_case(FULL_CASE_FOLDING, chr(char))

3956 folded_characters.extend(ord(c) for c in folded)

3957 else:

3958 folded_characters = self.characters

3959

3960 self.folded_characters = tuple(folded_characters)

3961 self.required = False

3962

3963 self._key = self.__class__, self.characters, self.case_flags

3964

3965 def get_firstset(self, reverse):

3966 if reverse:

3967 pos = -1

3968 else:

3969 pos = 0

3970 return set([Character(self.characters[pos],

3971 case_flags=self.case_flags)])

3972

3973 def has_simple_start(self):

3974 return True

3975

3976 def _compile(self, reverse, fuzzy):

3977 flags = 0

3978 if fuzzy:

3979 flags |= FUZZY_OP

3980 if self.required:

3981 flags |= REQUIRED_OP

3982 return [(self._opcode[self.case_flags, reverse], flags,

3983 len(self.folded_characters)) + self.folded_characters]

3984

3985 def dump(self, indent, reverse):

3986 display = ascii("".join(chr(c) for c in self.characters)).lstrip("bu")

3987 print("{}STRING {}{}".format(INDENT * indent, display,

3988 CASE_TEXT[self.case_flags]))

3989

3990 def max_width(self):

3991 return len(self.folded_characters)

3992

3993 def get_required_string(self, reverse):

3994 return 0, self

3995

3996class Literal(String):

3997 def dump(self, indent, reverse):

3998 literal = ''.join(chr(c) for c in self.characters)

3999 display = ascii(literal).lstrip("bu")

4000 print("{}LITERAL MATCH {}{}".format(INDENT * indent, display,

4001 CASE_TEXT[self.case_flags]))

4002

4003class StringSet(Branch):

4004 def __init__(self, info, name, case_flags=NOCASE):

4005 self.info = info

4006 self.name = name

4007 self.case_flags = CASE_FLAGS_COMBINATIONS[case_flags]

4008

4009 self._key = self.__class__, self.name, self.case_flags

4010

4011 self.set_key = (name, self.case_flags)

4012 if self.set_key not in info.named_lists_used:

4013 info.named_lists_used[self.set_key] = len(info.named_lists_used)

4014

4015 index = self.info.named_lists_used[self.set_key]

4016 items = self.info.kwargs[self.name]

4017

4018 case_flags = self.case_flags

4019

4020 encoding = self.info.flags & _ALL_ENCODINGS

4021 fold_flags = encoding | case_flags

4022

4023 choices = []

4024

4025 for string in items:

4026 if isinstance(string, str):

4027 string = [ord(c) for c in string]

4028

4029 choices.append([Character(c, case_flags=case_flags) for c in

4030 string])

4031

4032 # Sort from longest to shortest.

4033 choices.sort(key=len, reverse=True)

4034

4035 self.branches = [Sequence(choice) for choice in choices]

4036

4037 def dump(self, indent, reverse):

4038 print("{}STRING_SET {}{}".format(INDENT * indent, self.name,

4039 CASE_TEXT[self.case_flags]))

4040

4041 def __del__(self):

4042 self.info = None

4043

4044class Source:

4045 "Scanner for the regular expression source string."

4046 def __init__(self, string):

4047 if isinstance(string, str):

4048 self.string = string

4049 self.char_type = chr

4050 else:

4051 self.string = string.decode("latin-1")

4052 self.char_type = lambda c: bytes([c])

4053

4054 self.pos = 0

4055 self.ignore_space = False

4056 self.sep = string[ : 0]

4057

4058 def get(self, override_ignore=False):

4059 string = self.string

4060 pos = self.pos

4061

4062 try:

4063 if self.ignore_space and not override_ignore:

4064 while True:

4065 if string[pos].isspace():

4066 # Skip over the whitespace.

4067 pos += 1

4068 elif string[pos] == "#":

4069 # Skip over the comment to the end of the line.

4070 pos = string.index("\n", pos)

4071 else:

4072 break

4073

4074 ch = string[pos]

4075 self.pos = pos + 1

4076 return ch

4077 except IndexError:

4078 # We've reached the end of the string.

4079 self.pos = pos

4080 return string[ : 0]

4081 except ValueError:

4082 # The comment extended to the end of the string.

4083 self.pos = len(string)

4084 return string[ : 0]

4085

4086 def get_many(self, count=1):

4087 string = self.string

4088 pos = self.pos

4089

4090 try:

4091 if self.ignore_space:

4092 substring = []

4093

4094 while len(substring) < count:

4095 while True:

4096 if string[pos].isspace():

4097 # Skip over the whitespace.

4098 pos += 1

4099 elif string[pos] == "#":

4100 # Skip over the comment to the end of the line.

4101 pos = string.index("\n", pos)

4102 else:

4103 break

4104

4105 substring.append(string[pos])

4106 pos += 1

4107

4108 substring = "".join(substring)

4109 else:

4110 substring = string[pos : pos + count]

4111 pos += len(substring)

4112

4113 self.pos = pos

4114 return substring

4115 except IndexError:

4116 # We've reached the end of the string.

4117 self.pos = len(string)

4118 return "".join(substring)

4119 except ValueError:

4120 # The comment extended to the end of the string.

4121 self.pos = len(string)

4122 return "".join(substring)

4123

4124 def get_while(self, test_set, include=True, keep_spaces=False):

4125 string = self.string

4126 pos = self.pos

4127

4128 if self.ignore_space and not keep_spaces:

4129 try:

4130 substring = []

4131

4132 while True:

4133 if string[pos].isspace():

4134 # Skip over the whitespace.

4135 pos += 1

4136 elif string[pos] == "#":

4137 # Skip over the comment to the end of the line.

4138 pos = string.index("\n", pos)

4139 elif (string[pos] in test_set) == include:

4140 substring.append(string[pos])

4141 pos += 1

4142 else:

4143 break

4144

4145 self.pos = pos

4146 except IndexError:

4147 # We've reached the end of the string.

4148 self.pos = len(string)

4149 except ValueError:

4150 # The comment extended to the end of the string.

4151 self.pos = len(string)

4152

4153 return "".join(substring)

4154 else:

4155 try:

4156 while (string[pos] in test_set) == include:

4157 pos += 1

4158

4159 substring = string[self.pos : pos]

4160

4161 self.pos = pos

4162

4163 return substring

4164 except IndexError:

4165 # We've reached the end of the string.

4166 substring = string[self.pos : pos]

4167

4168 self.pos = pos

4169

4170 return substring

4171

4172 def skip_while(self, test_set, include=True):

4173 string = self.string

4174 pos = self.pos

4175

4176 try:

4177 if self.ignore_space:

4178 while True:

4179 if string[pos].isspace():

4180 # Skip over the whitespace.

4181 pos += 1

4182 elif string[pos] == "#":

4183 # Skip over the comment to the end of the line.

4184 pos = string.index("\n", pos)

4185 elif (string[pos] in test_set) == include:

4186 pos += 1

4187 else:

4188 break

4189 else:

4190 while (string[pos] in test_set) == include:

4191 pos += 1

4192

4193 self.pos = pos

4194 except IndexError:

4195 # We've reached the end of the string.

4196 self.pos = len(string)

4197 except ValueError:

4198 # The comment extended to the end of the string.

4199 self.pos = len(string)

4200

4201 def match(self, substring):

4202 string = self.string

4203 pos = self.pos

4204

4205 if self.ignore_space:

4206 try:

4207 for c in substring:

4208 while True:

4209 if string[pos].isspace():

4210 # Skip over the whitespace.

4211 pos += 1

4212 elif string[pos] == "#":

4213 # Skip over the comment to the end of the line.

4214 pos = string.index("\n", pos)

4215 else:

4216 break

4217

4218 if string[pos] != c:

4219 return False

4220

4221 pos += 1

4222

4223 self.pos = pos

4224

4225 return True

4226 except IndexError:

4227 # We've reached the end of the string.

4228 return False

4229 except ValueError:

4230 # The comment extended to the end of the string.

4231 return False

4232 else:

4233 if not string.startswith(substring, pos):

4234 return False

4235

4236 self.pos = pos + len(substring)

4237

4238 return True

4239

4240 def expect(self, substring):

4241 if not self.match(substring):

4242 raise error("missing {}".format(substring), self.string, self.pos)

4243

4244 def at_end(self):

4245 string = self.string

4246 pos = self.pos

4247

4248 try:

4249 if self.ignore_space:

4250 while True:

4251 if string[pos].isspace():

4252 pos += 1

4253 elif string[pos] == "#":

4254 pos = string.index("\n", pos)

4255 else:

4256 break

4257

4258 return pos >= len(string)

4259 except IndexError:

4260 # We've reached the end of the string.

4261 return True

4262 except ValueError:

4263 # The comment extended to the end of the string.

4264 return True

4265

4266class Info:

4267 "Info about the regular expression."

4268

4269 def __init__(self, flags=0, char_type=None, kwargs={}):

4270 flags |= DEFAULT_FLAGS[(flags & _ALL_VERSIONS) or DEFAULT_VERSION]

4271 self.flags = flags

4272 self.global_flags = flags

4273 self.inline_locale = False

4274

4275 self.kwargs = kwargs

4276

4277 self.group_count = 0

4278 self.group_index = {}

4279 self.group_name = {}

4280 self.char_type = char_type

4281 self.named_lists_used = {}

4282 self.open_groups = []

4283 self.open_group_count = {}

4284 self.defined_groups = {}

4285 self.group_calls = []

4286 self.private_groups = {}

4287

4288 def open_group(self, name=None):

4289 group = self.group_index.get(name)

4290 if group is None:

4291 while True:

4292 self.group_count += 1

4293 if name is None or self.group_count not in self.group_name:

4294 break

4295

4296 group = self.group_count

4297 if name:

4298 self.group_index[name] = group

4299 self.group_name[group] = name

4300

4301 if group in self.open_groups:

4302 # We have a nested named group. We'll assign it a private group

4303 # number, initially negative until we can assign a proper

4304 # (positive) number.

4305 group_alias = -(len(self.private_groups) + 1)

4306 self.private_groups[group_alias] = group

4307 group = group_alias

4308

4309 self.open_groups.append(group)

4310 self.open_group_count[group] = self.open_group_count.get(group, 0) + 1

4311

4312 return group

4313

4314 def close_group(self):

4315 self.open_groups.pop()

4316

4317 def is_open_group(self, name):

4318 # In version 1, a group reference can refer to an open group. We'll

4319 # just pretend the group isn't open.

4320 version = (self.flags & _ALL_VERSIONS) or DEFAULT_VERSION

4321 if version == VERSION1:

4322 return False

4323

4324 if name.isdigit():

4325 group = int(name)

4326 else:

4327 group = self.group_index.get(name)

4328

4329 return group in self.open_groups

4330

4331def _check_group_features(info, parsed):

4332 """Checks whether the reverse and fuzzy features of the group calls match

4333 the groups which they call.

4334 """

4335 call_refs = {}

4336 additional_groups = []

4337 for call, reverse, fuzzy in info.group_calls:

4338 # Look up the reference of this group call.

4339 key = (call.group, reverse, fuzzy)

4340 ref = call_refs.get(key)

4341 if ref is None:

4342 # This group doesn't have a reference yet, so look up its features.

4343 if call.group == 0:

4344 # Calling the pattern as a whole.

4345 rev = bool(info.flags & REVERSE)

4346 fuz = isinstance(parsed, Fuzzy)

4347 if (rev, fuz) != (reverse, fuzzy):

4348 # The pattern as a whole doesn't have the features we want,

4349 # so we'll need to make a copy of it with the desired

4350 # features.

4351 additional_groups.append((CallRef(len(call_refs), parsed),

4352 reverse, fuzzy))

4353 else:

4354 # Calling a capture group.

4355 def_info = info.defined_groups[call.group]

4356 group = def_info[0]

4357 if def_info[1 : ] != (reverse, fuzzy):

4358 # The group doesn't have the features we want, so we'll

4359 # need to make a copy of it with the desired features.

4360 additional_groups.append((group, reverse, fuzzy))

4361

4362 ref = len(call_refs)

4363 call_refs[key] = ref

4364

4365 call.call_ref = ref

4366

4367 info.call_refs = call_refs

4368 info.additional_groups = additional_groups

4369

4370def _get_required_string(parsed, flags):

4371 "Gets the required string and related info of a parsed pattern."

4372

4373 req_offset, required = parsed.get_required_string(bool(flags & REVERSE))

4374 if required:

4375 required.required = True

4376 if req_offset >= UNLIMITED:

4377 req_offset = -1

4378

4379 req_flags = required.case_flags

4380 if not (flags & UNICODE):

4381 req_flags &= ~UNICODE

4382

4383 req_chars = required.folded_characters

4384 else:

4385 req_offset = 0

4386 req_chars = ()

4387 req_flags = 0

4388

4389 return req_offset, req_chars, req_flags

4390

4391class Scanner:

4392 def __init__(self, lexicon, flags=0):

4393 self.lexicon = lexicon

4394

4395 # Combine phrases into a compound pattern.

4396 patterns = []

4397 for phrase, action in lexicon:

4398 # Parse the regular expression.

4399 source = Source(phrase)

4400 info = Info(flags, source.char_type)

4401 source.ignore_space = bool(info.flags & VERBOSE)

4402 parsed = _parse_pattern(source, info)

4403 if not source.at_end():

4404 raise error("unbalanced parenthesis", source.string,

4405 source.pos)

4406

4407 # We want to forbid capture groups within each phrase.

4408 patterns.append(parsed.remove_captures())

4409

4410 # Combine all the subpatterns into one pattern.

4411 info = Info(flags)

4412 patterns = [Group(info, g + 1, p) for g, p in enumerate(patterns)]

4413 parsed = Branch(patterns)

4414

4415 # Optimise the compound pattern.

4416 reverse = bool(info.flags & REVERSE)

4417 parsed = parsed.optimise(info, reverse)

4418 parsed = parsed.pack_characters(info)

4419

4420 # Get the required string.

4421 req_offset, req_chars, req_flags = _get_required_string(parsed,

4422 info.flags)

4423

4424 # Check the features of the groups.

4425 _check_group_features(info, parsed)

4426

4427 # Complain if there are any group calls. They are not supported by the

4428 # Scanner class.

4429 if info.call_refs:

4430 raise error("recursive regex not supported by Scanner",

4431 source.string, source.pos)

4432

4433 reverse = bool(info.flags & REVERSE)

4434

4435 # Compile the compound pattern. The result is a list of tuples.

4436 code = parsed.compile(reverse) + [(OP.SUCCESS, )]

4437

4438 # Flatten the code into a list of ints.

4439 code = _flatten_code(code)

4440

4441 if not parsed.has_simple_start():

4442 # Get the first set, if possible.

4443 try:

4444 fs_code = _compile_firstset(info, parsed.get_firstset(reverse))

4445 fs_code = _flatten_code(fs_code)

4446 code = fs_code + code

4447 except _FirstSetError:

4448 pass

4449

4450 # Check the global flags for conflicts.

4451 version = (info.flags & _ALL_VERSIONS) or DEFAULT_VERSION

4452 if version not in (0, VERSION0, VERSION1):

4453 raise ValueError("VERSION0 and VERSION1 flags are mutually incompatible")

4454

4455 # Create the PatternObject.

4456 #

4457 # Local flags like IGNORECASE affect the code generation, but aren't

4458 # needed by the PatternObject itself. Conversely, global flags like

4459 # LOCALE _don't_ affect the code generation but _are_ needed by the

4460 # PatternObject.

4461 self.scanner = _regex.compile(None, (flags & GLOBAL_FLAGS) | version,

4462 code, {}, {}, {}, [], req_offset, req_chars, req_flags,

4463 len(patterns))

4464

4465 def scan(self, string):

4466 result = []

4467 append = result.append

4468 match = self.scanner.scanner(string).match

4469 i = 0

4470 while True:

4471 m = match()

4472 if not m:

4473 break

4474 j = m.end()

4475 if i == j:

4476 break

4477 action = self.lexicon[m.lastindex - 1][1]

4478 if hasattr(action, '__call__'):

4479 self.match = m

4480 action = action(self, m.group())

4481 if action is not None:

4482 append(action)

4483 i = j

4484

4485 return result, string[i : ]

4486

4487# Get the known properties dict.

4488PROPERTIES = _regex.get_properties()

4489

4490# Build the inverse of the properties dict.

4491PROPERTY_NAMES = {}

4492for prop_name, (prop_id, values) in PROPERTIES.items():

4493 name, prop_values = PROPERTY_NAMES.get(prop_id, ("", {}))

4494 name = max(name, prop_name, key=len)

4495 PROPERTY_NAMES[prop_id] = name, prop_values

4496

4497 for val_name, val_id in values.items():

4498 prop_values[val_id] = max(prop_values.get(val_id, ""), val_name,

4499 key=len)

4500

4501# Character escape sequences.

4502CHARACTER_ESCAPES = {

4503 "a": "\a",

4504 "b": "\b",

4505 "f": "\f",

4506 "n": "\n",

4507 "r": "\r",

4508 "t": "\t",

4509 "v": "\v",

4510}

4511

4512ASCII_ENCODING = 1

4513UNICODE_ENCODING = 2

4514

4515# Predefined character set escape sequences.

4516CHARSET_ESCAPES = {

4517 "d": lookup_property(None, "Digit", True),

4518 "D": lookup_property(None, "Digit", False),

4519 "h": lookup_property(None, "Blank", True),

4520 "s": lookup_property(None, "Space", True),

4521 "S": lookup_property(None, "Space", False),

4522 "w": lookup_property(None, "Word", True),

4523 "W": lookup_property(None, "Word", False),

4524}

4525

4526ASCII_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)

4527ASCII_CHARSET_ESCAPES.update({

4528 "d": lookup_property(None, "Digit", True, encoding=ASCII_ENCODING),

4529 "D": lookup_property(None, "Digit", False, encoding=ASCII_ENCODING),

4530 "s": lookup_property(None, "Space", True, encoding=ASCII_ENCODING),

4531 "S": lookup_property(None, "Space", False, encoding=ASCII_ENCODING),

4532 "w": lookup_property(None, "Word", True, encoding=ASCII_ENCODING),

4533 "W": lookup_property(None, "Word", False, encoding=ASCII_ENCODING),

4534})

4535UNICODE_CHARSET_ESCAPES = dict(CHARSET_ESCAPES)

4536UNICODE_CHARSET_ESCAPES.update({

4537 "d": lookup_property(None, "Digit", True, encoding=UNICODE_ENCODING),

4538 "D": lookup_property(None, "Digit", False, encoding=UNICODE_ENCODING),

4539 "s": lookup_property(None, "Space", True, encoding=UNICODE_ENCODING),

4540 "S": lookup_property(None, "Space", False, encoding=UNICODE_ENCODING),

4541 "w": lookup_property(None, "Word", True, encoding=UNICODE_ENCODING),

4542 "W": lookup_property(None, "Word", False, encoding=UNICODE_ENCODING),

4543})

4544

4545# Positional escape sequences.

4546POSITION_ESCAPES = {

4547 "A": StartOfString(),

4548 "b": Boundary(),

4549 "B": Boundary(False),

4550 "K": Keep(),

4551 "m": StartOfWord(),

4552 "M": EndOfWord(),

4553 "Z": EndOfString(),

4554}

4555ASCII_POSITION_ESCAPES = dict(POSITION_ESCAPES)

4556ASCII_POSITION_ESCAPES.update({

4557 "b": Boundary(encoding=ASCII_ENCODING),

4558 "B": Boundary(False, encoding=ASCII_ENCODING),

4559 "m": StartOfWord(encoding=ASCII_ENCODING),

4560 "M": EndOfWord(encoding=ASCII_ENCODING),

4561})

4562UNICODE_POSITION_ESCAPES = dict(POSITION_ESCAPES)

4563UNICODE_POSITION_ESCAPES.update({

4564 "b": Boundary(encoding=UNICODE_ENCODING),

4565 "B": Boundary(False, encoding=UNICODE_ENCODING),

4566 "m": StartOfWord(encoding=UNICODE_ENCODING),

4567 "M": EndOfWord(encoding=UNICODE_ENCODING),

4568})

4569

4570# Positional escape sequences when WORD flag set.

4571WORD_POSITION_ESCAPES = dict(POSITION_ESCAPES)

4572WORD_POSITION_ESCAPES.update({

4573 "b": DefaultBoundary(),

4574 "B": DefaultBoundary(False),

4575 "m": DefaultStartOfWord(),

4576 "M": DefaultEndOfWord(),

4577})

4578

4579# Regex control verbs.

4580VERBS = {

4581 "FAIL": Failure(),

4582 "F": Failure(),

4583 "PRUNE": Prune(),

4584 "SKIP": Skip(),

4585}

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/regex/_regex_core.py: 78%

2834 statements